In [1]:
import pandas as pd
import numpy as np
import re
import string

Read Poverty Tweets

In [5]:
with open("Dataset/PovertyRelatedTweets.csv", "r") as file:
    length = file.readlines()
data= pd.DataFrame(columns= ["Date", "Tweets"], index=range(len(length)))

In [6]:
with open("Dataset/PovertyRelatedTweets.csv", "r") as ins:
    i = 0
    for line in ins:
        line = list(line.split(';'))
        data.loc[i].Date = line[1]
        data.loc[i].Tweets= line[4]
        i+= 1
    print(data.head(10))

               Date                                             Tweets
0              date                                               text
1  2018-09-30 19:39  "Growing Food and Faith in Impoverished Brazil...
2  2018-09-30 19:37  "In 30yrs, Vietnam from # poverty to an emergi...
3  2018-09-30 19:37  "The Politics of # Poverty: Officials grapple ...
4  2018-09-30 19:29  "Not just talking, doing. # kenyalendahand # k...
5  2018-09-30 19:24                          "Raiders 0 - 4 # Poverty"
6  2018-09-30 19:23  "For all those who believe the world has gone ...
7  2018-09-30 19:13  "Even # Breathing Is A Risk In One Of # Orland...
8  2018-09-30 19:09  ""Maybe it's time to rethink the idea that we ...
9  2018-09-30 19:00  "This is why I volunteer with Big Bros Big Sis...


In [7]:
data.to_csv("Separated_PovertyTweets.csv", encoding="utf-8")

In [8]:
train= pd.read_csv("Separated_PovertyTweets.csv")
train.drop(columns = ['Unnamed: 0'], inplace = True)
train.drop(train.index[0], inplace = True)
train.head(6)

Unnamed: 0,Date,Tweets
1,2018-09-30 19:39,"""Growing Food and Faith in Impoverished Brazil..."
2,2018-09-30 19:37,"""In 30yrs, Vietnam from # poverty to an emergi..."
3,2018-09-30 19:37,"""The Politics of # Poverty: Officials grapple ..."
4,2018-09-30 19:29,"""Not just talking, doing. # kenyalendahand # k..."
5,2018-09-30 19:24,"""Raiders 0 - 4 # Poverty"""
6,2018-09-30 19:23,"""For all those who believe the world has gone ..."


Check the length of tweet

In [9]:
train['uncleaned_len'] = [len(l) for l in train.Tweets]

In [10]:
train.head(6)

Unnamed: 0,Date,Tweets,uncleaned_len
1,2018-09-30 19:39,"""Growing Food and Faith in Impoverished Brazil...",247
2,2018-09-30 19:37,"""In 30yrs, Vietnam from # poverty to an emergi...",429
3,2018-09-30 19:37,"""The Politics of # Poverty: Officials grapple ...",309
4,2018-09-30 19:29,"""Not just talking, doing. # kenyalendahand # k...",292
5,2018-09-30 19:24,"""Raiders 0 - 4 # Poverty""",25
6,2018-09-30 19:23,"""For all those who believe the world has gone ...",234


Preprocess tweets

In [11]:
##Preprocess/clean the tweets
def preprocessTweets(train): 
    # HTML encoding if any
    
    # import unicodedata
    train.Tweets = train.Tweets.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    #train.Tweets = train.Tweets.map(lambda x: x.strip())
    #train.Tweets = train.Tweets.map(lambda x: x.strip())  
    train.Tweets = train.Tweets.str.lstrip()
    train.Tweets = train.Tweets.str.rstrip()
    train.Tweets = train.Tweets.str.strip()
        
    #@ replace
    regex_pat = re.compile(r'((?:@ [\w_]+) | (?:@[\w_]+))', flags = re.IGNORECASE)
    train.Tweets = train.Tweets.replace(regex_pat, '')
    
    #numbers
    regex_pat = re.compile(  r'(?:(?:\d+,?)+(?:\.?\d+)?)', flags = re.IGNORECASE)
    train.Tweets = train.Tweets.replace(regex_pat, '')
    
    #hashtags 
    regex_pat = re.compile(r'# ([^\s]+)', flags = re.IGNORECASE)
    train.Tweets = train.Tweets.replace(regex_pat,  r'\1')
    
    #URL
    regex_pat = re.compile( r'((\w+\.com)|(www\.[^\s]+)|(http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+))', flags = re.IGNORECASE)
    train.Tweets = train.Tweets.replace(regex_pat,' ')
    
    #retweet text 'RT'
    regex_pat = re.compile(r'^RT[\s]+', flags = re.IGNORECASE)
    train.Tweets = train.Tweets.replace(regex_pat,  '')
    
    #replace 2+ dots with a space
    regex_pat = re.compile(r'\.{2,}', flags = re.IGNORECASE)
    train.Tweets = train.Tweets.replace(regex_pat,  ' ')
    
    # remove punctuations like quote, exclamation sign, etc.
    # we replace them with a space
    regex_pat = re.compile(r'['+string.punctuation+']+', flags = re.IGNORECASE)
    train.Tweets = train.Tweets.replace(regex_pat,  ' ')
    
    # Replace multiple spaces with a single space
    regex_pat = re.compile(r'\s+', flags = re.IGNORECASE)
    train.Tweets = train.Tweets.replace(regex_pat,  ' ')

In [12]:
preprocessTweets(train)
train['cleaned_len'] = [len(l) for l in train.Tweets]

In [13]:
#Cleaned Tweets
print(train.Tweets.head(10))

1      Growing Food and Faith in Impoverished Brazil...
2      In yrs Vietnam from poverty to an emerging ma...
3      The Politics of Poverty Officials grapple wit...
4      Not just talking doing kenyalendahand kenya n...
5                                      Raiders Poverty 
6      For all those who believe the world has gone ...
7      Even Breathing Is A Risk In One Of Orlando s ...
8      Maybe it s time to rethink the idea that we k...
9      This is why I volunteer with Big Bros Big Sis...
10     Americas children in brief Key national indic...
Name: Tweets, dtype: object


Final Cleaned Tweets

In [14]:
train.to_csv("CleanedPovertyTweets_v1.csv")

In [15]:
cleaned_trained = pd.read_csv("CleanedPovertyTweets_v1.csv")
cleaned_trained.head(10)

Unnamed: 0.1,Unnamed: 0,Date,Tweets,uncleaned_len,cleaned_len
0,1,2018-09-30 19:39,Growing Food and Faith in Impoverished Brazil...,247,89
1,2,2018-09-30 19:37,In yrs Vietnam from poverty to an emerging ma...,429,220
2,3,2018-09-30 19:37,The Politics of Poverty Officials grapple wit...,309,96
3,4,2018-09-30 19:29,Not just talking doing kenyalendahand kenya n...,292,166
4,5,2018-09-30 19:24,Raiders Poverty,25,17
5,6,2018-09-30 19:23,For all those who believe the world has gone ...,234,212
6,7,2018-09-30 19:13,Even Breathing Is A Risk In One Of Orlando s ...,207,151
7,8,2018-09-30 19:09,Maybe it s time to rethink the idea that we k...,192,125
8,9,2018-09-30 19:00,This is why I volunteer with Big Bros Big Sis...,191,166
9,10,2018-09-30 19:00,Americas children in brief Key national indic...,279,211
