In [25]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from nltk.tokenize import WordPunctTokenizernizer

In [3]:
df_raw_data = pd.read_csv('data/sentiment140.csv', encoding='latin', header=None)
df_raw_data.columns = ['target', 'id', 'date', 'flag', 'username', 'tweet']
df_raw_data.head()

Unnamed: 0,target,id,date,flag,username,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
print("Min date: " + str(df_raw_data.date.min()) + ". Max date: " + str(df_raw_data.date.max()))
print("Unique user: " + str(df_raw_data.username.nunique()))
print("Number of data on each class: ")
print(df_raw_data.target.value_counts())

Min date: Fri Apr 17 20:30:31 PDT 2009. Max date: Wed May 27 07:27:38 PDT 2009
Unique user: 659775
Number of data on each class: 
4    800000
0    800000
Name: target, dtype: int64


In [5]:
df_raw_data = df_raw_data[['target', 'tweet']]
df_raw_data.head()

Unnamed: 0,target,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [6]:
df_raw_data[df_raw_data.target==0].head()

Unnamed: 0,target,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [7]:
df_raw_data[df_raw_data.target==4].head()

Unnamed: 0,target,tweet
800000,4,I LOVE @Health4UandPets u guys r the best!!
800001,4,im meeting up with one of my besties tonight! ...
800002,4,"@DaRealSunisaKim Thanks for the Twitter add, S..."
800003,4,Being sick can be really cheap when it hurts t...
800004,4,@LovesBrooklyn2 he has that effect on everyone


In [8]:
df_raw_data['tweet_len'] = [len(t) for t in df_raw_data.tweet]
df_raw_data.head()

Unnamed: 0,target,tweet,tweet_len
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",115
1,0,is upset that he can't update his Facebook by ...,111
2,0,@Kenichan I dived many times for the ball. Man...,89
3,0,my whole body feels itchy and like its on fire,47
4,0,"@nationwideclass no, it's not behaving at all....",111


In [9]:
df_raw_data[df_raw_data.tweet_len > 140].head(10)

Unnamed: 0,target,tweet,tweet_len
213,0,Awwh babs... you look so sad underneith that s...,142
226,0,Tuesdayï¿½ll start with reflection ï¿½n then a...,141
279,0,Whinging. My client&amp;boss don't understand ...,145
343,0,@TheLeagueSF Not Fun &amp; Furious? The new ma...,145
400,0,#3 woke up and was having an accident - &quot;...,144
464,0,"My bathtub drain is fired: it haz 1 job 2 do, ...",146
492,0,"pears &amp; Brie, bottle of Cabernet, and &quo...",150
747,0,Have an invite for &quot;Healthy Dining&quot; ...,141
957,0,Damnit I was really digging this season of Rea...,141
1064,0,Why do I keep looking...I know that what I rea...,141


In [10]:
df_raw_data.tweet[279]

"Whinging. My client&amp;boss don't understand English well. Rewrote some text unreadable. It's written by v. good writer&amp;reviewed correctly. "

In [24]:
BeautifulSoup(df_raw_data.tweet[279], 'html5lib').get_text()

"Whinging. My client&boss don't understand English well. Rewrote some text unreadable. It's written by v. good writer&reviewed correctly. "

In [76]:
tok = WordPunctTokenizer()
username_pat = r'@[A-Za-z0-9_]+|https?://[^ ]+'
web_pat = r'www.[^ ]+'
negation_dic = {("isnt", "isn't"):"is not", ("arent", "aren't"):"are not", ("wasnt", "wasn't"):"was not", 
                ("werent","weren't"): "were not", ("havent","haven't"): "have not", ("hasnt", "hasn't"): "has not", 
                ("hadnt", "hadn't"): "had not", ("wont", "won't"): "will not", ("wouldnt","wouldn't"): "would not", 
                ("dont", "don't"):"do not", ("doesnt", "doesn't"): "does not", ("didnt", "didn't") : "did not", 
                ("cant", "can't"):"can not", ("couldnt", "couldn't"):"could not", ("shouldnt","shouldn't"):"should not",
               ("mightnt", "mightn't"): "might not", ("musnt", "musn't"): "must not"}
negation_dic = {k:v for kl,v in negation_dic.items() for k in kl}
negation_pat = re.compile(r'\b('+'|'.join(negation_dic.keys()) + r')\b')

def data_cleansing(tweet):
    soup = BeautifulSoup(tweet, 'html5lib')
    tweet = soup.get_text()
    
    try:
        tweet = tweet.decode("utf-8-sig").replace(u"\ufffd","?")
    except:
        tweet = tweet
    
    tweet = re.sub(username_pat, '', tweet)
    tweet = re.sub(web_pat, '', tweet)
    tweet = tweet.lower()
    tweet = negation_pat.sub(lambda x: negation_dic[x.group()], tweet)
    tweet = re.sub("[^A-Za-z]", " ", tweet)
    
    tweet = [word for word in tok.tokenize(tweet) if len(word) > 1]
    
    tweet = (" ".join(tweet)).strip()
    
    return tweet

In [None]:
print("{}. {} appears {} times.".format(i, key, wordBank[key]))

In [86]:
cleaned_tweets = []
for i in range(0, 1600000):
    if (i+1)%100000 == 0:
        print('{} of {} have been cleaned'.format(i+1, 1600000))
    cleaned_tweets.append(data_cleansing(df_raw_data.tweet[i]))

100000 of 1600000 have been cleaned
200000 of 1600000 have been cleaned
300000 of 1600000 have been cleaned
400000 of 1600000 have been cleaned
500000 of 1600000 have been cleaned
600000 of 1600000 have been cleaned
700000 of 1600000 have been cleaned


  ' Beautiful Soup.' % markup)


800000 of 1600000 have been cleaned
900000 of 1600000 have been cleaned
1000000 of 1600000 have been cleaned
1100000 of 1600000 have been cleaned
1200000 of 1600000 have been cleaned


  ' Beautiful Soup.' % markup)


1300000 of 1600000 have been cleaned
1400000 of 1600000 have been cleaned
1500000 of 1600000 have been cleaned
1600000 of 1600000 have been cleaned


In [87]:
cleaned_tweets[0:10]

['awww that bummer you shoulda got david carr of third day to do it',
 'is upset that he can not update his facebook by texting it and might cry as result school today also blah',
 'dived many times for the ball managed to save the rest go out of bounds',
 'my whole body feels itchy and like its on fire',
 'no it not behaving at all mad why am here because can not see you all over there',
 'not the whole crew',
 'need hug',
 'hey long time no see yes rains bit only bit lol fine thanks how you',
 'nope they did not have it',
 'que me muera']

In [89]:
df_cleaned_data = pd.DataFrame(cleaned_tweets, columns=['tweet'])
df_cleaned_data['target'] = df_raw_data.target
df_cleaned_data.head()

Unnamed: 0,tweet,target
0,awww that bummer you shoulda got david carr of...,0
1,is upset that he can not update his facebook b...,0
2,dived many times for the ball managed to save ...,0
3,my whole body feels itchy and like its on fire,0
4,no it not behaving at all mad why am here beca...,0


In [92]:
df_cleaned_data.to_csv('data/cleaned_tweets.csv', index=False)

In [90]:
df_cleaned_data.tail()

Unnamed: 0,tweet,target
1599995,just woke up having no school is the best feel...,4
1599996,thewdb com very cool to hear old walt interviews,4
1599997,are you ready for your mojo makeover ask me fo...,4
1599998,happy th birthday to my boo of alll time tupac...,4
1599999,happy charitytuesday,4


In [91]:
df_raw_data.tail()

Unnamed: 0,target,tweet,tweet_len
1599995,4,Just woke up. Having no school is the best fee...,56
1599996,4,TheWDB.com - Very cool to hear old Walt interv...,78
1599997,4,Are you ready for your MoJo Makeover? Ask me f...,57
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...,65
1599999,4,happy #charitytuesday @theNSPCC @SparksCharity...,62


In [88]:
df_raw_data.tweet[0:10]

0    @switchfoot http://twitpic.com/2y1zl - Awww, t...
1    is upset that he can't update his Facebook by ...
2    @Kenichan I dived many times for the ball. Man...
3      my whole body feels itchy and like its on fire 
4    @nationwideclass no, it's not behaving at all....
5                        @Kwesidei not the whole crew 
6                                          Need a hug 
7    @LOLTrish hey  long time no see! Yes.. Rains a...
8                 @Tatiana_K nope they didn't have it 
9                            @twittera que me muera ? 
Name: tweet, dtype: object

In [79]:
df_raw_data.tweet[10000]

"I think there's a problem with the ISP in this area or something...my connection go too slow to do anything online yesterday &amp; today "

In [80]:
data_cleansing(df_raw_data.tweet[10000])

'think there problem with the isp in this area or something my connection go too slow to do anything online yesterday today'

In [72]:
negation_pat.sub(lambda x: negation_dic[x.group()], "I diswasnt done")

'I diswasnt done'

In [48]:
tok = WordPunctTokenizer()
tes = re.sub("[^a-zA-Z]", " ", re.sub(username_pat, "", df_raw_data.tweet[800000]))
tok.tokenize(tes)

['I', 'LOVE', 'u', 'guys', 'r', 'the', 'best']

In [49]:
[x for x  in tok.tokenize(tes) if len(x) > 1]

['LOVE', 'guys', 'the', 'best']

In [43]:
df_raw_data.tweet[800000]

'I LOVE @Health4UandPets u guys r the best!! '

In [45]:
re.sub("[^a-zA-Z]", " ", re.sub(username_pat, "", df_raw_data.tweet[800000]))

'I LOVE  u guys r the best   '

In [40]:
negation_pat

re.compile(r"\x08(isn't|aren't|wasn't|weren't|haven't|hasn't|hadn't|won't|wouldn't|don't|doesn't|didn't|can't|couldn't|shouldn't|mightn't|musn't)\x08",
re.UNICODE)

In [36]:
username_pat

'@[A-Za-z0-9_]+|https?://[^ ]+'

In [None]:
def data_preprocess(data, norm_type):
    replace_no_space = re.compile("(\\n)|(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")
    replace_with_space = re.compile("(<[^>]*>)|(<br\s*/><br\s*/>)|(\-)|(\/)|([^a-zA-z])")
    stop_words = ['in', 'of', 'the', 'at', 'a', 'an', 'is', 'are', 'am', 'was', 'were', 'and']
    
    data['review'] = data['review'].str.lower()
    
    data['review'] = [''.join(replace_no_space.sub("", x)) for x in data['review'].tolist()] #remove special char
    data['review'] = [''.join(replace_with_space.sub(" ", x)) for x in data['review'].tolist()] #remove special char
    
    data['review'] = [' '.join([word for word in x.split() if word not in stop_words]) 
                               for x in data['review'].tolist()] # remove stop words
    
    if norm_type == 'stem':
        normalizer = PorterStemmer()
        data['review'] = [' '.join([normalizer.stem(word) for word in x.split()]) 
                               for x in data['review'].tolist()] #stem words
    elif norm_type == 'lemma':
        normalizer = WordNetLemmatizer()
        data['review'] = [' '.join([normalizer.lemmatize(word) for word in x.split()]) 
                               for x in data['review'].tolist()] #lemmatize words
        
    return data