# Cleaning training set

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from time import time
from bs4 import BeautifulSoup
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

In [2]:
# The CSV is read without a header row, so the six column names are supplied
# explicitly; only positions 0 and 5 (polarity and text) are loaded.
columns = ['polarity', 'id', 'date', 'query_type', 'user', 'text']

dftrain = pd.read_csv(
    'csv/training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
    header=None,
    names=columns,
    usecols=[0, 5],
)

In [3]:
dftrain.head()

Unnamed: 0,polarity,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [4]:
len(dftrain)

1600000

#### Verifying tweets' length

In [6]:
# Character count of every raw tweet.
dftrain['length'] = dftrain['text'].map(len)

In [7]:
dftrain[dftrain.length >140].head(100)

Unnamed: 0,polarity,text,length
213,0,Awwh babs... you look so sad underneith that s...,142
226,0,Tuesdayï¿½ll start with reflection ï¿½n then a...,141
279,0,Whinging. My client&amp;boss don't understand ...,145
343,0,@TheLeagueSF Not Fun &amp; Furious? The new ma...,145
400,0,#3 woke up and was having an accident - &quot;...,144
464,0,"My bathtub drain is fired: it haz 1 job 2 do, ...",146
492,0,"pears &amp; Brie, bottle of Cabernet, and &quo...",150
747,0,Have an invite for &quot;Healthy Dining&quot; ...,141
957,0,Damnit I was really digging this season of Rea...,141
1064,0,Why do I keep looking...I know that what I rea...,141


In [8]:
len(dftrain[dftrain.length >140])

17174

In [11]:
%%time
souped_tweets = []
for i in dftrain['text']:
    soup = BeautifulSoup(i, 'lxml')
    souped = soup.get_text()
    souped_tweets.append(souped)

CPU times: user 4min 28s, sys: 17.8 s, total: 4min 45s
Wall time: 4min 46s


In [13]:
# Store the BeautifulSoup output next to the raw text (values are assigned
# positionally from the list).
dftrain['bom_text'] = list(souped_tweets)

In [14]:
dftrain.head()

Unnamed: 0,polarity,text,length,bom_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",115,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...,111,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...,89,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire,47,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....",111,"@nationwideclass no, it's not behaving at all...."


In [15]:
# Independent copy holding only the souped text and its label.
bom_train = dftrain.loc[:, ['bom_text', 'polarity']].copy()

In [16]:
bom_train.head()

Unnamed: 0,bom_text,polarity
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [18]:
# Re-measure the tweets after the BeautifulSoup pass.
bom_train['length'] = bom_train['bom_text'].map(len)

In [21]:
bom_train.tail()

Unnamed: 0,bom_text,polarity,length
1599995,Just woke up. Having no school is the best fee...,4,56
1599996,TheWDB.com - Very cool to hear old Walt interv...,4,78
1599997,Are you ready for your MoJo Makeover? Ask me f...,4,57
1599998,Happy 38th Birthday to my boo of alll time!!! ...,4,65
1599999,happy #charitytuesday @theNSPCC @SparksCharity...,4,62


In [20]:
bom_train[bom_train.length >140].count()

bom_text    687
polarity    687
length      687
dtype: int64

#### Removing tweets having length > 140

In [39]:
# Index labels of the rows whose souped text is still longer than 140 characters.
too_long_index = bom_train.loc[bom_train['length'] > 140].index
df_tweets = bom_train.drop(index=too_long_index)

In [40]:
len(df_tweets)

1599313

#### Applying Ekphrasis Library for cleaning dataset

- substitute URLs with tag `<url>`
- substitute emails with tag `<email>`
- substitute mentions with tag `<user>`
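- substitute percent, money, phone, time, date and number expressions with their tags (`<percent>`, `<money>`, ...); these tags are stripped in a later step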

In [41]:
%%time

text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"elongated", "repeated"},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated word
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

normalized_text = []

for s in df_tweets['bom_text']:
    normalized_text.append(" ".join(text_processor.pre_process_doc(s)))
normalized_text_series = pd.Series(normalized_text)


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...
CPU times: user 6min 6s, sys: 1.14 s, total: 6min 7s
Wall time: 6min 8s


In [43]:
# Assign as a plain list so the values are placed positionally rather than
# aligned on the series' own index.
df_tweets['normalized_text'] = normalized_text_series.tolist()

In [44]:
df_tweets.head()

Unnamed: 0,bom_text,polarity,length,normalized_text
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,115,"<user> <url> - aw <elongated> , that ' s a bum..."
1,is upset that he can't update his Facebook by ...,0,111,is upset that he can not update his facebook b...
2,@Kenichan I dived many times for the ball. Man...,0,89,<user> i dived many times for the ball . manag...
3,my whole body feels itchy and like its on fire,0,47,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all....",0,111,"<user> no , it ' s not behaving at all . i am ..."


In [45]:
len(df_tweets)

1599313

In [47]:
# Independent copy holding only the normalized text and its label.
df_len_tweets = df_tweets.loc[:, ['normalized_text', 'polarity']].copy()
df_len_tweets.head()

Unnamed: 0,normalized_text,polarity
0,"<user> <url> - aw <elongated> , that ' s a bum...",0
1,is upset that he can not update his facebook b...,0
2,<user> i dived many times for the ball . manag...,0
3,my whole body feels itchy and like its on fire,0
4,"<user> no , it ' s not behaving at all . i am ...",0


In [48]:
# Character count of the normalized (tokenized and re-joined) text.
df_len_tweets['length'] = df_len_tweets['normalized_text'].map(len)
df_len_tweets.head()

Unnamed: 0,normalized_text,polarity,length
0,"<user> <url> - aw <elongated> , that ' s a bum...",0,109
1,is upset that he can not update his facebook b...,0,124
2,<user> i dived many times for the ball . manag...,0,92
3,my whole body feels itchy and like its on fire,0,46
4,"<user> no , it ' s not behaving at all . i am ...",0,110


In [49]:
df_len_tweets.length.max()

359

In [53]:
%%time
user_tweets = []

for t in df_len_tweets.normalized_text:
    user_tweets.append(t.replace('<user>', '@USER'))

df_len_tweets['text_no_user'] = [t for t in user_tweets]


CPU times: user 1.16 s, sys: 97 ms, total: 1.25 s
Wall time: 1.25 s


In [54]:
df_len_tweets.head()

Unnamed: 0,normalized_text,polarity,length,text_no_user
0,"<user> <url> - aw <elongated> , that ' s a bum...",0,109,"@USER <url> - aw <elongated> , that ' s a bumm..."
1,is upset that he can not update his facebook b...,0,124,is upset that he can not update his facebook b...
2,<user> i dived many times for the ball . manag...,0,92,@USER i dived many times for the ball . manage...
3,my whole body feels itchy and like its on fire,0,46,my whole body feels itchy and like its on fire
4,"<user> no , it ' s not behaving at all . i am ...",0,110,"@USER no , it ' s not behaving at all . i am m..."


In [55]:
# Swap the `<url>` tag for the literal placeholder `@URL`.
url_tweets = [
    tweet.replace('<url>', '@URL')
    for tweet in df_len_tweets['text_no_user']
]

df_len_tweets['text_no_url'] = list(url_tweets)

In [56]:
df_len_tweets.head()

Unnamed: 0,normalized_text,polarity,length,text_no_user,text_no_url
0,"<user> <url> - aw <elongated> , that ' s a bum...",0,109,"@USER <url> - aw <elongated> , that ' s a bumm...","@USER @URL - aw <elongated> , that ' s a bumme..."
1,is upset that he can not update his facebook b...,0,124,is upset that he can not update his facebook b...,is upset that he can not update his facebook b...
2,<user> i dived many times for the ball . manag...,0,92,@USER i dived many times for the ball . manage...,@USER i dived many times for the ball . manage...
3,my whole body feels itchy and like its on fire,0,46,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,"<user> no , it ' s not behaving at all . i am ...",0,110,"@USER no , it ' s not behaving at all . i am m...","@USER no , it ' s not behaving at all . i am m..."


In [57]:
%%time
notag_tweets = []
for t in df_len_tweets.text_no_url:
    #print(df_len_tweets.text[i].replace(s, ''))
    notag_tweets.append(re.sub(r'(<[aA-zZ]+>)', '',t))

df_len_tweets['text_no_tag'] = [t for t in notag_tweets]

CPU times: user 2.13 s, sys: 72.3 ms, total: 2.2 s
Wall time: 2.2 s


In [58]:
df_len_tweets.head()

Unnamed: 0,normalized_text,polarity,length,text_no_user,text_no_url,text_no_tag
0,"<user> <url> - aw <elongated> , that ' s a bum...",0,109,"@USER <url> - aw <elongated> , that ' s a bumm...","@USER @URL - aw <elongated> , that ' s a bumme...","@USER @URL - aw , that ' s a bummer . you sho..."
1,is upset that he can not update his facebook b...,0,124,is upset that he can not update his facebook b...,is upset that he can not update his facebook b...,is upset that he can not update his facebook b...
2,<user> i dived many times for the ball . manag...,0,92,@USER i dived many times for the ball . manage...,@USER i dived many times for the ball . manage...,@USER i dived many times for the ball . manage...
3,my whole body feels itchy and like its on fire,0,46,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,"<user> no , it ' s not behaving at all . i am ...",0,110,"@USER no , it ' s not behaving at all . i am m...","@USER no , it ' s not behaving at all . i am m...","@USER no , it ' s not behaving at all . i am m..."


In [59]:
# Independent copy holding only the fully cleaned text and its label.
finaltrain = df_len_tweets.loc[:, ['text_no_tag', 'polarity']].copy()

In [60]:
finaltrain.head()

Unnamed: 0,text_no_tag,polarity
0,"@USER @URL - aw , that ' s a bummer . you sho...",0
1,is upset that he can not update his facebook b...,0
2,@USER i dived many times for the ball . manage...,0
3,my whole body feels itchy and like its on fire,0
4,"@USER no , it ' s not behaving at all . i am m...",0


In [62]:
# Character count of the text once the tags have been stripped.
finaltrain['length'] = finaltrain['text_no_tag'].map(len)
finaltrain.head()

Unnamed: 0,text_no_tag,polarity,length
0,"@USER @URL - aw , that ' s a bummer . you sho...",0,90
1,is upset that he can not update his facebook b...,0,114
2,@USER i dived many times for the ball . manage...,0,82
3,my whole body feels itchy and like its on fire,0,46
4,"@USER no , it ' s not behaving at all . i am m...",0,109


In [63]:
len(finaltrain)

1599313

In [64]:
finaltrain[finaltrain.text_no_tag == '@USER'].count()

text_no_tag    2673
polarity       2673
length         2673
dtype: int64

In [65]:
finaltrain[finaltrain.text_no_tag == '@URL'].count()

text_no_tag    22
polarity       22
length         22
dtype: int64

In [66]:
finaltrain[finaltrain.text_no_tag == '@USER @URL'].count()

text_no_tag    70
polarity       70
length         70
dtype: int64

In [67]:
finaltrain[finaltrain.text_no_tag == '@URL @USER'].count()

text_no_tag    0
polarity       0
length         0
dtype: int64

In [68]:
finaltrain[finaltrain.length == 0].count()

text_no_tag    3
polarity       3
length         3
dtype: int64

In [77]:
finaltrain[finaltrain.length >140]

Unnamed: 0,text_no_tag,polarity,length
76,@USER my donkey is sensitive about such commen...,0,142
93,"pray for me please , the ex is threatening to ...",0,141
126,"@USER wah i can not see clip , must be el - st...",0,146
279,whinging . my client & boss do not understand ...,0,148
327,. and of course . i have access to my halo ...,0,143
348,"still sick . feeling a bit better , got some n...",0,143
400,"# woke up and was having an accident - "" it '...",0,142
407,"why to we , the only school in the world , be ...",0,144
464,"my bathtub drain is fired : it haz job do , ...",0,142
527,can not sleep again . face is kinda swollen . ...,0,144


In [78]:
finaltrain[finaltrain.length >160]

Unnamed: 0,text_no_tag,polarity,length
3434,@USER taylor 8 1 4 ce - - > #& $( #& $ ! ( @ #...,0,167
5626,mai thi communication system mï ¿ ½ ch ? a cï ...,0,166
63566,@USER th ? thï ¿ ½ anh ph ? i ng ? s ? m ? i t...,0,170
83829,chï ¿ ½g nï ¿ ½ nï ¿ ½y sis boa bi r ` xï ¿ ½ ...,0,171
100794,i am jealous . like really really really painf...,0,162
111204,b ? i vï ¿ ½ m hay ? ï ¿ ½ ~ cï ¿ ½i nh ? nh ...,0,171
112865,hic mï ¿ ½nh cï ¿ ½n to - do n ? a n ? a ch ?...,0,163
143677,twed twoses for you to enjoy . @ - ) - - @ -...,0,182
146754,work / / moving kids min sets / / capturing / ...,0,168
174712,heyy . thnxx . or . inviting . me . to . ur . ...,0,174


In [72]:
finaltrain[finaltrain.length >160].count()

text_no_tag    170
polarity       170
length         170
dtype: int64

Tweets slightly longer than 140 characters are still meaningful, so we only drop tweets with length > 160, which look noisy. We also drop tweets that consist solely of `@USER`, `@URL` or `@USER @URL`, and tweets with length == 0.

In [80]:
# Rows that carry no usable text: empty, over 160 characters, or made up only
# of the @USER / @URL placeholders.
bad_length = (finaltrain['length'] > 160) | (finaltrain['length'] == 0)
placeholder_only = finaltrain['text_no_tag'].isin(['@USER', '@URL', '@USER @URL'])

rows_to_drop = finaltrain.loc[bad_length | placeholder_only].index
finaltrain = finaltrain.drop(index=rows_to_drop)


In [81]:
len(finaltrain)

1596375

In [82]:
# Total number of rows removed by the whole cleaning pipeline, computed from
# the frames themselves so the figure stays correct when the notebook is re-run.
len(dftrain) - len(finaltrain)

3625

In [83]:
# Persist the cleaned training set. With pandas' default settings the row
# index is written too, as the first (unnamed) column of the CSV.
finaltrain.to_csv("finaltrain.csv")