# Cleaning test set

In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from time import time
from bs4 import BeautifulSoup
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

In [9]:
# Names for the six columns of the header-less CSV; only 'polarity' (0) and
# 'text' (5) are actually loaded.
columns = ['polarity', 'id', 'date', 'query_type', 'user', 'text']

dftest = pd.read_csv(
    'csv/testdata.manual.2009.06.14.csv',
    encoding='ISO-8859-1',
    header=None,
    names=columns,
    usecols=[0, 5],
)

In [10]:
# Preview the first five rows of the loaded test set.
dftest.head(n=5)

Unnamed: 0,polarity,text
0,4,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,Reading my kindle2... Love it... Lee childs i...
2,4,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,@kenburbary You'll love your Kindle2. I've had...
4,4,@mikefish Fair enough. But i have the Kindle2...


In [11]:
# Number of rows in the raw test set.
dftest.shape[0]

498

#### Verifying tweets' length

In [12]:
# Character count of every raw tweet.
dftest['length'] = dftest['text'].map(len)

In [13]:
# Raw tweets longer than 140 characters (at most 100 rows shown).
dftest.loc[dftest['length'] > 140].head(100)

Unnamed: 0,polarity,text,length
208,4,@BlondeBroad it's definitely under warranty &a...,143
209,2,"RT Look, Available !Amazon Kindle2 &amp; Kindl...",144
213,0,OMG - time warner f'ed up my internet install ...,142
227,0,pissed about at&amp;t's mid-contract upgrade p...,141
230,0,@springsingfiend @dvyers @sethdaggett @jlshack...,144
469,4,@cwong08 I have a Kindle2 (&amp; Sony PRS-500)...,144


In [14]:
# How many raw tweets exceed 140 characters.
int((dftest['length'] > 140).sum())

6

In [16]:
%%time
souped_tweets = []
for i in dftest['text']:
    soup = BeautifulSoup(i, 'lxml')
    souped = soup.get_text()
    souped_tweets.append(souped)

CPU times: user 91.8 ms, sys: 7.77 ms, total: 99.6 ms
Wall time: 98.4 ms


In [17]:
# Store the BeautifulSoup-extracted text next to the raw tweet.
dftest['bom_text'] = list(souped_tweets)

In [18]:
# Preview the frame now that 'length' and 'bom_text' have been added.
dftest.head(n=5)

Unnamed: 0,polarity,text,length,bom_text
0,4,@stellargirl I loooooooovvvvvveee my Kindle2. ...,111,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,Reading my kindle2... Love it... Lee childs i...,58,Reading my kindle2... Love it... Lee childs i...
2,4,"Ok, first assesment of the #kindle2 ...it fuck...",58,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,@kenburbary You'll love your Kindle2. I've had...,140,@kenburbary You'll love your Kindle2. I've had...
4,4,@mikefish Fair enough. But i have the Kindle2...,75,@mikefish Fair enough. But i have the Kindle2...


In [19]:
# Independent working copy holding only the extracted text and its label.
bom_test = dftest.loc[:, ['bom_text', 'polarity']].copy()

In [20]:
# First five rows of the working copy.
bom_test.head(n=5)

Unnamed: 0,bom_text,polarity
0,@stellargirl I loooooooovvvvvveee my Kindle2. ...,4
1,Reading my kindle2... Love it... Lee childs i...,4
2,"Ok, first assesment of the #kindle2 ...it fuck...",4
3,@kenburbary You'll love your Kindle2. I've had...,4
4,@mikefish Fair enough. But i have the Kindle2...,4


In [22]:
# Character count of each tweet after the BeautifulSoup pass.
bom_test['length'] = bom_test['bom_text'].map(len)

In [23]:
# Last five rows, including the new 'length' column.
bom_test.tail(n=5)

Unnamed: 0,bom_text,polarity,length
493,Ask Programming: LaTeX or InDesign?: submitted...,2,102
494,"On that note, I hate Word. I hate Pages. I hat...",0,125
495,Ahhh... back in a *real* text editing environm...,4,62
496,"Trouble in Iran, I see. Hmm. Iran. Iran so far...",0,94
497,Reading the tweets coming out of Iran... The w...,0,92


In [24]:
# Per-column count of tweets still longer than 140 characters.
bom_test.loc[bom_test['length'] > 140].count()

bom_text    0
polarity    0
length      0
dtype: int64

#### Applying Ekphrasis Library for cleaning dataset

- substitute URLs with tag `<url>`
- substitute emails with tag `<email>`
- substitute mentions with tag `<user>`
- normalize percent, money, phone, time, date and number expressions to their tags (all remaining tags are stripped in a later step)

In [25]:
%%time

text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"elongated", "repeated"},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated word
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

normalized_text = []

for s in bom_test['bom_text']:
    normalized_text.append(" ".join(text_processor.pre_process_doc(s)))
normalized_text_series = pd.Series(normalized_text)


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...
CPU times: user 5.45 s, sys: 411 ms, total: 5.86 s
Wall time: 5.86 s


In [26]:
# Show a bounded preview as the cell's last expression instead of printing the
# whole Series, which floods the notebook output.
normalized_text_series.head(20)

0      <user> i love <elongated> my kindle2 . not tha...
1      reading my kindle2 . <repeated> love it . <rep...
2      ok , first assesment of the kindle 2 . <repeat...
3      <user> you will love your kindle2 . i have had...
4      <user> fair enough . but i have the kindle2 an...
5      <user> no . it is too big . i am quite happy w...
6      fuck this economy . i hate aig and their non l...
7                         jquery is my new best friend .
8                                          loves twitter
9      how can you not love obama ? he makes jokes ab...
10     check this video out - - president obama at th...
11     <user> i firmly believe that obama / pelosi ha...
12     house correspondents dinner was last night who...
13     watchin espn . <repeated> jus seen this new ni...
14     dear nike , stop with the flywire . that shit ...
15     lebron best athlete of our generation , if not...
16     i was talking to this guy last night and he wa...
17                             

In [27]:
# Attach the normalized text positionally (as a plain list, so no index alignment).
bom_test['normalized_text'] = normalized_text_series.tolist()

In [28]:
# Compare the extracted text with its normalized form on the first five rows.
bom_test.head(n=5)

Unnamed: 0,bom_text,polarity,length,normalized_text
0,@stellargirl I loooooooovvvvvveee my Kindle2. ...,4,111,<user> i love <elongated> my kindle2 . not tha...
1,Reading my kindle2... Love it... Lee childs i...,4,58,reading my kindle2 . <repeated> love it . <rep...
2,"Ok, first assesment of the #kindle2 ...it fuck...",4,58,"ok , first assesment of the kindle 2 . <repeat..."
3,@kenburbary You'll love your Kindle2. I've had...,4,140,<user> you will love your kindle2 . i have had...
4,@mikefish Fair enough. But i have the Kindle2...,4,75,<user> fair enough . but i have the kindle2 an...


In [29]:
# Row count after normalization.
bom_test.shape[0]

498

In [30]:
# Fresh frame with only the normalized text and the label.
df_len_tweets = bom_test.loc[:, ['normalized_text', 'polarity']].copy()
df_len_tweets.head(n=5)

Unnamed: 0,normalized_text,polarity
0,<user> i love <elongated> my kindle2 . not tha...,4
1,reading my kindle2 . <repeated> love it . <rep...,4
2,"ok , first assesment of the kindle 2 . <repeat...",4
3,<user> you will love your kindle2 . i have had...,4
4,<user> fair enough . but i have the kindle2 an...,4


In [31]:
# Character count of the normalized text (tags such as <user> are included).
df_len_tweets['length'] = df_len_tweets['normalized_text'].map(len)
df_len_tweets.head(n=5)

Unnamed: 0,normalized_text,polarity,length
0,<user> i love <elongated> my kindle2 . not tha...,4,116
1,reading my kindle2 . <repeated> love it . <rep...,4,78
2,"ok , first assesment of the kindle 2 . <repeat...",4,79
3,<user> you will love your kindle2 . i have had...,4,148
4,<user> fair enough . but i have the kindle2 an...,4,78


In [32]:
# Longest normalized tweet, in characters.
df_len_tweets['length'].max()

172

In [33]:
%%time
user_tweets = []

for t in df_len_tweets.normalized_text:
    user_tweets.append(t.replace('<user>', '@USER'))

df_len_tweets['text_no_user'] = [t for t in user_tweets]


CPU times: user 934 µs, sys: 58 µs, total: 992 µs
Wall time: 956 µs


In [34]:
# Check the <user> -> @USER substitution on the first five rows.
df_len_tweets.head(n=5)

Unnamed: 0,normalized_text,polarity,length,text_no_user
0,<user> i love <elongated> my kindle2 . not tha...,4,116,@USER i love <elongated> my kindle2 . not that...
1,reading my kindle2 . <repeated> love it . <rep...,4,78,reading my kindle2 . <repeated> love it . <rep...
2,"ok , first assesment of the kindle 2 . <repeat...",4,79,"ok , first assesment of the kindle 2 . <repeat..."
3,<user> you will love your kindle2 . i have had...,4,148,@USER you will love your kindle2 . i have had ...
4,<user> fair enough . but i have the kindle2 an...,4,78,@USER fair enough . but i have the kindle2 and...


In [35]:
# Rewrite the <url> tag as @URL, mirroring the @USER substitution.
url_tweets = [
    tweet.replace('<url>', '@URL')
    for tweet in df_len_tweets.text_no_user
]

df_len_tweets['text_no_url'] = url_tweets

In [36]:
# Check the <url> -> @URL substitution on the first five rows.
df_len_tweets.head(n=5)

Unnamed: 0,normalized_text,polarity,length,text_no_user,text_no_url
0,<user> i love <elongated> my kindle2 . not tha...,4,116,@USER i love <elongated> my kindle2 . not that...,@USER i love <elongated> my kindle2 . not that...
1,reading my kindle2 . <repeated> love it . <rep...,4,78,reading my kindle2 . <repeated> love it . <rep...,reading my kindle2 . <repeated> love it . <rep...
2,"ok , first assesment of the kindle 2 . <repeat...",4,79,"ok , first assesment of the kindle 2 . <repeat...","ok , first assesment of the kindle 2 . <repeat..."
3,<user> you will love your kindle2 . i have had...,4,148,@USER you will love your kindle2 . i have had ...,@USER you will love your kindle2 . i have had ...
4,<user> fair enough . but i have the kindle2 an...,4,78,@USER fair enough . but i have the kindle2 and...,@USER fair enough . but i have the kindle2 and...


In [37]:
%%time
notag_tweets = []
for t in df_len_tweets.text_no_url:
    #print(df_len_tweets.text[i].replace(s, ''))
    #notag_tweets.append(df_url_tweets.text[i].replace(r'(<[aA-zZ]+>)', ''))
    notag_tweets.append(re.sub(r'(<[aA-zZ]+>)', '',t))

df_len_tweets['text_no_tag'] = [t for t in notag_tweets]

CPU times: user 1.47 ms, sys: 38 µs, total: 1.51 ms
Wall time: 1.49 ms


In [38]:
# Check the tag removal on the first five rows.
df_len_tweets.head(n=5)

Unnamed: 0,normalized_text,polarity,length,text_no_user,text_no_url,text_no_tag
0,<user> i love <elongated> my kindle2 . not tha...,4,116,@USER i love <elongated> my kindle2 . not that...,@USER i love <elongated> my kindle2 . not that...,@USER i love my kindle2 . not that the is co...
1,reading my kindle2 . <repeated> love it . <rep...,4,78,reading my kindle2 . <repeated> love it . <rep...,reading my kindle2 . <repeated> love it . <rep...,reading my kindle2 . love it . lee childs is...
2,"ok , first assesment of the kindle 2 . <repeat...",4,79,"ok , first assesment of the kindle 2 . <repeat...","ok , first assesment of the kindle 2 . <repeat...","ok , first assesment of the kindle 2 . it fuc..."
3,<user> you will love your kindle2 . i have had...,4,148,@USER you will love your kindle2 . i have had ...,@USER you will love your kindle2 . i have had ...,@USER you will love your kindle2 . i have had ...
4,<user> fair enough . but i have the kindle2 an...,4,78,@USER fair enough . but i have the kindle2 and...,@USER fair enough . but i have the kindle2 and...,@USER fair enough . but i have the kindle2 and...


In [39]:
# Final frame: fully cleaned text plus label, detached from the intermediate columns.
finaltest = df_len_tweets.loc[:, ['text_no_tag', 'polarity']].copy()

In [40]:
# First five rows of the final frame.
finaltest.head(n=5)

Unnamed: 0,text_no_tag,polarity
0,@USER i love my kindle2 . not that the is co...,4
1,reading my kindle2 . love it . lee childs is...,4
2,"ok , first assesment of the kindle 2 . it fuc...",4
3,@USER you will love your kindle2 . i have had ...,4
4,@USER fair enough . but i have the kindle2 and...,4


In [41]:
# Character count of the fully cleaned text.
finaltest['length'] = finaltest['text_no_tag'].map(len)
finaltest.head(n=5)

Unnamed: 0,text_no_tag,polarity,length
0,@USER i love my kindle2 . not that the is co...,4,91
1,reading my kindle2 . love it . lee childs is...,4,58
2,"ok , first assesment of the kindle 2 . it fuc...",4,59
3,@USER you will love your kindle2 . i have had ...,4,140
4,@USER fair enough . but i have the kindle2 and...,4,70


In [43]:
# Count tweets made up solely of the @USER placeholder. Tag removal leaves the
# neighbouring spaces behind, so trim before comparing; an exact match on the
# raw string would miss values such as '@USER '.
finaltest[finaltest.text_no_tag.str.strip() == '@USER'].count()

text_no_tag    0
polarity       0
length         0
dtype: int64

In [44]:
# Count tweets made up solely of the @URL placeholder. Tag removal leaves the
# neighbouring spaces behind, so trim before comparing; an exact match on the
# raw string would miss values such as '@URL '.
finaltest[finaltest.text_no_tag.str.strip() == '@URL'].count()

text_no_tag    0
polarity       0
length         0
dtype: int64

In [45]:
# Count tweets containing nothing but '@USER @URL'. Whitespace runs are collapsed
# (and the ends trimmed) first, because removed tags leave doubled or trailing
# spaces that would defeat an exact string comparison.
finaltest[finaltest.text_no_tag.str.split().str.join(' ') == '@USER @URL'].count()

text_no_tag    0
polarity       0
length         0
dtype: int64

In [46]:
# Count tweets containing nothing but '@URL @USER'. Whitespace runs are collapsed
# (and the ends trimmed) first, because removed tags leave doubled or trailing
# spaces that would defeat an exact string comparison.
finaltest[finaltest.text_no_tag.str.split().str.join(' ') == '@URL @USER'].count()

text_no_tag    0
polarity       0
length         0
dtype: int64

In [47]:
# Count tweets with no content left. A tweet whose tokens were all tags ends up
# as spaces only (length > 0), so test for blank text rather than length == 0.
finaltest[finaltest.text_no_tag.str.strip() == ''].count()

text_no_tag    0
polarity       0
length         0
dtype: int64

In [48]:
# True if any cell of the final frame is missing.
finaltest.isna().to_numpy().any()

False

In [49]:
# Column-wise maxima. For 'text_no_tag' this is the greatest string in sort
# order, not the longest tweet; the longest tweet's size is the 'length' entry.
finaltest.max(axis=0)

text_no_tag    zomg !  i have a g2 ! 
polarity                            4
length                            150
dtype: object

In [50]:
# Cleaned tweets that are longer than 140 characters.
finaltest.loc[finaltest['length'] > 140]

Unnamed: 0,text_no_tag,polarity,length
11,@USER i firmly believe that obama / pelosi hav...,0,143
29,@USER i current use the nikon d90 and love it ...,4,145
39,"@USER : has been a bit crazy , with steep lear...",4,141
92,wish i could catch every mosquito in the world...,0,145
103,""" the republican party is a bunch of anti - ab...",0,143
123,viral marketing fail . this acia pills brand o...,0,143
138,it ' s unfortunate that after the stimulus pla...,0,142
162,my wrist still hurts . i have to get it looked...,4,147
164,"the dentist lied ! "" u won ' t feel any discom...",0,148
205,warren buffet became ( for a time ) the riches...,4,142


In [51]:
# Row(s) holding the longest cleaned tweet. The maximum is computed rather than
# hard-coded, so the cell stays correct if the cleaning steps change the lengths.
finaltest[finaltest.length == finaltest.length.max()]

Unnamed: 0,text_no_tag,polarity,length
213,omg - time warner f ' ed up my internet instal...,0,150


In [53]:
# Full text of the longest cleaned tweet, looked up by idxmax() (first row with
# the maximum length) instead of a hard-coded row label.
finaltest.loc[finaltest.length.idxmax(), 'text_no_tag']

"omg - time warner f ' ed up my internet install - instead of today its now next saturday - another week w / o internet ! & $* ehfa ^ v9fhg [ * # fml ."

Since we want to perform binary classification (predicting negative or positive tweets), we drop the tweets with polarity == 2 (neutral).

In [55]:
# Keep every row whose polarity is not 2 (neutral).
finaltest = finaltest.loc[finaltest['polarity'] != 2].copy()

In [56]:
# Rows remaining after the neutral tweets were dropped.
finaltest.shape[0]

359

In [57]:
# Number of neutral tweets removed, derived from the frames rather than from
# numbers copied out of earlier outputs.
len(dftest) - len(finaltest)

139

In [58]:
# Persist the cleaned test set. The original row labels are written as the first
# (unnamed) CSV column, and the helper 'length' column is saved as well.
finaltest.to_csv("finaltest.csv", index=True)