# Cleaning validation set

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from time import time
from bs4 import BeautifulSoup
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

In [2]:
# Names handed to read_csv; with usecols=[3, 4] only 'label' and 'tweet'
# end up in the resulting frame.
columns = ['mean pos', 'mean neg', 'diff', 'label', 'tweet']

# header=0 tells pandas the first line of the file is a header row, which is
# replaced by the names above.
dftrain = pd.read_csv(
    'csv/vader_validation_set.csv',
    encoding='utf-8',
    header=0,
    names=columns,
    usecols=[3, 4],
)

In [3]:
# Preview the first five loaded rows.
dftrain.head(n=5)

Unnamed: 0,label,tweet
0,1,i didnt mean knee high I ment in lengt it goes...
1,1,RT @cakedream: Ppl who say sex is over rated i...
2,1,RT @thelovestrings: You don't have to ruin the...
3,1,"Listening to the ""New Age"" station on @Slacker..."
4,1,@Mrhilton1985 Welcome to Twitter xx


In [4]:
# Keep only the first 798 rows. The explicit .copy() makes dftrain an
# independent frame, so the column assignments in later cells do not write
# into a slice of the loaded frame (which triggers SettingWithCopyWarning).
# NOTE(review): 798 is a hard-coded cut-off -- confirm where it comes from.
dftrain = dftrain.iloc[:798].copy()

In [5]:
# Check the last five rows kept after the cut.
dftrain.tail(n=5)

Unnamed: 0,label,tweet
793,0,RT @MrSealthedeal: #RaiseUrHandIf you used to ...
794,0,@2STix_iNaBUN nd you been callin me outta my n...
795,0,That movie was crazy!!! Depressing an action!!!!
796,0,:( oh no! :( god.. :'( little layla! RT @Layla...
797,0,Never by tea at Schiphol airport


#### Verifying tweets' length

In [6]:
# Character count of each raw tweet.
dftrain['length'] = dftrain['tweet'].map(len)

In [7]:
# Raw tweets longer than 140 characters (at most 100 rows displayed).
dftrain.loc[dftrain['length'] > 140].head(100)

Unnamed: 0,label,tweet,length
1,1,RT @cakedream: Ppl who say sex is over rated i...,148
6,1,RT @RockinGreenSoap: I Flip(in) Love @rockingr...,145
23,1,RT @Kelsey_Grammer: So GOOGLE BUZZ is a hot to...,146
36,1,RT @MyTrainerBob: I'm trying to get people to ...,154
56,1,RT @rachellehoude: I suggest we all listen to ...,156
93,1,RT @ParisHilton: Happy Valentines Day Everyone...,155
98,1,RT @jaredleto: thank u! RT @IAmCattSadler: Mos...,149
356,1,RT @FreakingDUMMY: RT @tbreezyworld LADY GAGA ...,158
425,1,"RT @Greer_Grammer: Will: ""isn't it sad that we...",152
470,1,RT @madisonviolet: Madison Violet just won the...,147


In [8]:
# Number of raw tweets longer than 140 characters.
int((dftrain['length'] > 140).sum())

16

In [9]:
# Preview the frame with the new 'length' column.
dftrain.head(n=5)

Unnamed: 0,label,tweet,length
0,1,i didnt mean knee high I ment in lengt it goes...,93
1,1,RT @cakedream: Ppl who say sex is over rated i...,148
2,1,RT @thelovestrings: You don't have to ruin the...,75
3,1,"Listening to the ""New Age"" station on @Slacker...",79
4,1,@Mrhilton1985 Welcome to Twitter xx,35


#### Applying the Ekphrasis library to clean the dataset

- substitute URLs with tag `<url>`
- substitute emails with tag `<email>`
- substitute mentions with tag `<user>`
- substitute percent, money, phone, time, date and number expressions with their tags (e.g. `<number>`); these tags are removed in a later step

In [10]:
%%time

text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"elongated", "repeated"},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated word
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

normalized_text = []

for s in dftrain['tweet']:
    normalized_text.append(" ".join(text_processor.pre_process_doc(s)))
normalized_text_series = pd.Series(normalized_text)


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...
CPU times: user 5.55 s, sys: 429 ms, total: 5.97 s
Wall time: 5.98 s


In [11]:
# Show a bounded preview as the cell's value instead of print()-ing the
# whole Series: the notebook output stays short and uses the normal Out[] repr.
normalized_text_series.head(20)

0      i didnt mean knee high i ment in lengt it goes...
1      rt <user> : ppl who say sex is over rated is n...
2      rt <user> : you do not have to ruin the beauti...
3      listening to the " new age " station on <user>...
4                           <user> welcome to twitter xx
5                               <user> so twitpic it lol
6      rt <user> : i flip ( in ) love <user> ! follow...
7      rt <number> + awesome twitter tips to become a...
8      <user> i was thinking that i wanna make a cake...
9      <url> check it out ! in the middle of london !...
10                                             mountains
11       <user> <user> hi there ! nice to see you back !
12                            rt <user> : i ? new york !
13     <user> my coffin has already been reshaped as ...
14          <user> i hope so dude hahaha < <number> . so
15     learning about water heaters - ah the joys of ...
16     rt <user> : " i lift my eyes up to you , to yo...
17     <user> what disneyworld 

In [12]:
# Attach the normalised text by position (a plain list is assigned, so the
# Series index plays no part in the alignment).
dftrain['normalized_text'] = normalized_text_series.tolist()

In [13]:
# Compare raw and normalised text side by side for the first five rows.
dftrain.head(n=5)

Unnamed: 0,label,tweet,length,normalized_text
0,1,i didnt mean knee high I ment in lengt it goes...,93,i didnt mean knee high i ment in lengt it goes...
1,1,RT @cakedream: Ppl who say sex is over rated i...,148,rt <user> : ppl who say sex is over rated is n...
2,1,RT @thelovestrings: You don't have to ruin the...,75,rt <user> : you do not have to ruin the beauti...
3,1,"Listening to the ""New Age"" station on @Slacker...",79,"listening to the "" new age "" station on <user>..."
4,1,@Mrhilton1985 Welcome to Twitter xx,35,<user> welcome to twitter xx


In [15]:
# Independent working frame holding only the normalised text and its label.
df_len_tweets = dftrain.loc[:, ['normalized_text', 'label']].copy()
df_len_tweets.head(n=5)

Unnamed: 0,normalized_text,label
0,i didnt mean knee high i ment in lengt it goes...,1
1,rt <user> : ppl who say sex is over rated is n...,1
2,rt <user> : you do not have to ruin the beauti...,1
3,"listening to the "" new age "" station on <user>...",1
4,<user> welcome to twitter xx,1


In [16]:
# Character count of each normalised tweet.
df_len_tweets['length'] = df_len_tweets['normalized_text'].map(len)
df_len_tweets.head(n=5)

Unnamed: 0,normalized_text,label,length
0,i didnt mean knee high i ment in lengt it goes...,1,94
1,rt <user> : ppl who say sex is over rated is n...,1,159
2,rt <user> : you do not have to ruin the beauti...,1,68
3,"listening to the "" new age "" station on <user>...",1,54
4,<user> welcome to twitter xx,1,28


In [17]:
# Longest normalised tweet, in characters.
df_len_tweets['length'].max()

197

In [18]:
%%time
user_tweets = []

for t in df_len_tweets.normalized_text:
    user_tweets.append(t.replace('<user>', '@USER'))

df_len_tweets['text_no_user'] = [t for t in user_tweets]


CPU times: user 1.06 ms, sys: 53 µs, total: 1.11 ms
Wall time: 1.1 ms


In [19]:
# Inspect the '@USER' substitution on the first five rows.
df_len_tweets.head(n=5)

Unnamed: 0,normalized_text,label,length,text_no_user
0,i didnt mean knee high i ment in lengt it goes...,1,94,i didnt mean knee high i ment in lengt it goes...
1,rt <user> : ppl who say sex is over rated is n...,1,159,rt @USER : ppl who say sex is over rated is nt...
2,rt <user> : you do not have to ruin the beauti...,1,68,rt @USER : you do not have to ruin the beautif...
3,"listening to the "" new age "" station on <user>...",1,54,"listening to the "" new age "" station on @USER ..."
4,<user> welcome to twitter xx,1,28,@USER welcome to twitter xx


In [20]:
# Rewrite the '<url>' placeholder as '@URL' so that it no longer has the
# '<...>' shape that the tag-removal regex in a later cell deletes.
url_tweets = [
    text.replace('<url>', '@URL')
    for text in df_len_tweets['text_no_user']
]

df_len_tweets['text_no_url'] = url_tweets

In [21]:
# Inspect the '@URL' substitution on the first five rows.
df_len_tweets.head(n=5)

Unnamed: 0,normalized_text,label,length,text_no_user,text_no_url
0,i didnt mean knee high i ment in lengt it goes...,1,94,i didnt mean knee high i ment in lengt it goes...,i didnt mean knee high i ment in lengt it goes...
1,rt <user> : ppl who say sex is over rated is n...,1,159,rt @USER : ppl who say sex is over rated is nt...,rt @USER : ppl who say sex is over rated is nt...
2,rt <user> : you do not have to ruin the beauti...,1,68,rt @USER : you do not have to ruin the beautif...,rt @USER : you do not have to ruin the beautif...
3,"listening to the "" new age "" station on <user>...",1,54,"listening to the "" new age "" station on @USER ...","listening to the "" new age "" station on @USER ..."
4,<user> welcome to twitter xx,1,28,@USER welcome to twitter xx,@USER welcome to twitter xx


In [22]:
%%time
notag_tweets = []
for t in df_len_tweets.text_no_url:
    #print(df_len_tweets.text[i].replace(s, ''))
    #notag_tweets.append(df_url_tweets.text[i].replace(r'(<[aA-zZ]+>)', ''))
    notag_tweets.append(re.sub(r'(<[aA-zZ]+>)', '',t))

df_len_tweets['text_no_tag'] = [t for t in notag_tweets]

CPU times: user 1.98 ms, sys: 41 µs, total: 2.02 ms
Wall time: 2 ms


In [23]:
# Inspect the tag-free text on the first five rows.
df_len_tweets.head(n=5)

Unnamed: 0,normalized_text,label,length,text_no_user,text_no_url,text_no_tag
0,i didnt mean knee high i ment in lengt it goes...,1,94,i didnt mean knee high i ment in lengt it goes...,i didnt mean knee high i ment in lengt it goes...,i didnt mean knee high i ment in lengt it goes...
1,rt <user> : ppl who say sex is over rated is n...,1,159,rt @USER : ppl who say sex is over rated is nt...,rt @USER : ppl who say sex is over rated is nt...,rt @USER : ppl who say sex is over rated is nt...
2,rt <user> : you do not have to ruin the beauti...,1,68,rt @USER : you do not have to ruin the beautif...,rt @USER : you do not have to ruin the beautif...,rt @USER : you do not have to ruin the beautif...
3,"listening to the "" new age "" station on <user>...",1,54,"listening to the "" new age "" station on @USER ...","listening to the "" new age "" station on @USER ...","listening to the "" new age "" station on @USER ..."
4,<user> welcome to twitter xx,1,28,@USER welcome to twitter xx,@USER welcome to twitter xx,@USER welcome to twitter xx


In [84]:
# Final frame: cleaned text plus label, detached from the working frame.
finalvalid = df_len_tweets.loc[:, ['text_no_tag', 'label']].copy()

In [85]:
# Preview the first five rows of the final frame.
finalvalid.head(n=5)

Unnamed: 0,text_no_tag,label
0,i didnt mean knee high i ment in lengt it goes...,1
1,rt @USER : ppl who say sex is over rated is nt...,1
2,rt @USER : you do not have to ruin the beautif...,1
3,"listening to the "" new age "" station on @USER ...",1
4,@USER welcome to twitter xx,1


In [86]:
# Character count of each cleaned tweet.
finalvalid['length'] = finalvalid['text_no_tag'].map(len)
finalvalid.head(n=5)

Unnamed: 0,text_no_tag,label,length
0,i didnt mean knee high i ment in lengt it goes...,1,94
1,rt @USER : ppl who say sex is over rated is nt...,1,148
2,rt @USER : you do not have to ruin the beautif...,1,67
3,"listening to the "" new age "" station on @USER ...",1,52
4,@USER welcome to twitter xx,1,27


In [87]:
# Row count of the final frame.
finalvalid.shape[0]

798

In [88]:
# Per-column count of rows whose cleaned text is exactly '@USER'.
# NOTE(review): this is an exact match, so text with extra surrounding
# whitespace is not counted -- confirm that is intended.
finalvalid.loc[finalvalid['text_no_tag'].eq('@USER')].count()

text_no_tag    3
label          3
length         3
dtype: int64

In [89]:
# Per-column count of rows whose cleaned text is exactly '@URL'.
finalvalid.loc[finalvalid['text_no_tag'].eq('@URL')].count()

text_no_tag    0
label          0
length         0
dtype: int64

In [90]:
# Per-column count of rows whose cleaned text is exactly '@USER @URL'.
finalvalid.loc[finalvalid['text_no_tag'].eq('@USER @URL')].count()

text_no_tag    0
label          0
length         0
dtype: int64

In [91]:
# Per-column count of rows whose cleaned text is exactly '@URL @USER'.
finalvalid.loc[finalvalid['text_no_tag'].eq('@URL @USER')].count()

text_no_tag    0
label          0
length         0
dtype: int64

In [92]:
# Per-column count of rows whose cleaned text has zero characters.
# NOTE(review): whitespace-only text has length > 0 and is not caught here.
finalvalid.loc[finalvalid['length'].eq(0)].count()

text_no_tag    0
label          0
length         0
dtype: int64

In [93]:
# True if any cell of the final frame is missing.
finalvalid.isna().to_numpy().any()

False

In [94]:
# Column-wise maxima; for the text column this is the max under string
# ordering, not the longest tweet -- the 'length' row is the relevant one.
finalvalid.max(axis=0)

text_no_tag    yung berg & lil mama would be a perfect couple...
label                                                          1
length                                                       162
dtype: object

In [95]:
# Cleaned tweets that are still longer than 140 characters.
finalvalid.loc[finalvalid['length'] > 140]

Unnamed: 0,text_no_tag,label,length
1,rt @USER : ppl who say sex is over rated is nt...,1,148
36,rt @USER : i am trying to get people to try a ...,1,153
39,there is something about the sound of snow tha...,1,146
47,@USER ur indeed the supertsar . the real life...,1,143
93,rt @USER : happy valentines day everyone ! hop...,1,149
148,thanks to my dealers who joined us for our web...,1,141
193,a big thanks to everyone that came by the twir...,1,142
194,if ya can not support ur own local artist . ho...,1,145
269,"i pray each day ~ peace ~ to come , i know i ...",1,151
286,these pralines were ordered a business meetin...,1,142


In [97]:
# Last look at the first five rows before export.
finalvalid.head(n=5)

Unnamed: 0,text_no_tag,label,length
0,i didnt mean knee high i ment in lengt it goes...,1,94
1,rt @USER : ppl who say sex is over rated is nt...,1,148
2,rt @USER : you do not have to ruin the beautif...,1,67
3,"listening to the "" new age "" station on @USER ...",1,52
4,@USER welcome to twitter xx,1,27


In [100]:
# Row count right before export.
finalvalid.shape[0]

798

In [101]:
# Write the cleaned set to disk. index=True is the default, spelled out here:
# the row index is written as the first, unnamed column of the file.
finalvalid.to_csv("finalvalid.csv", index=True)