In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [54]:
pd.set_option('display.max_rows', 500)

In [56]:
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### 0. Importing data

In [57]:
train_df = pd.read_csv("../datasets/tweet-sentiment-extraction/train.csv")
test_df = pd.read_csv("../datasets/tweet-sentiment-extraction/test.csv")

### 1. Data Exploration

In [58]:
# Understanding the data at high level

In [59]:
test_df

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive
...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative
3530,416863ce47,All alone in this old house again. Thanks for...,positive
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive


In [60]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [61]:
train_df.describe()

Unnamed: 0,textID,text,selected_text,sentiment
count,27481,27480,27480,27481
unique,27481,27480,22463,3
top,cb774db0d1,"I`d have responded, if I were going",good,neutral
freq,1,1,199,11118


In [62]:
train_df["text"].describe()

count                                    27480
unique                                   27480
top        I`d have responded, if I were going
freq                                         1
Name: text, dtype: object

In [63]:
train_df.shape

(27481, 4)

In [64]:
## Data imbalance check
train_df.sentiment.value_counts()

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [65]:
## check text column null row, 
train_df["text"].isnull().values.any()

True

In [66]:
train_df["text"].isnull().sum()

1

In [67]:
# index
train_df[train_df["text"].isnull()].index

Int64Index([314], dtype='int64')

In [68]:
train_df.iloc[314, :]
# make sense to keep this data 

textID           fdb77c3752
text                    NaN
selected_text           NaN
sentiment           neutral
Name: 314, dtype: object

In [69]:
train_df["text"].fillna(value="", inplace=True)
train_df["selected_text"].fillna(value="", inplace=True)

In [70]:
train_df.iloc[314, :]

textID           fdb77c3752
text                       
selected_text              
sentiment           neutral
Name: 314, dtype: object

In [71]:
## adding new column with additional info
#train_df["text_len"] = train_df["text"].apply(len)

In [72]:
## does text_len have any correlation with the target? if yes, would be valuable to include as a feature


In [73]:
#label_encode = {"positive": 1, "neutral": 0, "negative": -1}
#train_df["sentiment"] = train_df.sentiment.apply(lambda row: label_encode[row])

In [74]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


### 3. Data Preprocessing

In [76]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [78]:
train_df.isnull().sum()

textID           0
text             0
selected_text    0
sentiment        0
dtype: int64

In [79]:
train_df.shape, test_df.shape

((27481, 4), (3534, 3))

### 1. Text Tokenization

In [40]:
from nltk.tokenize.treebank import TreebankWordTokenizer

In [41]:
def df_tokenization(s):
 return TreebankWordTokenizer().tokenize(s)

In [42]:
train_df["token"] = train_df["text"].apply(lambda x: df_tokenization(x))

In [43]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment,token
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"[I`d, have, responded, ,, if, I, were, going]"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,"[Sooo, SAD, I, will, miss, you, here, in, San,..."
2,088c60f138,my boss is bullying me...,bullying me,negative,"[my, boss, is, bullying, me, ...]"
3,9642c003ef,what interview! leave me alone,leave me alone,negative,"[what, interview, !, leave, me, alone]"
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"[Sons, of, ****, ,, why, couldn`t, they, put, ..."
...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,"[wish, we, could, come, see, u, on, Denver, hu..."
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,"[I`ve, wondered, about, rake, to., The, client..."
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,"[Yay, good, for, both, of, you., Enjoy, the, b..."
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,"[But, it, was, worth, it, ****, .]"


### 2. Token normalization

In [44]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def df_lemmatizer(s_list):
    out_ll = []
    for s in s_list:
        out_ll.append(lemmatizer.lemmatize(s))
    return out_ll

In [45]:
train_df["token_norm"] = train_df["token"].apply(lambda x: df_lemmatizer(x))

In [46]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"[I`d, have, responded, ,, if, I, were, going]","[I`d, have, responded, ,, if, I, were, going]"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,"[Sooo, SAD, I, will, miss, you, here, in, San,...","[Sooo, SAD, I, will, miss, you, here, in, San,..."
2,088c60f138,my boss is bullying me...,bullying me,negative,"[my, boss, is, bullying, me, ...]","[my, bos, is, bullying, me, ...]"
3,9642c003ef,what interview! leave me alone,leave me alone,negative,"[what, interview, !, leave, me, alone]","[what, interview, !, leave, me, alone]"
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"[Sons, of, ****, ,, why, couldn`t, they, put, ...","[Sons, of, ****, ,, why, couldn`t, they, put, ..."
...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,"[wish, we, could, come, see, u, on, Denver, hu...","[wish, we, could, come, see, u, on, Denver, hu..."
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,"[I`ve, wondered, about, rake, to., The, client...","[I`ve, wondered, about, rake, to., The, client..."
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,"[Yay, good, for, both, of, you., Enjoy, the, b...","[Yay, good, for, both, of, you., Enjoy, the, b..."
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,"[But, it, was, worth, it, ****, .]","[But, it, wa, worth, it, ****, .]"


In [47]:
train_df[train_df["token"] != train_df["token_norm"]].sample(10)

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm
17355,c861f23a07,Ugh i can`t sleep and it`s reallyy gettin to m...,bad,negative,"[Ugh, i, can`t, sleep, and, it`s, reallyy, get...","[Ugh, i, can`t, sleep, and, it`s, reallyy, get..."
17675,865244d92d,Watchin tyra bored like always my stomach hurts,my stomach hurts,negative,"[Watchin, tyra, bored, like, always, my, stoma...","[Watchin, tyra, bored, like, always, my, stoma..."
22369,d25cfac9d5,"Nina was on my lap, but just decided to jump off","Nina was on my lap, but just decided to jump off",neutral,"[Nina, was, on, my, lap, ,, but, just, decided...","[Nina, wa, on, my, lap, ,, but, just, decided,..."
7542,7620ab849a,in Singapore! its so warm over here! Having a ...,blast,positive,"[in, Singapore, !, its, so, warm, over, here, ...","[in, Singapore, !, it, so, warm, over, here, !..."
17385,35878b750c,I googled 'engagement rings' & this is the EX...,* love,positive,"[I, googled, 'engagement, rings, ', &, this, i...","[I, googled, 'engagement, ring, ', &, this, is..."
12870,77b1f297a3,"thx, I was aware, 2 day festival -multi ban...",I can`t do any mor,negative,"[thx, ,, I, was, aware, ,, 2, day, festival, -...","[thx, ,, I, wa, aware, ,, 2, day, festival, -m..."
2138,56e16709a4,_**** not as yet... i would just like to know ...,hopefuly,positive,"[_****, not, as, yet, ..., i, would, just, lik...","[_****, not, a, yet, ..., i, would, just, like..."
10112,5039bf5cc6,_creek No worries - thank google! There`s noth...,No worries - thank google!,positive,"[_creek, No, worries, -, thank, google, !, The...","[_creek, No, worry, -, thank, google, !, There..."
11378,b17e92da44,"the music in the trailer was terrible imo, ch...",terrible,negative,"[the, music, in, the, trailer, was, terrible, ...","[the, music, in, the, trailer, wa, terrible, i..."
4869,c34e27df82,Is sad when people`s phones are dead,Is sad wh,negative,"[Is, sad, when, people`s, phones, are, dead]","[Is, sad, when, people`s, phone, are, dead]"


In [48]:
from nltk.stem.porter import *

In [49]:
stemmer = PorterStemmer()

In [50]:
def df_stemmer(s_list):
    return [stemmer.stem(plural) for plural in s_list]

In [51]:
train_df["token_norm_stem"] = train_df["token"].apply(lambda x: df_stemmer(x))

In [52]:
train_df[train_df["token"] != train_df["token_norm"]].sample(10)

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm,token_norm_stem
14502,3e04dbc3b2,Anybody have advice on who to use for printing...,Anybody have advice on who to use for printing...,neutral,"[Anybody, have, advice, on, who, to, use, for,...","[Anybody, have, advice, on, who, to, use, for,...","[anybodi, have, advic, on, who, to, use, for, ..."
12195,d06b41fef3,"To all the gorgeous moms out there, Happy Mom`...",Happy Mom`s Daaay!,positive,"[To, all, the, gorgeous, moms, out, there, ,, ...","[To, all, the, gorgeous, mom, out, there, ,, H...","[to, all, the, gorgeou, mom, out, there, ,, ha..."
7704,458a36f95d,DANG i want to be on the beach late at night....,best.,positive,"[DANG, i, want, to, be, on, the, beach, late, ...","[DANG, i, want, to, be, on, the, beach, late, ...","[dang, i, want, to, be, on, the, beach, late, ..."
26463,8731c63d84,I wanna see that movie 'Keith' with JMcCartney...,I wanna see that movie 'Keith' with JMcCartney...,neutral,"[I, wan, na, see, that, movie, 'Keith, ', with...","[I, wan, na, see, that, movie, 'Keith, ', with...","[i, wan, na, see, that, movi, 'keith, ', with,..."
22094,c225e9d712,"hows the peas doin? if you meet them, tell th...","hows the peas doin? if you meet them, tell the...",neutral,"[hows, the, peas, doin, ?, if, you, meet, them...","[hows, the, pea, doin, ?, if, you, meet, them,...","[how, the, pea, doin, ?, if, you, meet, them, ..."
18578,b73bd70b63,bored well its not like any one can see this i...,bored,negative,"[bored, well, its, not, like, any, one, can, s...","[bored, well, it, not, like, any, one, can, se...","[bore, well, it, not, like, ani, one, can, see..."
6202,698b6e5aef,"I want tuna & salmon sashimi, B.C. rolls and d...","I want tuna & salmon sashimi, B.C. rolls and d...",neutral,"[I, want, tuna, &, salmon, sashimi, ,, B.C., r...","[I, want, tuna, &, salmon, sashimi, ,, B.C., r...","[i, want, tuna, &, salmon, sashimi, ,, b.c., r..."
4456,ac7475a944,"Yeah, yeah. Less #degenerate than current occ...","Yeah, yeah. Less #degenerate than current occu...",neutral,"[Yeah, ,, yeah., Less, #, degenerate, than, cu...","[Yeah, ,, yeah., Less, #, degenerate, than, cu...","[yeah, ,, yeah., less, #, degener, than, curre..."
26579,3484dd8b5e,"was mostly sick when she went to bed, but woke...",dead,negative,"[was, mostly, sick, when, she, went, to, bed, ...","[wa, mostly, sick, when, she, went, to, bed, ,...","[wa, mostli, sick, when, she, went, to, bed, ,..."
2134,82f88a3667,Trying to recover photos after a problem betwe...,Trying to recover photos after a problem betwe...,negative,"[Trying, to, recover, photos, after, a, proble...","[Trying, to, recover, photo, after, a, problem...","[tri, to, recov, photo, after, a, problem, bet..."


In [222]:
# dropping stemming 

train_df.drop(["token_norm_stem"], axis=1, inplace=True)

In [223]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"[I`d, have, responded, ,, if, I, were, going]","[I`d, have, responded, ,, if, I, were, going]"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,"[Sooo, SAD, I, will, miss, you, here, in, San,...","[Sooo, SAD, I, will, miss, you, here, in, San,..."
2,088c60f138,my boss is bullying me...,bullying me,negative,"[my, boss, is, bullying, me, ...]","[my, bos, is, bullying, me, ...]"
3,9642c003ef,what interview! leave me alone,leave me alone,negative,"[what, interview, !, leave, me, alone]","[what, interview, !, leave, me, alone]"
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"[Sons, of, ****, ,, why, couldn`t, they, put, ...","[Sons, of, ****, ,, why, couldn`t, they, put, ..."
...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,"[wish, we, could, come, see, u, on, Denver, hu...","[wish, we, could, come, see, u, on, Denver, hu..."
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,"[I`ve, wondered, about, rake, to., The, client...","[I`ve, wondered, about, rake, to., The, client..."
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,"[Yay, good, for, both, of, you., Enjoy, the, b...","[Yay, good, for, both, of, you., Enjoy, the, b..."
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,"[But, it, was, worth, it, ****, .]","[But, it, wa, worth, it, ****, .]"


### 3. Normalizing capital letters

In [224]:
train_df[train_df['text'].str.isupper()].shape

(169, 6)

In [225]:
train_df[train_df['text'].str.isupper()]

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm
42,2e7082d1c8,MAYDAY?!,MAYDAY?!,neutral,"[MAYDAY, ?, !]","[MAYDAY, ?, !]"
43,684081e4e7,RATT ROCKED NASHVILLE TONITE..ONE THING SUCKED...,RATT ROCKED NASHVILLE TONITE..ONE THING SUCKED...,neutral,"[RATT, ROCKED, NASHVILLE, TONITE..ONE, THING, ...","[RATT, ROCKED, NASHVILLE, TONITE..ONE, THING, ..."
80,bbbc46889b,THANK YYYYYYYYYOOOOOOOOOOUUUUU!,THANK YYYYYYYYYOOOOOOOOOOUUUUU!,positive,"[THANK, YYYYYYYYYOOOOOOOOOOUUUUU, !]","[THANK, YYYYYYYYYOOOOOOOOOOUUUUU, !]"
193,3e880ec28d,WHAT ABOUT ME ?? I VOTE EVERY DAY FOR YOU !...,WHAT ABOUT ME ?? I VOTE EVERY DAY FOR YOU !!!!!,negative,"[WHAT, ABOUT, ME, ?, ?, I, VOTE, EVERY, DAY, F...","[WHAT, ABOUT, ME, ?, ?, I, VOTE, EVERY, DAY, F..."
276,894e188d01,HAPPY MOTHERS DAY.!,HAPPY,positive,"[HAPPY, MOTHERS, DAY., !]","[HAPPY, MOTHERS, DAY., !]"
391,47990d2312,BRAINFREEZE,BRAINFREEZE,neutral,[BRAINFREEZE],[BRAINFREEZE]
456,fba4c01756,NOW IM SAD BUT IM NOT GIVING IN FIRST..I DIDNT...,SAD,negative,"[NOW, IM, SAD, BUT, IM, NOT, GIVING, IN, FIRST...","[NOW, IM, SAD, BUT, IM, NOT, GIVING, IN, FIRST..."
786,a30612b7f6,OH NEVERMIND I THINK THIS THING IS UNSALVAGEABLE,UNSALVAGEABLE,negative,"[OH, NEVERMIND, I, THINK, THIS, THING, IS, UNS...","[OH, NEVERMIND, I, THINK, THIS, THING, IS, UNS..."
831,33fd898450,GOODNIGHT MAGIC AND PRETTY WORLD,GOODNIGHT,positive,"[GOODNIGHT, MAGIC, AND, PRETTY, WORLD]","[GOODNIGHT, MAGIC, AND, PRETTY, WORLD]"
1150,aa189955b1,BUT THEY ARE EXPENSIVE.,E EXPENSIVE,negative,"[BUT, THEY, ARE, EXPENSIVE, .]","[BUT, THEY, ARE, EXPENSIVE, .]"


In [236]:
to_process_rows= train_df[train_df['text'].str.isupper()].index

In [243]:
train_df.iloc[to_process_rows, 5] = train_df.iloc[to_process_rows, :]["token_norm"].apply(lambda x: list(map(lambda y:y.lower(),x)))

In [248]:
train_df[train_df['text'].str.isupper()]

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm
42,2e7082d1c8,MAYDAY?!,MAYDAY?!,neutral,"[MAYDAY, ?, !]","[mayday, ?, !]"
43,684081e4e7,RATT ROCKED NASHVILLE TONITE..ONE THING SUCKED...,RATT ROCKED NASHVILLE TONITE..ONE THING SUCKED...,neutral,"[RATT, ROCKED, NASHVILLE, TONITE..ONE, THING, ...","[ratt, rocked, nashville, tonite..one, thing, ..."
80,bbbc46889b,THANK YYYYYYYYYOOOOOOOOOOUUUUU!,THANK YYYYYYYYYOOOOOOOOOOUUUUU!,positive,"[THANK, YYYYYYYYYOOOOOOOOOOUUUUU, !]","[thank, yyyyyyyyyoooooooooouuuuu, !]"
193,3e880ec28d,WHAT ABOUT ME ?? I VOTE EVERY DAY FOR YOU !...,WHAT ABOUT ME ?? I VOTE EVERY DAY FOR YOU !!!!!,negative,"[WHAT, ABOUT, ME, ?, ?, I, VOTE, EVERY, DAY, F...","[what, about, me, ?, ?, i, vote, every, day, f..."
276,894e188d01,HAPPY MOTHERS DAY.!,HAPPY,positive,"[HAPPY, MOTHERS, DAY., !]","[happy, mothers, day., !]"
391,47990d2312,BRAINFREEZE,BRAINFREEZE,neutral,[BRAINFREEZE],[brainfreeze]
456,fba4c01756,NOW IM SAD BUT IM NOT GIVING IN FIRST..I DIDNT...,SAD,negative,"[NOW, IM, SAD, BUT, IM, NOT, GIVING, IN, FIRST...","[now, im, sad, but, im, not, giving, in, first..."
786,a30612b7f6,OH NEVERMIND I THINK THIS THING IS UNSALVAGEABLE,UNSALVAGEABLE,negative,"[OH, NEVERMIND, I, THINK, THIS, THING, IS, UNS...","[oh, nevermind, i, think, this, thing, is, uns..."
831,33fd898450,GOODNIGHT MAGIC AND PRETTY WORLD,GOODNIGHT,positive,"[GOODNIGHT, MAGIC, AND, PRETTY, WORLD]","[goodnight, magic, and, pretty, world]"
1150,aa189955b1,BUT THEY ARE EXPENSIVE.,E EXPENSIVE,negative,"[BUT, THEY, ARE, EXPENSIVE, .]","[but, they, are, expensive, .]"


In [255]:
rows_with_uppercase = []

def df_capital_check(s):
    flag=False
    for s_i in s:
        if s_i.isupper():
            flag=True 
    rows_with_uppercase.append(flag)

In [256]:
train_df["token_norm"].apply(lambda x: df_capital_check(x))

0        None
1        None
2        None
3        None
4        None
         ... 
27476    None
27477    None
27478    None
27479    None
27480    None
Name: token_norm, Length: 27481, dtype: object

In [259]:
filtered_rows = pd.Series(rows_with_uppercase, name="bools")

In [261]:
filtered_rows

0         True
1         True
2        False
3        False
4        False
         ...  
27476    False
27477     True
27478    False
27479    False
27480     True
Name: bools, Length: 27481, dtype: bool

In [264]:
train_df.loc[filtered_rows, :].sample(100)

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm
13735,5b65d15956,http://twitpic.com/676hv - R.I.P Fristy just t...,R.I.P Fristy just thinking abt her,negative,"[http, :, //twitpic.com/676hv, -, R.I.P, Frist...","[http, :, //twitpic.com/676hv, -, R.I.P, Frist..."
7032,9b402bca9b,i say business ive only sold 2....and its not ...,i say business ive only sold 2....and its not ...,neutral,"[i, say, business, ive, only, sold, 2, ..., .a...","[i, say, business, ive, only, sold, 2, ..., .a..."
26885,936f854d95,@ cayogial i wanted to come to BZ this summer ...,SUCKS,negative,"[@, cayogial, i, wanted, to, come, to, BZ, thi...","[@, cayogial, i, wanted, to, come, to, BZ, thi..."
11359,bd485719c4,Cool. You should qik some stuff from the news...,Cool.,positive,"[Cool., You, should, qik, some, stuff, from, t...","[Cool., You, should, qik, some, stuff, from, t..."
3969,808f007968,just left work. inventory was way easy and I ...,! it was amazing.,positive,"[just, left, work., inventory, was, way, easy,...","[just, left, work., inventory, wa, way, easy, ..."
9577,46bef3e9ee,From ME and no one else! Muhahahaaaa! Well may...,From ME and no one else! Muhahahaaaa! Well may...,neutral,"[From, ME, and, no, one, else, !, Muhahahaaaa,...","[From, ME, and, no, one, else, !, Muhahahaaaa,..."
9234,8f1e16353f,Had lunch at a Japanese sushi restaurant & I o...,Had lunch at a Japanese sushi restaurant & I o...,neutral,"[Had, lunch, at, a, Japanese, sushi, restauran...","[Had, lunch, at, a, Japanese, sushi, restauran..."
18242,41b9aed9db,Last day to sign up for GoCincinnati! www.cros...,AWESOME,positive,"[Last, day, to, sign, up, for, GoCincinnati, !...","[Last, day, to, sign, up, for, GoCincinnati, !..."
20349,09f3cc6de2,It`s pretty warm out now! I forgot K was sick...,It`s pretty warm out now! I forgot K was sick ...,neutral,"[It`s, pretty, warm, out, now, !, I, forgot, K...","[It`s, pretty, warm, out, now, !, I, forgot, K..."
22023,71cff7e393,"Girl, if you were part of the tour, I would s...","Girl, if you were part of the tour, I would se...",neutral,"[Girl, ,, if, you, were, part, of, the, tour, ...","[Girl, ,, if, you, were, part, of, the, tour, ..."


In [283]:
def df_lowercase(s):
    out_s = []
    for s_i in s:
        out_s.append(s_i.lower())
    return out_s

In [284]:
# Todo: more analysis on pronouns and country 
#Remove noise; hyperlinks and puncatuation

#### Acronyms

In [279]:
acrn_index_series = train_df["text"].str.findall(r"([A-Z]\.[A-Z]])+").apply(lambda x: False if x==[] else True)

In [280]:
train_df.loc[acrn_index_series, :]

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm


In [285]:
train_df["token_norm"] = train_df["token_norm"].apply(lambda x: df_lowercase(x))

In [286]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"[I`d, have, responded, ,, if, I, were, going]","[i`d, have, responded, ,, if, i, were, going]"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,"[Sooo, SAD, I, will, miss, you, here, in, San,...","[sooo, sad, i, will, miss, you, here, in, san,..."
2,088c60f138,my boss is bullying me...,bullying me,negative,"[my, boss, is, bullying, me, ...]","[my, bos, is, bullying, me, ...]"
3,9642c003ef,what interview! leave me alone,leave me alone,negative,"[what, interview, !, leave, me, alone]","[what, interview, !, leave, me, alone]"
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"[Sons, of, ****, ,, why, couldn`t, they, put, ...","[sons, of, ****, ,, why, couldn`t, they, put, ..."
...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,"[wish, we, could, come, see, u, on, Denver, hu...","[wish, we, could, come, see, u, on, denver, hu..."
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,"[I`ve, wondered, about, rake, to., The, client...","[i`ve, wondered, about, rake, to., the, client..."
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,"[Yay, good, for, both, of, you., Enjoy, the, b...","[yay, good, for, both, of, you., enjoy, the, b..."
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,"[But, it, was, worth, it, ****, .]","[but, it, wa, worth, it, ****, .]"


### Feature Engineering

In [287]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [288]:
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))

In [292]:
tfidf.fit_transform(train_df["token_norm"].values)

AttributeError: 'list' object has no attribute 'lower'

In [18]:
# check duplicates 
train_df.duplicated().sum(), test_df.duplicated().sum()

(0, 0)

In [41]:
# uniformity : convert to lower case 
train_df['text'] = train_df['text'].apply(lambda x : str(x).lower())
train_df['selected_text'] = train_df['selected_text'].apply(lambda x : str(x).lower())

In [42]:
train_df.sample(10)

Unnamed: 0,textID,text,selected_text,sentiment
15059,cc0945a961,alredy had my chocolate it is impossible to ...,it is impossible to resist ;),positive
17547,a231d7864b,had a good day selling at feria urbana. the la...,good day se,positive
8460,e2abcae593,smh @ playing dress up! lol. i can`t see the ...,smh,negative
8052,7c17aea36e,..ok brother...did you change your num and no...,.you no good dude,negative
18536,6af2e23e1c,sat in the pub. pretty quiet so far. prob leav...,pretty quiet so far. pr,negative
2240,978068a4d7,watching supernatural those boys can hunt me ...,watching supernatural those boys can hunt me ...,neutral
7158,bac11ddc14,i know. it sucks,. it sucks,negative
25654,937d903511,okayyy you can read it to me cause then i`ll ...,special,positive
20816,4c1726ce91,including myself... guess umma be partying al...,including myself... guess umma be partying alo...,neutral
13962,4cfbcced8f,"yer, oh that`s **** cause u hell need to post...","yer, oh that`s **** cause u hell need to post ...",neutral


In [43]:
from sklearn.model_selection import train_test_split
x_train, x_val , y_train , y_val = train_test_split(train_df[['text','sentiment']],train_df['selected_text'],test_size=0.2, random_state=42)
x_train.shape, x_val.shape , y_train.shape , y_val.shape


((21984, 2), (5497, 2), (21984,), (5497,))

In [44]:
from transformers import RobertaTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [45]:
## Converting strings to a sequence of ids (integer), using the tokenizer and vocabulary.


In [46]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base',add_prefix_space=True)

In [47]:
## Tokenize and prepare for the model a sequence or a pair of sequences.

In [48]:
max_len=128
count = x_train.shape[0]
input_ids = np.zeros((count,max_len),dtype='int32')
attention_mask = np.zeros((count,max_len),dtype='int32')


In [49]:
print(x_train['text'].values[0])

doctor who has finished


In [50]:
from tqdm import tqdm
for i,each in tqdm(enumerate(x_train.values)):
  
  val = tokenizer.encode_plus(each[0],each[1],add_special_tokens=True,max_length=128,return_attention_mask=True,pad_to_max_length=True,return_tensors='pt',verbose=False)
  input_ids[i] = val['input_ids']
  attention_mask[i] = val['attention_mask']

21984it [00:09, 2310.20it/s]


In [82]:
x_train

Unnamed: 0,text,sentiment
11293,doctor who has finished,0
11299,you should.,0
18204,"back at school again. almost weekend. oh wait,...",0
22728,my computer is so slooowww this morning. i th...,0
1231,on my way to dazzle bar!!,0
...,...,...
21575,star trek was pure awesome! love it!!! <3333 ...,1
5390,"will be going to indiana baptist sunday, pray ...",0
860,is sitting thru the boring bits in titanic wai...,0
15795,missed the play,-1


### References:
1. https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk