In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [2]:
pd.set_option('display.max_rows', 500)

In [3]:
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /Users/pawnesh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/pawnesh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pawnesh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### 0. Importing data

In [4]:
train_df = pd.read_csv("../datasets/tweet-sentiment-extraction/train.csv")
test_df = pd.read_csv("../datasets/tweet-sentiment-extraction/test.csv")

### 1. Data Exploration

In [5]:
# Understanding the data at high level

In [6]:
test_df

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive
...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative
3530,416863ce47,All alone in this old house again. Thanks for...,positive
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive


In [7]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [8]:
train_df.describe()

Unnamed: 0,textID,text,selected_text,sentiment
count,27481,27480,27480,27481
unique,27481,27480,22463,3
top,cb774db0d1,"I`d have responded, if I were going",good,neutral
freq,1,1,199,11118


In [9]:
train_df["text"].describe()

count                                    27480
unique                                   27480
top        I`d have responded, if I were going
freq                                         1
Name: text, dtype: object

In [10]:
train_df.shape

(27481, 4)

In [11]:
## Data imbalance check
train_df.sentiment.value_counts()

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [12]:
## check text column null row, 
train_df["text"].isnull().values.any()

True

In [13]:
train_df["text"].isnull().sum()

1

In [14]:
# index
train_df[train_df["text"].isnull()].index

Int64Index([314], dtype='int64')

In [15]:
train_df.iloc[314, :]
# make sense to keep this data 

textID           fdb77c3752
text                    NaN
selected_text           NaN
sentiment           neutral
Name: 314, dtype: object

In [16]:
train_df["text"].fillna(value="", inplace=True)
train_df["selected_text"].fillna(value="", inplace=True)

In [17]:
train_df.iloc[314, :]

textID           fdb77c3752
text                       
selected_text              
sentiment           neutral
Name: 314, dtype: object

In [18]:
## adding new column with additional info
#train_df["text_len"] = train_df["text"].apply(len)

In [19]:
## does text_len have any correlation with the target? if yes, would be valuable to include as a feature


In [20]:
#label_encode = {"positive": 1, "neutral": 0, "negative": -1}
#train_df["sentiment"] = train_df.sentiment.apply(lambda row: label_encode[row])

In [21]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


### 3. Data Preprocessing

In [22]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [23]:
train_df.isnull().sum()

textID           0
text             0
selected_text    0
sentiment        0
dtype: int64

In [24]:
train_df.shape, test_df.shape

((27481, 4), (3534, 3))

### Cleaning text

In [25]:
# removing html tag and urls
from bs4 import BeautifulSoup
train_df["text"]=train_df["text"].apply(lambda x: BeautifulSoup(x).get_text())
import re
train_df["text"]=train_df["text"].apply(lambda x: re.sub(r"http\S+", "", x))



In [26]:
#!pip install contractions


In [27]:
# expanding contractions
import contractions

    
def df_expand_contractions(text):
    text = text.replace("`", "'")
    expanded_words = []   
    for word in text.split():
      # using contractions.fix to expand the shortened words
      expanded_words.append(contractions.fix(word))
    return " ".join(i for i in expanded_words)
    

In [28]:
train_df["text"] = train_df["text"].apply(lambda x: df_expand_contractions(x))

In [29]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I would have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why could not they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see you on Denver husband l...,d lost,negative
27477,4f4c4fc327,I have wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - yo...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [30]:
# removing non alpha words
train_df["text"]=train_df["text"].apply(lambda x: " ".join([re.sub("[^A-Za-z]+", "", x) for x in nltk.word_tokenize(x)]))

In [31]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,I would have responded if I were going,"I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego,Sooo SAD,negative
2,088c60f138,my boss is bullying me,bullying me,negative
3,9642c003ef,what interview leave me alone,leave me alone,negative
4,358bd9e861,Sons of why could not they put them on th...,"Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see you on Denver husband l...,d lost,negative
27477,4f4c4fc327,I have wondered about rake to The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you Enjoy the break you...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it,But it was worth it ****.,positive


In [32]:
# removing extra spaces between words
train_df["text"]=train_df["text"].apply(lambda x: re.sub(" +", " ", x))

In [33]:
# removing stopwords
from nltk.corpus import stopwords
stop = stopwords.words("english")
train_df["text"]=train_df["text"].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))

In [34]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,I would responded I going,"I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I miss San Diego,Sooo SAD,negative
2,088c60f138,boss bullying,bullying me,negative
3,9642c003ef,interview leave alone,leave me alone,negative
4,358bd9e861,Sons could put releases already bought,"Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish could come see Denver husband lost job af...,d lost,negative
27477,4f4c4fc327,I wondered rake The client made clear NET forc...,", don`t force",negative
27478,f67aae2310,Yay good Enjoy break probably need hectic week...,Yay good for both of you.,positive
27479,ed167662a5,But worth,But it was worth it ****.,positive


### 1. Text Tokenization

In [35]:
from nltk.tokenize.treebank import TreebankWordTokenizer

In [36]:
def df_tokenization(s):
 return TreebankWordTokenizer().tokenize(s)

In [37]:
train_df["token"] = train_df["text"].apply(lambda x: df_tokenization(x))

In [38]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment,token
0,cb774db0d1,I would responded I going,"I`d have responded, if I were going",neutral,"[I, would, responded, I, going]"
1,549e992a42,Sooo SAD I miss San Diego,Sooo SAD,negative,"[Sooo, SAD, I, miss, San, Diego]"
2,088c60f138,boss bullying,bullying me,negative,"[boss, bullying]"
3,9642c003ef,interview leave alone,leave me alone,negative,"[interview, leave, alone]"
4,358bd9e861,Sons could put releases already bought,"Sons of ****,",negative,"[Sons, could, put, releases, already, bought]"
...,...,...,...,...,...
27476,4eac33d1c0,wish could come see Denver husband lost job af...,d lost,negative,"[wish, could, come, see, Denver, husband, lost..."
27477,4f4c4fc327,I wondered rake The client made clear NET forc...,", don`t force",negative,"[I, wondered, rake, The, client, made, clear, ..."
27478,f67aae2310,Yay good Enjoy break probably need hectic week...,Yay good for both of you.,positive,"[Yay, good, Enjoy, break, probably, need, hect..."
27479,ed167662a5,But worth,But it was worth it ****.,positive,"[But, worth]"


### 2. Token normalization

In [39]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def df_lemmatizer(s_list):
    out_ll = []
    for s in s_list:
        out_ll.append(lemmatizer.lemmatize(s))
    return out_ll

In [40]:
train_df["token_norm"] = train_df["token"].apply(lambda x: df_lemmatizer(x))

In [41]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm
0,cb774db0d1,I would responded I going,"I`d have responded, if I were going",neutral,"[I, would, responded, I, going]","[I, would, responded, I, going]"
1,549e992a42,Sooo SAD I miss San Diego,Sooo SAD,negative,"[Sooo, SAD, I, miss, San, Diego]","[Sooo, SAD, I, miss, San, Diego]"
2,088c60f138,boss bullying,bullying me,negative,"[boss, bullying]","[bos, bullying]"
3,9642c003ef,interview leave alone,leave me alone,negative,"[interview, leave, alone]","[interview, leave, alone]"
4,358bd9e861,Sons could put releases already bought,"Sons of ****,",negative,"[Sons, could, put, releases, already, bought]","[Sons, could, put, release, already, bought]"
...,...,...,...,...,...,...
27476,4eac33d1c0,wish could come see Denver husband lost job af...,d lost,negative,"[wish, could, come, see, Denver, husband, lost...","[wish, could, come, see, Denver, husband, lost..."
27477,4f4c4fc327,I wondered rake The client made clear NET forc...,", don`t force",negative,"[I, wondered, rake, The, client, made, clear, ...","[I, wondered, rake, The, client, made, clear, ..."
27478,f67aae2310,Yay good Enjoy break probably need hectic week...,Yay good for both of you.,positive,"[Yay, good, Enjoy, break, probably, need, hect...","[Yay, good, Enjoy, break, probably, need, hect..."
27479,ed167662a5,But worth,But it was worth it ****.,positive,"[But, worth]","[But, worth]"


In [42]:
train_df[train_df["token"] != train_df["token_norm"]].sample(10)

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm
19499,93d89f358e,I read pretty awesome web comics today And not...,I read some pretty awesome web comics today. A...,neutral,"[I, read, pretty, awesome, web, comics, today,...","[I, read, pretty, awesome, web, comic, today, ..."
23469,5849da8575,lol brandy mr whiskers,lol brandy and mr whiskers is on,neutral,"[lol, brandy, mr, whiskers]","[lol, brandy, mr, whisker]"
25973,5ef6cb9505,Happy Birthday cheers PAO,Happy,positive,"[Happy, Birthday, cheers, PAO]","[Happy, Birthday, cheer, PAO]"
15105,dc7c50d401,tina adriii guys literally going sydney juneju...,_tina @_adriii guys i am literally going to s...,neutral,"[tina, adriii, guys, literally, going, sydney,...","[tina, adriii, guy, literally, going, sydney, ..."
670,4f0ab06d15,aah well friends Just wondering great weather ...,good.,positive,"[aah, well, friends, Just, wondering, great, w...","[aah, well, friend, Just, wondering, great, we..."
19569,1a42e9ced0,We compare horror stories mate LOL,We can compare horror stories mate LOL,neutral,"[We, compare, horror, stories, mate, LOL]","[We, compare, horror, story, mate, LOL]"
20085,7f3a3bc5d9,I think I might fall love jihoon boys flower,love,positive,"[I, think, I, might, fall, love, jihoon, boys,...","[I, think, I, might, fall, love, jihoon, boy, ..."
24477,89f387304c,hours left teenager ill sleepin hours depressing,how depressing,negative,"[hours, left, teenager, ill, sleepin, hours, d...","[hour, left, teenager, ill, sleepin, hour, dep..."
25430,b01cd83ef4,NHL Not fan either team head says Detroit feet...,"#NHL Not a fan of either team, my head says De...",neutral,"[NHL, Not, fan, either, team, head, says, Detr...","[NHL, Not, fan, either, team, head, say, Detro..."
14286,92198986f3,My dad trying force learn drive I like things ...,I`m not good at in public,negative,"[My, dad, trying, force, learn, drive, I, like...","[My, dad, trying, force, learn, drive, I, like..."


In [43]:
from nltk.stem.porter import *

In [44]:
stemmer = PorterStemmer()

In [45]:
def df_stemmer(s_list):
    return [stemmer.stem(plural) for plural in s_list]

In [46]:
train_df["token_norm_stem"] = train_df["token"].apply(lambda x: df_stemmer(x))

In [47]:
train_df[train_df["token"] != train_df["token_norm"]].sample(10)

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm,token_norm_stem
20333,5c47380aef,TextMate crashed first time months Not bad act...,crashed for the first time in 3 months. Not to...,neutral,"[TextMate, crashed, first, time, months, Not, ...","[TextMate, crashed, first, time, month, Not, b...","[textmat, crash, first, time, month, not, bad,..."
26704,68e13bfa2b,Monday blues Not today,"Monday blues? Not today, not for me.",neutral,"[Monday, blues, Not, today]","[Monday, blue, Not, today]","[monday, blue, not, today]"
11372,a63638f545,BGT amazing tonight threee amazing acts got se...,amazing,positive,"[BGT, amazing, tonight, threee, amazing, acts,...","[BGT, amazing, tonight, threee, amazing, act, ...","[bgt, amaz, tonight, threee, amaz, act, got, s..."
9101,b128fea147,Had eat sandwhich since guys meeting No sushi ...,Had to eat my sandwhich since guys were in a m...,neutral,"[Had, eat, sandwhich, since, guys, meeting, No...","[Had, eat, sandwhich, since, guy, meeting, No,...","[had, eat, sandwhich, sinc, guy, meet, no, sus..."
886,1f82ec212e,Searching home things cook dinner evening It m...,Searching my home for a few things to cook the...,neutral,"[Searching, home, things, cook, dinner, evenin...","[Searching, home, thing, cook, dinner, evening...","[search, home, thing, cook, dinner, even, it, ..."
4412,ee5b47b5a9,I know exactly saying cool tapes better p,g.. its so not cool... that is why tapes were ...,neutral,"[I, know, exactly, saying, cool, tapes, better...","[I, know, exactly, saying, cool, tape, better, p]","[i, know, exactli, say, cool, tape, better, p]"
16909,95d40ac569,My vibe currently downed The thing amusing Jok...,My vibe is currently downed. The only thing a...,neutral,"[My, vibe, currently, downed, The, thing, amus...","[My, vibe, currently, downed, The, thing, amus...","[my, vibe, current, down, the, thing, amus, jo..."
20423,796e5ef0c2,Guess I would better look new best friend VIP ...,best,positive,"[Guess, I, would, better, look, new, best, fri...","[Guess, I, would, better, look, new, best, fri...","[guess, i, would, better, look, new, best, fri..."
2647,5d7af585d3,woman transfer first impressions sexualmaterna...,weak,negative,"[woman, transfer, first, impressions, sexualma...","[woman, transfer, first, impression, sexualmat...","[woman, transfer, first, impress, sexualmatern..."
10559,1742c433ec,piecing photo quilt Boeing employees hung Hunt...,"Sadly, many people in the quilt pics are now l...",negative,"[piecing, photo, quilt, Boeing, employees, hun...","[piecing, photo, quilt, Boeing, employee, hung...","[piec, photo, quilt, boe, employe, hung, hunti..."


In [48]:
# dropping stemming 

train_df.drop(["token_norm_stem"], axis=1, inplace=True)

In [49]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm
0,cb774db0d1,I would responded I going,"I`d have responded, if I were going",neutral,"[I, would, responded, I, going]","[I, would, responded, I, going]"
1,549e992a42,Sooo SAD I miss San Diego,Sooo SAD,negative,"[Sooo, SAD, I, miss, San, Diego]","[Sooo, SAD, I, miss, San, Diego]"
2,088c60f138,boss bullying,bullying me,negative,"[boss, bullying]","[bos, bullying]"
3,9642c003ef,interview leave alone,leave me alone,negative,"[interview, leave, alone]","[interview, leave, alone]"
4,358bd9e861,Sons could put releases already bought,"Sons of ****,",negative,"[Sons, could, put, releases, already, bought]","[Sons, could, put, release, already, bought]"
...,...,...,...,...,...,...
27476,4eac33d1c0,wish could come see Denver husband lost job af...,d lost,negative,"[wish, could, come, see, Denver, husband, lost...","[wish, could, come, see, Denver, husband, lost..."
27477,4f4c4fc327,I wondered rake The client made clear NET forc...,", don`t force",negative,"[I, wondered, rake, The, client, made, clear, ...","[I, wondered, rake, The, client, made, clear, ..."
27478,f67aae2310,Yay good Enjoy break probably need hectic week...,Yay good for both of you.,positive,"[Yay, good, Enjoy, break, probably, need, hect...","[Yay, good, Enjoy, break, probably, need, hect..."
27479,ed167662a5,But worth,But it was worth it ****.,positive,"[But, worth]","[But, worth]"


### 3. Normalizing capital letters

In [50]:
train_df[train_df['text'].str.isupper()].shape

(205, 6)

In [51]:
train_df[train_df['text'].str.isupper()]

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm
42,2e7082d1c8,MAYDAY,MAYDAY?!,neutral,[MAYDAY],[MAYDAY]
43,684081e4e7,RATT ROCKED NASHVILLE TONITE ONE THING SUCKED ...,RATT ROCKED NASHVILLE TONITE..ONE THING SUCKED...,neutral,"[RATT, ROCKED, NASHVILLE, TONITE, ONE, THING, ...","[RATT, ROCKED, NASHVILLE, TONITE, ONE, THING, ..."
80,bbbc46889b,THANK YYYYYYYYYOOOOOOOOOOUUUUU,THANK YYYYYYYYYOOOOOOOOOOUUUUU!,positive,"[THANK, YYYYYYYYYOOOOOOOOOOUUUUU]","[THANK, YYYYYYYYYOOOOOOOOOOUUUUU]"
193,3e880ec28d,WHAT ABOUT ME I VOTE EVERY DAY FOR YOU,WHAT ABOUT ME ?? I VOTE EVERY DAY FOR YOU !!!!!,negative,"[WHAT, ABOUT, ME, I, VOTE, EVERY, DAY, FOR, YOU]","[WHAT, ABOUT, ME, I, VOTE, EVERY, DAY, FOR, YOU]"
276,894e188d01,HAPPY MOTHERS DAY,HAPPY,positive,"[HAPPY, MOTHERS, DAY]","[HAPPY, MOTHERS, DAY]"
391,47990d2312,BRAINFREEZE,BRAINFREEZE,neutral,[BRAINFREEZE],[BRAINFREEZE]
456,fba4c01756,NOW I AM SAD BUT I AM NOT GIVING IN FIRST I DI...,SAD,negative,"[NOW, I, AM, SAD, BUT, I, AM, NOT, GIVING, IN,...","[NOW, I, AM, SAD, BUT, I, AM, NOT, GIVING, IN,..."
786,a30612b7f6,OH NEVERMIND I THINK THIS THING IS UNSALVAGEABLE,UNSALVAGEABLE,negative,"[OH, NEVERMIND, I, THINK, THIS, THING, IS, UNS...","[OH, NEVERMIND, I, THINK, THIS, THING, IS, UNS..."
831,33fd898450,GOODNIGHT MAGIC AND PRETTY WORLD,GOODNIGHT,positive,"[GOODNIGHT, MAGIC, AND, PRETTY, WORLD]","[GOODNIGHT, MAGIC, AND, PRETTY, WORLD]"
1150,aa189955b1,BUT THEY ARE EXPENSIVE,E EXPENSIVE,negative,"[BUT, THEY, ARE, EXPENSIVE]","[BUT, THEY, ARE, EXPENSIVE]"


In [52]:
to_process_rows= train_df[train_df['text'].str.isupper()].index

In [53]:
train_df.iloc[to_process_rows, 5] = train_df.iloc[to_process_rows, :]["token_norm"].apply(lambda x: list(map(lambda y:y.lower(),x)))

In [54]:
train_df[train_df['text'].str.isupper()]

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm
42,2e7082d1c8,MAYDAY,MAYDAY?!,neutral,[MAYDAY],[mayday]
43,684081e4e7,RATT ROCKED NASHVILLE TONITE ONE THING SUCKED ...,RATT ROCKED NASHVILLE TONITE..ONE THING SUCKED...,neutral,"[RATT, ROCKED, NASHVILLE, TONITE, ONE, THING, ...","[ratt, rocked, nashville, tonite, one, thing, ..."
80,bbbc46889b,THANK YYYYYYYYYOOOOOOOOOOUUUUU,THANK YYYYYYYYYOOOOOOOOOOUUUUU!,positive,"[THANK, YYYYYYYYYOOOOOOOOOOUUUUU]","[thank, yyyyyyyyyoooooooooouuuuu]"
193,3e880ec28d,WHAT ABOUT ME I VOTE EVERY DAY FOR YOU,WHAT ABOUT ME ?? I VOTE EVERY DAY FOR YOU !!!!!,negative,"[WHAT, ABOUT, ME, I, VOTE, EVERY, DAY, FOR, YOU]","[what, about, me, i, vote, every, day, for, you]"
276,894e188d01,HAPPY MOTHERS DAY,HAPPY,positive,"[HAPPY, MOTHERS, DAY]","[happy, mothers, day]"
391,47990d2312,BRAINFREEZE,BRAINFREEZE,neutral,[BRAINFREEZE],[brainfreeze]
456,fba4c01756,NOW I AM SAD BUT I AM NOT GIVING IN FIRST I DI...,SAD,negative,"[NOW, I, AM, SAD, BUT, I, AM, NOT, GIVING, IN,...","[now, i, am, sad, but, i, am, not, giving, in,..."
786,a30612b7f6,OH NEVERMIND I THINK THIS THING IS UNSALVAGEABLE,UNSALVAGEABLE,negative,"[OH, NEVERMIND, I, THINK, THIS, THING, IS, UNS...","[oh, nevermind, i, think, this, thing, is, uns..."
831,33fd898450,GOODNIGHT MAGIC AND PRETTY WORLD,GOODNIGHT,positive,"[GOODNIGHT, MAGIC, AND, PRETTY, WORLD]","[goodnight, magic, and, pretty, world]"
1150,aa189955b1,BUT THEY ARE EXPENSIVE,E EXPENSIVE,negative,"[BUT, THEY, ARE, EXPENSIVE]","[but, they, are, expensive]"


In [55]:
rows_with_uppercase = []

def df_capital_check(s):
    flag=False
    for s_i in s:
        if s_i.isupper():
            flag=True 
    rows_with_uppercase.append(flag)

In [56]:
train_df["token_norm"].apply(lambda x: df_capital_check(x))

0        None
1        None
2        None
3        None
4        None
         ... 
27476    None
27477    None
27478    None
27479    None
27480    None
Name: token_norm, Length: 27481, dtype: object

In [57]:
filtered_rows = pd.Series(rows_with_uppercase, name="bools")

In [58]:
filtered_rows

0         True
1         True
2        False
3        False
4        False
         ...  
27476    False
27477     True
27478    False
27479    False
27480     True
Name: bools, Length: 27481, dtype: bool

In [59]:
train_df.loc[filtered_rows, :].sample(100)

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm
18921,173b6b6ee3,store Lol I liquour,at the store! Lol I don`t have any liquour here,neutral,"[store, Lol, I, liquour]","[store, Lol, I, liquour]"
24293,43778d1d23,know I want left home weekend,. I don`t want to be left in our home by myself,negative,"[know, I, want, left, home, weekend]","[know, I, want, left, home, weekend]"
25879,949c90a928,wow tired going bed GOOD NIGHT,"wow i`m tired... going to bed, GOOD NIGHT",neutral,"[wow, tired, going, bed, GOOD, NIGHT]","[wow, tired, going, bed, GOOD, NIGHT]"
24798,d9639a0a97,Watching TV best people whole world My Mum My ...,Love,positive,"[Watching, TV, best, people, whole, world, My,...","[Watching, TV, best, people, whole, world, My,..."
10434,3de7a2f5fe,Trip DC next week canceled So Artomatic beer,"Trip to DC next week canceled. So, no Artomat...",neutral,"[Trip, DC, next, week, canceled, So, Artomatic...","[Trip, DC, next, week, canceled, So, Artomatic..."
10494,277acc9efd,thank I really appreciate babe,thank,positive,"[thank, I, really, appreciate, babe]","[thank, I, really, appreciate, babe]"
27363,a232d645c7,Stu Lantz awesome I miss Chick Hearn though,Stu Lantz is awesome! I miss Chick Hearn tho,neutral,"[Stu, Lantz, awesome, I, miss, Chick, Hearn, t...","[Stu, Lantz, awesome, I, miss, Chick, Hearn, t..."
14154,04b6314652,I say smudge I start calling people given mali...,I start calling people given the malicious act...,negative,"[I, say, smudge, I, start, calling, people, gi...","[I, say, smudge, I, start, calling, people, gi..."
925,bd5e56bd6d,What I learn today Never post anything sold eb...,my bad,negative,"[What, I, learn, today, Never, post, anything,...","[What, I, learn, today, Never, post, anything,..."
20858,eb86aa056b,I forget Venezuela Talking like classmates,I forget you`re from Venezuela! Talking with y...,neutral,"[I, forget, Venezuela, Talking, like, classmates]","[I, forget, Venezuela, Talking, like, classmate]"


In [60]:
def df_lowercase(s):
    out_s = []
    for s_i in s:
        out_s.append(s_i.lower())
    return out_s

In [61]:
# Todo: more analysis on pronouns and country 
#Remove noise; hyperlinks and puncatuation

#### Acronyms

In [62]:
acrn_index_series = train_df["text"].str.findall(r"([A-Z]\.[A-Z]])+").apply(lambda x: False if x==[] else True)

In [63]:
train_df.loc[acrn_index_series, :]

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm


In [64]:
train_df["token_norm"] = train_df["token_norm"].apply(lambda x: df_lowercase(x))

In [65]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm
0,cb774db0d1,I would responded I going,"I`d have responded, if I were going",neutral,"[I, would, responded, I, going]","[i, would, responded, i, going]"
1,549e992a42,Sooo SAD I miss San Diego,Sooo SAD,negative,"[Sooo, SAD, I, miss, San, Diego]","[sooo, sad, i, miss, san, diego]"
2,088c60f138,boss bullying,bullying me,negative,"[boss, bullying]","[bos, bullying]"
3,9642c003ef,interview leave alone,leave me alone,negative,"[interview, leave, alone]","[interview, leave, alone]"
4,358bd9e861,Sons could put releases already bought,"Sons of ****,",negative,"[Sons, could, put, releases, already, bought]","[sons, could, put, release, already, bought]"
...,...,...,...,...,...,...
27476,4eac33d1c0,wish could come see Denver husband lost job af...,d lost,negative,"[wish, could, come, see, Denver, husband, lost...","[wish, could, come, see, denver, husband, lost..."
27477,4f4c4fc327,I wondered rake The client made clear NET forc...,", don`t force",negative,"[I, wondered, rake, The, client, made, clear, ...","[i, wondered, rake, the, client, made, clear, ..."
27478,f67aae2310,Yay good Enjoy break probably need hectic week...,Yay good for both of you.,positive,"[Yay, good, Enjoy, break, probably, need, hect...","[yay, good, enjoy, break, probably, need, hect..."
27479,ed167662a5,But worth,But it was worth it ****.,positive,"[But, worth]","[but, worth]"


### Count Vectorization

In [66]:
# unique words in corpus
unique_words = set()

def df_unique_word_set(s_list):
    for s_i in s_list:
        unique_words.add(s_i)

In [67]:
train_df["token_norm"].apply(lambda x: df_unique_word_set(x))

0        None
1        None
2        None
3        None
4        None
         ... 
27476    None
27477    None
27478    None
27479    None
27480    None
Name: token_norm, Length: 27481, dtype: object

In [68]:
len(unique_words)

23158

In [69]:
unique_words

{'igloo',
 'lovin',
 'beto',
 'potrait',
 'gpt',
 'providing',
 'leonard',
 'danger',
 'godddd',
 'america',
 'ru',
 'doomsday',
 'etc',
 'presh',
 'condom',
 'lawrence',
 'adriana',
 'cake',
 'avid',
 'whers',
 'submit',
 'tidying',
 'laaaaaaaaave',
 'euro',
 'chem',
 'ulu',
 'wow',
 'jadore',
 'blackberrymessenger',
 'beetle',
 'plana',
 'moreee',
 'sciifi',
 'bannerbomb',
 'mhs',
 'posada',
 'caspar',
 'rangers',
 'interpreter',
 'celebrates',
 'coloursfest',
 'six',
 'wasabi',
 'motto',
 'opps',
 'hanging',
 'feeder',
 'lindy',
 'sided',
 'warmth',
 'coatandkaycom',
 'offf',
 'frosted',
 'tweeet',
 'prereunion',
 'wout',
 'myweakness',
 'waved',
 'collerbone',
 'workstation',
 'remeber',
 'bridget',
 'cloth',
 'ffevered',
 'broth',
 'sixth',
 'gummy',
 'hotcold',
 'sixty',
 'represent',
 'accomplished',
 'mammas',
 'fandom',
 'ronaldo',
 'hognose',
 'headacheall',
 'inovera',
 'same',
 'stuffit',
 'bwahaha',
 'dikasih',
 'nutts',
 'rrod',
 'dasit',
 'butteflies',
 'pressure',
 'phi

In [70]:
# building vocab

vocab = {}

for ind_, word in enumerate(unique_words):
    vocab[word]=ind_

In [71]:
vocab

{'igloo': 0,
 'lovin': 1,
 'beto': 2,
 'potrait': 3,
 'gpt': 4,
 'providing': 5,
 'leonard': 6,
 'danger': 7,
 'godddd': 8,
 'america': 9,
 'ru': 10,
 'doomsday': 11,
 'etc': 12,
 'presh': 13,
 'condom': 14,
 'lawrence': 15,
 'adriana': 16,
 'cake': 17,
 'avid': 18,
 'whers': 19,
 'submit': 20,
 'tidying': 21,
 'laaaaaaaaave': 22,
 'euro': 23,
 'chem': 24,
 'ulu': 25,
 'wow': 26,
 'jadore': 27,
 'blackberrymessenger': 28,
 'beetle': 29,
 'plana': 30,
 'moreee': 31,
 'sciifi': 32,
 'bannerbomb': 33,
 'mhs': 34,
 'posada': 35,
 'caspar': 36,
 'rangers': 37,
 'interpreter': 38,
 'celebrates': 39,
 'coloursfest': 40,
 'six': 41,
 'wasabi': 42,
 'motto': 43,
 'opps': 44,
 'hanging': 45,
 'feeder': 46,
 'lindy': 47,
 'sided': 48,
 'warmth': 49,
 'coatandkaycom': 50,
 'offf': 51,
 'frosted': 52,
 'tweeet': 53,
 'prereunion': 54,
 'wout': 55,
 'myweakness': 56,
 'waved': 57,
 'collerbone': 58,
 'workstation': 59,
 'remeber': 60,
 'bridget': 61,
 'cloth': 62,
 'ffevered': 63,
 'broth': 64,
 'si

In [72]:
# Counter 

from collections import Counter, defaultdict

In [73]:
## count words in each row 
rows_count_words = []

def df_count_words(s):
    counter = defaultdict(int)
    for s_i in s:
        counter[s_i]+=1
    rows_count_words.append(dict(counter))
    
        
        

In [74]:
train_df["token_norm"].apply(lambda x: df_count_words(x))

0        None
1        None
2        None
3        None
4        None
         ... 
27476    None
27477    None
27478    None
27479    None
27480    None
Name: token_norm, Length: 27481, dtype: object

In [75]:
rows_count_words

[{'i': 2, 'would': 1, 'responded': 1, 'going': 1},
 {'sooo': 1, 'sad': 1, 'i': 1, 'miss': 1, 'san': 1, 'diego': 1},
 {'bos': 1, 'bullying': 1},
 {'interview': 1, 'leave': 1, 'alone': 1},
 {'sons': 1, 'could': 1, 'put': 1, 'release': 1, 'already': 1, 'bought': 1},
 {'shameless': 1,
  'plugging': 1,
  'best': 1,
  'rangers': 1,
  'forum': 1,
  'earth': 1},
 {'feeding': 1, 'baby': 1, 'fun': 1, 'smile': 1, 'coo': 1},
 {'soooo': 1, 'high': 1},
 {'both': 1},
 {'journey': 1, 'wow': 1, 'became': 1, 'cooler': 1, 'hehe': 1, 'possible': 1},
 {'much': 1,
  'love': 1,
  'hopeful': 1,
  'reckon': 1,
  'chance': 1,
  'minimal': 1,
  'p': 1,
  'never': 1,
  'going': 1,
  'get': 1,
  'cake': 1,
  'stuff': 1},
 {'i': 1,
  'really': 2,
  'like': 1,
  'song': 1,
  'love': 1,
  'story': 1,
  'taylor': 1,
  'swift': 1},
 {'my': 1, 'sharpie': 1, 'running': 1, 'dangerously': 1, 'low': 1, 'ink': 1},
 {'want': 1, 'go': 1, 'music': 1, 'tonight': 1, 'lost': 1, 'voice': 1},
 {'test': 2, 'lg': 1, 'env': 1},
 {'uh':

In [76]:
## row, vocab index (col index) and value(count)
row, val, col = [],[],[]

for ind_ in range(len(train_df)):
    for key, value in rows_count_words[ind_].items():
        col1 = vocab[key]
        val1 = value
        row.append(ind_)
        col.append(col1)
        val.append(val1)
        

In [79]:
from scipy.sparse import csr_matrix

In [87]:
cv_x_train = csr_matrix((val, (row, col)), shape=(len(train_df), len(vocab))).toarray()

In [88]:
np.unique(cv_x_train)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [89]:
cv_x_train.shape

(27481, 23158)

In [90]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_df['sentiment'] = le.fit_transform(train_df.sentiment.values)

In [91]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment,token,token_norm
0,cb774db0d1,I would responded I going,"I`d have responded, if I were going",1,"[I, would, responded, I, going]","[i, would, responded, i, going]"
1,549e992a42,Sooo SAD I miss San Diego,Sooo SAD,0,"[Sooo, SAD, I, miss, San, Diego]","[sooo, sad, i, miss, san, diego]"
2,088c60f138,boss bullying,bullying me,0,"[boss, bullying]","[bos, bullying]"
3,9642c003ef,interview leave alone,leave me alone,0,"[interview, leave, alone]","[interview, leave, alone]"
4,358bd9e861,Sons could put releases already bought,"Sons of ****,",0,"[Sons, could, put, releases, already, bought]","[sons, could, put, release, already, bought]"
...,...,...,...,...,...,...
27476,4eac33d1c0,wish could come see Denver husband lost job af...,d lost,0,"[wish, could, come, see, Denver, husband, lost...","[wish, could, come, see, denver, husband, lost..."
27477,4f4c4fc327,I wondered rake The client made clear NET forc...,", don`t force",0,"[I, wondered, rake, The, client, made, clear, ...","[i, wondered, rake, the, client, made, clear, ..."
27478,f67aae2310,Yay good Enjoy break probably need hectic week...,Yay good for both of you.,2,"[Yay, good, Enjoy, break, probably, need, hect...","[yay, good, enjoy, break, probably, need, hect..."
27479,ed167662a5,But worth,But it was worth it ****.,2,"[But, worth]","[but, worth]"


In [94]:
y_train = train_df["sentiment"].to_numpy()

### Dataset splitting

In [96]:
from sklearn.model_selection import train_test_split


In [97]:
X_train,X_test,Y_train, Y_test = train_test_split(cv_x_train, y_train, test_size=0.25, random_state=30)

In [99]:
X_train.shape

(20610, 23158)

In [100]:
X_test.shape

(6871, 23158)

### Modelling

In [106]:
# simple model

In [101]:
from sklearn.svm import LinearSVC
clf = LinearSVC(random_state=0)

In [102]:
clf.fit(X_train,Y_train)

In [103]:
y_test_pred=clf.predict(X_test)


In [104]:
from sklearn.metrics import classification_report
report=classification_report(Y_test, y_test_pred,output_dict=True)

In [105]:
report

{'0': {'precision': 0.647964796479648,
  'recall': 0.6081569437274136,
  'f1-score': 0.6274300932090545,
  'support': 1937},
 '1': {'precision': 0.6126468555632343,
  'recall': 0.6409978308026031,
  'f1-score': 0.6265017667844522,
  'support': 2766},
 '2': {'precision': 0.7072718851320056,
  'recall': 0.7043357933579336,
  'f1-score': 0.7058007857638087,
  'support': 2168},
 'accuracy': 0.6517246397904235,
 'macro avg': {'precision': 0.6559611790582959,
  'recall': 0.6511635226293168,
  'f1-score': 0.6532442152524385,
  'support': 6871},
 'weighted avg': {'precision': 0.6524602620048279,
  'recall': 0.6517246397904235,
  'f1-score': 0.6517846137400191,
  'support': 6871}}

### References:
1. https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk