In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [2]:
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /Users/pawnesh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/pawnesh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pawnesh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/pawnesh/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
pd.set_option('display.max_rows', 500)

### Importing data

In [4]:
train_df = pd.read_csv("../datasets/tweet-sentiment-extraction/train.csv")
test_df = pd.read_csv("../datasets/tweet-sentiment-extraction/test.csv")

### Data Exploration

In [5]:
train_df.describe()

Unnamed: 0,textID,text,selected_text,sentiment
count,27481,27480,27480,27481
unique,27481,27480,22463,3
top,cb774db0d1,"I`d have responded, if I were going",good,neutral
freq,1,1,199,11118


In [6]:
train_df["text"].describe()

count                                    27480
unique                                   27480
top        I`d have responded, if I were going
freq                                         1
Name: text, dtype: object

In [7]:
## Data imbalance check
train_df.sentiment.value_counts()

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [8]:
## check text column null row, 
train_df["text"].isnull().values.any()

True

In [9]:
train_df["text"].isnull().sum()

1

In [10]:
# index
train_df[train_df["text"].isnull()].index

Int64Index([314], dtype='int64')

In [11]:
train_df.iloc[314, :]
# make sense to keep this data 

textID           fdb77c3752
text                    NaN
selected_text           NaN
sentiment           neutral
Name: 314, dtype: object

In [12]:
train_df["text"].fillna(value="", inplace=True)
train_df["selected_text"].fillna(value="", inplace=True)

In [13]:
train_df.iloc[314, :]

textID           fdb77c3752
text                       
selected_text              
sentiment           neutral
Name: 314, dtype: object

In [14]:
# Convert to lowercase; 
train_df["pre_process"] = train_df["text"].apply(lambda x: " ".join(x.lower() for x in str(x).split()))

In [15]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment,pre_process
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"i`d have responded, if i were going"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad i will miss you here in san diego!!!
2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is bullying me...
3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview! leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"sons of ****, why couldn`t they put them on th..."
...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,wish we could come see u on denver husband los...
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,i`ve wondered about rake to. the client has ma...
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good for both of you. enjoy the break - yo...
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,but it was worth it ****.


### Data Preprocessing

In [16]:
from sklearn.preprocessing import LabelEncoder


In [17]:
le = LabelEncoder()
train_df['sentiment'] = le.fit_transform(train_df.sentiment.values)

In [18]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment,pre_process
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",1,"i`d have responded, if i were going"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,0,sooo sad i will miss you here in san diego!!!
2,088c60f138,my boss is bullying me...,bullying me,0,my boss is bullying me...
3,9642c003ef,what interview! leave me alone,leave me alone,0,what interview! leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",0,"sons of ****, why couldn`t they put them on th..."
...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,0,wish we could come see u on denver husband los...
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",0,i`ve wondered about rake to. the client has ma...
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,2,yay good for both of you. enjoy the break - yo...
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,2,but it was worth it ****.


In [19]:
test_df['sentiment'] = le.transform(test_df.sentiment.values)

In [20]:
test_df

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,1
1,96d74cb729,Shanghai is also really exciting (precisely -...,2
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",0
3,01082688c6,happy bday!,2
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,2
...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",0
3530,416863ce47,All alone in this old house again. Thanks for...,2
3531,6332da480c,I know what you mean. My little dog is sinkin...,0
3532,df1baec676,_sutra what is your next youtube video gonna b...,2


#### Removing HTML tags and urls

In [21]:
from bs4 import BeautifulSoup
train_df["pre_process"]=train_df["pre_process"].apply(lambda x: BeautifulSoup(x).get_text())
import re
train_df["pre_process"]=train_df["pre_process"].apply(lambda x: re.sub(r"http\S+", "", x))



In [22]:
train_df.sample(10)

Unnamed: 0,textID,text,selected_text,sentiment,pre_process
9965,a617ec43e3,It`s great,great,2,it`s great
13882,716bbb29cd,_ci ewww poor you wish you all the best! than...,ewww poor you wish you all the best! thank Go...,1,_ci ewww poor you wish you all the best! thank...
9960,30116fe6bd,"Josh Excuse me, but, at what time is it goin...","Josh Excuse me, but, at what time is it going...",1,"josh excuse me, but, at what time is it going ..."
3038,1a762a9ca0,"haha that was good, slightly geeky but funny x","good,",2,"haha that was good, slightly geeky but funny x"
2002,2cc97395e7,ughhhh.... sad day.,sad,0,ughhhh.... sad day.
14880,f28af05ca2,Yay! Thank you.. Feels like ur yelling at us ...,Yay! Thank you.. Feels like ur yelling at us LO,1,yay! thank you.. feels like ur yelling at us lol
11459,4e2edbcbab,I`m all stuffted up but I gotta gets up & go n...,I`m all stuffted up but I gotta gets up & go n...,1,i`m all stuffted up but i gotta gets up & go n...
6083,fdc3815cee,http://twitpic.com/33hus that`s my baby,http://twitpic.com/33hus that`s my baby,1,that`s my baby
19217,80a43d94a8,Hey JK...Wish Evenlyn a Happy Mothers Day tom...,Happy,2,hey jk...wish evenlyn a happy mothers day tomo...
25583,51242b88bf,(SINGZ) SO I THINK IM JST ABT DONE BEING UR GI...,(SINGZ) SO I THINK IM JST ABT DONE BEING UR GI...,1,(singz) so i think im jst abt done being ur gi...


#### Contraction on text

In [23]:
def contractions(s):
 s = re.sub(r"won`t", "will not",s)
 s = re.sub(r"would`t", "would not",s)
 s = re.sub(r"could`t", "could not",s)
 s = re.sub(r"\`d", " would",s)
 s = re.sub(r"can\`t", "can not",s)
 s = re.sub(r"n\`t", " not", s)
 s= re.sub(r"\`re", " are", s)
 s = re.sub(r"\`s", " is", s)
 s = re.sub(r"\`ll", " will", s)
 s = re.sub(r"\`t", " not", s)
 s = re.sub(r"\`ve", " have", s)
 s = re.sub(r"\`m", " am", s)
 return s
train_df["pre_process"]=train_df["pre_process"].apply(lambda x:contractions(x))

In [24]:
train_df.sample(10)

Unnamed: 0,textID,text,selected_text,sentiment,pre_process
14710,de32b9761a,dont quit.,dont quit.,1,dont quit.
7895,d39b91228c,I`ve given the $19.99 question more thought. ...,t. U may be right.,2,i have given the $19.99 question more thought....
11386,e31563f420,"Trickery? No, just exasperation at seeing **...",d uglier,0,"trickery? no, just exasperation at seeing ****..."
13873,a5d8cc3057,I`m not even in your situation and now I`m s...,". I`m sorry, that`s really sucky.",0,i am not even in your situation and now i am s...
26801,6eb6485ae6,i am waaayyyy hungry! oh fyi my work email is...,down again BLAH!,0,i am waaayyyy hungry! oh fyi my work email is ...
12695,0759e8d514,When you`re driven to be in the theatre you d...,When you`re driven to be in the theatre you de...,1,when you are driven to be in the theatre you d...
11926,ce26df8553,Facebook Farm town become slower and slower,Facebook Farm town become slower and slower,1,facebook farm town become slower and slower
11820,dc027078a1,me. confused about a guy...he`s great but wh...,me. confused about a guy...he`s great but why...,1,me. confused about a guy...he is great but why...
6147,d073fa9b62,I kind of miss a certain someone already. and...,I kind of miss a certain someone already. and...,1,"i kind of miss a certain someone already. and,..."
13215,014a8f6298,We`ll miss you ... Those lucky Edmontonians!,We`ll miss you,0,we will miss you ... those lucky edmontonians!


#### Removing non-alpha words

In [25]:
train_df["pre_process"]=train_df["pre_process"].apply(lambda x: " ".join([re.sub("[^A-Za-z]+", "", x) for x in nltk.word_tokenize(x)]))

In [26]:
train_df.sample(10)

Unnamed: 0,textID,text,selected_text,sentiment,pre_process
21931,130945d03c,SO DO I!! My jet is in the repair shop...lol,op...lo,2,so do i my jet is in the repair shop lol
5775,aa730765ab,kenny u alive!!!...I`m here getting da hair d...,a sad,0,kenny u alive i am here getting da hair do...
27441,aec8d3c1e5,Wanted to get that piercing too but thought i...,Wanted to get that piercing too but thought it...,1,wanted to get that piercing too but thought it...
6633,cfc7dbb6ee,aww hope uve hada good day xxxxx,w hope uve hada good day,2,aww hope uve hada good day xxxxx
11448,7e795b2ff3,Not doing single ones 2day...beat lol but wo...,d fun,2,not doing single ones day beat lol but would ...
1726,da9e839af0,"nope not going be able to finish it tonight, ...","nope not going be able to finish it tonight, g...",1,nope not going be able to finish it tonight g...
4078,cfe32cb6f7,"hey bettiye, i did see her. stay tuned for v...","hey bettiye, i did see her. stay tuned for vi...",1,hey bettiye i did see her stay tuned for vid...
17347,15e802cb38,just finished eating,just finished eating,1,just finished eating
9470,d9ca6707c1,walah me 2 still i am not getting the full idea,m not getting the full idea,0,walah me still i am not getting the full idea
2732,8a30be8cc7,"And i ain`t happy go lucky, its just today... ...","And i ain`t happy go lucky, its just today... ...",1,and i ai not happy go lucky its just today t...


#### Removing extra space b/w words

In [27]:
train_df["pre_process"]=train_df["pre_process"].apply(lambda x: re.sub(" +", " ", x))

In [28]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment,pre_process
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",1,i would have responded if i were going
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,0,sooo sad i will miss you here in san diego
2,088c60f138,my boss is bullying me...,bullying me,0,my boss is bullying me
3,9642c003ef,what interview! leave me alone,leave me alone,0,what interview leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",0,sons of why could not they put them on the rel...
...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,0,wish we could come see u on denver husband los...
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",0,i have wondered about rake to the client has m...
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,2,yay good for both of you enjoy the break you p...
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,2,but it was worth it


#### Removing stopwords

In [29]:
from nltk.corpus import stopwords
stop = stopwords.words("english")
train_df["pre_process"]=train_df["pre_process"].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))


In [30]:
train_df.sample(10)

Unnamed: 0,textID,text,selected_text,sentiment,pre_process
6318,3c0e24b99e,My new design portfolio is finally online: htt...,My new design portfolio is finally online: htt...,1,new design portfolio finally online still need...
10606,a9dd5c8794,Had a great night! Tomorrow is mothers day,great,2,great night tomorrow mothers day
20425,4ada6dda00,"Working from home today, my back is killing me...",killing me,0,working home today back killing doctor physio ...
21900,f8b21ca906,16lbs is still a huge achievement though,huge achievement,2,lbs still huge achievement though
13630,d05f8cd3cc,yeah ill be on verizon... so those are my two...,yeah ill be on verizon... so those are my two ...,1,yeah ill verizon two options right
15895,26ea6f59f0,@_missrachel how much?,@_missrachel how much?,1,missrachel much
7363,a72fd81a70,don`t care about my struggles..,don`t care about my struggles..,0,care struggles
10889,f3b1420f6e,"had the best mother`s day! breakfast, shopping...",had the best mother`s day!,2,best mother day breakfast shopping lunch drink...
20535,daa3843c18,OMG The LG KC910 touchscreen fone is such a pi...,pile of shite!!,0,omg lg kc touchscreen fone pile shite roll new...
24167,004820461a,Jacqueline Wilson day on CBBC,Jacqueline Wilson day on CBBC,1,jacqueline wilson day cbbc


#### Lemmatization 

In [31]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [32]:
train_df["pre_process"]=train_df["pre_process"].apply(lambda x: " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(x)]))

In [33]:
train_df.sample(10)

Unnamed: 0,textID,text,selected_text,sentiment,pre_process
5354,b00a057b1a,Haha yea if i knew yew wanted 2 go i would ha...,Haha yea if i knew yew wanted 2 go i would ha...,1,haha yea knew yew wanted go would told yew
22320,5328c72bfd,OMG I`M SOO EXCITED! i`ve been waiting for it...,OMG I`M SOO EXCITED!,2,omg soo excited waiting ever since saw th one ...
20110,b4e7391d34,'Isn`t she lovely? Isn`t she beautiful?' Sorry...,you`re not really in a position to make that j...,0,lovely beautiful sorry stevie really position ...
2759,c1b8ead73d,"My Boy is leaving for the Summer, Going to sta...",I`m gonna miss him!!,0,boy leaving summer going stay grandparent gon ...
18714,9b14baf1b9,"urgh, i really hate that medicine",hate,0,urgh really hate medicine
26569,93000a69d3,Drinking coffee....MMMMM.....coffee,Drinking coffee....MMMMM.....coffee,1,drinking coffee mmmmm coffee
20397,defcb95b1f,yes 7.50 here thanks for the change obama. I ...,yes 7.50 here thanks for the change obama. I h...,1,yes thanks change obama start mail ordering
26421,6b9c032583,Thanks for making me laugh,Thanks for making me laugh,2,thanks making laugh
15063,5c4100c5ac,"Ugh, I hate 90 degree weather",hate 90,0,ugh hate degree weather
1163,2889cb3b49,o can`t work that one out pain in the **** t...,o can`t work that one out pain in the **** th...,0,work one pain spammer


#### Data split

In [34]:
from sklearn.model_selection import train_test_split


In [35]:
X_train,X_test,Y_train, Y_test = train_test_split(train_df["pre_process"], train_df["sentiment"], test_size=0.25, random_state=30)

In [36]:
X_train.shape

(20610,)

In [37]:
Y_train.shape

(20610,)

#### Token into features with BOW TF-IDF

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [39]:
vectorizer= TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(X_train)
tf_x_test = vectorizer.transform(X_test)

#### Modelling

In [40]:
from sklearn.svm import LinearSVC
clf = LinearSVC(random_state=0)

In [41]:
clf.fit(tf_x_train,Y_train)

In [42]:
y_test_pred=clf.predict(tf_x_test)


#### Evaluation

In [43]:
from sklearn.metrics import classification_report
report=classification_report(Y_test, y_test_pred,output_dict=True)


In [44]:
report

{'0': {'precision': 0.6543075245365322,
  'recall': 0.619514713474445,
  'f1-score': 0.6364359586316627,
  'support': 1937},
 '1': {'precision': 0.6235294117647059,
  'recall': 0.6514822848879248,
  'f1-score': 0.6371994342291372,
  'support': 2766},
 '2': {'precision': 0.7210060549604099,
  'recall': 0.7140221402214022,
  'f1-score': 0.7174971031286211,
  'support': 2168},
 'accuracy': 0.6622034638335031,
 'macro avg': {'precision': 0.666280997087216,
  'recall': 0.6616730461945907,
  'f1-score': 0.6637108319964736,
  'support': 6871},
 'weighted avg': {'precision': 0.6629627645353817,
  'recall': 0.6622034638335031,
  'f1-score': 0.6623204492112028,
  'support': 6871}}

### Pipeline

In [45]:
# Todo