In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [32]:
# Fetch the NLTK resources this notebook relies on: WordNet data for the
# lemmatizer and the English stopword list.
for resource_name in ("wordnet", "omw-1.4", "stopwords"):
    nltk.download(resource_name)
# Tokenizer models; kept as the final expression so the cell still shows the
# return value of the last download.
nltk.download("punkt")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Allow up to 500 rows to be printed before pandas truncates a displayed frame.
pd.options.display.max_rows = 500

### Importing data

In [4]:
# Read the train and test splits of the tweet-sentiment-extraction dataset.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../datasets/tweet-sentiment-extraction/train.csv",
        "../datasets/tweet-sentiment-extraction/test.csv",
    )
)

### Data Exploration

In [5]:
# Summary statistics (count / unique / top / freq) for each training column.
train_df.describe()

Unnamed: 0,textID,text,selected_text,sentiment
count,27481,27480,27480,27481
unique,27481,27480,22463,3
top,cb774db0d1,"I`d have responded, if I were going",good,neutral
freq,1,1,199,11118


In [6]:
# Same summary restricted to the raw tweet text column.
train_df["text"].describe()

count                                    27480
unique                                   27480
top        I`d have responded, if I were going
freq                                         1
Name: text, dtype: object

In [7]:
# Class-balance check: number of tweets per sentiment label.
train_df["sentiment"].value_counts()

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [8]:
# Does the text column contain any missing values?
train_df["text"].isna().to_numpy().any()

True

In [9]:
# Count the rows whose text is missing.
train_df["text"].isna().sum()

1

In [10]:
# Index labels of the rows whose text is missing.
train_df.loc[train_df["text"].isna()].index

Int64Index([314], dtype='int64')

In [11]:
# Inspect the row with the missing text. It still carries a sentiment label,
# so it makes sense to keep it rather than drop it.
train_df.iloc[314]

textID           fdb77c3752
text                    NaN
selected_text           NaN
sentiment           neutral
Name: 314, dtype: object

In [12]:
# Replace missing tweet text / selected text with empty strings so the string
# processing below never sees NaN.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated in recent pandas
# and leaves train_df unchanged when Copy-on-Write is enabled.
train_df = train_df.fillna({"text": "", "selected_text": ""})

In [13]:
# Re-check the previously missing row: both text fields should now be empty strings.
train_df.iloc[314]

textID           fdb77c3752
text                       
selected_text              
sentiment           neutral
Name: 314, dtype: object

In [17]:
# Lowercase every whitespace-separated token of the raw text and re-join the
# tokens with single spaces; the result starts the working column `pre_process`.
train_df["pre_process"] = train_df["text"].map(
    lambda raw_text: " ".join(token.lower() for token in str(raw_text).split())
)

In [18]:
# Display the frame to check the new lowercased `pre_process` column.
train_df

Unnamed: 0,textID,text,selected_text,sentiment,pre_process
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"i`d have responded, if i were going"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad i will miss you here in san diego!!!
2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is bullying me...
3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview! leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"sons of ****, why couldn`t they put them on th..."
...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,wish we could come see u on denver husband los...
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,i`ve wondered about rake to. the client has ma...
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good for both of you. enjoy the break - yo...
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,but it was worth it ****.


### Data Preprocessing

In [48]:
from sklearn.preprocessing import LabelEncoder


In [52]:
# Encode the sentiment strings as integer class ids
# (negative -> 0, neutral -> 1, positive -> 2).
# NOTE: the encoded ids overwrite the column the encoder is fitted on, so
# re-running this cell would refit the encoder on the integer ids instead of
# the original label strings.
le = LabelEncoder().fit(train_df["sentiment"].to_numpy())
train_df["sentiment"] = le.transform(train_df["sentiment"].to_numpy())

In [53]:
# Display the frame to check the integer-encoded sentiment column.
train_df

Unnamed: 0,textID,text,selected_text,sentiment,pre_process
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",1,would responded going
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,0,sooo sad miss san diego
2,088c60f138,my boss is bullying me...,bullying me,0,bos bullying
3,9642c003ef,what interview! leave me alone,leave me alone,0,interview leave alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",0,son could put release already bought
...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,0,wish could come see u denver husband lost job ...
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",0,wondered rake client made clear net force devs...
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,2,yay good enjoy break probably need hectic week...
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,2,worth


In [54]:
# Apply the encoder fitted on the training labels to the test labels, so both
# frames share the same label -> id mapping.
test_df["sentiment"] = le.transform(test_df["sentiment"].to_numpy())

In [55]:
# Display the test frame to check its encoded sentiment column.
test_df

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,1
1,96d74cb729,Shanghai is also really exciting (precisely -...,2
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",0
3,01082688c6,happy bday!,2
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,2
...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",0
3530,416863ce47,All alone in this old house again. Thanks for...,2
3531,6332da480c,I know what you mean. My little dog is sinkin...,0
3532,df1baec676,_sutra what is your next youtube video gonna b...,2


#### Removing HTML tags and URLs

In [22]:
from bs4 import BeautifulSoup
import re

# Strip HTML markup, keeping only the text content.
# The parser is named explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed (and emits a warning), so the extracted text
# could differ from one environment to another.
train_df["pre_process"] = train_df["pre_process"].apply(
    lambda markup: BeautifulSoup(markup, "html.parser").get_text()
)

# Remove URLs: everything from "http" up to the next whitespace character.
train_df["pre_process"] = train_df["pre_process"].apply(
    lambda text: re.sub(r"http\S+", "", text)
)

In [25]:
# Spot-check 10 random rows after HTML/URL removal. No random_state is passed,
# so this call does not pin which rows are shown.
train_df.sample(10)

Unnamed: 0,textID,text,selected_text,sentiment,pre_process
22543,76dbf5ffbd,"Home from Jack`s now, can`t say I`m too happy ...",can`t say I`m too happy,negative,"home from jack`s now, can`t say i`m too happy ..."
87,cd0d522bb1,His snoring is so annoying n it keeps me from ...,His snoring is so annoying n it keeps me from ...,neutral,his snoring is so annoying n it keeps me from ...
4055,fe0c97fb9f,yes but they hate it when we do it...silly boys,hate it,negative,yes but they hate it when we do it...silly boys
3632,195605cfe6,word to yer mother!! \m/,word to yer mother!! \m/,neutral,word to yer mother!! \m/
18471,9fe798ecfe,is not liking the online livebox right now. In...,Internet is dead.,negative,is not liking the online livebox right now. in...
9630,2c09848db1,I think Destiny has officially gone crazy... h...,hahahaha,positive,i think destiny has officially gone crazy... h...
1898,700eb9d742,Went out to get groceries...prices are inflati...,inflating,negative,went out to get groceries...prices are inflati...
20443,d665b59814,Up a little.. Good luck to tam and robert who ...,Good luck to,positive,up a little.. good luck to tam and robert who ...
8413,c422f33e79,I didn`t read the details so that may be it. ...,it can`t be that exciting.,negative,i didn`t read the details so that may be it. s...
7762,140dc0f7ab,Going for a ride! Perfect night to go out and ...,Going for a ride! Perfect night to go out and ...,positive,going for a ride! perfect night to go out and ...


#### Expanding contractions in the text

In [69]:
# Ordered (pattern, replacement) rules for backtick-style contractions.
# Order matters: the irregular forms ("won`t", "can`t") must run before the
# generic "n`t" rule, and "n`t" must run before the bare "`t" fallback.
# Every suffix pattern ends with a word boundary so that a backtick used as an
# opening quote (e.g. "`til", "`so") is not mistaken for a contraction.
_CONTRACTION_RULES = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"\bwon`t\b", "will not"),
        (r"`d\b", " would"),
        (r"\bcan`t\b", "can not"),
        (r"n`t\b", " not"),
        (r"`re\b", " are"),
        (r"`s\b", " is"),
        (r"`ll\b", " will"),
        (r"`t\b", " not"),
        (r"`ve\b", " have"),
        (r"`m\b", " am"),
    )
]


def contractions(s):
    """Expand English contractions written with a backtick as the apostrophe.

    The patterns are lowercase literals, so only lowercase contractions are
    expanded (e.g. "i`m" -> "i am", "won`t" -> "will not").

    Known simplifications: "`s" is always expanded to " is" (a possessive
    such as "jack`s" becomes "jack is") and "`d" is always expanded to
    " would".

    Args:
        s: Text to expand.

    Returns:
        The text with every matched contraction replaced by its expansion.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        s = pattern.sub(replacement, s)
    return s
# Expand contractions in every pre-processed tweet.
train_df["pre_process"] = train_df["pre_process"].map(contractions)

In [70]:
# Spot-check 10 random rows after contraction expansion (sample is not seeded here).
train_df.sample(10)

Unnamed: 0,textID,text,selected_text,sentiment,pre_process
18772,095aa261cb,"Uggh everything, I`m sending you!!! I keep sen...",I suck at life,0,uggh everything sending keep sending twitter f...
4884,3d05361785,about to start work and missing him like crazy...,missing him like crazy.I,0,start work missing like crazyi wont seeing til...
18874,a0235f0820,great night,great night,2,great night
6769,9ac16ebcc2,"right, so richard marx`s right here waiting is...",nauseatingly,0,right richard marx right waiting really making...
25250,856c445b1e,may the 4th be with you! HAPPY STAR WARS DAY!,HAPPY,2,may th happy star war day
22469,0a269cef6e,How are you today!?,How are you today!?,1,today
2898,ab65c6b83f,TAYLOR! are you back in Nashville now..or LA?...,? did ya have fun? love ya girly!,2,taylor back nashville la well trip ya fun love...
4508,a0a3065378,"_bmac I knooow, I want to sail","_bmac I knooow, I want to sail",1,bmac knooow want sail
14296,c8f996b680,exactly Prime Minister doesn`t need take car...,exactly Prime Minister doesn`t need take care...,1,exactly prime minister need take care issue
20748,9672b4ee61,GREEN DAY IS PERFORMING ON SNL NEXT WEEK! sor...,"sorry, that made me happy and i`m still all di...",1,green day performing snl next week sorry made ...


#### Removing non-alphabetic characters

In [71]:
# Tokenize each tweet and delete every character that is not an ASCII letter
# from each token. Tokens made only of such characters become empty strings,
# which leaves extra spaces in the re-joined text.
train_df["pre_process"] = train_df["pre_process"].map(
    lambda text: " ".join(
        [re.sub("[^A-Za-z]+", "", token) for token in nltk.word_tokenize(text)]
    )
)

In [34]:
# Spot-check 10 random rows after removing non-letter characters (sample is not seeded here).
train_df.sample(10)

Unnamed: 0,textID,text,selected_text,sentiment,pre_process
3642,56c95cc7de,Just came back from hanging out with some frie...,good.,positive,just came back from hanging out with some frie...
21306,8c06ec15b8,stealth shopping - got wife b-day present whil...,stealth shopping - got wife b-day present whil...,neutral,stealth shopping got wife bday present while ...
9529,8df54b96bd,Checking out zensify on my iphone. works fine ...,Checking out zensify on my iphone. works fine ...,neutral,checking out zensify on my iphone works fine ...
10016,d4cc846912,What does it matter? We ALL love you. Take y...,We ALL love you.,positive,what does it matter we all love you take you...
23173,b78e793bcc,Our Quinn puppy got a sick.,sick.,negative,our quinn puppy got a sick
13722,7d3f92c5b5,"Going lay down, I`ll get up officially @ 1! Ye...",Good(night) again!,positive,going lay down i will get up officially ye...
15466,4ec6e9138b,Sitting in traffic while my car gets rained on...,Doesn`t the weather know this is California?,negative,sitting in traffic while my car gets rained on...
12524,a3710e95cc,Haha thanks.,thanks.,positive,haha thanks
1345,64d352606b,"Trying to watch your vids, but the audio has ...","Trying to watch your vids, but the audio has b...",neutral,trying to watch your vids but the audio has b...
16558,0da207081a,_marie better than Hudgens. OMG our song came...,_marie better than Hudgens. OMG our song came...,neutral,marie better than hudgens omg our song came o...


#### Removing extra spaces between words

In [35]:
# Collapse every run of consecutive spaces into a single space.
train_df["pre_process"] = train_df["pre_process"].map(
    lambda text: re.sub(" +", " ", text)
)

In [36]:
# Display the frame to check the whitespace-normalized `pre_process` column.
train_df

Unnamed: 0,textID,text,selected_text,sentiment,pre_process
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,i would have responded if i were going
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad i will miss you here in san diego
2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is bullying me
3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,sons of why could not they put them on the rel...
...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,wish we could come see u on denver husband los...
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,i have wondered about rake to the client has m...
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good for both of you enjoy the break you p...
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,but it was worth it


#### Removing stopwords

In [37]:
from nltk.corpus import stopwords

# `stop` stays a list, as returned by NLTK; the frozenset is used for the
# membership test so each token lookup is a hash probe instead of a scan over
# the whole stopword list.
stop = stopwords.words("english")
_stop_lookup = frozenset(stop)

# Drop every token that appears in the English stopword list.
# NOTE(review): this also removes negation words; a later sample output shows
# "not just me then" reduced to an empty string. Consider whether negations
# should be kept for sentiment classification.
train_df["pre_process"] = train_df["pre_process"].apply(
    lambda text: " ".join(word for word in text.split() if word not in _stop_lookup)
)


In [38]:
# Spot-check 10 random rows after stopword removal (sample is not seeded here).
train_df.sample(10)

Unnamed: 0,textID,text,selected_text,sentiment,pre_process
4216,c5f1b3ee3e,"Not feeling very good at all, why does this ha...",good,positive,feeling good happen today days going friends t...
24833,fa0983d231,so excited for you and Paris ooh lala i look...,e lovely,positive,excited paris ooh lala look forward knowing lo...
7652,ac4040f7af,watching The Uninvited in my room. Can`t wait ...,watching The Uninvited in my room. Can`t wait ...,neutral,watching uninvited room wait go home go deid f...
5864,bbbd57c093,Me too! need some tour dates stat!,Me too! need some tour dates stat,neutral,need tour dates stat
3137,a00b2b3fc0,do it and I`ll give you the first hug,do it and I`ll give you the first hug,positive,give first hug
581,1fce3e2d7b,no bueno. hollykins needs to feel better asa...,no bueno. hollykins needs to feel better asap...,neutral,bueno hollykins needs feel better asap ps miss...
19447,cdbe4df131,OMG so exctied,exctied,positive,omg exctied
17864,f7e69e437d,waitin 4 the skool bus soo tired nd still soo ...,tired,negative,waitin skool bus soo tired nd still soo much w...
25575,bb7a8693ea,Sad to be leaving so soon,Sad,negative,sad leaving soon
6625,a244d7aad7,Can`t I mix and match Dumb drive in!,Dumb,negative,mix match dumb drive


#### Lemmatization 

In [39]:
from nltk.stem import WordNetLemmatizer
# Single WordNet lemmatizer instance, reused for every token in the next cell.
lemmatizer = WordNetLemmatizer()

In [40]:
# Tokenize each tweet, lemmatize every token, and re-join with single spaces.
# lemmatize() is called without a part-of-speech argument.
train_df["pre_process"] = train_df["pre_process"].map(
    lambda text: " ".join(
        [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]
    )
)

In [56]:
# Spot-check 10 random rows after lemmatization (sample is not seeded here).
train_df.sample(10)

Unnamed: 0,textID,text,selected_text,sentiment,pre_process
15969,64810cb721,not just me then,not just me then,1,
6477,8cd4ccab2b,is having headache and colds...,headache and colds..,0,headache cold
17774,1186f88a11,Oh noooo that sucks Did you reschedule for a...,sucks,0,oh noooo suck reschedule another show
8103,2eac733725,"Good morning hun! I will, promise Glad you l...",Glad,2,good morning hun promise glad like em lovely day
21405,e0e093c2e3,; Re: Home - that`s exactly what I meant... ho...,uh... I brokt it...,0,home exactly meant home dtown also computer uh...
25809,83a44d5505,OK Just a thought that occured to me...,OK Just a thought that occured to me...,1,ok thought occured
500,18f60b8879,but my bday is JUNE 19.. this is wack... and ...,wack.,0,bday june wack ihavent seen promotion bday par...
20146,8d7f102cba,Panera is not being nice to my iPhone,not being nice,0,panera nice iphone
20896,df783bff1f,Checking out Twitter Trying to find people I k...,Unfortunately,0,checking twitter trying find people know unfor...
3885,22ecd73521,So apparently i left my front door wide open b...,Love,2,apparently left front door wide open going sho...


#### Data split

In [57]:
from sklearn.model_selection import train_test_split


In [58]:
# Hold out 25% of the labelled training tweets for evaluation; the fixed
# random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    train_df.pre_process,
    train_df.sentiment,
    random_state=30,
    test_size=0.25,
)

In [59]:
# Number of training texts after the split.
X_train.shape

(20610,)

In [60]:
# Number of training labels; should match the number of training texts.
Y_train.shape

(20610,)

#### Converting tokens into bag-of-words TF-IDF features

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [62]:
# TF-IDF vectorizer with default settings.
vectorizer= TfidfVectorizer()
# Fit the vocabulary and IDF weights on the training texts only...
tf_x_train = vectorizer.fit_transform(X_train)
# ...then reuse that fitted vectorizer for the held-out texts, so nothing is
# learned from the evaluation data.
tf_x_test = vectorizer.transform(X_test)

#### Modelling

In [63]:
from sklearn.svm import LinearSVC
# Linear support-vector classifier; random_state is fixed at 0.
clf = LinearSVC(random_state=0)

In [64]:
# Train the classifier on the TF-IDF features of the training split.
clf.fit(tf_x_train,Y_train)

LinearSVC(random_state=0)

In [65]:
# Predict sentiment ids for the held-out split.
y_test_pred=clf.predict(tf_x_test)


#### Evaluation

In [66]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the held-out split. output_dict=True
# returns the report as a nested dict rather than a formatted string.
report=classification_report(Y_test, y_test_pred,output_dict=True)


In [67]:
# Show the metrics; the keys '0', '1', '2' are the encoded sentiment ids.
report

{'0': {'precision': 0.6543075245365322,
  'recall': 0.619514713474445,
  'f1-score': 0.6364359586316627,
  'support': 1937},
 '1': {'precision': 0.6235294117647059,
  'recall': 0.6514822848879248,
  'f1-score': 0.6371994342291372,
  'support': 2766},
 '2': {'precision': 0.7210060549604099,
  'recall': 0.7140221402214022,
  'f1-score': 0.7174971031286211,
  'support': 2168},
 'accuracy': 0.6622034638335031,
 'macro avg': {'precision': 0.666280997087216,
  'recall': 0.6616730461945907,
  'f1-score': 0.6637108319964736,
  'support': 6871},
 'weighted avg': {'precision': 0.6629627645353817,
  'recall': 0.6622034638335031,
  'f1-score': 0.6623204492112028,
  'support': 6871}}