In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### 0. Importing data

In [5]:
# Load both competition splits; only the training split carries `selected_text`.
test_df = pd.read_csv(filepath_or_buffer="../datasets/tweet-sentiment-extraction/test.csv")
train_df = pd.read_csv(filepath_or_buffer="../datasets/tweet-sentiment-extraction/train.csv")

### 1. Data Exploration

In [1]:
# Understanding the data at a high level

In [45]:
test_df

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive
...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative
3530,416863ce47,All alone in this old house again. Thanks for...,positive
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive


In [4]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [6]:
train_df.describe()

Unnamed: 0,textID,text,selected_text,sentiment
count,27481,27480,27480,27481
unique,27481,27480,22463,3
top,cb774db0d1,"I`d have responded, if I were going",good,neutral
freq,1,1,199,11118


In [7]:
train_df["text"].describe()

count                                    27480
unique                                   27480
top        I`d have responded, if I were going
freq                                         1
Name: text, dtype: object

In [8]:
train_df.shape

(27481, 4)

In [10]:
# Class balance of the target: how many tweets fall under each sentiment label
train_df["sentiment"].value_counts()

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [14]:
# Does the text column contain any missing values?
train_df["text"].isna().to_numpy().any()

True

In [15]:
train_df["text"].isnull().sum()

1

In [19]:
# Index label(s) of the rows whose text is missing
train_df.index[train_df["text"].isna()]

Int64Index([314], dtype='int64')

In [21]:
# Row 314 is the tweet with missing text; it still carries a sentiment label,
# so it makes sense to keep it rather than drop it.
train_df.iloc[314]

textID           fdb77c3752
text                    NaN
selected_text           NaN
sentiment           neutral
Name: 314, dtype: object

In [25]:
# Replace the missing text / selected_text values with empty strings so that
# string operations (e.g. len) work on every row.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form emits a FutureWarning on
# pandas 2.2 and leaves train_df unchanged under Copy-on-Write.
train_df = train_df.fillna({"text": "", "selected_text": ""})

In [26]:
train_df.iloc[314, :]

textID           fdb77c3752
text                       
selected_text              
sentiment           neutral
Name: 314, dtype: object

In [28]:
# Extra column: number of characters in each tweet
train_df["text_len"] = train_df["text"].map(len)

In [29]:
## Does text_len correlate with the target? If so, it would be a valuable feature to include.


In [33]:
# Ordinal encoding of the target so it can take part in numeric summaries such as corr().
label_encode = {"positive": 1, "neutral": 0, "negative": -1}
# This cell overwrites its own input column, so it must tolerate being re-run:
# values that are already encoded pass through unchanged, while any other
# unknown label still raises KeyError as before.
train_df["sentiment"] = train_df["sentiment"].map(
    lambda label: label if label in label_encode.values() else label_encode[label]
)

In [34]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment,text_len
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",0,36
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,-1,46
2,088c60f138,my boss is bullying me...,bullying me,-1,25
3,9642c003ef,what interview! leave me alone,leave me alone,-1,31
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",-1,75
...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,-1,77
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",-1,122
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,1,111
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,1,27


In [35]:
# Correlate only the two numeric columns of interest. Calling corr() on the
# whole frame relies on pandas silently dropping the string columns, which
# pandas >= 2.0 no longer does (numeric_only defaults to False and it raises).
corr = train_df[["sentiment", "text_len"]].corr()

In [36]:
corr

Unnamed: 0,sentiment,text_len
sentiment,1.0,0.001509
text_len,0.001509,1.0


In [37]:
# No meaningful linear correlation between sentiment and text_len (r ≈ 0.0015)

### 2. Data Preprocessing

In [27]:
# Character count of each tweet
train_df["text_len"] = train_df["text"].map(len)

In [28]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment,text_len
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,36
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,46
2,088c60f138,my boss is bullying me...,bullying me,negative,25
3,9642c003ef,what interview! leave me alone,leave me alone,negative,31
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,75
...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,77
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,122
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,111
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,27


In [30]:
# Text preprocessing

In [31]:
# The main challenge is converting raw text into numerical feature vectors so that a classifier can be trained on it.
# Approach: bag-of-words
#   each unique word in the corpus is represented by one number.

In [41]:
import string

import nltk  # previously missing: nltk.download below raised NameError on a fresh kernel
from nltk.corpus import stopwords

# Make sure the stopword corpus is available locally before it is read.
nltk.download("stopwords")


def text_processing(text, stop_words=None):
    """Remove punctuation and stopwords from a piece of text.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : collection of str, optional
        Lower-case words to drop. When omitted, NLTK's English stopword list
        is used; it is loaded once and cached on the function, because this
        function is called once per row under ``.apply``.

    Returns
    -------
    str
        The remaining words joined by single spaces, with their original
        casing preserved.

    Notes
    -----
    Punctuation is stripped *before* the stopword check, so contractions such
    as "couldn`t" become "couldnt" and are kept unless ``stop_words`` contains
    that stripped form.
    """
    if stop_words is None:
        cached = getattr(text_processing, "_english_stopwords", None)
        if cached is None:
            # A frozenset gives constant-time membership tests; the result is
            # the same as scanning the original list.
            cached = frozenset(stopwords.words("english"))
            text_processing._english_stopwords = cached
        stop_words = cached

    # remove punctuation
    nopunc = "".join(char for char in text if char not in string.punctuation)
    # remove stopwords (case-insensitive match, original casing kept)
    return " ".join(word for word in nopunc.split() if word.lower() not in stop_words)

In [42]:
# Cleaned version of every tweet, as produced by text_processing
train_df["clean_text"] = train_df["text"].map(text_processing)

In [43]:
train_df

Unnamed: 0,textID,text,selected_text,sentiment,text_len,clean_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,36,Id responded going
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,46,Sooo SAD miss San Diego
2,088c60f138,my boss is bullying me...,bullying me,negative,25,boss bullying
3,9642c003ef,what interview! leave me alone,leave me alone,negative,31,interview leave alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,75,Sons couldnt put releases already bought
...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,77,wish could come see u Denver husband lost job ...
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,122,Ive wondered rake client made clear NET dont f...
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,111,Yay good Enjoy break probably need hectic week...
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,27,worth
