## Import Dependencies

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import string

import nltk
from nltk.stem.porter import *

from sklearn.model_selection import  train_test_split, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Dataset

In [2]:
# Read the tab-separated sentiment dataset and preview the first rows.
data = pd.read_csv(
    filepath_or_buffer="./dataset/sentiment.tsv",
    delimiter="\t",
)
data.head()

Unnamed: 0,sentiment,text
0,neg,"@jamielewislewis i cant believe it, it really ..."
1,neg,Had a dream about a walk in fast food resturau...
2,neg,hates @internet @explrer (angry)(angry) **but ...
3,neg,@federalcase I said I go out for eat 5:negneg...
4,neg,@babykates7 yeah they won't do the surgery til...


Rename the columns so that they are easier to understand in the code.

In [3]:
# Positional relabel of the two columns: first -> 'label', second -> 'body_text'.
data = data.set_axis(['label', 'body_text'], axis=1)
data.head()

Unnamed: 0,label,body_text
0,neg,"@jamielewislewis i cant believe it, it really ..."
1,neg,Had a dream about a walk in fast food resturau...
2,neg,hates @internet @explrer (angry)(angry) **but ...
3,neg,@federalcase I said I go out for eat 5:negneg...
4,neg,@babykates7 yeah they won't do the surgery til...


In [4]:
# Fit the encoder on the label column and swap the column for its integer codes.
# `le` is kept at module level so the fitted encoder stays available afterwards.
le = LabelEncoder()
data = data.assign(label=le.fit_transform(data['label']))
data.head()

Unnamed: 0,label,body_text
0,0,"@jamielewislewis i cant believe it, it really ..."
1,0,Had a dream about a walk in fast food resturau...
2,0,hates @internet @explrer (angry)(angry) **but ...
3,0,@federalcase I said I go out for eat 5:negneg...
4,0,@babykates7 yeah they won't do the surgery til...


## Cleaning Data

#### Remove @user

In [5]:
def remove_pattern(in_text, pattern):
    """Return ``in_text`` with every match of ``pattern`` removed.

    Args:
        in_text: The string to clean.
        pattern: A regular expression (string or compiled pattern) describing
            the substrings to delete.

    Returns:
        A new string in which each non-overlapping match of ``pattern`` has
        been replaced by the empty string.
    """
    # Substitute on the pattern itself. Feeding each matched substring back
    # into re.sub as a new regex misreads metacharacters in the matched text
    # and lets a short match such as "@ab" also eat the start of "@abc".
    return re.sub(pattern, '', in_text)

In [6]:
# Vectorise the helper so it is applied element-wise to the whole column.
vec_rp = np.vectorize(remove_pattern)
# Raw string literal: "\w" inside a plain string is an invalid escape sequence,
# which Python warns about and plans to reject. The regex itself is unchanged:
# an "@" followed by any run of word characters (a Twitter handle).
data['tidy_text'] = vec_rp(data['body_text'], r'@[\w]*')
data.head()

Unnamed: 0,label,body_text,tidy_text
0,0,"@jamielewislewis i cant believe it, it really ...","i cant believe it, it really doesnt belong th..."
1,0,Had a dream about a walk in fast food resturau...,Had a dream about a walk in fast food resturau...
2,0,hates @internet @explrer (angry)(angry) **but ...,hates (angry)(angry) **but no choice** http...
3,0,@federalcase I said I go out for eat 5:negneg...,I said I go out for eat 5:negneg p.m. I dis...
4,0,@babykates7 yeah they won't do the surgery til...,yeah they won't do the surgery till the cold ...


#### Remove special characters

In [7]:
# Replace every character that is not an ASCII letter or '#' with a space.
# - regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
#   literal string by default, which would leave the text untouched.
# - The upper-case range is A-Z, not A-z; the latter also spans the ASCII
#   characters [ \ ] ^ _ and `, so those would survive the clean-up.
data['tidy_text'] = data['tidy_text'].str.replace(r'[^a-zA-Z#]', ' ', regex=True)
data.head()

Unnamed: 0,label,body_text,tidy_text
0,0,"@jamielewislewis i cant believe it, it really ...",i cant believe it it really doesnt belong th...
1,0,Had a dream about a walk in fast food resturau...,Had a dream about a walk in fast food resturau...
2,0,hates @internet @explrer (angry)(angry) **but ...,hates angry angry but no choice http...
3,0,@federalcase I said I go out for eat 5:negneg...,I said I go out for eat negneg p m I dis...
4,0,@babykates7 yeah they won't do the surgery til...,yeah they won t do the surgery till the cold ...


#### Tokenized tweets

In [8]:
# Split each cleaned tweet on whitespace into a list of tokens.
tokenized = data['tidy_text'].map(str.split)
tokenized.head()

0    [i, cant, believe, it, it, really, doesnt, bel...
1    [Had, a, dream, about, a, walk, in, fast, food...
2    [hates, angry, angry, but, no, choice, http, p...
3    [I, said, I, go, out, for, eat, negneg, p, m, ...
4    [yeah, they, won, t, do, the, surgery, till, t...
Name: tidy_text, dtype: object

In [9]:
# Run every token of every tweet through the Porter stemmer.
stemmer = PorterStemmer()
tokenized = tokenized.map(lambda words: list(map(stemmer.stem, words)))
tokenized.head()

0    [i, cant, believ, it, it, realli, doesnt, belo...
1    [had, a, dream, about, a, walk, in, fast, food...
2    [hate, angri, angri, but, no, choic, http, plu...
3    [i, said, i, go, out, for, eat, negneg, p, m, ...
4    [yeah, they, won, t, do, the, surgeri, till, t...
Name: tidy_text, dtype: object

In [11]:
# Join each list of stemmed tokens back into one space-separated string.
# A single vectorised apply replaces the per-row loop, which indexed the Series
# by label (tokenized[i]) and therefore relied on a default 0..n-1 index.
# Entries that are already strings are passed through unchanged, so re-running
# this cell does not space-join the individual characters of joined text.
tokenized = tokenized.apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)
data['tidy_text'] = tokenized
data.head()

Unnamed: 0,label,body_text,tidy_text
0,0,"@jamielewislewis i cant believe it, it really ...",i cant believ it it realli doesnt belong there...
1,0,Had a dream about a walk in fast food resturau...,had a dream about a walk in fast food resturau...
2,0,hates @internet @explrer (angry)(angry) **but ...,hate angri angri but no choic http plurk com p...
3,0,@federalcase I said I go out for eat 5:negneg...,i said i go out for eat negneg p m i disappoin...
4,0,@babykates7 yeah they won't do the surgery til...,yeah they won t do the surgeri till the cold i...


In [25]:
def punct_count(in_text):
    """Return the percentage of non-space characters that are punctuation.

    Args:
        in_text: The string to inspect.

    Returns:
        A float percentage with one decimal place: the share of characters
        found in ``string.punctuation`` among all characters other than the
        space character. Returns 0.0 when the text has no non-space
        characters (empty or spaces only).
    """
    non_space_chars = len(in_text) - in_text.count(" ")
    # Empty or all-space text has no characters to measure; report 0.0 rather
    # than dividing by zero.
    if non_space_chars == 0:
        return 0.0
    punct_chars = sum(char in string.punctuation for char in in_text)
    # The ratio is rounded to 3 places before scaling, as before; the outer
    # round only strips float noise such as 33.300000000000004.
    return round(round(punct_chars / non_space_chars, 3) * 100, 1)

In [26]:
# Length feature: number of characters in the raw tweet once spaces are dropped.
data['body_length'] = data['body_text'].map(lambda text: len(text.replace(" ", "")))
# Punctuation feature: punct_count applied to the raw (uncleaned) tweet text.
data['%_punctuation'] = data['body_text'].map(punct_count)
data.head()

Unnamed: 0,label,body_text,tidy_text,body_length,%_punctuation
0,0,"@jamielewislewis i cant believe it, it really ...",i cant believ it it realli doesnt belong there...,114,4.4
1,0,Had a dream about a walk in fast food resturau...,had a dream about a walk in fast food resturau...,96,1.0
2,0,hates @internet @explrer (angry)(angry) **but ...,hate angri angri but no choic http plurk com p...,75,21.3
3,0,@federalcase I said I go out for eat 5:negneg...,i said i go out for eat negneg p m i disappoin...,58,8.6
4,0,@babykates7 yeah they won't do the surgery til...,yeah they won t do the surgeri till the cold i...,82,6.1
