# Introduction to Sentiment Analysis
Creating a simple sentiment analysis model using bag-of-words and SVM

In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.model_selection import cross_val_score

import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
from sklearn.feature_extraction.text import CountVectorizer

import datetime

In [2]:
#Load dataset
# The CSV has no header row; only columns 0 and 5 are read and named target / text.
data = pd.read_csv('../data/training.1600000.processed.noemoticon.csv',
                    encoding='latin-1',
                    header = None,
                    usecols=[0,5],
                    names=['target','text'])

# Work on a 50,000-row random subset. random_state pins the draw so that the
# train/test split and every score below are reproducible on Restart & Run All.
data = data.sample(n=50000, random_state=42).reset_index(drop=True)
print(len(data))
data.head()

50000


Unnamed: 0,target,text
0,4,"New look, new life, new love"
1,4,@creolesugar i can always write more. which ...
2,4,@ethand you have so much to say that you need ...
3,0,@WenrichCaps i miss you too Maybe i can see ...
4,0,I have a spot on my back. Do you think room se...


## Clean dataset

In [3]:
def clean_text(text):
    """
    Normalise a piece of text for bag-of-words modelling.

    Steps:
        - Lowercase the text
        - Split it into tokens with word_tokenize
        - Drop tokens found in the notebook-level `stopwords` list
        - Drop tokens that are not purely alphanumeric (this is how
          punctuation is removed)

    params
    ------------
        text: string

    returns
    ------------
        string: the surviving tokens joined by single spaces
    """

    # A token is kept only if it passes both filters; the two checks are
    # independent, so one pass gives the same result as filtering twice.
    kept = [
        token
        for token in word_tokenize(text.lower())
        if token not in stopwords and token.isalnum()
    ]

    return " ".join(kept)

# Show one raw tweet (row label 3) followed by its cleaned version
text = data.loc[3, 'text']
print(text)
clean_text(text)

@WenrichCaps i miss you too   Maybe i can see you guys tonight, if it's not too late..


'wenrichcaps miss maybe see guys tonight'

In [4]:
#Update dataset
# Replace every tweet with its cleaned form, one call to clean_text per row
data['text'] = data['text'].map(clean_text)
data.head()

Unnamed: 0,target,text
0,4,new look new life new love
1,4,creolesugar always write lesson r u referring ...
2,4,ethand much say need 2 twitter accounts
3,0,wenrichcaps miss maybe see guys tonight
4,0,spot back think room service come get downside...


In [5]:
# First 40,000 rows are used for training, the next 10,000 for testing.
# The test frame is re-indexed from 0; the training frame keeps its index.
train = data.iloc[:40000]
test = data.iloc[40000:50000].reset_index(drop=True)
print(len(train), len(test))

40000 10000


## Feature Engineering

In [6]:
# Create count vectoriser
# Word-level unigrams and bigrams, capped at 500 features
vectorizer = CountVectorizer(analyzer='word',
                        ngram_range=(1, 2)
                        ,max_features=500)

#Transform training corpus into feature matrix
X = vectorizer.fit_transform(train['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Dense frame with one column per vocabulary term
x_train = pd.DataFrame(data=X.toarray(),columns=feature_names)
y_train = train['target']

In [7]:
# Transform testing corpus into feature matrix
# transform() (not fit_transform) reuses the vectoriser already fitted on the
# training text, so the test columns line up with feature_names.
X = vectorizer.transform(test['text'])

y_test = test['target']
x_test = pd.DataFrame(X.toarray(), columns=feature_names)

In [8]:
# Min-Max scaling: (x - min) / (max - min)
# Statistics come from the training set only and are reused for the test set.
x_min = x_train.min()
x_max = x_train.max()

# A column that is constant in the training set has max == min; use a range
# of 1 there so the division yields 0 instead of NaN/inf.
x_range = (x_max - x_min).replace(0, 1)

x_train = (x_train - x_min) / x_range
x_test = (x_test - x_min) / x_range

In [9]:
# Row count of the scaled test matrix, then a preview of its first rows
print(x_test.shape[0])
x_test.head()

10000


Unnamed: 0,10,100,able,actually,add,afternoon,ago,agree,ah,almost,...,ya,yay,yea,yeah,year,years,yes,yesterday,yet,youtube
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Modelling

In [18]:
def svm_kfold(C, cv=5):
    """
    Returns the k-fold cross-validation score for a given C parameter.

    A linear-kernel SVC is evaluated with cross_val_score on the
    notebook-level x_train / y_train, using all available cores (n_jobs=-1).

    params
    ------------
        C: float
            Regularisation parameter passed to svm.SVC
        cv: int, default 5
            Number of cross-validation folds passed to cross_val_score

    returns
    ------------
        Mean of the per-fold scores, rounded to 2 decimal places
    """
    clf = svm.SVC(kernel='linear', C=C)
    scores = cross_val_score(clf,
                         X = x_train,
                         y = y_train,
                         cv=cv,
                         n_jobs=-1)

    # Collapse the per-fold scores into a single rounded figure
    return round(np.mean(scores), 2)

In [19]:
# Wall-clock timing of one cross-validation run at C=1
begin = datetime.datetime.now()

print(svm_kfold(C=1))

# Elapsed time as a timedelta
print(datetime.datetime.now() - begin)

KeyboardInterrupt: 

In [11]:
#tune C hyper-parameter
# One cross-validated score per candidate value, printed as "C: score"
print("C Parameter:")
for C in (0.001, 0.01, 0.1, 1, 10):
    score = svm_kfold(C)
    print("{}: {}".format(C, score))

C Parameter:
0.001: 0.68
0.01: 0.72
0.1: 0.73
1: 0.73


KeyboardInterrupt: 

In [None]:
#final model
begin = datetime.datetime.now()

# Fit a linear-kernel SVM with C=1 on the full training set
clf = svm.SVC(kernel='linear', C=1).fit(x_train, y_train)

# Predict with the classifier fitted in this cell
y_pred = clf.predict(x_test)

#accuracy on test set
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy: {}".format(accuracy))

#confusion matrix
# Printed explicitly: a bare expression is only displayed when it is the
# last statement of a cell.
conf_matrix = confusion_matrix(y_test,y_pred)
print(conf_matrix)

# Elapsed time for fitting plus evaluation
print(datetime.datetime.now() - begin)

Accuracy: 0.711
