# Sentiment Analysis

In [13]:
#Required Imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection  import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
import numpy as np
import pandas as pd
import random

In [18]:
#Reading The Data Into A DataFrame
data_train = pd.read_csv('original_train_data.csv',delimiter="\t", quoting=3, names=['Sentiment','Text'])
data_train.head()

Unnamed: 0,Sentiment,Text
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [3]:
#Reading The Testing Data Into DataFrame 
data_test = pd.read_csv('original_test_data.csv', header=None, delimiter="\t", quoting=3, names=['Text'])
data_test.head()

Unnamed: 0,Text
0,""" I don't care what anyone says, I like Hillar..."
1,have an awesome time at purdue!..
2,"Yep, I'm still in London, which is pretty awes..."
3,"Have to say, I hate Paris Hilton's behavior bu..."
4,i will love the lakers.


In [4]:
#Dataset Summary
print('Length Of The Training Dataset: ',len(data_train))
print('Length Of The Testing Dataset: ',len(data_test))
print('Number Of Positive Reviews In Training Dataset: ', sum(data_train.Sentiment == 1))
print('Number Of Negative Reviews In Testing Dataset: ', sum(data_train.Sentiment == 0))

Length Of The Training Dataset:  7086
Length Of The Testing Dataset:  33052
Number Of Positive Reviews In Training Dataset:  3995
Number Of Negative Reviews In Testing Dataset:  3091


In [5]:
#Lets Seperate The Data Into Features And Labels
features_train = data_train.Text
labels_train = data_train.Sentiment

In [6]:
#Lets Create A MultiNomialNB Classifier 
model = make_pipeline(CountVectorizer(stop_words='english', max_features=100), MultinomialNB())

#Lets Fit The Data To The Model 
model.fit(features_train,labels_train)

Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=100, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [7]:
#Now Lets Use This Model To Predict Labels For Test Data
labels = model.predict(data_test.Text)
print(labels)

[1 1 1 ..., 1 1 0]


In [8]:
#Lets Print Out Some Random Elements Of Test Data To Verify Results.
for i in range(0,10):
    k = random.randint(0,101) 
    print(data_test.iloc[k][0],'------------->', labels[k],'\n')

I want a ThinkPad or something. -------------> 1 

harvard is for dumb people. -------------> 1 

i will love the lakers. -------------> 1 

and honda's are awesome:). -------------> 1 

I like Honda... -------------> 1 

seattle sucks anyways. -------------> 0 

I like honda civics!!!!!!. -------------> 1 

I think Angelina Jolie is so much more beautiful than Jennifer Anniston, who, by the way, is majorly OVERRATED. -------------> 1 

i liked MIT though, esp their little info book( -------------> 1 

Before I left Missouri, I thought London was going to be so good and cool and fun and a really great experience and I was really excited. -------------> 1 



In [9]:
#Now Lets Use The Above Model To Predict The Sentiment On Some Random Data.
def predict(s, model=model):
    pred = model.predict([s])
    return pred[0]

In [10]:
predict('I would Love To Vist London City')

1

In [11]:
predict('I Hate Loud Music')

0