<h3 align='center'>Sentiment Analysis with movie review</h3>

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.utils import shuffle
# LIbrary to Clean the texts
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 
ps = PorterStemmer()


# Importing the dataset
TrainDataset = pd.read_csv('Train.csv')
TrainDataset=shuffle(TrainDataset)
TestDataset = pd.read_csv('Test.csv')
TestDataset=shuffle(TestDataset)

#load the train and test split
X_train=TrainDataset['Review']
y_train=TrainDataset.iloc[:,1].values
X_test=TestDataset['Review']
y_test=TestDataset.iloc[:,1].values

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/elisontuscano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/elisontuscano/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def getCleanReview(review):
    #only keep alphabets remove rest
    review = re.sub('[^a-zA-Z]', ' ', review)
    #turn all reviews into lowercase
    review = review.lower()
    review = review.split()
    #remove stopwords and do stemming
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    #Lemmatize and merge the review together after making all the changes
    review = ' '.join([lemmatizer.lemmatize(word) for word in review])
    return review

Convert the following review into bag of vectors

In [3]:
X_train[0]

'For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.'

In [4]:
#set up tfidfvectorizor
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizor=TfidfVectorizer(stop_words='english', max_df=0.7,max_features=10000)

#fit and transform train and test set
tfidf_train=tfidf_vectorizor.fit_transform(X_train).toarray()
tfidf_test=tfidf_vectorizor.transform(X_test).toarray()
tfidf_train.shape

(25000, 10000)

In [5]:
tfidf_train[0]

array([0., 0., 0., ..., 0., 0., 0.])

<h3 align='center'>Fit the data with naive bayes</h3>

In [6]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
NaiveClassifier = GaussianNB()
NaiveClassifier.fit(tfidf_train, y_train)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = NaiveClassifier, X = tfidf_train, y = y_train, cv = 10)
accuracies.mean()

0.78464

In [7]:
#checking the accuracy on test data
accuracies = cross_val_score(estimator = NaiveClassifier, X = tfidf_test, y = y_test, cv = 10)
accuracies.mean()

0.77908

<h3 align='center'>Comparison with Random Forest Algorithm</h3>

In [8]:
from sklearn.ensemble import RandomForestClassifier
RFClassifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
RFClassifier.fit(tfidf_train, y_train)

# Applying k-Fold Cross Validation
accuracies = cross_val_score(estimator = RFClassifier, X = tfidf_train, y = y_train, cv = 10)
accuracies.mean()

0.79152

In [9]:
#checking the accuracy on test data
accuracies = cross_val_score(estimator =RFClassifier, X = tfidf_test, y = y_test, cv = 10)
accuracies.mean()

0.7932400000000001

<h3 align='center'>Improvement with Artificial neural network</h3>

In [10]:
y_train

array(['pos', 'pos', 'neg', ..., 'neg', 'pos', 'pos'], dtype=object)

In [11]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoderY = LabelEncoder()
Y_train= labelencoderY.fit_transform(y_train)
Y_test=labelencoderY.transform(y_test)

In [12]:
Y_train

array([1, 1, 0, ..., 0, 1, 1])

In [13]:
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

# Initialising the ANN
classifier = Sequential()
# Adding the input layer and the first hidden layer
classifier.add(Dense(output_dim = 256, init = 'uniform', activation = 'relu', input_dim = 10000))
# Adding the second hidden layer
classifier.add(Dense(output_dim = 64, init = 'uniform', activation = 'relu'))
# Adding the output layer
classifier.add(Dense(output_dim = 1, init = 'uniform', activation = 'sigmoid'))
# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
# Fitting the ANN to the Training set
classifier.fit(tfidf_train, Y_train, batch_size = 10, nb_epoch = 5,validation_data=(tfidf_test,Y_test))

Using TensorFlow backend.
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]


Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x1b254d8890>

In [19]:
classifier.save('model/ann_model.h5')
score = classifier.evaluate(tfidf_test, Y_test, verbose=0)
print('Test accuracy : {} %'. format(score[1]*100))

Test accuracy : 84.30799841880798 %
