<h3><u>Sentiment Analysis on IMDB movie reviews using TF-IDF weighting scheme, Sentiment Score and Mutlinomial Naives Bayes Classification</u></h3>
<h4>Girija Prakash Shingte<br>shingte.girija@gmail.com

In [1]:
#Import necessary libraries
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
#clean the data by removing stopwords, special symbols and digits
def data_cleaning(file_name):
    reviews = open(file_name,'r')
    list_of_stopwords = list(set(stopwords.words()))
    list_of_digits = re.compile('[0-9]')
    rev = []
    lbl = []
    for review in reviews:
        sent = review.split('\t')[0]
        label = review.split('\t')[1]
        wordlist = []
        sent = sent.split(' ')
        for word in sent:
            word = word.lower()
            #remove stopwords
            if(word not in list_of_stopwords
               #check if digits are present
               and not list_of_digits.search(word)
               #check if special symbols are present
               and word.isalnum()
               #check if length of word is not less than 1
               and not len(word)<=1):
                wordlist.append(word)
        #check if the wordlist is not empty
        if(len(wordlist)>1):
            #convert wordlist back to sentence and append to review list
            rev.append(' '.join(wordlist))
            #append label
            #label is converted from string to integer
            lbl.append(int(label.rstrip('\n')))
    return(rev,lbl)

rev, lbl = data_cleaning('imdb_labelled.txt')

In [3]:
#find tf-idf score for all the terms in every sentence and sum it up
def find_tfidfscore(rev):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(rev).todense()
    X = pd.DataFrame(X, columns=vectorizer.get_feature_names())
    X['total'] = X.sum(axis=1)
    tfidf = list(X['total'])
    return(tfidf)
    
tfidf = find_tfidfscore(rev)

In [4]:
#find sentiment score for each term in the sentence: positive, negative and neutrality
def find_sentimentscore(rev):
    positive_sentiment = []
    negative_sentiment = []
    neutrality = []
    for review in rev:
        positive = 0
        negative = 0
        neutral = 0
        pos = pos_tag(review)
        for item in pos:
            synset = swn.senti_synsets(item[0])
            for x in synset:
                positive+= x.pos_score()
                negative+= x.neg_score()
                neutral+= x.obj_score()
        positive_sentiment.append(round(positive,3))
        negative_sentiment.append(round(negative,3))
        neutrality.append(round(neutral))
    return(positive_sentiment, negative_sentiment, neutrality)

positive_sentiment, negative_sentiment, neutrality = find_sentimentscore(rev)

In [107]:
#create dataframe
df = pd.DataFrame()
df['tfidf'] = tfidf
df['positive_sentiment'] = positive_sentiment
df['negative_sentiment'] = negative_sentiment
df['neutrality'] = neutrality
df['label'] = lbl
df.head()

Unnamed: 0,tfidf,positive_sentiment,negative_sentiment,neutrality,label
0,1.919818,1.0,1.5,124,0
1,2.623103,2.375,2.25,208,0
2,3.927078,6.375,5.75,554,0
3,1.987891,1.0,1.5,134,0
4,2.935115,3.0,1.75,245,1


In [108]:
#Train-test split
X = df.drop('label',axis=1)
y = df[['label']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=300)

In [109]:
#Fit model
naivebayes = MultinomialNB(alpha = 0.9)
naivebayes.fit(X_train, y_train.values.ravel())

MultinomialNB(alpha=0.9, class_prior=None, fit_prior=True)

In [110]:
#Predict result
y_pred = list(naivebayes.predict(X_test))
y_test = list(y_test['label'])

In [111]:
print(accuracy_score(y_test, y_pred))  

0.5714285714285714


<h3>Results:</h3>
<ul>
    <li>Accuracy using TF-IDF weighting and Sentiment Score is poor</li>
</ul>  
<h3>Scope of Improvement:</h3>
<ul>
    <li>Using a different classifier(SVM, Logistic Regression)
    <li>Using CountVectorizer in place of TF-IDF weighting scheme

<h3>References:</h3>
<ul>
<li>Uci.edu. (2015). UCI Machine Learning Repository: Sentiment Labelled Sentences Data Set. [online] Available at: https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences.