# Importing the dependancies 

In [1]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.metrics import *
from nltk.stem import PorterStemmer

In [2]:
#Reading the data into a Pandas dataframe
df = pd.read_csv("IMDB Dataset.csv",low_memory=False)

In [3]:
#Changing the sentiment part in the data to boolean
df["sentiment"].replace({"positive": 1, "negative": 0}, inplace=True)

In [4]:
print(df)

                                                  review  sentiment
0      One of the other reviewers has mentioned that ...          1
1      A wonderful little production. <br /><br />The...          1
2      I thought this was a wonderful way to spend ti...          1
3      Basically there's a family where a little boy ...          0
4      Petter Mattei's "Love in the Time of Money" is...          1
...                                                  ...        ...
49995  I thought this movie did a down right good job...          1
49996  Bad plot, bad dialogue, bad acting, idiotic di...          0
49997  I am a Catholic taught in parochial elementary...          0
49998  I'm going to have to disagree with the previou...          0
49999  No one expects the Star Trek movies to be high...          0

[50000 rows x 2 columns]


# Preprocessing

In [5]:
#For lemmatizing or stemming
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

#Getting the english stopwords
stopwords = stopwords.words('english')

#For each review removes the non alphanumeric characters and stopwords and converts to lower case
#To use stemming or lemmatizing please uncomment the corresponding part
for i in range(len(df["review"])):
    row=df["review"][i]
    my_list = row.split(" ")
    words = []
    for w in my_list:
         if (w.isalnum()) and w not in stopwords:
            w=w.lower()
            #w=stemmer.stem(w)
            #w=lemmatizer.lemmatize(w)
            words.append(w)
    my_string = ' '.join(words)
    #print(my_string)
    df["review"][i]=my_string
            
 
   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Analyzing the linguistic features in the data

In [6]:
#Takes the words in the positive and negative reviews separately
pos=[]
neg=[]
for i in range(len(df["sentiment"])):
    list=df['review'][i].split(" ")
    if(df['sentiment'][i]==1):
        for el in list:
            pos.append(el)
    else:
        for el in list:
            neg.append(el)
#Converts the list to pandas dataframe
df_pos=pd.DataFrame(pos)
df_neg=pd.DataFrame(neg)

In [7]:
#Calculates the frequency distributions
posfreq = nltk.FreqDist(df_pos[0])
negfreq = nltk.FreqDist(df_neg[0])

In [8]:
#Printing the 5 most common words in both positive and negative reviews
pos_most_common_list=posfreq.most_common(5)
neg_most_common_list=negfreq.most_common(5)
print ("The 5 most common words in positive reviews:\n")
for i,j in pos_most_common_list:
    print("'"+str(i)+"' occurred "+str(j)+" times")
print ("\nThe 5 most common words in negative reviews:\n")
for i,j in neg_most_common_list:
    print("'"+str(i)+"' occurred "+str(j)+" times")
#As you can see the most common words are common in both positive and negative reviews

The 5 most common words in positive reviews:

'i' occurred 62219 times
'the' occurred 34324 times
'film' occurred 29361 times
'movie' occurred 26678 times
'one' occurred 23268 times

The 5 most common words in negative reviews:

'i' occurred 70269 times
'the' occurred 35774 times
'movie' occurred 34805 times
'film' occurred 25717 times
'one' occurred 21710 times


In [9]:
#Checking the frequency of certain words in positive and negative reviews
print("The number of occurances of the word 'great'in positive reviews are "+str(df_pos[0].value_counts()['great']))
print("The number of occurances of the word 'great'in negative reviews are "+str(df_neg[0].value_counts()['great']))
print("The number of occurances of the word 'worst'in positive reviews are "+str(df_pos[0].value_counts()['worst']))
print("The number of occurances of the word 'worst'in negative reviews are "+str(df_neg[0].value_counts()['worst']))
#As you can see some words occur more in positive reviews and some others occur more in negative reviews

The number of occurances of the word 'great'in positive reviews are 11139
The number of occurances of the word 'great'in negative reviews are 4326
The number of occurances of the word 'worst'in positive reviews are 371
The number of occurances of the word 'worst'in negative reviews are 4411


In [10]:
#Calculating the avg number of words in positive and negative reviews and comparing both
poslensum=0
posnum=0
neglensum=0
negnum=0
for i in range(len(df["sentiment"])):
    list=df['review'][i].split(" ")
    if(df['sentiment'][i]==1):
        poslensum+=len(list)
        posnum+=1
    else:
        neglensum+=len(list)
        negnum+=1
        
print("The avg length of positive review is "+str(poslensum/posnum)+" words")
print("The avg length of negative review is "+str(neglensum/negnum)+" words")

The avg length of positive review is 100.10028 words
The avg length of negative review is 98.01312 words


# Dividing the data into training data and testing data

In [11]:
#40000 is for training and rest 10000 is for testing
df_train = df.iloc[:40000,:]
df_test = df.iloc[40001:,:]

# 1.Using Support Vector Machines

In [12]:
#Converting the data to vector format to feed into the model
from sklearn.feature_extraction.text import TfidfVectorizer

vector = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
svm_train = vector.fit_transform(df_train['review'])
svm_test = vector.transform(df_test['review'])

In [13]:
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

#Defining the classifier
classifier = svm.SVC(kernel='linear')

#Training
classifier.fit(svm_train, df_train['sentiment'])

#Prediction
prediction = classifier.predict(svm_test)

In [14]:
#Generating report with precision, recall,etc.
report = classification_report(df_test['sentiment'], prediction, output_dict=True)
print('positive: ', report['1'])
print('negative: ', report['0'])

positive:  {'precision': 0.8746280499900814, 'recall': 0.8805672059117235, 'f1-score': 0.8775875796178344, 'support': 5007}
negative:  {'precision': 0.8793868495361032, 'recall': 0.8733974358974359, 'f1-score': 0.8763819095477386, 'support': 4992}


In [15]:
#Calculating the accuracy
accuracy_score(df_test['sentiment'], prediction)

0.876987698769877

# 2.Using XGBoost

In [16]:
import xgboost as xgb

In [17]:
#Converting the data to vector format to feed into the model
from sklearn.feature_extraction.text import CountVectorizer

vector1 = CountVectorizer(binary = True) 
vector1.fit(df['review']) # find all the unique words from the training set
train_x = vector1.fit_transform(df_train['review'])
test_x = vector1.transform(df_test['review'])

In [18]:
#Converting the data to the required format to feed into the xgb model
xgb_train = xgb.DMatrix(train_x, df_train['sentiment'])
xgb_test = xgb.DMatrix(test_x, df_test['sentiment'])

In [19]:
#Defining the parameters for the model
param = {        
        'objective':'multi:softprob',
        'learning_rate':0.75,
        'max_depth':50,
        'num_class':2,
        'subsample':0.8,
        'colsample_bytree':0.8,
        'eval_metric':'mlogloss',
        'min_child_weight':10,
        'reg_alpha':1.5, 
        'reg_lambda':5,
        'scale_pos_weight':1,  
        'verbose':1,        
        'n_thread':-1 
    }

#Training the model
xgb_model =xgb.train(param,xgb_train,num_boost_round = 500)

Parameters: { "n_thread", "scale_pos_weight", "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [20]:
#Prediction
prediction1 = xgb_model.predict(xgb_test)

In [21]:
#Finding the appropriate class between 0 and 1 based on which has higher score
prediction1 = np.asarray([np.argmax(line) for line in prediction1])

In [22]:
#Generating report with precision, recall,etc.
report1 = classification_report(df_test['sentiment'], prediction1, output_dict=True)
print('positive: ', report1['1'])
print('negative: ', report1['0'])

positive:  {'precision': 0.8444755804643715, 'recall': 0.8426203315358498, 'f1-score': 0.8435469359192243, 'support': 5007}
negative:  {'precision': 0.8424945032980212, 'recall': 0.8443509615384616, 'f1-score': 0.8434217108554277, 'support': 4992}


In [23]:
#Calculating the accuracy
accuracy_score(df_test['sentiment'], prediction1)

0.8434843484348434

# References 

https://www.geeksforgeeks.org/python-stemming-words-with-nltk/
https://www.geeksforgeeks.org/python-lemmatization-with-nltk/
https://medium.com/@vasista/sentiment-analysis-using-svm-338d418e3ff1
https://ai.plainenglish.io/sentiment-classification-using-xgboost-7abdaf4771f9