# Movie Review Classification Using Bag Of Words.

In [4]:
# Import statements

import pandas as pd
import numpy as np
import sklearn
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from spacy.lang.en.stop_words import STOP_WORDS

In [5]:
# Load the IMDB review dataset from a CSV file in the working directory into a
# DataFrame, then preview the first four rows.

df=pd.read_csv('IMDB Dataset.csv')
df.head(4)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative


In [6]:
# Encode the sentiment label as a binary target column:
# 1 when the label is 'positive', 0 for anything else.

df['class'] = (df['sentiment'] == 'positive').astype('int64')

In [7]:
# The text label is now duplicated by the numeric `class` column, so remove it.
df = df.drop(columns=['sentiment'])
df.head(n=4)

Unnamed: 0,review,class
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0


In [8]:
# Keep only the first 10,000 reviews (presumably to keep the spaCy
# pre-processing time manageable -- TODO confirm).
# .copy() makes the subset an independent frame, so adding the `new_review`
# column to it later does not raise pandas' SettingWithCopyWarning.
df=df[:10000].copy()
df.shape

(10000, 2)

In [9]:
# Pre-processing setup: load spaCy's small English pipeline, which the
# preprocess() function defined below uses to tokenize each review.
nlp=spacy.load('en_core_web_sm')
def preprocess(text):
    """Strip stop words, punctuation and whitespace tokens from ``text``.

    The text is tokenized with the module-level spaCy pipeline ``nlp``.
    Surviving tokens are kept in their original surface form (no
    lemmatization or stemming is applied) and re-joined with single spaces.

    Args:
        text: Raw review text.

    Returns:
        The remaining tokens joined by a single space.
    """
    doc = nlp(text)

    # The parentheses matter: without them `not` binds only to `is_stop`, and
    # the trailing `or` clauses let punctuation and whitespace tokens through
    # instead of filtering them out.
    kept_tokens = [
        token.text
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(kept_tokens)


In [10]:
# Clean every review with preprocess() and store the result in a new column.
df['new_review'] = df['review'].map(preprocess)

In [11]:
# Drop the raw text now that the cleaned version lives in `new_review`.
# The shape is evaluated last so the notebook actually displays it; a bare
# expression that is not the final line of a cell produces no output.
df=df.drop(['review'],axis='columns')
df.shape

In [108]:
# Splitting the data into training (80%) and testing (20%) sets.
# random_state pins the shuffle so the split, and every score computed from it,
# is the same on each run.

X_train,X_test,y_train,y_test= train_test_split(df.new_review,df['class'],test_size=0.2,random_state=42)

In [109]:
# Number of reviews in the training split.
X_train.shape

(8000,)

In [110]:
# Number of reviews in the test split.
X_test.shape

(2000,)

In [111]:
# Count Vectorizer for converting reviews into a Bag of Words.
# The vocabulary is learned from the training reviews only (fit_transform on
# X_train); the result has one row per review and one column per vocabulary term.

from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
X_train_cv= cv.fit_transform(X_train)
X_train_cv.shape

(8000, 47828)

In [112]:
# Peek at the first four encoded reviews. Slice the sparse matrix first so only
# those four rows are densified, rather than materialising the whole
# document-term matrix in memory just to discard most of it.
X_train_cv[:4].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Naive Bayes Model 

In [126]:
# Creating a model: a multinomial Naive Bayes classifier with default settings.
# It is bound to the shared name `model`, which later sections rebind.

from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()


In [127]:
# Train on the count-vectorized training reviews and their 0/1 class labels.
model.fit(X_train_cv,y_train)

In [128]:
# Encode the test reviews with the vectorizer already fitted on the training
# set (transform only, no refit), so the columns line up with X_train_cv.
X_test_cv=cv.transform(X_test)
X_test_cv.shape

(2000, 47828)

In [129]:
# Score the fitted Naive Bayes model on the held-out test set.
model.score(X_test_cv,y_test)

0.8585

In [130]:
# Predicted class labels for the test set, used for the per-class report.
y_predicted= model.predict(X_test_cv)

In [131]:
# Per-class precision, recall and F1 for the Naive Bayes predictions.
print(classification_report(y_test,y_predicted))

              precision    recall  f1-score   support

           0       0.83      0.88      0.86       965
           1       0.88      0.84      0.86      1035

    accuracy                           0.86      2000
   macro avg       0.86      0.86      0.86      2000
weighted avg       0.86      0.86      0.86      2000



# Random Forest Model

In [132]:
# Creating a Random Forest model: 50 trees, splits chosen by entropy.
# random_state pins the forest's internal randomness so the reported score is
# reproducible from run to run.
# Note: this rebinds `model`, replacing the Naive Bayes classifier.

from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier(n_estimators=50,criterion='entropy',random_state=42)
model.fit(X_train_cv,y_train)

In [133]:
# Score the fitted Random Forest (the current `model`) on the held-out test set.
model.score(X_test_cv,y_test)

0.8485

In [134]:
# Predicted class labels from the Random Forest; overwrites the earlier `y_predicted`.
y_predicted= model.predict(X_test_cv)

In [135]:
# Per-class precision, recall and F1 for the Random Forest predictions.
print(classification_report(y_test,y_predicted))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85       965
           1       0.86      0.84      0.85      1035

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000



# KNN Model

In [122]:
# k-nearest neighbours with k=10 and Euclidean distance on the count vectors.
# This rebinds `model` again, replacing the previously fitted classifier.
# NOTE(review): this section's execution counts (122-125) are lower than those of
# the Naive Bayes and Random Forest sections (126-135), so it was last run before
# them -- re-run the notebook top to bottom to confirm the reported numbers.
from sklearn.neighbors import KNeighborsClassifier
model=KNeighborsClassifier(n_neighbors=10,metric='euclidean')
model.fit(X_train_cv,y_train)

In [123]:
# Score the fitted KNN classifier (the current `model`) on the held-out test set.
model.score(X_test_cv,y_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.5745

In [124]:
# Predicted class labels from KNN; overwrites the earlier `y_predicted`.
y_predicted= model.predict(X_test_cv)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [125]:
# Per-class precision, recall and F1 for the KNN predictions.
print(classification_report(y_test,y_predicted))

              precision    recall  f1-score   support

           0       0.54      0.83      0.65       965
           1       0.68      0.34      0.45      1035

    accuracy                           0.57      2000
   macro avg       0.61      0.58      0.55      2000
weighted avg       0.61      0.57      0.55      2000



## Result: Naive Bayes and Random Forest both give good results (about 0.86 and 0.85 test accuracy), while KNN reaches only about 0.57

### Note: KNN does not perform well on high-dimensional, sparse features such as these bag-of-words vectors, because distances between points become less informative as the number of dimensions grows