# Movie Review Classification Using Bag Of Words.

In [1]:
# Import statements

import pandas as pd
import numpy as np
import sklearn
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Read the IMDB reviews CSV (path is relative to the working directory)
# into a DataFrame, then preview the first four rows.

df = pd.read_csv('IMDB Dataset.csv')
df.head(n=4)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative


In [3]:
# Encode the text label as a binary target column:
# 1 where sentiment is 'positive', 0 for every other value.
# A vectorized comparison replaces the per-row Python lambda; the values are the same.

df['class'] = (df['sentiment'] == 'positive').astype('int64')

In [4]:
# Drop the original text label (the numeric 'class' column replaces it)
# and preview the first four rows of the result.
df = df.drop(columns='sentiment')
df.head(n=4)

Unnamed: 0,review,class
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0


In [5]:
# Split the reviews and labels into training (80%) and test (20%) sets.
# random_state pins the shuffle so the split, and every metric computed from it,
# is identical on each Restart & Run All.

X_train, X_test, y_train, y_test = train_test_split(
    df.review, df['class'], test_size=0.2, random_state=42
)

In [6]:
# Size of the training split (number of reviews).
X_train.shape

(40000,)

In [7]:
# Size of the test split (number of reviews).
X_test.shape

(10000,)

In [8]:
# Turn each review into a bag-of-words count vector.
# The vocabulary is learned from the training reviews only; the last line
# shows the (documents, vocabulary terms) shape of the resulting matrix.

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)
X_train_cv.shape

(40000, 92942)

In [9]:
# Peek at the first four count vectors.
# Slice the sparse matrix first and densify only those four rows: calling
# .toarray() on the full matrix would allocate a dense documents x vocabulary
# array just to display four of its rows.
X_train_cv[:4].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Naive Bayes Model 

In [10]:
# Instantiate a Multinomial Naive Bayes classifier with its default settings.

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()


In [11]:
# Train the classifier on the training count matrix and its labels.
model.fit(X_train_cv,y_train)

In [12]:
# Vectorize the test reviews with the vocabulary already fitted on the training
# set (transform only, no refit), so train and test matrices share the same columns.
X_test_cv=cv.transform(X_test)
X_test_cv.shape

(10000, 92942)

In [13]:
# Predict a class label for every test review.
y_predicted= model.predict(X_test_cv)

In [14]:
# Per-class precision, recall and F1 of the predictions against the true test labels.
print(classification_report(y_test,y_predicted))

              precision    recall  f1-score   support

           0       0.82      0.87      0.85      5005
           1       0.87      0.81      0.84      4995

    accuracy                           0.84     10000
   macro avg       0.85      0.84      0.84     10000
weighted avg       0.85      0.84      0.84     10000



# Random Forest Model

In [15]:
# Train a random forest of 50 trees using entropy as the split criterion.
# random_state fixes the randomness used while building the trees, so the
# fitted model and its scores are reproducible across runs.

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=50, criterion='entropy', random_state=42)
model.fit(X_train_cv, y_train)

In [16]:
# Score the fitted model on the held-out test set (mean accuracy for a classifier).
model.score(X_test_cv,y_test)

0.8374

In [17]:
# Predict a class label for every test review with the current model.
y_predicted= model.predict(X_test_cv)

In [18]:
# Per-class precision, recall and F1 of the predictions against the true test labels.
print(classification_report(y_test,y_predicted))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84      5005
           1       0.84      0.84      0.84      4995

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



# KNN Model

In [19]:
# Fit a 10-nearest-neighbours classifier that compares reviews by the
# Euclidean distance between their count vectors.

from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=10, metric='euclidean')
model.fit(X_train_cv, y_train)

In [20]:
# Score the fitted model on the held-out test set (mean accuracy for a classifier).
model.score(X_test_cv,y_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.654

In [21]:
# Predict a class label for every test review with the current model.
y_predicted= model.predict(X_test_cv)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [22]:
# Per-class precision, recall and F1 of the predictions against the true test labels.
print(classification_report(y_test,y_predicted))

              precision    recall  f1-score   support

           0       0.65      0.66      0.66      5005
           1       0.65      0.65      0.65      4995

    accuracy                           0.65     10000
   macro avg       0.65      0.65      0.65     10000
weighted avg       0.65      0.65      0.65     10000



## Result: Random Forest and Naive Bayes both give good results (about 84% accuracy), while KNN reaches only about 65%

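### Note: KNN does not perform well on high-dimensional, sparse features such as bag-of-words counts. With tens of thousands of dimensions, the distances between reviews become nearly uniform, so the nearest neighbours carry little information about the class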