In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


# Movie Review Classification Using Bag Of Words

In [2]:
#import Statments

import pandas as pd
import numpy as np
import sklearn
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from spacy.lang.en.stop_words import STOP_WORDS

In [3]:
# loading Data into a Dataframe

df=pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head(4)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative


In [4]:
# Creating a new feature class 

df['class']= df['sentiment'].apply(lambda x : 1 if x=='positive' else 0)

In [5]:
df=df.drop('sentiment', axis='columns')
df.head(4)

Unnamed: 0,review,class
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0


In [6]:
df.shape

(50000, 2)

In [7]:
#pre-processing(remove all Stop_Words, lemmatization, stemming)
nlp=spacy.load('en_core_web_sm')
def preprocess(text):
    doc = nlp(text)
    
    no_stop_words = [token.text for token in doc if not token.is_stop or token.is_punct or token.is_space]
    return " ".join(no_stop_words) 


In [8]:
df['new_review']=df.review.apply(preprocess)

In [9]:
df.shape
df=df.drop(['review'],axis='columns')

In [11]:
# spliting the data into training and testing sets

X_train,X_test,y_train,y_test= train_test_split(df.new_review,df['class'],test_size=0.2)

In [12]:
X_train.shape

(40000,)

In [13]:
X_test.shape

(10000,)

In [14]:
# Count Vectorizer for converting reviews into a Bag of Words

from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
X_train_cv= cv.fit_transform(X_train)
X_train_cv.shape

(40000, 92319)

In [15]:
X_train_cv.toarray()[:4]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Naive Bayes Model 

In [16]:
# creating a Model

from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()


In [17]:
model.fit(X_train_cv,y_train)

MultinomialNB()

In [18]:
X_test_cv=cv.transform(X_test)
X_test_cv.shape

(10000, 92319)

In [19]:
y_predicted= model.predict(X_test_cv)

In [20]:
print(classification_report(y_test,y_predicted))

              precision    recall  f1-score   support

           0       0.85      0.88      0.87      5061
           1       0.88      0.84      0.86      4939

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



# Random Forest Model

In [21]:
# creating a Random Forest Model

from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier(n_estimators=50,criterion='entropy')
model.fit(X_train_cv,y_train)

RandomForestClassifier(criterion='entropy', n_estimators=50)

In [22]:
model.score(X_test_cv,y_test)

0.8514

In [23]:
y_predicted= model.predict(X_test_cv)

In [24]:
print(classification_report(y_test,y_predicted))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      5061
           1       0.85      0.85      0.85      4939

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



# KNN Model

In [25]:
from sklearn.neighbors import KNeighborsClassifier
model=KNeighborsClassifier(n_neighbors=10,metric='euclidean')
model.fit(X_train_cv,y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=10)

In [26]:
model.score(X_test_cv,y_test)

0.6093

In [27]:
y_predicted= model.predict(X_test_cv)

In [28]:
print(classification_report(y_test,y_predicted))

              precision    recall  f1-score   support

           0       0.58      0.80      0.67      5061
           1       0.67      0.41      0.51      4939

    accuracy                           0.61     10000
   macro avg       0.63      0.61      0.59     10000
weighted avg       0.63      0.61      0.59     10000



# Result: Random Forest and Naive Bayes gives good results
## Note: KNN does not perform well for high dimensional features as it becomes difficult to calculate distance in each dimension