In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("IMDB Dataset.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [5]:
df['Category'] = df['sentiment'].apply(lambda x: 1 if x=='positive' else 0)

In [6]:
df.head()

Unnamed: 0,review,sentiment,Category
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [7]:
X_train,X_test,y_train,y_test = train_test_split(df.review,df.Category,test_size=0.20)

In [8]:
X_train.shape

(40000,)

In [9]:

X_test.shape

(10000,)

In [10]:
X_train[:5]

15211    I saw this short film on the dvd for Ridley Sc...
37397    Kim Basinger and Mickey Rourke star in this co...
38668    Dirty Dancing follows the story of Frances 'Ba...
18667    This movie rivals "Plan 9" as one of the dumbe...
7050     "River's Edge" was one of the most disturbing ...
Name: review, dtype: object

In [11]:
y_train[:5]

15211    0
37397    0
38668    1
18667    0
7050     1
Name: Category, dtype: int64

In [12]:
clf = Pipeline([
    ('vectorizer', CountVectorizer()),                                                    
    ('random_forest', (RandomForestClassifier(n_estimators=50, criterion='entropy')))      
])


clf.fit(X_train, y_train)


y_pred = clf.predict(X_test)


print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84      4996
           1       0.84      0.84      0.84      5004

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



In [13]:
clf = Pipeline([
    ('vectorizer', CountVectorizer()),                                                    
    ('KNN', (KNeighborsClassifier(n_neighbors=10, metric='euclidean')))      
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.67      0.66      4996
           1       0.66      0.64      0.65      5004

    accuracy                           0.66     10000
   macro avg       0.66      0.66      0.66     10000
weighted avg       0.66      0.66      0.66     10000



In [14]:
clf = Pipeline([
    ('vectorizer', CountVectorizer()),                                                    
    ('Multi_NB', (MultinomialNB()))      
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.88      0.85      4996
           1       0.87      0.82      0.84      5004

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [15]:
# Therefore ww got an accuracy of 85% from two models