In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("./moviereviews.csv")
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   2000 non-null   object
 1   review  1965 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [4]:
df = df.dropna()

In [5]:
df['review'].str.isspace().sum()

27

In [6]:
df[df['review'].str.isspace()]

Unnamed: 0,label,review
57,neg,
71,pos,
147,pos,
151,pos,
283,pos,
307,pos,
313,neg,
323,pos,
343,pos,
351,neg,


In [7]:
df = df[~df['review'].str.isspace()]

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1938 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   1938 non-null   object
 1   review  1938 non-null   object
dtypes: object(2)
memory usage: 45.4+ KB


In [9]:
df['label'].value_counts()

neg    969
pos    969
Name: label, dtype: int64

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
cv = CountVectorizer(stop_words='english')

In [12]:
matrix = cv.fit_transform(df[df['label']=='neg']['review'])
freqs = zip(cv.get_feature_names(), matrix.sum(axis=0).tolist()[0])    

print("Top 20 words used for Negative reviews.")
print(sorted(freqs, key=lambda x: -x[1])[:20])

Top 20 words used for Negative reviews.
[('film', 4063), ('movie', 3131), ('like', 1808), ('just', 1480), ('time', 1127), ('good', 1117), ('bad', 997), ('character', 926), ('story', 908), ('plot', 888), ('characters', 838), ('make', 813), ('really', 743), ('way', 734), ('little', 696), ('don', 683), ('does', 666), ('doesn', 648), ('action', 635), ('scene', 634)]




In [13]:
matrix = cv.fit_transform(df[df['label']=='pos']['review'])
freqs = zip(cv.get_feature_names(), matrix.sum(axis=0).tolist()[0])    
# sort from largest to smallest
print("Top 20 words used for Positive reviews.")
print(sorted(freqs, key=lambda x: -x[1])[:20])

Top 20 words used for Positive reviews.
[('film', 5002), ('movie', 2389), ('like', 1721), ('just', 1273), ('story', 1199), ('good', 1193), ('time', 1175), ('character', 1037), ('life', 1032), ('characters', 957), ('way', 864), ('films', 851), ('does', 828), ('best', 788), ('people', 769), ('make', 764), ('little', 751), ('really', 731), ('man', 728), ('new', 702)]


In [14]:
from sklearn.model_selection import train_test_split

X = df['review']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [16]:
pipe = Pipeline([('tfidf', TfidfVectorizer()),('svc', LinearSVC()),])

In [17]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('svc', LinearSVC())])

In [18]:
from sklearn.metrics import classification_report, confusion_matrix

In [19]:
preds = pipe.predict(X_test)

In [20]:
print(confusion_matrix(y_test, preds))

[[164  27]
 [ 38 159]]


In [21]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

         neg       0.81      0.86      0.83       191
         pos       0.85      0.81      0.83       197

    accuracy                           0.83       388
   macro avg       0.83      0.83      0.83       388
weighted avg       0.83      0.83      0.83       388



In [22]:
pipe = Pipeline([('tfidf', TfidfVectorizer()),('mnb', MultinomialNB()),])

In [23]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('mnb', MultinomialNB())])

In [24]:
preds = pipe.predict(X_test)

In [25]:
print(confusion_matrix(y_test, preds))

[[176  15]
 [ 59 138]]


In [26]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

         neg       0.75      0.92      0.83       191
         pos       0.90      0.70      0.79       197

    accuracy                           0.81       388
   macro avg       0.83      0.81      0.81       388
weighted avg       0.83      0.81      0.81       388



In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
pipe = Pipeline([('tfidf', TfidfVectorizer()),('rfc', RandomForestClassifier()),])

In [29]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('rfc', RandomForestClassifier())])

In [30]:
preds = pipe.predict(X_test)

In [31]:
print(confusion_matrix(y_test, preds))

[[167  24]
 [ 64 133]]


In [32]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

         neg       0.72      0.87      0.79       191
         pos       0.85      0.68      0.75       197

    accuracy                           0.77       388
   macro avg       0.79      0.77      0.77       388
weighted avg       0.79      0.77      0.77       388



In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
pipe = Pipeline([('tfidf', TfidfVectorizer()),('lr', LogisticRegression()),])

In [35]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('lr', LogisticRegression())])

In [36]:
preds = pipe.predict(X_test)

In [37]:
print(confusion_matrix(y_test, preds))

[[162  29]
 [ 41 156]]


In [38]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

         neg       0.80      0.85      0.82       191
         pos       0.84      0.79      0.82       197

    accuracy                           0.82       388
   macro avg       0.82      0.82      0.82       388
weighted avg       0.82      0.82      0.82       388



In [39]:
from sklearn.neighbors import KNeighborsClassifier

In [40]:
pipe = Pipeline([('tfidf', TfidfVectorizer()),('knn', KNeighborsClassifier()),])

In [41]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('knn', KNeighborsClassifier())])

In [42]:
preds = pipe.predict(X_test)

In [43]:
print(confusion_matrix(y_test, preds))

[[ 49 142]
 [  9 188]]


In [44]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

         neg       0.84      0.26      0.39       191
         pos       0.57      0.95      0.71       197

    accuracy                           0.61       388
   macro avg       0.71      0.61      0.55       388
weighted avg       0.71      0.61      0.56       388



In [45]:
from sklearn.ensemble import GradientBoostingClassifier

In [46]:
pipe = Pipeline([('tfidf', TfidfVectorizer()),('gbc', GradientBoostingClassifier()),])

In [47]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('gbc', GradientBoostingClassifier())])

In [48]:
preds = pipe.predict(X_test)

In [49]:
print(confusion_matrix(y_test, preds))

[[157  34]
 [ 41 156]]


In [50]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

         neg       0.79      0.82      0.81       191
         pos       0.82      0.79      0.81       197

    accuracy                           0.81       388
   macro avg       0.81      0.81      0.81       388
weighted avg       0.81      0.81      0.81       388

