<a href="https://colab.research.google.com/github/gorzanskik-ai/Binary-classification-of-movie-reviews/blob/main/colab/03_tfidf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# CSV with the cleaned reviews; it lives on the mounted Google Drive.
path = '/content/drive/MyDrive/movie-reviews/output/clean_transform.csv'

# Mount Drive so the dataset above can be read and the model saved later.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px

In [None]:
# Load only the two columns the notebook uses. Reading every column also pulls
# in a leftover 'Unnamed: 0' column (presumably the index written when the CSV
# was saved), which is never used downstream.
df = pd.read_csv(path, usecols=['review', 'sentiment'])
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewer ha mentioned that af...,positive
1,A wonderful little production br br The filmin...,positive
2,I thought this wa a wonderful way to spend tim...,positive
3,Basically there a family where a little boy Ja...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive


In [None]:
# Features are the review texts, targets are the sentiment labels; both are
# copied so later reassignment does not touch `df`.
X, y = df['review'].copy(), df['sentiment'].copy()

In [None]:
# Encode the string sentiment labels as integers; `le` is kept so the mapping
# can be inspected or inverted later.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [None]:
# Hold out 20% of the reviews for evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the size of every split, one per line.
print(
    f'X_train shape: {X_train.shape}',
    f'y_train shape: {y_train.shape}',
    f'X_test shape: {X_test.shape}',
    f'y_test shape: {y_test.shape}',
    sep='\n',
)

X_train shape: (40000,)
y_train shape: (40000,)
X_test shape: (10000,)
y_test shape: (10000,)


Vectorization

In [None]:
# Fit the TF-IDF vectorizer on the training reviews only, then reuse the fitted
# vectorizer to transform the test reviews.
# NOTE: X_train / X_test are overwritten with the vectorized matrices, so this
# cell must not be re-run without re-running the train/test split first.
vectorizer = TfidfVectorizer()
X_train, X_test = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

print(f'X_train shape: {X_train.shape}', f'X_test shape: {X_test.shape}', sep='\n')

X_train shape: (40000, 154048)
X_test shape: (10000, 154048)


Models comparison

In [None]:
# Logistic regression with an L2 penalty (C=1), seeded for reproducibility.
lr = LogisticRegression(C=1, penalty='l2', max_iter=500, random_state=42).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [None]:
# Decision tree baseline.
# random_state is fixed (same seed as the other models) so the reported
# accuracy is reproducible across re-runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

In [None]:
# Random forest baseline.
# random_state is fixed (same seed as the other models) so the reported
# accuracy is reproducible; n_jobs=-1 trains the trees on all available cores.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [None]:
# Linear SVM: an SGDClassifier trained with hinge loss, seeded for reproducibility.
svm = SGDClassifier(loss='hinge', max_iter=500, random_state=42).fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

Best model

In [None]:
# Test-set accuracy of each model, one line per model.
print('Accuracy')
print(f'Logistic regression: {accuracy_score(y_test, y_pred_lr)}')
# Label fixed: it previously carried a duplicated colon ("Classifier: : ").
print(f'Decision Tree Classifier: {accuracy_score(y_test, y_pred_dt)}')
print(f'Random Forest Classifier: {accuracy_score(y_test, y_pred_rf)}')
print(f'SGD Classifier: {accuracy_score(y_test, y_pred_svm)}')

Accuracy
Logistic regression: 0.8972
Decision Tree Classifier: : 0.7155
Random Forest Classifier: 0.8455
SGD Classifier: 0.8985


In [None]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
cr = classification_report(y_true=y_test, y_pred=y_pred_lr)
print(cr)

              precision    recall  f1-score   support

           0       0.91      0.88      0.90      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [None]:
#confusion matrix
cm = confusion_matrix(y_test, y_pred_lr)

def plot_confusion_matrix(cm):
    """Display a 2x2 confusion matrix as an annotated Plotly heatmap.

    Parameters
    ----------
    cm : array-like of shape (2, 2)
        Counts laid out as sklearn's ``confusion_matrix`` returns them: rows are
        the actual classes and columns the predicted classes. The labels below
        assume class 0 is negative and class 1 is positive.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    # Reverse the row order so the actual-positive row comes first
    # (presumably because the heatmap draws the first row at the bottom).
    cm = cm[::-1]
    # Rows hold every sample of one actual class, so they are labelled
    # "actual_*"; "true_positive"/"true_negative" name single cells, not rows.
    cm = pd.DataFrame(cm, columns=['pred_neg', 'pred_pos'], index=['actual_pos', 'actual_neg'])

    fig = ff.create_annotated_heatmap(z=cm.values, x=list(cm.columns), y=list(cm.index),
                                      colorscale='ice', showscale=True, reversescale=True)
    fig.update_layout(width=400, height=400, title='Confusion Matrix', font_size=16)
    fig.show()

plot_confusion_matrix(cm)

In [None]:
# Persist the fitted logistic-regression model to Drive. The context manager
# guarantees the file is flushed and closed even if pickling fails.
# NOTE(review): the fitted `vectorizer` is not saved here; the model can only
# score text transformed by that same fitted TfidfVectorizer.
with open('/content/drive/MyDrive/movie-reviews/output/tfidf_model.pickle', 'wb') as model_file:
    pickle.dump(lr, model_file)