In [1]:
import pickle

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


In [2]:
# Read the cleaned IMDB review table produced by an earlier step of the project.
df = pd.read_csv("../data/cleaned/imdb_cleaned.csv")

# Model input is the `clean_review` text column; the target is the `label` column.
X, y = df["clean_review"], df["label"]


In [3]:
def load_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`.

    SECURITY: unpickling can execute arbitrary code. Only use this on
    artifacts written by this project's own pipeline, never on files
    obtained from an untrusted source.
    """
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Objects saved by an earlier step; later cells call `.transform` on them.
# NOTE(review): presumably already-fitted TF-IDF vectorizers (unigram / bigram
# variants, going by the file names) -- confirm against the notebook that
# wrote these files.
tfidf_uni = load_pickle("../data/cleaned/tfidf_unigram.pkl")
tfidf_bi = load_pickle("../data/cleaned/tfidf_bigram.pkl")


In [4]:
# Turn the review text into two feature matrices, one per loaded vectorizer
# (unigram first, then bigram).
# NOTE(review): the whole corpus is transformed *before* the train/test split.
# If the vectorizers were also fitted on the whole corpus, statistics from the
# test rows leak into training -- confirm how the .pkl files were produced.
X_uni, X_bi = (vectorizer.transform(X) for vectorizer in (tfidf_uni, tfidf_bi))


In [5]:

# Split both feature matrices and the labels in ONE call so that every output
# uses the same row indices. Previously the unigram and bigram matrices were
# split by two separate calls and stayed aligned only because the seed,
# `stratify` and `test_size` arguments happened to be typed identically.
# 80/20 split, stratified on the label so both classes keep their proportions.
(
    X_uni_train, X_uni_test,
    X_bi_train, X_bi_test,
    y_train, y_test,
) = train_test_split(
    X_uni, X_bi, y, test_size=0.2, random_state=42, stratify=y
)


In [6]:

def fit_and_report(model, X_train, X_test, y_train, y_test, title):
    """Fit `model`, predict on the held-out features and print a report.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`.
    X_train, X_test : feature matrices for the training / test rows.
    y_train, y_test : label vectors matching those rows.
    title : heading printed on its own line above the classification report.

    Returns
    -------
    (model, y_pred) : the fitted estimator and its predictions for `X_test`.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(title)
    print(classification_report(y_test, y_pred))
    return model, y_pred


# Same model configuration on both feature sets, so the two reports differ
# only in the features used. max_iter=1000 is set explicitly -- presumably to
# give the solver enough iterations to converge on these features.

# Unigram
lr_uni, y_pred_uni = fit_and_report(
    LogisticRegression(max_iter=1000),
    X_uni_train, X_uni_test, y_train, y_test,
    "Unigram TF-IDF",
)

# Bigram
lr_bi, y_pred_bi = fit_and_report(
    LogisticRegression(max_iter=1000),
    X_bi_train, X_bi_test, y_train, y_test,
    "Bigram TF-IDF",
)


Unigram TF-IDF
              precision    recall  f1-score   support

           0       0.89      0.88      0.89      5000
           1       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Bigram TF-IDF
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      5000
           1       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [8]:

# Multinomial Naive Bayes on the unigram features.
# Naive Bayes treats features as independent of each other given the class.
# Words in natural language are not independent, which is a likely reason it
# scores slightly below logistic regression on the same features.
nb_uni = MultinomialNB().fit(X_uni_train, y_train)
y_pred_nb_uni = nb_uni.predict(X_uni_test)

print(
    "Naive Bayes – Unigram",
    classification_report(y_test, y_pred_nb_uni),
    sep="\n",
)

# Left as the bare last expression so the notebook displays the matrix.
confusion_matrix(y_true=y_test, y_pred=y_pred_nb_uni)

Naive Bayes – Unigram
              precision    recall  f1-score   support

           0       0.86      0.85      0.85      5000
           1       0.85      0.86      0.85      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



array([[4234,  766],
       [ 707, 4293]])

In [9]:
# Multinomial Naive Bayes again, this time on the bigram feature matrix.
nb_bi = MultinomialNB().fit(X_bi_train, y_train)
y_pred_nb_bi = nb_bi.predict(X_bi_test)

print(
    "Naive Bayes – Bigram",
    classification_report(y_test, y_pred_nb_bi),
    sep="\n",
)

# Left as the bare last expression so the notebook displays the matrix.
confusion_matrix(y_true=y_test, y_pred=y_pred_nb_bi)


Naive Bayes – Bigram
              precision    recall  f1-score   support

           0       0.87      0.85      0.86      5000
           1       0.85      0.88      0.86      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



array([[4228,  772],
       [ 619, 4381]])