# Prepare Data

In [1]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Read the review dataset from its CSV file and print the first rows as a
# quick sanity check of the columns that were loaded.
csv_path = "../../IMDBDataset.csv"
df = pd.read_csv(csv_path)
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


# Label Encoding, Train/Test Split, and Text Vectorization

In [3]:
# Encode the sentiment strings as integer labels: 1 = positive, 0 = negative.
# Series.map() turns every value that is missing from the mapping into NaN, so
# validate the result before storing it instead of letting NaN labels leak into
# the stratified split and the models below.
label_map = {'positive': 1, 'negative': 0}
labels = df['sentiment'].map(label_map)
if labels.isna().any():
    unexpected = df.loc[labels.isna(), 'sentiment'].unique().tolist()
    raise ValueError(
        f"Unexpected sentiment values (cannot be mapped to a label): {unexpected}"
    )
df['label'] = labels
print(df.head())

# Hold out half of the reviews for testing. Stratifying on the label keeps the
# positive/negative proportions the same in the train and test halves.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],           # Features (raw review text)
    df['label'],            # Labels (0 or 1)
    test_size=0.5,          # 50% test, 50% train
    random_state=42,        # fixed seed for a reproducible split
    stratify=df['label']    # preserve the class balance in both halves
)

# TF-IDF features limited to the 5000 highest-ranked terms, with English stop
# words removed. The vocabulary and IDF weights are fitted on the training
# reviews only and then re-used to transform the test reviews, so no
# information from the test set reaches the fitted vectorizer.
# NOTE(review): the raw reviews contain HTML such as "<br />" (visible in the
# head() output); consider stripping markup before vectorizing - TODO confirm
# whether tokens like "br" end up in the vocabulary.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


                                              review sentiment  label
0  One of the other reviewers has mentioned that ...  positive      1
1  A wonderful little production. <br /><br />The...  positive      1
2  I thought this was a wonderful way to spend ti...  positive      1
3  Basically there's a family where a little boy ...  negative      0
4  Petter Mattei's "Love in the Time of Money" is...  positive      1


# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF features, with the solver's iteration cap
# set to 1000.
lr = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)
y_pred_lr = lr.predict(X_test_vec)

# Score the held-out predictions: overall accuracy plus per-class metrics.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(
    y_test,
    y_pred_lr,
    target_names=["Negative", "Positive"],
)

print("=== Logistic Regression ===")
print("Accuracy:", lr_accuracy)
print(lr_report)



# Support Vector Machines (SVM)

In [23]:
from sklearn.svm import LinearSVC

# Train a linear support vector classifier on the TF-IDF features.
# random_state is pinned (matching the seed used for the train/test split) so
# that re-running the notebook reproduces the same model when the solver
# shuffles the data; it has no effect when the solver is not randomised.
svm = LinearSVC(random_state=42)
svm.fit(X_train_vec, y_train)

# Predict labels for the held-out reviews
y_pred_svm = svm.predict(X_test_vec)

# Evaluate: overall accuracy plus per-class precision / recall / F1
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm, target_names=["Negative", "Positive"]))

SVM Accuracy: 0.87308
              precision    recall  f1-score   support

    Negative       0.88      0.87      0.87     12500
    Positive       0.87      0.88      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000



# k-Nearest Neighbors (k-NN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier that votes over the 5 closest training
# reviews in TF-IDF space.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_vec, y_train)
y_pred = knn.predict(X_test_vec)

# Score the held-out predictions: overall accuracy plus per-class metrics.
knn_accuracy = accuracy_score(y_test, y_pred)
knn_report = classification_report(
    y_test,
    y_pred,
    target_names=["Negative", "Positive"],
)

print("Accuracy:", knn_accuracy)
print(knn_report)

Accuracy: 0.72192
              precision    recall  f1-score   support

    Negative       0.72      0.72      0.72     12500
    Positive       0.72      0.73      0.72     12500

    accuracy                           0.72     25000
   macro avg       0.72      0.72      0.72     25000
weighted avg       0.72      0.72      0.72     25000



# Decision Tree

In [None]:

from sklearn.tree import DecisionTreeClassifier

# Single decision tree, capped at depth 20 and seeded for repeatable fits.
dt = DecisionTreeClassifier(max_depth=20, random_state=42).fit(X_train_vec, y_train)
y_pred_dt = dt.predict(X_test_vec)

# Score the held-out predictions: overall accuracy plus per-class metrics.
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_report = classification_report(
    y_test,
    y_pred_dt,
    target_names=["Negative", "Positive"],
)

print("\n=== Decision Tree ===")
print("Accuracy:", dt_accuracy)
print(dt_report)



=== Logistic Regression ===
Accuracy: 0.88248
              precision    recall  f1-score   support

    Negative       0.89      0.87      0.88     12500
    Positive       0.87      0.90      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000


=== Decision Tree ===
Accuracy: 0.73268
              precision    recall  f1-score   support

    Negative       0.79      0.64      0.70     12500
    Positive       0.69      0.83      0.76     12500

    accuracy                           0.73     25000
   macro avg       0.74      0.73      0.73     25000
weighted avg       0.74      0.73      0.73     25000



# Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees fitted on the TF-IDF features; the seed is fixed
# so the fitted ensemble is repeatable.
rf = RandomForestClassifier(n_estimators=100, random_state=420).fit(X_train_vec, y_train)
y_pred_rf = rf.predict(X_test_vec)

# Score the held-out predictions: overall accuracy plus per-class metrics.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(
    y_test,
    y_pred_rf,
    target_names=["Negative", "Positive"],
)

print("Random Forest Accuracy:", rf_accuracy)
print(rf_report)

Random Forest Accuracy: 0.84584
              precision    recall  f1-score   support

    Negative       0.84      0.86      0.85     12500
    Positive       0.85      0.83      0.84     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000

