# Cell 1 — Imports & Setup

In [1]:
import sys
# Add the parent directory to the module search path so the `src.*` imports
# below can be resolved. The path is relative, so this presumably expects the
# kernel's working directory to be the notebook's own folder — confirm.
sys.path.append("../")

import pandas as pd

from src.features import build_tfidf
from src.train import train_logistic_regression, train_svm
from src.evaluate import evaluate


# Cell 2 — Load Processed Data

In [2]:
# Read the three pre-split CSVs in one pass; the order of the paths matches
# the order of the names being unpacked (train, validation, test).
# NOTE(review): the preview of train_df shows an "Unnamed: 0" column, which
# looks like a row index that was written into the CSVs — confirm, and
# consider index_col=0 if it is not meant to be a data column.
train_df, val_df, test_df = map(
    pd.read_csv,
    (
        "../data/processed/train.csv",
        "../data/processed/val.csv",
        "../data/processed/test.csv",
    ),
)

# Preview the first rows of the training split.
train_df.head()


Unnamed: 0,review,sentiment
0,caught little gem totally accident back reviva...,1
1,cant believe let movie accomplish favor friend...,0
2,spoiler alert get nerve people remake use term...,0
3,there one thing ive learnt watching george rom...,0
4,remember theater review said horrible well did...,0


# Cell 3 — Separate Features & Labels

In [3]:
# For every split, pull out the input text column ("review") and the target
# column ("sentiment") side by side.
X_train_text, y_train = train_df["review"], train_df["sentiment"]
X_val_text, y_val = val_df["review"], val_df["sentiment"]
X_test_text, y_test = test_df["review"], test_df["sentiment"]


# Cell 4 — TF-IDF Vectorization

In [4]:
# One build_tfidf call handles all three text splits and returns, in order,
# the transformed train / validation / test features plus the vectorizer.
# NOTE(review): presumably the vectorizer is fitted on the training text
# only — confirm in src.features.
X_train, X_val, X_test, tfidf_vectorizer = build_tfidf(X_train_text, X_val_text, X_test_text)

# Sanity check: dimensions of the training feature matrix.
X_train.shape


(40000, 30000)

# Cell 5 — Train Logistic Regression

In [5]:
# Train the logistic-regression model on the TF-IDF training features.
# save_path is forwarded to the trainer — presumably the location where the
# fitted model is written; confirm in src.train.
lr_model = train_logistic_regression(X_train, y_train, save_path="../models/logistic_regression.pkl")


# Cell 6 — Validate Logistic Regression

In [6]:
# Score the logistic-regression model on the validation split; `name` is the
# label that heads the printed report.
evaluate(
    lr_model,
    X_val,
    y_val,
    name="Logistic Regression (Validation)",
)



Logistic Regression (Validation) Results
----------------------------------------
Accuracy: 0.9014

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.88      0.90      2500
           1       0.89      0.92      0.90      2500

    accuracy                           0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000


Confusion Matrix:
[[2206  294]
 [ 199 2301]]


# Cell 7 — Train SVM

In [7]:
# Train the SVM on the same TF-IDF training features used for the
# logistic-regression model.
# save_path is forwarded to the trainer — presumably the location where the
# fitted model is written; confirm in src.train.
svm_model = train_svm(X_train, y_train, save_path="../models/svm.pkl")


# Cell 8 — Validate SVM

In [8]:
# Score the SVM on the validation split; `name` is the label that heads the
# printed report.
evaluate(
    svm_model,
    X_val,
    y_val,
    name="SVM (Validation)",
)



SVM (Validation) Results
----------------------------------------
Accuracy: 0.8976

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.90      2500
           1       0.89      0.91      0.90      2500

    accuracy                           0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000


Confusion Matrix:
[[2210  290]
 [ 222 2278]]


# Cell 9 — Final Test Evaluation (Optional but Recommended)

In [9]:
# Report both trained models on the test split, logistic regression first
# and SVM second. The test split is used here only for evaluation.
evaluate(
    lr_model,
    X_test,
    y_test,
    name="Logistic Regression (Test)",
)
evaluate(
    svm_model,
    X_test,
    y_test,
    name="SVM (Test)",
)



Logistic Regression (Test) Results
----------------------------------------
Accuracy: 0.8992

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.89      0.90      2500
           1       0.90      0.90      0.90      2500

    accuracy                           0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000


Confusion Matrix:
[[2237  263]
 [ 241 2259]]

SVM (Test) Results
----------------------------------------
Accuracy: 0.8982

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      2500
           1       0.90      0.90      0.90      2500

    accuracy                           0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000


Confusion Matrix:
[[2248  252]
 [ 257 2243]]
