# Movie Review Sentiment Classification

Classify movie reviews as positive or negative using TF-IDF features with three linear classifiers (Naive Bayes, Logistic Regression, and LinearSVC).

**Dataset:** [https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)  
**Text column:** `review`  **Target:** `sentiment`  
**Type:** Binary Text Classification

> **TODO:** Download the dataset, place it in `../../data/raw/`, then update `DATA_PATH`, `TEXT_COL`, and `TARGET` below.

In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    classification_report, ConfusionMatrixDisplay,
    roc_auc_score, roc_curve,
)
# Use seaborn's 'whitegrid' theme for the plots in this notebook.
sns.set_theme(style='whitegrid')

## 1. Load Data

In [None]:
# Dataset location and schema; adjust these if your copy of the CSV differs.
DATA_PATH = "../../data/raw/IMDB Dataset.csv"
TEXT_COL = "review"  # TODO: verify column name
TARGET = "sentiment"  # TODO: verify column name

# Read the CSV, keep only the text and label columns, and drop rows that are
# missing either one.
# NOTE(review): latin-1 decoding never raises, so if the file is actually
# UTF-8 any non-ASCII characters will come out garbled -- confirm the encoding.
df = pd.read_csv(DATA_PATH, encoding='latin-1')[[TEXT_COL, TARGET]].dropna()

# Quick sanity checks: size, label balance, and a peek at the first rows.
print(f'Shape: {df.shape}')
print(df[TARGET].value_counts())
df.head()

## 2. EDA

In [None]:
# Class distribution: one bar per label, to check how balanced the data is.
df[TARGET].value_counts().plot(kind='bar')
plt.title(f'Class Distribution: {TARGET}')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

# Text length: number of characters per review, overlaid by class.
df['text_len'] = df[TEXT_COL].str.len()
length_by_class = df.groupby(TARGET)['text_len']
for label, lengths in length_by_class:
    # Label each histogram so the overlaid classes can be told apart in the
    # legend (an unlabeled overlay gives no way to know which is which).
    lengths.hist(alpha=0.6, bins=40, label=str(label))
plt.title('Text Length by Class')
plt.xlabel('Characters')
plt.legend(title=TARGET)
plt.tight_layout()
plt.show()

# Summary statistics of review length for each class.
print(length_by_class.describe())

## 3. Text Preprocessing

In [None]:
# Patterns are compiled once because preprocess() runs on every review.
# A tag must start with a letter (optionally preceded by '/'), so a stray '<'
# such as in "<3" or "a < b" is not mistaken for the start of an HTML tag.
_HTML_TAG_RE = re.compile(r'</?[a-z][^<>]*>')
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess(text: str) -> str:
    """Normalize a raw review into lowercase alphanumeric tokens.

    Steps, in order: lowercase the text, replace HTML tags with a space,
    replace every character other than ``a-z``, ``0-9`` and whitespace with
    a space, then collapse whitespace runs and trim the ends.

    Args:
        text: Raw review text.

    Returns:
        The cleaned text with tokens separated by single spaces; an empty
        string if nothing alphanumeric remains.
    """
    text = text.lower()  # lowercase first so the a-z patterns below apply
    text = _HTML_TAG_RE.sub(' ', text)  # strip HTML
    text = _NON_ALNUM_RE.sub(' ', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

# Clean every review. Values are cast to str first so preprocess() always
# receives a string. The result is stored next to the raw text.
df['clean_text'] = df[TEXT_COL].astype(str).map(preprocess)
# Show the first three cleaned reviews as a spot check.
df[['clean_text']].head(3)

## 4. Train / Test Split

In [None]:
# Features are the cleaned review text; labels are the target column.
X, y = df['clean_text'], df[TARGET]

# TODO: encode labels if they are strings, e.g.:
# y = y.map({'positive': 1, 'negative': 0})

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both parts; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
print(f'Train: {len(X_train)}, Test: {len(X_test)}')

## 5. Model Training

In [None]:
from sklearn.base import clone

# Unfitted template holding the shared TF-IDF settings: unigrams + bigrams,
# vocabulary capped at 50k terms, sublinear term-frequency scaling.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=50_000, sublinear_tf=True)

classifiers = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'LinearSVC': LinearSVC(max_iter=2000),
}

# Pipeline stores the step objects it is given without copying them, so
# passing the same vectorizer to every pipeline would make them all share
# (and refit) one object. clone() gives each pipeline its own independent,
# unfitted vectorizer with identical parameters.
models = {
    name: Pipeline([('tfidf', clone(tfidf)), ('clf', clf)])
    for name, clf in classifiers.items()
}

# Fit each pipeline on the training split, predict on the test split, and
# keep both the fitted pipeline and its predictions for the evaluation cells.
results = {}
for name, pipe in models.items():
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    results[name] = {'pipe': pipe, 'preds': preds}
    print(f'\n=== {name} ===')
    print(classification_report(y_test, preds))

## 6. Evaluation

In [None]:
# Confusion matrices, one panel per trained model.
# squeeze=False makes plt.subplots always return a 2-D array of Axes; with the
# default, a single model would yield a bare Axes object and the zip below
# would fail because it is not iterable.
fig, axes = plt.subplots(
    1, len(results), figsize=(5 * len(results), 4), squeeze=False
)
for ax, (name, res) in zip(axes.ravel(), results.items()):
    ConfusionMatrixDisplay.from_predictions(y_test, res['preds'], ax=ax)
    ax.set_title(name)
plt.tight_layout()
plt.show()

In [None]:
# Top TF-IDF terms per class (Logistic Regression)
# Pull the fitted vectorizer vocabulary and the classifier's weight vector.
lr_pipe = results['Logistic Regression']['pipe']
vocab = lr_pipe.named_steps['tfidf'].get_feature_names_out()
# First (and, for a binary problem, only) row of coefficients.
# NOTE(review): the panel titles assume large positive weights indicate the
# positive class, i.e. that it is the classifier's second class -- confirm
# against lr_pipe.named_steps['clf'].classes_.
coef = lr_pipe.named_steps['clf'].coef_[0]

# One weight per term; take the 15 largest and the 15 smallest.
term_weights = pd.Series(coef, index=vocab)
top_pos = term_weights.nlargest(15)
top_neg = term_weights.nsmallest(15)

# Side-by-side horizontal bar charts, each sorted ascending by weight.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = [(top_pos, 'Top Positive Terms'), (top_neg, 'Top Negative Terms')]
for ax, (terms, title) in zip(axes, panels):
    terms.sort_values().plot(kind='barh', ax=ax)
    ax.set_title(title)
plt.tight_layout()
plt.show()

## 7. Conclusion

| Model | Accuracy | F1 |
|---|---|---|
| *(fill after running)* | | |

**Observations:**
- 

**Next steps:**
- Tune `max_features` and `ngram_range` in TF-IDF
- Try character-level n-grams for noisy text
- Explore pre-trained embeddings (GloVe, fastText)