In [None]:
# =========================
# CORE / DATA HANDLING
# =========================
import os
import csv
import numpy as np
import pandas as pd

# =========================
# COLAB / I/O
# =========================
# Mount Google Drive into the Colab filesystem at /content/drive.
# The dataset read later in this notebook lives under this mount point (MyDrive/).
from google.colab import drive
drive.mount('/content/drive')

# =========================
# MODELLING & PREPROCESSING
# =========================
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import StackingClassifier

# =========================
# EVALUATION
# =========================
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score
)

# =========================
# VISUALISATION
# =========================
import matplotlib.pyplot as plt
import seaborn as sns


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the reviews CSV from the mounted Drive into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/clean_amazon_reviews.csv",
)


In [None]:
# Size of the loaded frame as (rows, columns).
df.shape

(382120, 13)

In [None]:
# Print the schema summary: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 382120 entries, 0 to 382119
Data columns (total 13 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      382120 non-null  int64 
 1   ProductId               382120 non-null  object
 2   UserId                  382120 non-null  object
 3   ProfileName             382120 non-null  object
 4   HelpfulnessNumerator    382120 non-null  int64 
 5   HelpfulnessDenominator  382120 non-null  int64 
 6   Score                   382120 non-null  int64 
 7   Summary                 382120 non-null  object
 8   Text                    382120 non-null  object
 9   sentiment_4             382120 non-null  object
 10  text_len                382120 non-null  int64 
 11  word_count              382120 non-null  int64 
 12  review_time             382120 non-null  object
dtypes: int64(6), object(7)
memory usage: 37.9+ MB


---
---

# Modelling

---
---

In [None]:
# Train/test split cell (kept at the top so every model is trained and
# evaluated on exactly the same partition).
#
# Feature: raw review text. Target: the 4-level sentiment label.
# The label column in this frame is `sentiment_4` (see df.info() above);
# there is no `sentiment` column, so df["sentiment"] raises KeyError.
X = df["Text"]
y = df["sentiment_4"]

# Hold out 20% for testing. `stratify=y` keeps the class proportions the same
# in both partitions, and the fixed `random_state` makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [None]:
# Sanity check on the split: partition shapes first, then the class mix of each.
for tag, features, labels in (("Train:", X_train, y_train), ("Test :", X_test, y_test)):
    print(tag, features.shape, labels.shape)

# normalize=True makes value_counts report proportions rather than raw counts.
for heading, labels in (("\nTrain label counts:", y_train), ("\nTest label counts:", y_test)):
    print(heading)
    print(labels.value_counts(normalize=True))


Train: (305696,) (305696,)
Test : (76424,) (76424,)

Train label counts:
sentiment_4
very_positive    0.634042
negative         0.146786
positive         0.142750
neutral          0.076422
Name: proportion, dtype: float64

Test label counts:
sentiment_4
very_positive    0.634042
negative         0.146786
positive         0.142756
neutral          0.076416
Name: proportion, dtype: float64


---
---
### 1 – TF-IDF + Logistic Regression

---
---
### 2 – TF-IDF + LinearSVC

---
---
### 3 – ColumnTransformer model (text + numeric helpers)

---
---
### 4 – TF-IDF → TruncatedSVD (≈300) → MLPClassifier (scikit-learn)

# TO DO:

# Modelling (CRISP-DM)

## 1. Modelling Objectives
- Goal: build a **binary** sentiment classifier (negative vs positive) on **Text_clean**.
- Success: Macro-F1 ≥ 0.80, with strong recall on **negative**.

## 2. Data for Modelling (input from EDA)
- Using columns: `Text` (feature), `sentiment_4` (target) — the column names present in the loaded DataFrame.
- Split: stratified 80/20 train/test (fixed random_state=42).
- Imbalance: handle via `class_weight="balanced"` (and compare to train-only resampling if needed).

## 3. Baseline & Pipelines
- **Baseline 1:** TF-IDF (1–2 grams, 10k feats) + LogisticRegression (class_weight="balanced").
- **Baseline 2:** TF-IDF (1–2 grams, 10k feats) + LinearSVC (class_weight="balanced").

## 4. Cross-Validation (Model Selection)
- 5-fold stratified CV on train set.
- Metrics: **macro-F1** (primary), per-class recall (secondary).
- Store mean ± std across folds.

## 5. Hyperparameter Search
- Tune TF-IDF (max_features, ngram_range, min_df) and model params:
  - Logistic: C ∈ {0.5, 1, 2}, max_iter ∈ {200, 400}
  - LinearSVC: C ∈ {0.5,1,2}
- Produce a **results table** (model, params, CV macro-F1, fit time).

## 6. Class Imbalance Sensitivity (Optional)
- Compare `class_weight="balanced"` vs **RandomUnderSampler** on TF-IDF features (train only).
- Report impact on minority (negative) recall and macro-F1.

## 7. Advanced Model (MLP, scikit-learn)
- Pipeline: TF-IDF (1–2 grams, 20k feats) → TruncatedSVD (e.g., 300 dims) → StandardScaler → **MLPClassifier**.
- Early stopping on validation split; report macro-F1.
- (Optional) Use **sample_weight** from class frequencies.

## 8. Final Evaluation (Hold-out Test)
- Evaluate the **best model** on the test set:
  - Macro-F1, accuracy, per-class precision/recall/F1.
  - Confusion matrix (heatmap).
- Short **error analysis**: show a few false positives/negatives.

## 9. Model Export & Reproducibility
- Save the fitted pipeline with vectorizer to disk (`joblib.dump`).
- Save label mapping and TF-IDF params.

## 10. (Optional) Deployment
- Simple API (FastAPI) or minimal web app (Streamlit/Render) that loads the pipeline and predicts.
- Add input sanitation and the same cleaning used in training.

## 11. Reporting Notes (for the write-up)
- Summarize why 1★ & 5★ only (reduced label noise).
- Note which model wins and why (speed/accuracy/robustness).
- Include the CV results table and the test confusion matrix.
