In [19]:
# =========================
# CORE / DATA HANDLING
# =========================
import os
import csv
import numpy as np
import pandas as pd

# =========================
# COLAB / I/O
# =========================
# Colab-only step: mount Google Drive at /content/drive. The dataset is read
# from /content/drive/MyDrive further down, so this must run first.
from google.colab import drive
drive.mount('/content/drive')

# =========================
# MODELLING & PREPROCESSING
# =========================
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import StackingClassifier

# =========================
# EVALUATION
# =========================
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score
)

# =========================
# VISUALISATION
# =========================
import matplotlib.pyplot as plt
import seaborn as sns


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
# Load the reviews CSV from the mounted Drive into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/clean_amazon_reviews.csv",
)


In [26]:
# Size check on the loaded frame: (number of rows, number of columns).
df.shape

(382120, 13)

In [27]:
# Per-column summary (non-null counts and dtypes); useful for spotting missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 382120 entries, 0 to 382119
Data columns (total 13 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      382120 non-null  int64 
 1   ProductId               382120 non-null  object
 2   UserId                  382120 non-null  object
 3   ProfileName             382120 non-null  object
 4   HelpfulnessNumerator    382120 non-null  int64 
 5   HelpfulnessDenominator  382120 non-null  int64 
 6   Score                   382120 non-null  int64 
 7   Summary                 382120 non-null  object
 8   Text                    382120 non-null  object
 9   sentiment_4             382120 non-null  object
 10  text_len                382120 non-null  int64 
 11  word_count              382120 non-null  int64 
 12  review_time             382120 non-null  object
dtypes: int64(6), object(7)
memory usage: 37.9+ MB


---
---

# Modelling

---
---

In [29]:
# Single shared train/test split, kept at the top of the modelling section so
# every model (and every contributor) works from exactly the same data.
X, y = df["Text"], df["sentiment_4"]

# Hold out 20% for testing. The fixed seed makes the split reproducible, and
# stratifying on the label keeps the class mix the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [31]:
# Sanity-check the split: partition sizes first, then the label distribution.
for split_label, split_X, split_y in (
    ("Train:", X_train, y_train),
    ("Test :", X_test, y_test),
):
    print(split_label, split_X.shape, split_y.shape)

# normalize=True means the values printed under each heading are class
# proportions (fractions summing to 1), not raw counts.
for split_heading, split_y in (
    ("\nTrain label counts:", y_train),
    ("\nTest label counts:", y_test),
):
    print(split_heading)
    print(split_y.value_counts(normalize=True))


Train: (305696,) (305696,)
Test : (76424,) (76424,)

Train label counts:
sentiment_4
very_positive    0.634042
negative         0.146786
positive         0.142750
neutral          0.076422
Name: proportion, dtype: float64

Test label counts:
sentiment_4
very_positive    0.634042
negative         0.146786
positive         0.142756
neutral          0.076416
Name: proportion, dtype: float64


---
---
### 1 - Baseline: TF-IDF + Logistic Regression

---
---
### 2 - Strong model: TF-IDF + LinearSVC

---
---
### 3 - Feature-rich model: ColumnTransformer (text + numeric) + Logistic Regression

---
---
### 4 - Advanced model: Stacking (LogReg + LinearSVC)