In [4]:
# ────────────── standard lib ──────────────
from pathlib import Path
import json
import time
import random
import os

# ────────────── scientific stack ──────────────
import numpy as np
import pandas as pd

# ────────────── plotting ──────────────
import matplotlib.pyplot as plt
import seaborn as sns

# ────────────── scikit-learn ──────────────
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# ────────────── utilities ──────────────
import scipy.special  # for scipy.special.softmax

In [12]:
########## EDITABLE PARAMS ##########
RUN_ID      = "logreg_bow_baseline"
TRAIN_DATA_PATH   = "data/training_split.csv"
VAL_DATA_PATH   = "data/validation_split.csv"
SEEDS       = [13, 21, 42]           # -- three independent passes
BATCH_SIZE  = 32                     # only used for timing parity
MAX_FEATURES = 10_000
NGRAM_RANGE  = (1, 2)
C            = 1.0                  # LR inverse-reg strength
MAX_ITER     = 300
#####################################

OUT_ROOT = Path(f"results/{RUN_ID}")
OUT_ROOT.mkdir(parents=True, exist_ok=True)

In [6]:
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {v:k for k,v in label2id.items()}

train_df = pd.read_csv(TRAIN_DATA_PATH)
val_df   = pd.read_csv(VAL_DATA_PATH)

train_df["label"] = train_df["label"].map(label2id).astype("int64")
val_df["label"]   = val_df["label"].map(label2id).astype("int64")

In [8]:
records = []

for seed in SEEDS:
    # 3-a  reproducible randomness
    np.random.seed(seed); random.seed(seed)

    # 3-b  vectoriser + model for this seed
    vectoriser = CountVectorizer(
        ngram_range=NGRAM_RANGE,
        max_features=MAX_FEATURES
    )
    X_train = vectoriser.fit_transform(train_df["sentence"])
    X_val   = vectoriser.transform(val_df["sentence"])

    y_train = train_df["label"].values
    y_val   = val_df["label"].values

    model = LogisticRegression(
        C=C, max_iter=MAX_ITER,
        random_state=seed,              # <<< makes coef_ reproducible
        n_jobs=-1,
    )

    tic = time.time()
    model.fit(X_train, y_train)
    logits = model.decision_function(X_val)      # shape (N, 3)
    latency = time.time() - tic

In [9]:
proba = scipy.special.softmax(logits, axis=1)
p_raw = proba.dot(np.arange(3))          # float in [0,2]
p_int = np.argmax(proba, axis=1)         # or model.predict(...)
y_int = y_val

In [10]:
mae   = np.abs(p_raw - y_int).mean()
score = 0.5 * (2 - mae)
acc   = accuracy_score(y_int, p_int)

rec = {
    "seed": seed,
    "score":        float(score),
    "mae":          float(mae),
    "accuracy":     float(acc),
    "latency_sec":  float(latency),
}
records.append(rec)

# --- file outputs ---
out_dir = OUT_ROOT / f"seed_{seed}"
out_dir.mkdir(exist_ok=True)
json.dump(rec, open(out_dir/"metrics.json", "w"), indent=2)

cm = confusion_matrix(y_int, p_int, labels=[0,1,2], normalize="true")
fig, ax = plt.subplots(figsize=(3,3))
sns.heatmap(cm, annot=True, fmt=".2f",
            xticklabels=list(label2id), yticklabels=list(label2id), ax=ax)
ax.set_title(f"{RUN_ID} | seed {seed}")
fig.tight_layout(); fig.savefig(out_dir/"confusion_matrix.png", dpi=200)
plt.close(fig)

mis = val_df.iloc[np.where(p_int != y_int)[0]][["id","sentence","label"]]
mis["pred"] = [id2label[i] for i in p_int[p_int != y_int]]
mis["label"] = mis["label"].map(id2label)
mis.to_csv(out_dir/"misclassified.csv", index=False)

In [13]:
df_rec = pd.DataFrame(records)

# parameter counting
n_params_total = model.coef_.size + model.intercept_.size   # all trainable
n_params_total_M = round(n_params_total / 1_000_000, 2)

agg = {
    "score_mean":    df_rec["score"].mean(),
    "score_std":     df_rec["score"].std(ddof=0),
    "mae_mean":      df_rec["mae"].mean(),
    "mae_std":       df_rec["mae"].std(ddof=0),
    "latency_sec_mean": df_rec["latency_sec"].mean(),
    "latency_sec_std":  df_rec["latency_sec"].std(ddof=0),
    "params_M_total":     n_params_total_M,
    "params_M_trainable": n_params_total_M,   # all weights are trained
    "batch_size": BATCH_SIZE,
    "max_features": MAX_FEATURES
}
json.dump(agg, open(OUT_ROOT/"aggregate.json", "w"), indent=2)
print(agg)


{'score_mean': np.float64(0.7798940194147028), 'score_std': 0.0, 'mae_mean': np.float64(0.4402119611705944), 'mae_std': 0.0, 'latency_sec_mean': np.float64(6.768828392028809), 'latency_sec_std': 0.0, 'params_M_total': 0.03, 'params_M_trainable': 0.03, 'batch_size': 32, 'max_features': 10000}
