In [5]:
import sys
from pathlib import Path
import numpy as np, pickle

ROOT = Path.cwd().parent
sys.path.append(str(ROOT))
ART = ROOT / "artifacts/word12"

with open(ART / "feature_extractor.pickle", "rb") as f:
    fe = pickle.load(f)

X_train = np.load(ART / "X_train.npy")
y_train = np.load(ART / "y_train.npy")
X_val = np.load(ART / "X_val.npy")
y_val = np.load(ART / "y_val.npy")
X_test = np.load(ART / "X_test.npy")

print("Loaded shapes:")
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape)

X_train.mean(), X_train.std()


Loaded shapes:
(12000, 13870) (12000,)
(3000, 13870) (3000,)
(10001, 13870)


(np.float64(0.00027558257290665735), np.float64(0.026846531679251332))

In [15]:
import numpy as np

def train_linear_svm_sgd(
    X, y,
    lr=0.1,
    reg=1e-4,          # L2 regularization strength (lambda)
    epochs=10,
    batch_size=512,
    seed=42
):
    """
    Linear SVM with hinge loss using SGD.
    y must be in {0,1}. Internally we use {-1,+1}.
    Objective: (reg/2)||w||^2 + mean(max(0, 1 - y*(Xw+b)))
    """
    rng = np.random.default_rng(seed)

    # Convert labels to {-1, +1}
    y_pm = np.where(y == 1, 1.0, -1.0)

    n, d = X.shape
    w = np.zeros(d, dtype=float)
    b = 0.0

    for epoch in range(epochs):
        idx = rng.permutation(n)

        for start in range(0, n, batch_size):
            batch_idx = idx[start:start + batch_size]
            Xb = X[batch_idx]
            yb = y_pm[batch_idx]

            scores = Xb @ w + b
            margins = yb * scores

            # Hinge-active points: margins < 1
            active = margins < 1.0
            if np.any(active):
                Xa = Xb[active]
                ya = yb[active]

                # Gradients (mean over batch active points)
                grad_w_hinge = -(ya[:, None] * Xa).mean(axis=0)
                grad_b_hinge = -(ya).mean()
            else:
                grad_w_hinge = 0.0
                grad_b_hinge = 0.0

            # Add L2 regularization gradient
            grad_w = grad_w_hinge + reg * w
            grad_b = grad_b_hinge  # usually we don't regularize bias

            # SGD step
            w -= lr * grad_w
            b -= lr * grad_b

        # Optional: quick progress print
        # Compute average hinge loss on a small sample
        # print(f"epoch {epoch+1}/{epochs} done")

    return w, b

def svm_scores(X, w, b):
    return X @ w + b

def predict_linear_svm(X, w, b):
    # Classify by sign
    scores = X @ w + b
    return (scores >= 0).astype(int)

In [11]:
def f1_score_binary(y_true, y_pred):
    y_true = y_true.astype(int)
    y_pred = y_pred.astype(int)

    tp = int(((y_true == 1) & (y_pred == 1)).sum())
    fp = int(((y_true == 0) & (y_pred == 1)).sum())
    fn = int(((y_true == 1) & (y_pred == 0)).sum())

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall    = tp / (tp + fn) if (tp + fn) > 0 else 0.0

    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0
    return f1, precision, recall, tp, fp, fn

In [16]:
def tune_threshold(scores, y_true, n_grid=200, print_every=25):
    """
    Tune threshold t for y_pred = (scores >= t).
    Uses a grid between score percentiles for stability.
    """
    y_true = y_true.astype(int)

    lo = np.percentile(scores, 1)
    hi = np.percentile(scores, 99)
    thresholds = np.linspace(lo, hi, n_grid)

    best = {"f1": -1, "t": None, "prec": None, "rec": None, "tp": None, "fp": None, "fn": None}

    for i, t in enumerate(thresholds, start=1):
        y_pred = (scores >= t).astype(int)
        f1, prec, rec, tp, fp, fn = f1_score_binary(y_true, y_pred)

        if f1 > best["f1"]:
            best.update({"f1": f1, "t": float(t), "prec": prec, "rec": rec, "tp": tp, "fp": fp, "fn": fn})

        if (i % print_every == 0) or (i == 1) or (i == len(thresholds)):
            print(
                f"[{i:>3}/{len(thresholds)}] t={t:+.4f} | "
                f"F1={f1:.4f} (P={prec:.3f}, R={rec:.3f}) | "
                f"best F1={best['f1']:.4f} @ t={best['t']:+.4f}"
            )

    print("\nBEST:")
    print(
        f"t={best['t']:+.4f} | F1={best['f1']:.4f} | "
        f"P={best['prec']:.4f} | R={best['rec']:.4f} | "
        f"TP={best['tp']} FP={best['fp']} FN={best['fn']}"
    )
    return best

In [14]:
w, b = train_linear_svm_sgd(
    X_train, y_train,
    lr=0.5,
    reg=1e-3,
    epochs=10,
    batch_size=512,
    seed=42
)

yhat_val = predict_linear_svm(X_val, w, b)
f1, prec, rec, tp, fp, fn = f1_score_binary(y_val, yhat_val)

print(f"Validation F1: {f1:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f}")
print(f"TP={tp} FP={fp} FN={fn}")

Validation F1: 0.6224 | Precision: 0.7225 | Recall: 0.5467
TP=492 FP=189 FN=408


In [17]:
scores_val = svm_scores(X_val, w, b)
best = tune_threshold(scores_val, y_val, n_grid=300, print_every=30)

[  1/300] t=-2.7593 | F1=0.4651 (P=0.303, R=1.000) | best F1=0.4651 @ t=-2.7593
[ 30/300] t=-2.1548 | F1=0.4774 (P=0.314, R=0.998) | best F1=0.4774 @ t=-2.1548
[ 60/300] t=-1.5295 | F1=0.5386 (P=0.371, R=0.984) | best F1=0.5386 @ t=-1.5295
[ 90/300] t=-0.9042 | F1=0.7248 (P=0.605, R=0.904) | best F1=0.7248 @ t=-0.9042
[120/300] t=-0.2789 | F1=0.6763 (P=0.702, R=0.652) | best F1=0.7409 @ t=-0.8208
[150/300] t=+0.3464 | F1=0.5892 (P=0.780, R=0.473) | best F1=0.7409 @ t=-0.8208
[180/300] t=+0.9718 | F1=0.4505 (P=0.875, R=0.303) | best F1=0.7409 @ t=-0.8208
[210/300] t=+1.5971 | F1=0.2732 (P=0.864, R=0.162) | best F1=0.7409 @ t=-0.8208
[240/300] t=+2.2224 | F1=0.1714 (P=0.924, R=0.094) | best F1=0.7409 @ t=-0.8208
[270/300] t=+2.8477 | F1=0.0949 (P=0.938, R=0.050) | best F1=0.7409 @ t=-0.8208
[300/300] t=+3.4731 | F1=0.0624 (P=0.967, R=0.032) | best F1=0.7409 @ t=-0.8208

BEST:
t=-0.8208 | F1=0.7409 | P=0.6397 | R=0.8800 | TP=792 FP=446 FN=108


In [13]:
regs = [1e-5, 1e-4, 1e-3, 1e-2]
lrs  = [0.1, 0.5]

best = (-1, None, None)

for reg in regs:
    for lr in lrs:
        w, b = train_linear_svm_sgd(X_train, y_train, lr=lr, reg=reg, epochs=8, batch_size=512, seed=42)
        yhat = predict_linear_svm(X_val, w, b)
        f1, prec, rec, *_ = f1_score_binary(y_val, yhat)
        print(f"lr={lr} reg={reg} -> F1={f1:.4f} (P={prec:.3f}, R={rec:.3f})")

        if f1 > best[0]:
            best = (f1, lr, reg)

print("Best:", best)

lr=0.1 reg=1e-05 -> F1=0.5684 (P=0.697, R=0.480)
lr=0.5 reg=1e-05 -> F1=0.5912 (P=0.738, R=0.493)
lr=0.1 reg=0.0001 -> F1=0.5688 (P=0.698, R=0.480)
lr=0.5 reg=0.0001 -> F1=0.5879 (P=0.742, R=0.487)
lr=0.1 reg=0.001 -> F1=0.5684 (P=0.697, R=0.480)
lr=0.5 reg=0.001 -> F1=0.5923 (P=0.736, R=0.496)
lr=0.1 reg=0.01 -> F1=0.5722 (P=0.699, R=0.484)
lr=0.5 reg=0.01 -> F1=0.5836 (P=0.724, R=0.489)
Best: (0.5922974767596282, 0.5, 0.001)
