In [1]:
import sys
from pathlib import Path
import numpy as np, pickle
from scipy import sparse

ROOT = Path.cwd().parent
sys.path.append(str(ROOT))
ART = ROOT / "artifacts/word12_char35"

with open(ART / "feature_extractor.pkl", "rb") as f:
    fe = pickle.load(f)

X_train = sparse.load_npz(ART / "X_train.npz")
X_val = sparse.load_npz(ART / "X_val.npz")   
X_test = sparse.load_npz(ART / "X_test.npz")

#X_train = np.load(ART / "X_train.npy")
y_train = np.load(ART / "y_train.npy")
#X_val = np.load(ART / "X_val.npy")
y_val = np.load(ART / "y_val.npy")
#X_test = np.load(ART / "X_test.npy")

print("Loaded shapes:")
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape)


Loaded shapes:
(12000, 66029) (12000,)
(3000, 66029) (3000,)
(10001, 66029)


In [2]:
import numpy as np

def train_linear_svm_sgd(
    X, y,
    lr=0.1,
    reg=1e-4,          # L2 regularization strength (lambda)
    epochs=10,
    batch_size=512,
    seed=42
):
    """
    Linear SVM with hinge loss using SGD.
    y must be in {0,1}. Internally we use {-1,+1}.
    Objective: (reg/2)||w||^2 + mean(max(0, 1 - y*(Xw+b)))
    """
    rng = np.random.default_rng(seed)

    # Convert labels to {-1, +1}
    y_pm = np.where(y == 1, 1.0, -1.0)

    n, d = X.shape
    w = np.zeros(d, dtype=float)
    b = 0.0

    for epoch in range(epochs):
        idx = rng.permutation(n)

        for start in range(0, n, batch_size):
            batch_idx = idx[start:start + batch_size]
            Xb = X[batch_idx]
            yb = y_pm[batch_idx]

            scores = Xb @ w + b
            margins = yb * scores

            # Hinge-active points: margins < 1
            active = margins < 1.0
            if np.any(active):
                Xa = Xb[active]
                ya = yb[active]

                # Gradients (mean over batch active points)
                m = Xa.shape[0]
                grad_w_hinge = -(Xa.T @ ya) / m
                grad_w_hinge = np.asarray(grad_w_hinge).ravel()
                grad_b_hinge = -(ya).mean()
            else:
                grad_w_hinge = 0.0
                grad_b_hinge = 0.0

            # Add L2 regularization gradient
            grad_w = grad_w_hinge + reg * w
            grad_b = grad_b_hinge  # usually we don't regularize bias

            # SGD step
            w -= lr * grad_w
            b -= lr * grad_b

        # Optional: quick progress print
        # Compute average hinge loss on a small sample
        # print(f"epoch {epoch+1}/{epochs} done")

    return w, b

def svm_scores(X, w, b):
    return X @ w + b

def predict_linear_svm(X, w, b):
    # Classify by sign
    scores = X @ w + b
    return (scores >= 0).astype(int)

In [3]:
def f1_score_binary(y_true, y_pred):
    y_true = y_true.astype(int)
    y_pred = y_pred.astype(int)

    tp = int(((y_true == 1) & (y_pred == 1)).sum())
    fp = int(((y_true == 0) & (y_pred == 1)).sum())
    fn = int(((y_true == 1) & (y_pred == 0)).sum())

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall    = tp / (tp + fn) if (tp + fn) > 0 else 0.0

    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0
    return f1, precision, recall, tp, fp, fn

In [4]:
def tune_threshold(scores, y_true, n_grid=200, print_every=25):
    """
    Tune threshold t for y_pred = (scores >= t).
    Uses a grid between score percentiles for stability.
    """
    y_true = y_true.astype(int)

    lo = np.percentile(scores, 1)
    hi = np.percentile(scores, 99)
    thresholds = np.linspace(lo, hi, n_grid)

    best = {"f1": -1, "t": None, "prec": None, "rec": None, "tp": None, "fp": None, "fn": None}

    for i, t in enumerate(thresholds, start=1):
        y_pred = (scores >= t).astype(int)
        f1, prec, rec, tp, fp, fn = f1_score_binary(y_true, y_pred)

        if f1 > best["f1"]:
            best.update({"f1": f1, "t": float(t), "prec": prec, "rec": rec, "tp": tp, "fp": fp, "fn": fn})

        if (i % print_every == 0) or (i == 1) or (i == len(thresholds)):
            print(
                f"[{i:>3}/{len(thresholds)}] t={t:+.4f} | "
                f"F1={f1:.4f} (P={prec:.3f}, R={rec:.3f}) | "
                f"best F1={best['f1']:.4f} @ t={best['t']:+.4f}"
            )

    print("\nBEST:")
    print(
        f"t={best['t']:+.4f} | F1={best['f1']:.4f} | "
        f"P={best['prec']:.4f} | R={best['rec']:.4f} | "
        f"TP={best['tp']} FP={best['fp']} FN={best['fn']}"
    )
    return best

In [5]:
w, b = train_linear_svm_sgd(
    X_train, y_train,
    lr=0.5,
    reg=1e-3,
    epochs=10,
    batch_size=512,
    seed=42
)

yhat_val = predict_linear_svm(X_val, w, b)
f1, prec, rec, tp, fp, fn = f1_score_binary(y_val, yhat_val)

print(f"Validation F1: {f1:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f}")
print(f"TP={tp} FP={fp} FN={fn}")

Validation F1: 0.7143 | Precision: 0.7846 | Recall: 0.6556
TP=590 FP=162 FN=310


In [6]:
scores_val = svm_scores(X_val, w, b)
best = tune_threshold(scores_val, y_val, n_grid=300, print_every=30)

[  1/300] t=-2.7537 | F1=0.4651 (P=0.303, R=1.000) | best F1=0.4651 @ t=-2.7537
[ 30/300] t=-2.1769 | F1=0.4764 (P=0.313, R=0.999) | best F1=0.4764 @ t=-2.1769
[ 60/300] t=-1.5801 | F1=0.5386 (P=0.369, R=0.993) | best F1=0.5386 @ t=-1.5801
[ 90/300] t=-0.9833 | F1=0.7285 (P=0.589, R=0.956) | best F1=0.7285 @ t=-0.9833
[120/300] t=-0.3865 | F1=0.7691 (P=0.730, R=0.812) | best F1=0.7738 @ t=-0.5258
[150/300] t=+0.2103 | F1=0.6798 (P=0.830, R=0.576) | best F1=0.7738 @ t=-0.5258
[180/300] t=+0.8070 | F1=0.5429 (P=0.893, R=0.390) | best F1=0.7738 @ t=-0.5258
[210/300] t=+1.4038 | F1=0.3433 (P=0.918, R=0.211) | best F1=0.7738 @ t=-0.5258
[240/300] t=+2.0006 | F1=0.1884 (P=0.959, R=0.104) | best F1=0.7738 @ t=-0.5258
[270/300] t=+2.5974 | F1=0.0951 (P=0.978, R=0.050) | best F1=0.7738 @ t=-0.5258
[300/300] t=+3.1941 | F1=0.0645 (P=1.000, R=0.033) | best F1=0.7738 @ t=-0.5258

BEST:
t=-0.5258 | F1=0.7738 | P=0.7132 | R=0.8456 | TP=761 FP=306 FN=139


In [13]:
regs = [1e-5, 1e-4, 1e-3, 1e-2]
lrs  = [0.1, 0.5]

best = (-1, None, None)

for reg in regs:
    for lr in lrs:
        w, b = train_linear_svm_sgd(X_train, y_train, lr=lr, reg=reg, epochs=8, batch_size=512, seed=42)
        yhat = predict_linear_svm(X_val, w, b)
        f1, prec, rec, *_ = f1_score_binary(y_val, yhat)
        print(f"lr={lr} reg={reg} -> F1={f1:.4f} (P={prec:.3f}, R={rec:.3f})")

        if f1 > best[0]:
            best = (f1, lr, reg)

print("Best:", best)

lr=0.1 reg=1e-05 -> F1=0.5684 (P=0.697, R=0.480)
lr=0.5 reg=1e-05 -> F1=0.5912 (P=0.738, R=0.493)
lr=0.1 reg=0.0001 -> F1=0.5688 (P=0.698, R=0.480)
lr=0.5 reg=0.0001 -> F1=0.5879 (P=0.742, R=0.487)
lr=0.1 reg=0.001 -> F1=0.5684 (P=0.697, R=0.480)
lr=0.5 reg=0.001 -> F1=0.5923 (P=0.736, R=0.496)
lr=0.1 reg=0.01 -> F1=0.5722 (P=0.699, R=0.484)
lr=0.5 reg=0.01 -> F1=0.5836 (P=0.724, R=0.489)
Best: (0.5922974767596282, 0.5, 0.001)
