In [2]:
import json, torch, numpy as np
import os
from biLSTM import BiLSTMEncoder
from sklearn.metrics import f1_score, precision_recall_fscore_support, classification_report
from sklearn.ensemble import HistGradientBoostingClassifier
import sentencepiece as spm
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
import joblib
from tqdm import tqdm



SEED = 42

# helpers

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
best_ckpt_dir = 'best_ckpts'

def f1_with_best_threshold(y_true, proba, average="binary"):
    """
    Sweep thresholds on the *external val set* to pick the best F1.
    Returns (best_f1, best_threshold).
    """
    thresholds = np.linspace(0.05, 0.95, 19)
    best_f1, best_t = -1.0, 0.5
    for t in thresholds:
        y_pred = (proba >= t).astype(int)
        f1 = f1_score(y_true, y_pred, average=average)
        if f1 > best_f1:
            best_f1, best_t = f1, t
    return best_f1, best_t

@torch.no_grad()
def extract_features(encoder, dl, device):
    encoder.eval()
    feats, labels = [], []
    for xb, yb in dl:
        xb = xb.to(device)
        z  = encoder(xb)                                # [B, feat_dim] (e.g., 512)
        feats.append(z.cpu().numpy().astype(np.float32))
        labels.append(yb.numpy().astype(np.int64))
    return np.concatenate(feats), np.concatenate(labels)

def vocab_to_id_mapper(
        input_df
        ,max_len
        ,sp
) :
    
    pad_id = sp.piece_to_id("<pad>")
    if pad_id == -1:  
        pad_id = 0

    
    def encode_ids(text) :
        if not isinstance(text, str):
            text = "" if pd.isna(text) else str(text)
        return sp.encode_as_ids(text)

    def pad_ids(ids,max_len,pad_id) -> np.ndarray:
        if len(ids) >= max_len:
            return np.array(ids[:max_len], dtype=np.int32)
        return np.array(ids + [pad_id] * (max_len - len(ids)), dtype=np.int32)

    
    df = input_df.copy()
    df["sp_ids"] = df["Body"].apply(encode_ids)

    # overwrite sp_ids_padded with NumPy arrays directly
    df["sp_ids_padded"] = df["sp_ids"].apply(lambda ids: pad_ids(ids, max_len, pad_id))

    return df

def df_to_ids_and_labels(df):
    # 'sp_ids_padded' should be a list/array per row; stack to [N, max_len]
    X_ids = np.stack(df["sp_ids_padded"].values).astype(np.int64)
    y = df["label"].astype(np.int64).values
    return X_ids, y



def make_loader(X_ids, y, batch_size=128, shuffle=False):
    X = torch.tensor(X_ids, dtype=torch.long)
    y = torch.tensor(y,     dtype=torch.long)
    ds = TensorDataset(X, y)
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, pin_memory=True)

# reloading model from saved checkpoint

In [4]:
# load manifest + embedding matrix
# CKPT_DIR = "checkpoints"
MANIFEST_PATH = os.path.join( best_ckpt_dir, "manifest.json")

with open(MANIFEST_PATH, "r") as f:
    manifest = json.load(f)

embedding_matrix = np.load(manifest["embedding_matrix_file"])

print(f"Loaded manifest from: {MANIFEST_PATH}")
print(f"Embedding matrix shape: {embedding_matrix.shape}")


# load checkpoint and extract encoder weights
ckpt = torch.load(manifest["best_ckpt"], map_location=device)
state_dict = ckpt["model"]  

# If model was saved with full classifier, extract encoder weights only
if any(k.startswith("encoder.") for k in state_dict.keys()):
    encoder_state = {
        k.replace("encoder.", ""): v
        for k, v in state_dict.items()
        if k.startswith("encoder.")
    }
else:
    # already encoder-only (safety fallback)
    encoder_state = state_dict

# rebuild and load encoder
encoder = BiLSTMEncoder(
    embedding_matrix=embedding_matrix,
    pad_id=manifest["pad_id"],
    hidden_dim=manifest["hidden_dim"],
    num_layers=manifest["num_layers"],
    dropout=0.0,  # irrelevant once frozen
    bidirectional=manifest["bidirectional"],
    freeze_embeddings=True,
).to(device)

encoder.load_state_dict(encoder_state, strict=True)
encoder.eval()
for p in encoder.parameters():
    p.requires_grad = False

print("✅ Encoder reloaded and frozen.")


Loaded manifest from: best_ckpts/manifest.json
Embedding matrix shape: (50000, 300)


  ckpt = torch.load(manifest["best_ckpt"], map_location=device)
INFO:biLSTM:BiLSTM Encoder initialized | emb_dim=300, hidden_dim=256, layers=2, bidirectional=True, freeze_embeddings=True


✅ Encoder reloaded and frozen.


# tokenising and extracting features for HistGradientBoost

In [3]:
load_path = 'embedder_files/'
sp = spm.SentencePieceProcessor()
sp.load(load_path+"email_sp.model")
pad_id = sp.piece_to_id("<pad>")


In [3]:
train_df = pd.read_csv('../../../datasets/encoder_dataset/clean/bilstm/encoder_train_clean.csv')
val_df = pd.read_csv('../../../datasets/encoder_dataset/clean/bilstm/encoder_valid_clean.csv')
test_df = pd.read_csv('../../../datasets/encoder_dataset/clean/bilstm/encoder_test_clean.csv')

print(len(set(train_df["text_combined"]) & set(test_df["text_combined"])))
print(len(set(val_df["text_combined"]) & set(test_df["text_combined"])))
print(len(set(train_df["text_combined"]) & set(val_df["text_combined"])))


0
0
0


In [7]:

train_df.rename(columns = {'text_combined':'Body'},inplace=True)
val_df.rename(columns = {'text_combined':'Body'},inplace=True)
test_df.rename(columns = {'text_combined':'Body'},inplace=True)


In [8]:
train_df = vocab_to_id_mapper(train_df,256,sp)
val_df = vocab_to_id_mapper(val_df,256,sp)
test_df = vocab_to_id_mapper(test_df,256,sp)

In [21]:
train_df.dtypes

Body             object
label             int64
sp_ids           object
sp_ids_padded    object
dtype: object

In [9]:
Xtr_ids, ytr = df_to_ids_and_labels(train_df)
Xva_ids, yva = df_to_ids_and_labels(val_df)
Xte_ids, yte = df_to_ids_and_labels(test_df)


train_dl = make_loader(Xtr_ids, ytr, batch_size=128, shuffle=False)
val_dl   = make_loader(Xva_ids, yva, batch_size=128, shuffle=False)
test_dl  = make_loader(Xte_ids, yte, batch_size=128, shuffle=False)

In [None]:
print("Train IDs shape:", Xtr_ids.shape)
print("Val IDs shape:", Xva_ids.shape)
print("Test IDs shape:", Xte_ids.shape)

Train IDs shape: (65662, 256)
Val IDs shape: (8062, 256)
Test IDs shape: (8056, 256)
Train labels: [31386 34276]


In [12]:
X_tr, y_tr = extract_features(encoder, train_dl, device)
X_va, y_va = extract_features(encoder, val_dl,   device)
X_te, y_te = extract_features(encoder, test_dl,  device)

# simple hyper param tuning

In [17]:
# tiny tuning grid (8 configs total)
param_grid = [
    {"learning_rate": 0.03, "max_depth": 4, "l2_regularization": 0.0, "min_samples_leaf": 20, "max_iter": 600},
    {"learning_rate": 0.03, "max_depth": 4, "l2_regularization": 1.0, "min_samples_leaf": 50, "max_iter": 600},
    {"learning_rate": 0.03, "max_depth": 6, "l2_regularization": 0.0, "min_samples_leaf": 20, "max_iter": 600},
    {"learning_rate": 0.03, "max_depth": 6, "l2_regularization": 1.0, "min_samples_leaf": 50, "max_iter": 600},

    {"learning_rate": 0.05, "max_depth": 4, "l2_regularization": 0.0, "min_samples_leaf": 20, "max_iter": 600},
    {"learning_rate": 0.05, "max_depth": 4, "l2_regularization": 1.0, "min_samples_leaf": 50, "max_iter": 600},
    {"learning_rate": 0.05, "max_depth": 6, "l2_regularization": 0.0, "min_samples_leaf": 20, "max_iter": 600},
    {"learning_rate": 0.05, "max_depth": 6, "l2_regularization": 1.0, "min_samples_leaf": 50, "max_iter": 600},


]


best_cfg, best_model, best_val_f1, best_thr = None, None, -1.0, 0.5
tuning_log = []

for cfg in tqdm(param_grid, desc="Grid Search (HGB)", ncols=100):
    hgb = HistGradientBoostingClassifier(
        **cfg,
        early_stopping=True,
        n_iter_no_change=20,
        validation_fraction=0.05,
        random_state=SEED
    )

    hgb.fit(X_tr, y_tr)
    proba_va = hgb.predict_proba(X_va)[:, 1]
    f1_va, thr = f1_with_best_threshold(y_va, proba_va)

    tuning_log.append({**cfg, "val_F1": f1_va, "thr": thr})

    if f1_va > best_val_f1:
        best_cfg, best_model, best_val_f1, best_thr = cfg, hgb, f1_va, thr

print(f"✅ Best (val): {best_cfg} | F1={best_val_f1:.4f} | thr={best_thr:.3f}")

Grid Search (HGB): 100%|██████████████████████████████████████████████| 8/8 [02:54<00:00, 21.85s/it]

✅ Best (val): {'learning_rate': 0.05, 'max_depth': 4, 'l2_regularization': 0.0, 'min_samples_leaf': 20, 'max_iter': 600} | F1=0.9911 | thr=0.350





# retraining on best config

In [18]:
# training best config for histboost 

# Train on TRAIN ONLY, using the chosen best config
hgb = HistGradientBoostingClassifier(
    **best_cfg,
    early_stopping=True, n_iter_no_change=30, validation_fraction=0.1,
    random_state=SEED
)
hgb.fit(X_tr, y_tr)

proba_va = hgb.predict_proba(X_va)[:, 1]
val_f1, val_thr = f1_with_best_threshold(y_va, proba_va)
y_hat_va = (proba_va >= val_thr).astype(int)
p, r, f1, _ = precision_recall_fscore_support(y_va, y_hat_va, average="binary")

print(f"[VAL] F1={f1:.4f}  P={p:.4f}  R={r:.4f}  (thr={val_thr:.3f})")
print(classification_report(y_va, y_hat_va, digits=4))


[VAL] F1=0.9906  P=0.9911  R=0.9902  (thr=0.950)
              precision    recall  f1-score   support

           0     0.9889    0.9900    0.9895      3795
           1     0.9911    0.9902    0.9906      4267

    accuracy                         0.9901      8062
   macro avg     0.9900    0.9901    0.9900      8062
weighted avg     0.9901    0.9901    0.9901      8062



In [None]:
# save model + metadata for clean reload later
joblib.dump(hgb, os.path.join(best_ckpt_dir, "model.pkl"))

meta = {
    "config": best_cfg,
    "val_threshold": float(val_thr),
    "feat_dim": int(X_tr.shape[1]),
    "seed": SEED,
    "trained_on": "train_only",
    "metrics": {"val_precision": float(p), "val_recall": float(r), "val_F1": float(f1)},
}
with open(os.path.join(best_ckpt_dir, "meta.json"), "w") as f:
    json.dump(meta, f, indent=2)

print(f"Saved model → {os.path.join(best_ckpt_dir, 'model.pkl')}")
print(f"Saved meta  → {os.path.join(best_ckpt_dir, 'meta.json')}")

Saved model → best_ckpts/model.pkl
Saved meta  → best_ckpts/meta.json
