In [None]:
from biLSTM import BiLSTMClassifier
import pandas as pd
import numpy as np
import sentencepiece as spm
import re
from bs4 import BeautifulSoup
import unicodedata
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch
import os
from torch.utils.data import DataLoader
import random
import torch.nn as nn
from sklearn.metrics import f1_score
import torch.nn.functional as F
import json
from sklearn.metrics import precision_recall_fscore_support,accuracy_score
import time


SEED = 42  # single seed shared by every random number generator below

# Python & NumPy
random.seed(SEED)  # Python's own RNG was imported but previously left unseeded
np.random.seed(SEED)
# Exported for any subprocess started from this kernel; the hash seed of the
# already-running interpreter is fixed at start-up and is not changed by this.
os.environ["PYTHONHASHSEED"] = str(SEED)

# PyTorch (CPU & CUDA)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # if multi-GPU

# Tokeniser

#### helpers

In [3]:
# mask special tokens (URLs, file names, e-mail addresses, money amounts, numbers)
def mask_tokens(text):
    """
    Replace URLs, file names, e-mail addresses and dollar amounts in `text`
    with the placeholders <URL>, <FILE>, <EMAIL> and <MONEY>, and delete
    standalone integers.

    The substitutions run in the order listed below, each on the output of the
    previous one. Returns the masked string.
    """
    substitutions = (
        # URLs (http, https, www)
        (r'(https?://\S+|www\.\S+)', '<URL>'),
        # names ending in a common file extension (customize list)
        (r'\b[\w\-]+\.(pdf|docx|xlsx|txt|csv|tar|doc\.gz|doc)\b', '<FILE>'),
        # emails
        (r'\b[\w\.-]+@[\w\.-]+\.\w+\b', '<EMAIL>'),
        # money: "$" followed by digits and an optional two-digit fraction
        (r'\$\d+(?:\.\d{2})?', '<MONEY>'),
        # standalone integers
        (r'\b\d+\b', '<NUMBER>'),
    )
    for pattern, placeholder in substitutions:
        text = re.sub(pattern, placeholder, text)

    # numbers are dropped rather than kept as a token, so every <NUMBER>
    # (including one that was already present in the input) is removed
    return text.replace('<NUMBER>', '')


# strip HTML from the raw text
def strip_html(raw_html):
    """
    Turn an HTML e-mail body into plain text.

    Steps:
      1. <a> tags whose href masks (via mask_tokens) to exactly one placeholder
         are replaced by that placeholder; other anchors are left alone.
      2. script / style / head / title / meta elements are removed.
      3. The remaining text is extracted, NFKC-normalized and all whitespace
         runs are collapsed to single spaces.

    Returns the cleaned, stripped string.
    """
    soup = BeautifulSoup(raw_html, "html.parser")

    placeholders = {'<URL>', '<EMAIL>', '<FILE>', '<MONEY>', '<NUMBER>'}
    for anchor in soup.find_all("a"):
        target = anchor.get("href", None)
        if target:  # anchors without an href are skipped
            masked = mask_tokens(target)
            # only swap the anchor when the whole href collapsed to one token
            if masked in placeholders:
                anchor.replace_with(masked)

    # remove script, style, head, and metadata tags
    for node in soup(["script", "style", "head", "title", "meta", "[document]"]):
        node.decompose()

    # extract text and normalize unicode
    text = unicodedata.normalize("NFKC", soup.get_text(separator=" "))

    # replace non-breaking spaces specifically (unicode)
    text = text.replace("\xa0", " ")

    # collapse line breaks, tabs and repeated spaces into one space, then
    # trim leading/trailing spaces
    return re.sub(r"\s+", " ", text).strip()

# special case handling
# character-for-character substitutions applied before de-obfuscation
# (digit / symbol look-alikes -> letters)
mapper = str.maketrans(dict(zip("013457$@", "oleastsa")))

def deobfuscate_words(text):
    """
    Undo simple character obfuscation.

    First every character listed in `mapper` is replaced by its letter
    (e.g. fr33 -> free); this applies to all such characters in the text,
    not only those inside words. Then any run of 1-3 characters that are
    neither ASCII alphanumerics nor whitespace and that sits directly between
    two word characters is deleted (l-o-v-e -> love).
    """
    translated = text.translate(mapper)
    return re.sub(r'(?i)(?<=\w)[^A-Za-z0-9\s]{1,3}(?=\w)', '', translated)

def word_capper(text):
    """
    Cap repeated characters: any character (other than a newline) repeated
    three or more times in a row is reduced to two occurrences, then repeated
    ! ? . , punctuation is normalized to exactly two.
    """
    squeezed = re.sub(r'(.)\1{2,}', r'\1\1', text)
    return re.sub(r'([!?.,])\1{1,}', r'\1\1', squeezed)


# whitelist filtering
def char_lvl_whitelist_filter(text):
    """
    Delete every character that is not an ASCII letter, a digit, whitespace or
    one of the punctuation marks listed in the pattern below.
    """
    not_whitelisted = r'[^a-zA-Z0-9\s\.\,\!\?\'\":;\-\_\(\)\@\#\$\%\^\&\<\>]'
    return re.sub(not_whitelisted, '', text)

# word level processor 
def lemmatizer(text) :
    """
    Lemmatize every alphabetic word of `text` with WordNet and return the
    resulting string; digits, punctuation and whitespace are left in place.

    Placeholder tokens of the form <WORD> (e.g. <URL>) are matched as a whole
    and passed through unchanged.

    NOTE(review): this used to loop over the *characters* of `text`, sending
    each single character through WordNet, so no word was ever lemmatized.
    Tokenizer / embedding artifacts built with that earlier behaviour should
    be checked against (or rebuilt with) this word-level version.
    """
    wnl = WordNetLemmatizer()

    def _lemmatize_match(match):
        token = match.group(0)
        # the first alternative of the pattern captures placeholders whole
        if token.startswith('<'):
            return token
        return wnl.lemmatize(token)

    return re.sub(r'<[A-Za-z]+>|[A-Za-z]+', _lemmatize_match, text)

#final clean
def final_punc_removal(text):
    """
    Replace every character that is not an ASCII letter, digit, whitespace,
    '<' or '>' with a space, then collapse whitespace runs to one space and
    trim both ends.
    """
    spaced = re.sub(r'[^A-Za-z0-9\s<>]', ' ', text)
    return re.sub(r'\s+', ' ', spaced).strip()

    
def preprocess_email_text(raw):
    """
    Run the full cleaning pipeline on one e-mail.

    input : raw e-mail text (may contain HTML)
    output : the cleaned, lower-cased text
    """
    stages = (
        strip_html,                 # first, so links inside <a> tags are captured
        mask_tokens,                # mask special tokens
        deobfuscate_words,
        word_capper,
        lemmatizer,
        char_lvl_whitelist_filter,
        final_punc_removal,
    )
    cleaned = raw
    for stage in stages:
        cleaned = stage(cleaned)
    return cleaned.lower()

def preprocess_email_df(df, text_col):
    """
    Return a copy of `df` in which column `text_col` has been cleaned with
    preprocess_email_text.

    The input frame is left untouched, so the raw data stays available and
    re-running the calling cell does not clean already-cleaned text.
    """
    cleaned = df.copy()
    cleaned[text_col] = cleaned[text_col].apply(preprocess_email_text)
    return cleaned


def vocab_builder(input_df, vocab_size, model_type):
    """
    Train a SentencePiece model on the "Body" column of `input_df`.

    The column is first dumped to emails_clean.txt (no index, no header) and
    that file is passed to the trainer together with `vocab_size`,
    `model_type` and the user-defined symbols
    <url>, <email>, <file>, <money>, <pad>. Nothing is returned; the trainer
    is given the output prefix "email_sp".
    """
    input_df["Body"].to_csv("emails_clean.txt", index=False, header=False)

    # train SentencePiece model
    trainer_flags = [
        "--input=emails_clean.txt",
        "--model_prefix=email_sp",
        f"--vocab_size={vocab_size}",
        "--character_coverage=1.0",
        f"--model_type={model_type}",
        "--shuffle_input_sentence=false",
        "--seed_sentencepiece_size=1000000",
        "--user_defined_symbols=<url>,<email>,<file>,<money>,<pad>",
    ]
    spm.SentencePieceTrainer.Train(" ".join(trainer_flags))


#####################################################################################################################
def vocab_to_id_mapper(input_df, max_len, sp):
    """
    Encode the "Body" column with `sp` and pad/truncate to `max_len` ids.

    Returns a copy of `input_df` with two extra columns:
      sp_ids        - list of ids from sp.encode_as_ids
      sp_ids_padded - int32 NumPy array of exactly `max_len` ids, truncated or
                      right-padded with the id of "<pad>" (0 if the lookup
                      returns -1)
    Missing bodies (NaN/None) are encoded as the empty string; other
    non-string values are converted with str().
    """
    pad_id = sp.piece_to_id("<pad>")
    pad_id = 0 if pad_id == -1 else pad_id

    def _to_ids(text):
        if isinstance(text, str):
            return sp.encode_as_ids(text)
        return sp.encode_as_ids("" if pd.isna(text) else str(text))

    def _fit_length(ids):
        # a non-positive repeat count yields no filler, so long sequences are
        # only truncated by the slice
        filler = [pad_id] * (max_len - len(ids))
        return np.array((ids + filler)[:max_len], dtype=np.int32)

    encoded = input_df.copy()
    encoded["sp_ids"] = encoded["Body"].apply(_to_ids)
    encoded["sp_ids_padded"] = encoded["sp_ids"].apply(_fit_length)
    return encoded

def build_embedding_matrix(w2v, sp, pad_id: int, seed: int = 42):
    """
    Build embedding matrix aligned with SentencePiece IDs.

    Row i holds the Word2Vec vector of piece i when that piece is in
    `w2v.wv`; otherwise it is drawn from N(0, 0.01) using a generator seeded
    with `seed`. The row at `pad_id` (when in range) is zeroed afterwards.

    Returns (E, metadata) where E is a float32 array of shape
    (vocab_size, emb_dim) and metadata contains:
      vocab_size    - number of SentencePiece pieces
      emb_dim       - w2v.vector_size
      pad_id        - the pad id passed in
      trained_vocab - len(w2v.wv)
      oov_count     - number of pieces that were NOT found in w2v.wv
    """
    vocab_size = sp.get_piece_size()
    emb_dim = w2v.vector_size

    E = np.zeros((vocab_size, emb_dim), dtype=np.float32)
    rng = np.random.default_rng(seed)

    # Count misses while filling the matrix: vocab_size - len(w2v.wv) is only
    # right when every Word2Vec key is also a SentencePiece piece.
    oov_count = 0
    for sp_id in range(vocab_size):
        piece = sp.id_to_piece(sp_id)
        if piece in w2v.wv:
            E[sp_id] = w2v.wv[piece]
        else:
            E[sp_id] = rng.normal(0.0, 0.01, size=emb_dim).astype(np.float32)
            oov_count += 1

    # Keep PAD = 0
    if 0 <= pad_id < vocab_size:
        E[pad_id] = 0.0

    metadata = {
        "vocab_size": vocab_size,
        "emb_dim": emb_dim,
        "pad_id": pad_id,
        "trained_vocab": len(w2v.wv),
        "oov_count": oov_count,
    }
    return E, metadata

class TextDS(torch.utils.data.Dataset):
    """
    Dataset over pre-padded id sequences and their labels.

    X is a sequence of equal-length arrays (stacked into one long tensor) and
    y the matching labels (converted to a long tensor). Item i is the pair
    (X[i], y[i]).
    """

    def __init__(self, X, y):
        stacked = np.stack(X)
        self.X = torch.tensor(stacked, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        return (self.X[i], self.y[i])

In [4]:
# build the SentencePiece-id -> word-embedding matrix
load_path = 'full_447_batch_A/'


# load saved model
w2v_model = Word2Vec.load(os.path.join(load_path, "word2vec.model"))

# SentencePiece model & pad_id (sentencepiece is already imported in the
# import cell at the top of the notebook)
sp = spm.SentencePieceProcessor()
sp.load(os.path.join(load_path, "email_sp.model"))
pad_id = sp.piece_to_id("<pad>")
if pad_id == -1:
    pad_id = 0

# second name for the same processor object; later cells use both
subword_processor = sp

embedding_matrix, embedding_summary = build_embedding_matrix(w2v_model,subword_processor,pad_id)

INFO:gensim.utils:loading Word2Vec object from full_447_batch_A/word2vec.model
INFO:gensim.utils:loading wv recursively from full_447_batch_A/word2vec.model.wv.* with mmap=None
INFO:gensim.utils:loading vectors from full_447_batch_A/word2vec.model.wv.vectors.npy with mmap=None
INFO:gensim.utils:loading syn1neg from full_447_batch_A/word2vec.model.syn1neg.npy with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:Word2Vec lifecycle event {'fname': 'full_447_batch_A/word2vec.model', 'datetime': '2025-10-14T16:41:50.601672', 'gensim': '4.3.3', 'python': '3.12.11 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 08:03:38) [Clang 14.0.6 ]', 'platform': 'macOS-14.5-arm64-arm-64bit', 'event': 'loaded'}


In [5]:
# show the metadata dict returned by build_embedding_matrix
embedding_summary


{'vocab_size': 50000,
 'emb_dim': 300,
 'pad_id': 7,
 'trained_vocab': 48242,
 'oov_count': 1758}

In [6]:
# look up the id SentencePiece assigned to the "<pad>" piece and display it
pad_id = subword_processor.piece_to_id('<pad>')
pad_id

7

#### Train-valid-test split

In [16]:
# load the pre-split train / validation / test CSVs
train_df_raw = pd.read_csv('../raw_encoder_data_sets/train_set.csv')
val_df_raw = pd.read_csv('../raw_encoder_data_sets/valid_set.csv')
test_df_raw = pd.read_csv('../raw_encoder_data_sets/test_set.csv')


In [17]:
# (rows, columns) of each raw split
print(train_df_raw.shape)
print(val_df_raw.shape)
print(test_df_raw.shape)


(65662, 2)
(8208, 2)
(8208, 2)


In [19]:
# number of identical raw texts shared by each pair of splits (val/test, val/train, test/train)
print(len(set(val_df_raw["text_combined"]) & set(test_df_raw["text_combined"])))
print(len(set(val_df_raw["text_combined"]) & set(train_df_raw["text_combined"])))
print(len(set(test_df_raw["text_combined"]) & set(train_df_raw["text_combined"])))

0
0
0


In [20]:
# preprocess same as word2vec preprocessing
train_df = preprocess_email_df(train_df_raw,'text_combined')
val_df = preprocess_email_df(val_df_raw,'text_combined')
test_df = preprocess_email_df(test_df_raw,'text_combined')


In [None]:
text_col = "text_combined"

# Cleaning can make previously distinct e-mails identical, so overlaps are
# removed again in hierarchical order Train → Val → Test: train keeps
# everything, val drops rows seen in train, test drops rows seen in either.
train_texts = set(train_df[text_col])

val_is_new = ~val_df[text_col].isin(train_texts)
val_df = val_df.loc[val_is_new].reset_index(drop=True)
val_texts = set(val_df[text_col])

seen_texts = train_texts | val_texts
test_is_new = ~test_df[text_col].isin(seen_texts)
test_df = test_df.loc[test_is_new].reset_index(drop=True)

In [31]:
# Cache the cleaned splits on disk and reload them, so later cells work from
# the same files a fresh session would read.
clean_dir = 'clean_data_bilstm'
os.makedirs(clean_dir, exist_ok=True)  # to_csv does not create missing directories

train_df.to_csv(os.path.join(clean_dir, 'train_clean.csv'), index=False)
val_df.to_csv(os.path.join(clean_dir, 'val_clean.csv'), index=False)
test_df.to_csv(os.path.join(clean_dir, 'test_clean.csv'), index=False)

# read_csv parses empty fields as NaN by default, so texts that were cleaned
# down to "" come back as NaN; vocab_to_id_mapper encodes those as "".
train_df = pd.read_csv(os.path.join(clean_dir, 'train_clean.csv'))
val_df = pd.read_csv(os.path.join(clean_dir, 'val_clean.csv'))
test_df = pd.read_csv(os.path.join(clean_dir, 'test_clean.csv'))


In [34]:
# (rows, columns) of each split after cleaning and cross-split de-duplication
print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

(65662, 2)
(8062, 2)
(8056, 2)


In [32]:
# check no overlaps across sets
print(len(set(train_df["text_combined"]) & set(test_df["text_combined"])))
print(len(set(val_df["text_combined"]) & set(test_df["text_combined"])))
print(len(set(train_df["text_combined"]) & set(val_df["text_combined"])))

0
0
0


In [8]:
# vocab_to_id_mapper reads the text from a column called "Body"; rebind
# instead of renaming in place so the cell has no hidden side effects
train_df = train_df.rename(columns={'text_combined': 'Body'})
val_df = val_df.rename(columns={'text_combined': 'Body'})
test_df = test_df.rename(columns={'text_combined': 'Body'})


In [9]:
# tokenise and pad
train_df = vocab_to_id_mapper(train_df,256,sp)
val_df = vocab_to_id_mapper(val_df,256,sp)
test_df = vocab_to_id_mapper(test_df,256,sp)


In [10]:
# convert to torch object for model injection

# wrap the padded id arrays and the labels of each split in a TextDS
train_ds = TextDS(train_df['sp_ids_padded'].values, train_df['label'].values)
val_ds   = TextDS(val_df['sp_ids_padded'].values, val_df['label'].values)
test_ds  = TextDS(test_df['sp_ids_padded'].values, test_df['label'].values)

In [11]:
# sanity check: every padded sequence is exactly 256 ids long
assert train_df['sp_ids_padded'].apply(len).eq(256).all()
assert val_df['sp_ids_padded'].apply(len).eq(256).all()
assert test_df['sp_ids_padded'].apply(len).eq(256).all()

# Training Loop

In [12]:
# use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [None]:
# directory that receives checkpoints, training history and the manifest
CKPT_DIR = "checkpoints"
os.makedirs(CKPT_DIR, exist_ok=True)

def save_ckpt(path, epoch, model, optimizer, scheduler, config, best_f1, seed):
    """
    Write a training checkpoint to `path`.

    The checkpoint is a dict with keys epoch, model, optimizer, scheduler
    (None when no scheduler is given), config, best_val_f1 and seed. It is
    first written to `path + ".tmp"` and then moved over `path`, so an
    interrupted save never leaves a truncated file under the final name.
    """
    state = {
        "epoch": epoch,
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "scheduler": scheduler.state_dict() if scheduler is not None else None,
        "config": config,
        # Metrics coming from scikit-learn are NumPy scalars; store a plain
        # Python float so the checkpoint holds no pickled NumPy objects and
        # stays loadable when torch.load restricts unpickling to weights.
        "best_val_f1": float(best_f1),
        "seed": seed,
    }
    tmp = path + ".tmp"
    torch.save(state, tmp)
    os.replace(tmp, path)  # atomic replace

def load_ckpt(path, model, optimizer=None, scheduler=None, map_location="cpu"):
    """
    Load the checkpoint at `path` and restore state from it.

    The model weights are always restored. The optimizer / scheduler state is
    restored only when the corresponding object is passed in AND the
    checkpoint holds a non-None entry for it. Returns the checkpoint dict.
    """
    ckpt = torch.load(path, map_location=map_location)
    model.load_state_dict(ckpt["model"])
    for key, target in (("optimizer", optimizer), ("scheduler", scheduler)):
        saved_state = ckpt.get(key)
        if target is not None and saved_state is not None:
            target.load_state_dict(saved_state)
    return ckpt

def latest_epoch_path(ckpt_dir=None):
    """
    Return the path of the per-epoch checkpoint with the highest epoch number.

    Only files named exactly ``epoch_<digits>.pt`` are considered and they are
    ranked by the numeric value of <digits>, so the result stays correct when
    the epoch number outgrows its zero padding (epoch_100 > epoch_99).

    ckpt_dir : directory to scan; defaults to the notebook-level CKPT_DIR.
    Returns None when the directory does not exist or holds no such file.
    """
    ckpt_dir = CKPT_DIR if ckpt_dir is None else ckpt_dir
    if not os.path.isdir(ckpt_dir):
        return None

    numbered = []
    for fname in os.listdir(ckpt_dir):
        match = re.fullmatch(r"epoch_(\d+)\.pt", fname)
        if match:
            numbered.append((int(match.group(1)), fname))
    if not numbered:
        return None

    # tuples compare on the epoch number first
    return os.path.join(ckpt_dir, max(numbered)[1])

def evaluate_metrics(dl, model, device):
    """
    Evaluate `model` on every batch of `dl` and return
    (precision, recall, f1, accuracy).

    Predictions are the argmax over the model's logits; precision / recall /
    F1 use scikit-learn's average="binary" with zero_division=0. The model is
    put in eval mode and no gradients are tracked.
    """
    model.eval()
    gold, predicted = [], []
    with torch.no_grad():
        for inputs, labels in dl:
            batch_logits = model(inputs.to(device))
            predicted.extend(torch.argmax(batch_logits, dim=1).cpu().numpy())
            gold.extend(labels.numpy())
    precision, recall, f1, _ = precision_recall_fscore_support(
        gold, predicted, average="binary", zero_division=0
    )
    accuracy = accuracy_score(gold, predicted)
    return precision, recall, f1, accuracy

def seed_worker(worker_id):
    """
    DataLoader worker_init_fn: seed NumPy and Python's `random` from the
    current torch seed (reduced to 32 bits). `worker_id` is not used.
    """
    seed_32bit = torch.initial_seed() % 2**32
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(seed_32bit)

def load_best_for_inference(manifest_path="checkpoints/manifest.json", map_location=None):
    """
    Rebuild the classifier described by the manifest and load its best weights.

    The manifest (JSON) supplies the embedding-matrix file, the model
    hyper-parameters and the path of the best checkpoint. The model is placed
    on `map_location` when given, otherwise on CUDA if available, else CPU.

    Returns (model in eval mode, manifest dict).
    """
    if map_location is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = map_location

    with open(manifest_path) as f:
        m = json.load(f)

    # rebuild model from the hyper-parameters recorded in the manifest
    hyper_keys = ("pad_id", "hidden_dim", "num_layers", "dropout", "bidirectional", "num_classes")
    model = BiLSTMClassifier(
        embedding_matrix=np.load(m["embedding_matrix_file"]),
        **{key: m[key] for key in hyper_keys},
    ).to(device)

    # load best weights
    best_state = torch.load(m["best_ckpt"], map_location=device)
    model.load_state_dict(best_state["model"])
    model.eval()
    return model, m

def get_lr(optim):
    """Return the learning rate of the optimizer's first parameter group (None if it has none)."""
    return next((group["lr"] for group in optim.param_groups), None)

In [14]:


# generator that drives shuffling / worker base seeds, seeded for reproducibility
g = torch.Generator()
g.manual_seed(SEED)

# Pinned host memory only speeds up host -> GPU copies; requesting it on a
# CPU-only run brings no benefit, so enable it only when training on CUDA.
use_pinned_memory = device.type == "cuda"

# NOTE(review): with num_workers > 0 the dataset and seed_worker are handed to
# worker processes; on platforms that start workers with "spawn",
# notebook-defined classes/functions may not be importable there - confirm
# this runs on the target machine, or set num_workers=0.
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True,
                      num_workers=2, pin_memory=use_pinned_memory,
                      worker_init_fn=seed_worker, generator=g)
val_dl   = DataLoader(val_ds, batch_size=128, shuffle=False,
                      num_workers=2, pin_memory=use_pinned_memory,
                      worker_init_fn=seed_worker, generator=g)
test_dl  = DataLoader(test_ds, batch_size=128, shuffle=False,
                      num_workers=2, pin_memory=use_pinned_memory,
                      worker_init_fn=seed_worker, generator=g)

In [None]:
# Hyper-parameters of this run; the dict is stored inside every checkpoint.
CONFIG = {
    "seed": 42,
    "batch_size_train": 64,
    "batch_size_eval": 128,
    "hidden_dim": 256,
    "num_layers": 2,
    "dropout": 0.3,
    "bidirectional": True,
    "lr": 1e-3,
    "max_epochs": 20,
    "patience": 3,            # epochs without improvement before early stopping
    "pad_id": int(pad_id),
    "max_len": 256,
    "num_classes": 2,
}


# Model built from CONFIG on top of the pre-computed embedding matrix
model = BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    pad_id=CONFIG["pad_id"],
    hidden_dim=CONFIG["hidden_dim"],
    num_layers=CONFIG["num_layers"],
    dropout=CONFIG["dropout"],
    bidirectional=CONFIG["bidirectional"],
    num_classes=CONFIG["num_classes"],
)
model = model.to(device)

criterion = torch.nn.CrossEntropyLoss()  # or weighted, as earlier

# only parameters that require gradients are handed to the optimizer
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(
    trainable_params,
    lr=CONFIG["lr"],
    weight_decay=CONFIG.get("weight_decay", 0.0),
)

# halve the learning rate when the monitored value (stepped with the
# validation loss in the training loop) stops decreasing for more than 1 epoch
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=1
)


INFO:biLSTM:BiLSTM Encoder initialized | emb_dim=300, hidden_dim=256, layers=2, bidirectional=True, freeze_embeddings=True


In [None]:

best_f1 = -1.0
best_val_loss = float("inf")
start_epoch = 1

# resume if a checkpoint exists
latest = latest_epoch_path()
if latest:
    ckpt = load_ckpt(latest, model, optimizer, scheduler, map_location=device)
    start_epoch = ckpt["epoch"] + 1
    best_f1 = float(ckpt.get("best_val_f1", -1.0))
    print(f"Resumed from {latest} at epoch {start_epoch-1}, best_val_f1={best_f1:.4f}")

# NOTE(review): best_val_loss, no_improve and history are not part of the
# checkpoint, so a resumed run restarts its early-stopping bookkeeping and
# training_history.csv only covers the epochs run in this session.
no_improve = 0
history = []

for epoch in range(start_epoch, CONFIG["max_epochs"] + 1):
    model.train()
    # Optional: re-seed per epoch to keep determinism stable across resumes
    # torch.manual_seed(CONFIG["seed"] + epoch)
    # torch.cuda.manual_seed_all(CONFIG["seed"] + epoch)

    running_loss = 0.0

    # add tqdm progress bar for training loop
    train_pbar = tqdm(train_dl, desc=f"Epoch {epoch:02d} [Train]", leave=False)
    for xb, yb in train_pbar:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad(set_to_none=True)
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        # gradient clipping is applied only when CONFIG defines a non-zero clip_grad_norm
        if "clip_grad_norm" in CONFIG and CONFIG["clip_grad_norm"]:
            torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG["clip_grad_norm"])
        optimizer.step()

        running_loss += loss.item()
        train_pbar.set_postfix(loss=f"{loss.item():.4f}")  # show current batch loss

    # compute average train loss for the epoch (mean of the per-batch losses)
    train_loss_avg = running_loss / max(1, len(train_dl))

    # compute validation loss (in addition to F1)
    model.eval()
    val_running_loss = 0.0
    with torch.no_grad():
        val_pbar = tqdm(val_dl, desc=f"Epoch {epoch:02d} [Val]", leave=False)
        for xb, yb in val_pbar:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            vloss = criterion(logits, yb)
            val_running_loss += vloss.item()
            val_pbar.set_postfix(vloss=f"{vloss.item():.4f}")

    val_loss_avg = val_running_loss / max(1, len(val_dl))

    p, r, val_f1, acc = evaluate_metrics(val_dl, model, device)
    # scikit-learn returns a NumPy scalar; keep a plain float so nothing
    # NumPy-typed ends up inside the checkpoints
    val_f1 = float(val_f1)
    scheduler.step(val_loss_avg)
    print(f"epoch {epoch:02d} | train_loss={train_loss_avg:.4f} | val_loss={val_loss_avg:.4f} | "
          f"P={p:.4f} | R={r:.4f} | F1={val_f1:.4f}")

    # record metrics
    history.append({
        "epoch": epoch,
        "train_loss": float(train_loss_avg),
        "val_loss": float(val_loss_avg),
        "val_precision": float(p),
        "val_recall": float(r),
        "val_F1": float(val_f1),
        "val_accuracy": float(acc),
    })

    # Update the running best BEFORE writing any checkpoint, so the per-epoch
    # checkpoint used for resuming carries the up-to-date best F1 (otherwise a
    # resumed run could overwrite best.pt with a worse model).
    improved = val_f1 > best_f1
    if improved:
        best_f1 = val_f1

    # save per-epoch checkpoint
    save_ckpt(os.path.join(CKPT_DIR, f"epoch_{epoch:02d}.pt"),
              epoch, model, optimizer, scheduler, CONFIG, best_f1, CONFIG["seed"])

    # keep a copy of the best model based on F1
    if improved:
        save_ckpt(os.path.join(CKPT_DIR, "best.pt"),
                  epoch, model, optimizer, scheduler, CONFIG, best_f1, CONFIG["seed"])

    # early stopping based on val loss
    if val_loss_avg < best_val_loss - 1e-5:  # small tolerance
        best_val_loss = val_loss_avg
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= CONFIG["patience"]:
            print("Early stopping triggered on validation loss.")
            break

# save training results
history_df = pd.DataFrame(history)
history_path = os.path.join(CKPT_DIR, "training_history.csv")
history_df.to_csv(history_path, index=False)
print("Saved training history to:", history_path)

# save embedding matrix
embedding_path = os.path.join(CKPT_DIR, "embedding_matrix.npy")
np.save(embedding_path, embedding_matrix)

# create manifest (everything load_best_for_inference needs, plus the tokenizer path)
manifest = {
    "seed": CONFIG["seed"],
    "pad_id": CONFIG["pad_id"],
    "max_len": CONFIG["max_len"],
    "hidden_dim": CONFIG["hidden_dim"],
    "num_layers": CONFIG["num_layers"],
    "dropout": CONFIG["dropout"],
    "bidirectional": CONFIG["bidirectional"],
    "num_classes": CONFIG["num_classes"],
    "embedding_matrix_file": embedding_path,
    # point at the SentencePiece model that was actually loaded above
    "sp_model_path": os.path.join(load_path, "email_sp.model"),
    "best_ckpt": os.path.join(CKPT_DIR, "best.pt"),
}
with open(os.path.join(CKPT_DIR, "manifest.json"), "w") as f:
    json.dump(manifest, f, indent=2)
print("Saved manifest and embedding matrix.")

In [None]:

# # load best before final test
# best_ckpt = load_ckpt(os.path.join(CKPT_DIR, "best.pt"), model, map_location=device)
# print(f"Loaded best model from epoch {best_ckpt['epoch']} with val_F1={best_ckpt['best_val_f1']:.4f}")

In [None]:
# reloader 



In [None]:
def pr_auc(dloader):
    """
    Average precision (area under the precision-recall curve) of the
    notebook-level `model` on `dloader`, scoring each example with the
    softmax probability of class 1.
    """
    # imported here because the notebook's import cell does not pull it in
    from sklearn.metrics import average_precision_score

    model.eval()
    probs, gold = [], []
    with torch.no_grad():
        for xb, yb in dloader:
            xb = xb.to(device)
            p = F.softmax(model(xb), dim=-1)[:, 1].cpu().numpy()
            probs.append(p)
            gold.append(yb.numpy())
    return average_precision_score(np.concatenate(gold), np.concatenate(probs))

# NOTE(review): this evaluates whatever weights `model` currently holds (the
# last trained epoch unless best.pt was loaded beforehand) - confirm intent.
test_p, test_r, test_f1, test_acc = evaluate_metrics(test_dl, model, device)
test_prauc = pr_auc(test_dl)
# evaluate_metrics uses average="binary", so this is binary F1, not macro-F1
print(f"TEST P={test_p:.4f}  R={test_r:.4f}  F1={test_f1:.4f}  acc={test_acc:.4f}  PR-AUC={test_prauc:.4f}")