In [1]:
from biLSTM import BiLSTMClassifier
import pandas as pd
import numpy as np
import sentencepiece as spm
import re
from bs4 import BeautifulSoup
import unicodedata
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch
import os
from torch.utils.data import DataLoader,TensorDataset
import random
import torch.nn as nn
from sklearn.metrics import f1_score
import torch.nn.functional as F
import json
from sklearn.metrics import precision_recall_fscore_support,accuracy_score
import time
import itertools
import hashlib
import random
import joblib

SEED = 42

# Seed every RNG this notebook touches so a fresh "Restart & Run All"
# reproduces the same numbers.
random.seed(SEED)      # Python stdlib RNG (was imported but never seeded)
np.random.seed(SEED)   # NumPy legacy global RNG
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here only
# affects child processes spawned later, not hashing in this kernel.
os.environ["PYTHONHASHSEED"] = str(SEED)

# PyTorch (CPU & CUDA)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # if multi-GPU

In [2]:
# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# helpers

In [3]:
# helpers
def vocab_to_id_mapper(
        input_df
        ,max_len
        ,sp
        ,text_col="Body"
        ,pad_id=None
) :
    """Tokenise a text column into fixed-length id arrays.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing the text column. It is copied, never mutated.
    max_len : int
        Target length of every padded id sequence. Longer sequences are
        truncated on the right, shorter ones are right-padded.
    sp : object
        Tokenizer exposing ``piece_to_id(str)`` and ``encode_as_ids(str)``
        (e.g. a loaded ``sentencepiece.SentencePieceProcessor``).
    text_col : str, default "Body"
        Name of the column holding the raw text. The default keeps the
        original behaviour; pass another name instead of renaming columns.
    pad_id : int or None, default None
        Id used for padding. When ``None`` it is looked up via
        ``sp.piece_to_id("<pad>")`` and falls back to 0 if that returns -1.
        Pass the value the model was trained with to guarantee consistency.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_df`` with two extra columns: ``sp_ids`` (list of ids
        per row) and ``sp_ids_padded`` (``np.int32`` array of length
        ``max_len`` per row).

    Raises
    ------
    ValueError
        If ``max_len`` is negative (slicing with a negative length would
        silently drop tokens from the end instead of truncating).
    KeyError
        If ``text_col`` is not a column of ``input_df``.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")

    if pad_id is None:
        pad_id = sp.piece_to_id("<pad>")
        if pad_id == -1:
            pad_id = 0

    def encode_ids(text) :
        # Missing values become the empty string; any other non-string is stringified.
        if not isinstance(text, str):
            text = "" if pd.isna(text) else str(text)
        return sp.encode_as_ids(text)

    def pad_ids(ids) -> np.ndarray:
        # Truncate first, then right-pad up to max_len.
        ids = list(ids)[:max_len]
        return np.array(ids + [pad_id] * (max_len - len(ids)), dtype=np.int32)

    df = input_df.copy()
    df["sp_ids"] = df[text_col].apply(encode_ids)

    # One fixed-length NumPy array per row.
    df["sp_ids_padded"] = df["sp_ids"].apply(pad_ids)

    return df

def seed_worker(worker_id):
    """DataLoader ``worker_init_fn``: seed Python and NumPy RNGs from torch's seed.

    The seed is ``torch.initial_seed()`` reduced modulo 2**32. ``worker_id``
    is accepted to satisfy the hook signature but is not used.
    """
    seed_32bit = torch.initial_seed() % (1 << 32)
    random.seed(seed_32bit)
    np.random.seed(seed_32bit)

def df_to_ids_and_labels(df):
    """Split a tokenised frame into an id matrix and a label vector.

    Reads the ``sp_ids_padded`` column (one equal-length sequence per row) and
    the ``label`` column.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``int64`` matrix of stacked id rows, and ``int64`` vector of labels.
    """
    padded_rows = df["sp_ids_padded"].to_numpy()
    id_matrix = np.stack(padded_rows).astype(np.int64)
    label_vector = df["label"].astype(np.int64).to_numpy()
    return id_matrix, label_vector

def make_loader(X_ids, y, batch_size=128, shuffle=False):
    """Wrap id/label arrays in a ``DataLoader`` over a ``TensorDataset``.

    Both inputs are converted to ``torch.long`` tensors. The loader is created
    with ``pin_memory=True`` and the given ``batch_size`` / ``shuffle``.
    """
    features = torch.tensor(X_ids, dtype=torch.long)
    targets = torch.tensor(y, dtype=torch.long)
    return DataLoader(
        TensorDataset(features, targets),
        batch_size=batch_size,
        shuffle=shuffle,
        pin_memory=True,
    )

@torch.no_grad()
def extract_features(encoder, dl, device):
    """Run ``encoder`` over every batch of ``dl`` and collect its outputs.

    The encoder is switched to eval mode, each input batch is moved to
    ``device``, and the encoder output is brought back to the CPU.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``float32`` features concatenated over batches, and the matching
        ``int64`` labels.
    """
    encoder.eval()
    feat_chunks = []
    label_chunks = []
    for batch_inputs, batch_labels in dl:
        # Encoder output is expected to be one feature row per sample.
        embeddings = encoder(batch_inputs.to(device))
        feat_chunks.append(embeddings.cpu().numpy().astype(np.float32))
        label_chunks.append(batch_labels.numpy().astype(np.int64))
    return np.concatenate(feat_chunks), np.concatenate(label_chunks)


class TextDS(torch.utils.data.Dataset):
    """In-memory dataset of (token-id sequence, label) pairs.

    ``X`` is an iterable of equal-length id sequences that gets stacked into a
    single ``torch.long`` tensor; ``y`` is converted to a ``torch.long`` tensor.
    """

    def __init__(self, X, y):
        stacked_ids = np.stack(X)
        self.X = torch.tensor(stacked_ids, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        # Number of samples equals the number of labels.
        return len(self.y)

    def __getitem__(self, i):
        return self.X[i], self.y[i]

# base model reload

#### test set data tokenisation

In [4]:
# Directory holding manifest.json (and the other saved artefacts read later).
best_dir_path = 'best_ckpts'

with open(f"{best_dir_path}/manifest.json", 'r') as f:
    mf = json.load(f)

# File paths recorded in the manifest.
embed_matrix_path = mf['embedding_matrix_file']
sp_model_path = mf['sp_model_path']
best_ckpt = mf['best_ckpt']

# Integer-valued settings.
pad_id, max_len, hidden_dim, num_layers = (
    int(mf[key]) for key in ("pad_id", "max_len", "hidden_dim", "num_layers")
)
bidirectional = bool(mf["bidirectional"])
num_classes = int(mf["num_classes"])

# Float-valued settings. NOTE: `droupout` (sic) is the name later cells use.
droupout, weight_decay = (float(mf[key]) for key in ('dropout', 'weight_decay'))
best_thr = float(mf['val_threshold'])

In [5]:
# Reload the SentencePiece tokenizer and the embedding matrix used to
# tokenise / embed the test set.
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)

embedding_matrix = np.load(embed_matrix_path)

# Fail fast if the two artefacts do not belong together: every id the
# tokenizer can emit needs a row in the embedding matrix, otherwise the
# embedding lookup fails later with an opaque index error.
assert embedding_matrix.ndim == 2, (
    f"expected a 2-D embedding matrix, got shape {embedding_matrix.shape}"
)
assert embedding_matrix.shape[0] >= sp.get_piece_size(), (
    f"embedding rows ({embedding_matrix.shape[0]}) < tokenizer vocab "
    f"({sp.get_piece_size()})"
)


In [6]:
# Eyeball check: embedding matrix shape on the first line, tokenizer
# vocabulary size on the second.
print(embedding_matrix.shape, sp.get_piece_size(), sep="\n")

(50000, 300)
50000


In [None]:
# Dedicated, seeded generator handed to the DataLoader together with
# seed_worker so worker-side RNG state is reproducible.
g = torch.Generator()
g.manual_seed(SEED)

# Read the cleaned test split and expose its text under the 'Body' column
# that vocab_to_id_mapper expects (chained rename instead of inplace mutation).
test_df = (
    pd.read_csv('clean_data_bilstm/test_clean.csv')
    .rename(columns={'text_combined': 'Body'})
)

# Use the sequence length stored in the manifest rather than a hard-coded 256,
# so evaluation always matches the configuration saved with the checkpoint.
test_df = vocab_to_id_mapper(test_df, max_len, sp)

# Validate BEFORE building the dataset: TextDS stacks the rows, which would
# already fail (less clearly) on ragged sequences.
assert test_df['sp_ids_padded'].apply(len).eq(max_len).all(), \
    f"every padded sequence must have length {max_len}"

test_ds  = TextDS(test_df['sp_ids_padded'].values, test_df['label'].values)
test_dl  = DataLoader(test_ds, batch_size=128, shuffle=False,
                      num_workers=2, pin_memory=True,
                      worker_init_fn=seed_worker, generator=g)

In [32]:
# ---- rebuild model using manifest config ----
model = BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    pad_id=pad_id,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    dropout=droupout,
    bidirectional=bidirectional,
    num_classes=num_classes,
).to(device)

# ---- load checkpoint ----
# NOTE(security): torch.load unpickles the file; only load checkpoints you trust.
ckpt = torch.load(best_ckpt, map_location=device)
model.load_state_dict(ckpt["model"], strict=True)
model.eval()

# The '.4f' format spec only works on numbers, so a missing 'best_val_f1'
# must be rendered as '?' separately instead of being passed through the spec
# (the previous one-liner raised ValueError when the key was absent).
best_val_f1 = ckpt.get('best_val_f1')
best_val_f1_str = "?" if best_val_f1 is None else f"{best_val_f1:.4f}"

print(f"Model reloaded successfully onto device {device}")
print(f"Loaded epoch: {ckpt.get('epoch', '?')} | Best F1: {best_val_f1_str}")
print(" Model Architecture Summary")
print("=" * 40)
print(f"Model Type      : {model.__class__.__name__}")
print(f"Hidden Dim      : {hidden_dim}")
print(f"Num Layers      : {num_layers}")
print(f"Bidirectional   : {bidirectional}")
print(f"Dropout         : {droupout}")
print(f"Embedding Dim   : {embedding_matrix.shape[1]}")
print(f"Vocab Size      : {embedding_matrix.shape[0]}")
print(f"Output Classes  : {num_classes}")
print("=" * 40)
print(f"Total Parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Trainable Params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
print("=" * 40)

INFO:biLSTM:BiLSTM Encoder initialized | emb_dim=300, hidden_dim=256, layers=2, bidirectional=True, freeze_embeddings=True


Model reloaded successfully onto device cpu
Loaded epoch: 10 | Best F1: 0.9899
 Model Architecture Summary
Model Type      : BiLSTMClassifier
Hidden Dim      : 256
Num Layers      : 2
Bidirectional   : True
Dropout         : 0.5
Embedding Dim   : 300
Vocab Size      : 50000
Output Classes  : 2
Total Parameters: 17,721,794
Trainable Params: 2,721,794


# ablation reload

In [33]:
# Read the ablation metadata: the validation threshold and the feature
# dimensionality that extracted features are checked against later.
with open(f"{best_dir_path}/meta.json", "r") as f:
    meta = json.load(f)

val_thr, feat_dim = float(meta["val_threshold"]), int(meta["feat_dim"])

In [34]:
# ---- load checkpoint ----
ckpt = torch.load(best_ckpt, map_location=device)

# Build a second BiLSTM classifier with the same manifest hyperparameters and
# restore the checkpoint weights into it.
abl_model = BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    pad_id=pad_id,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    dropout=droupout,
    bidirectional=bidirectional,
    num_classes=num_classes,
)
abl_model = abl_model.to(device)
abl_model.load_state_dict(ckpt["model"], strict=True)
abl_model.eval()

# Only the encoder sub-module is used downstream, as a feature extractor.
encoder = abl_model.encoder
encoder.eval()

# NOTE(security): joblib.load unpickles the file; only load trusted artefacts.
hgb = joblib.load(f"{best_dir_path}/model.pkl")
print("✅ HistGB classifier head loaded")

INFO:biLSTM:BiLSTM Encoder initialized | emb_dim=300, hidden_dim=256, layers=2, bidirectional=True, freeze_embeddings=True


✅ HistGB classifier head loaded


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


#### test data tokenisation

In [35]:
# test_df is from upstream
Xte_ids, yte = df_to_ids_and_labels(test_df)
abl_test_dl  = make_loader(Xte_ids, yte, batch_size=128, shuffle=False)

# Extract encoder features through the loader built just above. The previous
# version passed the earlier `test_dl`, leaving `abl_test_dl` unused; both
# iterate test_df unshuffled, so sample order is unchanged.
X_te, y_te = extract_features(encoder, abl_test_dl, device)

# One feature row and one label per test example.
assert len(X_te) == len(y_te) == len(test_df), "feature/label count mismatch"



In [36]:
# The extracted feature width must match the `feat_dim` read from meta.json.
assert feat_dim == X_te.shape[1], (
    f"feat dim mismatch: {X_te.shape[1]} vs meta {feat_dim}"
)
print("Features:", X_te.shape)

Features: (8056, 512)


# ablation 2 reload

In [8]:
# Fresh seeded generator so this loader's worker seeding does not depend on
# how much of any earlier generator was consumed.
g = torch.Generator()
g.manual_seed(SEED)

# Read the cleaned cross-domain set and expose its text under the 'Body'
# column that vocab_to_id_mapper expects (chained rename, no inplace mutation).
cross_dom_df = (
    pd.read_csv('clean_data_bilstm/cross_domain_clean.csv')
    .rename(columns={'cleaned': 'Body'})
)

# Use the manifest's sequence length instead of a hard-coded 256 so this set
# is tokenised with the same configuration as the saved checkpoint.
cross_dom_df = vocab_to_id_mapper(cross_dom_df, max_len, sp)

# Validate BEFORE building the dataset: TextDS stacks the rows, which would
# already fail (less clearly) on ragged sequences.
assert cross_dom_df['sp_ids_padded'].apply(len).eq(max_len).all(), \
    f"every padded sequence must have length {max_len}"

cross_dom_ds  = TextDS(cross_dom_df['sp_ids_padded'].values, cross_dom_df['label'].values)
cross_dom_dl  = DataLoader(cross_dom_ds, batch_size=128, shuffle=False,
                      num_workers=2, pin_memory=True,
                      worker_init_fn=seed_worker, generator=g)

In [10]:
# Display the tokenised cross-domain frame to inspect the added
# 'sp_ids' and 'sp_ids_padded' columns next to the original text and label.
cross_dom_df

Unnamed: 0,original_text,Body,label,sp_ids,sp_ids_padded
0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,0,"[248, 1066, 4713, 525, 765, 6448, 753, 507, 43...","[248, 1066, 4713, 525, 765, 6448, 753, 507, 43..."
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni,0,"[1358, 1018, 24452, 20932, 84, 38488]","[1358, 1018, 24452, 20932, 84, 38488, 7, 7, 7,..."
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final ...,1,"[526, 2516, 43, 9, 10718, 113, 203, 30, 1209, ...","[526, 2516, 43, 9, 10718, 113, 203, 30, 1209, ..."
3,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,0,"[84, 5081, 1017, 221, 1822, 4510, 84, 16, 1251...","[84, 5081, 1017, 221, 1822, 4510, 84, 16, 1251..."
4,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,0,"[18302, 24, 619, 554, 204, 3292, 30, 30831, 20...","[18302, 24, 619, 554, 204, 3292, 30, 30831, 20..."
...,...,...,...,...,...
5088,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried contact u u...,1,"[85, 68, 18, 876, 728, 289, 70, 116, 3976, 499...","[85, 68, 18, 876, 728, 289, 70, 116, 3976, 499..."
5089,Will Ì_ b going to esplanade fr home?,will b going to esplanade fr home,0,"[110, 27, 628, 30, 1133, 10541, 669, 115, 937]","[110, 27, 628, 30, 1133, 10541, 669, 115, 937,..."
5090,"Pity, * was in mood for that. So...any other s...",pity was in mood for that soany other suggestions,0,"[34617, 250, 43, 7581, 60, 89, 221, 1714, 409,...","[34617, 250, 43, 7581, 60, 89, 221, 1714, 409,..."
5091,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like id ...,0,"[18, 3174, 643, 365, 45703, 284, 24, 14716, 39...","[18, 3174, 643, 365, 45703, 284, 24, 14716, 39..."
