In [2]:
import json
import torch
import numpy as np
import os
from sklearn.metrics import (
    f1_score, precision_recall_fscore_support, 
    classification_report, accuracy_score
)
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader, Dataset
import joblib
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertModel
from peft import PeftModel
import warnings
warnings.filterwarnings('ignore')

SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cpu')

In [2]:
best_ckpt_dir = 'best_ckpts_distilbert'

def make_hgb(
    learning_rate=0.05,
    max_depth=6,
    max_iter=600,
    l2_regularization=1.0,
    min_samples_leaf=50,
    class_weight=None,
):
    """
    Create HistGradientBoostingClassifier with specified parameters
    """
    return HistGradientBoostingClassifier(
        learning_rate=learning_rate,
        max_depth=max_depth,
        max_iter=max_iter,
        l2_regularization=l2_regularization,
        min_samples_leaf=min_samples_leaf,
        class_weight=class_weight,
        early_stopping=True,
        n_iter_no_change=30,
        validation_fraction=0.1,
        random_state=SEED,
        scoring="loss",
        categorical_features=None,
        monotonic_cst=None
    )

def f1_with_best_threshold(y_true, proba, average="binary"):
    """
    Sweep thresholds on the validation set to pick the best F1.
    Returns (best_f1, best_threshold).
    """
    thresholds = np.linspace(0.05, 0.95, 19)
    best_f1, best_t = -1.0, 0.5
    for t in thresholds:
        y_pred = (proba >= t).astype(int)
        f1 = f1_score(y_true, y_pred, average=average, zero_division=0)
        if f1 > best_f1:
            best_f1, best_t = f1, t
    return best_f1, best_t

@torch.no_grad()
def extract_features(encoder, dl, device):
    """Extract features from DistilBERT encoder"""
    encoder.eval()
    feats, labels = [], []
    for batch in dl:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        # Extract [CLS] token embeddings
        outputs = encoder(input_ids=input_ids, attention_mask=attention_mask)
        z = outputs.last_hidden_state[:, 0, :]  # [B, 768]
        
        feats.append(z.cpu().numpy().astype(np.float32))
        labels.append(batch['label'].numpy().astype(np.int64))
    
    return np.concatenate(feats), np.concatenate(labels)

In [3]:
class EmailDataset(Dataset):
    """PyTorch Dataset for email data with DistilBERT tokenization"""
    
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

def make_loader(dataset, batch_size=128, shuffle=False):
    """Create DataLoader from dataset"""
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, 
                     pin_memory=True, num_workers=2)

In [4]:
# Load manifest
MANIFEST_PATH = os.path.join(best_ckpt_dir, "manifest.json")

with open(MANIFEST_PATH, "r") as f:
    manifest = json.load(f)

print(f"Loaded manifest from: {MANIFEST_PATH}")
print(f"LoRA config: r={manifest['lora_r']}, alpha={manifest['lora_alpha']}")

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Load base DistilBERT model
base_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Load LoRA weights
lora_weights_path = manifest["lora_weights_path"]
encoder = PeftModel.from_pretrained(base_model, lora_weights_path).to(device)

# Freeze encoder
encoder.eval()
for p in encoder.parameters():
    p.requires_grad = False

print("✓ Encoder reloaded and frozen.")


Loaded manifest from: best_ckpts_distilbert/manifest.json
LoRA config: r=8, alpha=16


✓ Encoder reloaded and frozen.


In [None]:
train_df = pd.read_csv('../../../datasets/encoder_dataset/clean/distilbert/train_clean.csv')
val_df = pd.read_csv('../../../datasets/encoder_dataset/clean/distilbert/val_clean.csv')
test_df = pd.read_csv('../../../datasets/encoder_dataset/clean/distilbert/test_clean.csv')


text_col = 'text_combined'

# Verify no overlaps
print("Overlap checks:")
print(f"Train-Test: {len(set(train_df[text_col]) & set(test_df[text_col]))}")
print(f"Val-Test: {len(set(val_df[text_col]) & set(test_df[text_col]))}")
print(f"Train-Val: {len(set(train_df[text_col]) & set(val_df[text_col]))}")


Overlap checks:
Train-Test: 0
Val-Test: 0
Train-Val: 0


In [6]:
# Create datasets
train_ds = EmailDataset(train_df[text_col].values, train_df['label'].values, 
                        tokenizer, max_length=256)
val_ds = EmailDataset(val_df[text_col].values, val_df['label'].values, 
                      tokenizer, max_length=256)
test_ds = EmailDataset(test_df[text_col].values, test_df['label'].values, 
                       tokenizer, max_length=256)

# Create DataLoaders
train_dl = make_loader(train_ds, batch_size=128, shuffle=False)
val_dl = make_loader(val_ds, batch_size=128, shuffle=False)
test_dl = make_loader(test_ds, batch_size=128, shuffle=False)

print(f"DataLoaders created:")
print(f"  Train batches: {len(train_dl)}")
print(f"  Val batches: {len(val_dl)}")
print(f"  Test batches: {len(test_dl)}")


DataLoaders created:
  Train batches: 513
  Val batches: 63
  Test batches: 63


In [7]:
print(f"Train shape: {len(train_df)}")
print(f"Val shape: {len(val_df)}")
print(f"Test shape: {len(test_df)}")
print(f"Train labels: {np.bincount(train_df['label'].values)}")


Train shape: 65655
Val shape: 8060
Test shape: 8056
Train labels: [31385 34270]


In [8]:
print("\nExtracting features from DistilBERT encoder...")

X_tr, y_tr = extract_features(encoder, train_dl, device)
X_va, y_va = extract_features(encoder, val_dl, device)
X_te, y_te = extract_features(encoder, test_dl, device)

print(f"\nFeature extraction complete:")
print(f"  Train features: {X_tr.shape}")
print(f"  Val features: {X_va.shape}")
print(f"  Test features: {X_te.shape}")



Extracting features from DistilBERT encoder...

Feature extraction complete:
  Train features: (65655, 768)
  Val features: (8060, 768)
  Test features: (8056, 768)


In [9]:
# Tiny tuning grid (8 configs total)
param_grid = [
    {"learning_rate": 0.03, "max_depth": 4, "l2_regularization": 0.0, "min_samples_leaf": 20, "max_iter": 600},
    {"learning_rate": 0.03, "max_depth": 4, "l2_regularization": 1.0, "min_samples_leaf": 50, "max_iter": 600},
    {"learning_rate": 0.03, "max_depth": 6, "l2_regularization": 0.0, "min_samples_leaf": 20, "max_iter": 600},
    {"learning_rate": 0.03, "max_depth": 6, "l2_regularization": 1.0, "min_samples_leaf": 50, "max_iter": 600},

    {"learning_rate": 0.05, "max_depth": 4, "l2_regularization": 0.0, "min_samples_leaf": 20, "max_iter": 600},
    {"learning_rate": 0.05, "max_depth": 4, "l2_regularization": 1.0, "min_samples_leaf": 50, "max_iter": 600},
    {"learning_rate": 0.05, "max_depth": 6, "l2_regularization": 0.0, "min_samples_leaf": 20, "max_iter": 600},
    {"learning_rate": 0.05, "max_depth": 6, "l2_regularization": 1.0, "min_samples_leaf": 50, "max_iter": 600},
]

best_cfg, best_model, best_val_f1, best_thr = None, None, -1.0, 0.5
tuning_log = []

for cfg in tqdm(param_grid, desc="Grid Search (HGB)", ncols=100):
    hgb = HistGradientBoostingClassifier(
        **cfg,
        early_stopping=True,
        n_iter_no_change=20,
        validation_fraction=0.05,
        random_state=SEED
    )

    hgb.fit(X_tr, y_tr)
    proba_va = hgb.predict_proba(X_va)[:, 1]
    f1_va, thr = f1_with_best_threshold(y_va, proba_va)

    tuning_log.append({**cfg, "val_F1": f1_va, "thr": thr})

    if f1_va > best_val_f1:
        best_cfg, best_model, best_val_f1, best_thr = cfg, hgb, f1_va, thr

print(f"✓ Best (val): {best_cfg} | F1={best_val_f1:.4f} | thr={best_thr:.3f}")

# Save tuning log
tuning_df = pd.DataFrame(tuning_log).sort_values("val_F1", ascending=False)
tuning_df.to_csv(os.path.join(best_ckpt_dir, "hgb_tuning_log.csv"), index=False)
print(f"\n✓ Tuning log saved")

Grid Search (HGB): 100%|██████████████████████████████████████████████| 8/8 [03:18<00:00, 24.78s/it]

✓ Best (val): {'learning_rate': 0.03, 'max_depth': 4, 'l2_regularization': 0.0, 'min_samples_leaf': 20, 'max_iter': 600} | F1=0.9937 | thr=0.950

✓ Tuning log saved





## train best HGB configuration

In [10]:
# Train on TRAIN ONLY, using the chosen best config
hgb = HistGradientBoostingClassifier(
    **best_cfg,
    early_stopping=True, 
    n_iter_no_change=30, 
    validation_fraction=0.1,
    random_state=SEED
)
hgb.fit(X_tr, y_tr)

# Evaluate on validation
proba_va = hgb.predict_proba(X_va)[:, 1]
val_f1, val_thr = f1_with_best_threshold(y_va, proba_va)
y_hat_va = (proba_va >= val_thr).astype(int)
p, r, f1, _ = precision_recall_fscore_support(y_va, y_hat_va, average="binary", zero_division=0)

print(f"[VAL] F1={f1:.4f}  P={p:.4f}  R={r:.4f}  (thr={val_thr:.3f})")
print(classification_report(y_va, y_hat_va, digits=4))


[VAL] F1=0.9936  P=0.9934  R=0.9937  (thr=0.950)
              precision    recall  f1-score   support

           0     0.9929    0.9926    0.9928      3794
           1     0.9934    0.9937    0.9936      4266

    accuracy                         0.9932      8060
   macro avg     0.9932    0.9931    0.9932      8060
weighted avg     0.9932    0.9932    0.9932      8060



In [11]:
# Save model
joblib.dump(hgb, os.path.join(best_ckpt_dir, "hgb_model.pkl"))

# Save metadata
meta = {
    "config": best_cfg,
    "val_threshold": float(val_thr),
    "feat_dim": int(X_tr.shape[1]),
    "seed": SEED,
    "trained_on": "train_only",
    "metrics": {
        "val_precision": float(p), 
        "val_recall": float(r), 
        "val_F1": float(f1)
    },
}
with open(os.path.join(best_ckpt_dir, "hgb_meta.json"), "w") as f:
    json.dump(meta, f, indent=2)

print(f"Saved model → {os.path.join(best_ckpt_dir, 'hgb_model.pkl')}")
print(f"Saved meta  → {os.path.join(best_ckpt_dir, 'hgb_meta.json')}")


Saved model → best_ckpts_distilbert/hgb_model.pkl
Saved meta  → best_ckpts_distilbert/hgb_meta.json
