In [1]:
# Load the processed data by executing the preprocessing notebook inside this kernel.
# Later cells read X_train, X_test, y_train, y_test, train_df and test_df, none of which
# are defined in this notebook — presumably 03_preprocessing.ipynb creates them (TODO confirm).
from IPython.utils.capture import capture_output

# Run inside capture_output() so whatever the preprocessing notebook prints or
# displays does not clutter this notebook's output.
with capture_output():
    %run 03_preprocessing.ipynb

### Model Training

In [2]:
# ===== Cell 1 — Model setup (use existing X_train, X_test, y_train, y_test, train_df, test_df) =====
import os, math, time, numpy as np, pandas as pd
from scipy import sparse
from tqdm import tqdm

# Sanity: the feature matrices must be CSR and the labels array-like.
assert sparse.isspmatrix_csr(X_train) and sparse.isspmatrix_csr(X_test)
assert isinstance(y_train, (np.ndarray, pd.Series, list)) and isinstance(y_test, (np.ndarray, pd.Series, list))

# Text for Transformer (no new features; use memo_clean as-is)
train_text = train_df["memo_clean"].astype(str).tolist()
test_text  = test_df["memo_clean"].astype(str).tolist()

# Label mapping without LabelEncoder
y_train_arr = np.asarray(y_train)
y_test_arr  = np.asarray(y_test)

# Features, labels and texts are consumed row-by-row by different models and later
# combined in an ensemble, so a row-count mismatch would silently misalign them.
assert X_train.shape[0] == len(y_train_arr) == len(train_text), "train features/labels/text are misaligned"
assert X_test.shape[0] == len(y_test_arr) == len(test_text), "test features/labels/text are misaligned"

# np.unique returns the sorted distinct labels; a label's position is its integer id.
classes = np.unique(y_train_arr)
class_to_id = {c: i for i, c in enumerate(classes)}
id_to_class = {i: c for c, i in class_to_id.items()}

# A test label that never occurs in training has no id. Fail with a clear message
# instead of letting the lookup produce None / a wrong index further down.
_unseen_labels = np.setdiff1d(y_test_arr, classes)
if _unseen_labels.size:
    raise ValueError(f"y_test contains labels not present in y_train: {_unseen_labels.tolist()}")

# `classes` is sorted and every label is known to be present, so searchsorted
# returns exactly the index of each label (and also handles empty arrays).
y_train_ids = np.searchsorted(classes, y_train_arr)
y_test_ids  = np.searchsorted(classes, y_test_arr)

NUM_CLASSES = len(classes)

# Timing helpers
def time_block(fn, *args, **kwargs):
    """Invoke ``fn(*args, **kwargs)`` once and measure how long the call takes.

    Returns:
        A ``(result, elapsed_seconds)`` tuple, where ``result`` is whatever ``fn``
        returned and ``elapsed_seconds`` is the difference between two
        ``time.perf_counter()`` readings taken around the call.
    """
    started_at = time.perf_counter()
    result = fn(*args, **kwargs)
    elapsed_seconds = time.perf_counter() - started_at
    return result, elapsed_seconds

def per_1k(latency_seconds, n_items):
    """Scale a total latency to a batch of 1,000 items.

    The total is divided by ``n_items`` (treated as 1 when it is smaller than 1,
    which avoids dividing by zero) and multiplied by 1000. With the input in
    seconds the result is therefore in seconds per 1,000 items, which is
    numerically the same as milliseconds per single item.
    """
    item_count = max(n_items, 1)
    seconds_per_item = latency_seconds / item_count
    return seconds_per_item * 1000.0

# Chunked prediction with tqdm for sklearn models on sparse matrices
def predict_proba_in_chunks(model, X, chunk_size=65536):
    """Call ``model.predict_proba`` on consecutive row slices of ``X`` and stack the results.

    Args:
        model: Object exposing ``predict_proba``. Its ``classes_`` attribute, when
            present, is used only to size the result for empty input.
        X: Row-sliceable matrix with a ``shape`` attribute.
        chunk_size: Maximum number of rows passed to ``predict_proba`` per call.

    Returns:
        A 2-D array with one row per row of ``X``, in the original row order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    n = X.shape[0]
    if n == 0:
        # No chunks would be produced and np.vstack([]) raises, so return an
        # empty (0, n_classes) array directly.
        return np.empty((0, len(getattr(model, "classes_", ()))))

    chunk_starts = tqdm(
        range(0, n, chunk_size),
        total=math.ceil(n / chunk_size),
        desc="Predict (sklearn)",
        leave=False,
    )
    probs = [model.predict_proba(X[start:start + chunk_size]) for start in chunk_starts]
    return np.vstack(probs)


In [3]:
# ===== Cell 2 — Model creation & training (MLP on TF-IDF; DistilBERT on memo_clean) =====
# --- A) MLP (scikit-learn) ---
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(
    # Architecture: two hidden layers with ReLU activations.
    hidden_layer_sizes=(512, 256),
    activation="relu",
    # Optimisation: Adam on mini-batches with a small regularisation strength.
    solver="adam",
    learning_rate_init=1.5e-3,
    batch_size=2048,
    alpha=2e-5,
    # Stopping: at most 40 iterations, with early stopping on a 10% validation
    # split after 3 iterations without improvement.
    max_iter=40,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=3,
    # Reproducibility and logging.
    random_state=42,
    verbose=False,
)

# Fit on the sparse training matrix; only the elapsed wall-clock time is kept.
_, mlp_train_time = time_block(mlp.fit, X_train, y_train_arr)

# --- B) DistilBERT (Hugging Face Transformers) on memo_clean only ---
# Best-effort install when the libraries are missing from the kernel's environment.
try:
    import transformers, torch
except ImportError:
    import importlib, subprocess, sys
    # Use the kernel's own interpreter so pip installs into the environment this notebook runs in.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "transformers", "torch", "datasets", "accelerate", "tqdm"])
    # The packages were created by a child process after this interpreter started;
    # refresh the import finders' caches so the retry below can see them.
    importlib.invalidate_caches()
    import transformers, torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Checkpoint name shared by the tokenizer here and the classifier loaded later.
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class TxnDataset(Dataset):
    """Map-style dataset pairing raw texts with integer class ids.

    Each item is tokenized on access with the module-level ``tokenizer``,
    truncated and padded to ``max_length`` tokens, so every item has the same length.

    Args:
        texts: Indexable sequence of strings.
        labels: Indexable sequence of integer class ids, aligned with ``texts``.
        max_length: Token length every example is truncated/padded to (default 64).

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, max_length=64):
        # A mismatch would otherwise surface as an IndexError mid-training, or as
        # silently ignored trailing labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs as numpy arrays plus a ``"labels"`` entry (np.int64)."""
        enc = tokenizer(self.texts[idx], truncation=True, padding="max_length", max_length=self.max_length)
        item = {k: np.array(v) for k, v in enc.items()}
        item["labels"] = np.int64(self.labels[idx])
        return item

# Wrap the raw texts and integer label ids for the Trainer.
train_ds = TxnDataset(train_text, y_train_ids)
test_ds  = TxnDataset(test_text,  y_test_ids)

device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the pretrained checkpoint with a classification head of NUM_CLASSES outputs.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_CLASSES)
model.to(device)

# training args tuned for throughput; tqdm is automatic
# NOTE: no evaluation-strategy argument is passed. The keyword was renamed from
# `evaluation_strategy` to `eval_strategy` across transformers releases, and its
# default is already "no", so omitting it keeps the same behaviour on every version.
args = TrainingArguments(
    output_dir="./bert_out",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    learning_rate=3e-5,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_ratio=0.06,
    logging_steps=200,
    save_strategy="no",   # no checkpoints are written during training
    report_to="none",     # no external experiment trackers
)

trainer = Trainer(model=model, args=args, train_dataset=train_ds)

# Fine-tune; only the elapsed wall-clock time is kept from the call.
_, bert_train_time = time_block(trainer.train)

print(f"Train time — MLP:       {mlp_train_time:.2f}s")
print(f"Train time — DistilBERT:{bert_train_time:.2f}s on {device}")


2025-11-11 06:21:47.096090: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-11-11 06:21:47.096147: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-11-11 06:21:47.097312: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-11-11 06:21:47.104108: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of DistilBertForSequenceClassification w

Step,Training Loss
200,2.1384
400,1.545
600,1.0553
800,0.7618
1000,0.5873
1200,0.524
1400,0.4402
1600,0.3737
1800,0.321
2000,0.3047


Train time — MLP:       2416.30s
Train time — DistilBERT:9175.03s on cuda


### Model Evaluation

In [None]:
# ===== Cell 3 — Evaluation (no retraining; consistent with your variable names) =====
import math, numpy as np, torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, classification_report
from torch.utils.data import Subset

# Ground-truth labels as strings, computed once and reused by every metric below.
y_train_str = y_train_arr.astype(str)
y_test_str  = y_test_arr.astype(str)

# NOTE on units: per_1k() returns seconds per 1,000 items (latency_seconds / n * 1000),
# so the "Per-1k" figures below are labelled in seconds, not milliseconds.

# --- MLP: latency + metrics ---
# Keep both probability matrices so hard predictions can be derived from them
# instead of running two additional full inference passes through mlp.predict().
(mlp_proba_train, mlp_proba_train_time) = time_block(predict_proba_in_chunks, mlp, X_train, 65536)
(mlp_proba_test, mlp_proba_test_time) = time_block(predict_proba_in_chunks, mlp, X_test, 65536)

# The most probable column, mapped through mlp.classes_, is the label mlp.predict() returns.
mlp_pred_train = mlp.classes_[mlp_proba_train.argmax(axis=1)].astype(str)
mlp_pred_test  = mlp.classes_[mlp_proba_test.argmax(axis=1)].astype(str)

mlp_train_acc  = accuracy_score(y_train_str, mlp_pred_train)
mlp_train_f1   = f1_score(y_train_str, mlp_pred_train, average="macro")
mlp_test_acc   = accuracy_score(y_test_str, mlp_pred_test)
mlp_test_f1    = f1_score(y_test_str, mlp_pred_test, average="macro")

print(f"[MLP]   Train acc={mlp_train_acc:.4f}  Macro-F1={mlp_train_f1:.4f}  Proba latency={mlp_proba_train_time:.2f}s  Per-1k={per_1k(mlp_proba_train_time, X_train.shape[0]):.3f} s")
print(f"[MLP]   Test  acc={mlp_test_acc:.4f}   Macro-F1={mlp_test_f1:.4f}   Proba latency={mlp_proba_test_time:.2f}s   Per-1k={per_1k(mlp_proba_test_time, X_test.shape[0]):.3f} s")
print(classification_report(y_test_str, mlp_pred_test, digits=3))

# --- DistilBERT (your variables: trainer/model/tokenizer/train_ds/test_ds/device) ---
# Test-set inference (timed; Trainer shows tqdm)
(pred_out, bert_test_pred_time) = time_block(trainer.predict, test_ds)
bert_logits = pred_out.predictions
bert_pred_ids = bert_logits.argmax(axis=1)
# Integer ids index directly into `classes` (id i <-> classes[i], same mapping as id_to_class).
bert_pred_lbl = classes[bert_pred_ids].astype(str)

bert_test_acc = accuracy_score(y_test_str, bert_pred_lbl)
bert_test_f1  = f1_score(y_test_str, bert_pred_lbl, average="macro")
print(f"[DistilBERT] Test acc={bert_test_acc:.4f}  Macro-F1={bert_test_f1:.4f}  Pred latency={bert_test_pred_time:.2f}s  Per-1k={per_1k(bert_test_pred_time, len(test_text)):.3f} s")
print(classification_report(y_test_str, bert_pred_lbl, digits=3))

# Optional: quick train-sample eval without manual tensor plumbing (uses HF Trainer)
# A fixed-seed random sample (at most 100k rows) keeps this check affordable and repeatable.
rng = np.random.default_rng(42)
sample_n = min(100_000, len(train_ds))
sample_idx = rng.choice(len(train_ds), size=sample_n, replace=False)
train_subset = Subset(train_ds, sample_idx)

(train_pred_out, bert_train_pred_time) = time_block(trainer.predict, train_subset)
train_logits = train_pred_out.predictions
train_pred_ids = train_logits.argmax(axis=1)

# map ids -> strings for fair comparison
train_true_ids = y_train_ids[sample_idx]
train_true_lbl = classes[train_true_ids].astype(str)
train_pred_lbl = classes[train_pred_ids].astype(str)

bert_train_acc = accuracy_score(train_true_lbl, train_pred_lbl)
bert_train_f1  = f1_score(train_true_lbl, train_pred_lbl, average="macro")
print(f"[DistilBERT] Train-sample acc={bert_train_acc:.4f}  Macro-F1={bert_train_f1:.4f}  Pred latency(sample)={bert_train_pred_time:.2f}s  Per-1k={per_1k(bert_train_pred_time, sample_n):.3f} s")

# --- Probability-level ensemble (align class orders) ---
# The two models' class probabilities are averaged: the BERT logits go through a
# softmax first, so this is not an average of raw logits.
# Map MLP probs (mlp.classes_) to the 'classes' order used for BERT ids
mlp_class_index = {label: i for i, label in enumerate(mlp.classes_)}
mlp_column_order = [mlp_class_index[label] for label in classes]
mlp_reindexed = mlp_proba_test[:, mlp_column_order].astype(np.float32)

bert_proba_test = torch.softmax(torch.tensor(bert_logits), dim=-1).numpy()
ens_proba_test = 0.5 * mlp_reindexed + 0.5 * bert_proba_test
ens_pred_ids   = ens_proba_test.argmax(axis=1)
ens_pred_lbl   = classes[ens_pred_ids].astype(str)

ens_test_acc = accuracy_score(y_test_str, ens_pred_lbl)
ens_test_f1  = f1_score(y_test_str, ens_pred_lbl, average="macro")
# Approximate ensemble latency: the sum of both models' test-set inference times.
ens_pred_time = mlp_proba_test_time + bert_test_pred_time

print(f"[Ensemble] Test acc={ens_test_acc:.4f}  Macro-F1={ens_test_f1:.4f}  Pred latency≈{ens_pred_time:.2f}s  Per-1k≈{per_1k(ens_pred_time, len(test_text)):.3f} s")
print(classification_report(y_test_str, ens_pred_lbl, digits=3))

                                                                  

[MLP]   Train acc=0.9753  Macro-F1=0.9674  Proba latency=18.15s  Per-1k=0.019 ms
[MLP]   Test  acc=0.9272   Macro-F1=0.8811   Proba latency=6.19s   Per-1k=0.018 ms
                     precision    recall  f1-score   support

          EDUCATION      0.740     0.448     0.558      1170
 FOOD_AND_BEVERAGES      0.898     0.943     0.920    124002
GENERAL_MERCHANDISE      0.941     0.920     0.930    132571
          GROCERIES      0.957     0.924     0.940     56577
           MORTGAGE      0.986     0.866     0.922       409
          OVERDRAFT      0.987     0.967     0.977       953
               PETS      0.977     0.909     0.942      2667
               RENT      0.802     0.803     0.802       629
             TRAVEL      0.956     0.923     0.939     17808

           accuracy                          0.927    336786
          macro avg      0.916     0.856     0.881    336786
       weighted avg      0.928     0.927     0.927    336786

