In [2]:
# load processed df
from IPython.utils.capture import capture_output

# Execute the preprocessing notebook inside this kernel while capturing (and not
# displaying) whatever it prints. Later cells use names that are never defined in
# this notebook (e.g. y_train, y_test, X_train_tfidf, X_train_num, train_df,
# test_df) — presumably they are created by 03_preprocessing.ipynb; verify
# against that notebook if it changes.
with capture_output():
    %run 03_preprocessing.ipynb

## Data Preparation

- Encode labels for model training.
- Reduce TF-IDF dimensionality using random projection.
- Convert the numeric and projected TF-IDF features to dense float32 arrays.
- Tokenize memo text for the tiny transformer.
- Build PyTorch datasets and dataloaders combining text, TF-IDF, and numeric features.

In [4]:
# =========================
# Cell 1 — Setup (fixed projection conversion)
# =========================
import time, numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.random_projection import SparseRandomProjection
from scipy.sparse import issparse

# Labels: fit the encoder on the training labels only, then reuse it for test.
# NOTE: LabelEncoder.transform raises ValueError for any test label that was not
# present in y_train.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc  = le.transform(y_test)
num_labels  = len(le.classes_)

# Project TF-IDF to low-dim dense (keeps signal, speeds training).
# The projection is fitted on the training matrix only and then applied to the
# test matrix; random_state pins the random projection for reproducibility.
TFIDF_PROJ_DIM = 256
srp = SparseRandomProjection(n_components=TFIDF_PROJ_DIM, random_state=42)

# Time fit + transform of both splits together; the elapsed seconds are reported
# later under "tfidf_projection_time_sec".
t0 = time.perf_counter()
X_train_tfidf_proj = srp.fit_transform(X_train_tfidf)  # may be sparse
X_test_tfidf_proj  = srp.transform(X_test_tfidf)
tfidf_proj_time = time.perf_counter() - t0

def to_dense32(x):
    """Return ``x`` as a dense ``float32`` NumPy array.

    SciPy sparse inputs are densified with ``toarray()`` first; anything else is
    handed straight to ``np.asarray``.
    """
    dense = x.toarray() if issparse(x) else x
    return np.asarray(dense, dtype=np.float32)

# Convert everything to dense float32.
# The projected TF-IDF names are rebound to their dense versions, while the
# numeric features are stored under new *_d names.
X_train_tfidf_proj = to_dense32(X_train_tfidf_proj)
X_test_tfidf_proj  = to_dense32(X_test_tfidf_proj)
X_train_num_d      = to_dense32(X_train_num)
X_test_num_d       = to_dense32(X_test_num)

from transformers import AutoTokenizer
# Hugging Face model id; the same id is used for the tokenizer here and for the
# backbone loaded with AutoModel.from_pretrained in the model cell.
MODEL_NAME = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

import torch
from torch.utils.data import Dataset, DataLoader

MAX_LEN    = 96  # token length each memo is padded/truncated to in TxnDataset.__getitem__
BATCH_SIZE = 64  # batch size passed to both the train and test DataLoader

class TxnDataset(Dataset):
    """Map-style dataset yielding one tokenized memo plus its dense feature rows.

    Each item is a ``(batch, y)`` pair. ``batch`` is a dict holding every tensor
    returned by the tokenizer (leading batch axis squeezed away, padded/truncated
    to ``max_len`` tokens) plus ``"num"`` (float32 numeric features) and
    ``"tfp"`` (float32 projected TF-IDF). ``y`` is a 0-dim int64 label tensor.

    Args:
        texts: Iterable of memo strings, one per sample.
        y: Integer-encoded labels, one per sample.
        num_feats: 2-D array-like of numeric features, one row per sample.
        tfidf_proj: 2-D array-like of projected TF-IDF features, one row per sample.
        tokenizer_fn: Optional tokenizer callable. When omitted, the module-level
            ``tokenizer`` is looked up each time an item is fetched.
        max_len: Optional token length. When omitted, the module-level
            ``MAX_LEN`` is looked up each time an item is fetched.

    Raises:
        ValueError: If the four inputs do not all have the same number of rows.
    """

    def __init__(self, texts, y, num_feats, tfidf_proj, tokenizer_fn=None, max_len=None):
        self.texts = list(texts)
        self.y     = np.asarray(y, dtype=np.int64)
        self.num   = np.asarray(num_feats, dtype=np.float32)
        self.tfp   = np.asarray(tfidf_proj, dtype=np.float32)
        self._tokenizer_fn = tokenizer_fn
        self._max_len = max_len

        # __len__ is driven by the labels alone, so a shorter/longer feature
        # source would otherwise go unnoticed until an IndexError mid-epoch
        # (or never, if it is longer). Fail fast instead.
        sizes = {
            "texts": len(self.texts),
            "y": len(self.y),
            "num_feats": len(self.num),
            "tfidf_proj": len(self.tfp),
        }
        if len(set(sizes.values())) != 1:
            raise ValueError(f"TxnDataset inputs have mismatched lengths: {sizes}")

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        # Fall back to the notebook-level tokenizer / MAX_LEN when no override
        # was supplied, resolving them at access time.
        tok = self._tokenizer_fn if self._tokenizer_fn is not None else tokenizer
        max_len = self._max_len if self._max_len is not None else MAX_LEN
        enc = tok(
            self.texts[i],
            padding="max_length",
            truncation=True,
            max_length=max_len,
            return_tensors="pt"
        )
        # The tokenizer returns tensors with a leading batch axis of size 1;
        # drop it so the DataLoader's default collation can stack items.
        batch = {k: v.squeeze(0) for k, v in enc.items()}
        batch["num"] = torch.from_numpy(self.num[i])
        batch["tfp"] = torch.from_numpy(self.tfp[i])
        y = torch.tensor(self.y[i], dtype=torch.long)
        return batch, y

# Wrap each split's memo text, encoded labels, numeric features and projected
# TF-IDF features in a dataset.
train_ds = TxnDataset(train_df["memo_clean"], y_train_enc, X_train_num_d, X_train_tfidf_proj)
test_ds  = TxnDataset(test_df["memo_clean"],  y_test_enc,  X_test_num_d,  X_test_tfidf_proj)

# Choose the device before building the loaders so they can be configured for it.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pinned host memory only speeds up host-to-GPU copies; requesting it on a
# CPU-only run just makes PyTorch emit a warning, so enable it for CUDA only.
use_pinned = DEVICE.type == "cuda"
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  num_workers=2, pin_memory=use_pinned)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=use_pinned)

# One-line summary of the prepared data for the notebook output.
print({
    "device": str(DEVICE),
    "n_train": len(train_ds),
    "n_test": len(test_ds),
    "num_labels": num_labels,
    "num_dim": X_train_num_d.shape[1],
    "tfidf_proj_dim": TFIDF_PROJ_DIM,
    "tfidf_projection_time_sec": round(tfidf_proj_time, 2)
})

config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

{'device': 'cuda', 'n_train': 969666, 'n_test': 336786, 'num_labels': 9, 'num_dim': 22, 'tfidf_proj_dim': 256, 'tfidf_projection_time_sec': 0.16}


## Model Definition

- Use frozen tiny BERT (bert-tiny) for semantic text encoding.
- Process numeric engineered features through a small MLP.
- Transform projected TF-IDF vectors with another MLP.
- Concatenate all feature streams into a fused representation.
- Apply a lightweight classifier head for category prediction.

In [5]:
# =========================
# Cell 2 — Model creation (tiny Transformer + numeric + TF-IDF projection)
# =========================
import torch.nn as nn
from transformers import AutoModel, get_linear_schedule_with_warmup

# Input widths of the model's two dense feature branches, read from the column
# counts of the training arrays (numeric features / projected TF-IDF).
NUM_DIM = X_train_num_d.shape[1]
TFP_DIM = X_train_tfidf_proj.shape[1]

class TinyFusion(nn.Module):
    """Classifier fusing a pretrained text encoder with two dense feature branches.

    The first-token hidden state of the ``MODEL_NAME`` backbone is concatenated
    with an embedding of the numeric features (width ``NUM_DIM`` -> 64) and an
    embedding of the projected TF-IDF features (width ``TFP_DIM`` -> 128), then
    passed through a two-layer head that outputs ``num_labels`` logits.

    Args:
        num_labels: Number of output classes.
        freeze_backbone: When True (default), backbone parameters get
            ``requires_grad=False`` and the backbone is kept in eval mode even
            while the rest of the model trains. Pass False to fine-tune it.
    """

    def __init__(self, num_labels, freeze_backbone=True):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(MODEL_NAME)
        hid = self.backbone.config.hidden_size  # 128 for bert-tiny
        self.freeze_backbone = freeze_backbone

        # Freeze the backbone for speed: only the MLP branches and head train.
        if freeze_backbone:
            for p in self.backbone.parameters():
                p.requires_grad = False
            self.backbone.eval()

        self.num_mlp = nn.Sequential(
            nn.LayerNorm(NUM_DIM),
            nn.Linear(NUM_DIM, 64), nn.GELU(),
        )
        self.tfp_mlp = nn.Sequential(
            nn.LayerNorm(TFP_DIM),
            nn.Linear(TFP_DIM, 128), nn.GELU(),
        )
        fused_in = hid + 64 + 128
        self.head = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(fused_in, 256), nn.GELU(), nn.Dropout(0.2),
            nn.Linear(256, num_labels)
        )

    def train(self, mode=True):
        """Set train/eval mode, keeping a frozen backbone in eval mode.

        ``nn.Module.train`` switches every submodule, which would re-enable the
        backbone's dropout layers although its weights never update. That makes
        the frozen text features stochastic during training and different from
        the ones seen at evaluation time, so the backbone is forced back to eval.
        """
        super().train(mode)
        if self.freeze_backbone:
            self.backbone.eval()
        return self

    def forward(self, batch):
        """Return class logits for a batch dict.

        ``batch`` must provide "input_ids", "attention_mask", "num" and "tfp";
        any other keys are ignored.
        """
        out = self.backbone(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]).last_hidden_state
        cls = out[:, 0]  # first token representation
        num = self.num_mlp(batch["num"])
        tfp = self.tfp_mlp(batch["tfp"])
        fused = torch.cat([cls, num, tfp], dim=1)
        return self.head(fused)

# Instantiate the model and move it to the selected device.
model = TinyFusion(num_labels=num_labels).to(DEVICE)

# Loss (optional class weights for imbalance)
# Weights are inverse class frequencies from the training labels, rescaled so
# they sum to num_labels.
# NOTE(review): the 1e-9 term only avoids division by zero; a class with zero
# training samples would get a huge weight that dominates the normalisation.
# The label encoder is fitted on y_train, so every class should have at least
# one sample here — confirm if the label source changes.
import numpy as np, torch
counts = np.bincount(y_train_enc, minlength=num_labels)
weights = (1.0 / (counts + 1e-9))
weights = weights / weights.sum() * num_labels
criterion = nn.CrossEntropyLoss(weight=torch.tensor(weights, dtype=torch.float32, device=DEVICE))

# Optimizer/scheduler (only head and small MLPs train → fast)
# Only parameters with requires_grad=True are handed to AdamW. The scheduler is
# sized for exactly EPOCHS passes over train_loader with the first 10% of steps
# as warmup. It is stateful: re-running the training cell without re-running
# this cell continues from wherever the schedule was left.
optimizer = torch.optim.AdamW([p for p in model.parameters() if p.requires_grad], lr=2e-3, weight_decay=1e-2)
EPOCHS = 3
total_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(optimizer, int(0.1*total_steps), total_steps)

# NOTE: "backbone_frozen" is a hard-coded literal, not read from the model.
print("Model ready.", {"backbone_frozen": True, "epochs": EPOCHS})

2025-11-13 00:26:52.009893: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-11-13 00:26:52.009945: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-11-13 00:26:52.011247: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-11-13 00:26:52.017769: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Model ready. {'backbone_frozen': True, 'epochs': 3}


model.safetensors:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

## Model Training and Evaluation

- Train only the small fusion and MLP layers for speed.
- Track loss, accuracy, and F1-macro during training.
- Measure training time and inference latency.
- Evaluate the model on both train and test sets.
- Output metrics and classification performance summary.

In [8]:
# =========================
# Cell 3 — Train, evaluate, and report latency
# =========================
from sklearn.metrics import accuracy_score, f1_score, classification_report
from torch.cuda.amp import GradScaler, autocast
import numpy as np, time, torch

def epoch_pass(loader, train=False):
    """Run one pass over ``loader`` with the module-level model; return loss and metrics.

    Relies on the module-level ``model``, ``criterion``, ``optimizer``,
    ``scheduler`` and ``DEVICE``.

    Args:
        loader: Iterable of ``(batch, y)`` pairs where ``batch`` is a dict that
            contains at least "input_ids", "attention_mask", "num" and "tfp".
        train: When True the model is put in train mode and one optimizer step
            (plus one scheduler step) is taken per batch. When False the model
            is put in eval mode and autograd is disabled for the forward pass.

    Returns:
        ``(mean_loss, accuracy, f1_macro, f1_weighted, y_true, y_pred)``.
        ``mean_loss`` is the batch-size-weighted average of the per-batch
        losses; ``y_true`` / ``y_pred`` are NumPy arrays in iteration order.
        In train mode the metrics come from the predictions made during
        training (dropout active, weights changing between batches).
    """
    model.train() if train else model.eval()
    total_loss, n = 0.0, 0
    pred_chunks, true_chunks = [], []
    use_cuda = DEVICE.type == "cuda"
    scaler = GradScaler(enabled=(use_cuda and train))
    for batch, y in loader:
        for k in ["input_ids","attention_mask","num","tfp"]:
            batch[k] = batch[k].to(DEVICE, non_blocking=True)
        y = y.to(DEVICE, non_blocking=True)
        # Disable autograd for evaluation even when the caller did not wrap the
        # call in no_grad/inference_mode, so no graph is built for the head.
        with torch.set_grad_enabled(train), autocast(enabled=use_cuda):
            logits = model(batch)
            loss = criterion(logits, y)
        if train:
            optimizer.zero_grad(set_to_none=True)
            if use_cuda:
                scale_before = scaler.get_scale()
                scaler.scale(loss).backward()
                scaler.step(optimizer); scaler.update()
                # GradScaler skips optimizer.step() when it finds inf/NaN
                # gradients and then lowers its scale; in that case the LR
                # schedule must not advance either.
                stepped = scaler.get_scale() >= scale_before
            else:
                loss.backward(); optimizer.step()
                stepped = True
            if stepped:
                scheduler.step()
        total_loss += loss.item() * y.size(0); n += y.size(0)
        # Keep whole per-batch arrays and concatenate once at the end instead of
        # extending Python lists element by element.
        pred_chunks.append(torch.argmax(logits,1).detach().cpu().numpy())
        true_chunks.append(y.detach().cpu().numpy())
    trues = np.concatenate(true_chunks) if true_chunks else np.array([])
    preds = np.concatenate(pred_chunks) if pred_chunks else np.array([])
    acc = accuracy_score(trues, preds)
    f1m = f1_score(trues, preds, average="macro")
    f1w = f1_score(trues, preds, average="weighted")
    return total_loss/max(n,1), acc, f1m, f1w, trues, preds

# The LR scheduler is created in the model cell and sized for exactly EPOCHS
# passes, decaying linearly to 0 at the end. Re-running this cell without
# re-creating the optimizer/scheduler therefore continues an already-consumed
# schedule and the model barely (or never) updates. Warn loudly instead of
# silently reporting flat metrics.
import warnings

if getattr(scheduler, "last_epoch", 0) > 0:
    warnings.warn(
        "scheduler has already been stepped "
        f"({scheduler.last_epoch} steps); re-run the model/optimizer cell "
        "before training again to start from a fresh learning-rate schedule.",
        RuntimeWarning,
    )

# Train for EPOCHS passes, printing the running train metrics after each one and
# the total wall-clock training time at the end.
t_train = time.perf_counter()
for ep in range(1, EPOCHS+1):
    tr_loss, tr_acc, tr_f1m, tr_f1w, _, _ = epoch_pass(train_loader, train=True)
    print(f"epoch {ep:02d} | train_loss {tr_loss:.4f} acc {tr_acc:.4f} f1_macro {tr_f1m:.4f}")
train_time = time.perf_counter() - t_train
print(f"training_time_sec: {train_time:.2f}")

@torch.inference_mode()
def eval_latency(loader, name):
    """Evaluate the model on ``loader``, print metrics and a per-class report.

    The reported "latency" is the wall-clock time of the whole evaluation pass
    divided by the number of samples. It therefore includes data loading and
    tokenization, host-to-device transfer, the forward pass and the metric
    computation inside ``epoch_pass`` — it is an amortised per-sample cost, not
    the latency of a single request.

    Args:
        loader: DataLoader to evaluate.
        name: Label used as a prefix in the printed summary line.

    Returns:
        ``(accuracy, f1_macro, f1_weighted, latency_ms_per_sample)``.
    """
    t0 = time.perf_counter()
    _, acc, f1m, f1w, y_true, y_pred = epoch_pass(loader, train=False)
    elapsed = time.perf_counter() - t0
    n = len(y_true); lat_ms = (elapsed/max(n,1))*1000.0; tps = n/max(elapsed,1e-9)
    print(f"[{name}] n={n} acc={acc:.4f} f1_macro={f1m:.4f} f1_weighted={f1w:.4f} latency={lat_ms:.2f} ms/sample throughput={tps:.1f} samples/s")
    # Pass the full list of encoded class ids explicitly: without `labels`,
    # classification_report infers the classes from y_true/y_pred and raises if
    # their count differs from len(target_names), e.g. when a split is missing a
    # rare class.
    class_ids = list(range(len(le.classes_)))
    print("\n"+classification_report(y_true, y_pred, labels=class_ids, target_names=le.classes_, digits=4))
    return acc, f1m, f1w, lat_ms

# Evaluate on both splits. train_loader was built with shuffle=True, so the train
# pass visits samples in random order; labels and predictions are collected from
# the same batches, so the metrics are unaffected by the ordering.
train_acc, train_f1m, train_f1w, train_lat = eval_latency(train_loader, "train")
test_acc,  test_f1m,  test_f1w,  test_lat  = eval_latency(test_loader,  "test")

# Final summary of timings and headline metrics for the notebook output.
print({
    "tfidf_projection_time_sec": round(tfidf_proj_time, 2),
    "training_time_sec": round(train_time, 2),
    "train_acc": round(train_acc,4),
    "test_acc":  round(test_acc,4),
    "test_f1_macro": round(test_f1m,4),
    "test_latency_ms_per_sample": round(test_lat,2)
})

epoch 01 | train_loss 0.2016 acc 0.8999 f1_macro 0.7746
epoch 02 | train_loss 0.1994 acc 0.8999 f1_macro 0.7747
epoch 03 | train_loss 0.2008 acc 0.9001 f1_macro 0.7743
training_time_sec: 383.67
[train] n=969666 acc=0.9239 f1_macro=0.8374 f1_weighted=0.9266 latency=0.12 ms/sample throughput=8371.9 samples/s

                     precision    recall  f1-score   support

          EDUCATION     0.2644    0.9516    0.4138      3329
 FOOD_AND_BEVERAGES     0.9109    0.9327    0.9217    357992
GENERAL_MERCHANDISE     0.9689    0.8929    0.9293    391492
          GROCERIES     0.9342    0.9654    0.9495    162754
           MORTGAGE     0.8699    0.9986    0.9298       710
          OVERDRAFT     0.9098    0.9992    0.9524      2433
               PETS     0.6708    0.9798    0.7964      6599
               RENT     0.5642    0.9829    0.7169      2518
             TRAVEL     0.8972    0.9572    0.9262     41839

           accuracy                         0.9239    969666
          macro av