In [None]:
# Load the preprocessed data by running the preprocessing notebook.
# Everything that notebook prints or displays is captured (and discarded, since
# the capture object is not bound to a name) so it does not clutter this one.
# NOTE(review): the cells below use X_train, X_test, y_train and y_test without
# defining them — presumably they are created by 03_preprocessing.ipynb; confirm.
from IPython.utils.capture import capture_output

with capture_output():
    %run 03_preprocessing.ipynb

### Model Training

In [None]:
# === PyTorch MLP for sparse features (densified per batch) ===
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from collections import Counter

# --- Reproducibility & device ---
# Seed every RNG this notebook draws from: NumPy (batch shuffling) and
# PyTorch on CPU and on all CUDA devices (weight init, dropout).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# --- Label encoding (y_train/y_test are strings) ---
# The encoder is fitted on the training labels only and then reused for test.
le = LabelEncoder()
y_train_int = le.fit_transform(y_train)
y_test_int = le.transform(y_test)
n_classes = le.classes_.shape[0]

# --- Train/Val split (from training set) ---
# Split row indices (not the matrix itself) so features and labels are sliced
# with exactly the same index arrays; 10% is held out, stratified by class.
train_idx, val_idx = train_test_split(
    np.arange(X_train.shape[0]),
    stratify=y_train_int,
    random_state=SEED,
    test_size=0.10,
)
X_tr = X_train[train_idx]
X_val = X_train[val_idx]
y_tr = y_train_int[train_idx]
y_val = y_train_int[val_idx]

# --- Class weights (to handle imbalance) ---
# Inverse-frequency weights (count floored at 1 to avoid division by zero),
# rescaled so that their mean is 1.
counts = np.bincount(y_tr, minlength=n_classes)
class_weights = counts.sum() / counts.clip(min=1)
class_weights = class_weights / class_weights.mean()
class_weights_t = torch.tensor(class_weights, device=device, dtype=torch.float32)

# --- Batch generator that densifies CSR per batch (memory-safe) ---
def csr_batch_generator(X_csr, y_arr, batch_size=1024, shuffle=True, device=device):
    """Yield ``(features, labels)`` mini-batches forever, densifying one batch at a time.

    Only the rows of the current batch are converted to a dense array, so the
    full matrix is never held in dense form.

    Args:
        X_csr: Row-indexable sparse matrix whose row selections support
            ``.toarray()``.
        y_arr: NumPy label array indexed by the same row positions as ``X_csr``.
        batch_size: Maximum number of rows per batch; the last batch of a pass
            may be smaller.
        shuffle: If True, reshuffle the row order with the global NumPy RNG at
            the start of every pass over the data.
        device: Torch device the yielded tensors are moved to.

    Yields:
        ``(Xb, yb)``: a float32 feature tensor and an int64 label tensor, both
        on ``device``.

    Raises:
        ValueError: On the first ``next()``, if ``batch_size`` is not positive
            or ``X_csr`` has no rows. Without this check the generator would
            loop forever without ever yielding.

    Note:
        The generator never terminates on its own; the consumer decides how
        many batches make up an epoch.
    """
    n = X_csr.shape[0]
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if n == 0:
        raise ValueError("csr_batch_generator received a matrix with no rows")

    idx = np.arange(n)
    while True:
        if shuffle:
            # In-place shuffle: each pass visits every row once in a new order.
            np.random.shuffle(idx)
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            b = idx[start:end]
            # Densify just this batch's rows.
            Xb = X_csr[b].toarray().astype(np.float32)
            yb = y_arr[b]
            yield torch.from_numpy(Xb).to(device), torch.from_numpy(yb).long().to(device)

# --- Simple, strong MLP ---
# Width of the network input: the number of feature columns in X_train.
input_dim = X_train.shape[1]

class MLP(nn.Module):
    """Feed-forward classifier: input batch-norm, two ReLU hidden layers, linear head.

    Args:
        in_dim: Number of input features.
        out_dim: Number of output logits.
        hidden: Sequence whose first two entries are the hidden-layer widths.
        dropout: Dropout probability applied after each hidden activation.
    """

    def __init__(self, in_dim, out_dim, hidden=(512, 256), dropout=0.3):
        super().__init__()
        width_1, width_2 = hidden[0], hidden[1]
        # Normalise the raw (dense) inputs before the first projection.
        self.bn0 = nn.BatchNorm1d(in_dim)
        self.fc1 = nn.Linear(in_dim, width_1)
        self.fc2 = nn.Linear(width_1, width_2)
        self.out = nn.Linear(width_2, out_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return unnormalised class logits for a dense input batch ``x``."""
        h = self.bn0(x)
        # Each hidden stage is: linear -> ReLU -> dropout.
        for layer in (self.fc1, self.fc2):
            h = self.dropout(F.relu(layer(h)))
        return self.out(h)

# Build the network, then move its parameters to the selected device.
model = MLP(in_dim=input_dim, out_dim=n_classes, hidden=(512, 256), dropout=0.3)
model = model.to(device)

# --- Optimizer & loss ---
lr = 1e-3
# AdamW with a small weight decay; the loss is class-weighted to counter imbalance.
optimizer = torch.optim.AdamW(model.parameters(), weight_decay=1e-4, lr=lr)
criterion = nn.CrossEntropyLoss(weight=class_weights_t)

# --- Training/validation loaders (generators) ---
BATCH_SIZE = 1024
# Training batches are reshuffled on every pass; validation keeps a fixed order.
train_gen = csr_batch_generator(
    X_tr, y_tr, device=device, shuffle=True, batch_size=BATCH_SIZE
)
val_gen = csr_batch_generator(
    X_val, y_val, device=device, shuffle=False, batch_size=BATCH_SIZE
)

# Batches per full pass over each split (ceiling division, so a short final
# batch still counts as one step).
steps_per_epoch = (X_tr.shape[0] + BATCH_SIZE - 1) // BATCH_SIZE
val_steps = (X_val.shape[0] + BATCH_SIZE - 1) // BATCH_SIZE

In [None]:
from copy import deepcopy

# Upper bound on the number of training epochs.
EPOCHS = 12
# Best validation accuracy seen so far.
best_val_acc = 0.0
# Copy of the model's state_dict at the best epoch (None until one is recorded).
best_state = None
# Number of consecutive non-improving epochs tolerated before stopping early.
patience = 3
# Epochs elapsed since the validation accuracy last improved.
since_improve = 0

def run_epoch(gen, steps, train=True):
    """Run ``steps`` batches from ``gen`` through the model and return mean metrics.

    Operates on the module-level ``model``, ``criterion`` and ``optimizer``.

    Args:
        gen: Iterator yielding ``(xb, yb)`` tensor batches.
        steps: Number of batches to draw from ``gen``.
        train: If True, switch the model to training mode and take one
            optimizer step per batch; if False, switch it to eval mode and
            only measure.

    Returns:
        ``(mean_loss, accuracy)`` averaged over every sample seen; both are
        0.0 when no samples were processed.
    """
    model.train(mode=train)
    total_loss, total_correct, total_n = 0.0, 0, 0
    # Enable autograd only while training, so an evaluation pass never builds
    # a graph even if the caller does not wrap the call in torch.no_grad().
    with torch.set_grad_enabled(train):
        for _ in range(steps):
            xb, yb = next(gen)
            logits = model(xb)
            loss = criterion(logits, yb)
            if train:
                optimizer.zero_grad(set_to_none=True)
                loss.backward()
                optimizer.step()
            batch_n = yb.size(0)
            # Metrics are bookkeeping only; keep them out of the graph.
            preds = logits.detach().argmax(dim=1)
            total_correct += (preds == yb).sum().item()
            total_n += batch_n
            # The criterion returns a batch-level average; scale by batch size
            # so the final division yields a per-sample mean across batches.
            total_loss += loss.item() * batch_n
    return total_loss / max(total_n, 1), total_correct / max(total_n, 1)

for epoch in range(1, EPOCHS + 1):
    # One optimisation pass over the training split, then one measurement
    # pass over the validation split with gradients disabled.
    tr_loss, tr_acc = run_epoch(train_gen, steps_per_epoch, train=True)
    with torch.no_grad():
        val_loss, val_acc = run_epoch(val_gen, val_steps, train=False)

    epoch_summary = (
        f"Epoch {epoch:02d} | "
        f"train_loss={tr_loss:.4f} train_acc={tr_acc:.4f} | "
        f"val_loss={val_loss:.4f} val_acc={val_acc:.4f}"
    )
    print(epoch_summary)

    # Early stopping on best val_acc: only a gain larger than 1e-4 counts.
    improved = val_acc > best_val_acc + 1e-4
    if improved:
        best_val_acc, since_improve = val_acc, 0
        # Snapshot the weights so they can be restored after training.
        best_state = deepcopy(model.state_dict())
        continue

    since_improve += 1
    if since_improve >= patience:
        print(f"Early stopping at epoch {epoch}. Best val_acc={best_val_acc:.4f}")
        break

# Restore the best snapshot (skipped if no epoch ever registered an improvement).
if best_state is not None:
    model.load_state_dict(best_state)

### Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

def predict_in_batches(X_csr, batch_size=1024):
    """Predict a class index for every row of a sparse matrix, one dense batch at a time.

    Uses the module-level ``model`` (left in eval mode afterwards) and ``device``.

    Args:
        X_csr: Row-sliceable sparse matrix whose slices support ``.toarray()``.
        batch_size: Maximum number of rows densified and scored at once.

    Returns:
        1-D NumPy array of predicted class indices, one per row of ``X_csr``,
        in row order. Empty (dtype int64) when ``X_csr`` has no rows.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    model.eval()
    n_rows = X_csr.shape[0]
    if n_rows == 0:
        # np.concatenate rejects an empty list; return an empty result instead.
        return np.empty(0, dtype=np.int64)

    preds = []
    with torch.no_grad():
        for start in range(0, n_rows, batch_size):
            end = min(start + batch_size, n_rows)
            # Densify only this slice of rows.
            Xb = torch.from_numpy(X_csr[start:end].toarray().astype(np.float32)).to(device)
            logits = model(Xb)
            preds.append(logits.argmax(dim=1).cpu().numpy())
    return np.concatenate(preds, axis=0)

# Train/Val/Test predictions
y_tr_pred  = predict_in_batches(X_tr)
y_val_pred = predict_in_batches(X_val)
y_te_pred  = predict_in_batches(X_test)

# Accuracies
train_acc = accuracy_score(y_tr,  y_tr_pred)
val_acc   = accuracy_score(y_val, y_val_pred)
test_acc  = accuracy_score(y_test_int, y_te_pred)

print(f"Train Acc: {train_acc:.4f}")
print(f"Val   Acc: {val_acc:.4f}")
print(f"Test  Acc: {test_acc:.4f}")

# Pin the label set to every encoded class. Left to itself, scikit-learn infers
# the labels from the values present in y_true/y_pred, so a class that is absent
# from the test split and never predicted would make the number of report rows
# (and confusion-matrix rows/columns) disagree with len(le.classes_).
all_labels = np.arange(len(le.classes_))

# Detailed metrics (macro-F1 is important for imbalance)
print("\n=== Classification Report (TEST) ===")
print(classification_report(
    y_test_int, y_te_pred,
    labels=all_labels, target_names=le.classes_, digits=4,
))

# Confusion matrix (TEST): rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test_int, y_te_pred, labels=all_labels)
cm_df = pd.DataFrame(cm, index=[f"true_{c}" for c in le.classes_],
                        columns=[f"pred_{c}" for c in le.classes_])
display(cm_df.head(12))  # show top rows if many classes

# (Optional) Save label mapping for deployment: encoded index -> original label.
label_map = dict(enumerate(le.classes_))
print("\nLabel map:", label_map)