In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
DATA_DIR = "/kaggle/input/porto-seguro-safe-driver-prediction"
TRAIN_CSV = f"{DATA_DIR}/train.csv"


# DTSA 5511 Final Project — Porto Seguro Safe Driver (Tabular DL)

**Problem.** Predict whether a customer will file an auto-insurance claim in the next year. Binary, **heavily imbalanced**.

**Data & provenance.** Kaggle: *Porto Seguro Safe Driver Prediction* (`train.csv`). Features are anonymized: `*_num`, `*_bin`, `*_cat`. Missing values often encoded as `-1`. I use the public training split only. License/terms as on Kaggle.

**Goal & metrics.** Optimize discrimination and ranking under class imbalance. I report **ROC-AUC**, **PR-AUC** (primary), Brier score (calibration), and confusion matrix at the **F1-optimal threshold** on a stratified hold-out set.

**EDA**
- Positive rate: ~3–4%.  
- Missingness: several columns encode missing as `-1` → I add **missing indicators** per numeric feature.  
- Numerics show skew; I **median-impute** and **standardize**.  
- Categoricals: high-cardinality in places; I **label-encode** with train-only vocab + **unknown** bucket for unseen.

**Methods.**
- **Baseline:** XGBoost (`gpu_hist` when available) with `scale_pos_weight`.
- **Deep model:** MLP for tabular with **categorical embeddings** + numeric block, dropout, BCE with `pos_weight` (and a focal-loss ablation). Early stopping on **PR-AUC**.  
- **Ablations:** (1) BCE vs Focal; (2) with vs without categorical embeddings; (3) class-balanced sampler on/off.  
- **Calibration:** reliability plot + Brier score.

**Validation.** Single stratified **80/20** split, `seed=42`. No time component in features, so plain split is acceptable. All preprocessing fit on train only.

**Repro.**
- Environment: Kaggle Notebook (CPU/GPU).  
- Data path: `/kaggle/input/porto-seguro-safe-driver-prediction/train.csv`.  
- Run cells top-to-bottom. Figures stored under `/kaggle/working/reports/figures/`. Summary CSV at `/kaggle/working/results_summary.csv`.  
- Random seeds fixed where supported.

**Notes / limitations.**
- Features are anonymized engineered signals; external covariates were not added.
- On tabular data, tree ensembles are strong baselines; I include both and discuss trade-offs.

**Academic honesty.** This is my own work. I used public documentation for library usage; all sources are cited where relevant.


In [3]:
# --- Drop-in: DL with categorical embeddings (no F import needed) ---
import torch
import torch.nn as nn
import torch.nn.functional as nnF
from torch.utils.data import Dataset, DataLoader
import numpy as np

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

class TabDataset(Dataset):
    def __init__(self, df, y, num_cols_names, cat_cols_names):
        # Build tensors by column names to preserve dtypes:
        self.num = torch.tensor(df[num_cols_names].values, dtype=torch.float32)
        # Ensure cats are integer-coded 0..K-1
        self.cat = torch.tensor(df[cat_cols_names].values, dtype=torch.long) if len(cat_cols_names) else None
        self.y   = None if y is None else torch.tensor(y, dtype=torch.long)
    def __len__(self): return len(self.num)
    def __getitem__(self, i):
        if self.y is None:
            return self.num[i], (self.cat[i] if self.cat is not None else None)
        return self.num[i], (self.cat[i] if self.cat is not None else None), self.y[i]

class TabMLP(nn.Module):
    def __init__(self, n_num, cat_cardinalities, d_emb=32, hidden=(256,128), dropout=0.2):
        super().__init__()
        self.has_cat = len(cat_cardinalities) > 0
        if self.has_cat:
            self.embs = nn.ModuleList([nn.Embedding(card, d_emb) for card in cat_cardinalities])
            cat_dim = d_emb * len(cat_cardinalities)
        else:
            self.embs = nn.ModuleList()
            cat_dim = 0
        in_dim = n_num + cat_dim
        layers = []
        last = in_dim
        for h in hidden:
            layers += [nn.Linear(last, h), nn.ReLU(), nn.Dropout(dropout)]
            last = h
        layers += [nn.Linear(last, 1)]
        self.net = nn.Sequential(*layers)

    def forward(self, num, cat=None):
        if self.has_cat and cat is not None:
            embs = [emb(cat[:, i]) for i, emb in enumerate(self.embs)]
            cat_feat = torch.cat(embs, dim=1)
            x = torch.cat([num, cat_feat], dim=1)
        else:
            x = num
        return self.net(x).squeeze(1)  # logits

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0, reduction="mean"):
        super().__init__()
        self.alpha, self.gamma, self.reduction = alpha, gamma, reduction
    def forward(self, logits, targets):
        # targets: 0/1 long or float; logits: raw
        bce = nnF.binary_cross_entropy_with_logits(logits, targets.float(), reduction="none")
        p   = torch.sigmoid(logits)
        pt  = p*targets + (1-p)*(1-targets)
        loss = self.alpha * (1-pt).pow(self.gamma) * bce
        return loss.mean() if self.reduction == "mean" else loss.sum()

def train_mlp(
    Xtr, ytr, Xva, yva,
    num_cols_names, cat_cols_names, cat_cardinalities,
    epochs=50, batch=2048, lr=3e-4, d_emb=32, hidden=(256,128), dropout=0.2,
    use_focal=True, class_weight=True, early_stop_pat=6
):
    pin = torch.cuda.is_available()
    tr_ds = TabDataset(Xtr, ytr, num_cols_names, cat_cols_names)
    va_ds = TabDataset(Xva, yva, num_cols_names, cat_cols_names)
    tr_ld = DataLoader(tr_ds, batch_size=batch, shuffle=True,  num_workers=2, pin_memory=pin)
    va_ld = DataLoader(va_ds, batch_size=batch*2, shuffle=False, num_workers=2, pin_memory=pin)

    model = TabMLP(len(num_cols_names), cat_cardinalities, d_emb, hidden, dropout).to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=lr)

    if use_focal:
        criterion = FocalLoss(alpha=0.25, gamma=2.0)
    else:
        if class_weight:
            pos_w = (ytr==0).sum() / max(1,(ytr==1).sum())
            criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_w], device=DEVICE))
        else:
            criterion = nn.BCEWithLogitsLoss()

    best_pr = -1.0
    best_state = None
    patience = 0

    from sklearn.metrics import average_precision_score, roc_auc_score

    for ep in range(1, epochs+1):
        model.train()
        total = 0.0
        for batch in tr_ld:
            num, cat, yy = batch
            num = num.to(DEVICE, non_blocking=True)
            cat = (cat.to(DEVICE, non_blocking=True) if cat is not None else None)
            yy  = yy.to(DEVICE, non_blocking=True)

            opt.zero_grad()
            logits = model(num, cat)
            loss = criterion(logits, yy.float())
            loss.backward(); opt.step()
            total += loss.item() * yy.size(0)

        # eval
        model.eval()
        preds, ys = [], []
        with torch.no_grad():
            for batch in va_ld:
                num, cat, yy = batch
                num = num.to(DEVICE, non_blocking=True)
                cat = (cat.to(DEVICE, non_blocking=True) if cat is not None else None)
                logits = model(num, cat)
                preds.append(torch.sigmoid(logits).cpu().numpy())
                ys.append(yy.numpy())
        preds = np.concatenate(preds); ys = np.concatenate(ys)
        roc = roc_auc_score(ys, preds)
        pr  = average_precision_score(ys, preds)
        print(f"[Epoch {ep:02d}] loss={total/len(tr_ds):.6f} ROC-AUC={roc:.5f} PR-AUC={pr:.5f}")

        if pr > best_pr + 1e-5:
            best_pr = pr
            best_state = {k: v.cpu() for k, v in model.state_dict().items()}
            patience = 0
        else:
            patience += 1
            if patience >= early_stop_pat:
                print("Early stopping.")
                break

    if best_state is not None:
        model.load_state_dict({k: v.to(DEVICE) for k, v in best_state.items()})
    return model

# ---- Train with your existing splits/columns ----
# IMPORTANT: ensure cat cols are already integer-coded 0..K-1 from your preprocessing.
cat_cardinalities = [int(X_tr[c].max()) + 1 for c in cat_cols]  # each cat col must be 0..K-1

mlp = train_mlp(
    X_tr, y_tr, X_te, y_te,
    num_cols_names=num_cols,
    cat_cols_names=cat_cols,
    cat_cardinalities=cat_cardinalities,
    epochs=50, batch=2048, lr=3e-4, d_emb=32, hidden=(256,128), dropout=0.2,
    use_focal=True, class_weight=True, early_stop_pat=6
)

# Inference on validation
from sklearn.metrics import roc_auc_score, average_precision_score
va_ds = TabDataset(X_te, None, num_cols, cat_cols)
va_ld = DataLoader(va_ds, batch_size=4096, shuffle=False, pin_memory=torch.cuda.is_available())
mlp.eval(); preds=[]
with torch.no_grad():
    for num, cat in va_ld:
        num = num.to(DEVICE, non_blocking=True)
        cat = (cat.to(DEVICE, non_blocking=True) if cat is not None else None)
        preds.append(torch.sigmoid(mlp(num, cat)).cpu().numpy())
proba_mlp = np.concatenate(preds)

print("[MLP-Emb] ROC-AUC:", roc_auc_score(y_te, proba_mlp).round(5),
      "PR-AUC:", average_precision_score(y_te, proba_mlp).round(5))


[Epoch 01] loss=0.011848 ROC-AUC=0.62250 PR-AUC=0.06066
[Epoch 02] loss=0.011151 ROC-AUC=0.62456 PR-AUC=0.06276
[Epoch 03] loss=0.011121 ROC-AUC=0.62662 PR-AUC=0.06328
[Epoch 04] loss=0.011118 ROC-AUC=0.62600 PR-AUC=0.06359
[Epoch 05] loss=0.011080 ROC-AUC=0.62470 PR-AUC=0.06217
[Epoch 06] loss=0.011084 ROC-AUC=0.62477 PR-AUC=0.06343
[Epoch 07] loss=0.011057 ROC-AUC=0.62210 PR-AUC=0.06289
[Epoch 08] loss=0.011048 ROC-AUC=0.62141 PR-AUC=0.06247
[Epoch 09] loss=0.011009 ROC-AUC=0.62559 PR-AUC=0.06322
[Epoch 10] loss=0.011001 ROC-AUC=0.61997 PR-AUC=0.06187
Early stopping.
[MLP-Emb] ROC-AUC: 0.61997 PR-AUC: 0.06187
