In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score

from f2_preprocessor import Preprocessor

import optuna

from tabm import TabM
from rtdl_num_embeddings import LinearReLUEmbeddings  # simple but good :contentReference[oaicite:1]{index=1}
from focal_loss import FocalBCEWithLogitsLoss


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
num_cols = [
    "year_of_born",
    "email_or_tel_available",
    "safety_rating",
    "annual_income",
    "high_education_ind",
    "address_change_ind",
    "past_num_of_claims",
    "liab_prct",
    "policy_report_filed_ind",
    "claim_est_payout",
    "vehicle_made_year",
    "vehicle_price",
    "vehicle_weight",
    "age_of_DL",
    "vehicle_mileage",
]

cat_cols = [
    "gender",
    "living_status",
    "zip_code",
    "claim_day_of_week",
    "accident_site",
    "witness_present_ind",
    "channel",
    "vehicle_category",
    "vehicle_color",
    "accident_type",
    "in_network_bodyshop",
]

In [3]:
device = 'cpu'

In [4]:
def compute_best_f1(probs: np.ndarray,
                    targets: np.ndarray,
                    thresholds: np.ndarray | None = None):
    """
    probs:    shape (N,), predicted positive probabilities
    targets:  shape (N,), 0/1 labels
    thresholds: optional array of candidate thresholds (0..1)

    Returns (best_f1, best_threshold)
    """
    if thresholds is None:
        # Coarse but reasonable grid; you can make it denser if you want
        thresholds = np.linspace(0.05, 0.95, 19)

    best_f1 = 0.0
    best_t = 0.5

    for t in thresholds:
        preds = (probs >= t).astype(int)
        f1 = f1_score(targets, preds, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_t = t

    return best_f1, best_t


In [5]:
df = pd.read_csv("data/Training_TriGuard.csv")
df = df.dropna(subset=['subrogation'])

df = df.drop(columns=["claim_number"], errors="ignore")

target_col = "subrogation"
y_all = df[target_col].to_numpy().astype(np.int64)

In [6]:
def build_model(n_num_features: int,
                cat_cardinalities,
                trial: optuna.Trial) -> nn.Module:
    # EDIT: tighten but beef up architecture search space
    n_blocks = trial.suggest_int("n_blocks", 3, 5)                 # was 2–5
    d_block  = trial.suggest_int("d_block", 384, 1024, log=True)   # was 256–1024
    dropout  = trial.suggest_float("dropout", 0.0, 0.4)
    k        = trial.suggest_int("k", 8, 32, step=8)               # keep 8–32

    num_emb = LinearReLUEmbeddings(n_num_features)

    model = TabM.make(
        n_num_features=n_num_features,
        num_embeddings=num_emb,
        cat_cardinalities=cat_cardinalities,
        d_out=1,
        n_blocks=n_blocks,
        d_block=d_block,
        dropout=dropout,
        k=k,
    )

    return model.to(device)

In [7]:
def run_oof_tabm(df, target_col, num_cols, cat_cols,
                 best_params, n_splits=5, device=device):
    """
    Run K-fold OOF CV with fixed best_params to get:
      - oof_probs: out-of-fold probabilities for each training row
      - oof_targets: true labels
      - global_best_f1, global_best_t: OOF-based threshold and F1
    """

    y_all = df[target_col].astype(int).to_numpy()
    oof_probs = np.zeros_like(y_all, dtype=float)

    skf = StratifiedKFold(
        n_splits=n_splits,
        shuffle=True,
        random_state=42,
    )

    fixed_trial = optuna.trial.FixedTrial(best_params)

    for fold_idx, (train_idx, valid_idx) in enumerate(skf.split(df, y_all)):
        print(f"\n=== OOF Fold {fold_idx+1}/{n_splits} ===")

        train_df = df.iloc[train_idx].reset_index(drop=True)
        valid_df = df.iloc[valid_idx].reset_index(drop=True)

        # --- Preprocess for this fold ---
        preproc = Preprocessor(
            num_cols=num_cols,
            cat_cols=cat_cols,
            target_col=target_col,
        )

        X_num_train, X_cat_train, y_train = preproc.fit_transform(train_df)
        X_num_valid, X_cat_valid, y_valid = preproc.transform(valid_df)

        # --- Build model with best architecture ---
        n_num_features = X_num_train.shape[1]
        cat_cardinalities = [preproc.cat_cardinalities_[c] for c in cat_cols]

        model = build_model(
            n_num_features=n_num_features,
            cat_cardinalities=cat_cardinalities,
            trial=fixed_trial,
        ).to(device)

        # --- Hyperparams from best_params ---
        lr = best_params["lr"]
        weight_decay = best_params["weight_decay"]
        batch_size = best_params["batch_size"]
        n_epochs = best_params["n_epochs"]

        # --- Tensors & datasets ---
        X_num_tr_t = torch.from_numpy(X_num_train)
        X_cat_tr_t = torch.from_numpy(X_cat_train)
        y_tr_t = torch.from_numpy(y_train)

        X_num_va_t = torch.from_numpy(X_num_valid)
        X_cat_va_t = torch.from_numpy(X_cat_valid)
        y_va_t = torch.from_numpy(y_valid)

        train_dataset = TensorDataset(X_num_tr_t, X_cat_tr_t, y_tr_t)
        valid_dataset = TensorDataset(X_num_va_t, X_cat_va_t, y_va_t)

        # --- Imbalance-aware sampler (same idea as before) ---
        y_train_flat = y_train.ravel()
        class_counts = np.bincount(y_train_flat.astype(int))
        class_counts = np.maximum(class_counts, 1)
        class_weights = 1.0 / class_counts
        sample_weights = class_weights[y_train_flat.astype(int)]

        sampler = WeightedRandomSampler(
            weights=torch.from_numpy(sample_weights).float(),
            num_samples=len(sample_weights),
            replacement=True,
        )

        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            sampler=sampler,
            drop_last=False,
        )
        valid_loader = DataLoader(
            valid_dataset,
            batch_size=batch_size,
            shuffle=False,
            drop_last=False,
        )

        # --- Loss & optimizer ---
        # Use the *same* choice you used for final training (BCE or focal).
        criterion = torch.nn.BCEWithLogitsLoss()

        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=lr,
            weight_decay=weight_decay,
        )

        # --- Train ---
        for epoch in range(n_epochs):
            model.train()
            running_loss = 0.0

            for Xn_b, Xc_b, yb in train_loader:
                Xn_b = Xn_b.to(device)
                Xc_b = Xc_b.to(device)
                yb = yb.to(device)

                logits = model(Xn_b, Xc_b)       # (B, k, 1)
                B, k, _ = logits.shape
                logits_flat = logits.reshape(B * k, 1)
                y_flat = yb.repeat_interleave(k, dim=0)

                loss = criterion(logits_flat, y_flat)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                running_loss += loss.item()

            if (epoch + 1) % max(1, n_epochs // 3) == 0:
                print(f"  epoch {epoch+1}/{n_epochs}, loss={running_loss:.4f}")

        # --- Collect OOF probs for this fold ---
        model.eval()
        fold_probs = []

        with torch.no_grad():
            for Xn_b, Xc_b, yb in valid_loader:
                Xn_b = Xn_b.to(device)
                Xc_b = Xc_b.to(device)

                logits = model(Xn_b, Xc_b)                   # (B, k, 1)
                probs = torch.sigmoid(logits).mean(dim=1)    # (B, 1)
                probs = probs.squeeze(-1).cpu().numpy()
                fold_probs.append(probs)

        fold_probs = np.concatenate(fold_probs)
        assert len(valid_idx) == len(fold_probs)
        oof_probs[valid_idx] = fold_probs

    # --- Global OOF threshold ---
    global_best_f1, global_best_t = compute_best_f1(oof_probs, y_all)
    print("\n=== Global OOF Results ===")
    print(f"Global OOF F1: {global_best_f1:.6f} at threshold {global_best_t:.4f}")

    return oof_probs, y_all, global_best_f1, global_best_t


In [8]:
def train_one_fold(model,
                   train_loader,
                   valid_loader,
                   n_epochs: int,
                   lr: float,
                   weight_decay: float,
                   # pos_weight_eff: float
                   ) -> float:
    # EDIT: pass in effective pos_weight (after scaling)
    # pos_weight = torch.tensor([pos_weight_eff], device=device)
    criterion = FocalBCEWithLogitsLoss(alpha=0.75, gamma=2.0)

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=weight_decay,
    )

    # -------- TRAIN --------
    for epoch in range(n_epochs):
        model.train()
        for X_num_b, X_cat_b, y_b in train_loader:
            X_num_b = X_num_b.to(device)
            X_cat_b = X_cat_b.to(device)
            y_b = y_b.to(device)   # (B, 1)

            logits = model(X_num_b, X_cat_b)          # (B, k, 1)
            B, k, _ = logits.shape

            # EDIT: use reshape instead of view
            logits_flat = logits.reshape(B * k, 1)    # (B*k, 1)
            y_flat = y_b.repeat_interleave(k, dim=0)  # (B*k, 1)

            loss = criterion(logits_flat, y_flat)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # -------- VALID: compute probs and F1 --------
    model.eval()
    all_probs = []
    all_targets = []

    with torch.no_grad():
        for X_num_b, X_cat_b, y_b in valid_loader:
            X_num_b = X_num_b.to(device)
            X_cat_b = X_cat_b.to(device)
            y_b = y_b.to(device)

            logits = model(X_num_b, X_cat_b)              # (B, k, 1)
            probs = torch.sigmoid(logits).mean(dim=1)     # (B, 1)
            probs = probs.squeeze(-1).cpu().numpy()
            targets = y_b.squeeze(-1).cpu().numpy()

            all_probs.append(probs)
            all_targets.append(targets)

    all_probs = np.concatenate(all_probs)
    all_targets = np.concatenate(all_targets)

    # Optional defensive mask
    mask = ~np.isnan(all_probs) & ~np.isnan(all_targets)
    all_probs = all_probs[mask]
    all_targets = all_targets[mask]

    if len(np.unique(all_targets)) < 2:
        return 0.0

    best_f1, best_t = compute_best_f1(all_probs, all_targets)
    return float(best_f1)


In [9]:
def objective(trial: optuna.Trial) -> float:
    # Hyperparameters for TabM
    lr = trial.suggest_float("lr", 5e-4, 1e-2, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)
    batch_size = trial.suggest_categorical("batch_size", [64, 128, 256])
    n_epochs = trial.suggest_int("n_epochs", 20, 40)

    # NO pos_weight here anymore

    n_splits = 3
    skf = StratifiedKFold(
        n_splits=n_splits,
        shuffle=True,
        random_state=42,
    )

    fold_f1s = []

    for fold_idx, (train_idx, valid_idx) in enumerate(skf.split(df, y_all)):
        train_df = df.iloc[train_idx].reset_index(drop=True)
        valid_df = df.iloc[valid_idx].reset_index(drop=True)

        preproc = Preprocessor(
            num_cols=num_cols,
            cat_cols=cat_cols,
            target_col=target_col,
        )

        X_num_train, X_cat_train, y_train = preproc.fit_transform(train_df)
        X_num_valid, X_cat_valid, y_valid = preproc.transform(valid_df)

        n_num_features = X_num_train.shape[1]
        cat_cardinalities = [
            preproc.cat_cardinalities_[c] for c in cat_cols
        ]

        model = build_model(n_num_features, cat_cardinalities, trial)

        X_num_tr_t = torch.from_numpy(X_num_train)
        X_cat_tr_t = torch.from_numpy(X_cat_train)
        y_tr_t = torch.from_numpy(y_train)

        X_num_va_t = torch.from_numpy(X_num_valid)
        X_cat_va_t = torch.from_numpy(X_cat_valid)
        y_va_t = torch.from_numpy(y_valid)

        train_dataset = TensorDataset(X_num_tr_t, X_cat_tr_t, y_tr_t)
        valid_dataset = TensorDataset(X_num_va_t, X_cat_va_t, y_va_t)

        # Imbalance-aware sampler (this alone is OK)
        y_train_flat = y_train.ravel()
        class_counts = np.bincount(y_train_flat.astype(int))
        class_counts = np.maximum(class_counts, 1)
        class_weights = 1.0 / class_counts
        sample_weights = class_weights[y_train_flat.astype(int)]

        sampler = WeightedRandomSampler(
            weights=torch.from_numpy(sample_weights).float(),
            num_samples=len(sample_weights),
            replacement=True,
        )

        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            sampler=sampler,
            drop_last=False,
        )
        valid_loader = DataLoader(
            valid_dataset,
            batch_size=batch_size,
            shuffle=False,
            drop_last=False,
        )

        fold_f1 = train_one_fold(
            model=model,
            train_loader=train_loader,
            valid_loader=valid_loader,
            n_epochs=n_epochs,
            lr=lr,
            weight_decay=weight_decay,
            # no pos_weight_eff here
        )
        fold_f1s.append(fold_f1)

        trial.report(fold_f1, step=fold_idx)
        if trial.should_prune():
            raise optuna.TrialPruned()

    mean_f1 = float(np.mean(fold_f1s))
    return mean_f1


In [10]:
pruner = optuna.pruners.MedianPruner(n_warmup_steps=1)

study = optuna.create_study(
    direction="maximize",
    study_name="tabm_subrogation_cv_f1",
    pruner=pruner
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("Best mean CV F1:", study.best_value)
print("Best params:", study.best_params)


[I 2025-11-13 20:06:18,515] A new study created in memory with name: tabm_subrogation_cv_f1
Best trial: 0. Best value: 0.372162:   2%|▏         | 1/50 [10:42<8:45:03, 642.92s/it]

[I 2025-11-13 20:17:01,440] Trial 0 finished with value: 0.37216242076746936 and parameters: {'lr': 0.009594805377504345, 'weight_decay': 0.001471509843215915, 'batch_size': 64, 'n_epochs': 34, 'n_blocks': 5, 'd_block': 426, 'dropout': 0.1248820182171233, 'k': 32}. Best is trial 0 with value: 0.37216242076746936.


Best trial: 0. Best value: 0.372162:   4%|▍         | 2/50 [15:22<5:43:08, 428.93s/it]

[I 2025-11-13 20:21:40,581] Trial 1 finished with value: 0.37216242076746936 and parameters: {'lr': 0.0005853753666740638, 'weight_decay': 0.00011930776451925345, 'batch_size': 64, 'n_epochs': 24, 'n_blocks': 3, 'd_block': 715, 'dropout': 0.22457869381193063, 'k': 16}. Best is trial 0 with value: 0.37216242076746936.


Best trial: 0. Best value: 0.372162:   6%|▌         | 3/50 [18:45<4:15:29, 326.16s/it]

[I 2025-11-13 20:25:04,449] Trial 2 finished with value: 0.37216242076746936 and parameters: {'lr': 0.0021426029456718676, 'weight_decay': 1.0630538987061776e-05, 'batch_size': 128, 'n_epochs': 31, 'n_blocks': 5, 'd_block': 540, 'dropout': 0.3727842333347411, 'k': 8}. Best is trial 0 with value: 0.37216242076746936.


Best trial: 0. Best value: 0.372162:   8%|▊         | 4/50 [20:37<3:05:01, 241.34s/it]

[I 2025-11-13 20:26:55,750] Trial 3 finished with value: 0.37216242076746936 and parameters: {'lr': 0.005344129122935598, 'weight_decay': 0.0058424215890045235, 'batch_size': 128, 'n_epochs': 22, 'n_blocks': 4, 'd_block': 472, 'dropout': 0.352038791456758, 'k': 8}. Best is trial 0 with value: 0.37216242076746936.


Best trial: 0. Best value: 0.372162:   8%|▊         | 4/50 [25:17<4:50:50, 379.37s/it]


[W 2025-11-13 20:31:35,982] Trial 4 failed with parameters: {'lr': 0.0024117749699391626, 'weight_decay': 0.00019740776764004176, 'batch_size': 256, 'n_epochs': 38, 'n_blocks': 4, 'd_block': 888, 'dropout': 0.13374842445226798, 'k': 16} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/cb/0wwk4_d90ps1fw1_wg2m60340000gn/T/ipykernel_17526/4284683966.py", line 76, in objective
    fold_f1 = train_one_fold(
        model=model,
    ...<5 lines>...
        # no pos_weight_eff here
    )
  File "/var/folders/cb/0wwk4_d90ps1fw1_wg2m60340000gn/T/ipykernel_17526/507404640.py", line 27, in train_one_fold
    logits = model(X_num_b, X_cat_b)          # (B, k, 1)
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/nn/modules/mo

KeyboardInterrupt: 

In [None]:
best_params = study.best_params
print(best_params)

{'lr': 0.0011183425690718294, 'weight_decay': 0.004187499814600279, 'batch_size': 64, 'n_epochs': 33, 'pos_weight_scale': 2.159985493642329, 'n_blocks': 4, 'd_block': 455, 'dropout': 0.18355459979095332, 'k': 24}


In [None]:
# --- NEW: get OOF-based threshold using best_params ---
oof_probs, oof_targets, oof_f1, oof_t = run_oof_tabm(
    df=df,
    target_col=target_col,
    num_cols=num_cols,
    cat_cols=cat_cols,
    best_params=best_params,
    n_splits=5,
    device=device,
)

print("OOF F1:", oof_f1)
print("OOF best threshold:", oof_t)


In [None]:
# Refit preprocessor on the full training data
preproc_final = Preprocessor(num_cols=num_cols, cat_cols=cat_cols, target_col=target_col)
X_num_full, X_cat_full, y_full = preproc_final.fit_transform(df)

# Prepare tensors
X_num_full_t = torch.from_numpy(X_num_full)
X_cat_full_t = torch.from_numpy(X_cat_full)
y_full_t     = torch.from_numpy(y_full)

full_dataset = TensorDataset(X_num_full_t, X_cat_full_t, y_full_t)

full_loader = DataLoader(
    full_dataset,
    batch_size=best_params["batch_size"],
    shuffle=True,
    drop_last=False
)

In [None]:
# Build final TabM model from best params
model_final = build_model(
    n_num_features=X_num_full.shape[1],
    cat_cardinalities=[preproc_final.cat_cardinalities_[c] for c in cat_cols],
    trial=optuna.trial.FixedTrial(best_params)
)

In [None]:
criterion = FocalBCEWithLogitsLoss(alpha=0.75, gamma=2.0)

optimizer = torch.optim.AdamW(
    model_final.parameters(),
    lr=best_params["lr"],
    weight_decay=best_params["weight_decay"],
)

n_epochs = best_params["n_epochs"]

for epoch in range(n_epochs):
    model_final.train()
    running_loss = 0.0
    for Xn, Xc, yb in full_loader:
        Xn = Xn.to(device)
        Xc = Xc.to(device)
        yb = yb.to(device)

        logits = model_final(Xn, Xc)        # (B, k, 1)
        B, k, _ = logits.shape
        logits_flat = logits.reshape(B * k, 1)
        y_flat = yb.repeat_interleave(k, dim=0)

        loss = criterion(logits_flat, y_flat)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    if (epoch+1) % 5 == 0:
        print(f"Epoch {epoch+1}/{n_epochs}, loss={running_loss:.4f}")


Epoch 5/33, loss=140.5011
Epoch 10/33, loss=121.5093
Epoch 15/33, loss=113.7620
Epoch 20/33, loss=112.7051
Epoch 25/33, loss=111.7766
Epoch 30/33, loss=111.8216


In [None]:
# Diagnostic Code

from sklearn.metrics import f1_score, confusion_matrix

model_final.eval()
with torch.no_grad():
    Xn_tr = X_num_full_t.to(device)
    Xc_tr = X_cat_full_t.to(device)
    logits_tr = model_final(Xn_tr, Xc_tr)                 # (N, k, 1)
    probs_tr = torch.sigmoid(logits_tr).mean(dim=1)       # (N, 1)
    probs_tr = probs_tr.squeeze(-1).cpu().numpy()

print("Train probs min/mean/max:", probs_tr.min(), probs_tr.mean(), probs_tr.max())

y_train_true = y_full.ravel().astype(int)
y_train_pred_03 = (probs_tr >= 0.3).astype(int)

print("Train F1 @0.3:", f1_score(y_train_true, y_train_pred_03))
print("Train confusion @0.5:\n", confusion_matrix(y_train_true, y_train_pred_05))


Train probs min/mean/max: 0.0005119745 0.25603554 0.842113
Train F1 @0.3: 0.5988591317799478
Train confusion @0.5:
 [[12912   972]
 [ 2268  1847]]


In [None]:
best_f1_train, best_t_train = compute_best_f1(probs_tr, y_full.ravel().astype(int))
print("Best F1 on train:", best_f1_train)
print("Best threshold:", best_t_train)

Best F1 on train: 0.5988591317799478
Best threshold: 0.3


In [None]:
real_test = pd.read_csv("data/Testing_TriGuard.csv")

# Keep columns needed for domain features
claim_numbers = real_test["claim_number"].copy()

# Use the final preprocessor (fit on full training data!)
X_num_test, X_cat_test, _ = preproc_final.transform(real_test)

X_num_test_t = torch.from_numpy(X_num_test)
X_cat_test_t = torch.from_numpy(X_cat_test)


In [None]:
model_final.eval()
all_probs = []

with torch.no_grad():
    Xn = X_num_test_t.to(device)
    Xc = X_cat_test_t.to(device)

    logits = model_final(Xn, Xc)                 # (N, k, 1)
    probs = torch.sigmoid(logits).mean(dim=1)    # (N, 1)
    probs = probs.squeeze(-1).cpu().numpy()

    all_probs = probs


In [None]:
real_pred_proba = all_probs
real_pred_label = (real_pred_proba >= oof_t).astype(int)

In [None]:
# Diagnostic Code

print("Min prob:", real_pred_proba.min())
print("Max prob:", real_pred_proba.max())
print("Mean prob:", real_pred_proba.mean())
print("First 20 probs:", real_pred_proba[:20])
print("First 20 labels:", real_pred_label[:20])
print("Unique labels:", np.unique(real_pred_label))


Min prob: 0.0007758692
Max prob: 0.82720715
Mean prob: 0.25367633
First 20 probs: [0.21576667 0.3347236  0.06319809 0.609448   0.36384344 0.03800942
 0.45161465 0.06597612 0.23697914 0.66205806 0.09418901 0.47217855
 0.2660694  0.2500148  0.22387505 0.58648896 0.054098   0.04736322
 0.15361674 0.4636856 ]
First 20 labels: [0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 1 0 0 0 1]
Unique labels: [0 1]


In [None]:
prediction = pd.DataFrame({
    "claim_number": claim_numbers,
    "subrogation": real_pred_label
})

prediction.to_csv("results/tabm_xxxx_prediction.csv", index=False)

print("Saved:", "results/tabm_xxxx_prediction.csv")


Saved: results/tabm_5938_prediction.csv


Model Saving Pipeline

In [None]:
from TabM_save_load import TabM_save_load

In [None]:
# 1. Create the pipeline wrapper
pipeline = TabM_save_load(
    model=model_final,
    preprocessor=preproc_final,
    threshold=oof_t, 
    best_params=best_params,
    num_cols=num_cols,
    cat_cols=cat_cols,
    device=device
)

# 2. Save everything to a folder
save_dir = "models/tabm_full_pipeline_xxxx"
pipeline.save(save_dir)

TabM pipeline saved successfully to models/tabm_full_pipeline_5938
