# 0. Library

In [None]:
# --- Core Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import time
import warnings
import pickle
import os

# --- Sklearn ---
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (roc_auc_score, roc_curve, precision_recall_curve, auc,
                             accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, brier_score_loss, log_loss, classification_report)

# --- Imbalanced Learn ---
from imblearn.over_sampling import SMOTE # Or other variants if preferred

# --- TabNet & PyTorch ---
import torch
from pytorch_tabnet.tab_model import TabNetClassifier


# 1. Utility functions + Configurations

In [None]:
# --- Helper Functions (can be imported from a utils file or redefined) ---
def calculate_ks(y_true, y_prob):
    """Calculate the Kolmogorov-Smirnov (KS) statistic of a binary scorer.

    KS is the largest absolute gap between the cumulative share of positives
    and the cumulative share of negatives when rows are ranked by descending
    score.

    Args:
        y_true: Array-like of binary labels (1 = positive, 0 = negative).
        y_prob: Array-like of scores/probabilities, same length as ``y_true``.

    Returns:
        The KS statistic as a float in [0, 1]; 0.0 when either class is
        absent (including empty input).
    """
    labels = np.asarray(y_true, dtype=float).ravel()
    scores = np.asarray(y_prob, dtype=float).ravel()

    n_pos = labels.sum()
    n_neg = labels.size - n_pos
    if n_pos == 0 or n_neg == 0:
        return 0.0  # KS is 0 if one class is missing

    # Stable sort so the ranking is deterministic.
    order = np.argsort(-scores, kind="mergesort")
    ranked_labels = labels[order]
    ranked_scores = scores[order]

    cum_pos = np.cumsum(ranked_labels) / n_pos
    cum_neg = np.cumsum(1.0 - ranked_labels) / n_neg

    # Rows sharing a score cannot be separated by any threshold, so the gap is
    # only meaningful at the last row of each run of tied scores. Evaluating it
    # inside a tie group would make the result depend on arbitrary row order.
    group_ends = np.r_[np.flatnonzero(np.diff(ranked_scores)), ranked_scores.size - 1]
    return float(np.max(np.abs(cum_pos[group_ends] - cum_neg[group_ends])))

def find_optimal_threshold_j_statistic(y_true, y_prob_oof):
    """Find the decision threshold that maximizes Youden's J statistic.

    J = sensitivity + specificity - 1, which equals TPR - FPR. It is evaluated
    at every candidate threshold returned by ``roc_curve``.

    Args:
        y_true: Ground-truth binary labels.
        y_prob_oof: Predicted scores for the same rows (the caller passes
            out-of-fold probabilities).

    Returns:
        The threshold with the largest J, clamped to [0, 1]. Falls back to
        0.5 (with a printed warning) when no finite threshold is available.
    """
    fpr, tpr, thresholds = roc_curve(y_true, y_prob_oof)

    # Keep only finite thresholds. The first entry returned by roc_curve is a
    # sentinel for "predict nothing"; recent scikit-learn versions report it as
    # np.inf. A sentinel above 1 is handled by the clamp below.
    finite_mask = np.isfinite(thresholds)
    if not finite_mask.any():
        print("Warning: No valid thresholds found for J-statistic calculation.")
        return 0.5  # Default fallback
    fpr, tpr, thresholds = fpr[finite_mask], tpr[finite_mask], thresholds[finite_mask]

    j_statistic = tpr - fpr
    optimal_idx = np.argmax(j_statistic)
    # Clamp to the valid probability range.
    optimal_threshold = max(0.0, min(1.0, thresholds[optimal_idx]))
    print(f"Optimal threshold based on Youden's J-Statistic (OOF): {optimal_threshold:.4f}")
    return optimal_threshold

def evaluate_model(y_true, y_pred_proba, y_pred_binary, model_name="Model"):
    """Compute, print and return the standard binary-classification metrics.

    Args:
        y_true: Ground-truth binary labels.
        y_pred_proba: Predicted positive-class probabilities.
        y_pred_binary: Hard 0/1 predictions (already thresholded by the caller).
        model_name: Label used in the printed header and in the result dict.

    Returns:
        dict with keys 'Model', 'AUC', 'Gini', 'KS', 'Accuracy', 'Precision',
        'Recall', 'F1', 'Brier' and 'LogLoss'.
    """
    # Keep probabilities strictly inside (0, 1); the clipped values feed every
    # probability-based metric below, not only the log loss.
    tiny = 1e-15
    proba = np.clip(y_pred_proba, tiny, 1 - tiny)

    auc_value = roc_auc_score(y_true, proba)
    summary = {
        'Model': model_name,
        'AUC': auc_value,
        'Gini': 2 * auc_value - 1,
        'KS': calculate_ks(y_true, proba),
        'Accuracy': accuracy_score(y_true, y_pred_binary),
        'Precision': precision_score(y_true, y_pred_binary, zero_division=0),
        'Recall': recall_score(y_true, y_pred_binary, zero_division=0),
        'F1': f1_score(y_true, y_pred_binary, zero_division=0),
        'Brier': brier_score_loss(y_true, proba),
        'LogLoss': log_loss(y_true, proba),
    }
    conf_mat = confusion_matrix(y_true, y_pred_binary)

    # (printed label incl. alignment padding, key in ``summary``)
    report_rows = (
        ("AUC ROC:        ", 'AUC'),
        ("Gini Coefficient: ", 'Gini'),
        ("KS Statistic:   ", 'KS'),
        ("Accuracy:       ", 'Accuracy'),
        ("Precision:      ", 'Precision'),
        ("Recall (TPR):   ", 'Recall'),
        ("F1-Score:       ", 'F1'),
        ("Brier Score:    ", 'Brier'),
        ("Log Loss:       ", 'LogLoss'),
    )

    print(f"\n--- Evaluation Metrics for {model_name} ---")
    for label, key in report_rows:
        print(f"{label}{summary[key]:.4f}")
    print("\nConfusion Matrix:")
    print(conf_mat)

    return summary

def plot_roc_curve(y_true, y_prob, model_name):
    """Draw the ROC curve, save it as a PNG in the working directory, and show it.

    Args:
        y_true: Ground-truth binary labels.
        y_prob: Predicted positive-class scores.
        model_name: Used in the legend, the title and the output file name
            (spaces are replaced by underscores).
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_prob)
    area = roc_auc_score(y_true, y_prob)
    curve_label = f'{model_name} (AUC = {area:.4f})'

    plt.figure(figsize=(8, 6))
    plt.plot(false_pos_rate, true_pos_rate, label=curve_label)
    # Chance-level reference line
    plt.plot([0, 1], [0, 1], 'k--')
    plt.title(f'{model_name} - ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.grid(True)
    plt.legend()

    # Persist the figure before showing it
    safe_name = model_name.replace(' ', '_')
    plot_filename = f"roc_curve_{safe_name}.png"
    plt.savefig(plot_filename)
    print(f"ROC curve saved to {plot_filename}")
    plt.show()

In [None]:
# --- Configuration ---
DATA_PATH = '../data/processed/'  # Folder holding train_final.csv / test_final.csv
MODEL_OUTPUT_PATH = './tabnet_outputs/'  # Directory to save model/results
os.makedirs(MODEL_OUTPUT_PATH, exist_ok=True)
SEED = 42
N_SPLITS = 5  # Number of folds for Cross-Validation
SMOTE_STRATEGY = 0.5  # Minority/majority ratio after resampling
TARGET = 'TARGET'
ID_COL = 'SK_ID_CURR'

# TabNet Specific Config
# These are EXAMPLE parameters, tuning is recommended
TABNET_PARAMS = dict(
    n_d=24, n_a=24,  # Dimension of prediction/attention layers (adjust based on feature count/memory)
    n_steps=3,       # Number of steps in the architecture
    gamma=1.3,       # Coefficient for feature reusage penalty
    lambda_sparse=1e-4,  # Sparsity loss coefficient
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),  # Learning rate (often needs tuning)
    # ReduceLROnPlateau scheduler. pytorch-tabnet steps metric-driven
    # schedulers with the early-stopping metric, which in this file is the
    # validation AUC (see eval_metric=['auc'] in model.fit). AUC is maximized,
    # so the plateau detector must use mode="max"; with mode="min" the learning
    # rate would be cut while the model is still improving.
    scheduler_params=dict(mode="max", patience=5, min_lr=1e-5, factor=0.5),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    mask_type='sparsemax',  # Attention mechanism type ('sparsemax' or 'entmax')
    verbose=10,      # Print loss every 10 epochs
    seed=SEED
)
# Training Config
MAX_EPOCHS = 100     # Max epochs per fold
PATIENCE = 15        # Early stopping patience
BATCH_SIZE = 2048    # Adjust based on GPU memory
VIRTUAL_BATCH_SIZE = 256  # Ghost batch-norm chunk size; should evenly divide BATCH_SIZE

# --- Check for GPU ---
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device_name}")
TABNET_PARAMS['device_name'] = device_name

# 2. Load Data and Preprocess

In [None]:
# --- Load Data ---
print("Loading preprocessed data...")
try:
    train_df = pd.read_csv(DATA_PATH + 'train_final.csv')
    test_df = pd.read_csv(DATA_PATH + 'test_final.csv')
except FileNotFoundError as e:
    print(f"Error: {e}. Make sure 'train_final.csv' and 'test_final.csv' are in {DATA_PATH}")
    # Re-raise rather than calling exit(): exit() ends a script with a success
    # status and is disruptive inside a notebook kernel, while the original
    # exception keeps the traceback and signals failure to the caller.
    raise
else:
    print("Data loaded successfully.")

# --- Prepare Data ---
y_train = train_df[TARGET].values  # Use .values for numpy arrays
y_test = test_df[TARGET].values

# Drop Target and, when present, the ID column (an identifier, not a feature)
columns_to_drop = [TARGET, ID_COL] if ID_COL in train_df.columns else [TARGET]
X_train = train_df.drop(columns=columns_to_drop)
X_test = test_df.drop(columns=columns_to_drop)

# Align columns just in case
common_cols = list(X_train.columns.intersection(X_test.columns))
X_train = X_train[common_cols]
X_test = X_test[common_cols]
feature_names = X_train.columns.tolist()

# --- LIMITATION: Treat all features as numerical ---
# Ideally, identify original categorical features and pass their indices to TabNet.
# Since we are using pre-encoded data, we treat all as numerical.
print("WARNING: Treating all features as numerical for TabNet due to pre-encoded input data.")
categorical_indices = []  # No categorical indices provided
categorical_dims = []  # No specific dimensions needed if indices are empty

# Convert to numpy arrays of type float32 for PyTorch.
# +/-inf is turned into NaN *before* the medians are computed so that infinite
# values cannot distort (or become) the imputation value. The medians come from
# the training data only and are reused for the test set.
_X_train_finite = X_train.replace([np.inf, -np.inf], np.nan)
_X_test_finite = X_test.replace([np.inf, -np.inf], np.nan)
train_medians = _X_train_finite.median()
# A column that is entirely NaN in train has a NaN median; fall back to 0 so no
# NaN reaches SMOTE / the scaler / the network.
X_train_np = _X_train_finite.fillna(train_medians).fillna(0.0).astype(np.float32).values
X_test_np = _X_test_finite.fillna(train_medians).fillna(0.0).astype(np.float32).values
del _X_train_finite, _X_test_finite  # free the intermediate copies

print(f"Prepared X_train shape: {X_train_np.shape}")
print(f"Prepared X_test shape: {X_test_np.shape}")

# 3. Train and Evaluate

In [None]:
# --- Cross-Validation Loop ---
# For every stratified fold: oversample the training part with SMOTE, standardize,
# train a fresh TabNet with early stopping on the validation part, then store the
# out-of-fold (OOF) validation probabilities and that fold's test probabilities.
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
oof_predictions = np.zeros(X_train_np.shape[0])  # one OOF probability per training row
test_predictions_list = []  # per-fold test probabilities
fold_models = []
fold_results = []

print(f"\nStarting TabNet {N_SPLITS}-Fold Cross-Validation...")
start_cv_time = time.time()

for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_np, y_train)):
    print(f"\n--- Fold {fold+1}/{N_SPLITS} ---")
    fold_start_time = time.time()

    # 1. Split data for the fold
    X_train_fold, X_val_fold = X_train_np[train_idx], X_train_np[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

    # 2. Apply SMOTE to the training part of the fold only (validation stays untouched)
    print("Applying SMOTE...")
    # `n_jobs` is intentionally not passed: imbalanced-learn deprecated it on SMOTE
    # and later removed it, so passing it raises a TypeError on current releases.
    smote = SMOTE(sampling_strategy=SMOTE_STRATEGY, random_state=SEED + fold)  # Vary seed per fold
    try:
        X_train_fold_res, y_train_fold_res = smote.fit_resample(X_train_fold, y_train_fold)
        print(f"SMOTE applied. Original size: {X_train_fold.shape[0]}, Resampled size: {X_train_fold_res.shape[0]}")
    except Exception as e:
        # Best-effort: keep training on the original (imbalanced) fold
        print(f"SMOTE failed for fold {fold+1}: {e}. Using original data.")
        X_train_fold_res, y_train_fold_res = X_train_fold, y_train_fold  # Fallback

    # 3. Apply Scaling (Fit on Resampled Train, Transform Train & Val)
    # NOTE(review): SMOTE runs on unscaled features, so its neighbour search is
    # dominated by large-magnitude columns -- consider scaling before resampling.
    print("Applying StandardScaler...")
    scaler = StandardScaler()
    X_train_fold_scaled = scaler.fit_transform(X_train_fold_res)
    X_val_fold_scaled = scaler.transform(X_val_fold)  # Use same scaler fitted on train

    # 4. Define and Train TabNet Model for the fold
    model = TabNetClassifier(**TABNET_PARAMS)

    print("Training TabNet model...")
    model.fit(
        X_train=X_train_fold_scaled, y_train=y_train_fold_res,
        eval_set=[(X_val_fold_scaled, y_val_fold)],
        eval_name=['validation'],
        eval_metric=['auc'],  # Use AUC for early stopping metric
        max_epochs=MAX_EPOCHS,
        patience=PATIENCE,
        batch_size=BATCH_SIZE,
        virtual_batch_size=VIRTUAL_BATCH_SIZE,
        num_workers=0,  # Adjust based on system
        drop_last=False,  # Keep the final, possibly smaller, batch
        loss_fn=torch.nn.CrossEntropyLoss()  # Standard loss for binary classification
    )

    # 5. Predict on Validation and Test Sets
    print("Predicting on validation and test sets...")
    val_preds = model.predict_proba(X_val_fold_scaled)[:, 1]
    # Scale the full test set using the scaler fitted for this fold
    X_test_scaled = scaler.transform(X_test_np)
    test_preds = model.predict_proba(X_test_scaled)[:, 1]

    # 6. Store Predictions
    oof_predictions[val_idx] = val_preds
    test_predictions_list.append(test_preds)
    fold_models.append(model)  # Store the model if needed

    # 7. Evaluate Fold (optional)
    fold_auc = roc_auc_score(y_val_fold, val_preds)
    print(f"Fold {fold+1} Validation AUC: {fold_auc:.4f}")
    fold_results.append({'Fold': fold+1, 'Validation AUC': fold_auc})

    fold_end_time = time.time()
    print(f"Fold {fold+1} completed in {(fold_end_time - fold_start_time):.2f} seconds.")

end_cv_time = time.time()
print(f"\nCross-Validation finished in {(end_cv_time - start_cv_time)/60:.2f} minutes.")

In [None]:
# --- Aggregate and Evaluate ---
# Combine the per-fold outputs: score the OOF predictions, derive a decision
# threshold from them, and evaluate the fold-averaged test predictions.
_banner = "=" * 60
print(f"\n{_banner}")
print("Final Evaluation")
print(_banner)

# Ensemble the folds: element-wise mean of the per-fold test probabilities
final_test_predictions = np.mean(test_predictions_list, axis=0)

# OOF quality over the whole training set
oof_auc = roc_auc_score(y_train, oof_predictions)
print(f"Overall OOF AUC: {oof_auc:.4f}")

# Threshold is chosen on OOF predictions only, never on the test set
optimal_threshold = find_optimal_threshold_j_statistic(y_train, oof_predictions)

# Apply that threshold to the averaged test probabilities and report metrics
_final_label = "TabNet (Tuned CV)"
final_test_predictions_binary = (final_test_predictions >= optimal_threshold).astype(int)
final_results = evaluate_model(
    y_test,
    final_test_predictions,
    final_test_predictions_binary,
    _final_label,
)

# ROC curve of the averaged test predictions
plot_roc_curve(y_test, final_test_predictions, _final_label)


# 4. Feature Importance

In [None]:
# --- Feature Importance ---
# TabNet provides feature importance based on the masks used in its attention mechanism
print("\n--- TabNet Feature Importances (from last fold model) ---")
try:
    # Importance shape is (n_features,)
    importances = fold_models[-1].feature_importances_
    feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
    feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False).reset_index(drop=True)

    # Show top 50. `display` only exists inside IPython/Jupyter; fall back to a
    # plain print so that running as a script does not abort before the CSV
    # below is written.
    top_features = feature_importance_df.head(50)
    try:
        display(top_features)
    except NameError:
        print(top_features.to_string())

    # Save importances
    importance_filename = os.path.join(MODEL_OUTPUT_PATH, "tabnet_feature_importances.csv")
    feature_importance_df.to_csv(importance_filename, index=False)
    print(f"Feature importances saved to {importance_filename}")
except Exception as e:
    # Best-effort step: report the problem but let the remaining cells run
    print(f"Could not get or save feature importances: {e}")

In [None]:
# --- Save Results ---
print("\nSaving results...")
results_summary = pd.DataFrame([final_results]).set_index('Model')
summary_filename = os.path.join(MODEL_OUTPUT_PATH, "tabnet_evaluation_summary.csv")
results_summary.to_csv(summary_filename)
print(f"Evaluation summary saved to {summary_filename}")

# Save OOF and Test predictions
# Use the real identifier column when the data has one. The DataFrame index is
# just the positional RangeIndex created by read_csv, so it is only a fallback
# for data without an ID column. Row order matches the prediction arrays because
# both were derived from the same DataFrames without reordering.
train_ids = train_df[ID_COL].values if ID_COL in train_df.columns else train_df.index
oof_df = pd.DataFrame({'SK_ID_CURR': train_ids, 'oof_pred_proba': oof_predictions})
oof_filename = os.path.join(MODEL_OUTPUT_PATH, "tabnet_oof_predictions.csv")
oof_df.to_csv(oof_filename, index=False)
print(f"OOF predictions saved to {oof_filename}")

test_ids = test_df[ID_COL].values if ID_COL in test_df.columns else test_df.index
test_pred_df = pd.DataFrame({'SK_ID_CURR': test_ids, 'test_pred_proba': final_test_predictions})
test_pred_filename = os.path.join(MODEL_OUTPUT_PATH, "tabnet_test_predictions.csv")
test_pred_df.to_csv(test_pred_filename, index=False)
print(f"Test predictions saved to {test_pred_filename}")

# Optionally save one of the trained models (e.g., the last fold's)
# Note: Saving/loading TabNet models might require saving the associated zip file.
# model_save_path = os.path.join(MODEL_OUTPUT_PATH, "tabnet_model_last_fold")
# saved_path = fold_models[-1].save_model(model_save_path)
# print(f"Last fold TabNet model saved to path: {saved_path}")

print("\nTabNet script finished.")