# 0. Library

In [27]:
# --- Core Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import time
import warnings
import pickle
import os

# --- Sklearn ---
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (roc_auc_score, roc_curve, precision_recall_curve, auc,
                             accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, brier_score_loss, log_loss, classification_report)
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

# --- Imbalanced Learn ---
from imblearn.over_sampling import SMOTE # Or other variants if preferred

# --- TabNet & PyTorch ---
import torch
from pytorch_tabnet.tab_model import TabNetClassifier


# 1. Utility functions + Configurations

## 1.1 Utility functions

In [28]:
# --- Helper Functions (can be imported from a utils file or redefined) ---
def calculate_ks(y_true, y_prob):
    """Calculates the Kolmogorov-Smirnov (KS) statistic."""
    df = pd.DataFrame({'y_true': y_true, 'y_prob': y_prob})
    df = df.sort_values(by='y_prob', ascending=False)
    # Ensure y_true sums are not zero before division
    sum_true = df['y_true'].sum()
    sum_false = len(df) - sum_true
    if sum_true == 0 or sum_false == 0:
        return 0.0 # KS is 0 if one class is missing
    df['cumulative_true'] = df['y_true'].cumsum() / sum_true
    df['cumulative_false'] = (1 - df['y_true']).cumsum() / sum_false
    ks = max(abs(df['cumulative_true'] - df['cumulative_false']))
    return ks

def find_optimal_threshold_j_statistic(y_true, y_prob_oof):
    """Finds the optimal threshold maximizing Youden's J statistic (Sensitivity + Specificity - 1)."""
    fpr, tpr, thresholds = roc_curve(y_true, y_prob_oof)
     # Handle cases where thresholds might not be strictly decreasing
    valid_indices = np.where(np.isfinite(thresholds))[0]
    if len(valid_indices) == 0:
        print("Warning: No valid thresholds found for J-statistic calculation.")
        return 0.5 # Default fallback
    fpr, tpr, thresholds = fpr[valid_indices], tpr[valid_indices], thresholds[valid_indices]

    if len(thresholds) == 0:
         print("Warning: Threshold array is empty after filtering.")
         return 0.5

    j_statistic = tpr - fpr
    optimal_idx = np.argmax(j_statistic)
    optimal_threshold = thresholds[optimal_idx]
    # Ensure threshold is within [0, 1] bounds if necessary due to floating point issues
    optimal_threshold = max(0.0, min(1.0, optimal_threshold))
    print(f"Optimal threshold based on Youden's J-Statistic (OOF): {optimal_threshold:.4f}")
    return optimal_threshold

def evaluate_model(y_true, y_pred_proba, y_pred_binary, model_name="Model"):
    """Calculates and prints standard classification metrics."""
    # Add epsilon to probabilities for log_loss if necessary
    eps = 1e-15
    y_pred_proba = np.clip(y_pred_proba, eps, 1 - eps)

    auc_roc = roc_auc_score(y_true, y_pred_proba)
    gini = 2 * auc_roc - 1
    ks = calculate_ks(y_true, y_pred_proba)
    accuracy = accuracy_score(y_true, y_pred_binary)
    precision = precision_score(y_true, y_pred_binary, zero_division=0)
    recall = recall_score(y_true, y_pred_binary, zero_division=0)
    f1 = f1_score(y_true, y_pred_binary, zero_division=0)
    brier = brier_score_loss(y_true, y_pred_proba)
    logloss = log_loss(y_true, y_pred_proba)
    cm = confusion_matrix(y_true, y_pred_binary)

    print(f"\n--- Evaluation Metrics for {model_name} ---")
    print(f"AUC ROC:        {auc_roc:.4f}")
    print(f"Gini Coefficient: {gini:.4f}")
    print(f"KS Statistic:   {ks:.4f}")
    print(f"Accuracy:       {accuracy:.4f}")
    print(f"Precision:      {precision:.4f}")
    print(f"Recall (TPR):   {recall:.4f}")
    print(f"F1-Score:       {f1:.4f}")
    print(f"Brier Score:    {brier:.4f}")
    print(f"Log Loss:       {logloss:.4f}")
    print("\nConfusion Matrix:")
    print(cm)

    results = {
        'Model': model_name,
        'AUC': auc_roc, 'Gini': gini, 'KS': ks, 'Accuracy': accuracy,
        'Precision': precision, 'Recall': recall, 'F1': f1,
        'Brier': brier, 'LogLoss': logloss
    }
    return results

def plot_roc_curve(y_true, y_prob, model_name):
    """Plots the ROC curve."""
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    auc_roc = roc_auc_score(y_true, y_prob)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {auc_roc:.4f})')
    plt.plot([0, 1], [0, 1], 'k--') # Diagonal line
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{model_name} - ROC Curve')
    plt.legend()
    plt.grid(True)
    # Save the plot
    plot_filename = f"roc_curve_{model_name.replace(' ', '_')}.png"
    plt.savefig(plot_filename)
    print(f"ROC curve saved to {plot_filename}")
    plt.show()

## 1.2 Configurations

In [29]:
# --- Configuration ---
DATA_PATH = '../data/processed/'
MODEL_OUTPUT_PATH = './tabnet_outputs/' # Directory to save model/results
os.makedirs(MODEL_OUTPUT_PATH, exist_ok=True)
SEED = 42
N_SPLITS = 5 # Number of folds for Cross-Validation
SMOTE_STRATEGY = 0.5 # Ratio after resampling
TARGET = 'TARGET'
ID_COL = 'SK_ID_CURR'

# TabNet Specific Config
# These are EXAMPLE parameters, tuning is recommended
TABNET_PARAMS = dict(
    # Network architecture
    n_d=64,              # Increase from 50 to capture more complex patterns
    n_a=64,              # Match n_d for balanced attention mechanism
    n_steps=7,           # Increase from 3 for deeper feature processing
    n_independent=2,     # Add independent layer count (wasn't in original)
    n_shared=2,          # Add shared layer count (wasn't in original)
    
    # Regularization
    gamma=1.3,           # Slightly increased feature reuse penalty
    lambda_sparse=5e-4,  # Increased sparsity for better feature selection
    
    # Optimizer settings
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=5e-3),  # Lower learning rate
    
    # Learning rate scheduler
    scheduler_params=dict(
        mode="min", 
        patience=10,      # Increased patience 
        min_lr=1e-5, 
        factor=0.5
    ),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    
    mask_type='sparsemax',
    verbose=1,
    seed=42
)

# Training Config
MAX_EPOCHS = 200         # Increased to allow more training time
PATIENCE = 20            # Increased early stopping patience
BATCH_SIZE = 4096*2        # Larger batch size for efficiency (adjust based on GPU memory)
VIRTUAL_BATCH_SIZE = 512*2 # Increased for better batch normalization

# --- Check for GPU ---
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device_name}")
TABNET_PARAMS['device_name'] = device_name

Using device: cuda


# 2. Load Data and Preprocess

In [30]:
# --- Load Data ---
print("Loading preprocessed data...")
try:
    train_df = pd.read_csv(DATA_PATH + 'train_final.csv')
    test_df = pd.read_csv(DATA_PATH + 'test_final.csv')
    print("Data loaded successfully.")
except FileNotFoundError as e:
    print(f"Error: {e}. Make sure 'train_final.csv' and 'test_final.csv' are in {DATA_PATH}")
    exit()

# --- Prepare Data ---
y_train = train_df[TARGET].values # Use .values for numpy arrays
y_test = test_df[TARGET].values

# Drop Target and potentially ID
if ID_COL in train_df.columns:
    X_train = train_df.drop(columns=[TARGET, ID_COL])
    X_test = test_df.drop(columns=[TARGET, ID_COL])
else:
     X_train = train_df.drop(columns=[TARGET])
     X_test = test_df.drop(columns=[TARGET])

# Align columns just in case
common_cols = list(X_train.columns.intersection(X_test.columns))
X_train = X_train[common_cols]
X_test = X_test[common_cols]
feature_names = X_train.columns.tolist()


# Feature Selection using variance threshold (optional)
# --- Feature Selection: Low Variance Filter ---
print("\nApplying Low Variance Feature Selection...")
var_selector = VarianceThreshold(threshold=0.01) # Threshold=0 removes zero-variance, 0.01 removes low variance

# Fit on training data only
var_selector.fit(X_train)

# Get the boolean mask of selected features
feature_mask = var_selector.get_support()
original_feature_names = X_train.columns.tolist() # Get original names before transformation
selected_feature_names = [name for name, selected in zip(original_feature_names, feature_mask) if selected]

print(f"Original number of features: {X_train.shape[1]}")
print(f"Number of features after variance thresholding: {len(selected_feature_names)}")

# Transform both X_train and X_test
X_train_np_selected = var_selector.transform(X_train)
X_test_np_selected = var_selector.transform(X_test)

# Convert back to DataFrame with selected column names
X_train = pd.DataFrame(X_train_np_selected, columns=selected_feature_names, index=X_train.index)
X_test = pd.DataFrame(X_test_np_selected, columns=selected_feature_names, index=X_test.index)

# Update the global feature_names list
feature_names = selected_feature_names

print(f"Updated X_train shape: {X_train.shape}")
print(f"Updated X_test shape: {X_test.shape}")

# --- LIMITATION: Treat all features as numerical ---
# Ideally, identify original categorical features and pass their indices to TabNet.
# Since we are using pre-encoded data, we treat all as numerical.
print("WARNING: Treating all features as numerical for TabNet due to pre-encoded input data.")
categorical_indices = [] # No categorical indices provided
categorical_dims = [] # No specific dimensions needed if indices are empty

# Convert to numpy arrays of type float32 for PyTorch
X_train_np = X_train.replace([np.inf, -np.inf], np.nan).fillna(X_train.median()).astype(np.float32).values
X_test_np = X_test.replace([np.inf, -np.inf], np.nan).fillna(X_train.median()).astype(np.float32).values # Use train median

print(f"Prepared X_train shape: {X_train_np.shape}")
print(f"Prepared X_test shape: {X_test_np.shape}")

Loading preprocessed data...
Data loaded successfully.

Applying Low Variance Feature Selection...
Original number of features: 773
Number of features after variance thresholding: 542
Updated X_train shape: (246005, 542)
Updated X_test shape: (61502, 542)
Prepared X_train shape: (246005, 542)
Prepared X_test shape: (61502, 542)


# 3. Train and Evaluate

In [31]:
# Calculate class weights inversely proportional to class frequencies
class_counts = np.bincount(y_train)
total_samples = len(y_train)
# More aggressive weighting for minority class
class_weights = torch.tensor([1.0, (class_counts[0]/class_counts[1]) * 1.0], dtype=torch.float32)
if device_name == 'cuda':
    class_weights = class_weights.cuda()

print(f"Using class weights: {class_weights.cpu().numpy()} to emphasize minority class")

# Create weighted loss function
weighted_loss = torch.nn.CrossEntropyLoss(weight=class_weights)

Using class weights: [ 1.       11.386959] to emphasize minority class


In [36]:
# Add this before the training loop (just before or after the cell with id "fe7d1abd")

# Verify GPU is properly detected and can allocate memory
if device_name == 'cuda':
    print("===== GPU Diagnostics =====")
    # Print CUDA version
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"PyTorch Version: {torch.__version__}")
    
    # Create a sample tensor on GPU to verify memory allocation works
    test_tensor = torch.zeros((1000, 1000), device='cuda')
    allocated_memory = torch.cuda.memory_allocated() / (1024**2)
    print(f"Successfully allocated tensor on GPU, Memory used: {allocated_memory:.2f} MB")
    
    # Get device properties
    device_properties = torch.cuda.get_device_properties(0)
    print(f"GPU Name: {device_properties.name}")
    print(f"GPU Memory: {device_properties.total_memory / (1024**3):.2f} GB")
    print(f"Compute Capability: {device_properties.major}.{device_properties.minor}")
    
    # Free memory
    del test_tensor
    torch.cuda.empty_cache()
    print("Memory released")
    
    print("===========================")
else:
    print("GPU not available, running on CPU")

===== GPU Diagnostics =====
CUDA Version: 12.6
PyTorch Version: 2.6.0+cu126
Successfully allocated tensor on GPU, Memory used: 573.81 MB
GPU Name: NVIDIA GeForce RTX 3060
GPU Memory: 12.00 GB
Compute Capability: 8.6
Memory released


In [None]:
# --- Cross-Validation Loop ---
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
oof_predictions = np.zeros(X_train_np.shape[0])
test_predictions_list = []
fold_models = []
fold_results = []

print(f"\nStarting TabNet {N_SPLITS}-Fold Cross-Validation...")
start_cv_time = time.time()

for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_np, y_train)):
    print(f"\n--- Fold {fold+1}/{N_SPLITS} ---")
    fold_start_time = time.time()

    # 1. Split data for the fold
    X_train_fold, X_val_fold = X_train_np[train_idx], X_train_np[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

    # 3. Apply Scaling (Fit on Resampled Train, Transform Train & Val)
    print("Applying StandardScaler...")
    scaler = StandardScaler()
    X_train_fold_scaled = scaler.fit_transform(X_train_fold)
    X_val_fold_scaled = scaler.transform(X_val_fold) 

    # 4. Define and Train TabNet Model for the fold
    model = TabNetClassifier(**TABNET_PARAMS)

    print("Training TabNet model...")

    model.fit(
        X_train=X_train_fold_scaled, y_train=y_train_fold,
        eval_set=[(X_val_fold_scaled, y_val_fold)],
        eval_name=['validation'],
        eval_metric=['auc'], # Use AUC for early stopping metric
        max_epochs=MAX_EPOCHS,
        patience=PATIENCE,
        batch_size=BATCH_SIZE,
        virtual_batch_size=VIRTUAL_BATCH_SIZE,
        num_workers=0, # Adjust based on system
        drop_last=False, # Important for final batch
        loss_fn=weighted_loss # Use weighted loss instead of standard CrossEntropyLoss
    )

    # 5. Predict on Validation and Test Sets
    print("Predicting on validation and test sets...")
    val_preds = model.predict_proba(X_val_fold_scaled)[:, 1]
    # Scale the full test set using the scaler fitted for this fold
    X_test_scaled = scaler.transform(X_test_np)
    test_preds = model.predict_proba(X_test_scaled)[:, 1]

    # 6. Store Predictions
    oof_predictions[val_idx] = val_preds
    test_predictions_list.append(test_preds)
    fold_models.append(model) # Store the model if needed

    # 7. Evaluate Fold (optional)
    fold_auc = roc_auc_score(y_val_fold, val_preds)
    print(f"Fold {fold+1} Validation AUC: {fold_auc:.4f}")
    fold_results.append({'Fold': fold+1, 'Validation AUC': fold_auc})

    fold_end_time = time.time()
    print(f"Fold {fold+1} completed in {(fold_end_time - fold_start_time):.2f} seconds.")

end_cv_time = time.time()
print(f"\nCross-Validation finished in {(end_cv_time - start_cv_time)/60:.2f} minutes.")


Starting TabNet 5-Fold Cross-Validation...

--- Fold 1/5 ---
Applying StandardScaler...
Moving data to GPU...
GPU Memory in use after data transfer: 1.0559 GB




Model device check - will use GPU: True
Training TabNet model...


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x0000024E83BC37E0>
Traceback (most recent call last):
  File "c:\Users\ORLab\main_source\CreditRiskProject\venv\Lib\site-packages\torch\utils\data\dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "c:\Users\ORLab\main_source\CreditRiskProject\venv\Lib\site-packages\torch\utils\data\dataloader.py", line 1576, in _shutdown_workers
    if self._persistent_workers or self._workers_status[worker_id]:
                                   ^^^^^^^^^^^^^^^^^^^^
AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status'


epoch 0  | loss: 1.08314 | validation_auc: 0.52639 |  0:00:04s
epoch 1  | loss: 0.85214 | validation_auc: 0.58793 |  0:00:08s
epoch 2  | loss: 0.77767 | validation_auc: 0.63648 |  0:00:13s
epoch 3  | loss: 0.70323 | validation_auc: 0.67678 |  0:00:17s
epoch 4  | loss: 0.67241 | validation_auc: 0.69399 |  0:00:22s
epoch 5  | loss: 0.65604 | validation_auc: 0.69298 |  0:00:26s
epoch 6  | loss: 0.64267 | validation_auc: 0.69778 |  0:00:31s
epoch 7  | loss: 0.64687 | validation_auc: 0.70464 |  0:00:35s
epoch 8  | loss: 0.63256 | validation_auc: 0.70818 |  0:00:40s
epoch 9  | loss: 0.62917 | validation_auc: 0.7136  |  0:00:44s
epoch 10 | loss: 0.62813 | validation_auc: 0.70638 |  0:00:48s
epoch 11 | loss: 0.63655 | validation_auc: 0.71648 |  0:00:53s
epoch 12 | loss: 0.62619 | validation_auc: 0.71478 |  0:00:57s
epoch 13 | loss: 0.6251  | validation_auc: 0.7165  |  0:01:02s
epoch 14 | loss: 0.62674 | validation_auc: 0.71918 |  0:01:06s
epoch 15 | loss: 0.61719 | validation_auc: 0.71866 |  0

KeyboardInterrupt: 

In [None]:
# --- Aggregate and Evaluate ---
print("\n" + "="*60)
print("Final Evaluation")
print("="*60)

# Average test predictions across folds
final_test_predictions = np.mean(test_predictions_list, axis=0)

# Evaluate OOF predictions
oof_auc = roc_auc_score(y_train, oof_predictions)
print(f"Overall OOF AUC: {oof_auc:.4f}")

# Find optimal threshold using OOF predictions
optimal_threshold = find_optimal_threshold_j_statistic(y_train, oof_predictions)

# Evaluate final test predictions using the optimal threshold
final_test_predictions_binary = (final_test_predictions >= optimal_threshold).astype(int)
final_results = evaluate_model(y_test, final_test_predictions, final_test_predictions_binary, "TabNet (Tuned CV)")

# Plot ROC curve for the averaged test predictions
plot_roc_curve(y_test, final_test_predictions, "TabNet (Tuned CV)")

# 4. Parameter Tuning

In [None]:
# # --- Optuna Hyperparameter Optimization for TabNet ---
# print("\n--- Optuna Optimization for TabNet ---")
# import optuna
# from optuna.pruners import MedianPruner

# # Define the objective function for Optuna
# def objective_tabnet(trial):
#     # Define hyperparameters to tune
#     n_d = trial.suggest_int('n_d', 8, 128)
#     n_a = trial.suggest_int('n_a', 8, 128)
#     n_steps = trial.suggest_int('n_steps', 3, 10)
#     n_independent = trial.suggest_int('n_independent', 1, 5)
#     n_shared = trial.suggest_int('n_shared', 1, 5)
#     gamma = trial.suggest_float('gamma', 1.0, 2.0)
#     lambda_sparse = trial.suggest_float('lambda_sparse', 1e-6, 1e-1, log=True)
#     learning_rate = trial.suggest_float('learning_rate', 5e-4, 2e-2, log=True)
#     batch_size = trial.suggest_categorical('batch_size', [1024, 2048, 4096, 8192])
#     virtual_batch_size = trial.suggest_int('virtual_batch_size', 128, 1024, log=True)
    
#     # Create the TabNet model with suggested parameters
#     params = dict(
#         n_d=n_d, 
#         n_a=n_a,
#         n_steps=n_steps,
#         n_independent=n_independent,
#         n_shared=n_shared,
#         gamma=gamma,
#         lambda_sparse=lambda_sparse,
#         optimizer_fn=torch.optim.Adam,
#         optimizer_params=dict(lr=learning_rate),
#         scheduler_params=dict(mode="min", patience=7, min_lr=1e-5, factor=0.5),
#         scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
#         mask_type='sparsemax',
#         verbose=0,  # Reduced verbosity for optimization
#         seed=SEED,
#         device_name=device_name
#     )
    
#     # Cross-validation setup
#     cv_scores = []
#     fold_models = []
    
#     for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_np, y_train)):
#         X_train_fold, X_val_fold = X_train_np[train_idx], X_train_np[val_idx]
#         y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
        
#         # Apply scaling
#         scaler = StandardScaler()
#         X_train_fold_scaled = scaler.fit_transform(X_train_fold)
#         X_val_fold_scaled = scaler.transform(X_val_fold)
        
#         # Train TabNet model
#         model = TabNetClassifier(**params)
        
#         model.fit(
#             X_train=X_train_fold_scaled, 
#             y_train=y_train_fold,
#             eval_set=[(X_val_fold_scaled, y_val_fold)],
#             eval_name=['validation'],
#             eval_metric=['auc'],
#             max_epochs=MAX_EPOCHS,
#             patience=PATIENCE,
#             batch_size=batch_size,
#             virtual_batch_size=virtual_batch_size,
#             num_workers=0
#         )
        
#         # Get validation AUC
#         val_preds = model.predict_proba(X_val_fold_scaled)[:, 1]
#         fold_auc = roc_auc_score(y_val_fold, val_preds)
#         cv_scores.append(fold_auc)
        
#         # Report intermediate value for pruning
#         trial.report(fold_auc, fold)
        
#         # Pruning check
#         if trial.should_prune():
#             raise optuna.TrialPruned()
    
#     # Return mean CV score
#     return np.mean(cv_scores)

# # Create Optuna study with pruning
# OPTUNA_N_TRIALS = 20  # Adjust as needed based on your computational resources
# storage_path = os.path.join(MODEL_OUTPUT_PATH, "tabnet_optuna_studies.db")
# study = optuna.create_study(
#     direction='maximize',
#     study_name='tabnet_optimization',
#     storage=f'sqlite:///{storage_path}',
#     load_if_exists=True,
#     pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=2)
# )

# # Run optimization
# print(f"Starting Optuna optimization for TabNet with {OPTUNA_N_TRIALS} trials...")
# start_time = time.time()
# study.optimize(objective_tabnet, n_trials=OPTUNA_N_TRIALS, n_jobs=1)
# end_time = time.time()
# print(f"Optuna optimization completed in {(end_time - start_time)/60:.2f} minutes")

# # Get best parameters
# best_params = study.best_params
# print(f"\nBest Params (TabNet): {best_params}")
# print(f"Best CV AUC score: {study.best_value:.4f}")

# # Create final model with best parameters
# final_tabnet_params = dict(
#     n_d=best_params['n_d'],
#     n_a=best_params['n_a'],
#     n_steps=best_params['n_steps'],
#     n_independent=best_params['n_independent'],
#     n_shared=best_params['n_shared'],
#     gamma=best_params['gamma'],
#     lambda_sparse=best_params['lambda_sparse'],
#     optimizer_fn=torch.optim.Adam,
#     optimizer_params=dict(lr=best_params['learning_rate']),
#     scheduler_params=dict(mode="min", patience=7, min_lr=1e-5, factor=0.5),
#     scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
#     mask_type='sparsemax',
#     verbose=1,
#     seed=SEED,
#     device_name=device_name
# )

In [None]:
# # Re-train with the best parameters on the full training set
# print("\nRetraining TabNet with best parameters on the full training set...")

# # Apply scaling to the full training data
# scaler = StandardScaler()
# X_train_np_scaled = scaler.fit_transform(X_train_np)
# X_test_np_scaled = scaler.transform(X_test_np)

# # Split a portion for validation to monitor during training
# X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
#     X_train_np_scaled, y_train, test_size=0.2, random_state=SEED)

# final_model = TabNetClassifier(**final_tabnet_params)
# final_model.fit(
#     X_train=X_train_final, 
#     y_train=y_train_final,
#     eval_set=[(X_val_final, y_val_final)],
#     eval_name=['validation'],
#     eval_metric=['auc'],
#     max_epochs=MAX_EPOCHS,
#     patience=PATIENCE,
#     batch_size=best_params.get('batch_size', BATCH_SIZE),
#     virtual_batch_size=best_params.get('virtual_batch_size', VIRTUAL_BATCH_SIZE),
#     num_workers=0
# )

# 5. Feature Importance

In [None]:
# --- Feature Importance ---
# TabNet provides feature importance based on the masks used in its attention mechanism
print("\n--- TabNet Feature Importances (from last fold model) ---")
try:
    # Importance shape is (n_features,)
    importances = fold_models[-1].feature_importances_
    feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
    feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False).reset_index(drop=True)
    display(feature_importance_df.head(50)) # Show top 50
    # Save importances
    importance_filename = os.path.join(MODEL_OUTPUT_PATH, "tabnet_feature_importances.csv")
    feature_importance_df.to_csv(importance_filename, index=False)
    print(f"Feature importances saved to {importance_filename}")
except Exception as e:
    print(f"Could not get or save feature importances: {e}")

In [None]:
# --- Save Results ---
print("\nSaving results...")
results_summary = pd.DataFrame([final_results]).set_index('Model')
summary_filename = os.path.join(MODEL_OUTPUT_PATH, "tabnet_evaluation_summary.csv")
results_summary.to_csv(summary_filename)
print(f"Evaluation summary saved to {summary_filename}")

# Save OOF and Test predictions
oof_df = pd.DataFrame({'SK_ID_CURR': train_df.index, 'oof_pred_proba': oof_predictions}) # Assuming train_df index maps correctly
oof_filename = os.path.join(MODEL_OUTPUT_PATH, "tabnet_oof_predictions.csv")
oof_df.to_csv(oof_filename, index=False)
print(f"OOF predictions saved to {oof_filename}")

test_pred_df = pd.DataFrame({'SK_ID_CURR': test_df.index, 'test_pred_proba': final_test_predictions}) # Assuming test_df index maps correctly
test_pred_filename = os.path.join(MODEL_OUTPUT_PATH, "tabnet_test_predictions.csv")
test_pred_df.to_csv(test_pred_filename, index=False)
print(f"Test predictions saved to {test_pred_filename}")

# Optionally save one of the trained models (e.g., the last fold's)
# Note: Saving/loading TabNet models might require saving the associated zip file.
model_save_path = os.path.join(MODEL_OUTPUT_PATH, "tabnet_model_last_fold")
saved_path = fold_models[-1].save_model(model_save_path)
print(f"Last fold TabNet model saved to path: {saved_path}")

print("\nTabNet script finished.")