In [None]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline as SkPipeline

import torch
import torch.nn as nn
from sklearn.utils.class_weight import compute_class_weight

from pytorch_tabular.models import TabNetModelConfig
from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig
from pytorch_tabular import TabularModel

In [171]:
# Use the GPU
if torch.backends.mps.is_available():
    print("MPS device is available.")
    device = torch.device("mps")
elif torch.cuda.is_available():
    print("CUDA device is available.")
    device = torch.device("cuda")
else:
    print("No GPU acceleration available.")
    device = torch.device("cpu")

# Fix the seed to have deterministic behaviour
def fix_random(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True  # slower

SEED = 1337
fix_random(SEED)

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

DATASET_PATH = "dataset_train/dataset.csv"
dataset = pd.read_csv(DATASET_PATH, delimiter=",")

print(f"Shape of the dataset: {dataset.shape}")
duplicates = dataset[dataset.duplicated()]
print(f"Number of duplicates in the dataset: {duplicates.shape[0]}")

MPS device is available.
Shape of the dataset: (148301, 145)
Number of duplicates in the dataset: 0


In [None]:
X = dataset.drop(columns=["grade"])
y = dataset["grade"].map({"A": 6, "B": 5, "C": 4, "D": 3, "E": 2, "F": 1, "G": 0})

# 1. First Split: Separate out the final Hold-out Test set (e.g., 20%)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y
)

# 2. Second Split: Separate Train from Validation (e.g., 10% of total, or 12.5% of the temp data)
# This ensures Validation data is never seen by the "fit" method
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.125, stratify=y_temp
)
# Resulting Ratios roughly: Train (70%), Val (10%), Test (20%)

In [173]:
class NumericExtractor(BaseEstimator, TransformerMixin):
    """Extracts integers from strings using regex"""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for col in X.columns:
            X[col] = X[col].astype(str).str.extract(r"(\d+)").astype(float)
        return X
    
    def set_output(self, *, transform=None):
        # We ignore the 'transform' argument because this class 
        # always returns a DataFrame (pandas) by design.
        return self

class CyclicalDateEncoder(BaseEstimator, TransformerMixin):
    """Converts mm-yyyy to year + sine/cosine month encoding."""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for col in X.columns:
            # errors="coerce" turns unparseable data/NaNs into NaT
            date_series = pd.to_datetime(X[col], format="%b-%Y", errors="coerce")
            # If date is NaT, these become NaN, which we handle in the pipeline later
            angle = 2 * np.pi * date_series.dt.month / 12

            X[f"{col}_year"] = date_series.dt.year
            X[f"{col}_month_sin"] = np.sin(angle)
            X[f"{col}_month_cos"] = np.cos(angle)
            
            X.drop(columns=[col], inplace=True)
        return X
    
    def set_output(self, *, transform=None):
        # We ignore the 'transform' argument because this class 
        # always returns a DataFrame (pandas) by design.
        return self
    
class BinaryModeEncoder(BaseEstimator, TransformerMixin):
    """"Encodes 0 if value is mode, 1 if not"""
    def __init__(self):
        self.modes_ = {}

    def fit(self, X, y=None):
        # Calculate mode for each column and store it
        for col in X.columns:
            self.modes_[col] = X[col].mode()[0]
        return self

    def transform(self, X):
        X_copy = X.copy()
        for col, mode in self.modes_.items():
            # Apply: 1 if NOT the mode (least frequent), 0 if mode
            X_copy[col] = (X_copy[col] != mode).astype(int)
        return X_copy
    
    def set_output(self, *, transform=None):
        # We ignore the 'transform' argument because this class 
        # always returns a DataFrame (pandas) by design.
        return self
    
class HighMissingDropper(BaseEstimator, TransformerMixin):
    """Drops columns with high missing percentage. Fits only on training data."""
    
    def __init__(self, threshold=20):
        self.threshold = threshold
        self.cols_to_drop_ = []

    def fit(self, X, y=None):
        missing_percentages = X.isna().mean() * 100
        self.cols_to_drop_ = missing_percentages[missing_percentages > self.threshold].index.tolist()
        return self

    def transform(self, X):
        X = X.copy()
        return X.drop(columns=self.cols_to_drop_)
    
    def set_output(self, *, transform=None):
        # We ignore the 'transform' argument because this class 
        # always returns a DataFrame (pandas) by design.
        return self

In [174]:
redundant_cols = ["loan_title", "borrower_address_state"]

binary_cols = ["loan_payment_plan_flag", "listing_initial_status", "application_type_label",
               "hardship_flag_indicator", "disbursement_method_type", "debt_settlement_flag_indicator"]
extract_fields = ["loan_contract_term_months", "borrower_profile_employment_length"]
date_fields = ["loan_issue_date", "credit_history_earliest_line", "last_payment_date", "last_credit_pull_date"]

# instead of one hot + embedding we just embed
one_hot_encoding_cols = ["borrower_housing_ownership_status", "borrower_income_verification_status",
                       "loan_status_current_code", "loan_purpose_category"]
embed_columns = ["borrower_address_zip"] + one_hot_encoding_cols

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer

numeric_pipe = SkPipeline([
    ('extract', NumericExtractor()),
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

date_pipe = SkPipeline([
    ('cyclical', CyclicalDateEncoder()),
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()) # scaling is needed for the year, not for sin/cos
])

binary_pipe = SkPipeline([
    ('binary_enc', BinaryModeEncoder()), 
    ('impute', SimpleImputer(strategy='most_frequent'))
])

# remainder columns are numerical
remainder_pipe = SkPipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('num_pipe', numeric_pipe, extract_fields),
    ('date_pipe', date_pipe, date_fields),
    ('bin_pipe', binary_pipe, binary_cols),
    ('drop_redundant', 'drop', redundant_cols),
    ],
    remainder=remainder_pipe,
    verbose_feature_names_out=False
)

numerical_pipeline = SkPipeline([
    ('dropper', HighMissingDropper(threshold=20)),
    ('prep', preprocessor),
])

numerical_pipeline.set_output(transform="pandas")

numerical_columns = [c for c in X_train.columns if c not in embed_columns]

X_numerical_train = numerical_pipeline.fit_transform(X_train[numerical_columns], y_train)
X_numerical_val = numerical_pipeline.transform(X_val[numerical_columns]) 
X_numerical_test = numerical_pipeline.transform(X_test[numerical_columns])

print(type(X_numerical_train))
print(X_numerical_train.shape)

# join all in train_df
train_df = pd.concat([X_numerical_train, X_train[embed_columns]], axis=1)
train_df['grade'] = y_train

val_df = pd.concat([X_numerical_val, X_val[embed_columns]], axis=1)
val_df['grade'] = y_val

test_df = pd.concat([X_numerical_test, X_test[embed_columns]], axis=1)
test_df['grade'] = y_test

# class weights
classes = np.unique(train_df["grade"])
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_df["grade"])
weight_tensor = torch.tensor(class_weights, dtype=torch.float32)
weighted_loss = nn.CrossEntropyLoss(weight=weight_tensor)

num_classes = len(classes)

<class 'pandas.core.frame.DataFrame'>
(103810, 89)


In [176]:
categorical_cols = embed_columns

# Continuous columns are everything else in the processed df (excluding target and categoricals)
target_col = "grade"
continuous_cols = [
    c for c in train_df.columns 
    if c not in categorical_cols and c != target_col
]

In [177]:
# Scenario 1: The "Small & Speedy" Model
space_small = {
    "model_config__n_d": 8,
    "model_config__n_a": 8,
    "model_config__n_steps": 3,
    "model_config__gamma": 1.2,
}

# Scenario 2: The "Medium & Balanced" Model
space_medium = {
    "model_config__n_d": 16,
    "model_config__n_a": 16,
    "model_config__n_steps": 5,
    "model_config__gamma": 1.3,
}

# Scenario 3: The "Large & Deep" Model
space_large = {
    "model_config__n_d": 32,
    "model_config__n_a": 32,
    "model_config__n_steps": 7,
    "model_config__gamma": 1.5,
}

search_space = [space_small, space_medium, space_large]

In [None]:
EPOCHS = 200

results_log = []
backup_file = "tabnet_grid_search_running.csv"

for i, space in enumerate(search_space):
    print(f"Training model config {i+1}/{len(search_space)}:")
    print(space)
    
    n_d = space["model_config__n_d"]
    n_a = space["model_config__n_a"]
    n_steps = space["model_config__n_steps"]
    gamma = space["model_config__gamma"]

    data_config = DataConfig(
        target=["grade"], 
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        normalize_continuous_features=False,
    )

    trainer_config = TrainerConfig(
        batch_size=512,
        max_epochs=EPOCHS,
        early_stopping="valid_loss",
        early_stopping_patience=15,
        accelerator="auto",
    )

    model_config = TabNetModelConfig(
        task="classification",
        metrics=['accuracy'], 
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=gamma
    )

    optimizer_config = OptimizerConfig(
        optimizer="AdamW",
        lr_scheduler="ReduceLROnPlateau",
        lr_scheduler_params={"mode": "min", "factor": 0.1, "patience": 10, "min_lr": 1e-5}
    )

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    
    tabular_model.fit(train=train_df, validation=val_df, loss=weighted_loss)

    pred_df = tabular_model.predict(test_df)
    
    y_true = test_df["grade"]
    
    y_pred = pred_df["grade_prediction"] 
    
    manual_acc = accuracy_score(y_true, y_pred)
    manual_bacc = balanced_accuracy_score(y_true, y_pred)
    manual_f1 = f1_score(y_true, y_pred, average="weighted") 

    run_metrics = {
        "config_id": i,
        "n_d": n_d,
        "n_a": n_a,
        "n_steps": n_steps,
        "gamma": gamma,
        "test_acc": manual_acc,
        "test_bacc": manual_bacc,
        "test_f1": manual_f1,
    }
    results_log.append(run_metrics)
    print(f"Config {i} Result: {run_metrics}")
    
    pd.DataFrame(results_log).to_csv(backup_file, index=False)
    print(f"-> Saved progress to {backup_file}")
    print("-" * 40)

summary_df = pd.DataFrame(results_log)
final_filename = "tabnet_grid_search_final.csv"
summary_df.to_csv(final_filename, index=False)

print("Final Comparison:")
print(summary_df.sort_values(by="test_bacc", ascending=False))
print(f"Results saved to {final_filename}")

2026-02-02 00:29:27,192 - {pytorch_tabular.tabular_model:145} - INFO - Experiment Tracking is turned off
Seed set to 42
2026-02-02 00:29:27,204 - {pytorch_tabular.tabular_model:547} - INFO - Preparing the DataLoaders


Training model config 1/3:
{'model_config__n_d': 8, 'model_config__n_a': 8, 'model_config__n_steps': 3, 'model_config__gamma': 1.2}


2026-02-02 00:29:27,308 - {pytorch_tabular.tabular_datamodule:527} - INFO - Setting up the datamodule for classification task
2026-02-02 00:29:27,478 - {pytorch_tabular.tabular_model:598} - INFO - Preparing the Model: TabNetModel
2026-02-02 00:29:27,675 - {pytorch_tabular.tabular_model:341} - INFO - Preparing the Trainer
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
2026-02-02 00:29:27,691 - {pytorch_tabular.tabular_model:677} - INFO - Training Started
/Users/geko/unibo/data_analytics/project/.venv/lib/python3.13/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:881: Checkpoint directory /Users/geko/unibo/data_analytics/project/saved_models exists and is not empty.


Output()

`Trainer.fit` stopped: `max_epochs=1` reached.


2026-02-02 00:29:45,959 - {pytorch_tabular.tabular_model:690} - INFO - Training the model completed
2026-02-02 00:29:45,959 - {pytorch_tabular.tabular_model:1531} - INFO - Loading the best model
2026-02-02 00:29:46,942 - {pytorch_tabular.tabular_model:145} - INFO - Experiment Tracking is turned off
Seed set to 42
2026-02-02 00:29:46,953 - {pytorch_tabular.tabular_model:547} - INFO - Preparing the DataLoaders
2026-02-02 00:29:47,020 - {pytorch_tabular.tabular_datamodule:527} - INFO - Setting up the datamodule for classification task


Config 0 Result: {'config_id': 0, 'n_d': 8, 'n_a': 8, 'n_steps': 3, 'gamma': 1.2, 'test_acc': 0.17015609723205555, 'test_bacc': 0.17575241386113113, 'test_f1': 0.17149887123141289}
-> Saved progress to tabnet_grid_search_running.csv
----------------------------------------
Training model config 2/3:
{'model_config__n_d': 16, 'model_config__n_a': 16, 'model_config__n_steps': 5, 'model_config__gamma': 1.3}


2026-02-02 00:29:47,195 - {pytorch_tabular.tabular_model:598} - INFO - Preparing the Model: TabNetModel
2026-02-02 00:29:47,400 - {pytorch_tabular.tabular_model:341} - INFO - Preparing the Trainer
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
2026-02-02 00:29:47,415 - {pytorch_tabular.tabular_model:677} - INFO - Training Started
/Users/geko/unibo/data_analytics/project/.venv/lib/python3.13/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:881: Checkpoint directory /Users/geko/unibo/data_analytics/project/saved_models exists and is not empty.


Output()

`Trainer.fit` stopped: `max_epochs=1` reached.


2026-02-02 00:30:12,173 - {pytorch_tabular.tabular_model:690} - INFO - Training the model completed
2026-02-02 00:30:12,173 - {pytorch_tabular.tabular_model:1531} - INFO - Loading the best model
2026-02-02 00:30:13,954 - {pytorch_tabular.tabular_model:145} - INFO - Experiment Tracking is turned off
Seed set to 42
2026-02-02 00:30:13,964 - {pytorch_tabular.tabular_model:547} - INFO - Preparing the DataLoaders
2026-02-02 00:30:14,045 - {pytorch_tabular.tabular_datamodule:527} - INFO - Setting up the datamodule for classification task


Config 1 Result: {'config_id': 1, 'n_d': 16, 'n_a': 16, 'n_steps': 5, 'gamma': 1.3, 'test_acc': 0.15828866187923535, 'test_bacc': 0.156258368963933, 'test_f1': 0.16936865283433108}
-> Saved progress to tabnet_grid_search_running.csv
----------------------------------------
Training model config 3/3:
{'model_config__n_d': 32, 'model_config__n_a': 32, 'model_config__n_steps': 7, 'model_config__gamma': 1.5}


2026-02-02 00:30:14,267 - {pytorch_tabular.tabular_model:598} - INFO - Preparing the Model: TabNetModel
2026-02-02 00:30:14,522 - {pytorch_tabular.tabular_model:341} - INFO - Preparing the Trainer
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
2026-02-02 00:30:14,538 - {pytorch_tabular.tabular_model:677} - INFO - Training Started
/Users/geko/unibo/data_analytics/project/.venv/lib/python3.13/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:881: Checkpoint directory /Users/geko/unibo/data_analytics/project/saved_models exists and is not empty.


Output()

`Trainer.fit` stopped: `max_epochs=1` reached.


2026-02-02 00:30:44,940 - {pytorch_tabular.tabular_model:690} - INFO - Training the model completed
2026-02-02 00:30:44,940 - {pytorch_tabular.tabular_model:1531} - INFO - Loading the best model


Config 2 Result: {'config_id': 2, 'n_d': 32, 'n_a': 32, 'n_steps': 7, 'gamma': 1.5, 'test_acc': 0.139813222750413, 'test_bacc': 0.15536913096043328, 'test_f1': 0.15313400298804045}
-> Saved progress to tabnet_grid_search_running.csv
----------------------------------------
Final Comparison:
   config_id  n_d  n_a  n_steps  gamma  test_acc  test_bacc   test_f1
0          0    8    8        3    1.2  0.170156   0.175752  0.171499
1          1   16   16        5    1.3  0.158289   0.156258  0.169369
2          2   32   32        7    1.5  0.139813   0.155369  0.153134
Results saved to tabnet_grid_search_final.csv
