In [34]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader, TensorDataset

from pytorch_tabular.models import TabNetModelConfig
from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig, ModelConfig
from pytorch_tabular import TabularModel

# load the dataset and fix values

In [35]:
# Use the GPU
if torch.backends.mps.is_available():
    print("MPS device is available.")
    device = torch.device("mps")
elif torch.cuda.is_available():
    print("CUDA device is available.")
    device = torch.device("cuda")
else:
    print("No GPU acceleration available.")
    device = torch.device("cpu")

# Fix the seed to have deterministic behaviour
def fix_random(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True  # slower

SEED = 1337
fix_random(SEED)

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

DATASET_PATH = "dataset_train/dataset.csv"
dataset = pd.read_csv(DATASET_PATH, delimiter=",")

print(f"Shape of the dataset: {dataset.shape}")
duplicates = dataset[dataset.duplicated()]
print(f"Number of duplicates in the dataset: {duplicates.shape[0]}")

MPS device is available.
Shape of the dataset: (148301, 145)
Number of duplicates in the dataset: 0


## split the dataset

In [36]:
COLUMNS_TO_DROP_PERCENT = 20

# Show the percentage of missing values
missing_percentages = dataset.isna().mean() * 100
cols_to_drop = missing_percentages[missing_percentages > COLUMNS_TO_DROP_PERCENT]
# drop columns with more than 20% of NaNs. We go from 145 to 89 features
print(f"Shape before dropping columns: {dataset.shape}")
dataset.drop(columns=cols_to_drop.index, inplace=True)
# Drop also loan_title since it's redundant with loan_purpose_category
dataset.drop(columns="loan_title", inplace=True)
print(f"Shape after dropping columns: {dataset.shape}")

for col in dataset.select_dtypes(include="number").columns:
    dataset[col] = dataset[col].fillna(dataset[col].median())

for col in dataset.select_dtypes(include=["object", "category"]).columns:
    dataset[col] = dataset[col].fillna("MISSING")

Shape before dropping columns: (148301, 145)
Shape after dropping columns: (148301, 88)


In [37]:
X = dataset.drop(columns=["grade"])
y = dataset["grade"].map({"A": 6, "B": 5, "C": 4, "D": 3, "E": 2, "F": 1, "G": 0})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [38]:
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns:\n{categorical_cols}")
numerical_cols = X.select_dtypes(include=['number']).columns.tolist()
print(f"Numerical columns:\n{numerical_cols}")

train_df = X_train.copy()
train_df['grade'] = y_train

test_df = X_test.copy()
test_df['grade'] = y_test

Categorical columns:
['loan_contract_term_months', 'borrower_profile_employment_length', 'borrower_housing_ownership_status', 'borrower_income_verification_status', 'loan_issue_date', 'loan_status_current_code', 'loan_payment_plan_flag', 'loan_purpose_category', 'borrower_address_zip', 'borrower_address_state', 'credit_history_earliest_line', 'listing_initial_status', 'last_payment_date', 'last_credit_pull_date', 'application_type_label', 'hardship_flag_indicator', 'disbursement_method_type', 'debt_settlement_flag_indicator']
Numerical columns:
['loan_contract_approved_amount', 'loan_portfolio_total_funded', 'investor_side_funded_amount', 'loan_contract_interest_rate', 'loan_payment_installments_count', 'borrower_income_annual', 'borrower_dti_ratio', 'credit_delinquencies_2yrs', 'fico_score_low_bound', 'fico_score_high_bound', 'credit_inquiries_6m', 'credit_open_accounts', 'credit_public_records', 'revolving_balance', 'revolving_utilization', 'credit_total_accounts', 'outstanding_princ

```python
param_dist_tabnet = {
    'n_d': [16, 32, 64], 
    'n_a': [16, 32, 64],
    'gamma': [1.0, 1.2, 1.5],
    'n_steps': [3, 5], 
    'lambda_sparse': [1e-3, 1e-4]
}
```

In [None]:

# 1. Data Config
data_config = DataConfig(
    target=["grade"], 
    continuous_cols=numerical_cols,
    categorical_cols=categorical_cols,
    normalize_continuous_features=True
)

# 2. Trainer Config
trainer_config = TrainerConfig(
    batch_size=512,
    max_epochs=50,
    early_stopping_patience=5,
    accelerator="auto", # auto Uses GPU if available
)

# 3. Model Config (Using a standard Category Embedding Model)
model_config = TabNetModelConfig(
    task="classification",
    metrics=["accuracy", "f1_score"]
)

# 4. Initialize the Tabular Model
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=OptimizerConfig(),
    trainer_config=trainer_config,
)

2026-01-30 15:59:58,645 - {pytorch_tabular.tabular_model:145} - INFO - Experiment Tracking is turned off


In [40]:
# Train
tabular_model.fit(train=train_df, validation=test_df)

# Evaluate
result = tabular_model.evaluate(test_df)
print(result)

Seed set to 42
2026-01-30 15:59:58,666 - {pytorch_tabular.tabular_model:547} - INFO - Preparing the DataLoaders
2026-01-30 15:59:58,822 - {pytorch_tabular.tabular_datamodule:527} - INFO - Setting up the datamodule for classification task
2026-01-30 15:59:59,346 - {pytorch_tabular.tabular_model:598} - INFO - Preparing the Model: TabNetModel
2026-01-30 15:59:59,517 - {pytorch_tabular.tabular_model:341} - INFO - Preparing the Trainer
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
/Users/geko/unibo/data_analytics/project/.venv/lib/python3.13/site-packages/pytorch_lightning/trainer/setup.py:175: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
2026-01-30 15:59:59,532 - {pytorch_tabular.tabular_model:677} - INFO - Training Started
/Users/geko/unibo/data_analytics/project/.venv/lib/python3.13/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:881: Checkpoint directory /Users/geko/unibo/data_analytics/project/saved_mo

Output()

`Trainer.fit` stopped: `max_epochs=50` reached.


2026-01-30 16:11:33,524 - {pytorch_tabular.tabular_model:690} - INFO - Training the model completed
2026-01-30 16:11:33,524 - {pytorch_tabular.tabular_model:1531} - INFO - Loading the best model


Output()

/Users/geko/unibo/data_analytics/project/.venv/lib/python3.13/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:434: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


[{'test_loss_0': 0.4214223325252533, 'test_loss': 0.4214223325252533, 'test_accuracy': 0.8274838924407959, 'test_f1_score': 0.8274838924407959}]
