In [None]:
import pandas as pd
import numpy as np
import os
import pickle
import time
import random
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from imblearn.pipeline import Pipeline as ImbPipeline # Replaces sklearn Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

from pytorch_tabular import TabularModel
from pytorch_tabular.models import TabNetModelConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
)
from pytorch_tabular.models.common.heads import LinearHeadConfig

# load the dataset and fix values

In [2]:
# Use the GPU when one is present: Apple MPS is tried first, then CUDA, then CPU.
def _pick_device():
    """Announce the first available backend and return the matching ``torch.device``."""
    if torch.backends.mps.is_available():
        print("MPS device is available.")
        return torch.device("mps")
    if torch.cuda.is_available():
        print("CUDA device is available.")
        return torch.device("cuda")
    print("No GPU acceleration available.")
    return torch.device("cpu")


device = _pick_device()

# Fix the seed to have deterministic behaviour
def fix_random(seed: int) -> None:
    """Seed the Python, NumPy and PyTorch random generators with the same value.

    Args:
        seed: Integer handed unchanged to every seeding function.

    Besides seeding, the cuDNN backend is switched to deterministic mode and its
    benchmark mode is turned off (slower, but repeatable).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True  # slower

# Seed every random generator once, before any stochastic step runs.
SEED = 1337
fix_random(SEED)

# Lift pandas' display limits so frames are shown without row/column truncation.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Read the raw training table from disk.
DATASET_PATH = "dataset_train/dataset.csv"
dataset = pd.read_csv(DATASET_PATH, sep=",")
print(f"Shape of the dataset: {dataset.shape}")

# Rows that are exact repeats of an earlier row.
duplicates = dataset.loc[dataset.duplicated()]
print(f"Number of duplicates in the dataset: {len(duplicates)}")

MPS device is available.
Shape of the dataset: (148301, 145)
Number of duplicates in the dataset: 0


## split the dataset

In [3]:
# Features: every column except the target.
X = dataset.drop("grade", axis=1)

# Target: letter grade mapped to an ordinal integer (G -> 0, F -> 1, ..., A -> 6).
y = dataset["grade"].map({letter: rank for rank, letter in enumerate("GFEDCBA")})

# 80/20 hold-out split, stratified on the encoded grade.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Column names grouped by dtype family, computed on the full frame (target included).
categorical_cols = dataset.select_dtypes(include=["object", "category"]).columns
numerical_cols = dataset.select_dtypes(include="number").columns

print(f"Categorical columns:\n{categorical_cols.sort_values()}")
print(f"Numerical columns:\n{numerical_cols.sort_values()}")

In [None]:
# Named groups of column names, one list per kind of handling.
redundant_cols = [
    "loan_title",
    "borrower_address_state",
]

# Flag-like columns.
binary_cols = [
    "loan_payment_plan_flag",
    "listing_initial_status",
    "application_type_label",
    "hardship_flag_indicator",
    "disbursement_method_type",
    "debt_settlement_flag_indicator",
]

# Multi-valued categorical columns.
one_hot_encoding_cols = [
    "borrower_housing_ownership_status",
    "borrower_income_verification_status",
    "loan_status_current_code",
    "loan_purpose_category",
]

# Text columns from which a number is to be extracted.
extract_fields = [
    "loan_contract_term_months",
    "borrower_profile_employment_length",
]

# Date columns.
date_fields = [
    "loan_issue_date",
    "credit_history_earliest_line",
    "last_payment_date",
    "last_credit_pull_date",
]

# Column reserved for an embedding.
embed_column = ["borrower_address_zip"]

In [None]:
# Example: configure, train, evaluate and save a CategoryEmbedding model with
# pytorch_tabular.
# NOTE(review): `num_col_names`, `cat_col_names`, `train`, `val` and `test` are not
# defined in any visible cell, so this cell would raise NameError on a fresh
# kernel unless they are created elsewhere - confirm or remove the cell.
# NOTE(review): TabularModel / DataConfig / OptimizerConfig / TrainerConfig are
# already imported in the first cell; only CategoryEmbeddingModelConfig is new here.
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
)

# Which columns are the label, the continuous features and the categorical features.
# NOTE(review): the label here is "target", while the split cell uses the "grade"
# column as the label - confirm which one is intended.
data_config = DataConfig(
    target=[
        "target"
    ],  # target should always be a list.
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
# Training loop settings.
trainer_config = TrainerConfig(
    auto_lr_find=True,  # Runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=100,
)
# Optimizer settings are left at the library defaults.
optimizer_config = OptimizerConfig()

# Network definition.
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1024-512-512",  # Number of nodes in each layer
    activation="LeakyReLU",  # Activation between each layers
    learning_rate=1e-3,
)

# Assemble the four configs into one model object.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
# Train, score on `test`, predict on `test`, then save and reload from "examples/basic".
tabular_model.fit(train=train, validation=val)
result = tabular_model.evaluate(test)
pred_df = tabular_model.predict(test)
tabular_model.save_model("examples/basic")
loaded_model = TabularModel.load_model("examples/basic")

In [None]:
# Builds the five pytorch_tabular configuration objects for a TabNet classifier:
# data, trainer, optimizer, model and experiment tracking.

# ExperimentConfig is used in step 5 but is not among the names imported in the
# notebook's first cell; without this import the cell stops with a NameError.
from pytorch_tabular.config import ExperimentConfig

# 1. Data Configuration
# NOTE(review): "age", "income", "credit_score", "gender", "zip_code" and
# "occupation" do not appear in any column list elsewhere in this notebook; they
# look like placeholders - replace with the real feature columns before training.
data_config = DataConfig(
    target=["grade"],
    continuous_cols=["age", "income", "credit_score"],
    categorical_cols=["gender", "zip_code", "occupation"],
    normalize_continuous_features=True, # Critical for TabNet
    continuous_feature_transform="quantile_normal", # Gaussian Rank for stability
    handle_unknown_categories=True, # Robustness for inference
    num_workers=4, # Parallel loading
    pin_memory=True # GPU optimization
)

# 2. Trainer Configuration
trainer_config = TrainerConfig(
    batch_size=1024, # Large batch size for Ghost BN
    max_epochs=100, # Allow long training with early stopping
    accelerator="auto", # Auto-detect GPU
    early_stopping_patience=10, # Stop after 10 epochs of no improvement
    checkpoints_save_top_k=1, # Save best model
    load_best=True, # Auto-load best weights
    auto_lr_find=True # Enable LR Finder (optional)
)

# 3. Optimizer Configuration
optimizer_config = OptimizerConfig(
    optimizer="AdamW", # Decoupled weight decay
    optimizer_params={"weight_decay": 0.01},
    lr_scheduler="ReduceLROnPlateau",
    lr_scheduler_params={"patience": 3, "factor": 0.1},
    lr_scheduler_monitor_metric="valid_loss"
)

# 4. TabNet Model Configuration
model_config = TabNetModelConfig(
    task="classification",
    learning_rate=0.02, # Initial estimate
    n_d=16, # Decision dimension
    n_a=16, # Attention dimension
    n_steps=5, # 5 sequential decision steps
    gamma=1.2, # Relaxation parameter for sparsity
    n_independent=2, # Independent GLU blocks
    n_shared=2, # Shared GLU blocks
    virtual_batch_size=128, # Ghost BN size
    metrics=["accuracy", "f1_score"], # Custom metrics
    metrics_prob_input=[False, False] # Both metrics expect class labels, not probs
)

# 5. Experiment Configuration (WandB)
# NOTE(review): no logging target is passed here, so the library default applies;
# confirm that the default really is Weights & Biases, otherwise set it explicitly.
experiment_config = ExperimentConfig(
    project_name="tabnet-churn-prediction",
    run_name="tabnet_experiment_01",
    exp_watch="gradients", # Track gradients for debugging
    log_logits=True # Track output distribution
)

In [None]:
# Train, evaluate and export the TabNet model described by the five config
# objects (data, model, optimizer, trainer, experiment).
# Initialize Orchestrator
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    experiment_config=experiment_config,
)

# Split Data (or use pre-split dataframes)
# NOTE(review): `df` is not defined in any visible cell (the loaded frame is named
# `dataset`, and an X_train/X_test split already exists) - confirm the intended input.
# 80% of the rows are sampled for training; the remaining index labels form the test set.
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

# Train
# Note: WandB run starts automatically here
# NOTE(review): no validation frame is passed; presumably the library carves one
# out of `train_df` internally - verify against the pytorch_tabular docs.
tabular_model.fit(train=train_df)

# Evaluate
result = tabular_model.evaluate(test_df)
print(f"Test Metrics: {result}")

# Predict
pred_df = tabular_model.predict(test_df)
print(pred_df.head())

# Save for Production
tabular_model.save_model("production_tabnet", inference_only=True)

In [4]:
class NumericExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps the first run of digits found in each cell.

    Every column of the input frame is cast to ``str``, searched for its first
    group of consecutive digits and converted to ``float``. Cells that contain no
    digit at all end up as NaN.
    """

    def fit(self, X, y=None):
        """Nothing is learned; the transformer is returned unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which every column holds the extracted number."""
        extracted = X.copy()
        for name in extracted.columns:
            as_text = extracted[name].astype(str)
            first_digits = as_text.str.extract(r"(\d+)")
            extracted[name] = first_digits.astype(float)
        return extracted

class CyclicalDateEncoder(BaseEstimator, TransformerMixin):
    """Replace each date column by its year plus a sine/cosine encoding of the month.

    Values are parsed with the ``%b-%Y`` format (abbreviated month name, a dash, a
    four-digit year). For an input column ``c`` the output holds ``c_year``,
    ``c_month_sin`` and ``c_month_cos``; ``c`` itself is removed.
    """

    def fit(self, X, y=None):
        """Nothing is learned; the transformer is returned unchanged."""
        return self

    def transform(self, X):
        """Return a new frame holding the three derived columns per input column."""
        encoded = X.copy()
        for name in X.columns:
            # Unparseable or missing entries become NaT, so the derived values
            # are NaN for those rows.
            parsed = pd.to_datetime(encoded[name], format="%b-%Y", errors="coerce")
            month_angle = 2 * np.pi * parsed.dt.month / 12

            derived = {
                f"{name}_year": parsed.dt.year,
                f"{name}_month_sin": np.sin(month_angle),
                f"{name}_month_cos": np.cos(month_angle),
            }
            encoded = encoded.assign(**derived).drop(columns=[name])
        return encoded
    
class BinaryModeEncoder(BaseEstimator, TransformerMixin):
    """Encode each column as 0 where the value equals the column's mode, 1 otherwise.

    The mode of every column is learned in ``fit`` and reused in ``transform``.
    Missing values are not treated as "not the mode": they are left as NaN so that
    an imputation step placed after this encoder can fill them. Columns without
    missing values are returned as integers, exactly as before; columns that do
    contain missing values are returned as floats (0.0 / 1.0 / NaN).
    """

    def __init__(self):
        # Kept so that transform() before fit() still returns an untouched copy.
        self.modes_ = {}

    def fit(self, X, y=None):
        """Learn the most frequent value of every column of ``X``.

        ``Series.mode`` ignores missing values by default; a column with no
        non-missing value has an empty mode and the ``[0]`` lookup fails.
        """
        # Rebuilt from scratch on every call, so refitting on a frame with
        # different columns leaves no stale entries behind.
        self.modes_ = {col: X[col].mode()[0] for col in X.columns}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every fitted column replaced by its 0/1 code."""
        X_copy = X.copy()
        for col, mode in self.modes_.items():
            missing = X_copy[col].isna()
            # 1 if NOT the mode (least frequent), 0 if mode
            encoded = (X_copy[col] != mode).astype(int)
            if missing.any():
                # NaN != mode evaluates to True, which would silently turn every
                # missing value into 1; restore NaN for those rows instead.
                encoded = encoded.astype(float).mask(missing)
            X_copy[col] = encoded
        return X_copy
    
class HighMissingDropper(BaseEstimator, TransformerMixin):
    """Remove columns whose share of missing values is above ``threshold`` percent.

    The list of columns to remove is decided once, on the data given to ``fit``,
    and applied unchanged to every frame passed to ``transform``.
    """

    def __init__(self, threshold=20):
        self.threshold = threshold
        self.cols_to_drop_ = []

    def fit(self, X, y=None):
        """Record the columns of ``X`` whose missing percentage exceeds the threshold."""
        pct_missing = X.isna().mean().mul(100)
        too_sparse = pct_missing > self.threshold
        self.cols_to_drop_ = pct_missing.index[too_sparse].tolist()
        return self

    def transform(self, X):
        """Return a copy of ``X`` without the columns recorded during ``fit``."""
        return X.copy().drop(columns=self.cols_to_drop_)

In [None]:
# Named groups of column names, one list per preprocessing route.
redundant_cols = [
    "loan_title",
    "borrower_address_state",
]

# Flag-like columns.
binary_cols = [
    "loan_payment_plan_flag",
    "listing_initial_status",
    "application_type_label",
    "hardship_flag_indicator",
    "disbursement_method_type",
    "debt_settlement_flag_indicator",
]

# Multi-valued categorical columns.
one_hot_encoding_cols = [
    "borrower_housing_ownership_status",
    "borrower_income_verification_status",
    "loan_status_current_code",
    "loan_purpose_category",
]

# Text columns from which a number is to be extracted.
extract_fields = [
    "loan_contract_term_months",
    "borrower_profile_employment_length",
]

# Date columns.
date_fields = [
    "loan_issue_date",
    "credit_history_earliest_line",
    "last_payment_date",
    "last_credit_pull_date",
]

# Embedding columns: the zip code followed by every multi-valued categorical column.
embed_column = ["borrower_address_zip", *one_hot_encoding_cols]

In [None]:
# Builds the preprocessing pipelines, fits them on the training split only and
# applies them to both splits. Two outputs are produced per split: a numeric
# feature matrix and an integer-coded matrix for the embedding columns.
# NOTE(review): LDA is imported but not used in any visible cell.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils.class_weight import compute_class_weight

# Text -> number extraction, median imputation, then standardisation.
numeric_pipe = SkPipeline([
    ('extract', NumericExtractor()),
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# NOTE(review): this pipeline is defined but not referenced by the
# ColumnTransformer below or by any other visible code.
categorical_pipe = SkPipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

# Date -> (year, month sine, month cosine), median imputation, then standardisation.
# The scaler is applied to every column the encoder emits, sine/cosine included.
date_pipe = SkPipeline([
    ('cyclical', CyclicalDateEncoder()),
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()) # scaling is needed for the year, not for sin/cos
])

# 0/1 mode encoding followed by a most-frequent imputer.
binary_pipe = SkPipeline([
    ('binary_enc', BinaryModeEncoder()), 
    ('impute', SimpleImputer(strategy='most_frequent'))
])

# Routes each column group to its pipeline and drops the redundant columns.
# Every column not listed here goes through a median imputer (and is not scaled).
preprocessor = ColumnTransformer([
    ('num_pipe', numeric_pipe, extract_fields),
    ('date_pipe', date_pipe, date_fields),
    ('bin_pipe', binary_pipe, binary_cols),
    ('drop_redundant', 'drop', redundant_cols),
], remainder=SimpleImputer(strategy='median'))

# NOTE(review): the dropper runs before the ColumnTransformer; if it ever removes a
# column that is named in one of the lists above, the column lookup would
# presumably fail - confirm that none of those columns can exceed the threshold.
numerical_pipeline = SkPipeline([
    ('dropper', HighMissingDropper(threshold=20)),
    ('prep', preprocessor),
])

# Embedding columns: fill gaps with the literal 'missing', then map categories to
# integer codes. Categories unseen during fit are encoded as -1.
# NOTE(review): a later cell states these codes feed nn.Embedding; a -1 index is
# not a valid embedding row - confirm how unseen categories are handled downstream.
embed_pipeline = SkPipeline([
    ('dropper', HighMissingDropper(threshold=20)),
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', OrdinalEncoder(dtype=np.int64, handle_unknown='use_encoded_value', unknown_value=-1))
])

# Everything that is not an embedding column goes through the numeric pipeline.
numerical_columns = [c for c in X_train.columns if c not in embed_column]

# Fit on the training split only.
# squeeze() removes length-1 axes: a single embedding column yields a 1-D array,
# several embedding columns leave the result 2-D.
X_numerical_train = numerical_pipeline.fit_transform(X_train[numerical_columns], y_train)
X_zip_train = embed_pipeline.fit_transform(X_train[embed_column]).squeeze()

# Apply the fitted pipelines to the test split (no refitting).
X_numerical_test = numerical_pipeline.transform(X_test[numerical_columns])
X_zip_test = embed_pipeline.transform(X_test[embed_column]).squeeze()

print(X_numerical_train.shape)
print(X_zip_train.shape)
print(y_train.shape)

(118640, 121)
(118640,)
(118640,)


In [7]:
# 1. Turn the preprocessed arrays into tensors, one dtype per kind of input.
# Numerical features -> float32
X_num_train_tensor, X_num_test_tensor = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_numerical_train, X_numerical_test)
)

# Encoded indices -> long (integer dtype required for nn.Embedding)
X_zip_train_tensor, X_zip_test_tensor = (
    torch.tensor(codes, dtype=torch.long)
    for codes in (X_zip_train, X_zip_test)
)

# Targets -> long (standard for classification)
y_train_tensor, y_test_tensor = (
    torch.tensor(labels.values, dtype=torch.long)
    for labels in (y_train, y_test)
)

# 2. Bundle the three tensors per split; each item is (numerical, indices, label).
train_dataset = TensorDataset(X_num_train_tensor, X_zip_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_num_test_tensor, X_zip_test_tensor, y_test_tensor)

# 3. Batch iterators: the training split is shuffled, the test split keeps its order.
batch_size = 1024
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# 4. "Balanced" class weights computed from the training labels, moved to `device`.
y_train_np = y_train_tensor.numpy()
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train_np),
    y=y_train_np,
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)

print(f"Computed Class Weights: {class_weights_tensor}")

Computed Class Weights: tensor([3.4823, 2.8158, 1.7534, 0.9966, 0.5713, 0.5614, 0.7984],
       device='mps:0')
