In [10]:
import pandas as pd
import numpy as np
import os
import pickle
import time
import random
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import copy

from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.model_selection import StratifiedKFold

from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from imblearn.pipeline import Pipeline as ImbPipeline # Replaces sklearn Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

# load the dataset and fix values

In [11]:
# Pick the compute backend: Apple MPS first, then CUDA, otherwise plain CPU.
if torch.backends.mps.is_available():
    _device_name, _device_banner = "mps", "MPS device is available."
elif torch.cuda.is_available():
    _device_name, _device_banner = "cuda", "CUDA device is available."
else:
    _device_name, _device_banner = "cpu", "No GPU acceleration available."

print(_device_banner)
device = torch.device(_device_name)

# Seed every random number generator used in the notebook
def fix_random(seed: int) -> None:
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    Args:
        seed: Value applied to every generator.
    """
    # cuDNN: disable auto-tuning and force deterministic kernels (slower).
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # PyTorch generators.
    torch.cuda.manual_seed(seed)
    torch.manual_seed(seed)

    # NumPy and the standard library.
    np.random.seed(seed)
    random.seed(seed)

SEED = 1337
fix_random(SEED)

# Lift pandas' display limits so frames are shown with all columns and rows.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

DATASET_PATH = "dataset_train/dataset.csv"
dataset = pd.read_csv(DATASET_PATH, sep=",")

# Rows that repeat an earlier row exactly.
duplicates = dataset.loc[dataset.duplicated()]

print(f"Shape of the dataset: {dataset.shape}")
print(f"Number of duplicates in the dataset: {len(duplicates)}")

MPS device is available.
Shape of the dataset: (148301, 145)
Number of duplicates in the dataset: 0


## split the dataset

In [12]:
# Ordinal target codes: "G" -> 0 up to "A" -> 6.
_grade_codes = {grade: code for code, grade in enumerate("GFEDCBA")}

y = dataset["grade"].map(_grade_codes)
X = dataset.drop(columns=["grade"])

# Hold out 20% of the rows, keeping the grade proportions equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [13]:
class NumericExtractor(BaseEstimator, TransformerMixin):
    """Turns every column into floats by keeping the first run of digits in each value.

    Values are cast to strings first; entries containing no digit become NaN.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which each column holds the extracted numbers."""
        extracted = X.copy()
        for name in extracted.columns:
            as_text = extracted[name].astype(str)
            first_digits = as_text.str.extract(r"(\d+)", expand=False)
            extracted[name] = first_digits.astype(float)
        return extracted

class CyclicalDateEncoder(BaseEstimator, TransformerMixin):
    """Replaces each date column with its year and a sine/cosine encoding of its month.

    Dates are parsed with the ``%b-%Y`` format (abbreviated month name, then a
    four-digit year). Values that cannot be parsed become NaT, so the three
    derived features are NaN for those rows; imputing them is left to later
    pipeline steps.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a frame with ``<col>_year``, ``<col>_month_sin`` and ``<col>_month_cos`` per input column."""
        encoded = {}
        for name in X.columns:
            parsed = pd.to_datetime(X[name], format="%b-%Y", errors="coerce")
            # Map the month onto a circle with a period of 12.
            month_angle = 2 * np.pi * parsed.dt.month / 12

            encoded[f"{name}_year"] = parsed.dt.year
            encoded[f"{name}_month_sin"] = np.sin(month_angle)
            encoded[f"{name}_month_cos"] = np.cos(month_angle)

        # The original date columns are not carried over.
        return pd.DataFrame(encoded, index=X.index)
    
class BinaryModeEncoder(BaseEstimator, TransformerMixin):
    """Encodes each column as 0 where the value equals the fitted mode, 1 otherwise.

    The mode of every column is learned in ``fit``. NaN values never compare
    equal to the mode, so they are encoded as 1 as well.
    """

    def __init__(self):
        # Kept so that ``modes_`` exists (empty) before the first call to ``fit``.
        self.modes_ = {}

    def fit(self, X, y=None):
        """Learn the most frequent value of every column in ``X``.

        Raises whatever ``Series.mode()[0]`` raises when a column has no
        non-missing value.
        """
        # Rebuild the mapping from scratch: refitting on a frame with different
        # columns must not leave stale entries that ``transform`` would then
        # look up in a frame that does not contain them.
        self.modes_ = {col: X[col].mode()[0] for col in X.columns}
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose fitted columns hold 0 (mode) or 1 (anything else)."""
        X_copy = X.copy()
        for col, mode in self.modes_.items():
            # 1 if NOT the mode (the less frequent values), 0 if the mode
            X_copy[col] = (X_copy[col] != mode).astype(int)
        return X_copy
    
class HighMissingDropper(BaseEstimator, TransformerMixin):
    """Removes columns whose share of missing values exceeds ``threshold`` percent.

    The columns to remove are chosen in ``fit`` only, so ``transform`` drops
    the same set from any frame it receives.
    """

    def __init__(self, threshold=20):
        self.threshold = threshold
        self.cols_to_drop_ = []

    def fit(self, X, y=None):
        """Record the columns of ``X`` with more than ``threshold`` percent missing values."""
        missing_pct = X.isna().mean().mul(100)
        too_sparse = missing_pct > self.threshold
        self.cols_to_drop_ = missing_pct.loc[too_sparse].index.tolist()
        return self

    def transform(self, X):
        """Return a copy of ``X`` without the columns recorded during ``fit``."""
        return X.copy().drop(columns=self.cols_to_drop_)

In [14]:
# Column groups; each group is routed to its own preprocessing branch.
redundant_cols = [
    "loan_title",
    "borrower_address_state",
]
binary_cols = [
    "loan_payment_plan_flag",
    "listing_initial_status",
    "application_type_label",
    "hardship_flag_indicator",
    "disbursement_method_type",
    "debt_settlement_flag_indicator",
]
one_hot_encoding_cols = [
    "borrower_housing_ownership_status",
    "borrower_income_verification_status",
    "loan_status_current_code",
    "loan_purpose_category",
]
extract_fields = [
    "loan_contract_term_months",
    "borrower_profile_employment_length",
]
date_fields = [
    "loan_issue_date",
    "credit_history_earliest_line",
    "last_payment_date",
    "last_credit_pull_date",
]
embed_column = [
    "borrower_address_zip",
]

In [15]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils.class_weight import compute_class_weight

# Text columns that embed a number: extract it, fill gaps with the median, standardize.
numeric_pipe = SkPipeline(steps=[
    ("extract", NumericExtractor()),
    ("impute", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: missing values become their own "missing" category, then one-hot.
categorical_pipe = SkPipeline(steps=[
    ("impute", SimpleImputer(strategy="constant", fill_value="missing")),
    ("ohe", OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")),
])

# Date columns: year + cyclical month features, median imputation, standardization.
# Scaling matters for the year; the sin/cos features are scaled along with it.
date_pipe = SkPipeline(steps=[
    ("cyclical", CyclicalDateEncoder()),
    ("impute", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Two-valued columns: 0 for the mode, 1 otherwise.
binary_pipe = SkPipeline(steps=[
    ("binary_enc", BinaryModeEncoder()),
    ("impute", SimpleImputer(strategy="most_frequent")),
])

# Columns not listed explicitly are treated as numerical.
remainder_pipe = SkPipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, extract_fields),
        ("cat_pipe", categorical_pipe, one_hot_encoding_cols),
        ("date_pipe", date_pipe, date_fields),
        ("bin_pipe", binary_pipe, binary_cols),
        ("drop_redundant", "drop", redundant_cols),
    ],
    remainder=remainder_pipe,
)

# Full tabular branch: drop sparse columns first, then apply the per-group preprocessing.
numerical_pipeline = SkPipeline(steps=[
    ("dropper", HighMissingDropper(threshold=20)),
    ("prep", preprocessor),
])

# Zip-code branch: integer codes for the embedding layer; unseen values map to -1.
zip_pipeline = SkPipeline(steps=[
    ("impute", SimpleImputer(strategy="constant", fill_value="missing")),
    ("ordinal", OrdinalEncoder(dtype=np.int64, handle_unknown="use_encoded_value", unknown_value=-1)),
])

# Every column except the zip code goes through the tabular pipeline.
numerical_columns = [name for name in X_train.columns if name not in embed_column]

# Tabular branch: fit on the training split, reuse the fitted pipeline on the test split.
X_numerical_train = numerical_pipeline.fit_transform(X_train[numerical_columns], y_train)
X_numerical_test = numerical_pipeline.transform(X_test[numerical_columns])

# Zip-code branch: same fit/transform discipline; squeeze the single column to 1-D.
X_zip_train = zip_pipeline.fit_transform(X_train[embed_column]).squeeze()
X_zip_test = zip_pipeline.transform(X_test[embed_column]).squeeze()

for _train_part in (X_numerical_train, X_zip_train, y_train):
    print(_train_part.shape)

(118640, 121)
(118640,)
(118640,)


In [None]:
def create_dataset(X_num, X_zip, y):
    """Bundle numerical features, zip-code indices and targets into a TensorDataset.

    Args:
        X_num: Array-like of numerical features, one row per sample.
        X_zip: Array-like of integer zip-code codes, one per sample. Any shape
            is accepted and flattened to 1-D, so a ``(N, 1)`` column or the
            0-d result of squeezing a single row both work.
        y: Array-like of integer class labels (pandas Series, NumPy array or list).

    Returns:
        ``TensorDataset(x_num, x_zip, y)`` with dtypes float32, int64 and int64.
    """
    # Numerical features: Float32
    x_num_t = torch.tensor(np.asarray(X_num), dtype=torch.float32)

    # Zip indices: Long (Int64), always one index per sample.
    # Shift by +1 so -1 (unknown) becomes 0.
    zip_codes = np.asarray(X_zip).reshape(-1)
    x_zip_t = torch.tensor(zip_codes, dtype=torch.long) + 1

    # Targets: Long for loss calculation. np.asarray drops any pandas index.
    y_t = torch.tensor(np.asarray(y), dtype=torch.long)

    return TensorDataset(x_num_t, x_zip_t, y_t)


# Mini-batch size used for both the training and validation DataLoaders.
BATCH_SIZE = 1024
# Number of training epochs run for every fold (there is no early stopping).
EPOCHS = 200
# Initial learning rate passed to the AdamW optimizer.
LEARNING_RATE = 1e-3

def run_cv_grid_search(model_class, param_grid, X_num, X_zip, y, n_splits=3):
    """Grid-search model hyperparameters with stratified k-fold cross-validation.

    For every parameter combination a fresh model is trained for ``EPOCHS``
    epochs on each fold using class-weighted cross-entropy. A fold's score is
    the highest validation balanced accuracy reached at any epoch; a
    combination's score is the mean of those per-fold peaks.

    Args:
        model_class: Callable building the model as ``model_class(**params)``.
            The model is invoked as ``model(x_num, x_zip)`` and must return logits.
        param_grid: Grid definition accepted by ``ParameterGrid``.
        X_num: Array of numerical features, indexable with integer index arrays.
        X_zip: Array of zip-code codes aligned row-wise with ``X_num``.
        y: Targets as a pandas Series (rows are selected with ``.iloc``).
        n_splits: Number of stratified folds.

    Returns:
        The parameter dict with the highest mean peak balanced accuracy.
    """
    stratified_k_fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    parameter_grid = list(ParameterGrid(param_grid))
    results = []
    print(f"Starting Grid Search | {len(parameter_grid)} Combos | {n_splits}-Fold CV")

    for i, params in enumerate(parameter_grid):
        print(f"Fitting Combo {i+1}/{len(parameter_grid)}: {params}")

        # Peak validation balanced accuracy of each fold for this combination.
        fold_best_scores = []

        for fold, (train_idx, val_idx) in enumerate(stratified_k_fold.split(X_num, y)):
            # --- Data setup ---
            X_num_train, X_zip_train = X_num[train_idx], X_zip[train_idx]
            y_train_fold = y.iloc[train_idx]
            X_num_val, X_zip_val = X_num[val_idx], X_zip[val_idx]
            y_val_fold = y.iloc[val_idx]

            # Class weights are derived from the training part of the fold only.
            fold_weights = compute_class_weight('balanced', classes=np.unique(y_train_fold), y=y_train_fold)
            fold_weights_tensor = torch.tensor(fold_weights, dtype=torch.float32).to(device)

            train_ds = create_dataset(X_num_train, X_zip_train, y_train_fold)
            val_ds = create_dataset(X_num_val, X_zip_val, y_val_fold)
            train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
            val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

            # --- Model setup ---
            model = model_class(**params).to(device)
            optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
            criterion = nn.CrossEntropyLoss(weight=fold_weights_tensor)

            # The scheduler watches the validation loss, hence mode='min'.
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='min', factor=0.1, patience=10, min_lr=1e-6
            )

            # Best validation balanced accuracy seen so far in this fold.
            best_fold_score = -float('inf')

            for epoch in range(EPOCHS):
                # --- Training pass ---
                model.train()
                for b_cont, b_zip, b_y in train_loader:
                    b_cont, b_zip, b_y = b_cont.to(device), b_zip.to(device), b_y.to(device)
                    optimizer.zero_grad()
                    loss = criterion(model(b_cont, b_zip), b_y)
                    loss.backward()
                    optimizer.step()

                # --- Validation pass ---
                model.eval()
                val_loss = 0.0
                all_preds = []
                all_targets = []
                with torch.no_grad():
                    for b_cont, b_zip, b_y in val_loader:
                        b_cont, b_zip, b_y = b_cont.to(device), b_zip.to(device), b_y.to(device)
                        logits = model(b_cont, b_zip)
                        val_loss += criterion(logits, b_y).item()

                        preds = torch.argmax(logits, dim=1)
                        all_preds.extend(preds.cpu().tolist())
                        all_targets.extend(b_y.cpu().tolist())

                # Mean of the per-batch validation losses.
                avg_val_loss = val_loss / len(val_loader)

                # Selection metric for this epoch.
                b_acc = balanced_accuracy_score(all_targets, all_preds)

                # Lower the learning rate when the validation loss stops improving.
                scheduler.step(avg_val_loss)

                # Keep the peak balanced accuracy of the fold.
                if b_acc > best_fold_score:
                    best_fold_score = b_acc

            # End of fold: record the BEST score achieved, not the last one.
            fold_best_scores.append(best_fold_score)
            print(f" Fold {fold+1} Completed. Best Bal-Acc: {best_fold_score:.4f}")

        # Average the peak performance across the folds.
        mean_best_score = np.mean(fold_best_scores)

        # Store a copy of the parameters together with the (non-negated) score.
        results.append({
            'params': dict(params),
            'score': mean_best_score
        })

        print(f"Combo Result: Mean Best Balanced Acc: {mean_best_score:.4f}\n")

    # --- Select best hyperparameters ---
    # Higher balanced accuracy is better; on ties the earliest combination wins.
    best_result = max(results, key=lambda result: result['score'])

    print(f"\nBest Hyperparams found: {best_result['params']}")
    print(f"With Mean Balanced Accuracy: {best_result['score']:.4f}")

    return best_result['params']

In [17]:
class FeedForwardModel(nn.Module):
    """MLP classifier over continuous features concatenated with a zip-code embedding.

    Args:
        cont_dim: Number of continuous input features.
        hidden_dims: Sizes of the hidden layers, in order. May be empty, in
            which case the head is applied directly to the concatenated input.
        output_dim: Number of output classes (logits).
        num_zip_codes: Number of rows in the embedding table. Every zip index
            passed to ``forward`` must be smaller than this value. The default
            of 883 was chosen for 882 distinct ``borrower_address_zip`` values
            plus one extra slot.
        zip_embed_dim: Width of the zip-code embedding.
        dropout: Dropout probability applied after every hidden layer.
    """

    def __init__(self, cont_dim, hidden_dims=(128, 64, 32), output_dim=7,
                 num_zip_codes=883, zip_embed_dim=64, dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(num_zip_codes, zip_embed_dim)

        self.input_dim = cont_dim + zip_embed_dim

        layers = []
        in_dim = self.input_dim

        # Each hidden block: Linear -> BatchNorm -> LeakyReLU -> Dropout
        for h_dim in hidden_dims:
            layers.append(nn.Linear(in_dim, h_dim))
            layers.append(nn.BatchNorm1d(h_dim))
            layers.append(nn.LeakyReLU())
            layers.append(nn.Dropout(dropout))
            in_dim = h_dim

        self.mlp = nn.Sequential(*layers)

        # Projects the last layer's width (or the raw input width when there
        # are no hidden layers) to the number of classes.
        self.head = nn.Linear(in_dim, output_dim)

    def forward(self, X_cont, X_zip):
        """Return class logits of shape ``(batch, output_dim)``.

        Args:
            X_cont: Float tensor of continuous features, ``(batch, cont_dim)``.
            X_zip: Integer tensor of zip-code indices, one per sample.
        """
        # Embed zip codes -> (batch, zip_embed_dim)
        zip_embedded = self.emb(X_zip)

        # Concatenate continuous features + embeddings -> (batch, cont_dim + zip_embed_dim)
        x = torch.cat([X_cont, zip_embedded], dim=1)

        # Hidden layers
        x = self.mlp(x)

        # Final classification
        return self.head(x)

In [18]:
# The width of the numerical input is only known after preprocessing,
# so it is read from the transformed training matrix and passed to the network.
input_cat_dim = X_zip_train.shape
input_cont_dim = X_numerical_train.shape[1]

param_grid = {
    'cont_dim': [input_cont_dim],
    'hidden_dims': [
        [128, 64, 32],       # baseline
        [256, 128, 64, 32],  # deeper and wider
        [64, 32],            # shallower, to limit overfitting
    ],
}

best_params = run_cv_grid_search(
    model_class=FeedForwardModel,
    param_grid=param_grid,
    X_num=X_numerical_train,
    X_zip=X_zip_train,
    y=y_train,
    n_splits=3,
)

# Build the final model with the winning hyperparameters.
# This cell only instantiates it; no training happens here.
print("Retraining final model on full dataset...")
final_model = FeedForwardModel(
    cont_dim=best_params['cont_dim'],
    hidden_dims=best_params['hidden_dims'],
)
final_model = final_model.to(device)

Starting Grid Search | 3 Combos | 3-Fold CV
Fitting Combo 1/3: {'cont_dim': 121, 'hidden_dims': [128, 64, 32]}
 Fold 1 Completed. Best Bal-Acc: 0.8557
 Fold 2 Completed. Best Bal-Acc: 0.8586
 Fold 3 Completed. Best Bal-Acc: 0.8621
Combo Result: Mean Best Balanced Acc: 0.8588

Fitting Combo 2/3: {'cont_dim': 121, 'hidden_dims': [256, 128, 64, 32]}
 Fold 1 Completed. Best Bal-Acc: 0.8594
 Fold 2 Completed. Best Bal-Acc: 0.8593
 Fold 3 Completed. Best Bal-Acc: 0.8638
Combo Result: Mean Best Balanced Acc: 0.8608

Fitting Combo 3/3: {'cont_dim': 121, 'hidden_dims': [64, 32]}
 Fold 1 Completed. Best Bal-Acc: 0.8475
 Fold 2 Completed. Best Bal-Acc: 0.8449
 Fold 3 Completed. Best Bal-Acc: 0.8492
Combo Result: Mean Best Balanced Acc: 0.8472


Best Hyperparams found: {'cont_dim': 121, 'hidden_dims': [256, 128, 64, 32]}
With Mean Balanced Accuracy: 0.8608
Retraining final model on full dataset...
