In [136]:
import pandas as pd
import numpy as np
import os
import pickle
import time
import random
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score


import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Set up the device and random seed, then load the dataset

In [137]:
# Choose the compute device: prefer Apple MPS, then CUDA, otherwise fall back to the CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS device is available.")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA device is available.")
else:
    device = torch.device("cpu")
    print("No GPU acceleration available.")

# Fix the seed to have deterministic behaviour
def fix_random(seed: int) -> None:
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (through ``torch.manual_seed`` and ``torch.cuda.manual_seed``), then asks
    cuDNN for deterministic behaviour with benchmarking turned off.

    Args:
        seed: Integer seed shared by all generators.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # Deterministic cuDNN is slower, but repeated runs give the same results
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed every random number generator before any stochastic step runs
SEED = 1337
fix_random(SEED)

# Lift the pandas display limits so frames are shown with all columns and rows
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Read the comma-separated training data
DATASET_PATH = "dataset_train/dataset.csv"
dataset = pd.read_csv(DATASET_PATH, sep=",")

# Report the size of the data and how many rows are exact duplicates
duplicates = dataset.loc[dataset.duplicated()]
print(f"Shape of the dataset: {dataset.shape}")
print(f"Number of duplicates in the dataset: {len(duplicates)}")

MPS device is available.
Shape of the dataset: (148301, 145)
Number of duplicates in the dataset: 0


## Split the dataset into train and test sets

In [138]:
# Ordinal encoding of the target: "A" -> 6 down to "G" -> 0
GRADE_TO_ORDINAL = {"A": 6, "B": 5, "C": 4, "D": 3, "E": 2, "F": 1, "G": 0}

X = dataset.drop(columns=["grade"])
y = dataset["grade"].map(GRADE_TO_ORDINAL)

# Series.map turns every label that is missing from the mapping into NaN.
# Fail here with a clear message instead of letting NaN targets reach the
# stratified split and the classifier.
unmapped_rows = y.isna()
if unmapped_rows.any():
    unexpected_grades = dataset.loc[unmapped_rows, "grade"].unique().tolist()
    raise ValueError(f"Unexpected or missing values in 'grade': {unexpected_grades}")

# 80/20 hold-out split, stratified on the encoded grade
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [139]:
class NumericExtractor(BaseEstimator, TransformerMixin):
    """Replace every column by the first run of digits found in each value.

    Values are cast to ``str`` before matching; the matched digits are
    returned as floats and values without any digit become NaN.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose columns hold the extracted numbers."""
        extracted = X.copy()
        for name in extracted.columns:
            as_text = extracted[name].astype(str)
            # expand=False yields a Series for the single capture group
            first_digits = as_text.str.extract(r"(\d+)", expand=False)
            extracted[name] = first_digits.astype(float)
        return extracted

class CyclicalDateEncoder(BaseEstimator, TransformerMixin):
    """Turn date columns parsed with ``%b-%Y`` into year and cyclical month features.

    Every input column ``c`` is removed and replaced by ``c_year``,
    ``c_month_sin`` and ``c_month_cos``. Values that cannot be parsed are
    coerced by ``pd.to_datetime(..., errors="coerce")`` instead of raising.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each date column replaced by its three features."""
        encoded = X.copy()
        for name in X.columns:
            # Remove the raw column first; its derived features are appended below
            parsed = pd.to_datetime(encoded.pop(name), format="%b-%Y", errors="coerce")
            month_angle = 2 * np.pi * parsed.dt.month / 12

            encoded[f"{name}_year"] = parsed.dt.year
            encoded[f"{name}_month_sin"] = np.sin(month_angle)
            encoded[f"{name}_month_cos"] = np.cos(month_angle)
        return encoded
    
class BinaryModeEncoder(BaseEstimator, TransformerMixin):
    """Encode each column as 0 where the value equals the column's mode, 1 otherwise.

    The mode of every column is learned in ``fit``. Missing values never
    compare equal to the mode, so they are encoded as 1.
    """

    def __init__(self):
        # Kept for backward compatibility: ``modes_`` exists (empty) before fitting.
        # Maps column name -> most frequent value seen during ``fit``.
        self.modes_ = {}

    def fit(self, X, y=None):
        """Learn the most frequent value of every column of ``X``.

        Raises:
            ValueError: if a column has no non-missing value and therefore no mode.
        """
        # Rebuild the mapping from scratch so that refitting on a different set
        # of columns does not keep stale entries from an earlier fit (which
        # would make ``transform`` look up columns that no longer exist).
        modes = {}
        for col in X.columns:
            candidates = X[col].mode()
            if candidates.empty:
                raise ValueError(
                    f"Column {col!r} has no non-missing values; cannot compute its mode"
                )
            # Ties are resolved by taking the first value returned by ``mode()``
            modes[col] = candidates.iloc[0]
        self.modes_ = modes
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every fitted column replaced by its 0/1 encoding."""
        X_copy = X.copy()
        for col, mode in self.modes_.items():
            # 0 if the value is the mode, 1 for anything else
            X_copy[col] = (X_copy[col] != mode).astype(int)
        return X_copy
    
class HighMissingDropper(BaseEstimator, TransformerMixin):
    """Remove columns whose share of missing values exceeds ``threshold`` percent.

    The columns to remove are chosen in ``fit``; every later ``transform``
    call removes exactly those columns.
    """

    def __init__(self, threshold=20):
        self.threshold = threshold
        self.cols_to_drop_ = []

    def fit(self, X, y=None):
        """Record the columns of ``X`` with more than ``threshold`` percent missing values."""
        missing_pct = X.isna().mean().mul(100)
        too_sparse = missing_pct > self.threshold
        self.cols_to_drop_ = missing_pct.loc[too_sparse].index.tolist()
        return self

    def transform(self, X):
        """Return a copy of ``X`` without the columns recorded during ``fit``."""
        kept = X.copy()
        return kept.drop(columns=list(self.cols_to_drop_))

In [140]:
# Column groups used by the Random Forest preprocessing

# Dropped outright
redundant_cols = ["loan_title"]

# Encoded as 0 (mode) / 1 (anything else)
binary_cols = [
    "loan_payment_plan_flag",
    "listing_initial_status",
    "application_type_label",
    "hardship_flag_indicator",
    "disbursement_method_type",
    "debt_settlement_flag_indicator",
]

# One-hot encoded
one_hot_encoding_cols = [
    "borrower_housing_ownership_status",
    "borrower_income_verification_status",
    "loan_status_current_code",
    "loan_purpose_category",
    "borrower_address_state",
]

# Text fields reduced to the number they contain
extract_fields = [
    "loan_contract_term_months",
    "borrower_profile_employment_length",
    "borrower_address_zip",
]

# Date fields expanded into year and cyclical month features
date_fields = [
    "loan_issue_date",
    "credit_history_earliest_line",
    "last_payment_date",
    "last_credit_pull_date",
]

In [None]:
# Route every column group to its encoder; columns not listed pass through unchanged
column_steps = [
    # Text fields -> first number they contain
    ("extract", NumericExtractor(), extract_fields),
    # Dates -> year plus sine/cosine of the month
    ("date", CyclicalDateEncoder(), date_fields),
    # Two-valued fields -> 0 for the mode, 1 otherwise
    ("binary", BinaryModeEncoder(), binary_cols),
    # Categorical fields -> one-hot columns (no imputation step is applied here)
    (
        "categorical",
        OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"),
        one_hot_encoding_cols,
    ),
    # Redundant columns are removed
    ("drop_redundant", "drop", redundant_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder="passthrough")

# Sparse columns are removed first, then the remaining ones are encoded and fed to the forest
random_forest = RandomForestClassifier(class_weight="balanced", n_jobs=-1, random_state=42)
full_pipeline = Pipeline(
    steps=[
        ("dropper", HighMissingDropper(threshold=20)),
        ("prep", preprocessor),
        ("rf", random_forest),
    ]
)

full_pipeline.fit(X_train, y_train)
y_pred = full_pipeline.predict(X_test)

# Score the hold-out predictions
acc = accuracy_score(y_test, y_pred)
bacc = balanced_accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")

print(f"Accuracy: {acc:.4f}")
print(f"Balanced Accuracy: {bacc:.4f}")
print(f"F1 score: {f1:.4f}")

Accuracy: 0.8652
Balanced Accuracy: 0.8016
F1 score: 0.8627


# ---------------------------------------------------------------


# Grid search over the imputation / scaling choices placed in front of the Random Forest.
#
# The custom transformers take no constructor arguments: the columns they act on
# are selected by the column lists handed to ColumnTransformer.
preprocessor = ColumnTransformer(transformers=[
    ('extract', NumericExtractor(), extract_fields),
    ('date', CyclicalDateEncoder(), date_fields),
    ('binary', BinaryModeEncoder(), binary_cols),
    ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), one_hot_encoding_cols),
    ('drop_cols', 'drop', redundant_cols)
], remainder='passthrough') # This keeps every column not listed above

full_pipeline = Pipeline([
    # Same first step as the baseline Random Forest pipeline: columns with more
    # than 20% missing values are chosen on the training folds only.
    ('dropper', HighMissingDropper(threshold=20)),
    ('prep', preprocessor),
    ('imputer', SimpleImputer()),       # Placeholder, set by grid
    ('scaler', StandardScaler()),       # Placeholder, set by grid
    # ('pca', PCA()),                   # Currently disabled; would need imputed and scaled inputs
    # random_state fixed so the configurations are compared on identical forests
    ('rf', RandomForestClassifier(max_depth=50, class_weight='balanced', n_jobs=-1, random_state=42))
])

param_grid = [
    # Path A: median imputation followed by standard scaling
    {
        'imputer': [SimpleImputer(strategy='median')],
        'scaler': [StandardScaler()],
    },
    # Path B: median imputation only
    {
        'imputer': [SimpleImputer(strategy='median')],
        'scaler': ['passthrough'],      # RF doesn't strictly need scaling
    },
    # Path C: no imputation and no scaling.
    # NOTE(review): missing values then reach the classifier directly; confirm the
    # installed scikit-learn RandomForestClassifier accepts NaN inputs.
    {
        'imputer': ['passthrough'],
        'scaler': ['passthrough'],
    }
]

grid = GridSearchCV(full_pipeline, param_grid, cv=5, scoring='balanced_accuracy')
grid.fit(X_train, y_train)

print(f"Best Setup: {grid.best_params_}")
print(f"Best Score: {grid.best_score_}")

results = pd.DataFrame(grid.cv_results_)
print(results)

# ---------------------------------------------------------------