In [3]:
import pandas as pd
import numpy as np
import os
import pickle
import time
import random
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [4]:
# Prefer Apple MPS, then CUDA, and fall back to the CPU
def _select_device() -> torch.device:
    """Return the first available backend (MPS, CUDA, CPU) and print which one was chosen."""
    if torch.backends.mps.is_available():
        print("MPS device is available.")
        return torch.device("mps")
    if torch.cuda.is_available():
        print("CUDA device is available.")
        return torch.device("cuda")
    print("No GPU acceleration available.")
    return torch.device("cpu")


device = _select_device()

# Fix the seed to have deterministic behaviour
def fix_random(seed: int) -> None:
    """Seed the Python, NumPy and PyTorch random generators and make cuDNN deterministic.

    Args:
        seed: Value passed unchanged to every seeding function.
    """
    # cuDNN: disable auto-tuning and request deterministic kernels (slower)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Same seed for every generator the notebook relies on
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

# Seed every random generator before anything stochastic runs
SEED = 1337
fix_random(SEED)

# Remove the display limits so printed frames/series are never truncated
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Load the training data
DATASET_PATH = "dataset_train/dataset.csv"
dataset = pd.read_csv(DATASET_PATH, sep=",")

# Report the size of the data and how many rows are exact duplicates of an earlier row
duplicates = dataset.loc[dataset.duplicated()]
print(f"Shape of the dataset: {dataset.shape}")
print(f"Number of duplicates in the dataset: {len(duplicates)}")

MPS device is available.
Shape of the dataset: (148301, 145)
Number of duplicates in the dataset: 0


## Handling NaNs
We drop every column in which more than 20% of the values are NaN. This reduces the dataset from 145 to 89 columns.

In [5]:
# Columns whose percentage of missing values exceeds this threshold are dropped.
# TODO: try 50 instead of 20 and compare the results.
COLUMNS_TO_DROP_PERCENT = 20

# Percentage of missing values per column, filtered to the columns above the threshold
missing_percentages = dataset.isna().mean() * 100
cols_to_drop = missing_percentages[missing_percentages > COLUMNS_TO_DROP_PERCENT]
print(cols_to_drop.sort_values(ascending=False))

# With the 20% threshold we go from 145 to 89 features
print(f"Shape before dropping columns: {dataset.shape}")
# Rebind instead of mutating in place so the drop is an explicit, traceable step
dataset = dataset.drop(columns=cols_to_drop.index)
# Drop also loan_title since it's redundant with loan_purpose_category.
# errors="ignore" keeps the cell from raising KeyError when loan_title is already gone
# (cell re-run, or a threshold under which the NaN filter already removed it).
dataset = dataset.drop(columns="loan_title", errors="ignore")
print(f"Shape after dropping columns: {dataset.shape}")

original_projected_additional_accrued_interest       99.564399
hardship_loan_status_label                           99.432910
hardship_type_label                                  99.431561
hardship_end_date                                    99.431561
hardship_duration_days                               99.430887
hardship_last_payment_amount_total                   99.430887
hardship_days_past_due                               99.430887
hardship_reason_label                                99.430213
hardship_start_date                                  99.429539
hardship_payment_plan_start_date                     99.429539
hardship_deferral_term_months                        99.428864
hardship_status_label                                99.428190
hardship_amount_total                                99.428190
hardship_payoff_balance                              99.427516
secondary_applicant_months_since_last_major_derog    98.374927
settlement_term_months                               98

In [None]:
# Names of the columns selected by the missing-value filter (the index of cols_to_drop)
# NOTE(review): the saved output under this cell lists percentages rather than an Index,
# so it looks like it came from an earlier version of the cell — re-run to refresh it.
print(cols_to_drop.index)

months_since_last_delinquency                        51.260612
months_since_last_public_record                      84.002805
next_payment_date                                    60.983405
months_since_last_major_derog                        74.106041
joint_income_annual                                  94.647373
joint_dti_ratio                                      94.652767
joint_income_verification_status                     94.862476
open_accounts_6m                                     39.858126
open_active_installment_loans                        39.842617
open_installment_loans_12m                           39.877681
open_installment_loans_24m                           39.862847
months_since_recent_installment_loan                 41.669308
total_balance_installment_loans                      39.892516
installment_utilization                              48.279513
open_revolving_accounts_12m                          39.860824
open_revolving_accounts_24m                          39

## Categorical data conversion and dataset splitting

In [7]:
print("Before converting categorical data")

# Split the column names by dtype: numeric columns vs. object/category columns
numerical_cols = dataset.select_dtypes(include="number").columns
categorical_cols = dataset.select_dtypes(include=["object", "category"]).columns

print(f"Categorical columns:\n{categorical_cols}")

Before converting categorical data
Categorical columns:
Index(['loan_contract_term_months', 'borrower_profile_employment_length',
       'borrower_housing_ownership_status',
       'borrower_income_verification_status', 'loan_issue_date',
       'loan_status_current_code', 'loan_payment_plan_flag',
       'loan_purpose_category', 'borrower_address_zip',
       'borrower_address_state', 'credit_history_earliest_line',
       'listing_initial_status', 'last_payment_date', 'last_credit_pull_date',
       'application_type_label', 'hardship_flag_indicator',
       'disbursement_method_type', 'debt_settlement_flag_indicator', 'grade'],
      dtype='object')


### Stateless categorical data conversion

In [8]:
# Stateless transformations (can be done on train and test set independently)

# Ordinal encoding of the target: 'A' is the highest value (6), 'G' the lowest (0)
GRADE_TO_ORDINAL = {'A': 6, 'B': 5, 'C': 4, 'D': 3, 'E': 2, 'F': 1, 'G': 0}

# Map only while the column still holds letter grades: mapping an already-encoded
# (numeric) column would turn every value into NaN if this cell is executed twice.
if not pd.api.types.is_numeric_dtype(dataset["grade"]):
    dataset["grade"] = dataset["grade"].map(GRADE_TO_ORDINAL)

def extract_int_value_from_column(column_name, df=None):
    """Replace a text column, in place, with the first integer found in each value.

    Only the first run of digits is kept; everything else in the string is discarded.
    Values without any digit (or missing values) become <NA>; the result has the
    nullable "Int64" dtype.

    Args:
        column_name: Name of the string column to convert.
        df: DataFrame to modify in place. Defaults to the notebook-level ``dataset``,
            so existing one-argument calls behave as before; pass another frame
            (e.g. a test set) to apply the same stateless transformation to it.

    Returns:
        None. The frame is modified in place.
    """
    if df is None:
        df = dataset
    # expand=False yields a Series (one capture group) instead of a one-column DataFrame
    df[column_name] = df[column_name].str.extract(r"(\d+)", expand=False).astype("Int64")

# Keep only the integer part of the loan term and employment length values
for text_column in ("loan_contract_term_months", "borrower_profile_employment_length"):
    extract_int_value_from_column(text_column)

# DROP FOR TREES, USE FOR DEEP NETWORKS
# (alternative for deep networks: extract_int_value_from_column("borrower_address_zip"))
dataset.drop(columns=["borrower_address_zip"], inplace=True)

def convert_mm_yyyy_to_year_sine_cosine(column_name, df=None):
    """Replace a "Mon-YYYY" date column, in place, with year and cyclical month features.

    The column is parsed with the format ``%b-%Y`` (e.g. "Dec-1999") and replaced by
    three new columns appended to the frame:

    * ``<column_name>_year``      - the year
    * ``<column_name>_month_sin`` - sin(2*pi*month/12)
    * ``<column_name>_month_cos`` - cos(2*pi*month/12)

    The sine/cosine pair places the months on a circle, so that December and January
    are close to each other. The original column is dropped.

    Args:
        column_name: Name of the date column to convert.
        df: DataFrame to modify in place. Defaults to the notebook-level ``dataset``,
            so existing one-argument calls behave as before; pass another frame
            (e.g. a test set) to apply the same stateless transformation to it.

    Returns:
        None. The frame is modified in place.
    """
    if df is None:
        df = dataset
    # Parse into a local Series instead of temporarily overwriting the source column
    parsed = pd.to_datetime(df[column_name], format='%b-%Y')
    month_angle = 2 * np.pi * parsed.dt.month / 12
    df[f"{column_name}_year"] = parsed.dt.year
    df[f"{column_name}_month_sin"] = np.sin(month_angle)
    df[f"{column_name}_month_cos"] = np.cos(month_angle)
    # In-place drop on purpose: the caller's frame itself must lose the original column
    df.drop(columns=[column_name], inplace=True)

# Encode every date column as a year plus cyclical month features
for date_column in (
    "loan_issue_date",
    "credit_history_earliest_line",
    "last_payment_date",
    "last_credit_pull_date",
):
    convert_mm_yyyy_to_year_sine_cosine(date_column)

### Splitting the dataset

In [9]:
# Separate the target (grade) from the features
X = dataset.drop(columns=["grade"])
y = dataset["grade"]

# 80/20 split, stratified on the target so both sets keep the same grade distribution.
# random_state pins the split: without it the result depends on how much of the global
# NumPy RNG has been consumed, i.e. on which cells ran before this one and how often.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

print("Training set")
print(X_train.shape)
print(y_train.shape)

print("Validation set")
print(X_val.shape)
print(y_val.shape)

Training set
(118640, 94)
(118640,)
Validation set
(29661, 94)
(29661,)


### Stateful categorical data conversion

In [10]:
# Stateful transformations: statistics are computed on the training set and applied to both sets

# Binary categorical columns become 0/1 flags. The most frequent training value maps to 0 and
# every other value to 1, so the least frequent value (which carries more signal) is the one flagged.
binary_categorical_cols = [
    "loan_payment_plan_flag",
    "listing_initial_status",
    "application_type_label",
    "hardship_flag_indicator",
    "disbursement_method_type",
    "debt_settlement_flag_indicator",
]

# FIT: the mode of each column is taken from X_train only
binary_categorical_cols_feature_map = {
    col: X_train[col].mode()[0] for col in binary_categorical_cols
}

# TRANSFORM: both X_train and X_val are compared against the training mode
for col, col_mode in binary_categorical_cols_feature_map.items():
    X_train[col] = X_train[col].ne(col_mode).astype("Int64")
    X_val[col] = X_val[col].ne(col_mode).astype("Int64")


# Multi-valued categorical columns are one-hot encoded with an encoder fitted on X_train
one_hot_encoding_cols = [
    "borrower_housing_ownership_status",
    "borrower_income_verification_status",
    "loan_status_current_code",
    "loan_purpose_category",
    "borrower_address_state",
]

encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoder.fit(X_train[one_hot_encoding_cols])
one_hot_encoding_feature_names = encoder.get_feature_names_out(one_hot_encoding_cols)


def _one_hot_frame(frame):
    """Encode the one-hot columns of `frame` with the fitted encoder, keeping the frame's index."""
    return pd.DataFrame(
        encoder.transform(frame[one_hot_encoding_cols]),
        columns=one_hot_encoding_feature_names,
        index=frame.index,
    )


# Encode both sets, then swap the original categorical columns for their encoded counterparts
X_train_encoded = _one_hot_frame(X_train)
X_val_encoded = _one_hot_frame(X_val)

X_train = pd.concat([X_train.drop(columns=one_hot_encoding_cols), X_train_encoded], axis=1)
X_val = pd.concat([X_val.drop(columns=one_hot_encoding_cols), X_val_encoded], axis=1)

In [11]:
# Sanity check after encoding: report each set's shape and any remaining object/category columns
print("After converting categorical data")
print(f"Train Shape: {X_train.shape}")
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
print(f"Categorical columns:\n{categorical_cols}")
print(f"Test Shape:  {X_val.shape}")
# Inspect X_val here (the original re-checked X_train, so leftover categorical
# columns in the validation set would have gone unnoticed)
categorical_cols = X_val.select_dtypes(include=['object', 'category']).columns
print(f"Categorical columns:\n{categorical_cols}")

After converting categorical data
Train Shape: (118640, 172)
Categorical columns:
Index([], dtype='object')
Test Shape:  (29661, 172)
Categorical columns:
Index([], dtype='object')


In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Baseline model: median imputation -> PCA (n_components=0.95) -> class-balanced random forest
# NOTE(review): PCA runs on unscaled features, so columns with large magnitudes dominate the
# components — consider adding a StandardScaler step before 'pca' and comparing the scores.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('pca', PCA(n_components=0.95)),
    # random_state makes the forest reproducible regardless of the global RNG state
    ('rf', RandomForestClassifier(max_depth=50, class_weight='balanced', n_jobs=-1, random_state=SEED))
])

# Fit on the training set only, then evaluate on the held-out validation set
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_val)

acc = accuracy_score(y_val, y_pred)
print(f"Accuracy: {acc}")
# Balanced accuracy is reported alongside plain accuracy
bacc = balanced_accuracy_score(y_val, y_pred)
print(f"Balanced accuracy: {bacc}")
f1 = f1_score(y_val, y_pred, average="weighted")
print(f"F1 score: {f1}")

Accuracy: 0.8591753481001989
Balanced accuracy: 0.7954629216584609
F1 score: 0.8566987148664283
