Private Score: 0.96775.

Public Score: 0.97229.

This notebook builds a simple CatBoostClassifier model. It achieves a CV score of about 0.968 on the full train dataset. The main references are listed below.

Data description:
1. Descriptions of Loan Data by vishwa, https://www.kaggle.com/competitions/playground-series-s4e10/discussion/536984 .

Feature Engineering:
1. Loan Approval | EDA + Catboost + Optuna by Igor Volianiuk, https://www.kaggle.com/code/igorvolianiuk/loan-approval-eda-catboost-optuna .
2. Features That Increase CV score by Vladimir Mijatovic, https://www.kaggle.com/competitions/playground-series-s4e10/discussion/539889 .

Model:
1. [AutoML Grand Prix] 1st Place Solution by Vopani, https://www.kaggle.com/code/rohanrao/automl-grand-prix-1st-place-solution .

Hyperparameters:
1. EDA and LGB+CAT+XGB by Martynov Andrey, https://www.kaggle.com/code/martynovandrey/eda-and-lgb-cat-xgb#%F0%9F%93%8C-CATBoost .

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from numpy import percentile
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.base import clone
from sklearn.metrics import roc_auc_score
import gc
import warnings
warnings.filterwarnings("ignore")

from xgboost import XGBClassifier, XGBRegressor
import xgboost as xgb
from lightgbm import LGBMClassifier, LGBMRegressor
import lightgbm as lgb
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

import optuna
from optuna.visualization import plot_param_importances
from optuna.samplers import RandomSampler, TPESampler, CmaEsSampler
from optuna.pruners import HyperbandPruner
from functools import partial
import time

In [None]:
# Notebook-wide settings: the seed used for the CV split and the number of CV folds.
random_seed, n_fold = 42, 5

# 1. Load Data.

In [None]:
# Read the three CSV inputs in a fixed order: original data, competition train, competition test.
original, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/loan-approval-prediction/credit_risk_dataset.csv',
        '/kaggle/input/playground-series-s4e10/train.csv',
        '/kaggle/input/playground-series-s4e10/test.csv',
    )
)

There are 32581 samples in the original dataset. 11 features and 1 target variable 'loan_status'. The 8 numerical columns include 'person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length' and target variable 'loan_status'. The 4 categorical columns include 'person_home_ownership', 'loan_intent', 'loan_grade' and 'cb_person_default_on_file'. Only 2 columns 'person_emp_length' and 'loan_int_rate' have null values.

There are 58645 entries in the train dataset and 39098 entries in the test dataset. Neither the train nor the test dataset has missing values.

In [None]:
# The 'id' column is removed from both competition frames before any analysis or modelling.
train = train.drop(columns='id')
test = test.drop(columns='id')

# 2. EDA

Loan approval rates across all age ranges are close. 

In [None]:
# Bucket applicants by age decade; every age outside [20, 50) ends up in '50+'.
original['age_range'] = np.select(
    [
        original['person_age'].between(20, 30, inclusive='left'),
        original['person_age'].between(30, 40, inclusive='left'),
        original['person_age'].between(40, 50, inclusive='left'),
    ],
    ['20s', '30s', '40s'],
    default='50+',
)
# Mean of loan_status per age bucket, shown as a one-column frame.
original.groupby('age_range')['loan_status'].mean().rename('original approval_rate').to_frame()

In [None]:
# Bucket applicants by age decade; every age outside [20, 50) ends up in '50+'.
train['age_range'] = np.select(
    [
        train['person_age'].between(20, 30, inclusive='left'),
        train['person_age'].between(30, 40, inclusive='left'),
        train['person_age'].between(40, 50, inclusive='left'),
    ],
    ['20s', '30s', '40s'],
    default='50+',
)
# Mean of loan_status per age bucket, shown as a one-column frame.
train.groupby('age_range')['loan_status'].mean().rename('train approval_rate').to_frame()

Loan amounts of 20k or more have a higher average approval rate.

In [None]:
# Split loans at 20,000: amounts in [0, 20000) get '20k', everything else gets '20k+'.
original['loan_amnt_range'] = np.where(
    (original['loan_amnt'] >= 0) & (original['loan_amnt'] < 20000),
    '20k',
    '20k+',
)
# Mean of loan_status per loan-amount bucket, shown as a one-column frame.
original.groupby('loan_amnt_range')['loan_status'].mean().rename('original approval_rate').to_frame()

In [None]:
# Split loans at 20,000: amounts in [0, 20000) get '20k', everything else gets '20k+'.
train['loan_amnt_range'] = np.where(
    (train['loan_amnt'] >= 0) & (train['loan_amnt'] < 20000),
    '20k',
    '20k+',
)
# Mean of loan_status per loan-amount bucket, shown as a one-column frame.
train.groupby('loan_amnt_range')['loan_status'].mean().rename('train approval_rate').to_frame()

The target variable 'loan_status' is imbalanced in both the original and train datasets.

In [None]:
def plot_pie_chart(data, title, ax):
    """Draw the distribution of ``data['loan_status']`` as a pie chart on ``ax``.

    Parameters
    ----------
    data : DataFrame-like with a 'loan_status' column.
    title : str, title placed above the pie.
    ax : axes object providing ``pie``, ``axis`` and ``set_title``.

    Returns
    -------
    None. The chart is drawn on ``ax`` as a side effect.
    """
    status_counts = data['loan_status'].value_counts()

    # One wedge per distinct status value, annotated with its percentage share.
    ax.pie(status_counts.values, labels=status_counts.index, autopct='%1.1f%%')
    ax.axis('equal')
    ax.set_title(title)

# Side-by-side pies: original dataset on the left, competition train set on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
plot_pie_chart(original, "original dataset loan_status distribution", ax[0])
plot_pie_chart(train, "Train dataset loan_status distribution", ax[1])
fig.tight_layout()
plt.show()

# 3. Feature Engineering.

In [None]:
# Remove the helper columns that were added only for the EDA tables above.
original = original.drop(columns=['age_range', 'loan_amnt_range'])
train = train.drop(columns=['age_range', 'loan_amnt_range'])

The first part of feature engineering is to add features that increase the ROC AUC score.

In [None]:
def feature_engineering(df):
    """Add the engineered loan features to ``df`` and return the same frame.

    The frame is modified in place. New columns, in insertion order:
    ``loan_to_income_error``, ``monthly_income`` and
    ``loan_to_emp_length_ratio``. ``loan_percent_income`` is rescaled by 100
    and ``person_income`` is replaced by its natural logarithm.

    Missing ``person_emp_length`` values are filled with the mean taken from
    the module-level ``original`` frame, so that frame must exist when this
    function is called.

    Earlier experiments (interest-rate rescaling, age x income interaction,
    monthly debt, debt-to-income ratio and a 20k+ loan flag) are disabled and
    not computed here.
    """
    raw_income = df['person_income']

    # Gap between the recomputed loan/income ratio and the reported one, in percent.
    ratio_gap = (df['loan_amnt'] / raw_income) - df['loan_percent_income']
    df['loan_to_income_error'] = ratio_gap * 100

    # Rescaled only after the gap above has been computed from the raw ratio.
    df['loan_percent_income'] = df['loan_percent_income'] * 100

    # Log of the monthly income (yearly income / 12).
    df['monthly_income'] = np.log(raw_income / 12)

    emp_length = df['person_emp_length'].fillna(original['person_emp_length'].mean())
    df['loan_to_emp_length_ratio'] = df['loan_amnt'] / emp_length

    # Replaced last because the features above need the untransformed income.
    df['person_income'] = np.log(raw_income)

    return df

In [None]:
# Apply the same feature engineering to all three frames, in the order original, train, test.
original, train, test = map(feature_engineering, (original, train, test))

The second part of feature engineering is to convert the numerical features to strings and to turn the categorical features into ordinal numbers using OrdinalEncoder. The OrdinalEncoder is fitted on the original dataset first and then applied to train and test, to ensure consistent labelling across the three datasets.

In [None]:
# Partition the test columns by dtype: object columns are categorical, the rest numerical.
cat_cols = [name for name, col_dtype in test.dtypes.items() if col_dtype == object]
num_cols = [name for name, col_dtype in test.dtypes.items() if col_dtype != object]

In [None]:
# Cast every numerical feature to its string form in all three frames.
original[num_cols] = original[num_cols].astype('str')
train[num_cols] = train[num_cols].astype('str')
test[num_cols] = test[num_cols].astype('str')

In [None]:
for col in cat_cols:
    # One encoder per column, fitted on the original dataset only so that the
    # integer codes are shared by all three frames.
    enc = preprocessing.OrdinalEncoder(dtype=int)
    enc.fit(original[[col]])

    for frame in (original, train, test):
        frame[[col]] = enc.transform(frame[[col]])
        frame[[col]] = frame[[col]].astype('category')

# 4. Build Model and Hyperparameters.

In [None]:
# Separate the target column 'loan_status' from the features of each labelled frame.
y_original = original['loan_status']
X_original = original.drop(columns=['loan_status'])

y_train = train['loan_status']
X_train = train.drop(columns=['loan_status'])

# The test frame has no target column, so it is used as the feature matrix as is.
X_test = test

I tried to fine-tune the hyperparameters with Optuna, but it did not work well, so I simply borrowed parameters from public notebooks.

In [None]:
# Parameters borrowed from
# https://www.kaggle.com/code/martynovandrey/eda-and-lgb-cat-xgb#%F0%9F%93%8C-CATBoost
catb_params = {
    'task_type'            : "CPU",
    'loss_function'        : 'Logloss',
    'eval_metric'          : "AUC",
    'bagging_temperature'  : 0.25,
    'colsample_bylevel'    : 0.40,
    'iterations'           : 5_000,
    'learning_rate'        : 0.045,
    'max_depth'            : 7,
    'l2_leaf_reg'          : 0.80,
    'min_data_in_leaf'     : 30,
    'random_strength'      : 0.25,
    # Use the notebook-wide seed (defined in the configuration cell) instead of a
    # second hard-coded 42, so the model and the CV split are seeded from one place.
    'random_state'         : random_seed,
    'early_stopping_rounds': 200,
    'use_best_model'       : True,
}

# 5. Cross Validation and Prediction.

In [None]:
def validation(model):
    """Cross-validate ``model`` on the train set and predict the test set per fold.

    Uses the module-level ``X_train``, ``y_train``, ``X_original``,
    ``y_original``, ``X_test``, ``n_fold`` and ``random_seed``. In every fold
    the training part of the split is extended with the full original dataset;
    the validation part comes from the train set only.

    Parameters
    ----------
    model : unfitted estimator. It is cloned for every fold, so the passed
        instance itself is never fitted. XGBClassifier, LGBMClassifier and
        CatBoostClassifier get early stopping on the validation fold; any
        other estimator is fitted with a plain ``fit(X, y)``.

    Returns
    -------
    auc_cv : ROC AUC of the out-of-fold predictions over the whole train set.
    val_preds : array of out-of-fold positive-class probabilities.
    test_preds : list with one array of test probabilities per fold.
    """
    cv = StratifiedKFold(n_fold, shuffle=True, random_state=random_seed)

    # The test features are identical in every fold, so the CatBoost Pool is built once.
    # Other model types predict straight from the DataFrame; previously they hit a
    # NameError because the pool only existed inside the CatBoost branch.
    if isinstance(model, CatBoostClassifier):
        test_input = Pool(X_test, cat_features=X_test.columns.to_list())
    else:
        test_input = X_test

    test_preds = []
    val_preds = np.zeros(len(X_train))

    for train_idx, val_idx in cv.split(X_train, y_train):

        X_train_fold = pd.concat([X_train.iloc[train_idx], X_original], axis=0)
        y_train_fold = pd.concat([y_train.iloc[train_idx], y_original], axis=0)
        X_val_fold, y_val_fold = X_train.iloc[val_idx], y_train.iloc[val_idx]

        model_cloned = clone(model)
        val_input = X_val_fold

        if isinstance(model_cloned, XGBClassifier):
            # NOTE(review): newer xgboost releases may expect early_stopping_rounds on
            # the estimator rather than in fit() - confirm against the installed version.
            model_cloned.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)], early_stopping_rounds=100, verbose=False)
        elif isinstance(model_cloned, LGBMClassifier):
            model_cloned.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)], eval_metric='logloss', callbacks=[lgb.early_stopping(stopping_rounds=100)])
        elif isinstance(model_cloned, CatBoostClassifier):
            # Every column is declared categorical for CatBoost.
            train_fold_pool = Pool(X_train_fold, y_train_fold, cat_features=X_train_fold.columns.to_list())
            val_fold_pool = Pool(X_val_fold, y_val_fold, cat_features=X_val_fold.columns.to_list())

            model_cloned.fit(X=train_fold_pool, eval_set=val_fold_pool, verbose=False)
            val_input = val_fold_pool
        else:
            # Any other estimator is still fitted instead of being silently skipped.
            model_cloned.fit(X_train_fold, y_train_fold)

        val_preds[val_idx] = model_cloned.predict_proba(val_input)[:, 1]
        test_preds.append(model_cloned.predict_proba(test_input)[:, 1])
        gc.collect()

    auc_cv = roc_auc_score(y_train, val_preds)

    return auc_cv, val_preds, test_preds

In [None]:
%%time

# Cross-validate a CatBoost classifier built from catb_params. validation() returns
# the overall AUC, the out-of-fold predictions and the list of per-fold test predictions.
auc_cv_cat, val_preds_cat, test_preds_cat = validation(CatBoostClassifier(**catb_params))
print("The AUC cv score is:", auc_cv_cat)

# 6. Submission.

In [None]:
# Average the per-fold test predictions element-wise (one value per test row).
y_test = np.asarray(test_preds_cat).mean(axis=0)

# Fill the sample submission with the averaged predictions and write it out.
submission = pd.read_csv('/kaggle/input/playground-series-s4e10/sample_submission.csv').assign(loan_status=y_test)
submission.to_csv('submission.csv', index=False)
submission.head()