In [1]:
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report
import time

In [2]:
# Load the input table. The file name suggests NaNs and duplicate rows were
# already removed by an earlier preprocessing step (not verified here).
DATA_PATH = './Datasets/Data_Removed_NaN_and_Duplicates.csv'
data = pd.read_csv(DATA_PATH)

## Data splits

In [3]:
# Column roles used by every model below.
target = 'loan_status'
categorical_features = ['zip_code', 'addr_state', 'emp_length', 'title_category']

# Work on a stratified subsample: the "test" side of this split (size = fraction)
# is kept as the working dataset, the remainder is discarded.
fraction = 0.1
_, sampled_data = train_test_split(
    data,
    test_size=fraction,
    random_state=42,
    stratify=data[target],
)

# Hold out 30% of the sample for testing, preserving the class ratio.
train_val_data, test_data = train_test_split(
    sampled_data,
    test_size=0.3,
    random_state=42,
    stratify=sampled_data[target],
)

# Split the remaining 70% evenly into training and validation sets
# (i.e. 35% / 35% / 30% of the sample for train / validation / test).
train_data, validation_data = train_test_split(
    train_val_data,
    test_size=0.5,
    random_state=42,
    stratify=train_val_data[target],
)

### Model 1

In [4]:
# Model 1: CatBoost classifier on the original (imbalanced) training data.
# Iteration count and learning rate are not set here, so CatBoost picks them.
model_params = {
    'cat_features': categorical_features,  # columns CatBoost should treat as categorical
    'eval_metric': 'Logloss',
    'early_stopping_rounds': 50,           # patience on the eval set
    'random_state': 42,
    'verbose': 100,                        # log every 100th iteration
}
model = cb.CatBoostClassifier(**model_params)

In [5]:
# Separate the feature columns from the target for each split.
X_train = train_data.drop(columns=[target])
y_train = train_data[target]

X_validation = validation_data.drop(columns=[target])
y_validation = validation_data[target]

X_test = test_data.drop(columns=[target])
y_test = test_data[target]

In [6]:
# Train Model 1, monitoring Logloss on the validation split (also used for
# early stopping). plot=True renders CatBoost's interactive metric widget.
validation_set = (X_validation, y_validation)
model.fit(X_train, y_train, eval_set=validation_set, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.170562
0:	learn: 0.4254439	test: 0.4253030	best: 0.4253030 (0)	total: 536ms	remaining: 8m 55s
100:	learn: 0.1053445	test: 0.1053909	best: 0.1053909 (100)	total: 35s	remaining: 5m 11s
200:	learn: 0.1024292	test: 0.1029210	best: 0.1029210 (200)	total: 1m 9s	remaining: 4m 37s
300:	learn: 0.1008702	test: 0.1018731	best: 0.1018730 (298)	total: 1m 44s	remaining: 4m 3s
400:	learn: 0.0998091	test: 0.1012326	best: 0.1012326 (400)	total: 2m 19s	remaining: 3m 27s
500:	learn: 0.0989405	test: 0.1008423	best: 0.1008419 (499)	total: 2m 53s	remaining: 2m 52s
600:	learn: 0.0982547	test: 0.1006068	best: 0.1006068 (600)	total: 3m 27s	remaining: 2m 17s
700:	learn: 0.0976959	test: 0.1004621	best: 0.1004621 (700)	total: 4m 3s	remaining: 1m 43s
800:	learn: 0.0971528	test: 0.1003486	best: 0.1003470 (798)	total: 4m 39s	remaining: 1m 9s
900:	learn: 0.0966837	test: 0.1002983	best: 0.1002980 (899)	total: 5m 12s	remaining: 34.3s
999:	learn: 0.0962099	test: 0.1001951	best: 0.1001951 (999)	tot

<catboost.core.CatBoostClassifier at 0x1dce01a8850>

In [7]:
# Evaluate Model 1 on the held-out test split.
# Column 1 of predict_proba is the probability of the second class.
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 on the test set.
class_labels = ['Class 0', 'Class 1']
report = classification_report(y_test, y_pred, target_names=class_labels)
print(report)

              precision    recall  f1-score   support

     Class 0       0.98      0.98      0.98    719609
     Class 1       0.74      0.75      0.74     61055

    accuracy                           0.96    780664
   macro avg       0.86      0.86      0.86    780664
weighted avg       0.96      0.96      0.96    780664



- Quite good model output with default parameters.

- High Precision and Recall for Class 0:

    - Precision of 0.98 and Recall of 0.98 for Class 0 indicate excellent model performance in correctly identifying and classifying instances of this class.

- Moderate Performance for Class 1:

    - Precision of 0.74 and Recall of 0.75 for Class 1 suggest moderate accuracy in classifying this class. While good, there is room for improvement.
    - The lower performance for Class 1 compared to Class 0 might indicate a challenge in distinguishing Class 1 instances or a class imbalance issue.

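- The macro average (0.86) and weighted average (0.96) indicate solid overall performance. The gap between them shows that the headline metrics are biased towards Class 0 due to its higher representation in the dataset (about 92% of the test samples); the macro average is the better indicator of how well Class 1 is handled.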

### Model 2

In [8]:
# Balance the classes with Random Over Sampling (ROS).
# Only the training split is resampled; validation and test are rebuilt from
# the untouched frames so they keep their original class distribution.
ros = RandomOverSampler(random_state=42)
train_features = train_data.drop(columns=[target])
train_labels = train_data[target]
X_train, y_train = ros.fit_resample(train_features, train_labels)

X_validation = validation_data.drop(columns=[target])
y_validation = validation_data[target]

X_test = test_data.drop(columns=[target])
y_test = test_data[target]

In [9]:
# Model 2: same CatBoost configuration style as Model 1, trained on the
# randomly oversampled training set (X_train / y_train from the ROS cell).
#
# The early-stopping patience is set exactly once, here. Supplying it both in
# the constructor and in fit() with different values makes it unclear which
# one is in effect; 100 is the value that was being passed to fit().
# Logging frequency (verbose) also lives in the constructor, as for Model 1.
model_2 = cb.CatBoostClassifier(
    cat_features=categorical_features,
    eval_metric='Logloss',
    early_stopping_rounds=100,
    random_state=42,
    verbose=100,
)

# Validation data is NOT oversampled, so the reported eval Logloss is measured
# on the original class distribution.
model_2.fit(
    X_train,
    y_train,
    eval_set=(X_validation, y_validation),
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.198382
0:	learn: 0.4963226	test: 0.4974139	best: 0.4974139 (0)	total: 641ms	remaining: 10m 40s
100:	learn: 0.1653257	test: 0.1903342	best: 0.1903342 (100)	total: 56.2s	remaining: 8m 19s
200:	learn: 0.1415669	test: 0.1776840	best: 0.1776840 (200)	total: 1m 52s	remaining: 7m 27s
300:	learn: 0.1320316	test: 0.1745605	best: 0.1737477 (298)	total: 2m 49s	remaining: 6m 34s
400:	learn: 0.1248110	test: 0.1711224	best: 0.1708833 (371)	total: 3m 44s	remaining: 5m 35s
500:	learn: 0.1199556	test: 0.1704607	best: 0.1700767 (498)	total: 4m 44s	remaining: 4m 42s
600:	learn: 0.1162017	test: 0.1690934	best: 0.1690934 (600)	total: 5m 43s	remaining: 3m 48s
700:	learn: 0.1121467	test: 0.1689886	best: 0.1689886 (700)	total: 6m 41s	remaining: 2m 51s
800:	learn: 0.1092709	test: 0.1687223	best: 0.1687223 (800)	total: 7m 39s	remaining: 1m 54s
900:	learn: 0.1063003	test: 0.1688613	best: 0.1685187 (866)	total: 8m 38s	remaining: 57s
999:	learn: 0.1038025	test: 0.1685922	best: 0.1684128 (924

<catboost.core.CatBoostClassifier at 0x1dcd8e7f850>

In [10]:
# Evaluate Model 2 (trained on oversampled data) on the same untouched test split.
y_pred = model_2.predict(X_test)

# Per-class precision / recall / F1 on the test set.
class_labels = ['Class 0', 'Class 1']
report = classification_report(y_test, y_pred, target_names=class_labels)
print(report)

              precision    recall  f1-score   support

     Class 0       0.97      0.97      0.97    719609
     Class 1       0.64      0.66      0.65     61055

    accuracy                           0.94    780664
   macro avg       0.81      0.81      0.81    780664
weighted avg       0.95      0.94      0.94    780664



- Class 0 Performance: Similarly high precision and recall (0.97), showing effective classification for Class 0.
- Class 1 Performance: Lower precision (0.64) and recall (0.66) compared to Model 1, indicating more challenges in correctly classifying Class 1.


- Overall Accuracy and Balance:
    - Slightly lower overall accuracy (94%) compared to Model 1 (96%).
    - Macro average (0.81) and weighted average (0.94–0.95) suggest good performance but with a more noticeable bias towards Class 0 than in Model 1.


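- A relatively aggressive learning rate (set automatically to about 0.198, versus about 0.171 for Model 1).
- Gradual decrease in Logloss over many iterations, indicating a good but less effective learning compared to Model 1.
- Higher final validation Logloss than Model 1 (about 0.168 versus about 0.100), suggesting a less optimal fit to the validation data. Note that Model 2 was trained on oversampled (balanced) data while the validation set keeps the original class distribution, which by itself tends to raise the validation Logloss.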

### Saving models

In [11]:
# Persist both fitted classifiers to the working directory.
models_to_save = (
    (model, "model_original_status.bin"),   # trained on the original dataset
    (model_2, "model_ros_status.bin"),      # trained on the ROS (oversampled) dataset
)
for fitted_model, output_path in models_to_save:
    fitted_model.save_model(output_path)