In [42]:
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report,confusion_matrix,roc_auc_score

In [30]:
def resumetable(df):
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary['Name'] = summary['index']
    summary = summary[['Name','dtypes']]
    summary['Missing'] = df.isnull().sum().values    
    summary['Uniques'] = df.nunique().values
    summary['First Value'] = df.loc[0].values
    summary['Second Value'] = df.loc[1].values
    return summary

In [31]:
file_loc = "./data/loan_data.csv"
df = pd.read_csv(file_loc)
print ("Total number of rows in dataset = {}".format(df.shape[0]))
print ("Total number of columns in dataset = {}".format(df.shape[1]))

Total number of rows in dataset = 614
Total number of columns in dataset = 6


In [32]:
df.head(3)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,5849.0,0.0,0.0,360.0,Yes,1
1,4583.0,1508.0,128.0,360.0,Yes,0
2,3000.0,0.0,66.0,360.0,Yes,1


In [33]:
result = resumetable(df)
result

Dataset Shape: (614, 6)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value
0,ApplicantIncome,float64,2,503,5849.0,4583.0
1,CoapplicantIncome,float64,2,287,0.0,1508.0
2,LoanAmount,float64,3,203,0.0,128.0
3,Loan_Amount_Term,float64,2,11,360.0,360.0
4,Credit_History,object,0,2,Yes,Yes
5,Loan_Status,int64,0,2,1,0


### Data Preprocessing for Model

In [44]:
target_col = "Loan_Status"
print("Event Rate: ",df["Loan_Status"].mean())
X = df.drop(columns=target_col)
y = df[target_col]

Event Rate:  0.6872964169381107


In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20, 
                                                    random_state=42)

In [36]:
print(X_train.shape, y_train.shape)
print(X_test.shape,y_test.shape)

(491, 5) (491,)
(123, 5) (123,)


In [37]:
features = list(X_train.columns)
cat_features = ["Credit_History"]

In [38]:
# Create pool objects
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

In [39]:
# Initialize and train the model
model_cb = CatBoostClassifier(
    iterations=20,
    learning_rate=0.1,
    depth=2,
    l2_leaf_reg=2,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    od_type='Iter',
    od_wait=10,
    verbose=5
)

model_cb.fit(train_pool, eval_set=test_pool, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.7120640	best: 0.7120640 (0)	total: 267ms	remaining: 5.08s
5:	test: 0.7047965	best: 0.7129360 (3)	total: 272ms	remaining: 634ms
10:	test: 0.6854651	best: 0.7129360 (3)	total: 275ms	remaining: 225ms
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.7129360465
bestIteration = 3

Shrink model to first 4 iterations.


<catboost.core.CatBoostClassifier at 0x2212acf0820>

### Model Predictions and evaluation

In [45]:
# Make predictions
y_pred = model_cb.predict(X_test)
y_pred_proba = model_cb.predict_proba(X_test)[:, 1]

# Print model performance metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print(f"\nROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.3f}")

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.51      0.61        43
           1       0.78      0.91      0.84        80

    accuracy                           0.77       123
   macro avg       0.77      0.71      0.73       123
weighted avg       0.77      0.77      0.76       123


Confusion Matrix:
[[22 21]
 [ 7 73]]

ROC AUC Score: 0.713


### Feature Importances

In [50]:
# Feature importance#-
feature_importance = model_cb.feature_importances_
feature_names = model_cb.feature_names_

feature_importance_df = pd.DataFrame({'Feature': feature_names,'Importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)
feature_importance_df

Unnamed: 0,Feature,Importance
4,Credit_History,98.658095
2,LoanAmount,0.746518
1,CoapplicantIncome,0.595387
0,ApplicantIncome,0.0
3,Loan_Amount_Term,0.0
