##  Credit Risk Modelling 

This notebook trains and evaluates several models (logistic regression, XGBoost and LightGBM) for predicting loan defaults.

In [17]:
# Standard library
from pathlib import Path

# Third-party libraries for data handling and modelling
import joblib
import numpy as np
import optuna
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Project modules
from credit_risk_model.config import DATA_PROCESSED, MODELS_DIR
from credit_risk_model.data_cleaning import clean_all
from credit_risk_model.feat_eng import engineer_features
from credit_risk_model.model import (
    compare_models,
    compare_tuned_and_baseline_models,
    hyperparameter_tuning_lightgbm,
    hyperparameter_tuning_xgboost,
    train_lightgbm,
    train_logistic_regression,
    train_xgboost,
)

In [3]:
# Read agg_main.csv from the DATA_PROCESSED location defined in credit_risk_model.config.
agg_main_path = DATA_PROCESSED / "agg_main.csv"
df = pd.read_csv(agg_main_path)

In [4]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,target,credit_income_ratio,total_income,total_credit_requested,monthly_loan_payment,value_of_goods_financed,age_years,employment_years,NAME_CONTRACT_TYPE,CODE_GENDER,...,avg_amt_credit,approval_rate,n_active_contracts,avg_months_active,n_completed_contracts,avg_payment_ratio,installments_n_late_payments,avg_balance,max_balance,avg_utilization
0,0,2.857143,157500.0,450000.0,35685.0,450000.0,59.309589,12.909589,Cash loans,F,...,443773.607143,0.428571,34.0,-34.794118,5.0,1.056709,2.0,29452.9005,158449.5,
1,0,1.2,112500.0,135000.0,7879.5,135000.0,48.030137,0.441096,Cash loans,F,...,170367.75,0.75,59.0,-26.881356,4.0,1.0,0.0,0.0,0.0,0.0
2,0,6.04,112500.0,679500.0,19867.5,679500.0,37.117808,5.468493,Cash loans,F,...,145647.75,0.833333,35.0,-28.428571,3.0,0.897436,7.0,0.0,0.0,0.0
3,0,2.9,225000.0,652500.0,47610.0,652500.0,38.049315,6.347945,Cash loans,M,...,167734.5,1.0,24.0,-29.0,2.0,0.94186,3.0,279583.392414,463463.865,0.759396
4,0,0.733333,135000.0,99000.0,10395.0,99000.0,27.446575,1.038356,Cash loans,F,...,146957.785714,1.0,49.0,-47.367347,4.0,0.979031,16.0,193191.53371,463813.695,0.674371


## split data

The data is split this early to avoid leaking information from the validation set into training: the cleaning step uses dataset-level statistics, so it has to run after the split rather than on the full dataset.


In [5]:
# Separate the label column from the feature columns.
y = df["target"]
X = df.drop(columns=["target"])

# 80/20 hold-out split, stratified on the label so both parts keep the same
# class balance; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [6]:
# Clean each split on its own (train first, then validation).
# NOTE(review): the notebook text says cleaning uses data-level statistics.
# Calling clean_all separately on X_val presumably means validation rows are
# cleaned with validation-derived statistics rather than the training ones --
# confirm this is intended.
X_train, X_val = clean_all(X_train), clean_all(X_val)

In [7]:
# Feature engineering is applied to each split independently.
X_train = engineer_features(X_train)
X_val = engineer_features(X_val)

# Replace infinity with 0 (np.inf / -np.inf values are overwritten in both splits).
X_train = X_train.replace([np.inf, -np.inf], 0)
X_val = X_val.replace([np.inf, -np.inf], 0)

# Train and validation were cleaned/engineered separately, so any column derived
# from the values present in a split (e.g. an encoded category that never occurs
# in the validation rows) is not guaranteed to exist in both frames or to appear
# in the same order. Force the validation frame onto the training schema:
# columns missing from X_val are created and filled with 0, columns unknown to
# X_train are dropped. When the schemas already match this is a no-op.
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)

## Evaluate models 

### Baseline model (Logistic Regression)
Scaling is applied only to the continuous columns; encoded and binary columns are left unscaled.

NOTE: tree-based models don't require scaling; it is done here only for logistic regression.

In [8]:
# Train the baseline. The call yields three objects: the pipeline, the scores
# fed to ROC-AUC below, and the labels fed to the confusion matrix / report.
log_reg_pipe, y_proba, y_pred = train_logistic_regression(X_train, y_train, X_val, y_val)

banner = "=" * 50
print(banner)
print("BASELINE MODEL PERFORMANCE (Logistic Regression)")
print(banner)

baseline_auc = roc_auc_score(y_val, y_proba)
print(f"\nROC-AUC Score: {baseline_auc:.4f}")

print("\nConfusion Matrix:")
baseline_cm = confusion_matrix(y_val, y_pred)
print(baseline_cm)

print("\nClassification Report:")
baseline_report = classification_report(y_val, y_pred)
print(baseline_report)

BASELINE MODEL PERFORMANCE (Logistic Regression)

ROC-AUC Score: 0.6979

Confusion Matrix:
[[36284 20254]
 [ 1721  3244]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.64      0.77     56538
           1       0.14      0.65      0.23      4965

    accuracy                           0.64     61503
   macro avg       0.55      0.65      0.50     61503
weighted avg       0.89      0.64      0.72     61503



### XGBoost

In [9]:
# Train the untuned XGBoost model and report its validation metrics.
xgb_model, y_proba_xgb, y_pred_xgb = train_xgboost(X_train, y_train, X_val, y_val)

xgb_rule = "=" * 50
print(xgb_rule, "XGBoost Performance", xgb_rule, sep="\n")

xgb_auc = roc_auc_score(y_val, y_proba_xgb)
print(f"ROC-AUC: {xgb_auc:.4f}")

xgb_report = classification_report(y_val, y_pred_xgb)
print("\n", xgb_report)

XGBoost Performance
ROC-AUC: 0.7218

               precision    recall  f1-score   support

           0       0.94      0.90      0.92     56538
           1       0.23      0.32      0.26      4965

    accuracy                           0.86     61503
   macro avg       0.58      0.61      0.59     61503
weighted avg       0.88      0.86      0.87     61503



### LightGBM

In [10]:
# Train the untuned LightGBM model and report its validation metrics.
lgbm_model, y_proba_lgbm, y_pred_lgbm = train_lightgbm(X_train, y_train, X_val, y_val)

lgbm_rule = "=" * 50
print(lgbm_rule, "LightGBM Performance", lgbm_rule, sep="\n")

lgbm_auc = roc_auc_score(y_val, y_proba_lgbm)
print(f"ROC-AUC: {lgbm_auc:.4f}")

lgbm_report = classification_report(y_val, y_pred_lgbm)
print("\n", lgbm_report)

LightGBM Performance
ROC-AUC: 0.7212

               precision    recall  f1-score   support

           0       0.94      0.90      0.92     56538
           1       0.22      0.32      0.26      4965

    accuracy                           0.86     61503
   macro avg       0.58      0.61      0.59     61503
weighted avg       0.88      0.86      0.87     61503



### Compare results

In [11]:
# Build one metrics table for the three baseline models on the validation set.
# NOTE(review): in the recorded output the LogisticRegression row shows
# precision ~0.08 / recall ~1.00, while the classification report printed in the
# baseline cell shows precision 0.14 / recall 0.65 for class 1 on the same data.
# The AUC is identical, so the scores agree but the hard predictions do not --
# possibly a different decision threshold inside compare_models; confirm.
results_df = compare_models(
    X_val,
    y_val,
    log_reg_pipe,
    xgb_model,
    lgbm_model,
)
results_df

Unnamed: 0,AUC-ROC,AUC-PR,Precision,Recall,F1-score,KS,Gini
XGBoost,0.72178,0.197075,0.225014,0.317422,0.263347,0.327392,0.443561
LightGBM,0.721236,0.197589,0.221108,0.315206,0.259902,0.331329,0.442472
LogisticRegression,0.697947,0.168812,0.081353,0.999194,0.150456,0.297125,0.395894


### Hyperparameter Tuning

#### Hyperparameter tuning for XGBoost

In [12]:
# Run the XGBoost hyperparameter search (the recorded log shows Optuna trials).
# NOTE(review): the search receives X_val / y_val, and the tuned model is later
# compared on that same validation set. If the trials are scored on it, the
# reported tuned AUC is presumably optimistic -- confirm, and consider a
# separate hold-out set for the final comparison.
xgb_tuned = hyperparameter_tuning_xgboost(
    X_train,
    y_train,
    X_val,
    y_val,
)

[I 2025-11-29 13:33:05,620] A new study created in memory with name: no-name-b271558e-fb49-4412-af31-cbdb7d1dbc48
[I 2025-11-29 13:33:59,735] Trial 0 finished with value: 0.7164098083449975 and parameters: {'n_estimators': 453, 'max_depth': 8, 'learning_rate': 0.07047178428065659, 'subsample': 0.9751849949064038, 'colsample_bytree': 0.736364998625361, 'gamma': 0.04054852035985379, 'min_child_weight': 4}. Best is trial 0 with value: 0.7164098083449975.
[I 2025-11-29 13:35:00,817] Trial 1 finished with value: 0.7242522590034448 and parameters: {'n_estimators': 650, 'max_depth': 5, 'learning_rate': 0.08472568226228272, 'subsample': 0.7212609790184505, 'colsample_bytree': 0.6575513166343617, 'gamma': 3.284748694541343, 'min_child_weight': 10}. Best is trial 1 with value: 0.7242522590034448.
[I 2025-11-29 13:35:57,480] Trial 2 finished with value: 0.7212118705500746 and parameters: {'n_estimators': 717, 'max_depth': 6, 'learning_rate': 0.15220842922148717, 'subsample': 0.9838960270989887, '

#### Hyperparameter tuning for LightGBM

In [13]:
# Run the LightGBM hyperparameter search (the recorded log shows Optuna trials).
# NOTE(review): the search receives X_val / y_val, and the tuned model is later
# compared on that same validation set. If the trials are scored on it, the
# reported tuned AUC is presumably optimistic -- confirm, and consider a
# separate hold-out set for the final comparison.
lgbm_tuned = hyperparameter_tuning_lightgbm(
    X_train,
    y_train,
    X_val,
    y_val,
)

[I 2025-11-29 14:19:24,706] A new study created in memory with name: no-name-5bb0be1d-b349-4da8-87c6-79ac95dc2031
[I 2025-11-29 14:20:07,099] Trial 0 finished with value: 0.7084419957353318 and parameters: {'num_leaves': 111, 'max_depth': 4, 'learning_rate': 0.005176217871973492, 'n_estimators': 731, 'subsample': 0.9979948843823482, 'colsample_bytree': 0.9125230188500855, 'min_child_samples': 40, 'reg_lambda': 1.4470151665475877, 'reg_alpha': 3.047535031917981}. Best is trial 0 with value: 0.7084419957353318.
[I 2025-11-29 14:20:53,470] Trial 1 finished with value: 0.7236875504455343 and parameters: {'num_leaves': 41, 'max_depth': 8, 'learning_rate': 0.050814774614353936, 'n_estimators': 727, 'subsample': 0.9767340194771592, 'colsample_bytree': 0.8919206929643089, 'min_child_samples': 97, 'reg_lambda': 0.5837414072600206, 'reg_alpha': 3.074402867009105}. Best is trial 1 with value: 0.7236875504455343.
[I 2025-11-29 14:21:13,947] Trial 2 finished with value: 0.7224104263467677 and param

### Compare all models (baseline and tuned)

In [14]:
# Side-by-side validation AUC for the baseline models and their tuned variants.
# This rebinds results_df, replacing the baseline-only table built earlier.
results_df = compare_tuned_and_baseline_models(
    X_val,
    y_val,
    log_reg_pipe,
    xgb_model,
    lgbm_model,
    xgb_tuned,
    lgbm_tuned,
)
results_text = results_df.to_string(index=False)
print(results_text)

             Model  Baseline AUC  Tuned AUC
LogisticRegression      0.697947        NaN
           XGBoost      0.721780   0.727390
          LightGBM      0.721236   0.726909


### Feature Importance

In [16]:
# Importance scores reported by the tuned XGBoost model.
importance = xgb_tuned.feature_importances_

# Pair each score with a column name, positionally.
# NOTE(review): this assumes the model was fitted on exactly X_train's columns,
# in this order -- confirm inside hyperparameter_tuning_xgboost.
feat_imp_df_xgb = pd.DataFrame({"feature": X_train.columns, "importance": importance})
feat_imp_df_xgb = feat_imp_df_xgb.sort_values(by="importance", ascending=False)

# Show the 30 highest-ranked features.
feat_imp_df_xgb.head(30)


Unnamed: 0,feature,importance
8,CODE_GENDER,0.053533
30,avg_utilization,0.043519
12,avg_debt_ratio,0.040324
7,NAME_CONTRACT_TYPE,0.037807
18,n_refused,0.03353
57,bureau_activity_ratio,0.029835
22,approval_rate,0.029601
33,Core staff,0.02911
4,value_of_goods_financed,0.026343
53,has_many_active_loans,0.025684


### Save trained models (best validation AUC: tuned XGBoost)

In [18]:
# Models to persist, keyed by the file stem used on disk
# (written as <MODELS_DIR>/<name>.joblib).
all_models = {
    "log_reg": log_reg_pipe,
    "XGBoost": xgb_tuned,
    "LightGBM": lgbm_tuned,
}

# Path() accepts both str and Path, so this works however MODELS_DIR is defined
# in the project config.
models_dir = Path(MODELS_DIR)
# joblib.dump does not create missing directories; without this the cell fails
# on a fresh checkout where the models directory does not exist yet.
models_dir.mkdir(parents=True, exist_ok=True)

for name, model in all_models.items():
    # Build the target with the path operator instead of string formatting so
    # separators are handled by pathlib.
    joblib.dump(model, models_dir / f"{name}.joblib")
