In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# Credit Risk: Calculating a Loan Borrower's Probability of Default (PD)

Objective: Build a model that predicts the probability of default (PD) for a loan borrower from their personal details and financial information, such as income, total loans outstanding, and previous default history. The predicted PD is then used to estimate the lender's expected loss, assuming a recovery rate of 10%: $\text{expected loss} = \text{PD} \times (1 - 0.10) \times \text{exposure}$.

Task: Train a classifier that outputs each borrower's probability of default (PD) from their financial details, then calculate the expected loss on the loan assuming a 10% recovery rate.



In [3]:
# Load the loan dataset (path is relative to the working directory) and preview the first rows
df = pd.read_csv('Task 3 and 4_Loan_Data.csv')
df.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


Since we are predicting whether or not a customer will default, our target variable is **default**, which is binary (0/1). This is therefore a binary classification problem, so we will use supervised learning algorithms.

In [6]:
# Quick EDA: two data-quality checks, then summary statistics.

# Per-column count of null entries
missing_values = df.isna().sum()
# Number of rows that exactly repeat an earlier row
duplicates = df.duplicated().sum()
# Emit both results, one after the other, on stdout
print(missing_values, duplicates, sep="\n")

# Descriptive statistics of the numeric columns (rendered as the cell's output)
df.describe()


customer_id                 0
credit_lines_outstanding    0
loan_amt_outstanding        0
total_debt_outstanding      0
income                      0
years_employed              0
fico_score                  0
default                     0
dtype: int64
0


Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4974577.0,1.4612,4159.677034,8718.916797,70039.901401,4.5528,637.5577,0.1851
std,2293890.0,1.743846,1421.399078,6627.164762,20072.214143,1.566862,60.657906,0.388398
min,1000324.0,0.0,46.783973,31.652732,1000.0,0.0,408.0,0.0
25%,2977661.0,0.0,3154.235371,4199.83602,56539.867903,3.0,597.0,0.0
50%,4989502.0,1.0,4052.377228,6732.407217,70085.82633,5.0,638.0,0.0
75%,6967210.0,2.0,5052.898103,11272.26374,83429.166133,6.0,679.0,0.0
max,8999789.0,5.0,10750.67781,43688.7841,148412.1805,10.0,850.0,1.0


In [22]:
class LoanDefaultPrediction:
    """Fit several classifiers on loan data and compare them by test-set ROC AUC.

    Each model predicts the probability that a borrower defaults (PD).

    Typical use::

        predictor = LoanDefaultPrediction(path)
        predictor.preprocess_data()
        results = predictor.train_and_evaluate()
        predictor.display_results(results)

    Attributes set by ``__init__``:
        data: DataFrame read from ``data_path``.
        features: Names of the model input columns.
        target: Name of the label column.
        models: Mapping of display name -> estimator (fitted in place by
            ``train_and_evaluate``).

    Attributes set by ``preprocess_data``:
        xTrain, xTest, yTrain, yTest: The unscaled train/test split.
        scaler: The ``StandardScaler`` fitted on the training features.
        xTrain_scaled, xTest_scaled: Standardized feature arrays.
    """

    def __init__(self, data_path):
        """Read the loan CSV and set up the candidate models.

        Args:
            data_path: Location of the CSV, passed straight to ``pd.read_csv``.
        """
        self.data = pd.read_csv(data_path)
        # 'debt_to_income' and 'payment_to_income' are not read from the file;
        # preprocess_data() derives them from the raw columns.
        self.features = ['credit_lines_outstanding', 'debt_to_income', 'payment_to_income', 'years_employed', 'fico_score']
        self.target = 'default'
        self.models = {
            'Logistic Regression': LogisticRegression(random_state=42, solver='liblinear', max_iter=10000),
            'Decision Tree': DecisionTreeClassifier(random_state=42),
            'Random Forest': RandomForestClassifier(random_state=42),
            'Gradient Boosting': GradientBoostingClassifier(random_state=42),
            # `use_label_encoder` is intentionally not passed: current XGBoost
            # ignores it and emits a "Parameters ... are not used" warning.
            'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss')
        }

    def preprocess_data(self, test_size=0.2, random_state=42):
        """Engineer the ratio features, split the data, and standardize it.

        Adds ``payment_to_income`` and ``debt_to_income`` columns to
        ``self.data``, then stores the split and the scaled arrays on the
        instance (see the class docstring).

        Args:
            test_size: Share of rows held out for evaluation.
            random_state: Seed for the train/test shuffle.

        Raises:
            ValueError: If any ``income`` value is zero, because the ratio
                features would then be infinite or undefined.
        """
        income = self.data['income']
        # Validate before touching self.data so a failure leaves it unchanged.
        if (income == 0).any():
            raise ValueError(
                "income contains zero values; cannot compute payment_to_income "
                "and debt_to_income ratios"
            )

        # Create new features: Payment to Income Ratio and Debt to Income Ratio
        self.data['payment_to_income'] = self.data['loan_amt_outstanding'] / income
        self.data['debt_to_income'] = self.data['total_debt_outstanding'] / income

        # Split the dataset into features and target
        X = self.data[self.features]
        y = self.data[self.target]

        # Train-test split
        self.xTrain, self.xTest, self.yTrain, self.yTest = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Standardize the features. The scaler is fitted on the training split
        # only and then applied to the test split.
        self.scaler = StandardScaler()
        self.xTrain_scaled = self.scaler.fit_transform(self.xTrain)
        self.xTest_scaled = self.scaler.transform(self.xTest)

    def train_and_evaluate(self):
        """Fit every model on the training split and score it on the test split.

        Returns:
            dict mapping model display name -> ROC AUC of the predicted
            probability of default on the test split.

        Raises:
            RuntimeError: If ``preprocess_data()`` has not been called yet.
        """
        if not hasattr(self, 'xTrain_scaled'):
            raise RuntimeError(
                "preprocess_data() must be called before train_and_evaluate()"
            )

        results = {}

        for name, model in self.models.items():
            # Train the model
            model.fit(self.xTrain_scaled, self.yTrain)

            # Column 1 of predict_proba is the probability of class 1 (default)
            y_pred_proba = model.predict_proba(self.xTest_scaled)[:, 1]

            # Calculate AUC-ROC score and store it under the model's name
            results[name] = roc_auc_score(self.yTest, y_pred_proba)

        return results

    def display_results(self, results):
        """Print one ``<model> AUC-ROC: <score>`` line per entry of ``results``.

        Args:
            results: Mapping of model name -> ROC AUC, as returned by
                ``train_and_evaluate``. Scores are shown with four decimals.
        """
        for name, auc in results.items():
            print(f"{name} AUC-ROC: {auc:.4f}")


In [25]:
# Path to the loan dataset CSV, relative to the working directory
data_path = 'Task 3 and 4_Loan_Data.csv'

# Build the predictor: reads the CSV and sets up the five candidate classifiers
loan_predictor = LoanDefaultPrediction(data_path)

# Engineer the income-ratio features, make the 80/20 train/test split, and standardize the features
loan_predictor.preprocess_data()

# Fit every model on the training split; `results` maps model name -> test-set ROC AUC
results = loan_predictor.train_and_evaluate()

# Print one "<model> AUC-ROC: <score>" line per model
# NOTE(review): the recorded scores are all ~0.99-1.00 — confirm this reflects the dataset and not target leakage
loan_predictor.display_results(results)

Logistic Regression AUC-ROC: 1.0000
Decision Tree AUC-ROC: 0.9852
Random Forest AUC-ROC: 0.9999
Gradient Boosting AUC-ROC: 0.9997
XGBoost AUC-ROC: 0.9999


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
