## Credit Risk Modelling

### Saurabh Chatterjee
### Part - 2

In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
import warnings
import os   

In [3]:
# Load the dataset
df = pd.read_excel("final_filtered_dataset.xlsx")       # loading our filtered dataset (prepared in Part-1)

df.head()

Unnamed: 0.1,Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,...,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,MARITALSTATUS,EDUCATION,GENDER,last_prod_enq2,first_prod_enq2,Approved_Flag
0,0,0.0,0.0,0,0.0,0,0,0,4,1,...,0.0,0.0,1,0,Married,12TH,M,PL,PL,P2
1,1,0.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0,0,Single,GRADUATE,F,ConsumerLoan,ConsumerLoan,P2
2,2,0.125,0.0,0,0.0,1,0,0,0,2,...,0.0,0.0,1,0,Married,SSC,M,ConsumerLoan,others,P2
3,3,0.0,0.0,0,0.0,0,0,0,0,3,...,0.0,0.0,0,0,Married,POST-GRADUATE,M,AL,AL,P1
4,4,0.0,0.0,1,0.167,0,0,0,0,6,...,0.429,0.0,1,0,Married,12TH,M,ConsumerLoan,PL,P3


### Categorical Features

In [6]:
# Categorical Columns:
cat_clumns = []

for i in df.columns:
    if df[i].dtype == 'object':
        cat_clumns.append(i)

cat_clumns

['MARITALSTATUS',
 'EDUCATION',
 'GENDER',
 'last_prod_enq2',
 'first_prod_enq2',
 'Approved_Flag']

In [6]:
# Encoding Categorical features into Numerical:

print(df['MARITALSTATUS'].unique())             # One-Hot Encoding
print(df['EDUCATION'].unique())                 # Label / Ordinal Encoding  (since its categories have a rank/order)
print(df['GENDER'].unique())                    # One-Hot Encoding
print(df['last_prod_enq2'].unique())            # One-Hot Encoding
print(df['first_prod_enq2'].unique())           # One-Hot Encoding

['Married' 'Single']
['12TH' 'GRADUATE' 'SSC' 'POST-GRADUATE' 'UNDER GRADUATE' 'OTHERS'
 'PROFESSIONAL']
['M' 'F']
['PL' 'ConsumerLoan' 'AL' 'CC' 'others' 'HL']
['PL' 'ConsumerLoan' 'others' 'AL' 'HL' 'CC']


In [7]:
# Label / Ordinal Encoding: [EDUCATION] Column

df.loc[df['EDUCATION'] == 'SSC', ['EDUCATION']]              = 1
df.loc[df['EDUCATION'] == '12TH', ['EDUCATION']]             = 2
df.loc[df['EDUCATION'] == 'GRADUATE', ['EDUCATION']]         = 3
df.loc[df['EDUCATION'] == 'UNDER GRADUATE', ['EDUCATION']]   = 3
df.loc[df['EDUCATION'] == 'POST-GRADUATE', ['EDUCATION']]    = 4
df.loc[df['EDUCATION'] == 'OTHERS', ['EDUCATION']]           = 1         # 'OTHERS': (assigned = SSC = 1 ) : has to be verified by the business end user
df.loc[df['EDUCATION'] == 'PROFESSIONAL', ['EDUCATION']]     = 3

print(df['EDUCATION'].value_counts())               # count of each Education category
df['EDUCATION'] = df['EDUCATION'].astype(int)
df.info()

EDUCATION
3    18931
2    11703
1     9532
4     1898
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 44 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 42064 non-null  int64  
 1   pct_tl_open_L6M            42064 non-null  float64
 2   pct_tl_closed_L6M          42064 non-null  float64
 3   Tot_TL_closed_L12M         42064 non-null  int64  
 4   pct_tl_closed_L12M         42064 non-null  float64
 5   Tot_Missed_Pmnt            42064 non-null  int64  
 6   CC_TL                      42064 non-null  int64  
 7   Home_TL                    42064 non-null  int64  
 8   PL_TL                      42064 non-null  int64  
 9   Secured_TL                 42064 non-null  int64  
 10  Unsecured_TL               42064 non-null  int64  
 11  Other_TL                   42064 non-null  int64  
 12  Age_Oldest_TL         

In [8]:
# One-Hot Encoding:

df_encoded = pd.get_dummies(df, columns=['MARITALSTATUS','GENDER', 'last_prod_enq2' ,'first_prod_enq2'])        ## FINAL Prepared Data

df_encoded.info()
k = df_encoded.describe()
df_encoded.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 56 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    42064 non-null  int64  
 1   pct_tl_open_L6M               42064 non-null  float64
 2   pct_tl_closed_L6M             42064 non-null  float64
 3   Tot_TL_closed_L12M            42064 non-null  int64  
 4   pct_tl_closed_L12M            42064 non-null  float64
 5   Tot_Missed_Pmnt               42064 non-null  int64  
 6   CC_TL                         42064 non-null  int64  
 7   Home_TL                       42064 non-null  int64  
 8   PL_TL                         42064 non-null  int64  
 9   Secured_TL                    42064 non-null  int64  
 10  Unsecured_TL                  42064 non-null  int64  
 11  Other_TL                      42064 non-null  int64  
 12  Age_Oldest_TL                 42064 non-null  int64  
 13  A

Unnamed: 0.1,Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,...,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_AL,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others
0,0,0.0,0.0,0,0.0,0,0,0,4,1,...,False,False,True,False,False,False,False,False,True,False
1,1,0.0,0.0,0,0.0,0,0,0,0,0,...,True,False,False,False,False,False,True,False,False,False
2,2,0.125,0.0,0,0.0,1,0,0,0,2,...,True,False,False,False,False,False,False,False,False,True
3,3,0.0,0.0,0,0.0,0,0,0,0,3,...,False,False,False,False,True,False,False,False,False,False
4,4,0.0,0.0,1,0.167,0,0,0,0,6,...,True,False,False,False,False,False,False,False,True,False


### Machine Learning Model

In [None]:
y = df_encoded['Approved_Flag']
X = df_encoded.drop(['Approved_Flag'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### (1) Random Forest

In [10]:
rf_classifier = RandomForestClassifier(n_estimators = 200, random_state=42)

rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print ()

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)


for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):        # Class-wise Precision, Recall and F1
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()


Accuracy: 0.7648876738381077

Class p1:
Precision: 0.8439716312056738
Recall: 0.7041420118343196
F1 Score: 0.767741935483871

Class p2:
Precision: 0.7940283400809717
Recall: 0.9330029732408325
F1 Score: 0.8579239952610953

Class p3:
Precision: 0.4444444444444444
Recall: 0.20528301886792452
F1 Score: 0.2808466701084151

Class p4:
Precision: 0.7224926971762414
Recall: 0.7210884353741497
F1 Score: 0.7217898832684825



### (2) XGBoost

In [11]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',  num_class=4)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)      # Label Encoding Target variable: has four level classes: P1, P2, P3, P4


X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

xgb_classifier.fit(X_train, y_train)
y_pred = xgb_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print ()

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):            # Class-wise Precision, Recall and F1
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()

# Accuracy: 0.78

# P3 Accuracy and F1 is Worse: 
# Reason: the data for P3 is ambigUous: its Credit Score range is very large:
# Credit Score rangeS:      P1: 701-811,    P2: 669-700,    P3: 489-776     , P4: 469-658
# P3 range is very low to very high entering into P2 and P4 ranges. -> so the model is also getting confused : and giving bad accuracy for P3.


Accuracy: 0.78

Class p1:
Precision: 0.8243386243386244
Recall: 0.7682445759368837
F1 Score: 0.7953037263910159

Class p2:
Precision: 0.8228051391862955
Recall: 0.9139742319127849
F1 Score: 0.8659968072119447

Class p3:
Precision: 0.4692874692874693
Recall: 0.28830188679245283
F1 Score: 0.35717625058438524

Class p4:
Precision: 0.7219047619047619
Recall: 0.7366375121477162
F1 Score: 0.7291967291967293



Accuracy: 
- Random Forest = 0.76
- XGBoost = 0.78 : Better (F1 Score is also Better)

P3 Accuracy and F1 is Worse: 
- Reason: the data for P3 is ambiguous: its Credit Score range is very large:
- **Credit Score** ranges:      P1: 701-811,    P2: 669-700,    **P3: 489-776**     , P4: 469-658
- P3 range is very low to very high entering into P2 and P4 ranges. -> so the model is also getting confused : and giving bad accuracy for P3.

Let's do further Feature Engineering and Fine-Tuning to get better results.

### Model Evaluation Metrics:
1. Classification:
- Accuracy
- Precision and Recall : for Imbalanced dataset : but there is trade-off between them, so difficulty to follow both separately, solution: F1 Score
- F1 Score : single metric to measure both Precision and Recall together.

2. Regression:
- MAE: 
- MSE:  Values are squared so less interpretability.
- RMSE: Better measure than MSE as its values are comparable to the target value and so can be better interpretated than MSE. \
**Drawback**: *RMSE is scale dependent*: alone RMSE cannot give any idea of how much error it is (low or high), without knowing the scale of the target numerical values.
- MAPE: Mean Absolute Percentage Error = Avg (Absolute Error / Actual Value) % : *Scale Dependent*. \
**Drawback**: Even if one of the Actual value is 0, the MAPE cannot be calculated.
- R2: *Scale Independent* : gives absolute score between 0-1 : BEST

### Hyperparameter Tuning
- For which combination of hyperparameters we are getting best Accuracy / F1 Score.
- First check **Train Accuracy / Score** : if it is High then only measure the **Test Accuracy / Score** : Check *Underfitting / Bias*.
- Then determine **Test Accuracy / Score** : if it is also High then Good. If it is less : *Overfitting*.

In [9]:
# Standardizing Some Numerical Columns:
# Apply Standard Scaler:

y = df_encoded['Approved_Flag']
X = df_encoded.drop(['Approved_Flag'], axis=1)

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

from sklearn.preprocessing import StandardScaler

columns_to_be_scaled = ['Age_Oldest_TL','Age_Newest_TL','time_since_recent_payment',
'max_recent_level_of_deliq','recent_level_of_deliq',
'time_since_recent_enq','NETMONTHLYINCOME','Time_With_Curr_Empr']

# Performing SCALING on Train & Test Data SEPARATELY : *to prevent Data Leakage* :
for i in columns_to_be_scaled: 
    column_data_train = X_train[i].values.reshape(-1, 1)
    column_data_test = X_test[i].values.reshape(-1, 1)

    scaler = StandardScaler()

    scaled_column_train = scaler.fit_transform(column_data_train)
    scaled_column_test = scaler.fit_transform(column_data_test)

    X_train[i] = scaled_column_train
    X_test[i] = scaled_column_test


import xgboost as xgb

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',  num_class=4)

xgb_classifier.fit(X_train, y_train)
y_pred = xgb_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()


# No improvement in metrices

Accuracy: 0.77
Class p1:
Precision: 0.8255813953488372
Recall: 0.7702169625246549
F1 Score: 0.7969387755102041

Class p2:
Precision: 0.8240143369175628
Recall: 0.9113974231912785
F1 Score: 0.8655058823529411

Class p3:
Precision: 0.4500601684717208
Recall: 0.28226415094339624
F1 Score: 0.34693877551020413

Class p4:
Precision: 0.7149621212121212
Recall: 0.7337220602526725
F1 Score: 0.7242206235011991



In [13]:
# Hyperparameter Tuning in XGBoost:

from sklearn.model_selection import GridSearchCV

# Define the XGBClassifier with the initial set of hyperparameters
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=4)

# Define the parameter grid for hyperparameter tuning

param_grid = {
    'n_estimators': [50, 100, 200],         # number of trees in the model. Increasing this value generally improves model performance, but can also lead to overfitting.
    'max_depth': [3, 5, 7],                 # maximum depth of the trees in the model
    'learning_rate': [0.01, 0.1, 0.2],      # Step size shrinkage used in update to prevent overfitting. After each boosting step, we can directly get the weights of new features, and eta shrinks the feature weights to make the boosting process more conservative.
    'alpha' : [1, 10, 100],                 # L1 regularization term on weights
    'colsample_bytree': [0.1, 0.3, 0.5, 0.7, 0.9],      # subsample ratio of columns when constructing each tree. Subsampling occurs once for every tree constructed.
}

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate the model with the best hyperparameters on the test set
best_model = grid_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", accuracy)

# Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'alpha': 1, 'colsample_bytree': 0.9} | Accuracy: 0.81
# Accuracy Improved

Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'alpha': 1, 'colsample_bytree': 0.9}
Test Accuracy: 0.81879472245


### Result Analysis: Explain to Business End User

P1: Best \
P2: Second Best \
P3: Third Best \
P4: Last \

**Risk Appetite:** How much the Bank is willing to sell loan.
- Low : not willing to sell too much : <U>Target already achieved</U> : focus only on P1
- High : willing to sell more loans : <U>Target far from being achieved</U> : accept P1, P2, P3 also. 

**Business Interpretation**: P1, P2 covers the risks. P3, P4 can be used to achieve target.

**Feedback Loop / Model Retraining**:
- Update data features, labels and model based on the feedback received from the end business user.




In [None]:
# Manual Hyperparameter Tuning Code:

# # Define the hyperparameter grid
# param_grid = {
#   'colsample_bytree': [0.1, 0.3, 0.5, 0.7, 0.9],      # subsample ratio of columns when constructing each tree. Subsampling occurs once for every tree constructed.
#   'learning_rate'   : [0.001, 0.01, 0.1, 1],          # Step size shrinkage used in update to prevent overfitting. After each boosting step, we can directly get the weights of new features, and eta shrinks the feature weights to make the boosting process more conservative.
#   'max_depth'       : [3, 5, 8, 10],
#   'alpha'           : [1, 10, 100],                   # L1 regularization term on weights
#   'n_estimators'    : [10,50,100]                     # number of trees in the model. Increasing this value generally improves model performance, but can also lead to overfitting.
# }

# index = 0

# answers_grid = {
#     'combination'       :[],
#     'train_Accuracy'    :[],
#     'test_Accuracy'     :[],
#     'colsample_bytree'  :[],
#     'learning_rate'     :[],
#     'max_depth'         :[],
#     'alpha'             :[],
#     'n_estimators'      :[]

#     }


# # Loop through each combination of hyperparameters
# for colsample_bytree in param_grid['colsample_bytree']:
#   for learning_rate in param_grid['learning_rate']:
#     for max_depth in param_grid['max_depth']:
#       for alpha in param_grid['alpha']:
#           for n_estimators in param_grid['n_estimators']:
             
#               index = index + 1
             
#               # Define and train the XGBoost model
#               model = xgb.XGBClassifier(objective='multi:softmax',  
#                                        num_class=4,
#                                        colsample_bytree = colsample_bytree,
#                                        learning_rate = learning_rate,
#                                        max_depth = max_depth,
#                                        alpha = alpha,
#                                        n_estimators = n_estimators)
               
       
                     
#               y = df_encoded['Approved_Flag']
#               x = df_encoded. drop ( ['Approved_Flag'], axis = 1 )

#               label_encoder = LabelEncoder()
#               y_encoded = label_encoder.fit_transform(y)


#               x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)


#               model.fit(x_train, y_train)
  

       
#               # Predict on training and testing sets
#               y_pred_train = model.predict(x_train)
#               y_pred_test = model.predict(x_test)
       
       
#               # Calculate train and test results
              
#               train_accuracy =  accuracy_score (y_train, y_pred_train)
#               test_accuracy  =  accuracy_score (y_test , y_pred_test)
              
              
       
#               # Include into the lists
#               answers_grid ['combination']   .append(index)
#               answers_grid ['train_Accuracy']    .append(train_accuracy)
#               answers_grid ['test_Accuracy']     .append(test_accuracy)
#               answers_grid ['colsample_bytree']   .append(colsample_bytree)
#               answers_grid ['learning_rate']      .append(learning_rate)
#               answers_grid ['max_depth']          .append(max_depth)
#               answers_grid ['alpha']              .append(alpha)
#               answers_grid ['n_estimators']       .append(n_estimators)
       
       
#               # Print results for this combination
#               print(f"Combination {index}")
#               print(f"colsample_bytree: {colsample_bytree}, learning_rate: {learning_rate}, max_depth: {max_depth}, alpha: {alpha}, n_estimators: {n_estimators}")
#               print(f"Train Accuracy: {train_accuracy:.2f}")
#               print(f"Test Accuracy : {test_accuracy :.2f}")
#               print("-" * 30)