In [1]:
#Credit Risk Resampling Techniques

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [10]:
# Load the data
file_path = Path('../Classification/lending_data.csv')
df = pd.read_csv(file_path)
df

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.430740,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk
...,...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,own,86600,0.653580,12,2,56600,high_risk
77532,17700.0,10.662,mortgage,80900,0.629172,11,2,50900,high_risk
77533,17600.0,10.595,rent,80300,0.626401,11,2,50300,high_risk
77534,16300.0,10.068,mortgage,75300,0.601594,10,2,45300,high_risk


In [12]:
#Split the Data into Training and Testing

In [13]:
# Create our features
X = df.drop(columns = 'loan_status')

# Create our target
y = df['loan_status']

In [14]:
X.describe() 

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0


In [17]:
# Check the balance of our target values
y.value_counts()

low_risk     75036
high_risk     2500
Name: loan_status, dtype: int64

In [43]:
# Create X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify= y)

In [44]:
#Data Pre-Processing
# Create the StandardScaler instance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()


In [60]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset
X_scaler = scaler.fit(X_train)

In [61]:
# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [62]:
#Simple Logistic Regression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [63]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9575744975744975

In [64]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[  575,    50],
       [   91, 18668]])

In [65]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.86      0.92      1.00      0.89      0.96      0.91       625
   low_risk       1.00      1.00      0.92      1.00      0.96      0.92     18759

avg / total       0.99      0.99      0.92      0.99      0.96      0.92     19384



In [67]:
#Oversampling
#Naive Random Oversampling

In [68]:
from imblearn.over_sampling import RandomOverSampler
# Resample the training data with the RandomOversampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
# View the count of target classes with Counter
Counter(y_resampled) 

Counter({'low_risk': 56277, 'high_risk': 56277})

In [69]:
Counter(X_resampled) 

Counter({'loan_size': 1,
         'interest_rate': 1,
         'borrower_income': 1,
         'debt_to_income': 1,
         'num_of_accounts': 1,
         'derogatory_marks': 1,
         'total_debt': 1})

In [95]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model2 = LogisticRegression(solver='lbfgs', random_state=1)
model2.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [96]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model2.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  623,     2],
       [  105, 18654]])

In [72]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.9956279972279972

In [84]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.86      1.00      0.99      0.92      1.00      0.99       625
   low_risk       1.00      0.99      1.00      1.00      1.00      0.99     18759

avg / total       1.00      0.99      1.00      0.99      1.00      0.99     19384



In [75]:
#SMOTE Oversampling
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled2, y_resampled2 = SMOTE(random_state=1, sampling_strategy=1.0).fit_resample(
    X_train, y_train
)

# View the count of target classes with Counter
from collections import Counter

Counter(y_resampled2)

Counter({'low_risk': 56277, 'high_risk': 56277})

In [97]:
# Train the Logistic Regression model using the resampled data
model3 = LogisticRegression(solver='lbfgs', random_state=1)
model3.fit(X_resampled2, y_resampled2)

LogisticRegression(random_state=1)

In [98]:
# Calculated the balanced accuracy score
y_pred2 = model3.predict(X_test)
balanced_accuracy_score(y_test, y_pred2)

0.9956013433551896

In [82]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred2)

array([[  623,     2],
       [  105, 18654]])

In [83]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred2))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.86      1.00      0.99      0.92      1.00      0.99       625
   low_risk       1.00      0.99      1.00      1.00      1.00      0.99     18759

avg / total       1.00      0.99      1.00      0.99      1.00      0.99     19384



In [85]:
#Undersampling
# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled3, y_resampled3 = cc.fit_resample(X_train, y_train)

# View the count of target classes with Counter
Counter(y_resampled3)

Counter({'high_risk': 1875, 'low_risk': 1875})

In [99]:
# Train the Logistic Regression model using the resampled data
model4 = LogisticRegression(solver='lbfgs', random_state=1)
model4.fit(X_resampled3, y_resampled3)

LogisticRegression(random_state=1)

In [100]:
# Display the confusion matrix
y_pred3 = model4.predict(X_test)
confusion_matrix(y_test, y_pred3)

array([[  615,    10],
       [  100, 18659]])

In [88]:
# Calculate the balanced accuracy score
balanced_accuracy_score(y_test, y_pred3)

0.9893346127192282

In [89]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred3))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.86      0.98      0.99      0.92      0.99      0.98       625
   low_risk       1.00      0.99      0.98      1.00      0.99      0.98     18759

avg / total       0.99      0.99      0.98      0.99      0.99      0.98     19384



In [94]:
#Combination (Over and Under) Sampling
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

X= X.drop(columns='homeowner')
smote_enn = SMOTEENN(random_state=0)
X_resampled4, y_resampled4 = smote_enn.fit_resample(X, y)

# View the count of target classes with Counter
Counter(y_resampled4)

Counter({'high_risk': 72600, 'low_risk': 74591})

In [105]:
# Train the Logistic Regression model using the resampled data
model5 = LogisticRegression(solver='lbfgs', random_state=1)
model5.fit(X_resampled4, y_resampled4)

LogisticRegression(random_state=1)

In [106]:
# Display the confusion matrix
y_pred4 = model5.predict(X_test)
confusion_matrix(y_test, y_pred4)

array([[  623,     2],
       [  110, 18649]])

In [107]:
# Calculate the balanced accuracy score
balanced_accuracy_score(y_test, y_pred4)

0.995468073991151

In [104]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred4))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.86      1.00      0.99      0.92      1.00      0.99       625
   low_risk       1.00      0.99      1.00      1.00      1.00      0.99     18759

avg / total       1.00      0.99      1.00      0.99      1.00      0.99     19384



In [108]:
#Final Questions

#1. Which model had the best balanced accuracy score?
print('1. The Naive Over Sampling model had the best balanced accuracy score at 0.9956279972279972')


#2. Which model had the best recall score?
print('2. All models practically had the same recall score of 0.99')

#3. Which model had the best geometric mean score?
print('3. The Naive Over Sampling model, the SMOTE Over Sampling model, and SMOTEENN model tied for the best geometric score at 1.00')

1. The Naive Over Sampling model had the best balanced accuracy score at 0.9956279972279972
2. All models practically had the same recall score of 0.99
3. The Naive Over Sampling model, the SMOTE Over Sampling model, and SMOTEENN model tied for the best geometric score at 1.00
