# Credit Risk Resampling Techniques

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

**Read the CSV and Perform Basic Data Cleaning**

In [4]:
columns = [
    "loan_amnt", "int_rate", "installment", "home_ownership",
    "annual_inc", "verification_status", "issue_d", "loan_status",
    "pymnt_plan", "dti", "delinq_2yrs", "inq_last_6mths",
    "open_acc", "pub_rec", "revol_bal", "total_acc",
    "initial_list_status", "out_prncp", "out_prncp_inv", "total_pymnt",
    "total_pymnt_inv", "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
    "recoveries", "collection_recovery_fee", "last_pymnt_amnt", "next_pymnt_d",
    "collections_12_mths_ex_med", "policy_code", "application_type", "acc_now_delinq",
    "tot_coll_amt", "tot_cur_bal", "open_acc_6m", "open_act_il",
    "open_il_12m", "open_il_24m", "mths_since_rcnt_il", "total_bal_il",
    "il_util", "open_rv_12m", "open_rv_24m", "max_bal_bc",
    "all_util", "total_rev_hi_lim", "inq_fi", "total_cu_tl",
    "inq_last_12m", "acc_open_past_24mths", "avg_cur_bal", "bc_open_to_buy",
    "bc_util", "chargeoff_within_12_mths", "delinq_amnt", "mo_sin_old_il_acct",
    "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl", "mort_acc",
    "mths_since_recent_bc", "mths_since_recent_inq", "num_accts_ever_120_pd", "num_actv_bc_tl",
    "num_actv_rev_tl", "num_bc_sats", "num_bc_tl", "num_il_tl",
    "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0",
    "num_sats", "num_tl_120dpd_2m", "num_tl_30dpd", "num_tl_90g_dpd_24m",
    "num_tl_op_past_12m", "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies",
    "tax_liens", "tot_hi_cred_lim", "total_bal_ex_mort", "total_bc_limit",
    "total_il_high_credit_limit", "hardship_flag", "debt_settlement_flag"
]

target = ["loan_status"]

In [6]:

# Load the data
file_path = Path('/kaggle/input/loanstats/LoanStats_2019Q1.csv')
df = pd.read_csv(file_path, skiprows=1)[:-2]
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the `Issued` loan status
issued_mask = df['loan_status'] != 'Issued'
df = df.loc[issued_mask]

# Convert interest rate to numerical
df['int_rate'] = df['int_rate'].str.replace('%', '')
df['int_rate'] = df['int_rate'].astype('float') / 100

# Convert the target column values to low_risk and high_risk based on their values
df['loan_status'] = df['loan_status'].replace({
    'Current': 'low_risk',
    'Late (31-120 days)': 'high_risk',
    'Late (16-30 days)': 'high_risk',
    'Default': 'high_risk',
    'In Grace Period': 'high_risk'
})

df.reset_index(inplace=True, drop=True)

# Display the first 5 rows
print(df.head())

   loan_amnt  int_rate  installment home_ownership  annual_inc  \
0    10500.0    0.1719       375.35           RENT     66000.0   
1    25000.0    0.2000       929.09       MORTGAGE    105000.0   
2    20000.0    0.2000       529.88       MORTGAGE     56000.0   
3    10000.0    0.1640       353.55           RENT     92000.0   
4    22000.0    0.1474       520.39       MORTGAGE     52000.0   

  verification_status issue_d loan_status pymnt_plan    dti  ...  \
0     Source Verified  Mar-19    low_risk          n  27.24  ...   
1            Verified  Mar-19    low_risk          n  20.23  ...   
2            Verified  Mar-19    low_risk          n  24.26  ...   
3            Verified  Mar-19    low_risk          n  31.44  ...   
4        Not Verified  Mar-19    low_risk          n  18.76  ...   

   pct_tl_nvr_dlq  percent_bc_gt_75  pub_rec_bankruptcies  tax_liens  \
0            85.7             100.0                   0.0        0.0   
1            91.2              50.0               

In [7]:
df.columns

Index(['loan_amnt', 'int_rate', 'installment', 'home_ownership', 'annual_inc',
       'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'next_pymnt_d', 'collections_12_mths_ex_med',
       'policy_code', 'application_type', 'acc_now_delinq', 'tot_coll_amt',
       'tot_cur_bal', 'open_acc_6m', 'open_act_il', 'open_il_12m',
       'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util',
       'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util',
       'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_si

**Split the Data into Training and Testing**

In [8]:
# Create our features
#df contains columns with all string values
training_df = pd.get_dummies(df, columns=["home_ownership","verification_status","issue_d",
                                "pymnt_plan" , "hardship_flag", "debt_settlement_flag",
                                          "initial_list_status", "next_pymnt_d", 
                                          "application_type"])

X = training_df.drop(columns="loan_status")

# Create our target
y =  df["loan_status"]

In [9]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,21.778153,0.217766,0.497697,12.58734,0.12603,17604.142828,...,0.052138,2.219423,95.057627,30.626217,0.125972,0.0,210033.2,61338.43,29734.128558,55722.4
std,10277.34859,0.04813,288.062432,115580.0,20.199244,0.718367,0.758122,6.022869,0.336797,21835.8804,...,0.390633,1.897432,8.326426,33.631463,0.336732,0.0,192808.8,57387.98,26795.394232,50958.45
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,20.0,0.0,0.0,0.0,3600.0,235.0,100.0,127.0
25%,9000.0,0.0881,265.73,50000.0,13.89,0.0,0.0,8.0,0.0,6293.0,...,0.0,1.0,93.0,0.0,0.0,0.0,66977.0,26503.0,11600.0,22880.0
50%,15000.0,0.118,404.56,73000.0,19.76,0.0,0.0,11.0,0.0,12068.0,...,0.0,2.0,100.0,20.0,0.0,0.0,146710.0,45357.0,22100.0,42000.0
75%,24000.0,0.1557,648.1,104000.0,26.66,0.0,1.0,16.0,0.0,21735.0,...,0.0,3.0,100.0,50.0,0.0,0.0,303640.0,76570.0,39300.0,72499.0
max,40000.0,0.3084,1676.23,8797500.0,999.0,18.0,5.0,72.0,4.0,587191.0,...,18.0,19.0,100.0,100.0,4.0,0.0,3292782.0,1295455.0,509400.0,1426964.0


In [10]:
# Check the balance of our target values
y.value_counts()

loan_status
low_risk     68470
high_risk      347
Name: count, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape
Counter(y_train)

Counter({'low_risk': 51352, 'high_risk': 260})

# Oversampling

In this section, I'll compare two oversampling algorithms to determine which one yields the best performance. I'll use the naive random oversampling algorithm and the SMOTE algorithm. For each algorithm, I will follow these steps:

1. **View the Count of Target Classes**: I'll use the `Counter` from the `collections` library to check the distribution of the target classes.

2. **Train a Logistic Regression Model**: I'll apply the resampled data to train a logistic regression model.

3. **Calculate the Balanced Accuracy Score**: Using `sklearn.metrics`, I'll compute the balanced accuracy score.

4. **Print the Confusion Matrix**: I'll generate and print the confusion matrix using `sklearn.metrics`.

5. **Generate a Classification Report**: I'll create a classification report using the `imbalanced_classification_report` from the `imbalanced-learn` library.

Note: I'll use a random state of 1 for each sampling algorithm to ensure consistency between tests.

**Naive Random Oversampling**

In [12]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
rand_sample = RandomOverSampler(random_state=1)
X_resampled, y_resampled = rand_sample.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'low_risk': 51352, 'high_risk': 51352})

In [13]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [14]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)

con_mat = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    con_mat, index=["Actual High-Risk", "Actual Low-Risk"], columns=["Predicted High-Risk", "Predicted Low-Risk"]
)
display(cm_df)

Unnamed: 0,Predicted High-Risk,Predicted Low-Risk
Actual High-Risk,54,33
Actual Low-Risk,5591,11527


In [15]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)

con_mat = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    con_mat, index=["Actual High-Risk", "Actual Low-Risk"], columns=["Predicted High-Risk", "Predicted Low-Risk"]
)
display(cm_df)

Unnamed: 0,Predicted High-Risk,Predicted Low-Risk
Actual High-Risk,54,33
Actual Low-Risk,5591,11527


In [16]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

acc_score =balanced_accuracy_score(y_test, y_pred)
acc_score

0.6470371981902494

In [17]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      0.62      0.67      0.02      0.65      0.42        87
   low_risk       1.00      0.67      0.62      0.80      0.65      0.42     17118

avg / total       0.99      0.67      0.62      0.80      0.65      0.42     17205



**SMOTE Oversampling**

In [18]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)#refactor code from naive random sampling
Counter(y_resampled)

Counter({'low_risk': 51352, 'high_risk': 51352})

In [19]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [20]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
smote_acc_score = balanced_accuracy_score(y_test, y_pred)
smote_acc_score

0.6377487299112449

In [21]:
# Display the confusion matrix
smote_con_mat = confusion_matrix(y_test, y_pred) #smote confusion matrix
smote_con_mat_df = pd.DataFrame(
    smote_con_mat, index=["Actual High-Risk", "Actual Low-Risk"], columns=["Predicted High_Risk", "Predicted Low_Risk"]
)
smote_con_mat_df

Unnamed: 0,Predicted High_Risk,Predicted Low_Risk
Actual High-Risk,54,33
Actual Low-Risk,5909,11209


In [22]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      0.62      0.65      0.02      0.64      0.41        87
   low_risk       1.00      0.65      0.62      0.79      0.64      0.41     17118

avg / total       0.99      0.65      0.62      0.79      0.64      0.41     17205



# Undersampling

In this section, I will test undersampling algorithms to determine which one performs best compared to the oversampling algorithms I used earlier. I’ll be using the Cluster Centroids algorithm for undersampling and will follow these steps:

1. **View the Count of Target Classes**: I’ll use the `Counter` from the `collections` library to examine the distribution of the target classes.

2. **Train a Logistic Regression Model**: I’ll apply the resampled data to train a logistic regression model.

3. **Calculate the Balanced Accuracy Score**: I’ll compute the balanced accuracy score using `sklearn.metrics`.

4. **Print the Confusion Matrix**: I’ll generate and print the confusion matrix with `sklearn.metrics`.

5. **Generate a Classification Report**: I’ll create a classification report using the `imbalanced_classification_report` from the `imbalanced-learn` library.

Note: I’ll use a random state of 1 for each sampling algorithm to ensure consistency between tests.

In [23]:
# Resample the data using the ClusterCentroids resampler

from imblearn.under_sampling import ClusterCentroids
clus_cent = ClusterCentroids(random_state=1)
X_resampled, y_resampled = clus_cent.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'high_risk': 260, 'low_risk': 260})

In [24]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [25]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)


under_con_mat = confusion_matrix(y_test, y_pred)#underscore confusion matrix
under_con_mat_df = pd.DataFrame(
    under_con_mat, index=["Actual High-Risk", "Actual Low-Risk"], columns=["Predicted High_Risk", "Predicted Low_Risk"]
)

under_con_mat_df

Unnamed: 0,Predicted High_Risk,Predicted Low_Risk
Actual High-Risk,52,35
Actual Low-Risk,9589,7529


In [26]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
under_acc_score = balanced_accuracy_score(y_test, y_pred)#undersampling accuracy score
under_acc_score

0.5187652843749875

In [27]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      0.60      0.44      0.01      0.51      0.27        87
   low_risk       1.00      0.44      0.60      0.61      0.51      0.26     17118

avg / total       0.99      0.44      0.60      0.61      0.51      0.26     17205



# Combination (Over and Under) Sampling# 
In this section, I’ll test a combination of over- and under-sampling algorithms to determine if this approach results in better performance compared to the other sampling methods I’ve used. I’ll be resampling the data using the SMOTEENN algorithm and will follow these steps:

1. **View the Count of Target Classes**: I’ll use the `Counter` from the `collections` library to check the distribution of the target classes.

2. **Train a Logistic Regression Model**: I’ll use the resampled data to train a logistic regression model.

3. **Calculate the Balanced Accuracy Score**: I’ll compute the balanced accuracy score using `sklearn.metrics`.

4. **Print the Confusion Matrix**: I’ll generate and print the confusion matrix with `sklearn.metrics`.

5. **Generate a Classification Report**: I’ll create a classification report using the `imbalanced_classification_report` from the `imbalanced-learn` library.

Note: I’ll use a random state of 1 for each sampling algorithm to ensure consistency between tests.

In [28]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

smote = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X, y)
Counter(y_resampled)

Counter({'high_risk': 68460, 'low_risk': 62011})

In [29]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [30]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)


combined_con_mat = confusion_matrix(y_test, y_pred)
combined_con_mat_df = pd.DataFrame(
    combined_con_mat, index=["Actual High-Risk", "Actual Low-Risk"], columns=["Predicted High_Risk", "Predicted Low_Risk"]
)

combined_con_mat_df

Unnamed: 0,Predicted High_Risk,Predicted Low_Risk
Actual High-Risk,62,25
Actual Low-Risk,6864,10254


In [31]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

combined_acc_score = balanced_accuracy_score(y_test, y_pred)
combined_acc_score

0.6558311275487387

In [32]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      0.71      0.60      0.02      0.65      0.43        87
   low_risk       1.00      0.60      0.71      0.75      0.65      0.42     17118

avg / total       0.99      0.60      0.71      0.74      0.65      0.42     17205

