# Credit Risk Resampling Techniques

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

# Read the CSV and Perform Basic Data Cleaning

In [3]:
columns = [
    "loan_amnt", "int_rate", "installment", "home_ownership",
    "annual_inc", "verification_status", "issue_d", "loan_status",
    "pymnt_plan", "dti", "delinq_2yrs", "inq_last_6mths",
    "open_acc", "pub_rec", "revol_bal", "total_acc",
    "initial_list_status", "out_prncp", "out_prncp_inv", "total_pymnt",
    "total_pymnt_inv", "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
    "recoveries", "collection_recovery_fee", "last_pymnt_amnt", "next_pymnt_d",
    "collections_12_mths_ex_med", "policy_code", "application_type", "acc_now_delinq",
    "tot_coll_amt", "tot_cur_bal", "open_acc_6m", "open_act_il",
    "open_il_12m", "open_il_24m", "mths_since_rcnt_il", "total_bal_il",
    "il_util", "open_rv_12m", "open_rv_24m", "max_bal_bc",
    "all_util", "total_rev_hi_lim", "inq_fi", "total_cu_tl",
    "inq_last_12m", "acc_open_past_24mths", "avg_cur_bal", "bc_open_to_buy",
    "bc_util", "chargeoff_within_12_mths", "delinq_amnt", "mo_sin_old_il_acct",
    "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl", "mort_acc",
    "mths_since_recent_bc", "mths_since_recent_inq", "num_accts_ever_120_pd", "num_actv_bc_tl",
    "num_actv_rev_tl", "num_bc_sats", "num_bc_tl", "num_il_tl",
    "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0",
    "num_sats", "num_tl_120dpd_2m", "num_tl_30dpd", "num_tl_90g_dpd_24m",
    "num_tl_op_past_12m", "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies",
    "tax_liens", "tot_hi_cred_lim", "total_bal_ex_mort", "total_bc_limit",
    "total_il_high_credit_limit", "hardship_flag", "debt_settlement_flag"
]

target = ["loan_status"]

In [4]:
# Load the data
file_path = Path('LoanStats_2019Q1.csv')
df = pd.read_csv(file_path, skiprows=1)[:-2]
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the `Issued` loan status
issued_mask = df['loan_status'] != 'Issued'
df = df.loc[issued_mask]

# convert interest rate to numerical
df['int_rate'] = df['int_rate'].str.replace('%', '')
df['int_rate'] = df['int_rate'].astype('float') / 100


# Convert the target column values to low_risk and high_risk based on their values
x = {'Current': 'low_risk'}   
df = df.replace(x)

x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


In [5]:
df.dtypes

loan_amnt                     float64
int_rate                      float64
installment                   float64
home_ownership                 object
annual_inc                    float64
                               ...   
total_bal_ex_mort             float64
total_bc_limit                float64
total_il_high_credit_limit    float64
hardship_flag                  object
debt_settlement_flag           object
Length: 86, dtype: object

In [6]:
obj_df = df.select_dtypes(include=['object']).copy()
obj_df.head()

Unnamed: 0,home_ownership,verification_status,issue_d,loan_status,pymnt_plan,initial_list_status,next_pymnt_d,application_type,hardship_flag,debt_settlement_flag
0,RENT,Source Verified,Mar-2019,low_risk,n,w,May-2019,Individual,N,N
1,MORTGAGE,Verified,Mar-2019,low_risk,n,w,May-2019,Individual,N,N
2,MORTGAGE,Verified,Mar-2019,low_risk,n,w,May-2019,Individual,N,N
3,RENT,Verified,Mar-2019,low_risk,n,w,May-2019,Individual,N,N
4,MORTGAGE,Not Verified,Mar-2019,low_risk,n,w,May-2019,Individual,N,N


In [7]:
obj_df["home_ownership"].value_counts()

MORTGAGE    36219
RENT        24613
OWN          7346
ANY           639
Name: home_ownership, dtype: int64

In [8]:
obj_df["verification_status"].value_counts()

Not Verified       32895
Source Verified    25737
Verified           10185
Name: verification_status, dtype: int64

In [9]:
obj_df["issue_d"].value_counts()

Jan-2019    31041
Feb-2019    25579
Mar-2019    12197
Name: issue_d, dtype: int64

In [10]:
obj_df["loan_status"].value_counts()

low_risk     68470
high_risk      347
Name: loan_status, dtype: int64

In [11]:
obj_df["pymnt_plan"].value_counts()

n    68817
Name: pymnt_plan, dtype: int64

In [12]:
obj_df["initial_list_status"].value_counts()

w    60292
f     8525
Name: initial_list_status, dtype: int64

In [13]:
obj_df["next_pymnt_d"].value_counts()

May-2019    42449
Apr-2019    26368
Name: next_pymnt_d, dtype: int64

In [14]:
obj_df["application_type"].value_counts()

Individual    59206
Joint App      9611
Name: application_type, dtype: int64

In [15]:
obj_df["hardship_flag"].value_counts()

N    68817
Name: hardship_flag, dtype: int64

In [16]:
obj_df["debt_settlement_flag"].value_counts()

N    68817
Name: debt_settlement_flag, dtype: int64

In [17]:
cleanup_nums = {"home_ownership":     {"MORTGAGE": 1, "RENT": 2, "OWN":3, "ANY":4},
                "verification_status": {"Not Verified": 1, "Source Verified": 2, "Verified": 3},
                "issue_d": {"Jan-2019": 1, "Feb-2019": 2, "Mar-2019": 3},
                "loan_status": {"low_risk": 1, "high_risk": 2},
                "pymnt_plan": {"n": 1},
                "initial_list_status": {"w":1, "f":2},
                "next_pymnt_d": {"May-2019": 5, "Apr-2019":4},
                "application_type": {"Individual": 1, "Joint App": 2},
                "hardship_flag": {"N": 1},
                "debt_settlement_flag": {"N": 1 }}
                
                
                
                
                

In [18]:
obj_df = obj_df.replace(cleanup_nums)
obj_df.head()

Unnamed: 0,home_ownership,verification_status,issue_d,loan_status,pymnt_plan,initial_list_status,next_pymnt_d,application_type,hardship_flag,debt_settlement_flag
0,2,2,3,1,1,1,5,1,1,1
1,1,3,3,1,1,1,5,1,1,1
2,1,3,3,1,1,1,5,1,1,1
3,2,3,3,1,1,1,5,1,1,1
4,1,1,3,1,1,1,5,1,1,1


In [19]:
obj_df.dtypes

home_ownership          int64
verification_status     int64
issue_d                 int64
loan_status             int64
pymnt_plan              int64
initial_list_status     int64
next_pymnt_d            int64
application_type        int64
hardship_flag           int64
debt_settlement_flag    int64
dtype: object

# Split the Data into Training and Testing

In [20]:
# Create our features
X = obj_df.drop(columns="loan_status",axis=1)


# Create our target
y = obj_df["loan_status"]

In [21]:
X.describe()

Unnamed: 0,home_ownership,verification_status,issue_d,pymnt_plan,initial_list_status,next_pymnt_d,application_type,hardship_flag,debt_settlement_flag
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,1.599009,1.669994,1.726172,1.0,1.123879,4.616839,1.13966,1.0,1.0
std,0.713731,0.719105,0.743862,0.0,0.329446,0.486161,0.346637,0.0,0.0
min,1.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0
25%,1.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0
50%,1.0,2.0,2.0,1.0,1.0,5.0,1.0,1.0,1.0
75%,2.0,2.0,2.0,1.0,1.0,5.0,1.0,1.0,1.0
max,4.0,3.0,3.0,1.0,2.0,5.0,2.0,1.0,1.0


In [22]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68817 entries, 0 to 68816
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   home_ownership        68817 non-null  int64
 1   verification_status   68817 non-null  int64
 2   issue_d               68817 non-null  int64
 3   pymnt_plan            68817 non-null  int64
 4   initial_list_status   68817 non-null  int64
 5   next_pymnt_d          68817 non-null  int64
 6   application_type      68817 non-null  int64
 7   hardship_flag         68817 non-null  int64
 8   debt_settlement_flag  68817 non-null  int64
dtypes: int64(9)
memory usage: 4.7 MB


In [23]:
# Check the balance of our target values
y.value_counts()

1    68470
2      347
Name: loan_status, dtype: int64

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(51612, 9)

# Oversampling

In this section, you will compare two oversampling algorithms to determine which algorithm results in the best performance. You will oversample the data using the naive random oversampling algorithm and the SMOTE algorithm. For each algorithm, be sure to complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

### Naive Random Oversampling

In [25]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
RandomOverSample = RandomOverSampler(random_state=1)
X_resampled, y_resampled = RandomOverSample.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({1: 51352, 2: 51352})

In [26]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
LogisticModel = LogisticRegression(random_state=1)
LogisticModel.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [27]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = LogisticModel.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[9789, 7329],
       [  43,   44]], dtype=int64)

In [28]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.5388006575051065

In [29]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          1       1.00      0.57      0.51      0.73      0.54      0.29     17118
          2       0.01      0.51      0.57      0.01      0.54      0.29        87

avg / total       0.99      0.57      0.51      0.72      0.54      0.29     17205



### SMOTE Oversampling

In [30]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resample2, y_resample2 = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)

In [31]:
# Train the Logistic Regression model using the resampled data
LogisticModel2 = LogisticRegression(random_state=1)

LogisticModel2.fit(X_resample2, y_resample2)
y_pred_sm = LogisticModel2.predict(X_test)

In [32]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

AccuracyScore2 = balanced_accuracy_score(y_test, y_pred_sm)
AccuracyScore2

0.5163610798876762

In [33]:
# Display the confusion matrix
ConfusionMatrix2 = confusion_matrix(y_test, y_pred_sm)

ConfusionMatrix2_df = pd.DataFrame(
    ConfusionMatrix2, index=["Actual High-Risk", "Actual Low-Risk"], columns=["Predicted High_Risk", "Predicted Low_Risk"])
ConfusionMatrix2_df

Unnamed: 0,Predicted High_Risk,Predicted Low_Risk
Actual High-Risk,8824,8294
Actual Low-Risk,42,45


In [34]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_sm))

                   pre       rec       spe        f1       geo       iba       sup

          1       1.00      0.52      0.52      0.68      0.52      0.27     17118
          2       0.01      0.52      0.52      0.01      0.52      0.27        87

avg / total       0.99      0.52      0.52      0.68      0.52      0.27     17205



# Undersampling

In this section, you will test an undersampling algorithms to determine which algorithm results in the best performance compared to the oversampling algorithms above. You will undersample the data using the Cluster Centroids algorithm and complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [35]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids

ClusterCentroids = ClusterCentroids(random_state=1)
X_resample3, y_resample3 = ClusterCentroids.fit_resample(X_train, y_train)
Counter(y_resample3)

Counter({1: 260, 2: 260})

In [36]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

LogisticModel3 = LogisticRegression(random_state=1)
LogisticModel3.fit(X_resample3, y_resample3)
y_pred_cc = LogisticModel3.predict(X_test)

In [37]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

AccuracyScore2 = balanced_accuracy_score(y_test, y_pred_cc)
AccuracyScore2

0.5004099334840115

In [38]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix 
ConfusionMatrix3 = confusion_matrix(y_test, y_pred_cc)

ConfusionMatrix3_df = pd.DataFrame(
    ConfusionMatrix3, index=["Actual High-Risk", "Actual Low-Risk"], columns=["Predicted High_Risk", "Predicted Low_Risk"])
ConfusionMatrix3_df

Unnamed: 0,Predicted High_Risk,Predicted Low_Risk
Actual High-Risk,4933,12185
Actual Low-Risk,25,62


In [39]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred_cc))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.99      0.29      0.71      0.45      0.45      0.20     17118
          2       0.01      0.71      0.29      0.01      0.45      0.21        87

avg / total       0.99      0.29      0.71      0.44      0.45      0.20     17205



# Combination (Over and Under) Sampling

In this section, you will test a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. You will resample the data using the SMOTEENN algorithm and complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [40]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=1)
X_resample4, y_resample4 = smote_enn.fit_resample(X, y)

In [41]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
LogisticModel4 = LogisticRegression(random_state=1)

LogisticModel4.fit(X_resample4, y_resample4)
from sklearn.metrics import confusion_matrix
y_pred_st = LogisticModel4.predict(X_test)

In [42]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
AccuracyScore4 = balanced_accuracy_score(y_test, y_pred_st)
AccuracyScore4

0.5368758838246492

In [43]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
ConfusionMatrix4 = confusion_matrix(y_test, y_pred_st)

ConfusionMatrix4_df = pd.DataFrame(ConfusionMatrix4, index=["Actual High-Risk", "Actual Low-Risk"], columns=["Predicted High_Risk", "Predicted Low_Risk"])
ConfusionMatrix4_df

Unnamed: 0,Predicted High_Risk,Predicted Low_Risk
Actual High-Risk,13068,4050
Actual Low-Risk,60,27


In [44]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred_st))

                   pre       rec       spe        f1       geo       iba       sup

          1       1.00      0.76      0.31      0.86      0.49      0.25     17118
          2       0.01      0.31      0.76      0.01      0.49      0.23        87

avg / total       0.99      0.76      0.31      0.86      0.49      0.25     17205

