In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# https://help.lendingclub.com/hc/en-us/articles/215488038-What-do-the-different-Note-statuses-mean-

columns = [
    "loan_amnt", "int_rate", "installment", "home_ownership",
    "annual_inc", "verification_status", "issue_d", "loan_status",
    "pymnt_plan", "dti", "delinq_2yrs", "inq_last_6mths",
    "open_acc", "pub_rec", "revol_bal", "total_acc",
    "initial_list_status", "out_prncp", "out_prncp_inv", "total_pymnt",
    "total_pymnt_inv", "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
    "recoveries", "collection_recovery_fee", "last_pymnt_amnt", "next_pymnt_d",
    "collections_12_mths_ex_med", "policy_code", "application_type", "acc_now_delinq",
    "tot_coll_amt", "tot_cur_bal", "open_acc_6m", "open_act_il",
    "open_il_12m", "open_il_24m", "mths_since_rcnt_il", "total_bal_il",
    "il_util", "open_rv_12m", "open_rv_24m", "max_bal_bc",
    "all_util", "total_rev_hi_lim", "inq_fi", "total_cu_tl",
    "inq_last_12m", "acc_open_past_24mths", "avg_cur_bal", "bc_open_to_buy",
    "bc_util", "chargeoff_within_12_mths", "delinq_amnt", "mo_sin_old_il_acct",
    "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl", "mort_acc",
    "mths_since_recent_bc", "mths_since_recent_inq", "num_accts_ever_120_pd", "num_actv_bc_tl",
    "num_actv_rev_tl", "num_bc_sats", "num_bc_tl", "num_il_tl",
    "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0",
    "num_sats", "num_tl_120dpd_2m", "num_tl_30dpd", "num_tl_90g_dpd_24m",
    "num_tl_op_past_12m", "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies",
    "tax_liens", "tot_hi_cred_lim", "total_bal_ex_mort", "total_bc_limit",
    "total_il_high_credit_limit", "hardship_flag", "debt_settlement_flag"
]

target = ["loan_status"]

In [5]:
# Load the data
file_path = Path('LoanStats_2019Q1.csv')
df = pd.read_csv(file_path, skiprows=1)[:-2]
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the `Issued` loan status
issued_mask = df['loan_status'] != 'Issued'
df = df.loc[issued_mask]

# convert interest rate to numerical
df['int_rate'] = df['int_rate'].str.replace('%', '')
df['int_rate'] = df['int_rate'].astype('float') / 100


# Convert the target column values to low_risk and high_risk based on their values
x = {'Current': 'low_risk'}   
df = df.replace(x)

x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


# Split the Data into Training and Testing

In [6]:
# Create our features
X = df.drop(columns='loan_status')
X = pd.get_dummies(X)

# Create our target
y = df.loc[:, target].copy()

In [7]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,issue_d_Mar-2019,pymnt_plan_n,initial_list_status_f,initial_list_status_w,next_pymnt_d_Apr-2019,next_pymnt_d_May-2019,application_type_Individual,application_type_Joint App,hardship_flag_N,debt_settlement_flag_N
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,21.778153,0.217766,0.497697,12.58734,0.12603,17604.142828,...,0.177238,1.0,0.123879,0.876121,0.383161,0.616839,0.86034,0.13966,1.0,1.0
std,10277.34859,0.04813,288.062432,115580.0,20.199244,0.718367,0.758122,6.022869,0.336797,21835.8804,...,0.381873,0.0,0.329446,0.329446,0.486161,0.486161,0.346637,0.346637,0.0,0.0
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,9000.0,0.0881,265.73,50000.0,13.89,0.0,0.0,8.0,0.0,6293.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
50%,15000.0,0.118,404.56,73000.0,19.76,0.0,0.0,11.0,0.0,12068.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
75%,24000.0,0.1557,648.1,104000.0,26.66,0.0,1.0,16.0,0.0,21735.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
max,40000.0,0.3084,1676.23,8797500.0,999.0,18.0,5.0,72.0,4.0,587191.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Check the balance of our target values
y['loan_status'].value_counts()

low_risk     68470
high_risk      347
Name: loan_status, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Ensemble Learners

In this section, I compare two ensemble algorithms to determine which algorithm results in the best performance. I train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . For each algorithm I follow these steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score


### Balanced Random Forest Classifier

In [10]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=42)

In [11]:
# Calculated the balanced accuracy score
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7651430640583321

In [12]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[   68,    36],
       [ 2113, 14988]], dtype=int64)

In [13]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.03      0.65      0.88      0.06      0.76      0.56       104
   low_risk       1.00      0.88      0.65      0.93      0.76      0.59     17101

avg / total       0.99      0.88      0.66      0.93      0.76      0.59     17205



In [14]:
# List the features sorted in descending order by feature importance
importances = brf.feature_importances_
cols = X.columns

# Store in a DataFrame
feature_importances_df = pd.DataFrame({'feature':cols, 'importance': importances})
feature_importances_df.head()

Unnamed: 0,feature,importance
0,loan_amnt,0.010947
1,int_rate,0.029751
2,installment,0.016751
3,annual_inc,0.017047
4,dti,0.015764


In [15]:
#sorted feature importance values
feature_importances_df.sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
15,total_rec_prncp,0.084063
14,total_pymnt_inv,0.061651
16,total_rec_int,0.059475
13,total_pymnt,0.052988
20,last_pymnt_amnt,0.050048
...,...,...
19,collection_recovery_fee,0.000000
22,policy_code,0.000000
23,acc_now_delinq,0.000000
46,delinq_amnt,0.000000


### Easy Ensemble AdaBoost Classifier

In [16]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(random_state=42)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(random_state=42)

In [17]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9184747405684778

In [18]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)


array([[   94,    10],
       [ 1144, 15957]], dtype=int64)

In [19]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.08      0.90      0.93      0.14      0.92      0.84       104
   low_risk       1.00      0.93      0.90      0.97      0.92      0.85     17101

avg / total       0.99      0.93      0.90      0.96      0.92      0.85     17205



# Analysis of Results
The random forest classifier, with and without AdaBoost, failed to achieve useable performance. The balanced random forest classifier's precision is 0.03, meaning that in 100 loan applications that were flagged to be bad, only 3 were actually bad loan applications. The model's recall/sensitivity is 0.67, meaning that it detected 67% of bad loan applications. The F1 score is low at 0.07, since either a low precision or recall will result in a lower F1 score.

The random forest classifier with AdaBoost, while achieving better results, still suffered from inadequate predictive power. Its precision score is 0.09 and its recall 0.92. The F1 score, again, is skewed low at 0.16 by the low precision score.

The performances of both models are insufficient for commercial application.

# Lets combine over and under sampling with XGBClassifier

In [None]:
from imblearn.combine import SMOTEENN

In [29]:
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'loan_status': 1})

In [39]:
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state = 42)
xgb.fit(X_resampled, y_resampled)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [40]:
y_pred = xgb.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6969399562778605

In [41]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[   41,    63],
       [    6, 17095]], dtype=int64)

In [42]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.87      0.39      1.00      0.54      0.63      0.37       104
   low_risk       1.00      1.00      0.39      1.00      0.63      0.42     17101

avg / total       1.00      1.00      0.40      1.00      0.63      0.42     17205



# Analysis of Results
The XGBClassifier displays wildly different behavior from the previous models. The precision is extremely high compared to more sensitive models like Random Forest or AdaBoost ensemble. 
The balanced random forest classifier's precision is 0.87, meaning that in 100 loan applications that were flagged to be bad, 87 were actually bad loan applications! The model's recall/sensitivity is 0.39, meaning that it detected 39% of bad loan applications. This means that we let a lot of bad loans slip through the cracks, unnoticed. 