In [1]:
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas / scikit-learn diagnostics raised by later cells.
# Consider narrowing the filter to specific categories once the run is clean.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced


# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Loan status definitions:
# https://help.lendingclub.com/hc/en-us/articles/215488038-What-do-the-different-Note-statuses-mean-

# Columns kept from the raw loan-statistics CSV; the load cell selects exactly
# these columns, so anything not listed here is discarded. The list includes
# the label column "loan_status".
columns = [
    "loan_amnt", "int_rate", "installment", "home_ownership",
    "annual_inc", "verification_status", "issue_d", "loan_status",
    "pymnt_plan", "dti", "delinq_2yrs", "inq_last_6mths",
    "open_acc", "pub_rec", "revol_bal", "total_acc",
    "initial_list_status", "out_prncp", "out_prncp_inv", "total_pymnt",
    "total_pymnt_inv", "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
    "recoveries", "collection_recovery_fee", "last_pymnt_amnt", "next_pymnt_d",
    "collections_12_mths_ex_med", "policy_code", "application_type", "acc_now_delinq",
    "tot_coll_amt", "tot_cur_bal", "open_acc_6m", "open_act_il",
    "open_il_12m", "open_il_24m", "mths_since_rcnt_il", "total_bal_il",
    "il_util", "open_rv_12m", "open_rv_24m", "max_bal_bc",
    "all_util", "total_rev_hi_lim", "inq_fi", "total_cu_tl",
    "inq_last_12m", "acc_open_past_24mths", "avg_cur_bal", "bc_open_to_buy",
    "bc_util", "chargeoff_within_12_mths", "delinq_amnt", "mo_sin_old_il_acct",
    "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl", "mort_acc",
    "mths_since_recent_bc", "mths_since_recent_inq", "num_accts_ever_120_pd", "num_actv_bc_tl",
    "num_actv_rev_tl", "num_bc_sats", "num_bc_tl", "num_il_tl",
    "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0",
    "num_sats", "num_tl_120dpd_2m", "num_tl_30dpd", "num_tl_90g_dpd_24m",
    "num_tl_op_past_12m", "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies",
    "tax_liens", "tot_hi_cred_lim", "total_bal_ex_mort", "total_bc_limit",
    "total_il_high_credit_limit", "hardship_flag", "debt_settlement_flag"
]

# Name of the label column, kept as a one-element list so it can be passed
# straight to DataFrame.drop(columns=...) and to DataFrame[...] selection.
target = ["loan_status"]

In [5]:
# Load the data. The first line of the file is skipped (skiprows=1) and the
# last two rows are sliced off before any cleaning.
file_path = Path('../LoanStats_2019Q1.csv')
df = pd.read_csv(file_path, skiprows=1)[:-2]

# Keep only the columns of interest, drop columns where every value is null,
# then drop every row that still contains a null.
df = df.loc[:, columns].dropna(axis='columns', how='all').dropna()

# Remove the `Issued` loan status. The explicit .copy() gives an independent
# frame, so the column assignments below never write into a view of the
# filtered data (the situation pandas reports as SettingWithCopyWarning).
issued_mask = df['loan_status'] != 'Issued'
df = df.loc[issued_mask].copy()

# Convert the interest rate from a percent string to a numeric fraction.
# regex=False makes the '%' replacement a plain literal substitution.
df['int_rate'] = (
    df['int_rate'].str.replace('%', '', regex=False).astype('float') / 100
)

# Convert the target column values to low_risk and high_risk based on their
# values. The mapping is applied to `loan_status` only, so an identical string
# in any other column can never be rewritten by accident. Statuses that are
# not listed here are left unchanged.
risk_labels = {'Current': 'low_risk'}
risk_labels.update(
    dict.fromkeys(
        ['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'],
        'high_risk',
    )
)
df['loan_status'] = df['loan_status'].replace(risk_labels)

# Renumber the rows 0..n-1 after the filtering above.
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


In [52]:
# String conversion into numerical data
from sklearn.preprocessing import LabelEncoder

# Text-valued columns (the label `loan_status` included) that are replaced by
# integer codes, listed in the order they are encoded.
categorical_columns = [
    "home_ownership", "verification_status", "issue_d", "loan_status",
    "pymnt_plan", "initial_list_status", "next_pymnt_d", "application_type",
    "hardship_flag", "debt_settlement_flag",
]

le = LabelEncoder()
df2 = df.copy()  # encode a copy so the string-valued `df` stays available
for column_name in categorical_columns:
    # The shared encoder is refitted on each column, so codes are assigned
    # independently per column; afterwards `le` holds the last column's fit.
    df2[column_name] = le.fit_transform(df2[column_name])

# Split the Data into Training and Testing

In [53]:
# Create our features: every encoded column except the label.
X = df2.drop(columns=target)

# Create our target. Selecting with the one-element `target` list keeps y as a
# single-column DataFrame rather than a Series.
y = df2.loc[:, target]


In [55]:
# Summary statistics of the feature matrix, as a quick sanity check.
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,1.812779,88213.71,0.669994,0.805542,0.0,21.778153,0.217766,...,95.057627,30.626217,0.125972,0.0,210033.2,61338.43,29734.128558,55722.4,0.0,0.0
std,10277.34859,0.04813,288.062432,0.941313,115580.0,0.719105,0.714932,0.0,20.199244,0.718367,...,8.326426,33.631463,0.336732,0.0,192808.8,57387.98,26795.394232,50958.45,0.0,0.0
min,1000.0,0.06,30.89,0.0,40.0,0.0,0.0,0.0,0.0,0.0,...,20.0,0.0,0.0,0.0,3600.0,235.0,100.0,127.0,0.0,0.0
25%,9000.0,0.0881,265.73,1.0,50000.0,0.0,0.0,0.0,13.89,0.0,...,93.0,0.0,0.0,0.0,66977.0,26503.0,11600.0,22880.0,0.0,0.0
50%,15000.0,0.118,404.56,1.0,73000.0,1.0,1.0,0.0,19.76,0.0,...,100.0,20.0,0.0,0.0,146710.0,45357.0,22100.0,42000.0,0.0,0.0
75%,24000.0,0.1557,648.1,3.0,104000.0,1.0,1.0,0.0,26.66,0.0,...,100.0,50.0,0.0,0.0,303640.0,76570.0,39300.0,72499.0,0.0,0.0
max,40000.0,0.3084,1676.23,3.0,8797500.0,2.0,2.0,0.0,999.0,18.0,...,100.0,100.0,4.0,0.0,3292782.0,1295455.0,509400.0,1426964.0,0.0,0.0


In [56]:
# Check the balance of our target values
# (y is a single-column DataFrame, so the column is selected before counting).
y['loan_status'].value_counts()

1    68470
0      347
Name: loan_status, dtype: int64

In [60]:
# Split the X and y into X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split

# stratify=y is passed so the split is made with respect to the class labels;
# random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y
)

# Report the shape of each piece: test features/labels, then train.
for split in (X_test, y_test, X_train, y_train):
    print(split.shape)

(17205, 85)
(17205, 1)
(51612, 85)
(51612, 1)


In [63]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only; the same fitted scaler is then
# applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier. For each algorithm, be sure to complete the following steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classification report using the `classification_report_imbalanced` function from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importances sorted in descending order (most important feature to least important) along with each feature's score.

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [75]:
# Train a Balanced Random Forest classifier on the scaled training data.
# No separate resampling step is applied in this cell; the estimator is fitted
# directly on the training split.
from imblearn.ensemble import BalancedRandomForestClassifier

# NOTE(review): the instructions for this section ask for a random state of 1;
# 2 is kept here so the recorded outputs stay reproducible. Confirm which seed
# is intended and re-run the cells below if it changes.
model = BalancedRandomForestClassifier(random_state=2)

# y_train is a single-column DataFrame; flatten it to a 1-D array, the shape
# the estimator expects for a single target, instead of relying on an implicit
# conversion of the (n, 1) column vector.
model.fit(X_train_scaled, y_train.values.ravel())


BalancedRandomForestClassifier(bootstrap=True, class_weight=None,
                               criterion='gini', max_depth=None,
                               max_features='auto', max_leaf_nodes=None,
                               min_impurity_decrease=0.0, min_samples_leaf=2,
                               min_samples_split=2,
                               min_weight_fraction_leaf=0.0, n_estimators=100,
                               n_jobs=1, oob_score=False, random_state=2,
                               replacement=False, sampling_strategy='auto',
                               verbose=0, warm_start=False)

In [76]:
# Predict on the scaled test set and compute the balanced accuracy score
y_pred=model.predict(X_test_scaled)
balanced_accuracy_score(y_test,y_pred)

0.7858908348139284

In [77]:
# Confusion matrix for the Balanced Random Forest predictions
# (scikit-learn convention: rows are actual classes, columns are predicted).
cm=confusion_matrix(y_test,y_pred)
cm

array([[   59,    28],
       [ 1821, 15297]])

In [78]:
# Print the imbalanced classification report for the Balanced Random Forest.
# This import repeats the one in the imports cell at the top of the notebook;
# it is redundant but harmless.
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test,y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.03      0.68      0.89      0.06      0.78      0.59        87
          1       1.00      0.89      0.68      0.94      0.78      0.62     17118

avg / total       0.99      0.89      0.68      0.94      0.78      0.62     17205



In [80]:
# List the features sorted in descending order by feature importance
importances = model.feature_importances_

# Pair each score with its column name, then sort the pairs highest-first
# (ties fall back to comparing the names, also in descending order).
ranked_features = list(zip(importances, X.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.07657507746664234, 'last_pymnt_amnt'),
 (0.07305105421264914, 'total_rec_prncp'),
 (0.06302249774882424, 'total_rec_int'),
 (0.06269808667608041, 'total_pymnt_inv'),
 (0.060918540743694675, 'total_pymnt'),
 (0.025449376832959524, 'int_rate'),
 (0.02445993597815091, 'issue_d'),
 (0.018420540054714437, 'revol_bal'),
 (0.017341789909131516, 'dti'),
 (0.017295506600658088, 'il_util'),
 (0.01723685390584413, 'bc_open_to_buy'),
 (0.01687647626579716, 'avg_cur_bal'),
 (0.016317122846071376, 'total_bc_limit'),
 (0.016085477876933575, 'mo_sin_old_il_acct'),
 (0.015571853790248074, 'out_prncp_inv'),
 (0.015467809740615618, 'annual_inc'),
 (0.015031360974244472, 'max_bal_bc'),
 (0.014857291847477224, 'bc_util'),
 (0.014837189671867318, 'out_prncp'),
 (0.014833300391747182, 'mo_sin_old_rev_tl_op'),
 (0.014806256951717429, 'installment'),
 (0.014432540203182577, 'total_bal_ex_mort'),
 (0.014411417646668694, 'mths_since_recent_inq'),
 (0.013878617367298124, 'total_bal_il'),
 (0.01361233865895789

### Easy Ensemble AdaBoost Classifier

In [85]:
# Train the Easy Ensemble classifier on the scaled training data.
from imblearn.ensemble import EasyEnsembleClassifier

# NOTE(review): the instructions for this section ask for a random state of 1;
# 2 is kept here so the recorded outputs stay reproducible. Confirm which seed
# is intended and re-run the cells below if it changes.
# The variable name `classfier` (sic) is kept because later cells refer to it.
classfier = EasyEnsembleClassifier(n_estimators=100, random_state=2)

# y_train is a single-column DataFrame; flatten it to a 1-D array, the shape
# the estimator expects for a single target, instead of relying on an implicit
# conversion of the (n, 1) column vector.
classfier.fit(X_train_scaled, y_train.values.ravel())

EasyEnsembleClassifier(base_estimator=None, n_estimators=100, n_jobs=1,
                       random_state=2, replacement=False,
                       sampling_strategy='auto', verbose=0, warm_start=False)

In [86]:
# Predict on the scaled test set and calculate the balanced accuracy score
y_pred_2=classfier.predict(X_test_scaled)
balanced_accuracy_score(y_test,y_pred_2)

0.9329270257966005

In [87]:
# Confusion matrix for the Easy Ensemble predictions
# (scikit-learn convention: rows are actual classes, columns are predicted).
confusion_matrix(y_test,y_pred_2)

array([[   80,     7],
       [  919, 16199]])

In [91]:
# Print the imbalanced classification report for the Easy Ensemble predictions
print(classification_report_imbalanced(y_test,y_pred_2))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.08      0.92      0.95      0.15      0.93      0.87        87
          1       1.00      0.95      0.92      0.97      0.93      0.87     17118

avg / total       0.99      0.95      0.92      0.97      0.93      0.87     17205



# Models’ performance


# Precision

Precision measures how reliable a positive classification is: of all the loans predicted to belong to a class, it is the fraction that truly belong to it. For the Balanced Random Forest classifier, the precision is 0.03 for high risk and 1.00 for low risk. From this result we can say that the model predicts the high-risk class very poorly: the low precision means there is a large number of false positives (low-risk loans flagged as high risk). The (rounded) 1.00 precision for low risk means that almost every loan predicted as low risk really is low risk. For the Easy Ensemble AdaBoost classifier the precision results are very similar (0.08 for high risk and 1.00 for low risk), which is also not ideal.

# Recall

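Recall is the ability of the classifier to find all the positive samples. It is the ratio of true positives to all actual positives, TP / (TP + FN). Overall, both models achieve good recall values (0.68 and 0.89 for the Balanced Random Forest, 0.92 and 0.95 for the Easy Ensemble classifier). A high recall value means there are many true positives and few false negatives.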

# F1 Score

The F1 score is the harmonic mean of precision and recall (the true positive rate). Both models have a low F1 score for high risk (0.06 and 0.15) and a high F1 score for low risk (0.94 and 0.97).

Credit risk has two levels: low_risk and high_risk. In this notebook, we implemented both a Balanced Random Forest classifier and an Easy Ensemble AdaBoost classifier. Based on the summary reports, we can draw some conclusions. Before discussing the results, we need to clarify that the label 0 means high risk and the label 1 means low risk. For the Balanced Random Forest classifier, the precision for high risk is almost 0 (0.03), while the precision for low risk is close to 100%. Clearly, this classifier has trouble identifying high-risk loans precisely. In terms of sensitivity (recall), high risk reaches almost 70% and low risk about 90%, which means the model finds most of the positive samples in each class. The balanced accuracy score is 79% for the Balanced Random Forest classifier and 93% for the Easy Ensemble AdaBoost classifier, so in terms of balanced accuracy the AdaBoost-based classifier performs considerably better. However, both models perform very poorly in terms of high-risk precision, which means that a loan flagged as high risk is rarely truly high risk. Therefore, we cannot recommend either model. The major issue is the extreme class imbalance: the low-risk class is far larger than the high-risk class (17,118 versus 87 loans in the test set), so even a modest error rate on low-risk loans produces far more false high-risk predictions than true ones. This biases the precision and recall calculations, because the class sizes are already so heavily skewed.