In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('lending_club_ml.csv')
df.drop(['id'], inplace=True, axis=1)

In [4]:
# creating columns for features that had more than 2 outcomes
# I have decided to use subgrade and not grade as these are suppose to be similar features but, subgrade is more granular

df = pd.concat([df, pd.get_dummies(
    df.sub_grade, prefix='sub_grade', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.home_ownership,
                                   prefix='home_ownership', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.verification_status,
                                   prefix='verification_status', drop_first=True)], axis=1)
df = pd.concat(
    [df, pd.get_dummies(df.purpose, prefix='purpose', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.verification_status_joint,
                                   prefix='verification_status_joint', drop_first=True)], axis=1)

df.drop(columns=['sub_grade', 'home_ownership', 'verification_status',
                 'purpose', 'verification_status_joint'], inplace=True)


# conversions for features that only had 2 outcomes

df.disbursement_method = df.disbursement_method.apply(
    lambda disburstment: 1 if disburstment == 'Cash' else 0)

df.application_type = df.application_type.apply(
    lambda app_type: 1 if app_type == 'Individual' else 0)

In [5]:
# if borrower is an indivudal then their secondary features are filled with their single counterpart

sec_list = ['sec_app_fico_range_low', 'sec_app_fico_range_high', 'sec_app_inq_last_6mths', 'sec_app_mort_acc',
            'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il','sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths',
            'sec_app_collections_12_mths_ex_med', 'sec_app_mths_since_last_major_derog']
joint_list = ['dti_joint', 'revol_bal_joint', 'annual_inc_joint']

for secondary in sec_list:
    df.loc[df.application_type == 1,secondary] = df.loc[df.application_type == 1][secondary[8:]]


for joint in joint_list:
    df.loc[df.application_type == 1,joint] = df.loc[df.application_type == 1][joint[:len(joint)-6]]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2506738 entries, 0 to 2506737
Columns: 148 entries, loan_amnt to verification_status_joint_Verified
dtypes: float64(89), int64(5), object(1), uint8(53)
memory usage: 1.9+ GB


In [6]:
#fill all nans with 0
df.fillna(0, inplace=True)

In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score


def scoring(clf, x, y):
    
    #Baseline
    print('Loan passing rate:', np.mean(y))
    print('Balanced loan passing rate:', np.average(y, weights=x['loan_amnt']))
    print('\n')
    
    
    print('score: ', clf.score(x, y))
    # score adjusted for loan amount
    print('balanced_accuracy_score: ', balanced_accuracy_score(
        y, clf.predict(x), sample_weight=x['loan_amnt']))

    print(confusion_matrix(y, clf.predict(x)))
    
    print('F1 score: ', f1_score(y, clf.predict(x)))
    
    print('precision_score: ', precision_score(y, clf.predict(x)))

    # score adjusted for loan amount
    print('average_precision_score: ', average_precision_score(
        y, clf.predict(x), average='weighted', sample_weight=x['loan_amnt']))
    
    print('recall_score: ', recall_score(y, clf.predict(x)))

    print('roc: ', roc_auc_score(y, clf.predict_proba(x)[:, 1]))

    # score adjusted for loan amount
    print('roc_weighted: ', roc_auc_score(y, clf.predict_proba(x)[
          :, 1], average='weighted', sample_weight=x['loan_amnt']))

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = df.drop(['grade','loan_status'], axis=1)
y = df.loan_status

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



rf_best = RandomForestClassifier(criterion= 'entropy', n_estimators= 101)
rf_best.fit(X_train, y_train)
    
print('RANDOM FOREST')
scoring(rf_best, X_test, y_test)

RANDOM FOREST
Loan passing rate: 0.8009672589365736
Balanced loan passing rate: 0.7855872316459166


score:  0.9414352239695115
balanced_accuracy_score:  0.8598431799117082
[[107995  41682]
 [  2360 599985]]
F1 score:  0.9645968045324322
precision_score:  0.9350410727059363
average_precision_score:  0.9290625875070259
recall_score:  0.9960819795964106
roc:  0.9645267462749104
roc_weighted:  0.9655248162599434


# Testing based off of certain features
Here will be testing our classifier, that was trained and test on the general populatyion with global averages to fill the nans, on certain sub-groups of the population and to see if we need to fill nans with local averages.

In [9]:
def ml_test(dataframe):
    dataframe.fillna(0,inplace=True)
    
    X = dataframe.drop(['grade','loan_status'], axis=1)
    y = dataframe.loan_status

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    print('MAIN CLASSIFER')
    scoring(rf_best, X_test, y_test)
    
    print('\n')
    print('RETRAINED CLASSIFER')
    rf = RandomForestClassifier(criterion= 'entropy', n_estimators= 101)
    rf.fit(X_train, y_train)
    scoring(rf, X_test, y_test)

Individual verse joint: in our EDA joint borrowers failed in higher rates and had wide differences between their single and joint/secondary features.

In [10]:
ml_test(df.loc[df.application_type == 1])

MAIN CLASSIFER
Loan passing rate: 0.8018762389397847
Balanced loan passing rate: 0.7866714294905874


score:  0.9764619995492925
balanced_accuracy_score:  0.9434088562562084
[[130255  16566]
 [   877 593359]]
F1 score:  0.9855143955002694
precision_score:  0.9728392835184654
average_precision_score:  0.970302536186
recall_score:  0.9985241553860755
roc:  0.9923215332215768
roc_weighted:  0.9926144177629965


RETRAINED CLASSIFER
Loan passing rate: 0.8018762389397847
Balanced loan passing rate: 0.7866714294905874


score:  0.9411435287703915
balanced_accuracy_score:  0.859340311828035
[[105697  41124]
 [  2492 591744]]
F1 score:  0.9644561504159387
precision_score:  0.9350196249454863
average_precision_score:  0.9292599632894468
recall_score:  0.9958063799567849
roc:  0.9638754274871868
roc_weighted:  0.9646982666695247


In [11]:
ml_test(df.loc[df.application_type == 0])

MAIN CLASSIFER
Loan passing rate: 0.7680802553579571
Balanced loan passing rate: 0.7633904760778214


score:  0.9794801641586868
balanced_accuracy_score:  0.959861310398982
[[2340  203]
 [  22 8400]]
F1 score:  0.9867841409691631
precision_score:  0.9764035801464606
average_precision_score:  0.9758982268463176
recall_score:  0.9973877938731893
roc:  0.9976604259036195
roc_weighted:  0.9979079265676026


RETRAINED CLASSIFER
Loan passing rate: 0.7680802553579571
Balanced loan passing rate: 0.7633904760778214


score:  0.9252165982672138
balanced_accuracy_score:  0.8408581350792388
[[1763  780]
 [  40 8382]]
F1 score:  0.9533666969972703
precision_score:  0.9148657498362803
average_precision_score:  0.9103189532297549
recall_score:  0.9952505343148895
roc:  0.9641493315682679
roc_weighted:  0.9639070734726592


The Global classifier has outperformed the locally trained version. 

ML for different grades

In [12]:
grades = set(df.grade)

for loan_grade in sorted(grades):
    print(loan_grade)
    ml_test(df.loc[df.grade==loan_grade])
    print('\n')

A
MAIN CLASSIFER
Loan passing rate: 0.9405104807551982
Balanced loan passing rate: 0.9413940416432522


score:  0.9946232842515996
balanced_accuracy_score:  0.9570392949224038
[[  7045    700]
 [     0 122446]]
F1 score:  0.9971497442913451
precision_score:  0.9943156903188086
average_precision_score:  0.99467947023111
recall_score:  1.0
roc:  0.9925819902934617
roc_weighted:  0.9927854720555352


RETRAINED CLASSIFER
Loan passing rate: 0.9405104807551982
Balanced loan passing rate: 0.9413940416432522


score:  0.9826792942676529
balanced_accuracy_score:  0.8520743593414324
[[  5490   2255]
 [     0 122446]]
F1 score:  0.9908758754911045
precision_score:  0.9819167448536901
average_precision_score:  0.9819150342298049
recall_score:  1.0
roc:  0.9493045189169541
roc_weighted:  0.9468541445837867


B
MAIN CLASSIFER
Loan passing rate: 0.8673473571985074
Balanced loan passing rate: 0.8650654540003315


score:  0.9879626212508485
balanced_accuracy_score:  0.9544581410331217
[[ 26473   2642]


The Global classifier has outperformed the locally trained version. 

ML for different loan sizes

In [13]:
for decile in np.arange(1,11):
    print((decile-1)*10,decile*10)
    
    ml_test(df.loc[(df.loan_amnt > np.percentile(df.loan_amnt,(decile-1)*10)) & 
                   (df.loan_amnt<np.percentile(df.loan_amnt,decile*10))])
    print('\n')

0 10
MAIN CLASSIFER
Loan passing rate: 0.8456880009798451
Balanced loan passing rate: 0.8423225500026952


score:  0.9860780337774391
balanced_accuracy_score:  0.9555555261758979
[[10327  1012]
 [   11 62131]]
F1 score:  0.9918346170730735
precision_score:  0.9839728869391698
average_precision_score:  0.9836383730170525
recall_score:  0.9998229860641756
roc:  0.9951266216677824
roc_weighted:  0.9950286540594365


RETRAINED CLASSIFER
Loan passing rate: 0.8456880009798451
Balanced loan passing rate: 0.8423225500026952


score:  0.9557164437065364
balanced_accuracy_score:  0.8571636174459296
[[ 8097  3242]
 [   12 62130]]
F1 score:  0.9744812334331917
precision_score:  0.9504069020375696
average_precision_score:  0.9492412293440604
recall_score:  0.9998068938881916
roc:  0.9631289248911601
roc_weighted:  0.9631752551480401


10 20
MAIN CLASSIFER
Loan passing rate: 0.8402740327960533
Balanced loan passing rate: 0.8404687232746312


score:  0.9847005013282202
balanced_accuracy_score:  0.952

recall_score:  0.9930618720253834
roc:  0.9637984939805051
roc_weighted:  0.9638178604727806




The Global classifier has outperformed the locally trained version. 

Annual Income by decile

In [14]:
for decile in np.arange(1,11):
    print((decile-1)*10,decile*10)
    
    ml_test(df.loc[(df.annual_inc > np.percentile(df.annual_inc,(decile-1)*10)) & 
                   (df.annual_inc<np.percentile(df.annual_inc,decile*10))])
    print('\n')

0 10
MAIN CLASSIFER
Loan passing rate: 0.7586561440560218
Balanced loan passing rate: 0.7413021730605859


score:  0.9792558217084422
balanced_accuracy_score:  0.958855475058197
[[15939  1431]
 [   62 54540]]
F1 score:  0.9864976079151331
precision_score:  0.9744331886155331
average_precision_score:  0.9722167473149594
recall_score:  0.9988645104574924
roc:  0.9960201573380635
roc_weighted:  0.9961609692378186


RETRAINED CLASSIFER
Loan passing rate: 0.7586561440560218
Balanced loan passing rate: 0.7413021730605859


score:  0.9295281498360474
balanced_accuracy_score:  0.8636877101183926
[[12620  4750]
 [  322 54280]]
F1 score:  0.9553646860039426
precision_score:  0.919532441131628
average_precision_score:  0.913466733758933
recall_score:  0.9941027801179444
roc:  0.9634190605058172
roc_weighted:  0.964062997825472


10 20
MAIN CLASSIFER
Loan passing rate: 0.7724567494238119
Balanced loan passing rate: 0.7475995478751546


score:  0.9813869358461942
balanced_accuracy_score:  0.9622350

recall_score:  0.9992266421770023
roc:  0.9633799611785794
roc_weighted:  0.9637493171227729




The Global classifier has outperformed the locally trained version. 

FICO test

In [15]:
for decile in [1,5,7,8,9,10]:
    print((decile-1)*10,decile*10)
    
    ml_test(df.loc[(df.fico_range_high > np.percentile(df.fico_range_high,(decile-1)*10)) & 
                   (df.fico_range_high<np.percentile(df.fico_range_high,decile*10))])
    print('\n')

0 10
MAIN CLASSIFER
Loan passing rate: 0.7313474510264963
Balanced loan passing rate: 0.6995969401756819


score:  0.9770452775287669
balanced_accuracy_score:  0.9607875858980901
[[16876  1405]
 [  157 49609]]
F1 score:  0.9845008930343323
precision_score:  0.9724585407927235
average_precision_score:  0.9678904406579448
recall_score:  0.9968452357030905
roc:  0.9959539406525267
roc_weighted:  0.996121580018337


RETRAINED CLASSIFER
Loan passing rate: 0.7313474510264963
Balanced loan passing rate: 0.6995969401756819


score:  0.9214954369773832
balanced_accuracy_score:  0.8635337652527894
[[13473  4808]
 [  534 49232]]
F1 score:  0.9485386201182976
precision_score:  0.9110288675055515
average_precision_score:  0.8959622587148913
recall_score:  0.989269782582486
roc:  0.964851795445956
roc_weighted:  0.9654961335905506


40 50
MAIN CLASSIFER
Loan passing rate: 0.7851703564138798
Balanced loan passing rate: 0.7650369482232141


score:  0.9797848955880044
balanced_accuracy_score:  0.954696

In [16]:
feature_importance_dictionary = {}

for column,feature_importance in zip(df.columns,rf_best.feature_importances_):
    feature_importance_dictionary[column] = feature_importance

In [17]:
for thing in sorted(feature_importance_dictionary, key=feature_importance_dictionary.get, reverse=True):
    print(thing,feature_importance_dictionary[thing])

int_rate 0.043631874229616924
application_type 0.023902299694057744
annual_inc 0.023795039759499877
installment 0.02363722648086302
disbursement_method 0.021282143861930565
inq_last_12m 0.020076467029538053
acc_open_past_24mths 0.019998085016663423
loan_amnt 0.019890442928423627
pub_rec_bankruptcies 0.01974179949019202
open_acc 0.019676894717240156
total_bc_limit 0.019644628870681278
pub_rec 0.019591831461196894
sec_app_mort_acc 0.019540944260989675
mths_since_last_major_derog 0.019372685171678946
delinq_amnt 0.019282761214607265
emp_length 0.019162267904128588
acc_now_delinq 0.01895624075292583
tot_hi_cred_lim 0.018818092207064624
avg_cur_bal 0.018814037833448503
chargeoff_within_12_mths 0.018761256357897948
tax_liens 0.018563439722250503
max_bal_bc 0.018223982754868404
total_bal_ex_mort 0.016669440757676974
mo_sin_rcnt_tl 0.01655493129375922
revol_bal 0.016354467609255515
term 0.015534353779906399
mo_sin_old_il_acct 0.013954336117730181
num_bc_sats 0.013183840965204679
mths_since_rec