In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('lending_club_ml.csv')
df.drop(['id'], inplace=True, axis=1)

In [4]:
# creating columns for features that had more than 2 outcomes
# I have decided to use subgrade and not grade as these are suppose to be similar features but, subgrade is more granular

df = pd.concat([df, pd.get_dummies(
    df.sub_grade, prefix='sub_grade', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.home_ownership,
                                   prefix='home_ownership', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.verification_status,
                                   prefix='verification_status', drop_first=True)], axis=1)
df = pd.concat(
    [df, pd.get_dummies(df.purpose, prefix='purpose', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.verification_status_joint,
                                   prefix='verification_status_joint', drop_first=True)], axis=1)

df.drop(columns=['sub_grade', 'home_ownership', 'verification_status',
                 'purpose', 'verification_status_joint'], inplace=True)


# conversions for features that only had 2 outcomes

df.disbursement_method = df.disbursement_method.apply(
    lambda disburstment: 1 if disburstment == 'Cash' else 0)

df.application_type = df.application_type.apply(
    lambda app_type: 1 if app_type == 'Individual' else 0)

In [5]:
"""# if borrower is an indivudal then their secondary features are filled with their single counterpart

sec_list = ['sec_app_fico_range_low', 'sec_app_fico_range_high', 'sec_app_inq_last_6mths', 'sec_app_mort_acc',
            'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il','sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths',
            'sec_app_collections_12_mths_ex_med', 'sec_app_mths_since_last_major_derog']
joint_list = ['dti_joint', 'revol_bal_joint', 'annual_inc_joint']

for secondary in sec_list:
    df.loc[df.application_type == 1,secondary] = df.loc[df.application_type == 1][secondary[8:]]


for joint in joint_list:
    df.loc[df.application_type == 1,joint] = df.loc[df.application_type == 1][joint[:len(joint)-6]]

df.info()"""

"# if borrower is an indivudal then their secondary features are filled with their single counterpart\n\nsec_list = ['sec_app_fico_range_low', 'sec_app_fico_range_high', 'sec_app_inq_last_6mths', 'sec_app_mort_acc',\n            'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il','sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths',\n            'sec_app_collections_12_mths_ex_med', 'sec_app_mths_since_last_major_derog']\njoint_list = ['dti_joint', 'revol_bal_joint', 'annual_inc_joint']\n\nfor secondary in sec_list:\n    df.loc[df.application_type == 1,secondary] = df.loc[df.application_type == 1][secondary[8:]]\n\n\nfor joint in joint_list:\n    df.loc[df.application_type == 1,joint] = df.loc[df.application_type == 1][joint[:len(joint)-6]]\n\ndf.info()"

In [6]:
#fill all nans with 0
df.fillna(0, inplace=True)

In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score


def scoring(clf, x, y):
    
    #Baseline
    print('Loan passing rate:', np.mean(y))
    print('Balanced loan passing rate:', np.average(y, weights=x['loan_amnt']))
    print('\n')
    
    
    print('score: ', clf.score(x, y))
    # score adjusted for loan amount
    print('balanced_accuracy_score: ', balanced_accuracy_score(
        y, clf.predict(x), sample_weight=x['loan_amnt']))

    print(confusion_matrix(y, clf.predict(x)))
    
    print('F1 score: ', f1_score(y, clf.predict(x)))
    
    print('precision_score: ', precision_score(y, clf.predict(x)))

    # score adjusted for loan amount
    print('average_precision_score: ', average_precision_score(
        y, clf.predict(x), average='weighted', sample_weight=x['loan_amnt']))
    
    print('recall_score: ', recall_score(y, clf.predict(x)))

    print('roc: ', roc_auc_score(y, clf.predict_proba(x)[:, 1]))

    # score adjusted for loan amount
    print('roc_weighted: ', roc_auc_score(y, clf.predict_proba(x)[
          :, 1], average='weighted', sample_weight=x['loan_amnt']))

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = df.drop(['grade','loan_status'], axis=1)
y = df.loan_status

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



rf_best = RandomForestClassifier(criterion= 'entropy', n_estimators= 101)
rf_best.fit(X_train, y_train)
    
print('RANDOM FOREST')
scoring(rf_best, X_test, y_test)

RANDOM FOREST
Loan passing rate: 0.8009672589365736
Balanced loan passing rate: 0.7855872316459166


score:  0.9414338942211797
balanced_accuracy_score:  0.8602320692946972
[[108066  41611]
 [  2432 599913]]
F1 score:  0.96459193050072
precision_score:  0.9351372668832343
average_precision_score:  0.9292499991517935
recall_score:  0.9959624467705385
roc:  0.9647261565103948
roc_weighted:  0.9655150527937684


# Testing based off of certain features
Here will be testing our classifier, that was trained and test on the general populatyion with global averages to fill the nans, on certain sub-groups of the population and to see if we need to fill nans with local averages.

In [9]:
def ml_test(dataframe):
    dataframe.fillna(0,inplace=True)
    
    X = dataframe.drop(['grade','loan_status'], axis=1)
    y = dataframe.loan_status

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    print('MAIN CLASSIFER')
    scoring(rf_best, X_test, y_test)
    
    print('\n')
    print('RETRAINED CLASSIFER')
    rf = RandomForestClassifier(criterion= 'entropy', n_estimators= 101)
    rf.fit(X_train, y_train)
    scoring(rf, X_test, y_test)

Individual verse joint: in our EDA joint borrowers failed in higher rates and had wide differences between their single and joint/secondary features.

In [10]:
ml_test(df.loc[df.application_type == 1])

MAIN CLASSIFER
Loan passing rate: 0.8018762389397847
Balanced loan passing rate: 0.7866714294905874


score:  0.9763675398788487
balanced_accuracy_score:  0.9433116183374979
[[130230  16591]
 [   922 593314]]
F1 score:  0.985456022176805
precision_score:  0.9727974028742181
average_precision_score:  0.970257096591743
recall_score:  0.9984484278973337
roc:  0.9923590577367548
roc_weighted:  0.9925843852868556


RETRAINED CLASSIFER
Loan passing rate: 0.8018762389397847
Balanced loan passing rate: 0.7866714294905874


score:  0.9412244942022003
balanced_accuracy_score:  0.8590018069866086
[[105551  41270]
 [  2286 591950]]
F1 score:  0.9645152249856614
precision_score:  0.9348251792426013
average_precision_score:  0.9290911291607827
recall_score:  0.9961530435719141
roc:  0.964107709067975
roc_weighted:  0.9649290820221007


In [11]:
ml_test(df.loc[df.application_type == 0])

MAIN CLASSIFER
Loan passing rate: 0.7680802553579571
Balanced loan passing rate: 0.7633904760778214


score:  0.9803921568627451
balanced_accuracy_score:  0.9620772593993847
[[2349  194]
 [  21 8401]]
F1 score:  0.9873655756008697
precision_score:  0.9774287376381617
average_precision_score:  0.9771611732540505
recall_score:  0.997506530515317
roc:  0.9969369634964435
roc_weighted:  0.9972945724052852


RETRAINED CLASSIFER
Loan passing rate: 0.7680802553579571
Balanced loan passing rate: 0.7633904760778214


score:  0.9246694026447788
balanced_accuracy_score:  0.8421763473166493
[[1759  784]
 [  42 8380]]
F1 score:  0.953030819970431
precision_score:  0.9144478393714535
average_precision_score:  0.9109824214358477
recall_score:  0.995013061030634
roc:  0.9617813690022005
roc_weighted:  0.9609105541013541


The Global classifier has outperformed the locally trained version. 

ML for different grades

In [12]:
grades = set(df.grade)

for loan_grade in sorted(grades):
    print(loan_grade)
    ml_test(df.loc[df.grade==loan_grade])
    print('\n')

A
MAIN CLASSIFER
Loan passing rate: 0.9405104807551982
Balanced loan passing rate: 0.9413940416432522


score:  0.9946156032291018
balanced_accuracy_score:  0.9569929927858436
[[  7044    701]
 [     0 122446]]
F1 score:  0.9971456841196614
precision_score:  0.9943076161010824
average_precision_score:  0.9946737664195048
recall_score:  1.0
roc:  0.9919760937660329
roc_weighted:  0.9917791211787231


RETRAINED CLASSIFER
Loan passing rate: 0.9405104807551982
Balanced loan passing rate: 0.9413940416432522


score:  0.9826792942676529
balanced_accuracy_score:  0.8521310558351796
[[  5490   2255]
 [     0 122446]]
F1 score:  0.9908758754911045
precision_score:  0.9819167448536901
average_precision_score:  0.9819218404712945
recall_score:  1.0
roc:  0.9482498038396964
roc_weighted:  0.9467508719636832


B
MAIN CLASSIFER
Loan passing rate: 0.8673473571985074
Balanced loan passing rate: 0.8650654540003315


score:  0.9879626212508485
balanced_accuracy_score:  0.9544207973578072
[[ 26473   2642

The Global classifier has outperformed the locally trained version. 

ML for different loan sizes

In [13]:
for decile in np.arange(1,11):
    print((decile-1)*10,decile*10)
    
    ml_test(df.loc[(df.loan_amnt > np.percentile(df.loan_amnt,(decile-1)*10)) & 
                   (df.loan_amnt<np.percentile(df.loan_amnt,decile*10))])
    print('\n')

0 10
MAIN CLASSIFER
Loan passing rate: 0.8456880009798451
Balanced loan passing rate: 0.8423225500026952


score:  0.9861052516977178
balanced_accuracy_score:  0.9555562898555657
[[10326  1013]
 [    8 62134]]
F1 score:  0.9918508408559411
precision_score:  0.9839580660997356
average_precision_score:  0.9836367758197843
recall_score:  0.9998712625921277
roc:  0.9952201028906399
roc_weighted:  0.9951601291320926


RETRAINED CLASSIFER
Loan passing rate: 0.8456880009798451
Balanced loan passing rate: 0.8423225500026952


score:  0.9555803541051429
balanced_accuracy_score:  0.8568450615562462
[[ 8093  3246]
 [   18 62124]]
F1 score:  0.9744024091850179
precision_score:  0.9503441945846719
average_precision_score:  0.9491349835172084
recall_score:  0.9997103408322874
roc:  0.9628632145257872
roc_weighted:  0.9623637362351344


10 20
MAIN CLASSIFER
Loan passing rate: 0.8402740327960533
Balanced loan passing rate: 0.8404687232746312


score:  0.9848003675075399
balanced_accuracy_score:  0.952

recall_score:  0.9931464833421471
roc:  0.9638948577991222
roc_weighted:  0.9638576405467292




The Global classifier has outperformed the locally trained version. 

Annual Income by decile

In [14]:
for decile in np.arange(1,11):
    print((decile-1)*10,decile*10)
    
    ml_test(df.loc[(df.annual_inc > np.percentile(df.annual_inc,(decile-1)*10)) & 
                   (df.annual_inc<np.percentile(df.annual_inc,decile*10))])
    print('\n')

0 10
MAIN CLASSIFER
Loan passing rate: 0.7586561440560218
Balanced loan passing rate: 0.7413021730605859


score:  0.9791863502473184
balanced_accuracy_score:  0.9589275077722865
[[15939  1431]
 [   67 54535]]
F1 score:  0.9864517762824687
precision_score:  0.9744309044777186
average_precision_score:  0.972259252969048
recall_score:  0.9987729387201933
roc:  0.9958718448633695
roc_weighted:  0.9959721504534519


RETRAINED CLASSIFER
Loan passing rate: 0.7586561440560218
Balanced loan passing rate: 0.7413021730605859


score:  0.9300700272328127
balanced_accuracy_score:  0.8644998304067847
[[12631  4739]
 [  294 54308]]
F1 score:  0.9557145245448706
precision_score:  0.9197419005199248
average_precision_score:  0.9139124711659499
recall_score:  0.9946155818468188
roc:  0.963115460921516
roc_weighted:  0.9638310292572698


10 20
MAIN CLASSIFER
Loan passing rate: 0.7724567494238119
Balanced loan passing rate: 0.7475995478751546


score:  0.9814890451323045
balanced_accuracy_score:  0.96250

average_precision_score:  0.945990808469716
recall_score:  0.9994199816327517
roc:  0.9639177655364713
roc_weighted:  0.9636417975315626




The Global classifier has outperformed the locally trained version. 

FICO test

In [15]:
for decile in [1,5,7,8,9,10]:
    print((decile-1)*10,decile*10)
    
    ml_test(df.loc[(df.fico_range_high > np.percentile(df.fico_range_high,(decile-1)*10)) & 
                   (df.fico_range_high<np.percentile(df.fico_range_high,decile*10))])
    print('\n')

0 10
MAIN CLASSIFER
Loan passing rate: 0.7313474510264963
Balanced loan passing rate: 0.6995969401756819


score:  0.9770011903537261
balanced_accuracy_score:  0.9605620588202005
[[16872  1409]
 [  156 49610]]
F1 score:  0.9844718956193879
precision_score:  0.9723828377663224
average_precision_score:  0.9677030849263832
recall_score:  0.9968653297431982
roc:  0.9960251057164037
roc_weighted:  0.9961777224487528


RETRAINED CLASSIFER
Loan passing rate: 0.7313474510264963
Balanced loan passing rate: 0.6995969401756819


score:  0.9215836113274648
balanced_accuracy_score:  0.8642289477948322
[[13467  4814]
 [  522 49244]]
F1 score:  0.9486053321004778
precision_score:  0.9109475008324392
average_precision_score:  0.896421246402596
recall_score:  0.9895109110637785
roc:  0.9648287907872715
roc_weighted:  0.9648584215433926


40 50
MAIN CLASSIFER
Loan passing rate: 0.7851703564138798
Balanced loan passing rate: 0.7650369482232141


score:  0.9797260166431151
balanced_accuracy_score:  0.9548

In [16]:
feature_importance = pd.DataFrame(data = rf_best.feature_importances_,index=X.columns,columns=['feature_importance'])
feature_importance.sort_values(by='feature_importance',ascending =False)

Unnamed: 0,feature_importance
int_rate,0.044692
dti,0.030237
installment,0.026616
annual_inc,0.024941
length_of_cr,0.024522
revol_util,0.024340
revol_bal,0.024164
bc_open_to_buy,0.023242
avg_cur_bal,0.023225
tot_hi_cred_lim,0.023047
