In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('lending_club_ml.csv')
df.drop(['id'], inplace=True, axis=1)

In [4]:
# creating columns for features that had more than 2 outcomes
# I have decided to use subgrade and not grade as these are suppose to be similar features but, subgrade is more granular

df = pd.concat([df, pd.get_dummies(
    df.sub_grade, prefix='sub_grade', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.home_ownership,
                                   prefix='home_ownership', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.verification_status,
                                   prefix='verification_status', drop_first=True)], axis=1)
df = pd.concat(
    [df, pd.get_dummies(df.purpose, prefix='purpose', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.verification_status_joint,
                                   prefix='verification_status_joint', drop_first=True)], axis=1)

df.drop(columns=['sub_grade', 'home_ownership', 'verification_status',
                 'purpose', 'verification_status_joint'], inplace=True)


# conversions for features that only had 2 outcomes

df.disbursement_method = df.disbursement_method.apply(
    lambda disburstment: 1 if disburstment == 'Cash' else 0)

df.application_type = df.application_type.apply(
    lambda app_type: 1 if app_type == 'Individual' else 0)

In [5]:
"""# if borrower is an indivudal then their secondary features are filled with their single counterpart

sec_list = ['sec_app_fico_range_low', 'sec_app_fico_range_high', 'sec_app_inq_last_6mths', 'sec_app_mort_acc',
            'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il','sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths',
            'sec_app_collections_12_mths_ex_med', 'sec_app_mths_since_last_major_derog']
joint_list = ['dti_joint', 'revol_bal_joint', 'annual_inc_joint']

for secondary in sec_list:
    df.loc[df.application_type == 1,secondary] = df.loc[df.application_type == 1][secondary[8:]]


for joint in joint_list:
    df.loc[df.application_type == 1,joint] = df.loc[df.application_type == 1][joint[:len(joint)-6]]

df.info()"""

"# if borrower is an indivudal then their secondary features are filled with their single counterpart\n\nsec_list = ['sec_app_fico_range_low', 'sec_app_fico_range_high', 'sec_app_inq_last_6mths', 'sec_app_mort_acc',\n            'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il','sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths',\n            'sec_app_collections_12_mths_ex_med', 'sec_app_mths_since_last_major_derog']\njoint_list = ['dti_joint', 'revol_bal_joint', 'annual_inc_joint']\n\nfor secondary in sec_list:\n    df.loc[df.application_type == 1,secondary] = df.loc[df.application_type == 1][secondary[8:]]\n\n\nfor joint in joint_list:\n    df.loc[df.application_type == 1,joint] = df.loc[df.application_type == 1][joint[:len(joint)-6]]\n\ndf.info()"

In [6]:
#fill all nans with 0
df.fillna(0, inplace=True)

In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score


def scoring(clf, x, y):
    
    #Baseline
    print('Loan passing rate:', np.mean(y))
    print('Balanced loan passing rate:', np.average(y, weights=x['loan_amnt']))
    print('\n')
    
    
    print('score: ', clf.score(x, y))
    # score adjusted for loan amount
    print('balanced_accuracy_score: ', balanced_accuracy_score(
        y, clf.predict(x), sample_weight=x['loan_amnt']))

    print(confusion_matrix(y, clf.predict(x)))
    
    print('F1 score: ', f1_score(y, clf.predict(x)))
    
    print('precision_score: ', precision_score(y, clf.predict(x)))

    # score adjusted for loan amount
    print('average_precision_score: ', average_precision_score(
        y, clf.predict(x), average='weighted', sample_weight=x['loan_amnt']))
    
    print('recall_score: ', recall_score(y, clf.predict(x)))

    print('roc: ', roc_auc_score(y, clf.predict_proba(x)[:, 1]))

    # score adjusted for loan amount
    print('roc_weighted: ', roc_auc_score(y, clf.predict_proba(x)[
          :, 1], average='weighted', sample_weight=x['loan_amnt']))

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = df.drop(['grade','loan_status'], axis=1)
y = df.loan_status

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



rf_best = RandomForestClassifier(criterion= 'entropy', n_estimators= 101)
rf_best.fit(X_train, y_train)
    
print('RANDOM FOREST')
scoring(rf_best, X_test, y_test)

RANDOM FOREST
Loan passing rate: 0.8009672589365736
Balanced loan passing rate: 0.7855872316459166


score:  0.9414312347245161
balanced_accuracy_score:  0.8596973398785499
[[107936  41741]
 [  2304 600041]]
F1 score:  0.9645976656723951
precision_score:  0.9349607810751938
average_precision_score:  0.9289922760237548
recall_score:  0.9961749495720891
roc:  0.9644580392774568
roc_weighted:  0.965244315949617


In [9]:
scoring(rf_best, X_train, y_train)

Loan passing rate: 0.8015205879469954
Balanced loan passing rate: 0.7865164321952773


score:  0.9999971505360412
balanced_accuracy_score:  0.9999897655383001
[[ 348270       5]
 [      0 1406441]]
F1 score:  0.9999982224668108
precision_score:  0.9999964449399408
average_precision_score:  0.9999944441662757
recall_score:  1.0
roc:  0.9999999999999999
roc_weighted:  0.9999999999999999


# Testing based off of certain features
Here will be testing our classifier, that was trained and test on the general population with global averages to fill the nans, on certain sub-groups of the population and to see if we need to fill nans with local averages.

In [10]:
#train and test the subsection of the data

def ml_test(dataframe):
    dataframe.fillna(0,inplace=True)
    
    X = dataframe.drop(['grade','loan_status'], axis=1)
    y = dataframe.loan_status

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    #first print out the test scores on our main classifer on subsection
    print('MAIN CLASSIFER')
    scoring(rf_best, X_test, y_test)
    
    #trains random forest model on subsection then prints out scores
    print('\n')
    print('RETRAINED CLASSIFER')
    rf = RandomForestClassifier(criterion= 'entropy', n_estimators= 101)
    rf.fit(X_train, y_train)
    scoring(rf, X_test, y_test)

Individual verse joint: in our EDA joint borrowers failed in higher rates and had wide differences between their single and joint/secondary features.

In [11]:
ml_test(df.loc[df.application_type == 1])

MAIN CLASSIFER
Loan passing rate: 0.8018762389397847
Balanced loan passing rate: 0.7866714294905874


score:  0.976385082389074
balanced_accuracy_score:  0.9432222459588948
[[130203  16618]
 [   882 593354]]
F1 score:  0.9854676268551613
precision_score:  0.9727561265107251
average_precision_score:  0.970208747116037
recall_score:  0.9985157412206598
roc:  0.9922711138650856
roc_weighted:  0.9925060586911842


RETRAINED CLASSIFER
Loan passing rate: 0.8018762389397847
Balanced loan passing rate: 0.7866714294905874


score:  0.9412649769181048
balanced_accuracy_score:  0.8591762956857232
[[105637  41184]
 [  2342 591894]]
F1 score:  0.9645355630262509
precision_score:  0.9349464047084246
average_precision_score:  0.9291758985403014
recall_score:  0.9960588049192577
roc:  0.9637510018663393
roc_weighted:  0.964695795792294


In [12]:
ml_test(df.loc[df.application_type == 0])

MAIN CLASSIFER
Loan passing rate: 0.7680802553579571
Balanced loan passing rate: 0.7633904760778214


score:  0.9800273597811218
balanced_accuracy_score:  0.9602156679297928
[[2346  197]
 [  22 8400]]
F1 score:  0.9871320289088665
precision_score:  0.9770850296615098
average_precision_score:  0.9760969378967801
recall_score:  0.9973877938731893
roc:  0.9968989098734258
roc_weighted:  0.9971694819253112


RETRAINED CLASSIFER
Loan passing rate: 0.7680802553579571
Balanced loan passing rate: 0.7633904760778214


score:  0.9261285909712722
balanced_accuracy_score:  0.8413188780617371
[[1763  780]
 [  30 8392]]
F1 score:  0.9539615778106173
precision_score:  0.914958569559529
average_precision_score:  0.9105429772872695
recall_score:  0.9964379007361672
roc:  0.9629353509566587
roc_weighted:  0.9622769263178165


The Global classifier has outperformed the locally trained version. 

ML for different grades

In [13]:
grades = set(df.grade)

for loan_grade in sorted(grades):
    print(loan_grade)
    ml_test(df.loc[df.grade==loan_grade])
    print('\n')

A
MAIN CLASSIFER
Loan passing rate: 0.9405104807551982
Balanced loan passing rate: 0.9413940416432522


score:  0.9946156032291018
balanced_accuracy_score:  0.9570227584450608
[[  7044    701]
 [     0 122446]]
F1 score:  0.9971456841196614
precision_score:  0.9943076161010824
average_precision_score:  0.9946774331480273
recall_score:  1.0
roc:  0.9926290918592253
roc_weighted:  0.992649224622702


RETRAINED CLASSIFER
Loan passing rate: 0.9405104807551982
Balanced loan passing rate: 0.9413940416432522


score:  0.9826639322226575
balanced_accuracy_score:  0.8520034887242486
[[  5488   2257]
 [     0 122446]]
F1 score:  0.9908678570417035
precision_score:  0.9819009967683215
average_precision_score:  0.9819065265606292
recall_score:  1.0
roc:  0.9477328987288551
roc_weighted:  0.9464595150660863


B
MAIN CLASSIFER
Loan passing rate: 0.8673473571985074
Balanced loan passing rate: 0.8650654540003315


score:  0.9879580650893236
balanced_accuracy_score:  0.9543750129887569
[[ 26472   2643]

The Global classifier has outperformed the locally trained version. 

ML for different loan sizes

In [14]:
for decile in np.arange(1,11):
    print((decile-1)*10,decile*10)
    
    ml_test(df.loc[(df.loan_amnt > np.percentile(df.loan_amnt,(decile-1)*10)) & 
                   (df.loan_amnt<np.percentile(df.loan_amnt,decile*10))])
    print('\n')

0 10
MAIN CLASSIFER
Loan passing rate: 0.8456880009798451
Balanced loan passing rate: 0.8423225500026952


score:  0.985996380016603
balanced_accuracy_score:  0.9552806926480657
[[10321  1018]
 [   11 62131]]
F1 score:  0.9917871195856046
precision_score:  0.983879396348319
average_precision_score:  0.9835388338289567
recall_score:  0.9998229860641756
roc:  0.9956453740142861
roc_weighted:  0.9955258299792639


RETRAINED CLASSIFER
Loan passing rate: 0.8456880009798451
Balanced loan passing rate: 0.8423225500026952


score:  0.9557164437065364
balanced_accuracy_score:  0.8572687890872621
[[ 8099  3240]
 [   14 62128]]
F1 score:  0.974480432907223
precision_score:  0.9504344633459797
average_precision_score:  0.9492773441418482
recall_score:  0.9997747095362235
roc:  0.9622828843091134
roc_weighted:  0.9622485592376738


10 20
MAIN CLASSIFER
Loan passing rate: 0.8402740327960533
Balanced loan passing rate: 0.8404687232746312


score:  0.984780394271676
balanced_accuracy_score:  0.9527586

recall_score:  0.9936964569011105
roc:  0.9644082498824087
roc_weighted:  0.9643877019699697




The Global classifier has outperformed the locally trained version. 

Annual Income by decile

In [15]:
for decile in np.arange(1,11):
    print((decile-1)*10,decile*10)
    
    ml_test(df.loc[(df.annual_inc > np.percentile(df.annual_inc,(decile-1)*10)) & 
                   (df.annual_inc<np.percentile(df.annual_inc,decile*10))])
    print('\n')

0 10
MAIN CLASSIFER
Loan passing rate: 0.7586561440560218
Balanced loan passing rate: 0.7413021730605859


score:  0.9791585616628689
balanced_accuracy_score:  0.9586092388437082
[[15934  1436]
 [   64 54538]]
F1 score:  0.9864346693676747
precision_score:  0.9743452317147246
average_precision_score:  0.972052556315359
recall_score:  0.9988278817625728
roc:  0.995882030044513
roc_weighted:  0.995999881863035


RETRAINED CLASSIFER
Loan passing rate: 0.7586561440560218
Balanced loan passing rate: 0.7413021730605859


score:  0.9292363696993275
balanced_accuracy_score:  0.8619998767558503
[[12563  4807]
 [  286 54316]]
F1 score:  0.9552165311057375
precision_score:  0.9186949241411971
average_precision_score:  0.9124505682297281
recall_score:  0.9947620966264972
roc:  0.9641354614752693
roc_weighted:  0.9647840795566627


10 20
MAIN CLASSIFER
Loan passing rate: 0.7724567494238119
Balanced loan passing rate: 0.7475995478751546


score:  0.98141610992794
balanced_accuracy_score:  0.96225488

recall_score:  0.999323311904877
roc:  0.9628698487158532
roc_weighted:  0.9626602186058735




The Global classifier has outperformed the locally trained version. 

FICO test

In [16]:
for decile in [1,5,7,8,9,10]:
    print((decile-1)*10,decile*10)
    
    ml_test(df.loc[(df.fico_range_high > np.percentile(df.fico_range_high,(decile-1)*10)) & 
                   (df.fico_range_high<np.percentile(df.fico_range_high,decile*10))])
    print('\n')

0 10
MAIN CLASSIFER
Loan passing rate: 0.7313474510264963
Balanced loan passing rate: 0.6995969401756819


score:  0.9769424074536718
balanced_accuracy_score:  0.9605399706904182
[[16862  1419]
 [  150 49616]]
F1 score:  0.9844346782273985
precision_score:  0.9721955520721074
average_precision_score:  0.9676961055272003
recall_score:  0.9969858939838444
roc:  0.9959202756334687
roc_weighted:  0.9960333917814489


RETRAINED CLASSIFER
Loan passing rate: 0.7313474510264963
Balanced loan passing rate: 0.6995969401756819


score:  0.9214513498023424
balanced_accuracy_score:  0.8637880408357643
[[13464  4817]
 [  528 49238]]
F1 score:  0.9485171593415591
precision_score:  0.9108870594764592
average_precision_score:  0.8961129308188192
recall_score:  0.9893903468231322
roc:  0.964433758952018
roc_weighted:  0.9650956658475641


40 50
MAIN CLASSIFER
Loan passing rate: 0.7851703564138798
Balanced loan passing rate: 0.7650369482232141


score:  0.9796867640131889
balanced_accuracy_score:  0.9545

In [17]:
feature_importance = pd.DataFrame(data = rf_best.feature_importances_,index=X.columns,columns=['feature_importance'])
feature_importance.sort_values(by='feature_importance',ascending =False)

Unnamed: 0,feature_importance
int_rate,0.047555
dti,0.030253
installment,0.026801
annual_inc,0.024744
length_of_cr,0.024518
revol_util,0.024386
revol_bal,0.023915
bc_open_to_buy,0.023277
tot_hi_cred_lim,0.023141
avg_cur_bal,0.022902
