In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('lending_club_ml.csv')
df.drop(['id'], inplace=True, axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2506738 entries, 0 to 2506737
Data columns (total 100 columns):
loan_amnt                              float64
term                                   int64
int_rate                               float64
installment                            float64
grade                                  object
sub_grade                              object
emp_length                             int64
home_ownership                         object
annual_inc                             float64
verification_status                    object
loan_status                            int64
purpose                                object
dti                                    float64
delinq_2yrs                            float64
fico_range_low                         float64
fico_range_high                        float64
inq_last_6mths                         float64
mths_since_last_delinq                 float64
mths_since_last_record                 float64
open

In [4]:
for thing in df:
    if df[thing].dtype == 'object':
        print(thing)

grade
sub_grade
home_ownership
verification_status
purpose
application_type
verification_status_joint
disbursement_method


In [5]:
# creating columns for features that had more than 2 outcomes
# I have decided to use subgrade and not grade as these are suppose to be similar features but, subgrade is more granular

df = pd.concat([df, pd.get_dummies(
    df.sub_grade, prefix='sub_grade', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.home_ownership,
                                   prefix='home_ownership', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.verification_status,
                                   prefix='verification_status', drop_first=True)], axis=1)
df = pd.concat(
    [df, pd.get_dummies(df.purpose, prefix='purpose', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.verification_status_joint,
                                   prefix='verification_status_joint', drop_first=True)], axis=1)

df.drop(columns=['sub_grade', 'home_ownership', 'verification_status',
                 'purpose', 'verification_status_joint'], inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2506738 entries, 0 to 2506737
Columns: 148 entries, loan_amnt to verification_status_joint_Verified
dtypes: float64(89), int64(3), object(3), uint8(53)
memory usage: 1.9+ GB


For the time being I have decided to drop grade in favor sub grade

In [6]:
# conversions for features that only had 2 outcomes

df.disbursement_method = df.disbursement_method.apply(
    lambda disburstment: 1 if disburstment == 'Cash' else 0)

df.application_type = df.application_type.apply(
    lambda app_type: 1 if app_type == 'Individual' else 0)

In [7]:
# if borrower is an indivudal then their secondary features are filled with their single counterpart

sec_list = ['sec_app_fico_range_low', 'sec_app_fico_range_high', 'sec_app_inq_last_6mths', 'sec_app_mort_acc',
            'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il','sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths',
            'sec_app_collections_12_mths_ex_med', 'sec_app_mths_since_last_major_derog']
joint_list = ['dti_joint', 'revol_bal_joint', 'annual_inc_joint']

for secondary in sec_list:
    df.loc[df.application_type == 1,secondary] = df.loc[df.application_type == 1][secondary[8:]]


for joint in joint_list:
    df.loc[df.application_type == 1,joint] = df.loc[df.application_type == 1][joint[:len(joint)-6]]


In [8]:
#fill all nans with averages of respective feature

df_global = df.copy()

for thing in df_global:
    if df_global[thing].isnull().values.any():
        df_global[thing].fillna(np.mean(df_global[thing]), inplace=True)

In [9]:
#fill nans with 0

df_0 = df.copy()

df_0.fillna(0,inplace=True)

In [10]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score


def scoring(clf, x, y):
    
    #Baseline
    print('Loan passing rate:', np.mean(y))
    print('Balanced loan passing rate:', np.average(y, weights=x['loan_amnt']))
    print('\n')
    
    
    print('score: ', clf.score(x, y))
    # score adjusted for loan amount
    print('balanced_accuracy_score: ', balanced_accuracy_score(
        y, clf.predict(x), sample_weight=x['loan_amnt']))

    print(confusion_matrix(y, clf.predict(x)))
    
    print('F1 score: ', f1_score(y, clf.predict(x)))
    
    print('precision_score: ', precision_score(y, clf.predict(x)))

    # score adjusted for loan amount
    print('average_precision_score: ', average_precision_score(
        y, clf.predict(x), average='weighted', sample_weight=x['loan_amnt']))
    
    print('recall_score: ', recall_score(y, clf.predict(x)))

    print('roc: ', roc_auc_score(y, clf.predict_proba(x)[:, 1]))
    
    # score adjusted for loan amount
    print('roc_weighted: ', roc_auc_score(y, clf.predict_proba(x)[
          :, 1], average='weighted', sample_weight=x['loan_amnt']))

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

parameters = {'class_weight': ['balanced', None],
              'criterion':['gini','entropy'],
              'n_estimators': [31,61,101]}

rf = RandomForestClassifier()

rf_cv = GridSearchCV(rf, parameters, cv=2)

X_train, X_test, y_train, y_test = train_test_split(
    df_global.drop(['grade', 'loan_status'], axis=1), df_global.loan_status, test_size=0.3, random_state=42)

rf_cv.fit(X_train, y_train)

scoring(rf_cv, X_test, y_test)

rf_cv.best_params_

Loan passing rate: 0.8009672589365736
Balanced loan passing rate: 0.7855872316459166


score:  0.9414485214528299
balanced_accuracy_score:  0.8602194324067285
[[108035  41642]
 [  2390 599955]]
F1 score:  0.9646028512583384
precision_score:  0.9350963299391986
average_precision_score:  0.929240714442075
recall_score:  0.9960321742522973
roc:  0.9644819356626336
roc_weighted:  0.9653726069638308


{'class_weight': None, 'criterion': 'entropy', 'n_estimators': 101}

In [12]:
rf_cv.cv_results_

{'mean_fit_time': array([189.24578261, 374.40472686, 629.3536638 , 207.815575  ,
        439.85815489, 683.81564224, 213.7911365 , 405.44491613,
        666.43385112, 222.0709846 , 430.86518526, 716.66022801]),
 'std_fit_time': array([ 7.48860598, 11.66744506,  5.87156498,  4.99168122,  1.91236627,
         4.73340809,  1.49103332, 11.91378272, 23.92983139,  0.47105265,
         2.51298904, 10.27856016]),
 'mean_score_time': array([16.27320445, 30.38423371, 50.24074173, 15.48011637, 31.42988014,
        51.29145074, 18.26019275, 34.97027028, 53.47753847, 16.1920408 ,
        31.55969954, 52.01865339]),
 'std_score_time': array([0.60262287, 1.60464311, 2.36364961, 1.02505994, 0.10805035,
        0.39251924, 0.10462439, 0.04020679, 1.66756833, 0.07317483,
        0.05757475, 0.65471888]),
 'param_class_weight': masked_array(data=['balanced', 'balanced', 'balanced', 'balanced',
                    'balanced', 'balanced', None, None, None, None, None,
                    None],
           

In [13]:
rf_cv0 = GridSearchCV(rf, parameters, cv=2)

X0_train, X0_test, y0_train, y0_test = train_test_split(
    df_0.drop(['grade', 'loan_status'], axis=1), df_0.loan_status, test_size=0.3, random_state=42)

rf_cv0.fit(X_train, y_train)

scoring(rf_cv, X_test, y_test)

rf_cv0.best_params_

Loan passing rate: 0.8009672589365736
Balanced loan passing rate: 0.7855872316459166


score:  0.9414485214528299
balanced_accuracy_score:  0.8602194324067285
[[108035  41642]
 [  2390 599955]]
F1 score:  0.9646028512583384
precision_score:  0.9350963299391986
average_precision_score:  0.929240714442075
recall_score:  0.9960321742522973
roc:  0.9644819356626336
roc_weighted:  0.9653726069638308


{'class_weight': None, 'criterion': 'entropy', 'n_estimators': 101}

In [14]:
rf_cv0.cv_results_

{'mean_fit_time': array([175.75168943, 332.5977757 , 547.21123791, 194.0141716 ,
        382.2543968 , 630.42463434, 186.12559986, 357.85094774,
        590.35645819, 195.38701093, 382.94614875, 635.74373555]),
 'std_fit_time': array([1.35364604, 0.05975699, 0.11553311, 0.62529159, 1.92539299,
        2.68243897, 2.65633178, 1.99806511, 0.96246409, 0.56529367,
        1.53090489, 0.25895047]),
 'mean_score_time': array([14.28439736, 27.055457  , 43.97178316, 13.7667625 , 26.23386705,
        42.81012058, 14.95983636, 28.60220993, 46.86334479, 13.87848985,
        26.46876097, 43.24542487]),
 'std_score_time': array([0.19377613, 0.02586639, 0.12389255, 0.03119087, 0.0938369 ,
        0.01837039, 0.00763404, 0.02293432, 0.04613268, 0.03397   ,
        0.03079176, 0.00545704]),
 'param_class_weight': masked_array(data=['balanced', 'balanced', 'balanced', 'balanced',
                    'balanced', 'balanced', None, None, None, None, None,
                    None],
              mask=[Fal

In [15]:
scoring(rf_cv, X0_test, y0_test)
scoring(rf_cv0, X_test, y_test)

Loan passing rate: 0.8009672589365736
Balanced loan passing rate: 0.7855872316459166


score:  0.8751446101310866
balanced_accuracy_score:  0.7363449960435129
[[ 69029  80648]
 [ 13246 589099]]
F1 score:  0.9261893007738434
precision_score:  0.8795843803704981
average_precision_score:  0.8744520975947128
recall_score:  0.9780092803957865
roc:  0.8820008884224068
roc_weighted:  0.891890237698218
Loan passing rate: 0.8009672589365736
Balanced loan passing rate: 0.7855872316459166


score:  0.9414378834661752
balanced_accuracy_score:  0.8603868714360607
[[108131  41546]
 [  2494 599851]]
F1 score:  0.9645907270157316
precision_score:  0.9352257650098145
average_precision_score:  0.9293245702907738
recall_score:  0.9958595157260374
roc:  0.9644467537663284
roc_weighted:  0.9653301573861001


From the results we can see that the random forest trained on nans=0 out performed the the random forest we trained on 
filling our nans with the global average even againest each other's data set.

We are going to fill our nans with 0s and train a random forest with these criteria: 'class_weight': None, 'criterion': 'entropy', 'n_estimators': a bunch more than 101.