In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('lending_club_ml.csv')
df.drop(['id'], inplace=True, axis=1)

In [4]:
# creating columns for features that had more than 2 outcomes
# I have decided to use subgrade and not grade as these are suppose to be similar features but, subgrade is more granular

df = pd.concat([df, pd.get_dummies(
    df.sub_grade, prefix='sub_grade', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.home_ownership,
                                   prefix='home_ownership', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.verification_status,
                                   prefix='verification_status', drop_first=True)], axis=1)
df = pd.concat(
    [df, pd.get_dummies(df.purpose, prefix='purpose', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.verification_status_joint,
                                   prefix='verification_status_joint', drop_first=True)], axis=1)

df.drop(columns=['sub_grade', 'home_ownership', 'verification_status',
                 'purpose', 'verification_status_joint'], inplace=True)


# conversions for features that only had 2 outcomes

df.disbursement_method = df.disbursement_method.apply(
    lambda disburstment: 1 if disburstment == 'Cash' else 0)

df.application_type = df.application_type.apply(
    lambda app_type: 1 if app_type == 'Individual' else 0)

In [5]:
#fill all nans with 0
df.fillna(0, inplace=True)

In [6]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score


def scoring(clf, x, y):
    
    #Baseline
    print('Loan passing rate:', np.mean(y))
    print('Balanced loan passing rate:', np.average(y, weights=x['loan_amnt']))
    print('\n')
    
    
    print('score: ', clf.score(x, y))
    # score adjusted for loan amount
    print('balanced_accuracy_score: ', balanced_accuracy_score(
        y, clf.predict(x), sample_weight=x['loan_amnt']))

    print(confusion_matrix(y, clf.predict(x)))
    
    print('F1 score: ', f1_score(y, clf.predict(x)))
    
    print('precision_score: ', precision_score(y, clf.predict(x)))

    # score adjusted for loan amount
    print('average_precision_score: ', average_precision_score(
        y, clf.predict(x), average='weighted', sample_weight=x['loan_amnt']))
    
    print('recall_score: ', recall_score(y, clf.predict(x)))

    print('roc: ', roc_auc_score(y, clf.predict_proba(x)[:, 1]))

    # score adjusted for loan amount
    print('roc_weighted: ', roc_auc_score(y, clf.predict_proba(x)[
          :, 1], average='weighted', sample_weight=x['loan_amnt']))

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = df.drop(['grade','loan_status'], axis=1)
y = df.loan_status

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf_best = RandomForestClassifier(criterion= 'entropy', n_estimators= 101)
rf_best.fit(X_train, y_train)
    
print('RANDOM FOREST')
scoring(rf_best, X_test, y_test)

RANDOM FOREST
Loan passing rate: 0.8009672589365736
Balanced loan passing rate: 0.7855872316459166


score:  0.9414963923927757
balanced_accuracy_score:  0.8599558014707467
[[107967  41710]
 [  2286 600059]]
F1 score:  0.9646366812044556
precision_score:  0.9350077675923891
average_precision_score:  0.9291126361304757
recall_score:  0.9962048327785571
roc:  0.9647346881425156
roc_weighted:  0.965500586336011


In [10]:
df_modded = df[['loan_status','loan_amnt','annual_inc','installment','int_rate','fico_range_high']]

df_modded['installment_to_annual_inc'] =  df_modded.annual_inc/df_modded.installment * 12
df_modded.drop(['annual_inc','installment'],axis=1,inplace=True)

In [12]:
Xm = df_modded.drop(['loan_status'], axis=1)
ym = df_modded.loan_status

Xm_train, Xm_test, ym_train, ym_test = train_test_split(Xm, ym, test_size=0.3, random_state=42)

rf_m = RandomForestClassifier(criterion= 'entropy', n_estimators= 101)
rf_m.fit(Xm_train, ym_train)
    
print('RANDOM FOREST')
scoring(rf_m, Xm_test, ym_test)

RANDOM FOREST
Loan passing rate: 0.8009672589365736
Balanced loan passing rate: 0.7855872316459166


score:  0.9088151676413722
balanced_accuracy_score:  0.8276923489943548
[[101746  47931]
 [ 20642 581703]]
F1 score:  0.9443391486380854
precision_score:  0.9238748225159378
average_precision_score:  0.914873392617448
recall_score:  0.9657306028936905
roc:  0.913568553286728
roc_weighted:  0.9160013070955311
