Finding the best way to fill the nans. Should it be filled with 0s or with the average of the respective feature?

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('lending_club_ml.csv')
df.drop(['id'], inplace=True, axis=1)

In [4]:
# creating columns for features that had more than 2 outcomes
# I have decided to use subgrade and not grade as these are suppose to be similar features but, subgrade is more granular

df = pd.concat([df, pd.get_dummies(
    df.sub_grade, prefix='sub_grade', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.home_ownership,
                                   prefix='home_ownership', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.verification_status,
                                   prefix='verification_status', drop_first=True)], axis=1)
df = pd.concat(
    [df, pd.get_dummies(df.purpose, prefix='purpose', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.verification_status_joint,
                                   prefix='verification_status_joint', drop_first=True)], axis=1)

df.drop(columns=['sub_grade', 'home_ownership', 'verification_status',
                 'purpose', 'verification_status_joint'], inplace=True)


# conversions for features that only had 2 outcomes

df.disbursement_method = df.disbursement_method.apply(
    lambda disburstment: 1 if disburstment == 'Cash' else 0)

df.application_type = df.application_type.apply(
    lambda app_type: 1 if app_type == 'Individual' else 0)

In [5]:
df_all0 = df.copy()

In [6]:
# if borrower is an indivudal then their secondary features are filled with their single counterpart

sec_list = ['sec_app_fico_range_low', 'sec_app_fico_range_high', 'sec_app_inq_last_6mths', 'sec_app_mort_acc',
            'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il','sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths',
            'sec_app_collections_12_mths_ex_med', 'sec_app_mths_since_last_major_derog']
joint_list = ['dti_joint', 'revol_bal_joint', 'annual_inc_joint']

for secondary in sec_list:
    df.loc[df.application_type == 1,secondary] = df.loc[df.application_type == 1][secondary[8:]]


for joint in joint_list:
    df.loc[df.application_type == 1,joint] = df.loc[df.application_type == 1][joint[:len(joint)-6]]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2506738 entries, 0 to 2506737
Columns: 148 entries, loan_amnt to verification_status_joint_Verified
dtypes: float64(89), int64(5), object(1), uint8(53)
memory usage: 1.9+ GB


In [7]:
#fill all nans with averages of respective feature

df_global = df.copy()

for thing in df_global:
    if df_global[thing].isnull().values.any():
        df_global[thing].fillna(np.mean(df_global[thing]), inplace=True)

In [8]:
#fill all nans with 0
df_global.fillna(0, inplace=True)
df_all0.fillna(0, inplace=True)

In [9]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score


def scoring(clf, x, y):
    
    #Baseline
    print('Loan passing rate:', np.mean(y))
    print('Balanced loan passing rate:', np.average(y, weights=x['loan_amnt']))
    print('\n')
    
    
    print('score: ', clf.score(x, y))
    # score adjusted for loan amount
    print('balanced_accuracy_score: ', balanced_accuracy_score(
        y, clf.predict(x), sample_weight=x['loan_amnt']))

    print(confusion_matrix(y, clf.predict(x)))
    
    print('F1 score: ', f1_score(y, clf.predict(x)))
    
    print('precision_score: ', precision_score(y, clf.predict(x)))

    # score adjusted for loan amount
    print('average_precision_score: ', average_precision_score(
        y, clf.predict(x), average='weighted', sample_weight=x['loan_amnt']))
    
    print('recall_score: ', recall_score(y, clf.predict(x)))

    print('roc: ', roc_auc_score(y, clf.predict_proba(x)[:, 1]))

    # score adjusted for loan amount
    print('roc_weighted: ', roc_auc_score(y, clf.predict_proba(x)[
          :, 1], average='weighted', sample_weight=x['loan_amnt']))

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = df_global.drop(['grade','loan_status'], axis=1)
y = df_global.loan_status

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf_best = RandomForestClassifier(criterion= 'entropy', n_estimators= 101)
rf_best.fit(X_train, y_train)
    
scoring(rf_best, X_test, y_test)

Loan passing rate: 0.8009672589365736
Balanced loan passing rate: 0.7855872316459166


score:  0.941541603836058
balanced_accuracy_score:  0.8602047804170503
[[108065  41612]
 [  2350 599995]]
F1 score:  0.9646594080800546
precision_score:  0.9351440991136318
average_precision_score:  0.9292314399682938
recall_score:  0.9960985813777818
roc:  0.9644059362464467
roc_weighted:  0.9652058267062085


In [11]:
X0 = df_all0.drop(['grade','loan_status'], axis=1)
y0 = df_all0.loan_status

X0_train, X0_test, y0_train, y0_test = train_test_split(X0, y0, test_size=0.3, random_state=42)

rf0_best = RandomForestClassifier(criterion= 'entropy', n_estimators= 101)
rf0_best.fit(X0_train, y0_train)
    
print('RANDOM FOREST')

scoring(rf0_best, X0_test, y0_test)

RANDOM FOREST
Loan passing rate: 0.8009672589365736
Balanced loan passing rate: 0.7855872316459166


score:  0.9415562310677081
balanced_accuracy_score:  0.8601988309706627
[[108010  41667]
 [  2284 600061]]
F1 score:  0.9646716872723706
precision_score:  0.935070621821083
average_precision_score:  0.9292277605564438
recall_score:  0.9962081531348314
roc:  0.9647164436635872
roc_weighted:  0.9655456417876066


In [12]:
scoring(rf_best, X0_test, y0_test)
scoring(rf0_best, X_test, y_test)

Loan passing rate: 0.8009672589365736
Balanced loan passing rate: 0.7855872316459166


score:  0.8079324806987029
balanced_accuracy_score:  0.5943855683977545
[[ 31680 117997]
 [ 26442 575903]]
F1 score:  0.8885712191753873
precision_score:  0.8299510015852428
average_precision_score:  0.8188008540209637
recall_score:  0.9561015696984286
roc:  0.739744197418488
roc_weighted:  0.7478246696409757
Loan passing rate: 0.8009672589365736
Balanced loan passing rate: 0.7855872316459166


score:  0.8570480650832023
balanced_accuracy_score:  0.6510727436734819
[[ 45088 104589]
 [  2914 599431]]
F1 score:  0.917708297451325
precision_score:  0.8514402999914775
average_precision_score:  0.8400250971028239
recall_score:  0.9951622409084495
roc:  0.8543565033812119
roc_weighted:  0.8647971196425178


I have decided to go with filling the nans with 0s. The classifier trained on the 0 seems to score better across both sets. Another reason I have decided to got with zero is because LC has been in the works of changing how the screen loan applicants. This may greatly skew the average.