This is best thought of as a "appendix" to loan_ml. I am trying to investigate the usefulness of a logistic regression (logit) since it seems to have abysmal to no predictive value.

Here I will try to find if I improve the logit function through a combination of preprocessing and hyper parameter tuning.

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('lending_club_ml.csv')
df.drop(['id'], inplace=True, axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2506738 entries, 0 to 2506737
Data columns (total 100 columns):
loan_amnt                              float64
term                                   int64
int_rate                               float64
installment                            float64
grade                                  object
sub_grade                              object
emp_length                             int64
home_ownership                         object
annual_inc                             float64
verification_status                    object
loan_status                            int64
purpose                                object
dti                                    float64
delinq_2yrs                            float64
fico_range_low                         float64
fico_range_high                        float64
inq_last_6mths                         float64
mths_since_last_delinq                 float64
mths_since_last_record                 float64
open

In [4]:
#For normalizing

numeric=[]

for column in df:
    if ((df[column].dtypes == 'float64') | (df[column].dtypes=='int64')):
        numeric.append(column)
numeric.remove('loan_status')

This is basically the same code from loan_ml

In [5]:
#if a loan was an indivdual loan then we will fill the all joint-realted features with its self

df.application_type = df.application_type.apply(
    lambda app_type: 1 if app_type == 'Individual' else 0)

sec_list = ['sec_app_fico_range_low', 'sec_app_fico_range_high', 'sec_app_inq_last_6mths', 'sec_app_mort_acc',
            'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths',
            'sec_app_collections_12_mths_ex_med', 'sec_app_mths_since_last_major_derog', 'sec_app_open_act_il']
joint_list = ['dti_joint', 'revol_bal_joint', 'annual_inc_joint']

for secondary in sec_list:
    df.loc[df.application_type ==
           1][secondary] = df.loc[df.application_type == 1][secondary[8:]]

for joint in joint_list:
    df.loc[df.application_type ==
           1][joint] = df.loc[df.application_type == 1][joint[:len(joint)-6]]

In [6]:
#Dummy variables for catergorical features

df = pd.concat([df, pd.get_dummies(
    df.sub_grade, prefix='sub_grade', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.home_ownership,
                                   prefix='home_ownership', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.verification_status,
                                   prefix='verification_status', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.purpose, prefix='purpose', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df.verification_status_joint,
                                   prefix='verification_status_joint', drop_first=True)], axis=1)

df.disbursement_method = df.disbursement_method.apply(
    lambda disburstment: 1 if disburstment == 'Cash' else 0)
df.application_type = df.application_type.apply(
    lambda app_type: 1 if app_type == 'Individual' else 0)

df.drop(columns=['sub_grade', 'home_ownership', 'verification_status',
                 'purpose', 'verification_status_joint'], inplace=True)

In [7]:
#fill nans with the average

df_avg=df

for thing in df_avg:
    if df_avg[thing].isnull().values.any():
        df_avg[thing].fillna(np.mean(df_avg[thing]), inplace=True)

In [8]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

def scoring(clf, x, y):
    print('score: ', (clf.score(x, y)))
    
    #score adjusted for loan amount
    print('balanced_accuracy_score: ', balanced_accuracy_score(y, clf.predict(x), sample_weight=x['loan_amnt']))

    print(confusion_matrix(y, clf.predict(x)))
    print('F1 score: ', f1_score(y, clf.predict(x)))
    print('precision_score: ',precision_score(y, clf.predict(x)))
    
    #score adjusted for loan amount
    print('average_precision_score: ', average_precision_score(y, clf.predict(x), average='weighted', sample_weight=x['loan_amnt']))
    print('recall_score: ', recall_score(y, clf.predict(x)))
    
    #score adjusted for loan amount
    print('roc: ', roc_auc_score(y, clf.predict_proba(x)[:, 1], average='weighted', sample_weight=x['loan_amnt']))

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_avg = df_avg.drop(['loan_status','grade'], axis=1)
y_avg = df_avg.loan_status

X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(
X_avg, y_avg, test_size=0.3, random_state=42)

logit = LogisticRegression()

logit.fit(X_avg_train, y_avg_train)
print('LOGISTIC REGRESSION')
scoring(logit, X_avg_test, y_avg_test)


LOGISTIC REGRESSION
score:  0.8009446532149326
balanced_accuracy_score:  0.5002377383591609
[[    56 149621]
 [    73 602272]]
F1 score:  0.8894625612336975
precision_score:  0.8010075901757298
average_precision_score:  0.7856673290807463
recall_score:  0.9998788069959906
roc:  0.6443311934446472


In [10]:
from sklearn.ensemble import RandomForestClassifier

X_avg = df_avg.drop(['loan_status','grade'], axis=1)
y_avg = df_avg.loan_status

X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(
X_avg, y_avg, test_size=0.3, random_state=42)

rf = RandomForestClassifier()

rf.fit(X_avg_train, y_avg_train)
print('RANDOM FOREST')
scoring(rf, X_avg_test, y_avg_test)

RANDOM FOREST
score:  0.9281962495778049
balanced_accuracy_score:  0.8594551763957001
[[110676  39001]
 [ 14997 587348]]
F1 score:  0.9560525240621343
precision_score:  0.9377327975298116
average_precision_score:  0.9295429039132475
recall_score:  0.9751023084776996
roc:  0.9371857589959804


Random Forest vasts outperforms logistic regression. We will try to combine classifiers in an attempt to improve our scores. 

In [11]:
#with logit_predict_proba feature

X_avg_pred = pd.DataFrame(logit.predict_proba(X_avg)[:,1],columns =['logit_predict_proba'])
X_avg_ = pd.concat([X_avg,X_avg_pred],axis=1)

X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(
X_avg_, y_avg, test_size=0.3, random_state=42)

rf = RandomForestClassifier()
rf.fit(X_avg_train,y_avg_train)
print('RANDOM FOREST')
scoring(rf, X_avg_test, y_avg_test)


RANDOM FOREST
score:  0.9280845507179312
balanced_accuracy_score:  0.8599772617037034
[[110611  39066]
 [ 15016 587329]]
F1 score:  0.9559858065986295
precision_score:  0.9376336018007807
average_precision_score:  0.9297987138523454
recall_score:  0.9750707650930945
roc:  0.9379349450913548


performs about the same, it maybe because there are too many features.

In [12]:
#just the predict_proba feature
df_rf = pd.concat([X_avg_pred,y_avg],axis=1)

X = pd.DataFrame(df_rf['logit_predict_proba'])
y = pd.DataFrame(df_rf['loan_status'])

X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(
X, y, test_size=0.3, random_state=42)

rf = RandomForestClassifier()
rf.fit(X_avg_train,y_avg_train)
print('RANDOM FOREST')
print('score: ', (rf.score(X_avg_test, y_avg_test)))
    
#print('balanced_accuracy_score: ', balanced_accuracy_score(y, clf.predict(x), sample_weight=x['loan_amnt']))

print(confusion_matrix(y_avg_test, rf.predict(X_avg_test)))
print('F1 score: ', f1_score(y_avg_test, rf.predict(X_avg_test)))
print('precision_score: ',precision_score(y_avg_test, rf.predict(X_avg_test)))
    
#print('average_precision_score: ', average_precision_score(y, clf.predict(x), average='weighted', sample_weight=x['loan_amnt']))
print('recall_score: ', recall_score(y_avg_test, rf.predict(X_avg_test)))
    
print('roc: ', roc_auc_score(y_avg_test, rf.predict_proba(X_avg_test)[:, 1]))

RANDOM FOREST
score:  0.8077742406472151
[[ 59983  89694]
 [ 54864 547481]]
F1 score:  0.8833758228991868
precision_score:  0.8592317652136383
recall_score:  0.9089159866853713
roc:  0.7816482831604685


Using just the logit results as a feature resulted in a much lower score. There were much more false positives and false negatives compared to the massive feature set.

In [13]:
X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(
X_avg, y_avg, test_size=0.3, random_state=42)

logit_balanced = LogisticRegression(class_weight = 'balanced')

logit_balanced.fit(X_avg_train, y_avg_train)
print('LOGISTIC REGRESSION')
scoring(logit_balanced, X_avg_test, y_avg_test)

LOGISTIC REGRESSION
score:  0.5913444021584475
balanced_accuracy_score:  0.6140717696508232
[[ 97559  52118]
 [255200 347145]]
F1 score:  0.6931753739986102
precision_score:  0.8694644883197291
average_precision_score:  0.8282011180636957
recall_score:  0.5763225394084786
roc:  0.6630771823758567


In [14]:
#trying with nans filled with 0

df_0 = df.fillna(0)

X_0 = df_0.drop(['loan_status','grade'], axis=1)
y_0 = df_0.loan_status

X_0_train, X_0_test, y_0_train, y_0_test = train_test_split(
X_0, y_0, test_size=0.3, random_state=42)

logit = LogisticRegression()
logit.fit(X_0_train, y_0_train)
print('LOGISTIC REGRESSION')
scoring(logit, X_0_test, y_0_test)

LOGISTIC REGRESSION
score:  0.8009446532149326
balanced_accuracy_score:  0.5002377383591609
[[    56 149621]
 [    73 602272]]
F1 score:  0.8894625612336975
precision_score:  0.8010075901757298
average_precision_score:  0.7856673290807463
recall_score:  0.9998788069959906
roc:  0.6443311934446472


Did not even matter, will try normalizing then hyper tuning 'C'

In [15]:
X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(
X_avg, y_avg, test_size=0.3, random_state=42)

logit_c_0001 = LogisticRegression(C=.0001)

logit_c_0001.fit(X_avg_train, y_avg_train)
print('LOGISTIC REGRESSION')
scoring(logit_c_0001, X_avg_test, y_avg_test)

LOGISTIC REGRESSION
score:  0.8008635385666909
balanced_accuracy_score:  0.5006189990261432
[[   152 149525]
 [   230 602115]]
F1 score:  0.8893968544703228
precision_score:  0.8010683305837901
average_precision_score:  0.7857958153749027
recall_score:  0.9996181590284637
roc:  0.6554825379834309


In [16]:
X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(
X_avg, y_avg, test_size=0.3, random_state=42)

logit_c_01 = LogisticRegression(C=.01)

logit_c_01.fit(X_avg_train, y_avg_train)
print('LOGISTIC REGRESSION')
scoring(logit_c_01, X_avg_test, y_avg_test)

LOGISTIC REGRESSION
score:  0.8009300259832824
balanced_accuracy_score:  0.5004424141715003
[[   103 149574]
 [   131 602214]]
F1 score:  0.8894458668387817
precision_score:  0.8010423151207521
average_precision_score:  0.7857363003694248
recall_score:  0.9997825166640381
roc:  0.6514327242621893


In [17]:
X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(
X_avg, y_avg, test_size=0.3, random_state=42)

logit_c_1 = LogisticRegression(C=.1)

logit_c_1.fit(X_avg_train, y_avg_train)
print('LOGISTIC REGRESSION')
scoring(logit_c_1, X_avg_test, y_avg_test)

LOGISTIC REGRESSION
score:  0.800956620949919
balanced_accuracy_score:  0.5002725321329511
[[    60 149617]
 [    68 602277]]
F1 score:  0.8894692886558428
precision_score:  0.8010131747294167
average_precision_score:  0.7856790529684031
recall_score:  0.9998871078866762
roc:  0.6449016175844058


In [18]:
X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(
X_avg, y_avg, test_size=0.3, random_state=42)

logit_c_10 = LogisticRegression(C=10)

logit_c_10.fit(X_avg_train, y_avg_train)
print('LOGISTIC REGRESSION')
scoring(logit_c_10, X_avg_test, y_avg_test)

LOGISTIC REGRESSION
score:  0.8009419937182689
balanced_accuracy_score:  0.5002239402258125
[[    57 149620]
 [    76 602269]]
F1 score:  0.8894607578896999
precision_score:  0.8010078615327528
average_precision_score:  0.7856626798467449
recall_score:  0.9998738264615793
roc:  0.640807180050564


In [19]:
X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(
X_avg, y_avg, test_size=0.3, random_state=42)

logit_c_100 = LogisticRegression(C=100)

logit_c_100.fit(X_avg_train, y_avg_train)
print('LOGISTIC REGRESSION')
scoring(logit_c_100, X_avg_test, y_avg_test)

LOGISTIC REGRESSION
score:  0.8009326854799461
balanced_accuracy_score:  0.5002524657712254
[[    64 149613]
 [    90 602255]]
F1 score:  0.8894538746858877
precision_score:  0.8010116137407098
average_precision_score:  0.7856722914932446
recall_score:  0.9998505839676597
roc:  0.6424014797915372


In [20]:
X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(
X_avg, y_avg, test_size=0.3, random_state=42)

logit_c_1000 = LogisticRegression(C=1000)

logit_c_1000.fit(X_avg_train, y_avg_train)
print('LOGISTIC REGRESSION')
scoring(logit_c_1000, X_avg_test, y_avg_test)

LOGISTIC REGRESSION
score:  0.8009180582482959
balanced_accuracy_score:  0.500493372619013
[[   112 149565]
 [   149 602196]]
F1 score:  0.8894370160090865
precision_score:  0.801047141312199
average_precision_score:  0.7857534741417198
recall_score:  0.99975263345757
roc:  0.65190125034441


In [21]:
X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(
X_avg, y_avg, test_size=0.3, random_state=42)

logit_c_10000 = LogisticRegression(C=10000)

logit_c_10000.fit(X_avg_train, y_avg_train)
print('LOGISTIC REGRESSION')
scoring(logit_c_10000, X_avg_test, y_avg_test)

LOGISTIC REGRESSION
score:  0.8009300259832824
balanced_accuracy_score:  0.5003103162678659
[[    74 149603]
 [   102 602243]]
F1 score:  0.8894506018722618
precision_score:  0.8010190916756889
average_precision_score:  0.7856917848676329
recall_score:  0.9998306618300143
roc:  0.6493744955895291


In [22]:
'''from sklearn.model_selection import GridSearchCV

parameters = {'C':np.logspace(np.log10(.0001),np.log10(10000),5)}

logit_cv = GridSearchCV(logit, parameters, cv=5)

logit_cv.fit(X_avg,y_avg)

logit_cv.best_score_'''
#took too long to run


"from sklearn.model_selection import GridSearchCV\n\nparameters = {'C':np.logspace(np.log10(.0001),np.log10(10000),5)}\n\nlogit_cv = GridSearchCV(logit, parameters, cv=5)\n\nlogit_cv.fit(X_avg,y_avg)\n\nlogit_cv.best_score_"

In [23]:
from sklearn.preprocessing import StandardScaler

trans_norm = StandardScaler(copy = False)

df_avg=df

for thing in df_avg:
    if df_avg[thing].isnull().values.any():
        df_avg[thing].fillna(np.mean(df_avg[thing]), inplace=True)
        
X_avg = df_avg.drop(['loan_status','grade'], axis=1)
y_avg = df_avg.loan_status

for number in numeric:
    X_avg[number] = trans_norm.fit_transform(np.array(X_avg[number]).reshape(-1,1))

X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(X_avg, y_avg, test_size=0.3, random_state=42)

logit = LogisticRegression()
logit.fit(X_avg_train, y_avg_train)
print('LOGISTIC REGRESSION')
scoring(logit, X_avg_test, y_avg_test)


LOGISTIC REGRESSION
score:  0.8044511995659701
balanced_accuracy_score:  0.8542232944913832
[[ 11325 138352]
 [  8705 593640]]
F1 score:  0.889790210419107
precision_score:  0.8109924698630586
average_precision_score:  86.64655048676774
recall_score:  0.9855481493164217


ValueError: x is neither increasing nor decreasing : [0. 0. 0. ... 1. 1. 1.].

In [None]:
'''from sklearn.linear_model import LogisticRegressionCV

clf = LogisticRegressionCV(cv=5,solver='liblinear', Cs=np.logspace(np.log10(.0001),np.log10(10000),5),scoring='roc_auc')

clf.fit(X_avg,y_avg)
print(clf.scores_)

clf.fit(X_0,y_0)
print(clf.scores_)'''
#takes forever to run