# LendingClub Loan Prediction - Model Selection
## Author: Congjun Huang

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import loguniform as sp_loguniform
from scipy.stats import randint as sp_randint

import warnings
warnings.filterwarnings('ignore')

In [2]:
# get training and test data
X_train = pd.read_csv('X_train.csv')
X_train.index = X_train['Unnamed: 0']
X_train = X_train.drop(['Unnamed: 0'], axis=1)
X_train.index.name = ''

y_train = pd.read_csv('y_train.csv')
y_train.index = y_train['Unnamed: 0']
y_train = y_train.drop(['Unnamed: 0'], axis=1)
y_train.index.name = ''

X_test = pd.read_csv('X_test.csv')
X_test.index = X_test['Unnamed: 0']
X_test = X_test.drop(['Unnamed: 0'], axis=1)
X_test.index.name = ''

y_test = pd.read_csv('y_test.csv')
y_test.index = y_test['Unnamed: 0']
y_test = y_test.drop(['Unnamed: 0'], axis=1)
y_test.index.name = ''

In [3]:
# logistic regression random search
params_lr = {'penalty': ['l1','l2'],
             'C': sp_loguniform(1e-4,1e2),
             'fit_intercept': [True, False]}

lr_cv = RandomizedSearchCV(
        LogisticRegression(),
        params_lr,
        n_iter=10,
        cv=StratifiedKFold(5, shuffle=True, random_state=123),
        scoring='roc_auc',
        n_jobs=-1,
        random_state=123).fit(
        X_train, y_train)

In [4]:
# record best estimator，parameters and best score
print(lr_cv.best_estimator_)
print(lr_cv.best_params_)
print(lr_cv.best_score_)

LogisticRegression(C=76.66289057556017)
{'C': 76.66289057556017, 'fit_intercept': True, 'penalty': 'l2'}
0.7093442712266047


In [5]:
# neural network random search
hl = []
for i in [1,2,3,4,5,10,15,25,30]:
    hl.append(i)
    hl.append((i,i))
    
params_mlp = {'hidden_layer_sizes': hl,
              'alpha': sp_loguniform(1e-4,1e2),
              'batch_size': [1,3,5,10,20,50,100,250,500],
              'learning_rate_init': sp_loguniform(1e-4,1e2)}

mlp_cv = RandomizedSearchCV(
         MLPClassifier(),
         params_mlp,
         n_iter=10,
         cv=StratifiedKFold(5, shuffle=True, random_state=123),
         scoring='roc_auc',
         n_jobs=-1,
         random_state=123).fit(
         X_train, y_train)

In [6]:
# record best estimator，parameters and best score
print(mlp_cv.best_estimator_)
print(mlp_cv.best_params_)
print(mlp_cv.best_score_)

MLPClassifier(alpha=0.0011051954732269518, batch_size=100,
              hidden_layer_sizes=(15, 15),
              learning_rate_init=0.0002900807334178909)
{'alpha': 0.0011051954732269518, 'batch_size': 100, 'hidden_layer_sizes': (15, 15), 'learning_rate_init': 0.0002900807334178909}
0.7269344667965878


In [7]:
# random forest random search   
params_rf = {'n_estimators': [100,120,200,300,500,800,1200],
             'max_depth': [None,5,8,15,25,30],
             'min_samples_split': [1,2,5,10,15,100],
             'min_samples_leaf': [1,2,5,10],
             'max_features': ['sqrt','log2',None]}

rf_cv = RandomizedSearchCV(
        RandomForestClassifier(),
        params_rf,
        n_iter=10,
        cv=StratifiedKFold(5, shuffle=True, random_state=123),
        scoring='roc_auc',
        n_jobs=-1,
        random_state=123).fit(
        X_train, y_train)

In [8]:
# record best estimator，parameters and best score
print(rf_cv.best_estimator_)
print(rf_cv.best_params_)
print(rf_cv.best_score_)

RandomForestClassifier(max_features='sqrt', min_samples_leaf=5,
                       n_estimators=800)
{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': None}
0.7270418663460392


In [9]:
# adaboost random search
params_ada = {'n_estimators': [100,120,200,300,500,800,1200],
              'learning_rate': sp_loguniform(1e-4,1e2)}

ada_cv = RandomizedSearchCV(
         AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5), random_state=123),
         params_ada,
         n_iter=10,
         cv=StratifiedKFold(5, shuffle=True, random_state=123),
         scoring='roc_auc',
         n_jobs=-1,
         random_state=123).fit(
         X_train, y_train)

In [10]:
# record best estimator，parameters and best score
print(ada_cv.best_estimator_)
print(ada_cv.best_params_)
print(ada_cv.best_score_)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5),
                   learning_rate=0.03722421578735048, n_estimators=200,
                   random_state=123)
{'learning_rate': 0.03722421578735048, 'n_estimators': 200}
0.7272116849944383


In [11]:
# gradientboost random search
params_gb = {'n_estimators': [100,120,200,300,500,800,1200],
             'learning_rate': sp_loguniform(1e-4,1e2),
             'max_depth': [None,5,8,15,25,30]}

gb_cv = RandomizedSearchCV(
        GradientBoostingClassifier(),
        params_gb,
        n_iter=10,
        cv=StratifiedKFold(5, shuffle=True, random_state=123),
        scoring='roc_auc',
        n_jobs=-1,
        random_state=123).fit(
        X_train, y_train)

In [12]:
# record best estimator，parameters and best score
print(gb_cv.best_estimator_)
print(gb_cv.best_params_)
print(gb_cv.best_score_)

GradientBoostingClassifier(learning_rate=0.029210987323232726, max_depth=5,
                           n_estimators=1200)
{'learning_rate': 0.029210987323232726, 'max_depth': 5, 'n_estimators': 1200}
0.7310230778059281


In [13]:
# xgboost random search
params_xgb = {'n_estimators': [100,200,300,500,800,1000],
              'learning_rate': sp_loguniform(1e-4,1e-1),
              'max_depth': [3,5,7,9,12,15,25],
              'min_child_weight': [1,3,5,7]}

xgb_cv = RandomizedSearchCV(
         XGBClassifier(),
         params_xgb,
         n_iter=10,
         cv=StratifiedKFold(5, shuffle=True, random_state=123),
         scoring='roc_auc',
         n_jobs=-1,
         random_state=123).fit(
         X_train, y_train)



In [14]:
# record best estimator，parameters and best score
print(xgb_cv.best_estimator_)
print(xgb_cv.best_params_)
print(xgb_cv.best_score_)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.014700433699563096,
              max_delta_step=0, max_depth=5, min_child_weight=3, missing=nan,
              monotone_constraints='()', n_estimators=500, n_jobs=16,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
{'learning_rate': 0.014700433699563096, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 500}
0.7299263605202484


In [19]:
# comparision of best score (auc)
value = [[lr_cv.best_score_],[mlp_cv.best_score_],[rf_cv.best_score_],
       [ada_cv.best_score_],[gb_cv.best_score_],[xgb_cv.best_score_]]
model = ['Logistic Regression','Neural Network','Random Forest',
         'AdaBoost','Gradient Boosting','XGBoost']
bs = pd.DataFrame(value, columns=['Best Score (AUC)'], index=model)
bs

Unnamed: 0,Best Score (AUC)
Logistic Regression,0.709344
Neural Network,0.726934
Random Forest,0.727042
AdaBoost,0.727212
Gradient Boosting,0.731023
XGBoost,0.729926


In [20]:
# select Gradient Boosting with best score (AUC) 0.731 as best model
best_model = gb_cv.best_estimator_ 

In [21]:
# pickle out best model
with open('best_model.pickle', 'wb') as file:
    pickle.dump(best_model, file)