# Neural Network - Lending Club Loan Default Prediction

Neural networks are as powerful and as popular as gradient boosting machines (GBM).

In [2]:
#Neural Network - Deep Borkar
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn.model_selection import train_test_split

import pickle
from tqdm import tqdm_notebook as tqdm
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the modelling dataset; index_col=0 makes the first CSV column the row index.
model_data = pd.read_csv('data/existing_customers.csv', index_col=0)

In [4]:
# Preview the first rows to sanity-check the loaded columns.
model_data.head()

Unnamed: 0,total_pymnt,out_prncp,loan_amnt,installment,total_rec_late_fee,int_rate,term,dti,revol_util,annual_inc,revol_bal,bc_open_to_buy,tot_hi_cred_lim,mo_sin_old_rev_tl_op,bc_util,loan_status
39733,572.57,12688.29,13000,295.66,0.0,12.98,1,21.9,47.2,52000.0,16320,9280.0,140185.0,173.0,63.8,0
39734,750.81,9095.92,9450,382.31,0.0,26.31,0,15.06,23.5,42000.0,14791,23518.0,283111.0,209.0,38.3,0
39735,305.19,2363.11,2500,104.39,0.0,28.72,0,11.38,82.5,142000.0,4540,0.0,48167.0,275.0,100.9,0
39739,592.85,8559.71,9000,285.41,0.0,8.81,0,12.77,7.2,75900.0,2558,20301.0,68088.0,221.0,9.8,0
39741,839.58,7476.28,8000,284.87,0.0,16.91,0,39.43,41.2,42000.0,18572,12191.0,69683.0,99.0,55.5,0


In [5]:
# Separate the target column (`loan_status`) from the predictor columns.
# `columns=` is used rather than a positional axis argument: passing `axis`
# positionally to DataFrame.drop was deprecated in pandas 1.3 and is rejected
# by pandas 2.0 and later.
Y = model_data['loan_status']
X = model_data.drop(columns='loan_status')

In [6]:
# Hold out 40% of the rows for evaluation. The split is seeded so that a
# Restart & Run All reproduces the same train/test partition (and therefore
# comparable metrics) on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.4, random_state=42
)

In [7]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# The scaler's statistics are learned from the training split only and then
# applied unchanged to both splits, so nothing from the test split influences
# the transformation.
sc = StandardScaler()
sc.fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

In [8]:
# Convert both label Series to plain NumPy arrays (this drops the pandas index).
y_train, y_test = (np.array(labels) for labels in (y_train, y_test))

In [9]:
import tensorflow.keras
from tensorflow.keras import backend
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

### Parameter Tuning 

We perform a grid search to find good hyper-parameters for the neural network.<br> Here we look for the best combination of batch size, number of epochs, and optimizer.

In [10]:
# fine tuning with Grid Search
# The scikit-learn wrapper is imported from tensorflow.keras so that it belongs
# to the same Keras implementation as the Sequential/Dense classes used below;
# taking it from the standalone `keras` package would mix two separate
# implementations in one pipeline.
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

def build_classifier(optimizer):
    """Build and compile the binary classifier used by the grid search.

    The network is a stack of two 6-unit ReLU hidden layers followed by a
    single sigmoid output unit, compiled with binary cross-entropy loss and
    accuracy as the reported metric.

    Parameters
    ----------
    optimizer : str or optimizer instance
        Passed straight to ``compile``; the grid search supplies it by name.

    Returns
    -------
    Sequential
        The compiled (not yet fitted) model.
    """
    # A NN is a sequence of layers, so the layers are listed in order.
    classifier = Sequential([
        # first hidden layer
        Dense(units=6, kernel_initializer='glorot_uniform', activation='relu'),
        # second hidden layer
        Dense(units=6, kernel_initializer='glorot_uniform', activation='relu'),
        # output layer: one sigmoid unit
        Dense(units=1, kernel_initializer='glorot_uniform', activation='sigmoid'),
    ])
    # compiling the NN
    classifier.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return classifier

Using TensorFlow backend.


In [11]:
# Set to True to run the hyper-parameter search; it is disabled by default so
# that a full notebook run skips it.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    classifier = KerasClassifier(build_fn=build_classifier)

    # create a dictionary of hyper-parameters to optimize
    # `epochs` is the argument name accepted by Keras 2's fit(); the legacy
    # Keras 1 spelling `nb_epoch` is not a fit() argument, so the wrapper
    # cannot route it to training.
    parameters = {
        'batch_size': [10, 20, 30],
        'epochs': [1, 2, 3],
        'optimizer': ['adam', 'rmsprop'],
    }
    grid_search = GridSearchCV(estimator=classifier, param_grid=parameters,
                               scoring='accuracy', cv=3)

    # Only the last 5% of the training rows are searched over — presumably to
    # keep the run time manageable.
    subset_start = int(0.95 * len(X_train_sc))
    grid_search = grid_search.fit(X_train_sc[subset_start:], y_train[subset_start:], verbose=2)

    best_parameters = grid_search.best_params_
    best_accuracy = grid_search.best_score_

In [12]:
# Final network: two 5-unit ReLU hidden layers and one sigmoid output unit,
# listed in the order the data flows through them.
classifier = Sequential([
    # first hidden layer
    Dense(units=5, kernel_initializer='glorot_uniform', activation='relu'),
    # second hidden layer
    Dense(units=5, kernel_initializer='glorot_uniform', activation='relu'),
    # output layer
    Dense(units=1, kernel_initializer='glorot_uniform', activation='sigmoid'),
])

# compiling the NN
classifier.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [13]:
# Train on the scaled training split; verbose=2 logs one summary line per epoch.
classifier.fit(
    X_train_sc,
    y_train,
    batch_size=10,
    epochs=3,
    verbose=2,
)

Train on 1332320 samples
Epoch 1/3
1332320/1332320 - 224s - loss: 0.0578 - accuracy: 0.9851
Epoch 2/3
1332320/1332320 - 218s - loss: 0.0525 - accuracy: 0.9885
Epoch 3/3
1332320/1332320 - 218s - loss: 0.0509 - accuracy: 0.9881


<tensorflow.python.keras.callbacks.History at 0x2a4dd42d940>

In [14]:
# Print the layer-by-layer architecture and parameter counts of the model.
classifier.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                multiple                  80        
_________________________________________________________________
dense_1 (Dense)              multiple                  30        
_________________________________________________________________
dense_2 (Dense)              multiple                  6         
Total params: 116
Trainable params: 116
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Score the scaled test split. `predict` already returns the sigmoid output,
# i.e. the predicted probability; `Sequential.predict_proba` was only a
# deprecated alias for it and is absent from later TensorFlow releases.
test_probabilities = classifier.predict(X_test_sc)

# Turn probabilities into hard class labels with a 0.5 decision threshold.
y_pred = test_probabilities > 0.5

In [16]:
from sklearn.metrics import confusion_matrix

# Compare true test labels with thresholded predictions; in scikit-learn's
# convention rows are the true classes and columns the predicted classes.
confusion_matrix(y_test, y_pred)

array([[782286,    508],
       [ 10763,  94657]], dtype=int64)

In [17]:
# Predicted default probability for every test row. `predict` returns the
# sigmoid output directly; the deprecated `predict_proba` alias is absent from
# later TensorFlow releases.
default_probs = classifier.predict(X_test_sc)

In [18]:
# Assemble the table used for the business analysis: predicted default
# probability, loan amount, term and the true label for each test row.
business_NN = pd.DataFrame(
    {
        'Default_Prob': default_probs[:, 0],
        'Loan_amt': X_test['loan_amnt'],
        'term': X_test['term'],
        'target': y_test,
    }
)

In [19]:
# Persist the per-loan predictions for the downstream business analysis.
# NOTE(review): assumes the `business_analysis_data/` directory already exists
# relative to the working directory — confirm, or create it before this cell.
business_NN.to_csv('business_analysis_data/business_NN.csv')