In [1]:
# importing packages we'll need
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn import linear_model
import statsmodels.api as sm
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn import ensemble
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

In [2]:
# reading the data set
df = pd.read_csv('loan.csv', dtype = {'issue_d': str}, low_memory=False)
# https://www.kaggle.com/wendykan/lending-club-loan-data

In [4]:
Y = df['sub_grade']

In [5]:
manual_feat_list = ['int_rate', 'all_util', 'percent_bc_gt_75', 'dti_joint', 'num_tl_op_past_12m', 'inq_last_6mths', 'open_il_12m', 'bc_open_to_buy', 'total_bc_limit', 'mo_sin_old_rev_tl_op', 'sec_app_mort_acc', 'annual_inc_joint', 'pub_rec_bankruptcies', 'term']

In [6]:
X = pd.DataFrame()
X = df.filter(items=manual_feat_list)
X.head()

Unnamed: 0,int_rate,all_util,percent_bc_gt_75,dti_joint,num_tl_op_past_12m,inq_last_6mths,open_il_12m,bc_open_to_buy,total_bc_limit,mo_sin_old_rev_tl_op,sec_app_mort_acc,annual_inc_joint,pub_rec_bankruptcies,term
0,13.56,28.0,0.0,,3,1.0,1.0,34360.0,36500,212,,,1,36 months
1,18.94,57.0,0.0,,6,0.0,2.0,13761.0,15000,378,,,1,60 months
2,17.97,35.0,0.0,,0,0.0,0.0,13800.0,13800,92,,,0,36 months
3,18.94,70.0,100.0,,3,0.0,3.0,1239.0,5000,154,,,0,36 months
4,16.14,54.0,0.0,,5,0.0,3.0,8471.0,9300,216,,,0,60 months


In [7]:
X.dtypes

int_rate                float64
all_util                float64
percent_bc_gt_75        float64
dti_joint               float64
num_tl_op_past_12m        int64
inq_last_6mths          float64
open_il_12m             float64
bc_open_to_buy          float64
total_bc_limit            int64
mo_sin_old_rev_tl_op      int64
sec_app_mort_acc        float64
annual_inc_joint        float64
pub_rec_bankruptcies      int64
term                     object
dtype: object

In [8]:
X = pd.concat([X, pd.get_dummies(X['term'], drop_first=True)], axis=1)

In [9]:
X = X.drop('term', 1)

In [10]:
X.head()

Unnamed: 0,int_rate,all_util,percent_bc_gt_75,dti_joint,num_tl_op_past_12m,inq_last_6mths,open_il_12m,bc_open_to_buy,total_bc_limit,mo_sin_old_rev_tl_op,sec_app_mort_acc,annual_inc_joint,pub_rec_bankruptcies,60 months
0,13.56,28.0,0.0,,3,1.0,1.0,34360.0,36500,212,,,1,0
1,18.94,57.0,0.0,,6,0.0,2.0,13761.0,15000,378,,,1,1
2,17.97,35.0,0.0,,0,0.0,0.0,13800.0,13800,92,,,0,0
3,18.94,70.0,100.0,,3,0.0,3.0,1239.0,5000,154,,,0,0
4,16.14,54.0,0.0,,5,0.0,3.0,8471.0,9300,216,,,0,1


In [11]:
X.fillna(0, inplace=True)

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 465)

## Neural Network 1

In [13]:
# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model, with a single, 100 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(100,))
mlp.fit(X_test, Y_test)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [15]:
mlp.score(X_test, Y_test)

0.18215745738681519

In [16]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X_test, Y_test, cv=5)

array([0.1719306 , 0.15734444, 0.18065931, 0.06604554, 0.16632744])

## Neural Network 2

In [19]:
mlp2 = MLPClassifier(hidden_layer_sizes=(10, 10,))
mlp2.fit(X_test, Y_test)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [20]:
mlp2.score(X_test, Y_test)

0.06540273512750577

## Neural Network 3

In [21]:
mlp3 = MLPClassifier(hidden_layer_sizes=(100, 10,))
mlp3.fit(X_test, Y_test)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [22]:
mlp3.score(X_test, Y_test)

0.0654535975636893

## Neural Network 4

In [25]:
mlp4 = MLPClassifier(hidden_layer_sizes=(200,))
mlp4.fit(X_test, Y_test)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(200,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [26]:
mlp4.score(X_test, Y_test)

0.13397165690743673

## Random Forest Classifier

In [17]:
rfc = ensemble.RandomForestClassifier()
rfc.fit(X_test,Y_test)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
cross_val_score(rfc, X_test, Y_test, cv = 5)

array([0.74933274, 0.70113774, 0.75312331, 0.72742305, 0.75034983])

## Summary

The random forest classifier dominates the neural network classifier in my opinion. The random forest classifier took only seconds to run, whereas most of the neural network classifiers here took minutes to run, and I tried to run several more with larger layer sizes that never finished running. Not only is the random forest classifier more time efficient, but the R squared of 0.7 to 0.75 completely dominates the scores of the neural networks, which ranged from 0.06 to 0.18. One last thing that was puzzling was when I went from a hidden layer size of (100,) to (100, 10) from mlp 1 to mlp 3, the R squared got significantly worse. Also, a hidden layer of 200 performed worse than 100.