This notebook is dedicated to Classification models for a preprocessed dataset.

In [1]:
# Useful libraries
import pandas as pd
import numpy as np
import sklearn
#import xgboost as xgb

# Going to use these 4 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

We import the preprocessed data into a pandas DataFrame and keep only the columns we want (this drops the index column that `DataFrame.to_csv` writes by default). Then we split the preprocessed data into training and test sets.

In [2]:
# Load the preprocessed loan data from disk.
df = pd.read_csv('data/loan_preprocessed_all_hangyu.csv')

# Columns retained for modelling: the label ('loan_status') plus the features.
feature_list = [
    'term',
    'grade',
    'emp_length',
    'home_ownership',
    'annual_inc',
    'verification_status',
    'loan_status',
    'purpose',
    'addr_state',
    'delinq_2yrs',
    'inq_last_6mths',
    'mths_since_last_delinq',
    'mths_since_last_record',
    'pub_rec',
    'revol_bal',
    'initial_list_status',
    'out_prncp',
    'total_rec_int',
    'total_rec_late_fee',
    'recoveries',
    'collection_recovery_fee',
    'last_pymnt_amnt',
    'collections_12_mths_ex_med',
    'mths_since_last_major_derog',
    'application_type',
]

# Keep only the listed columns; anything else in the CSV is discarded.
df = df.filter(items=feature_list)

# Summarise dtypes and non-null counts of the retained columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 752938 entries, 0 to 752937
Data columns (total 25 columns):
term                           752938 non-null int64
grade                          752938 non-null int64
emp_length                     752938 non-null int64
home_ownership                 752938 non-null int64
annual_inc                     752938 non-null float64
verification_status            752938 non-null int64
loan_status                    752938 non-null int64
purpose                        752938 non-null int64
addr_state                     752938 non-null int64
delinq_2yrs                    752938 non-null float64
inq_last_6mths                 752938 non-null float64
mths_since_last_delinq         752938 non-null float64
mths_since_last_record         752938 non-null float64
pub_rec                        752938 non-null float64
revol_bal                      752938 non-null float64
initial_list_status            752938 non-null int64
out_prncp                  

In [3]:
# Create training and test data (80% / 20%).
# A fixed random_state makes the split identical on every run, so the metrics
# reported below can be reproduced after a kernel restart. The literal matches
# the SEED constant defined in a later cell, which is not yet available here.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1234)

# Split label and features.
# `.values` yields the label column as a plain 1-D NumPy array; it replaces
# `Series.ravel()`, which is deprecated in recent pandas releases.
X_train = train_df.drop('loan_status', axis=1)
Y_train = train_df['loan_status'].values

X_test = test_df.drop('loan_status', axis=1)
Y_test = test_df['loan_status'].values

After splitting the data into test and training sets, we create some helper functions/classes to make the code a bit more manageable. This first part focuses only on the separate models (no stacking of models) and their confusion matrices/overall accuracy.

Some things to consider in general are the effects of balanced data, the number of features, and the correlation between features.

In [4]:
# Class to extend the Sklearn classifier (just makes it easier to read different models)
class SklearnHelper(object):
    """Uniform wrapper around a scikit-learn style classifier.

    Parameters
    ----------
    clf : callable
        Estimator class (not an instance). It is called with the keyword
        arguments in ``params`` plus ``random_state=seed``.
    seed : int, default 0
        Value passed to the estimator as ``random_state``.
    params : dict or None, default None
        Keyword arguments for the estimator. ``None`` means "no extra
        arguments". The dict is copied, so the caller's object is never
        modified.
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a copy: the same params dict may be reused by the caller,
        # and the default of None must not crash on item assignment.
        settings = dict(params) if params is not None else {}
        settings['random_state'] = seed
        self.clf = clf(**settings)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on the given data; returns None."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped estimator and return whatever its ``fit`` returns."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Refit the estimator on ``(x, y)`` and print its ``feature_importances_``.

        Note that this call re-trains the wrapped estimator as a side effect.
        """
        print(self.clf.fit(x, y).feature_importances_)

This section manages the parameters for each model used.

In [5]:
# Put in our parameters for models
SEED = 1234

# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    # warm_start must stay off: with it on, a second fit() with an unchanged
    # n_estimators grows no new trees and keeps the old forest. The models are
    # refitted once per fold in the stacking section, so a warm-started forest
    # would predict "held-out" rows it was already trained on.
    'warm_start': False,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate': 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters
svc_params = {
    'kernel': 'linear',
    'C': 0.025
}

Now we create and train our models. Then we predict on the test set with the trained models and generate confusion matrices, along with their overall accuracies, as a metric for our models.

In [None]:
# Build one SklearnHelper per base model, all sharing the same seed.
rf, ada, gb, svc = (
    SklearnHelper(clf=estimator_cls, seed=SEED, params=estimator_params)
    for estimator_cls, estimator_params in (
        (RandomForestClassifier, rf_params),
        (AdaBoostClassifier, ada_params),
        (GradientBoostingClassifier, gb_params),
        (SVC, svc_params),
    )
)

In [None]:
# Train models: each one is fitted on the first 1000 rows of the training split.
for model in (rf, ada, gb, svc):
    model.train(X_train[0:1000], Y_train[0:1000])

In [None]:
# Predict on the full test set with every trained model (used for the accuracy metric).
rf_predictions, ada_predictions, gb_predictions, svc_predictions = (
    model.predict(X_test) for model in (rf, ada, gb, svc)
)

In [None]:
# This function just computes the confusion matrix and accuracy, then displays both.
def accuracy_metric(truth, predictions, model_name):
    """Print the confusion matrix and overall accuracy of one model.

    Parameters
    ----------
    truth : array-like
        True labels.
    predictions : array-like
        Predicted labels, aligned element-wise with ``truth``.
    model_name : str
        Name shown in the heading of the printed report.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    truth = np.asarray(truth).ravel()
    predictions = np.asarray(predictions).ravel()
    if len(truth) != len(predictions):
        raise ValueError('truth and predictions must have the same length')
    if len(truth) == 0:
        raise ValueError('cannot compute accuracy on empty input')

    confusion = pd.crosstab(truth, predictions, rownames=['Truth'],
                            colnames=['Prediction'], margins=True)

    # Accuracy is taken from the element-wise comparison rather than from
    # fixed cells of the table: the table lacks a column for any class the
    # model never predicted, and its labels are not guaranteed to be 0 and 1.
    accuracy = float(np.mean(truth == predictions))

    print('\n------------------------------\n')
    print(model_name + ' confusion matrix: \n')
    print(confusion)
    print('\nAccuracy: ' + str(accuracy))

In [None]:
# Confusion matrix and accuracy of every model, evaluated on the held-out test set.
for model_name, predicted in (
    ('Random Forest', rf_predictions),
    ('AdaBoost', ada_predictions),
    ('Gradient Boosting', gb_predictions),
    ('Support Vector Classifier', svc_predictions),
):
    accuracy_metric(Y_test, predicted, model_name)

Here we will implement the stacked modelling approach. This approach simply uses the predictions of a number of models as the training data for another model or layer of models.

We need to use out-of-fold (OOF) predictions so that the second layer is trained on first-layer predictions that were not made on data the first-layer model had already seen, which limits overfitting. In its simplest form, this means splitting the training data into K folds; for each fold, the model is trained on the other K-1 folds and predicts on the held-out fold.

In [None]:
# Number of cross-validation folds used to build the out-of-fold predictions.
NFOLDS = 4
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

# Row/column counts of the full training split (columns include the label) and row count of the test split.
ntrain, ndimensions = train_df.shape
ntest = len(test_df)

# Out-of-fold predictions (for stacked models)
def get_oof(clf, x_train, y_train, x_test):
    """Build out-of-fold predictions for one base model.

    For every split produced by the module-level ``kf`` splitter, ``clf`` is
    re-trained on the training part of the split and then predicts (a) the
    held-out rows of ``x_train`` and (b) all of ``x_test``.

    Parameters
    ----------
    clf : object
        Model exposing ``train(x, y)`` and ``predict(x)``.
    x_train : pandas.DataFrame
        Training features; rows are selected positionally via ``.iloc``.
    y_train : array-like
        Training labels, aligned positionally with ``x_train``.
    x_test : array-like
        Test features, passed unchanged to ``clf.predict``.

    Returns
    -------
    (oof_train, oof_test) : tuple of numpy.ndarray
        Two column vectors. ``oof_train`` has shape ``(len(x_train), 1)`` and
        holds each training row's prediction from the fold that held it out.
        ``oof_test`` has shape ``(len(x_test), 1)`` and holds the mean of the
        per-fold test predictions, so it may contain fractional values.
    """
    # Fold indices are positional. Converting the labels to an array keeps the
    # lookup positional even if a pandas Series with a shuffled index is passed.
    labels = np.asarray(y_train)

    # Size the buffer from the splitter itself so it can never disagree with
    # the number of folds actually iterated over.
    n_splits = kf.get_n_splits()
    oof_train = np.zeros((len(x_train),))
    oof_test_skf = np.empty((n_splits, len(x_test)))

    for fold, (train_index, test_index) in enumerate(kf.split(x_train)):
        clf.train(x_train.iloc[train_index, :], labels[train_index])

        oof_train[test_index] = clf.predict(x_train.iloc[test_index, :])
        oof_test_skf[fold, :] = clf.predict(x_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
# Build the OOF train/test predictions of every base model. They become the
# input features of the next layer. Only the first 1000 training rows are used.
x_subset = X_train[0:1000]
y_subset = Y_train[0:1000]

# Random Forest
rf_oof_train, rf_oof_test = get_oof(rf, x_subset, y_subset, X_test)
print("Random Forest training complete")

# AdaBoost
ada_oof_train, ada_oof_test = get_oof(ada, x_subset, y_subset, X_test)
print(" AdaBoost training complete")

# Gradient Boost
gb_oof_train, gb_oof_test = get_oof(gb, x_subset, y_subset, X_test)
print("Gradient Boosting training complete")

# Support Vector Classifier
svc_oof_train, svc_oof_test = get_oof(svc, x_subset, y_subset, X_test)
print("SVC training complete")

In [None]:
# Stack the per-model OOF columns side by side: one column per base model.
# These are the train/test feature matrices for the layer-2 model(s).
# NOTE(review): the two names use different layer numbers ('layer1' vs 'layer2')
# although both feed the same layer; kept as-is because later cells may use them.
layer1_train = np.hstack([rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train])
layer2_test = np.hstack([rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test])

In [None]:
# Train layer2 model(s) and predict