## Load data and usual functions

In [41]:
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np

df = pd.read_csv('data.csv')

#################################

# Remove missing zip_code values.    
df = df[df['zip_code'].notna()]

# Extract issue year information and add to dataset
df['issue_year'] = df['issue_d'].apply(lambda x: int(str(x).split('-')[1]))
df['earliest_cr_year'] = df['earliest_cr_line'].apply(lambda x: int(str(x).split('-')[1]))


#################################

# Split dataset in train/test based on loan_status
label_columns = ['loan_status']

training_columns = ['emp_length', 'term', 'home_ownership', 'verification_status', 'purpose', 'zip_code',
                    'initial_list_status', 'annual_inc', 'dti',
                    'revol_util', 'open_acc', 'pub_rec', 'total_acc', 'mort_acc', 
                    'pub_rec_bankruptcies', 'issue_year', 'earliest_cr_year']

# Split the training and validation datasets and their labels.
X_train, X_val, y_train, y_val = train_test_split(df[training_columns], df[label_columns],random_state = 1912)

y_train['loan_status'] = y_train['loan_status'].apply(lambda x: 0 if x == 'Fully Paid' else 1)
y_val['loan_status'] = y_val['loan_status'].apply(lambda x: 0 if x == 'Fully Paid' else 1)

#################################

# Convert emp_length to ordinal values
def convert_emp_length (row):
    if type(row['emp_length']) == float:
        return np.NaN
    if row['emp_length'] == '10+ years':
        return 10
    if row['emp_length'][0] != '<':
        return int(row['emp_length'][0])
    if row['emp_length'] == '< 1 year':
        return 0
    
# Convert issue_year to ordinal values
def convert_issue_year (row):
    return row['issue_year'] - 2006

# Convert earliest_cr_year to ordinal values
def convert_earliest_cr_year (row):
    return row['earliest_cr_year'] - 1934

#################################

def prep_dataset(dataset):
    
    # Get 1st digit from zip_code
    dataset['zip_code_1'] = dataset['zip_code'].apply(lambda x: str(x)[0:1])

    # Remove artificial space from feature 'term'
    dataset['term'] = dataset['term'].apply(lambda x: x if str(x)[0] != ' ' else x[1:])

    # Fill NaNs with the median
    cols = ['annual_inc', 'dti', 'revol_util', 'open_acc', 'pub_rec', 'total_acc', 'mort_acc', 
            'pub_rec_bankruptcies']
    for i in cols:
        dataset[i].fillna(dataset[i].median(), inplace = True)
    
    # Perform ORDINAL ENCODING for emp_length and issue_year
    dataset['issue_year_mod'] = dataset.apply (lambda row: convert_issue_year(row), axis=1)
    dataset['earliest_cr_year_mod'] = dataset.apply (lambda row: convert_earliest_cr_year(row), axis=1)
    dataset['emp_length_mod'] = dataset.apply (lambda row: convert_emp_length(row), axis=1)
    dataset['emp_length_mod'].fillna(dataset['emp_length_mod'].median(), inplace = True)
    
    # Perform ONE-HOT ENCODING    
    cols = ['term', 'home_ownership', 'verification_status', 'purpose', 'zip_code_1', 
            'initial_list_status']
    
    for i in cols:
        dummies = pd.get_dummies(dataset[i], prefix = i, drop_first = False)
        dataset = pd.concat([dataset, dummies], axis = 1)
        
    return dataset

X_train = prep_dataset(X_train.copy())
X_val = prep_dataset(X_val.copy())

#################################

# Drop unused columns from this dataset.
def drop_unused(dataset):
    
    # This has been replaced with ordinal encoding.
    dataset = dataset.drop(['emp_length'], axis = 1)

    # These have been replaced with one-hot encoding.
    dataset = dataset.drop(['term'], axis = 1)
    dataset = dataset.drop(['home_ownership'], axis = 1)
    dataset = dataset.drop(['verification_status'], axis = 1)
    dataset = dataset.drop(['purpose'], axis = 1)
    dataset = dataset.drop(['zip_code'], axis = 1)
    dataset = dataset.drop(['zip_code_1'], axis = 1)
    dataset = dataset.drop(['earliest_cr_year'], axis = 1)
    dataset = dataset.drop(['issue_year'], axis = 1)
    dataset = dataset.drop(['initial_list_status'], axis = 1)
    
    return dataset

X_train = drop_unused(X_train.copy())
X_val = drop_unused(X_val.copy())

#################################

# Drop not important features from this dataset.
def drop_not_important(dataset):
    
    dataset = dataset.drop(['home_ownership_ANY'], axis = 1)
    dataset = dataset.drop(['home_ownership_NONE'], axis = 1)
    dataset = dataset.drop(['home_ownership_OTHER'], axis = 1)
    dataset = dataset.drop(['home_ownership_OWN'], axis = 1)
    dataset = dataset.drop(['verification_status_Not Verified'], axis = 1)
    dataset = dataset.drop(['purpose_car'], axis = 1)
    dataset = dataset.drop(['purpose_educational'], axis = 1) 
    dataset = dataset.drop(['purpose_house'], axis = 1)
    dataset = dataset.drop(['purpose_major_purchase'], axis = 1)
    dataset = dataset.drop(['purpose_medical'], axis = 1)
    dataset = dataset.drop(['purpose_moving'], axis = 1)
    dataset = dataset.drop(['purpose_renewable_energy'], axis = 1)
    dataset = dataset.drop(['purpose_small_business'], axis = 1)
    dataset = dataset.drop(['purpose_vacation'], axis = 1)
    dataset = dataset.drop(['purpose_wedding'], axis = 1)
    
    dataset = dataset.drop(['purpose_home_improvement'], axis = 1)
    dataset = dataset.drop(['purpose_other'], axis = 1)
    
    return dataset

X_train = drop_not_important(X_train.copy())
X_val = drop_not_important(X_val.copy())

X_train.head(5)

Unnamed: 0,annual_inc,dti,revol_util,open_acc,pub_rec,total_acc,mort_acc,pub_rec_bankruptcies,issue_year_mod,earliest_cr_year_mod,emp_length_mod,term_36 months,term_60 months,home_ownership_MORTGAGE,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,purpose_credit_card,purpose_debt_consolidation,zip_code_1_0,zip_code_1_1,zip_code_1_2,zip_code_1_3,zip_code_1_4,zip_code_1_5,zip_code_1_6,zip_code_1_7,zip_code_1_8,zip_code_1_9,initial_list_status_f,initial_list_status_w
1241974,45000.0,12.67,20.1,12.0,1.0,14.0,0.0,0.0,10,77,3.0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
1189799,84198.0,20.09,51.1,14.0,0.0,43.0,1.0,0.0,6,56,10.0,1,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0
1130069,78000.0,15.72,60.8,21.0,0.0,44.0,0.0,0.0,7,66,3.0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
434815,33280.0,21.6,46.4,9.0,0.0,17.0,0.0,0.0,11,76,1.0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1
790599,58000.0,26.76,24.8,29.0,0.0,44.0,3.0,0.0,8,63,1.0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0


## Bayesian Optimization with hyperopt for hyperparameters search (3-Fold CV)

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from time import time
from sklearn.metrics import f1_score, make_scorer, roc_auc_score

from hyperopt import STATUS_OK

N_FOLDS = 3

import hyperopt
from hyperopt import hp
from hyperopt.pyll.base import scope

# Define the search space distributions
space = {
    'class_weight': hp.choice('class_weight', [None, 'balanced', 'balanced_subsample']),
    'max_depth': scope.int(hp.quniform('max_depth', 4, 40, 1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 5, 80, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 40, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 40, 1)),
    'criterion': hp.choice('criterion', ['gini', 'entropy']),
    'max_features': hp.choice('max_features', ['auto', 'log2'])
}
                              
# Sample from the full space
example = hyperopt.pyll.stochastic.sample(space)
print('sample example of parameters space based on defined distributions: ', example)

import csv

# File to save first results
out_file = 'rfc_trials.csv'
of_connection = open(out_file, 'w')
writer = csv.writer(of_connection)

# Write the headers to the file
writer.writerow(['loss', 'params', 'train_time'])
of_connection.close()

def objective(params, n_folds = N_FOLDS):
    """Objective function for RandomForest Classifier for Hyperparameter Tuning"""
    
    forest = RandomForestClassifier(**params)
    
    start = time()
    
    # Perform n_fold cross validation with hyperparameters
    # Evalute based on AUC-ROC
    cv_results = np.mean(cross_val_score(forest, X_train, np.ravel(y_train), 
                                         cv=N_FOLDS, scoring=make_scorer(roc_auc_score)))
    
    end = time()
    train_time = (end - start) * 1000

    # Loss must be minimized
    loss = 1 - cv_results
    
    
    # Write to the csv file ('a' means append)
    of_connection = open(out_file, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([loss, params, train_time])
    of_connection.close()
    
    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

from hyperopt import tpe # Algorithm
tpe_algorithm = tpe.suggest

sample example of parameters space based on defined distributions:  {'class_weight': None, 'criterion': 'gini', 'max_depth': 17, 'max_features': 'auto', 'min_samples_leaf': 26, 'min_samples_split': 33, 'n_estimators': 45}


## Run bayesian optimization based TPE (commented to avoid waiting times)

In [44]:
from hyperopt import fmin
from hyperopt import Trials# Trials object to track progress
bayes_trials = Trials()

MAX_EVALS = 500

# Uncomment this if you want to run bayesian optimization
# # Optimize
# best = fmin(fn = objective, space = space, algo = tpe.suggest, 
#             max_evals = MAX_EVALS, trials = bayes_trials)