# Predicting the outcome of loan applications
# 3b. Random Forest
For this problem, we care about having as few false negatives as possible. False negatives, i.e. people we accept but should have rejected, pose a greater risk, because they could lead to the loss of the capital lent as well as of the potential revenue from the interest. While high recall is associated with few false negatives, we cannot tune the model parameters by optimising recall alone: that would push the model towards the trivial solution of flagging every applicant as positive, which has a recall of 1 but is useless in practice. We therefore optimise the F1 score, which balances recall against precision.

In [1]:
import os
import pandas as pd
import numpy as np
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Make the parent directory importable so that the project-local `shared`
# package (imported just below) can be resolved. The guard keeps re-running
# this cell from appending the same entry twice.
p = os.path.abspath('../')
if sys.path.count(p) == 0:
    sys.path.append(p)

from shared.data_processing import CategoricalEncoder
from shared.data_processing import FeatureSelector


## Load data

In [2]:
# Load the prepared loan data from ./data (relative to the working directory).
df = pd.read_csv(filepath_or_buffer='./data/loan_data_prepped.csv')

In [3]:
# Separate the target from the model inputs: 'label' is the target, and both
# 'label' and 'accepted' are removed from the input frame.
y = df['label']
X = df.drop(columns=['label', 'accepted'])

## Set the features
Features of interest we have identified in the previous notebooks.

In [4]:
# Numerical input columns.
NUMERICAL_FEATURES = ['duration', 'loan_amount', 'age']

# Categorical columns carrying a `_binary` / `_ordinal` suffix.
FIXED_CATEGORICAL = [
    'foreign_worker_binary',
    'checking_status_ordinal',
    'savings_status_ordinal',
    'employment_ordinal',
    'installment_commitment_ordinal',
]

# Remaining categorical columns; this is the list handed to the
# CategoricalEncoder further down.
OTHER_CATEGORICAL = [
    'loan_history',
    'purpose',
    'other_parties',
    'property_magnitude',
    'other_payment_plans',
    'housing',
    'personal_status',
    'job',
]

# Every feature of interest, in the order numerical -> fixed -> other.
FEATURES = [*NUMERICAL_FEATURES, *FIXED_CATEGORICAL, *OTHER_CATEGORICAL]

In [5]:
# Display the combined feature list for reference.
FEATURES

['duration',
 'loan_amount',
 'age',
 'foreign_worker_binary',
 'checking_status_ordinal',
 'savings_status_ordinal',
 'employment_ordinal',
 'installment_commitment_ordinal',
 'loan_history',
 'purpose',
 'other_parties',
 'property_magnitude',
 'other_payment_plans',
 'housing',
 'personal_status',
 'job']

## Tune model parameters
We will focus on tuning the most important parameters, i.e. the maximum tree depth and the maximum number of features considered when looking for the best split. The split criterion is included in the grid as well.

In [6]:
# First pipeline step: encoder configured with the columns listed in
# OTHER_CATEGORICAL.
# NOTE(review): the actual encoding scheme is implemented in
# shared.data_processing and is not visible from this notebook.
encoder = CategoricalEncoder(features_to_encode=OTHER_CATEGORICAL)

In [7]:
# Second pipeline step: column selector.
# NOTE(review): only NUMERICAL_FEATURES is passed here, while the combined
# FEATURES list defined earlier is never used in this notebook. If the selector
# keeps just the named columns, the forest sees three features only - confirm
# that this is intended and not a leftover.
selector = FeatureSelector(features_to_select=NUMERICAL_FEATURES)

In [8]:
# Base forest used as the final pipeline step during the grid search.
# class_weight='balanced' weights the classes inversely to their frequency.
# random_state is fixed so that the bootstrap samples and per-split feature
# draws are repeatable and a Restart & Run All reproduces the same search.
rfc = RandomForestClassifier(class_weight='balanced',
                             n_estimators=100,
                             random_state=42)

Feature scaling is not needed for a Random Forest model.

In [9]:
# Chain preprocessing and classifier: encode -> select -> forest.
# The step names double as prefixes for the grid-search parameter keys
# (e.g. 'forest__max_depth').
pipe = Pipeline([
    ('encode', encoder),
    ('select', selector),
    ('forest', rfc),
])

In [10]:
# Wide first-pass grid over the forest step's hyper-parameters.
# 'sqrt' replaces the former 'auto' entry: for classifiers the two mean the
# same thing, but 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3,
# so keeping it would make this grid fail on current releases.
# NOTE(review): the 'select' step is configured with three feature names; if it
# really reduces the input to three columns, the larger max_features values
# here may be invalid or redundant - confirm before using this grid.
param_grid_coarse = {
    'forest__criterion': ['gini', 'entropy'],
    'forest__max_depth': [1, 5, 10, None],
    'forest__max_features': [1, 5, 10, 'sqrt']
}

# Narrow grid; this is the one passed to the grid search in this notebook.
param_grid_fine = {
    'forest__criterion': ['gini', 'entropy'],
    'forest__max_depth': [3, 4, 5, 6, 7],
    'forest__max_features': [1, 2, 3]
}

In [11]:
# Exhaustive search over the fine grid; every candidate is scored by F1 under
# 50-fold stratified cross-validation.
grid = GridSearchCV(estimator=pipe,
                    param_grid=param_grid_fine,
                    scoring=make_scorer(f1_score),
                    cv=StratifiedKFold(n_splits=50),
                    verbose=1)

In [12]:
# Run the search: 30 parameter combinations x 50 folds = 1500 fits
# (about five minutes sequentially, per the recorded output).
grid.fit(X=X, y=y)

Fitting 50 folds for each of 30 candidates, totalling 1500 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1500 out of 1500 | elapsed:  5.0min finished


GridSearchCV(cv=StratifiedKFold(n_splits=50, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('encode',
                                        CategoricalEncoder(features_to_encode=['loan_history',
                                                                               'purpose',
                                                                               'other_parties',
                                                                               'property_magnitude',
                                                                               'other_payment_plans',
                                                                               'housing',
                                                                               'personal_status',
                                                                               'job'])),
                                       ('select',
                                        FeatureSelector(fea

Find the best values.

In [13]:
# Read the winning hyper-parameters from the search result; these are the
# values the refitted best estimator was configured with.
best_criterion = grid.best_params_['forest__criterion']
best_max_depth = grid.best_params_['forest__max_depth']
best_max_features = grid.best_params_['forest__max_features']

# One value per line: criterion, max depth, max features.
print(best_criterion, best_max_depth, best_max_features, sep='\n')

entropy
4
3


## Evaluate the best model over 50 random splits
Given that the data set is so small, it's important to evaluate over many random train/test splits, so that we get a better picture of the metrics.

In [14]:
# Forest configured with the hyper-parameters selected by the grid search.
# random_state is fixed so that the evaluation below yields the same numbers
# on every run instead of drifting with the forest's internal randomness.
rfc_tuned = RandomForestClassifier(class_weight='balanced',
                                   n_estimators=100,
                                   criterion=best_criterion,
                                   max_depth=best_max_depth,
                                   max_features=best_max_features,
                                   random_state=42)

In [15]:
# Same preprocessing steps as the search pipeline, followed by the tuned
# forest. The first step is named 'encode' (not 'encoder') so that the step
# names, and therefore the '<step>__<param>' keys, match the search pipeline.
pipe_tuned = Pipeline(steps=[('encode', encoder),
                             ('select', selector),
                             ('forest', rfc_tuned)])

In [16]:
# 50 stratified random train/test splits, each holding out 25% of the rows.
# random_state is fixed so that the same 50 splits are drawn on every run and
# the reported metrics are reproducible.
sss = StratifiedShuffleSplit(n_splits=50, test_size=0.25, random_state=42)

In [17]:
# Metric name -> scoring function. Each function is called as
# metric(truth, predictions) with its default arguments in the evaluation loop.
METRIC_FUNCTIONS = dict(
    accuracy=accuracy_score,
    precision=precision_score,
    recall=recall_score,
    f1=f1_score,
)

In [18]:
# One list of scores per metric; each split appends one value to every list.
METRICS = {name: [] for name in METRIC_FUNCTIONS}

for train_IDX, test_IDX in sss.split(X, y):
    # split() yields positional row numbers, so the rows must be taken with
    # .iloc. Using .loc would look the numbers up as index labels, which only
    # happens to work while the frame has a default 0..n-1 index.
    X_train, y_train = X.iloc[train_IDX], y.iloc[train_IDX]
    X_test, truth = X.iloc[test_IDX], y.iloc[test_IDX]

    # Fit on this split's training rows and predict the held-out rows.
    pipe_tuned.fit(X_train, y_train)
    predictions = pipe_tuned.predict(X_test)

    for key, metric in METRIC_FUNCTIONS.items():
        METRICS[key].append(metric(truth, predictions))

In [19]:
# Average of each metric across the random splits.
{name: np.mean(scores) for name, scores in METRICS.items()}

{'accuracy': 0.64048,
 'precision': 0.42180782208104367,
 'recall': 0.5253333333333333,
 'f1': 0.46635866220578265}

In [20]:
# Spread of each metric across the random splits (np.std default, i.e. the
# population standard deviation with ddof=0).
{name: np.std(scores) for name, scores in METRICS.items()}

{'accuracy': 0.030804051681556453,
 'precision': 0.038858188172290434,
 'recall': 0.06510162995057975,
 'f1': 0.04286913036061381}