# Predicting the outcome of loan applications
# 3a. Random Forest
The metrics we will try to optimise is **recall**, because we care about minimising the false negatives. False negative, i.e. people we accept but should have rejected, pose a greater risk, because they could lead to loss of the capital lost as well as potential revenue from the interest.

In [1]:
import os
import pandas as pd
import numpy as np
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

p = os.path.abspath('../')
if p not in sys.path:
    sys.path.append(p)

from shared.data_processing import CategoricalEncoder


## Load data

In [2]:
df = pd.read_csv('./data/loan_data_prepped.csv')

## Set the features
Features of interest we have identified in the previous notebooks.

In [3]:
OTHER_CATEGORICAL = ['loan_history', 'purpose', 'other_parties', 'property_magnitude',
                     'other_payment_plans', 'housing', 'personal_status', 'job']

In [4]:
NUMERICAL_FEATURES = ['duration', 'loan_amount', 'age']

FIXED_CATEGORICAL = ['foreign_worker_binary', 'checking_status_ordinal', 'savings_status_ordinal',
                     'employment_ordinal', 'installment_commitment_ordinal']

VARIABLE_CATEGORICAL = [f'{feature}_encoded' for feature in OTHER_CATEGORICAL]

FEATURES = NUMERICAL_FEATURES + FIXED_CATEGORICAL + VARIABLE_CATEGORICAL

In [5]:
FEATURES

['duration',
 'loan_amount',
 'age',
 'foreign_worker_binary',
 'checking_status_ordinal',
 'savings_status_ordinal',
 'employment_ordinal',
 'installment_commitment_ordinal',
 'loan_history_encoded',
 'purpose_encoded',
 'other_parties_encoded',
 'property_magnitude_encoded',
 'other_payment_plans_encoded',
 'housing_encoded',
 'personal_status_encoded',
 'job_encoded']

## Tune model parameters
We will focus on tuning the most important parameters, i.e. the maximum tree depth and the maximum number of features to use in each tree.

In [7]:
encoder = CategoricalEncoder(features_to_encode=OTHER_CATEGORICAL,
                             target='label',
                             features_to_return=FEATURES)

In [8]:
rfc = RandomForestClassifier(class_weight='balanced', n_estimators=100)

In [30]:
pipe = Pipeline(steps=[('encoder', encoder),
                       ('forest', rfc)])

In [31]:
grid = GridSearchCV(pipe,
                    verbose=2,
                    cv=StratifiedKFold(n_splits=50),
                    scoring=make_scorer(recall_score),
                    param_grid={
                        'forest__criterion': ['gini', 'entropy'],
                        'forest__max_depth': [1, 5, 10, None],
                        'forest__max_features': [1, 5, 10, 'auto']
                    })

In [32]:
grid.fit(df, df['label'])

Fitting 50 folds for each of 32 candidates, totalling 1600 fits
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=1, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=1 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=1, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=1, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=1, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=1, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=1, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=1, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=1 
[CV]  forest__criterio

[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=5, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=5, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=5, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=5, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=5, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=5, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=5 
[CV]  forest__criterio

[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=10, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=10, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=10, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=10, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=10 
[CV]  fore

[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=auto, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=auto, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=auto, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=auto, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=auto, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=1, forest__max_features=auto, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=1, forest__max_f

[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=1, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=1, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=1, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=1, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=1, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=1, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=1 
[CV]  forest__criterio

[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=5 
[CV]  forest__criterio

[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=10 
[CV]  fore

[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=5, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=5, forest__max_f

[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=1, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=1, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=1, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=1, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=1, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=1, total=   0.1s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=1 
[CV]  fore

[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=5 
[CV]  fore

[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=10

[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=10, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=10, f

[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=1, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=1, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=1, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=1, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=1, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=1 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=1, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__ma

[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=5 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=5, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__ma

[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=10, total=   0.3s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=10 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=10, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None,

[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, f

[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=gini, forest__max_depth=None, forest__max_features=auto 
[CV]  forest__criterion=gini, forest__max_depth=None, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=1, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=1, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=1, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=1, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, f

[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=1, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=1, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=1, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__ma

[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=5, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=5, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=5, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=10, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=10, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=10, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, fore

[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=10, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=10, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=10, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=10, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=10, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=auto, total=   0.1s
[CV] forest__criterion=entropy, forest__max_dept

[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=auto, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=auto, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=auto, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=auto, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=1, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=1, forest__max_features=auto, total=   0.1s
[CV] forest__criterion=entropy

[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=1, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=1, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=1, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=1, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=1, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=1, total=   0.1s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__ma

[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__ma

[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=10, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=10, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=10, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=10, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=10, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=10, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5,

[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=5, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=5, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy

[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=1, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=1, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=1, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=1, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=1, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=1, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10

[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10

[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=10, total=   0.3s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=10, total=   0.3s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=10, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=10, total=   0.3s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=10, total=   0.3s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=10, total=   0.3s
[CV] forest__criterion=entropy, forest__m

[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=10, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=10, forest__max_features=auto, total=   0.2s
[CV] forest__criter

[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=1, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=1, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=1, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=1, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=1, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=1 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=1, total=   0.2s
[CV] forest__criterion=entropy

[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=5 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=5, total=   0.2s
[CV] forest__criterion=entropy

[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=10, total=   0.3s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=10, total=   0.3s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=10, total=   0.3s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=10, total=   0.3s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=10, total=   0.3s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=10 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=10, total=   0.3s
[CV] forest__criter

[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=auto, total=   0.2s
[CV] forest__criterion=entropy, forest__max_depth=None, forest__max_features=auto 
[CV]  forest__criterion=entropy, forest__max_depth=None, forest__max_features=auto, total=   0.

[Parallel(n_jobs=1)]: Done 1600 out of 1600 | elapsed:  5.9min finished


GridSearchCV(cv=StratifiedKFold(n_splits=50, random_state=None, shuffle=False),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('encoder', CategoricalEncoder(features_to_encode=['loan_history', 'purpose', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'personal_status', 'job'],
          features_to_return=['duration', 'loan_amount', 'age', 'foreign_worker_binary', 'checking_status_ordinal',...ors=100, n_jobs=None, oob_score=False,
            random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'forest__criterion': ['gini', 'entropy'], 'forest__max_depth': [1, 5, 10, None], 'forest__max_features': [1, 5, 10, 'auto']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(recall_score), verbose=2)

Find the best values.

In [33]:
best_criterion = grid.best_estimator_.get_params()['forest__criterion']
best_max_depth = grid.best_estimator_.get_params()['forest__max_depth']
best_max_features = grid.best_estimator_.get_params()['forest__max_features']

print(best_criterion)
print(best_max_depth)
print(best_max_features)

gini
1
1


## Evaluate the best model over 50 random splits
Given that the data set is so small, it's important to evaluate over many random train/test splits, so that we get a better picture of the metrics. With the best parameters, we get:
- **average recall of 0.74**;
- **average precision of 0.48**

In [60]:
rfc_tuned = RandomForestClassifier(class_weight='balanced',
                                   n_estimators=100,
                                   criterion=best_criterion,
                                   max_depth=1,
                                   max_features=1)

In [61]:
pipe_tuned = Pipeline(steps=[('encoder', encoder),
                             ('scaler', StandardScaler()),
                             ('forest', rfc_tuned)])

In [62]:
sss = StratifiedShuffleSplit(n_splits=50, test_size=0.25)

In [63]:
METRIC_FUNCTIONS = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score
}

In [64]:
METRICS = {k: [] for k in METRIC_FUNCTIONS.keys()}

X = df
y = df['label']

for train_IDX, test_IDX in sss.split(X, y):
    pipe_tuned.fit(X.loc[train_IDX], y.loc[train_IDX])
    logistic_predictions = pipe_tuned.predict(X.loc[test_IDX])
    truth = y.loc[test_IDX]
    
    for key, metric in METRIC_FUNCTIONS.items():
        METRICS[key].append(metric(truth, logistic_predictions))

In [65]:
{k: np.mean(v) for k, v in METRICS.items()}

{'accuracy': 0.68096,
 'f1': 0.590448380551911,
 'precision': 0.4965676759879424,
 'recall': 0.7629333333333334}

In [66]:
{k: np.std(v) for k, v in METRICS.items()}

{'accuracy': 0.07215343650859604,
 'f1': 0.04187831374494692,
 'precision': 0.0650969252388257,
 'recall': 0.12742342362724715}

Check accuracy on training set to check for overfitting.

In [41]:
pipe_tuned.score(X.loc[train_IDX], y.loc[train_IDX])

0.72