# Imports

In [603]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz

from sklearn.metrics import roc_auc_score


import data_loader as dl
import utils

# Load Data

In [604]:
# Build the project DataLoader from the cleaned loan CSV.
# NOTE(review): presumably the first list names the feature columns and the second
# the target column — confirm against data_loader.DataLoader.
ld = dl.DataLoader('data/loan_data_clean.csv',
                   ['person_income', 'loan_int_rate','loan_percent_income', 'loan_intent'],
                   ['loan_status'])
# Split with test_size=0.4 and a fixed random_state (presumably 40% of the rows are held out).
# NOTE(review): later cells read ld.features_train / ld.features_test / ld.loan_status_train /
# ld.loan_status_test — presumably this call is what populates them; confirm.
ld.train_test_split(test_size=0.4, random_state=154)

# Train the decision tree

In [605]:
# Baseline tree with hand-picked hyperparameters, used as the reference point for the
# grid search below. Only max_depth, min_samples_split and random_state differ from the
# scikit-learn defaults; the remaining arguments spell the defaults out explicitly.
clf_tree0 = DecisionTreeClassifier(criterion='gini',
                                     splitter='best',
                                     max_depth=5,                   # cap the depth of the tree
                                     min_samples_split=1000,        # a node needs >= 1000 samples to be split
                                     min_samples_leaf=1,
                                     min_weight_fraction_leaf=0.0,
                                     random_state=42,               # fixed seed
                                     min_impurity_decrease=0.0)


# Fit on the training split; as the last expression of the cell, the fitted estimator is displayed.
clf_tree0.fit(ld.features_train, ld.loan_status_train)

In [606]:
# Score the baseline tree on the test split: keep column 1 of predict_proba
# (presumably the probability of loan_status == 1, i.e. default — confirm via clf_tree0.classes_).
default_prob = clf_tree0.predict_proba(ld.features_test)[:,1]
# ROC AUC of those predicted probabilities against the true test labels.
roc_auc_score(y_true = ld.loan_status_test, y_score = default_prob)

0.8642154498826626

# Hyperparameter tuning with Grid Search Cross Validation (GridSearchCV)

In [607]:
# Base estimator for the grid search. Only random_state is fixed here (fixed seed);
# every other hyperparameter is left at its default and overridden by the search grid.
clf_tree = DecisionTreeClassifier(random_state=42)

In [608]:
# Candidate values for each hyperparameter. The grid search evaluates every combination,
# i.e. 10 * 4 * 4 * 4 = 640 candidate trees.
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8, 10, 12, 15],
              'min_samples_leaf': [5, 25, 50, 100],
              'max_features': [None, 'sqrt', 'log2', 0.5],   # None = all features; a float is a fraction of them
              'min_samples_split': [2, 100, 1000, 5000]}

In [609]:
# Define the GridSearchCV instance (exhaustive search over param_grid with 5-fold cross validation)
grid_search = GridSearchCV(estimator=clf_tree,          # Base estimator to tune: the DecisionTreeClassifier defined above
                          param_grid=param_grid,   # Grid of hyperparameter values; every combination is tried
                          scoring='roc_auc',       # Candidates are ranked by cross-validated ROC AUC
                          n_jobs=-1,               # Parallelisation: -1 runs the fits on all available processors
                          cv=5,                    # Number of cross-validation folds
                          refit=True,              # Refit the best configuration on the whole training set (best_estimator_)
                          return_train_score=True) # Also record the per-fold training scores in cv_results_

In [531]:
# Fit the grid search on the training split.
# Cost: 640 hyperparameter combinations x 5 folds = 3,200 fits, plus one final refit (refit=True).
grid_search.fit(ld.features_train, ld.loan_status_train)

In [303]:
# Report the winning hyperparameter combination together with its
# cross-validated score (ROC AUC, the metric the grid search was configured with).
best_score = grid_search.best_score_

print("Hyperparameters for the best classification decision tree from grid search:")
for hyperparam, value in grid_search.best_params_.items():
    print(f"{hyperparam}: ", value)
print("score: ", best_score)

Hyperparameters for the best classification decision tree from grid search:
max_depth:  15
max_features:  None
min_samples_leaf:  100
min_samples_split:  2
score:  0.8837158224347983


In [10]:
# Best classification decision tree: because the search was created with refit=True,
# best_estimator_ is the winning configuration refitted on the whole training set.
best_clf_tree = grid_search.best_estimator_

# Custom grid search cross validation

I now also illustrate how to run cross-validation more manually in order to obtain custom tree attributes, such as the total number of nodes or the number of leaf nodes associated with particular hyperparameter settings. The following function accesses the internal `.tree_` attribute to retrieve the total node count, as well as how many of these nodes are leaf nodes.

In [3]:
from treegridsearchcv import get_leaves_count

In [13]:
# Number of leaf nodes in the tuned tree, via the helper imported from treegridsearchcv.
get_leaves_count(best_clf_tree)

121

In [747]:
from sklearn.model_selection import KFold
from itertools import product


class ModelCalculations:
    """Compute custom, per-model quantities for a fitted decision-tree estimator.

    The estimator can be passed to the constructor or attached (and later
    replaced) with ``set_estimator``. ``calculate`` runs every individual
    calculation and returns the results merged into one flat dict.
    """

    def __init__(self, estimator=None):
        """Initialize the instance.

        Args:
            estimator: optional estimator to run the calculations on. When
                omitted, one must be attached with ``set_estimator`` before
                any calculation is requested.
        """
        self.estimator = estimator

    def set_estimator(self, estimator):
        """Set (or replace) the estimator the calculations are run on."""
        self.estimator = estimator

    def _fitted_tree(self):
        """Return ``self.estimator.tree_`` or fail with an explicit message."""
        tree = getattr(self.estimator, "tree_", None)
        if tree is None:
            # Same exception type as the bare attribute lookup would raise,
            # but the message now says what is missing and how to fix it.
            raise AttributeError(
                "ModelCalculations needs a fitted tree estimator (one exposing "
                "`tree_`); pass it to the constructor or call set_estimator() first."
            )
        return tree

    def get_leaves_count(self):
        """Count the leaf nodes of the estimator's tree.

        ``tree_.children_left[i]`` holds the id of the node that is the left
        child of node ``i``; it is ``-1`` when node ``i`` is a leaf. Counting
        the nodes whose entry is ``-1`` therefore gives the number of leaves.

        Returns:
            dict: ``{"leaves_count": <int>}``.

        Raises:
            AttributeError: if no estimator is set, or it exposes no ``tree_``.
        """
        tree = self._fitted_tree()
        children_left = tree.children_left
        # Generator instead of a temporary list: only the count is needed.
        leaves = sum(1 for node in range(tree.node_count) if children_left[node] == -1)
        return {"leaves_count": leaves}

    def other_calc(self):
        """Placeholder calculation returning a fixed value.

        Shows how an additional result is contributed to ``calculate``.

        Returns:
            dict: ``{"random_calc": 666}``.
        """
        return {"random_calc": 666}

    def calculate(self):
        """Entry point: run all calculations and merge their result dicts.

        Returns:
            dict: the union of the dicts returned by ``get_leaves_count`` and
            ``other_calc`` (on a key clash the later one wins).

        Raises:
            AttributeError: if no estimator is set, or it exposes no ``tree_``.
        """
        return {**self.get_leaves_count(), **self.other_calc()}
     



In [748]:
# Small demo grid (2 * 2 * 2 = 8 combinations) for the custom search, run with cv=3 and
# with a ModelCalculations instance supplying the extra per-model quantities.
# NOTE(review): GridSearchCVPlus is neither defined nor imported in the cells shown here —
# confirm where it comes from (presumably the treegridsearchcv module) so that
# Restart Kernel -> Run All works.
calcs = ModelCalculations()
grid = {'max_depth': [2, 3], 'min_samples_leaf': [5, 25], 'min_samples_split': [2, 100]}
tgsv = GridSearchCVPlus(estimator=DecisionTreeClassifier(random_state=42),
                        param_grid = grid, model_calculations = calcs, cv=3)

In [749]:
# Run the custom search on ld.features / ld.loan_status (presumably the full, unsplit data,
# unlike the *_train attributes used for grid_search above — confirm).
# NOTE(review): what GridSearchCVPlus.fit returns is not visible here; the name `df`
# suggests a DataFrame of results — confirm.
df = tgsv.fit(ld.features, ld.loan_status)

In [751]:
# Display the estimator selected by the custom search
# (presumably mirrors GridSearchCV.best_estimator_ — confirm).
tgsv.best_estimator_

In [675]:
# Recompute the best parameter set by hand from the per-split test scores.
results = tgsv.cv_results_
# Stack the per-split score sequences: one row per CV split, one column per candidate.
res = np.vstack([results[f"split{i}_test_score"] for i in range(tgsv.n_splits)])
# Average over the splits (axis 0) and print the params of the candidate with the highest mean.
print(results['params'][np.argmax(np.mean(res, axis = 0))])
# Last expression: displays the whole results dict (long output).
results

{'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 2}


{'params': [{'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 2},
  {'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 100},
  {'max_depth': 2, 'min_samples_leaf': 25, 'min_samples_split': 2},
  {'max_depth': 2, 'min_samples_leaf': 25, 'min_samples_split': 100},
  {'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 2},
  {'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 100},
  {'max_depth': 3, 'min_samples_leaf': 25, 'min_samples_split': 2},
  {'max_depth': 3, 'min_samples_leaf': 25, 'min_samples_split': 100}],
 'split0_train_score': [0.7902819076775234,
  0.7902819076775234,
  0.7902819076775234,
  0.7902819076775234,
  0.8215556029061746,
  0.8215556029061746,
  0.8215556029061746,
  0.8215556029061746],
 'split1_train_score': [0.7943015091806307,
  0.7943015091806307,
  0.7943015091806307,
  0.7943015091806307,
  0.8261951483465594,
  0.8261951483465594,
  0.8261951483465594,
  0.8261951483465594],
 'split2_train_score': [0.7915382091363926,

In [613]:
# Reference run: scikit-learn's own GridSearchCV with the same estimator settings, the same
# small `grid`, cv=3 and the same data as the custom search (presumably as a cross-check).
gscv=GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                param_grid = grid,
                scoring='roc_auc',   # rank candidates by ROC AUC
                n_jobs=-1,           # use all available processors
                cv=3)
gscv.fit(ld.features, ld.loan_status)

In [700]:
# NOTE: only the last expression of a cell is displayed, so the split-0 test scores on the
# next line are evaluated but not shown; the cell output is best_score_ alone.
gscv.cv_results_['split0_test_score']
gscv.best_score_

0.8211381704681032

In [533]:
# Names of all result columns recorded by the grid search (iterating the dict yields its keys).
list(grid_search.cv_results_)

['mean_fit_time',
 'std_fit_time',
 'mean_score_time',
 'std_score_time',
 'param_max_depth',
 'param_max_features',
 'param_min_samples_leaf',
 'param_min_samples_split',
 'params',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'mean_test_score',
 'std_test_score',
 'rank_test_score',
 'split0_train_score',
 'split1_train_score',
 'split2_train_score',
 'split3_train_score',
 'split4_train_score',
 'mean_train_score',
 'std_train_score']

In [733]:
# Scratch check: the constructor argument is readable back as an attribute of the same name.
DecisionTreeClassifier(random_state=42).random_state

42

In [625]:
# Scratch demo of merging two dicts by unpacking (the idiom ModelCalculations.calculate uses).
x = dict(a=1, b=2)
y = dict(c=3, d=4)
{**x, **y}

{'a': 1, 'b': 2, 'c': 3, 'd': 4}