# Imports

In [295]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz

from sklearn.metrics import roc_auc_score


import data_loader as dl
import utils

# Load Data

In [296]:
ld = dl.DataLoader('data/loan_data_clean.csv',
                   ['person_income', 'loan_int_rate','loan_percent_income', 'loan_intent'],
                   ['loan_status'])
ld.train_test_split(test_size=0.4, random_state=154)

# Train the decision tree

In [297]:
clf_tree0 = DecisionTreeClassifier(criterion='gini',
                                     splitter='best',
                                     max_depth=5,
                                     min_samples_split=1000,
                                     min_samples_leaf=1,
                                     min_weight_fraction_leaf=0.0,
                                     random_state=42,
                                     min_impurity_decrease=0.0)


clf_tree0.fit(ld.features_train, ld.loan_status_train)

In [298]:
default_prob = clf_tree0.predict_proba(ld.features_test)[:,1]
roc_auc_score(y_true = ld.loan_status_test, y_score = default_prob)

0.8642154498826626

# Hyperparameter tuning with Grid Search Cross Validation (GridSearchCV)

In [299]:
# Define the DecisionTreeClassifier with specific random state
clf_tree = DecisionTreeClassifier(random_state=42)

In [300]:
# Define the grid of candidate parameters for the best estimator
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8, 10, 12, 15],
              'min_samples_leaf': [5, 25, 50, 100],
              'max_features': [None, 'sqrt', 'log2', 0.5],
              'min_samples_split': [2, 100, 1000, 5000]}

In [301]:
# Define the GridSearchCV instance
grid_search = GridSearchCV(estimator=clf_tree,          # Estimator, for us the DecisionTreeClassifier
                          param_grid=param_grid,   # Grid of parameters
                          scoring='roc_auc',       # Criterion for comparing the classifiers
                          n_jobs=-1,               # Paralelisation. For us no.
                          cv=5,                    # Number of sub-datasets for cross validation
                          refit=True,              # Return the estimator with the best hyperparameters
                          return_train_score=True) # Return the score for the best estimator

In [307]:
# Fit the grid search to the training datasets
grid_search.fit(ld.features_train, ld.loan_status_train)

In [303]:
# Print the hyperparameters and score for the best estimator from the grid search cross validation
print("Hyperparameters for the best classification decision tree from grid search:")
for hyperparam, value in grid_search.best_params_.items():
    print(hyperparam + ": ", value)
    
best_score = grid_search.best_score_
print("score: ", best_score)

Hyperparameters for the best classification decision tree from grid search:
max_depth:  15
max_features:  None
min_samples_leaf:  100
min_samples_split:  2
score:  0.8837158224347983


In [10]:
# Best classification decision tree
best_clf_tree = grid_search.best_estimator_

# Custom grid search cross validation

I also illustrates now how to run cross-validation more manually to obtain custom tree attributes, such as the total number of nodes or leaf nodes associated with certain hyperparameter settings. The following function accesses the internal .tree_ attribute to retrieve information about the total node count, as well as how many of these nodes are leaf nodes.

In [3]:
from treegridsearchcv import get_leaves_count

In [13]:
get_leaves_count(best_clf_tree)

121

In [320]:
from sklearn.model_selection import KFold
from itertools import product


class TreeCalculation():
    
    def __init__(self):
        self.tree = None
        self.calcs_names = []
        
        self._get_calcs_names()
        
    def set_tree(self, tree):
        self.tree = tree
        
    def _get_calcs_names(self):
        if not self.calcs_names:
            self.calculate()
    
    def get_leaves_count(self):
        """
        Returns the number of leaves in the tree
        The child_left[i] method returns the number of the node the is the left child of node i.
        If the node i is a leaf node, child_left[i] = -1. Therefore
        """
        if self.tree is None:
            self.calcs_names.append("leaves_count")
        else:
            t = self.tree.tree_
            n = t.node_count
            leaves = len([i for i in range(n) if t.children_left[i]== -1])
            return leaves
    
    def calculate(self):
        return [self.get_leaves_count()]
     

class TreeGridSearchCV():
    
    def __init__(self, estimator, param_grid, scoring='roc_auc', cv=3, shuffle=True,
                 refit=True, return_train_score=True, tree_calculation=None):
        self.model = estimator
        self.param_grid = param_grid
        
        self.n_splits = cv
        self.shuffle = shuffle
        
        self.refit = refit
        self.return_train_score = return_train_score
        
        self.tree_calculation = tree_calculation
        
        self.best_params_ = None
        self.best_estimator_ = None
        self.best_score_ = None
        
                
    def fit(self, features, target):
        
        params_scores_calcs = []
        
        # Find all combinations of parameters
        param_combinations = list(product(*self.param_grid.values()))
        for params in param_combinations:
            
            # Define a tree with each combination of parameters
            params_dict = dict(zip(self.param_grid.keys(), params))
            tree = DecisionTreeClassifier(random_state=42)
            tree.set_params(**params_dict)

            # Cross-validate
            kf = KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=42)
            train_scores, val_scores, quantities = [], [], []
            for train_idx, test_idx in kf.split(features):
                # Fit the tree to the training data
                features_train, target_train,  = features.iloc[train_idx], target.iloc[train_idx]
                features_test, target_test = features.iloc[test_idx], target.iloc[test_idx]
                tree.fit(X=features_train, y=target_train)
                
                # Score the train dataset and testing dataset (validation)
                train_pred = tree.predict_proba(X=features_train)[:, 1]
                train_score = roc_auc_score(y_score=train_pred, y_true=target_train)
                train_scores.append(train_score)
                
                test_pred = tree.predict_proba(X=features_test)[:, 1]
                val_score = roc_auc_score(y_score=test_pred, y_true=target_test)
                val_scores.append(val_score)
                
                
#                 # Calculate the extra quantities
#                 if self.tree_calculation is not None:
#                     self.tree_calculation.set_tree(tree)
#                     self.tree_calculation.calculate()
#                     quantities.append()
            
            # Keep calculations for median validation score
            cv_scores = np.transpose(np.array([train_scores, val_scores]))
            median_val_row = cv_scores[np.where(cv_scores[:, 1] == np.median(cv_scores[:, 1]))][0]
            median_val_row = np.concatenate((list(params_dict.values()), median_val_row))
            params_scores_calcs.append(median_val_row)
            
        # Get columns
        hyperparams_names = list(self.param_grid.keys())
        cols = hyperparams_names + ['train_score', 'val_score']
        if self.tree_calculation is not None:
            cols += self.tree_calculation.calcs_names
        
        # Stack the tree features for each hyperparameter combination vertically
        params_scores_calcs = np.vstack(params_scores_calcs)
        print(params_scores_calcs[:,-1])
        # Find the best hyperparameters that lead to best validation score
        self.best_score_ = np.max(params_scores_calcs[:,-1])
        best_params_scores_calcs = params_scores_calcs[np.where(params_scores_calcs[:,-1] == self.best_score_)]
        self.best_params_ = dict(zip(hyperparams_names, best_params_scores_calcs[0,: len(hyperparams_names)]))
        # Define and train the estimator with the best hyperparameters
        self.best_estimator = None
        
        # Return a DataFrame
        params_scores_calcs = dict(zip(cols, np.transpose(params_scores_calcs)))
        return pd.DataFrame(params_scores_calcs)

            


In [321]:
calcs = TreeCalculation()
tgsv = TreeGridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                        param_grid = {'max_depth': [2, 3, 4],
                                      'min_samples_leaf': [5, 25,],
                                      'max_features': [None, 'sqrt', 'log2', 0.5],
                                      'min_samples_split': [2, 100]},
                       )

In [322]:
%time
df = tgsv.fit(ld.features, ld.loan_status)
df

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs
[0.7934907970746963 0.7934907970746963 '0.7054200118722371'
 '0.7054200118722371' '0.7054200118722371' '0.7054200118722371'
 0.790508067656619 0.790508067656619 0.7934907970746963 0.7934907970746963
 '0.7054200118722371' '0.7054200118722371' '0.7054200118722371'
 '0.7054200118722371' 0.790508067656619 0.790508067656619
 0.8226324807258398 0.8226324807258398 '0.712454020088799'
 '0.712454020088799' '0.712454020088799' '0.712454020088799'
 0.8206359768428308 0.8206359768428308 0.8226324807258398
 0.8226324807258398 '0.712454020088799' '0.712454020088799'
 '0.712454020088799' '0.712454020088799' 0.8206359768428308
 0.8206359768428308 0.8537343331596956 0.8537343331596956
 '0.7706208411090099' '0.7706208411090099' '0.7706208411090099'
 '0.7706208411090099' 0.8493450567597886 0.8488833837889275
 0.8537343331596956 0.8537343331596956 '0.7706208411090099'
 '0.7706208411090099' '0.7706208411090099' '0.7706208411090099'
 0.84934505

TypeError: '>=' not supported between instances of 'float' and 'str'

In [None]:
tgsv.best_score_