# Imports

In [7]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz

from sklearn.metrics import roc_auc_score


import data_loader as dl
import utils

# Load Data

In [2]:
# Build the project DataLoader from the loan CSV and two column lists.
# NOTE(review): presumably the first list names the feature columns and the
# second the target column -- confirm against data_loader.DataLoader.
ld = dl.DataLoader(
    'cr_loan_nout_nmiss.csv',
    [
        'person_income',
        'loan_int_rate',
        'loan_percent_income',
        'loan_intent',
    ],
    ['loan_status'],
)

# Seeded split. Later cells read ld.features_train, ld.loan_status_train,
# ld.features_test and ld.loan_status_test, so this call is presumably what
# populates them (test_size=0.4 looks like the held-out share -- confirm).
ld.train_test_split(
    test_size=0.4,
    random_state=154,
)

# Train the decision tree

In [3]:
# Baseline decision tree with hand-picked hyperparameters.
clf_tree = DecisionTreeClassifier(
    # Growth limits: at most 5 levels, and a node needs at least 1000 samples
    # before it may be split.
    max_depth=5,
    min_samples_split=1000,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.0,
    # Split selection: best split by Gini impurity.
    criterion='gini',
    splitter='best',
    # Fixed seed so repeated runs build the same tree.
    random_state=42,
)

# Fit on the training split; as the cell's last expression, the fitted
# estimator is what the notebook displays.
clf_tree.fit(X=ld.features_train, y=ld.loan_status_train)

In [4]:
# Score the test split with the baseline tree. Column 1 of predict_proba is the
# probability of the second entry in clf_tree.classes_ (presumably
# loan_status == 1, i.e. a default -- confirm the label encoding).
default_prob = clf_tree.predict_proba(ld.features_test)[:, 1]

# ROC AUC of those probabilities against the test labels; displayed because it
# is the last expression in the cell.
roc_auc_score(ld.loan_status_test, default_prob)

0.8642154498826626

# Hyperparameter tuning with Grid Search Cross Validation

In [36]:
# Base estimator for the grid search: a DecisionTreeClassifier with only the
# random seed fixed. The other hyperparameters are left at their defaults here
# and are varied through param_grid by GridSearchCV.
clf = DecisionTreeClassifier(random_state=42)

In [48]:
# Candidate hyperparameter values for the search. GridSearchCV evaluates every
# combination: 10 * 4 * 4 * 4 = 640 settings.
# NOTE(review): in the saved notebook this cell's execution count is later than
# the fit cell's -- Restart & Run All to confirm the reported best parameters
# were produced with exactly this grid.
param_grid = dict(
    # Maximum depth of the tree.
    max_depth=[2, 3, 4, 5, 6, 7, 8, 10, 12, 15],
    # Minimum number of samples required in each leaf.
    min_samples_leaf=[5, 25, 50, 100],
    # Features examined per split: None = all of them, 'sqrt' / 'log2' = that
    # function of the feature count, 0.5 = half of the features.
    max_features=[None, 'sqrt', 'log2', 0.5],
    # Minimum number of samples a node needs before it may be split.
    # NOTE(review): 2 can never bind when every leaf must hold >= 5 samples.
    min_samples_split=[2, 100, 1000, 5000],
)

In [41]:
# Exhaustive, cross-validated search over param_grid using the base tree.
grid_search = GridSearchCV(
    # Estimator to tune and the candidate values to try.
    estimator=clf,
    param_grid=param_grid,
    # Candidates are ranked by mean ROC AUC over 5 cross-validation folds.
    scoring='roc_auc',
    cv=5,
    # -1 runs the fits in parallel on all available processors.
    n_jobs=-1,
    # Refit the best candidate on the whole training data after the search,
    # which makes best_estimator_ available.
    refit=True,
    # Also record training-fold scores in cv_results_ (adds compute; useful for
    # spotting over-fitting).
    return_train_score=True,
)

In [42]:
# Run the search on the training split only, leaving the test split unseen.
# As the cell's last expression, the fitted search object is displayed.
grid_search.fit(X=ld.features_train, y=ld.loan_status_train)

In [46]:
# Keep handles on the winning model and its score for the following cells.
best_estimator = grid_search.best_estimator_  # exists because refit=True
best_score = grid_search.best_score_          # mean cross-validated ROC AUC

# Report the winning hyperparameter combination.
print(grid_search.best_params_)

{'max_depth': 15, 'max_features': None, 'min_samples_leaf': 100, 'min_samples_split': 2}


In [47]:
# Mean cross-validated ROC AUC of the best grid-search candidate. It is computed
# on folds of the training split, so it is not directly comparable to a ROC AUC
# measured on the held-out test split.
best_score

0.8837158224347983