# DECISION TREE CLASSIFIER - RANDOM SEARCH

In [1]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
from datetime import datetime
from time import time

### GLOBAL VARIABLES

In [2]:
# Directory prefix for the preprocessed .npy arrays loaded in the next section.
DATAPATH = '../data/processed/'
# Seed passed as random_state to the randomized search.
SEED = 47
# Number of parameter settings the randomized search samples (n_iter).
NITER = 100
# Number of cross-validation folds (cv).
CV = 3
# Scoring metric name handed to the search (scoring).
SCORE = 'roc_auc'
# When False, NaN entries in the training features are overwritten with -9999 before training.
usenull = True
# Value passed as n_jobs to the search; the search cell describes this as "use all available cores".
NJOBS = -1

### LOAD DATASET

In [5]:
# Load the training feature array stored as X_train.npy under DATAPATH.
train_features = np.load(DATAPATH+'X_train.npy')

In [6]:
# Load the training label array stored as y_train.npy under DATAPATH.
train_labels = np.load(DATAPATH+'y_train.npy')

#### Impute null values

In [7]:
# Replace NaN entries with the sentinel -9999 when NaNs must not reach the model.
# `not usenull` is the idiomatic truthiness test (PEP 8 discourages `== False`).
# The assignment mutates train_features in place, so re-running the cell is a no-op.
if not usenull:
    nan_mask = np.isnan(train_features)
    train_features[nan_mask] = -9999

### TRAIN MODEL

#### Set hyperparameters

In [36]:
# ======== General Parameters ======= #

# Candidate functions for measuring the quality of a split.
criterion = ['gini', 'entropy']

# Candidate strategies for choosing the split at each node.
splitter = ['best', 'random']

# Candidate maximum tree depths: 3 through 9 inclusive.
max_depth = list(range(3, 10))

# Candidate minimum sample counts required to split an internal node:
# a few small values, then 50, 100, ..., 500.
min_samples_split = [2, 5, 10, *range(50, 550, 50)]

# Candidate minimum sample counts required at a leaf node.
# A split is only considered if it leaves at least this many training samples
# in each of the left and right branches, which can smooth the model.
min_samples_leaf = [2, 5, 10, *range(50, 550, 50)]

# Class weighting mode given to the estimator (not part of the search grid).
class_weight = "balanced"

[Decision Tree params](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

In [46]:
# Assemble the search space: each estimator parameter name maps to the list
# of candidate values the randomized search samples from.
random_grid = dict(
    criterion=criterion,
    splitter=splitter,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [58]:
# Display the search space for a visual check.
# NOTE(review): the saved output under this cell lists keys such as
# 'learning_rate', 'gamma' and 'colsample_bytree', which are not in the grid
# built above -- it looks stale; re-run the cell to refresh it.
random_grid

{'learning_rate': [0.01],
 'min_child_weight': [1, 3, 5, 7, 9],
 'max_depth': [3, 5, 7, 9],
 'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
 'subsample': [0.6, 0.7, 0.8, 0.9],
 'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
 'reg_lambda': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05]}

#### Training

In [42]:
# Base estimator whose hyperparameters the random search will tune.
# random_state is pinned to SEED: the tree draws random feature permutations
# at each split (and the grid includes splitter='random'), so without a fixed
# seed repeated runs of the same search can produce different scores.
model = DecisionTreeClassifier(class_weight=class_weight, random_state=SEED)

In [49]:
# Random search of parameters, using CV-fold cross validation,
# search across NITER different combinations, and use all available cores.
# The search object is bound to `idt_rsearch`, the name the fit, results and
# best_* cells read; binding it to any other name leaves those cells with a
# NameError on a fresh kernel.
idt_rsearch = RandomizedSearchCV(
    estimator=model,
    param_distributions=random_grid,
    scoring=SCORE,
    n_iter=NITER,
    cv=CV,
    verbose=2,
    random_state=SEED,
    n_jobs=NJOBS,
)


In [51]:
# Run the search on the training data and report the wall-clock time it took.
start = time()
idt_rsearch.fit(train_features, train_labels)
elapsed_seconds = time() - start
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % (elapsed_seconds, NITER))

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.


KeyboardInterrupt: 

#### Saving results

In [60]:
# Wrap the search's cv_results_ in a DataFrame so it can be written to CSV in the next cell.
cv_results = pd.DataFrame(idt_rsearch.cv_results_)

In [None]:
# Write the search results to a date-stamped, semicolon-separated CSV under ../models/.
results_csv_path = '../models/rsearch_idt_classifier_d' + str(datetime.now().date()) + '.csv'
cv_results.to_csv(results_csv_path, sep=';', index=False)

#### Best estimator

In [59]:
# Display the best estimator found by the search.
# NOTE(review): the saved output under this cell is an AttributeError, and the
# fit cell's saved output ends in KeyboardInterrupt -- presumably the attribute
# only exists once fit() has completed; re-run after a full fit.
idt_rsearch.best_estimator_

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_estimator_'

#### Best parameter

In [None]:
# Display the hyperparameter combination selected by the search.
idt_rsearch.best_params_

#### Best Score

In [82]:
# Print the configured metric name next to the search's best_score_.
print(SCORE,' : ', idt_rsearch.best_score_)

roc_auc  :  0.9987598580610301


#### Saving best hyperparameters

In [77]:
# Persist the selected hyperparameters to a date-stamped .npy file under ../models/.
# NOTE(review): best_params_ is handed to np.save as-is; if it is a dict (as
# scikit-learn documents), NumPy stores it as a pickled object array, so reading
# it back would need np.load(..., allow_pickle=True).item() -- confirm with
# whatever consumes this file.
np.save('../models/idt_classifier_bestparams_d' + str(datetime.now().date()) + '.npy', idt_rsearch.best_params_)