# RANDOM FOREST -  RANDOM SEARCH

In [1]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from time import time
import pandas as pd
from datetime import datetime

### GLOBAL VARIABLES

In [2]:
DATAPATH = 'data/train_test/'
SEED = 47
NITER = 100
CV = 7
SCORE = 'balanced_accuracy'

### LOAD DATASET

In [3]:
train_features = np.load(DATAPATH+'X_features_train.npy')

In [4]:
train_labels = np.load(DATAPATH+'y_train.npy')

#### Input Null values

In [5]:
train_features[np.isnan(train_features)] = -9999

### TRAIN MODEL

#### Set hyperparameters

In [6]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in range(3, 50, 2)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10] + [i for i in range(50,550,50)]

# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 5, 10] + [i for i in range(50,550,50)]

# Method of selecting samples for training each tree
bootstrap = [True, False]

[random forest params](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

In [7]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [8]:
random_grid


{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [3,
  5,
  7,
  9,
  11,
  13,
  15,
  17,
  19,
  21,
  23,
  25,
  27,
  29,
  31,
  33,
  35,
  37,
  39,
  41,
  43,
  45,
  47,
  49,
  None],
 'min_samples_split': [2,
  5,
  10,
  50,
  100,
  150,
  200,
  250,
  300,
  350,
  400,
  450,
  500],
 'min_samples_leaf': [2,
  5,
  10,
  50,
  100,
  150,
  200,
  250,
  300,
  350,
  400,
  450,
  500],
 'bootstrap': [True, False]}

In [9]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()

In [10]:
# Random search of parameters, using CV fold cross validation, 
# search across NITER different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, scoring=SCORE, n_iter = NITER, cv = CV, verbose=2, random_state=SEED, n_jobs = -1)# Fit the random search model


In [11]:
start = time()
rf_random.fit(train_features, train_labels)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), NITER))

Fitting 7 folds for each of 100 candidates, totalling 700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:  7.9min finished


RandomizedSearchCV took 503.30 seconds for 100 candidates parameter settings.


In [12]:
cv_results = pd.DataFrame(rf_random.cv_results_)



#### Saving results

In [13]:
cv_results.to_csv('output/results/rf_randomsearchcv_d' + str(datetime.now().date()) + '.csv',sep=';',index=False)

#### Best estimator

In [14]:
rf_random.best_estimator_

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=11, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=1800, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

#### Best parameter

In [15]:
rf_random.best_params_

{'n_estimators': 1800,
 'min_samples_split': 5,
 'min_samples_leaf': 5,
 'max_features': 'auto',
 'max_depth': 11,
 'bootstrap': False}

#### Best Score

In [16]:
print(SCORE,' : ', rf_random.best_score_)

balanced_accuracy  :  0.7412680374983612


In [17]:
np.save('output/results/rf_rscv_best_params_d' + str(datetime.now().date()) + '.npy', rf_random.best_params_)

In [18]:
np.save('output/results/rf_rscv_best_estimator_d' + str(datetime.now().date()) + '.npy', rf_random.best_estimator_)