In [None]:
# Load libraries
from itertools import product
from timeit import default_timer

import numpy as np;
import pandas as pd

from sklearn.model_selection import train_test_split

# Random Forest

## Intuition

- This type of model represents an ensemble (combination) of decision tree models.


- Trees are created with a random subset of the data.


- Each individual tree is trained independently of the others, so the trees can be trained in parallel.


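- Each tree makes its own prediction and the predictions are then aggregated: for classification the class with the most votes is selected, and for regression (the case in this notebook) the predictions of the trees are averaged.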


- Its goal is to combine complex (deep) trees with low bias and high variance (overfitting), and to reduce their variance by aggregating their predictions (voting or averaging).


- This type of ensemble method is called **bagging**.

![imagen.png](attachment:imagen.png)

## Key Hyperparameters

![imagen-4.png](attachment:imagen-4.png)

In [None]:
# [1] Import model
# The estimator class is imported under the generic alias `model_constructor`,
# which is the name the later cells use to build the model.
from sklearn.ensemble import RandomForestRegressor as model_constructor
# IPython help syntax: show the documentation of the aliased estimator.
?model_constructor

## Data

California housing is the new iris.

In [None]:
from sklearn.datasets import fetch_california_housing as load_data;

In [None]:
# Fetch the dataset once, then pull out the feature matrix and the target vector.
data = load_data()
X, y = data.data, data.target

In [None]:
# Fractions of the data assigned to train, validation and test, in that order.
perc_values = [0.7, 0.15, 0.15]

# Share of the data that is held out of training (validation + test).
holdout_fraction = perc_values[1] + perc_values[2]

# First split: train vs. everything that is held out.
X_train, X_valtest, y_train, y_valtest = train_test_split(
    X,
    y,
    test_size=holdout_fraction,
    random_state=1,
)

# Second split: divide the held-out part into validation and test. The test
# size is expressed relative to the held-out part, not to the full dataset.
X_val, X_test, y_val, y_test = train_test_split(
    X_valtest,
    y_valtest,
    test_size=perc_values[2] / holdout_fraction,
    random_state=1,
)

## Grid Search

Let's define the grid we will be using.

In [None]:
# Candidate values for the hyperparameters of the individual decision trees.
criterion_values = ['absolute_error', 'squared_error']
max_depth_values = [None, 6]
min_samples_split_values = [2, 5, 20]
min_samples_leaf_values = [1, 5, 20]
max_features_values = [None, 1, 2]

# Candidate values for the hyperparameter of the forest itself: number of trees.
n_estimators_values = [10, 100]

# Full grid: hyperparameter name -> list of candidate values.
params_grid = dict(
    criterion=criterion_values,
    max_depth=max_depth_values,
    min_samples_split=min_samples_split_values,
    min_samples_leaf=min_samples_leaf_values,
    max_features=max_features_values,
    n_estimators=n_estimators_values,
)

Get total number of combinations.

In [None]:
# Total number of hyperparameter combinations: the product of the number of
# candidate values of every entry in the grid. Looping over all the entries
# (instead of naming each key by hand) keeps the count correct when a
# hyperparameter is added to or removed from the grid.
n = 1
for candidate_values in params_grid.values():
    n *= len(candidate_values)

print(str(n)+ ' iterations of Random Forest')

216 iterations of Random Forest


We will be using **MAPE** as evaluation metric.

In [1]:
# [2] Import metric
# The metric is imported under the generic alias `metric`, which is the name
# the later cells call to score the predictions.
from sklearn.metrics import mean_absolute_percentage_error as metric

### Using a Fixed Validation Set

In [None]:
# Exhaustive grid search scored on the fixed validation set.
#
# Hyperparameters are listed from the slowest-varying to the fastest-varying
# one; `product` walks the combinations in the same order that nested loops
# written in this order would.
param_names = ['criterion',
               'max_depth',
               'min_samples_split',
               'min_samples_leaf',
               'max_features',
               'n_estimators']
param_combinations = product(*(params_grid[name] for name in param_names))

# One row per combination. Rows are collected in a plain list and turned into
# a DataFrame once, after the loop, instead of enlarging the DataFrame on
# every iteration.
rows = []

for num_iter, combination in enumerate(param_combinations, start=1):
    (criterion, max_depth, min_samples_split,
     min_samples_leaf, max_features, n_estimators) = combination

    # Start time
    start_time = default_timer()

    # Print trace
    print('Iteracion = ' + str(num_iter))

    # [3] Define model
    model = model_constructor(criterion = criterion,
                              max_depth = max_depth,
                              min_samples_split = min_samples_split,
                              min_samples_leaf = min_samples_leaf,
                              max_features = max_features,
                              n_estimators = n_estimators,
                              random_state = 0)

    # [4] Train model
    model.fit(X_train, y_train)

    # [5] Predict on the data the model was fitted on and on the validation set
    pred_train = model.predict(X_train)
    pred_val = model.predict(X_val)

    # [6] Compute metric
    metric_train = metric(y_train, pred_train)
    metric_val = metric(y_val, pred_val)

    # Computational time (covers model definition, fit, predict and scoring)
    time = default_timer() - start_time

    # print error
    print('MAPE train = %.2f - MAPE validation = %.2f. Time spend = %.2f.'
          % (metric_train, metric_val, time))

    # Save iteration results
    rows.append([criterion,
                 max_depth,
                 min_samples_split,
                 min_samples_leaf,
                 max_features,
                 n_estimators,
                 metric_train,
                 metric_val,
                 time])

# Build the results table in one go. The index starts at 1 so that each row is
# labelled with its iteration number. dtype=object stores the hyperparameter
# values exactly as they were given (None stays None rather than becoming NaN),
# which matters because the winning row is later passed back to the model
# constructor.
grid_results = pd.DataFrame(rows,
                            columns = param_names + ['mape_train', 'mape_val', 'time'],
                            index = pd.RangeIndex(1, len(rows) + 1),
                            dtype = object)

# The measured columns are purely numeric, so give them a float dtype.
grid_results = grid_results.astype({'mape_train': float,
                                    'mape_val': float,
                                    'time': float})

print('Grid Search Total Computational Time: ', np.sum(grid_results.time.values))

Iteracion = 1
MAPE train = 0.05 - MAPE validation = 0.12. Time spend = 0.10.
Iteracion = 2
MAPE train = 0.04 - MAPE validation = 0.11. Time spend = 0.90.
Iteracion = 3
MAPE train = 0.07 - MAPE validation = 0.13. Time spend = 0.03.
Iteracion = 4
MAPE train = 0.06 - MAPE validation = 0.13. Time spend = 0.27.
Iteracion = 5
MAPE train = 0.06 - MAPE validation = 0.14. Time spend = 0.03.
Iteracion = 6
MAPE train = 0.05 - MAPE validation = 0.12. Time spend = 0.30.
Iteracion = 7
MAPE train = 0.09 - MAPE validation = 0.12. Time spend = 0.06.
Iteracion = 8
MAPE train = 0.09 - MAPE validation = 0.12. Time spend = 0.59.
Iteracion = 9
MAPE train = 0.12 - MAPE validation = 0.15. Time spend = 0.02.
Iteracion = 10
MAPE train = 0.13 - MAPE validation = 0.15. Time spend = 0.20.
Iteracion = 11
MAPE train = 0.12 - MAPE validation = 0.14. Time spend = 0.03.
Iteracion = 12
MAPE train = 0.11 - MAPE validation = 0.14. Time spend = 0.21.
Iteracion = 13
MAPE train = 0.14 - MAPE validation = 0.15. Time spend = 0

Iteracion = 107
MAPE train = 0.16 - MAPE validation = 0.18. Time spend = 0.02.
Iteracion = 108
MAPE train = 0.16 - MAPE validation = 0.17. Time spend = 0.24.
Iteracion = 109
MAPE train = 0.05 - MAPE validation = 0.12. Time spend = 0.03.
Iteracion = 110
MAPE train = 0.04 - MAPE validation = 0.11. Time spend = 0.29.
Iteracion = 111
MAPE train = 0.05 - MAPE validation = 0.13. Time spend = 0.02.
Iteracion = 112
MAPE train = 0.05 - MAPE validation = 0.13. Time spend = 0.16.
Iteracion = 113
MAPE train = 0.06 - MAPE validation = 0.13. Time spend = 0.02.
Iteracion = 114
MAPE train = 0.04 - MAPE validation = 0.12. Time spend = 0.20.
Iteracion = 115
MAPE train = 0.09 - MAPE validation = 0.13. Time spend = 0.04.
Iteracion = 116
MAPE train = 0.08 - MAPE validation = 0.13. Time spend = 0.27.
Iteracion = 117
MAPE train = 0.13 - MAPE validation = 0.16. Time spend = 0.02.
Iteracion = 118
MAPE train = 0.13 - MAPE validation = 0.15. Time spend = 0.14.
Iteracion = 119
MAPE train = 0.12 - MAPE validation 

Iteracion = 213
MAPE train = 0.21 - MAPE validation = 0.20. Time spend = 0.01.
Iteracion = 214
MAPE train = 0.20 - MAPE validation = 0.20. Time spend = 0.13.
Iteracion = 215
MAPE train = 0.17 - MAPE validation = 0.18. Time spend = 0.01.
Iteracion = 216
MAPE train = 0.17 - MAPE validation = 0.18. Time spend = 0.12.
Grid Search Total Computational Time:  33.81021060000006


Check results

In [None]:
# Display the grid-search results table (one row per hyperparameter combination).
grid_results

Unnamed: 0,criterion,max_depth,min_samples_split,min_samples_leaf,max_features,n_estimators,mape_train,mape_val,time
1,mae,,2,1,,10,0.049882,0.119276,0.095321
2,mae,,2,1,,100,0.044307,0.113089,0.898907
3,mae,,2,1,1,10,0.067915,0.134735,0.032469
4,mae,,2,1,1,100,0.055438,0.126676,0.265638
5,mae,,2,1,2,10,0.055246,0.138674,0.034067
...,...,...,...,...,...,...,...,...,...
212,mse,6,20,20,,100,0.132810,0.152078,0.162219
213,mse,6,20,20,1,10,0.207096,0.203152,0.014946
214,mse,6,20,20,1,100,0.199995,0.197254,0.125953
215,mse,6,20,20,2,10,0.168594,0.182117,0.013165


**Which one is the winner?**

In [None]:
# Rank the candidates: lowest validation MAPE first, then lowest training MAPE,
# then shortest run time as the last tie-breaker (all three ascending).
ranking_keys = ['mape_val', 'mape_train', 'time']
grid_results = grid_results.sort_values(by=ranking_keys, ascending=True)
grid_results

Unnamed: 0,criterion,max_depth,min_samples_split,min_samples_leaf,max_features,n_estimators,mape_train,mape_val,time
2,mae,,2,1,,100,0.044307,0.113089,0.898907
110,mse,,2,1,,100,0.042537,0.114179,0.293225
128,mse,,5,1,,100,0.050316,0.115209,0.273475
20,mae,,5,1,,100,0.051764,0.115387,0.867990
73,mae,6,5,1,,10,0.079233,0.116318,0.094820
...,...,...,...,...,...,...,...,...,...
159,mse,,20,20,1,10,0.207096,0.203152,0.012487
177,mse,6,2,20,1,10,0.207096,0.203152,0.014231
213,mse,6,20,20,1,10,0.207096,0.203152,0.014946
141,mse,,5,20,1,10,0.207096,0.203152,0.015063


In [None]:
# The table is sorted best-first, so its first row is the winning combination.
best_model = grid_results.iloc[0]
best_model

criterion                  mae
max_depth                 None
min_samples_split            2
min_samples_leaf             1
max_features              None
n_estimators               100
mape_train           0.0443072
mape_val              0.113089
time                  0.898907
Name: 2, dtype: object

### Using Cross-Validation

In [None]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

In [None]:
# Define grid
grid_cv = GridSearchCV(model_constructor(),
                     param_grid=params_grid,
                     n_jobs=-1, ## Paralellization!
                     cv = 5, # Number of folds
                     scoring = make_scorer(metric, greater_is_better=False)) # Important!!!

In this case, we do not need a fixed validation set, so we will combine train and validation.

In [None]:
# Run grid
start_time = default_timer()

grid_cv.fit(np.concatenate((X_train, X_val), axis = 0), np.concatenate((y_train, y_val), axis = 0))

stop_time = default_timer()
print('CV Grid Search Total Computational Time: : ', stop_time - start_time)

CV Grid Search Total Computational Time: :  58.770217100000025


In [None]:
# Hyperparameter combination selected by the cross-validated grid search.
grid_cv.best_params_

{'criterion': 'mae',
 'max_depth': None,
 'max_features': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

In [None]:
# The scorer was built with greater_is_better=False, so the stored best score
# is the negated MAPE; negate it again to read it as a plain MAPE.
- grid_cv.best_score_

0.10113023230048757

## Final Model

Validation has served its purpose, let's combine it with train to get more training data.

In [None]:
# Report the sizes of the training data before the merge.
old_data_shape, old_target_shape = X_train.shape, y_train.shape
print('Old train data size = ' + str(old_data_shape))
print('Old train target size = ' + str(old_target_shape))

# Combine train and validation (validation rows are appended after the train rows).
# NOTE: X_train and y_train are overwritten with the merged arrays, so running
# this cell a second time appends the validation rows once more.
X_train, y_train = np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val])

# Report the sizes after the merge.
new_data_shape, new_target_shape = X_train.shape, y_train.shape
print('New train data size = ' + str(new_data_shape))
print('New train target size = ' + str(new_target_shape))

Old train data size = (430, 13)
Old train target size = (430,)
New train data size = (506, 13)
New train target size = (506,)


In [None]:
# [3] Define model
# Collect the hyperparameters of the winning grid row and rebuild the
# estimator from them.
tuned_params = {name: best_model[name]
                for name in ('criterion',
                             'max_depth',
                             'min_samples_split',
                             'min_samples_leaf',
                             'max_features',
                             'n_estimators')}
# Use the same random_state as in the grid search.
model = model_constructor(random_state = 0, **tuned_params)

# [4] Train model
model.fit(X_train, y_train)

# [5] Predict on the training data and on the held-out test set
pred_train, pred_test = model.predict(X_train), model.predict(X_test)

# [6] Compute metric
metric_train, metric_test = metric(y_train, pred_train), metric(y_test, pred_test)



In [None]:
# print error
print('MAPE train = %.2f - MAPE test = %.2f'
      % (metric_train, metric_test))

MAPE train = 0.03 - MAPE test = 0.12
