In [1]:
import pandas as pd
import numpy as np

In [2]:
# import dataset
import openml

dataset = openml.datasets.get_dataset(487)
X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)



In [3]:
X.shape

(30, 40)

In [4]:
# split dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [5]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

In [6]:
# create search grid for RandomizedSearch

n_estimators = [int(x) for x in np.linspace(start=100, stop=1500, num=15)]

criterion = ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']

max_depth = [int(x) for x in np.linspace(10, 100, num=10)]
max_depth.append(None)

min_samples_split = [2, 5, 10]

min_samples_leaf = [1, 2, 4]

max_features = [int(x) for x in np.linspace(start=1, stop=13, num=4)]

random_grid = {'n_estimators': n_estimators,
               'criterion': criterion,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'max_features': max_features}

pprint(random_grid)

{'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': [1, 5, 9, 13],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100,
                  200,
                  300,
                  400,
                  500,
                  600,
                  700,
                  800,
                  900,
                  1000,
                  1100,
                  1200,
                  1300,
                  1400,
                  1500]}


In [7]:
rf = RandomForestRegressor()
rf_ran = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter=200, cv=3, verbose=1, random_state=42)
rf_ran.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


In [8]:
# show best param combination
rf_ran.best_params_

{'n_estimators': 300,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 13,
 'max_depth': None,
 'criterion': 'absolute_error'}

In [9]:
def get_accuracy(model, X_test, y_test):
    predictions = model.predict(X_test)
    errors = abs(predictions - y_test)
    avg_perf_error = 100 * np.mean(errors / y_test)
    accuracy = 100 - avg_perf_error

    print('Average Error: {:0.4f}'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy

In [10]:
# check performance on basic model against tuned model
orig = RandomForestRegressor(random_state = 42)
orig.fit(X_train, y_train)
orig_accuracy = get_accuracy(orig, X_test, y_test)

tuned = rf_ran.best_estimator_
tuned_accuracy = get_accuracy(tuned, X_test, y_test)

print('Improvement: {:0.2f}%.'.format( 100 * (tuned_accuracy - orig_accuracy) / orig_accuracy))

Average Error: 0.7470
Accuracy = 94.92%.
Average Error: 0.7370
Accuracy = 95.01%.
Improvement: 0.09%.


###### Fine tuning with GridSearchCV:

In [11]:
from sklearn.model_selection import GridSearchCV

grid = {'n_estimators': [250, 300, 350],
        'criterion': ['absolute_error'],
        'max_depth': [None],
        'min_samples_split': [1, 2, 3],
        'min_samples_leaf': [1, 2, 3, 4, 5],
        'max_features': [10, 13, 15]}

rf = RandomForestRegressor()
rf_grid = GridSearchCV(estimator=rf, param_grid=grid, cv=3, verbose=1)
rf_grid.fit(X_train, y_train)

Fitting 3 folds for each of 135 candidates, totalling 405 fits


In [12]:
rf_grid.best_params_

{'criterion': 'absolute_error',
 'max_depth': None,
 'max_features': 13,
 'min_samples_leaf': 4,
 'min_samples_split': 3,
 'n_estimators': 250}

In [17]:
# check performance on grid-tuned model against random-tuned model
best_grid_rf = rf_grid.best_estimator_
grid_accuracy = get_accuracy(best_grid_rf, X_test, y_test)

print('Improvement: {:0.2f}%.'.format( 100 * (grid_accuracy - tuned_accuracy) / tuned_accuracy))

Average Error: 0.7597
Accuracy = 94.86%.
Improvement: -0.16%.


In [None]:
## -0.16% increase in error - will therefore use best from random search:
# {'n_estimators': 300,
#  'min_samples_split': 2,
#  'min_samples_leaf': 4,
#  'max_features': 13,
#  'max_depth': None,
#  'criterion': 'absolute_error'}