In [26]:
import pandas as pd
import numpy as np

In [27]:
# import dataset
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]

In [28]:
# split dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=7)

In [29]:
# view params
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

pprint(RandomForestRegressor().get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [36]:
# create search grid for RandomizedSearch

n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]

criterion = ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']

max_depth = [int(x) for x in np.linspace(10, 100, num=10)]
max_depth.append(None)

min_samples_split = [2, 5, 10]

min_samples_leaf = [1, 2, 4]

max_features = [int(x) for x in np.linspace(start=1, stop=13, num=4)]

random_grid = {'n_estimators': n_estimators,
               'criterion': criterion,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'max_features': max_features}

pprint(random_grid)

{'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': [1, 5, 9, 13],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100,
                  200,
                  300,
                  400,
                  500,
                  600,
                  700,
                  800,
                  900,
                  1000,
                  1100,
                  1200,
                  1300,
                  1400,
                  1500,
                  1600,
                  1700,
                  1800,
                  1900,
                  2000]}


In [37]:
rf = RandomForestRegressor()
rf_ran = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter=200, cv=3, verbose=1, random_state=42)
rf_ran.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


In [38]:
# show best param combination
rf_ran.best_params_

{'n_estimators': 1100,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 5,
 'max_depth': 10,
 'criterion': 'absolute_error'}

In [39]:
def get_accuracy(model, X_test, y_test):
    predictions = model.predict(X_test)
    errors = abs(predictions - y_test)
    avg_perf_error = 100 * np.mean(errors / y_test)
    accuracy = 100 - avg_perf_error

    print('Average Error: {:0.4f}'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy

In [40]:
# check performance on basic model against tuned model
orig = RandomForestRegressor(random_state = 42)
orig.fit(X_train, y_train)
orig_accuracy = get_accuracy(orig, X_test, y_test)

tuned = rf_ran.best_estimator_
tuned_accuracy = get_accuracy(tuned, X_test, y_test)

print('Improvement: {:0.2f}%.'.format( 100 * (tuned_accuracy - orig_accuracy) / orig_accuracy))

Average Error: 2.3442
Accuracy = 87.26%.
Average Error: 2.3298
Accuracy = 87.89%.
Improvement: 0.72%.


###### Fine tuning with GridSearchCV:

In [42]:
from sklearn.model_selection import GridSearchCV

grid = {'n_estimators': [1000, 1100, 1200],
        'criterion': ['absolute_error'],
        'max_depth': [5, 10, 15],
        'min_samples_split': [1, 2, 5, 7],
        'min_samples_leaf': [1, 2, 3, 4, 5],
        'max_features': [3, 5, 7]}

rf = RandomForestRegressor()
rf_grid = GridSearchCV(estimator=rf, param_grid=grid, cv=3, verbose=1)
rf_grid.fit(X_train, y_train)

Fitting 3 folds for each of 540 candidates, totalling 1620 fits
[CV] END criterion=absolute_error, max_depth=5, max_features=3, min_samples_leaf=1, min_samples_split=1, n_estimators=1000; total time=   2.4s
[CV] END criterion=absolute_error, max_depth=5, max_features=3, min_samples_leaf=1, min_samples_split=1, n_estimators=1000; total time=   2.0s
[CV] END criterion=absolute_error, max_depth=5, max_features=3, min_samples_leaf=1, min_samples_split=1, n_estimators=1000; total time=   1.9s
[CV] END criterion=absolute_error, max_depth=5, max_features=3, min_samples_leaf=1, min_samples_split=1, n_estimators=1100; total time=   2.2s
[CV] END criterion=absolute_error, max_depth=5, max_features=3, min_samples_leaf=1, min_samples_split=1, n_estimators=1100; total time=   2.2s
[CV] END criterion=absolute_error, max_depth=5, max_features=3, min_samples_leaf=1, min_samples_split=1, n_estimators=1100; total time=   2.2s
[CV] END criterion=absolute_error, max_depth=5, max_features=3, min_samples_le

In [43]:
rf_grid.best_params_

{'criterion': 'absolute_error',
 'max_depth': 15,
 'max_features': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 1,
 'n_estimators': 1100}

In [44]:
# check performance on grid-tuned model against random-tuned model
best_grid_rf = rf_ran.best_estimator_
grid_accuracy = get_accuracy(best_grid_rf, X_test, y_test)

print('Improvement: {:0.2f}%.'.format( 100 * (grid_accuracy - tuned_accuracy) / tuned_accuracy))

Average Error: 2.3298
Accuracy = 87.89%.
Improvement: 0.00%.
