In [1]:
import pandas as pd
import numpy as np

In [2]:
# import dataset
import openml

dataset = openml.datasets.get_dataset(44061)
X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)



In [3]:
print(X.shape)
print(X.head())

(4209, 359)
  X3 X4  X6 X10 X12 X13 X14 X15 X16 X17  ... X375 X376 X377 X378 X379 X380  \
0  0  3   9   0   0   1   0   0   0   0  ...    0    0    1    0    0    0   
1  4  3  11   0   0   0   0   0   0   0  ...    1    0    0    0    0    0   
2  2  3   9   0   0   0   0   0   0   1  ...    0    0    0    0    0    0   
3  5  3  11   0   0   0   0   0   0   0  ...    0    0    0    0    0    0   
4  5  3   3   0   0   0   0   0   0   0  ...    0    0    0    0    0    0   

  X382 X383 X384 X385  
0    0    0    0    0  
1    0    0    0    0  
2    1    0    0    0  
3    0    0    0    0  
4    0    0    0    0  

[5 rows x 359 columns]


In [4]:
columns_to_encode = ['X3', 'X4', 'X6']
X = pd.get_dummies(X, columns=columns_to_encode)

# Display the one-hot encoded DataFrame
pd.set_option('display.max_columns', None)
print(X.head())
pd.reset_option('display.max_columns')

  X10 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X26 X27 X28 X29 X30  \
0   0   0   1   0   0   0   0   1   0   0   1   0   0   0   0   0   0   0   0   
1   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0   
2   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   1   1   1   0   
3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   1   1   0   
4   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   1   1   0   

  X31 X32 X33 X34 X35 X36 X37 X38 X39 X40 X41 X42 X43 X44 X45 X46 X47 X48 X49  \
0   1   0   0   0   1   0   1   0   0   0   0   0   0   0   0   1   0   0   0   
1   1   0   0   0   1   0   1   0   0   0   0   0   0   0   0   0   0   0   0   
2   1   0   0   0   1   0   1   0   0   0   0   0   1   0   0   1   0   0   0   
3   1   0   0   0   1   0   1   0   0   0   0   0   1   0   0   1   0   0   0   
4   1   0   0   0   1   0   1   0   0   0   0   0   1   0   0   1   0   0   0   

  X50 X51 X52 X53 X54 X55 

In [5]:
# split dataset, split again to reduce number of samples used in tuning
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_values = X_train.head(800)
y_values = y_train.head(800)
print(X.shape)

(4209, 379)


In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

In [7]:
# create search grid for RandomizedSearch

n_estimators = [int(x) for x in np.linspace(start=100, stop=1500, num=15)]

criterion = ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']

max_depth = [int(x) for x in np.linspace(50, 500, num=10)]
max_depth.append(None)

min_samples_split = [10, 20, 30]

min_samples_leaf = [5, 10, 15]

max_features = [int(x) for x in np.linspace(start=1, stop=40, num=4)]

random_grid = {'n_estimators': n_estimators,
               'criterion': criterion,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'max_features': max_features}

pprint(random_grid)

{'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
 'max_depth': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500, None],
 'max_features': [1, 14, 27, 40],
 'min_samples_leaf': [5, 10, 15],
 'min_samples_split': [10, 20, 30],
 'n_estimators': [100,
                  200,
                  300,
                  400,
                  500,
                  600,
                  700,
                  800,
                  900,
                  1000,
                  1100,
                  1200,
                  1300,
                  1400,
                  1500]}


In [8]:
rf = RandomForestRegressor()
rf_ran = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter=150, cv=3, verbose=1, random_state=42, n_jobs=-1)
rf_ran.fit(X_values, y_values)

Fitting 3 folds for each of 150 candidates, totalling 450 fits


In [10]:
# show best param combination
rf_ran.best_params_

{'n_estimators': 600,
 'min_samples_split': 30,
 'min_samples_leaf': 5,
 'max_features': 40,
 'max_depth': 350,
 'criterion': 'poisson'}

In [11]:
def get_accuracy(model, X_test, y_test):
    predictions = model.predict(X_test)
    errors = abs(predictions - y_test)
    avg_perf_error = 100 * np.mean(errors / y_test)
    accuracy = 100 - avg_perf_error

    print('Average Error: {:0.4f}'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy

In [13]:
# check performance on basic model against tuned model
orig = RandomForestRegressor(random_state = 42)
orig.fit(X_values, y_values)
orig_accuracy = get_accuracy(orig, X_test, y_test)

tuned = rf_ran.best_estimator_
tuned_accuracy = get_accuracy(tuned, X_test, y_test)

print('Improvement: {:0.2f}%.'.format( 100 * (tuned_accuracy - orig_accuracy) / orig_accuracy))

Average Error: 6.5237
Accuracy = 93.85%.
Average Error: 5.7524
Accuracy = 94.62%.
Improvement: 0.82%.


###### Fine tuning with GridSearchCV:

In [14]:
from sklearn.model_selection import GridSearchCV

grid = {'n_estimators': [500, 600, 700],
        'criterion': ['poisson'],
        'max_depth': [300, 350, 400, None],
        'min_samples_split': [30],
        'min_samples_leaf': [3, 5, 7],
        'max_features': [30, 40, 50]}

rf = RandomForestRegressor()
rf_grid = GridSearchCV(estimator=rf, param_grid=grid, cv=3, verbose=1)
rf_grid.fit(X_values, y_values)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [15]:
rf_grid.best_params_

{'criterion': 'poisson',
 'max_depth': None,
 'max_features': 50,
 'min_samples_leaf': 5,
 'min_samples_split': 30,
 'n_estimators': 600}

In [16]:
# check performance on grid-tuned model against random-tuned model
best_grid_rf = rf_ran.best_estimator_
grid_accuracy = get_accuracy(best_grid_rf, X_test, y_test)

print('Improvement: {:0.2f}%.'.format( 100 * (grid_accuracy - tuned_accuracy) / tuned_accuracy))

Average Error: 5.7524
Accuracy = 94.62%.
Improvement: 0.00%.


In [None]:
# no significant improvement: params to be used:
# {'criterion': 'poisson',
#  'max_depth': None,
#  'max_features': 50,
#  'min_samples_leaf': 5,
#  'min_samples_split': 30,
#  'n_estimators': 600}