In [1]:
import pandas as pd
import numpy as np
import hyperopt
import os
import sys
import csv

from time import time

from hyperopt import hp, fmin, tpe
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import cross_val_score

### Loading data

In [2]:
# Read the training table from a CSV path relative to the working directory.
df = pd.read_csv('data/manipulated_train.csv')

In [3]:
# Regression target: the sale price column.
y = df['SalePrice']
# Feature matrix: every column of the table except the target.
X = df.drop('SalePrice', axis=1)

In [4]:
# Sanity check: (rows, feature columns) after removing the target.
X.shape

(1460, 79)

### Setting parameters

In [5]:
# Experiment settings shared by the cells below.
# Number of evaluations hyperopt.fmin() is allowed to run.
max_evals = 200
# Trial counter; incremented by run_wrapper() on every evaluation.
run_counter = 0
# Passed as `cv` to cross_val_score() in run_test().
num_cv = 10
# Passed as `n_jobs` to cross_val_score() in run_test().
n_jobs = -1
# CSV file that receives one row per trial.
# NOTE: open() does not expand '~' by itself; the path has to go through
# os.path.expanduser() before it is opened.
output_log_file = '~/temp/hyperopt-ada.csv'

In [6]:
# Build an AdaBoostRegressor with default arguments only to list its parameters.
ada = AdaBoostRegressor()
ada.get_params()

{'base_estimator': None,
 'learning_rate': 1.0,
 'loss': 'linear',
 'n_estimators': 50,
 'random_state': None}

In [7]:
# Build a DecisionTreeRegressor with default arguments only to list its parameters.
tree = DecisionTreeRegressor()
tree.get_params()

{'criterion': 'mse',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_split': 1e-07,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': None,
 'splitter': 'best'}

In [8]:
# Hyperparameter search space, in the order run_test() unpacks it:
# (max_depth, max_features, min_samples_leaf).
# All three are sampled as continuous values. run_test() truncates max_depth
# and min_samples_leaf with int(), so their upper bounds (30 and 10) are
# effectively never reached after truncation.
space = (
    hp.uniform('max_depth', 5, 30),
    hp.uniform('max_features', 0.1, 1),
    hp.uniform('min_samples_leaf', 1, 10)
)

### Function run_wrapper

A function for counting and logging experiments; it is the objective called by hyperopt.fmin(). It calls run_test(), the function that actually executes the experiment for the given parameters.

In [9]:
def run_wrapper(params):
    """Objective handed to hyperopt.fmin(): run one trial, print it and log it.

    Increments the module-level ``run_counter``, delegates the evaluation to
    ``run_test(params)``, prints the loss and the elapsed wall-clock time, and
    appends the row ``[loss] + list(params)`` to the CSV log through the
    module-level ``writer``. The underlying file ``o_f`` is flushed after
    every row so the log stays usable if the search is interrupted.

    Parameters
    ----------
    params : sequence
        The sampled point; run_test() unpacks it as
        (max_depth, max_features, min_samples_leaf).

    Returns
    -------
    The value returned by run_test(), i.e. the quantity fmin() minimizes.
    """
    # Only run_counter is rebound here; writer and o_f are merely read.
    global run_counter

    run_counter += 1
    # Single-argument print(...) calls produce the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("==============================")
    print("run {}".format(run_counter))

    s = time()
    r2score = run_test(params)

    print("")
    # {!s} forces str(), which is what the print statement applied to the value.
    print("1 - R2 score: {!s}".format(r2score))
    print("elapsed: {}s \n".format(int(round(time() - s))))

    writer.writerow([r2score] + list(params))
    o_f.flush()
    return r2score

### Function run_test

This function receives the parameters, executes the test, and returns the result. The score is obtained by cross-validation (the median of the per-fold scores). Because hyperopt.fmin() minimizes its objective, the function returns 1 - score.

In [10]:
def run_test(params):
    """Cross-validate one AdaBoost configuration and return its loss.

    Parameters
    ----------
    params : sequence of three numbers
        (max_depth, max_features, min_samples_leaf) for the base decision
        tree. max_depth and min_samples_leaf are truncated to integers;
        max_features is passed through unchanged.

    Returns
    -------
    1 minus the median of the cross-validation scores, so that a better
    model gives a smaller value for hyperopt.fmin() to minimize.

    Reads the module-level X, y, num_cv and n_jobs.
    """
    max_depth, max_features, min_samples_leaf = params
    # The search space samples continuous values; truncate to integers here.
    max_depth = int(max_depth)
    min_samples_leaf = int(min_samples_leaf)

    # Single-argument print(...) calls emit the same text on Python 2 and 3;
    # {!s} forces str(), matching what the print statement used to apply.
    print("max_depth: {!s}".format(max_depth))
    print("max_features: {!s}".format(max_features))
    print("min_samples_leaf: {!s}".format(min_samples_leaf))

    dt = DecisionTreeRegressor(max_depth=max_depth,
                               max_features=max_features,
                               min_samples_leaf=min_samples_leaf)

    ada = AdaBoostRegressor(base_estimator=dt,
                            n_estimators=100,
                            random_state=66)

    # No `scoring` argument is given, so the estimator's own default score is
    # used (R^2 for scikit-learn regressors).
    cvs = cross_val_score(ada, X, y,
                          cv=num_cv,
                          n_jobs=n_jobs)

    # The median (not the mean) of the fold scores is turned into a loss.
    return 1 - np.median(cvs)

### Main function

Sets up the logging and calls hyperopt.fmin(). At the end, it obtains the best parameters -- the ones that minimize the objective function.

In [11]:
# Entry point: open the CSV log, run the hyperopt search, report the result.
if __name__ == '__main__':

    headers = 'r2score max_depth max_features min_samples_leaf'.split()

    # open() does not expand '~' on its own, so resolve the home directory
    # explicitly before opening the log file.
    log_path = os.path.expanduser(output_log_file)

    # The csv module expects a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        o_f = open(log_path, 'wb')
    else:
        o_f = open(log_path, 'w', newline='')

    # o_f and writer stay module-level names because run_wrapper() uses them.
    try:
        writer = csv.writer(o_f)
        writer.writerow(headers)

        start_time = time()
        best = fmin(run_wrapper, space, algo=tpe.suggest, max_evals=max_evals)
        end_time = time()
    finally:
        # Close the log even when the search is interrupted or raises.
        o_f.close()

    # Single-argument print(...) calls emit the same text on Python 2 and 3.
    print("Seconds passed: {}".format(int(round(end_time - start_time))))
    print(best)

run 1
max_depth: 18
max_features: 0.416766886219
min_samples_leaf: 3

1 - R2 score: 0.116134523548
elapsed: 5s 

run 2
max_depth: 10
max_features: 0.438866994862
min_samples_leaf: 6

1 - R2 score: 0.116478099568
elapsed: 4s 

run 3
max_depth: 11
max_features: 0.296154138694
min_samples_leaf: 5

1 - R2 score: 0.115709541632
elapsed: 4s 

run 4
max_depth: 25
max_features: 0.615929357222
min_samples_leaf: 6

1 - R2 score: 0.116574866203
elapsed: 6s 

run 5
max_depth: 11
max_features: 0.560843674475
min_samples_leaf: 1

1 - R2 score: 0.121499022193
elapsed: 6s 

run 6
max_depth: 16
max_features: 0.65350437889
min_samples_leaf: 3

1 - R2 score: 0.11123038553
elapsed: 6s 

run 7
max_depth: 14
max_features: 0.525612120075
min_samples_leaf: 9

1 - R2 score: 0.123636617203
elapsed: 5s 

run 8
max_depth: 25
max_features: 0.778970714122
min_samples_leaf: 5

1 - R2 score: 0.116115872679
elapsed: 8s 

run 9
max_depth: 13
max_features: 0.919535183576
min_samples_leaf: 5

1 - R2 score: 0.116574685412