In [1]:
import pandas as pd
import numpy as np
import hyperopt
import os
import sys
import csv

from time import time

from hyperopt import hp, fmin, tpe
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_val_score

### Loading data

In [2]:
# Load the training set from CSV into a DataFrame (path is relative to the
# notebook's working directory).
df = pd.read_csv('data/manipulated_train.csv')

In [3]:
# Separate the regression target from the predictor columns.
y = df['SalePrice']
X = df.drop('SalePrice', axis=1)

In [4]:
# Display the (rows, columns) shape of the feature matrix.
X.shape

(1460, 79)

### Setting parameters

In [5]:
# Experiment settings

# Where the per-run results are logged as CSV.
output_log_file = '/tmp/hyperopt-et.csv'

# Cross-validation settings forwarded to cross_val_score (cv= and n_jobs=).
num_cv = 10
n_jobs = -1

# Number of evaluations hyperopt.fmin() is allowed to perform.
max_evals = 200

# Incremented by run_wrapper on every evaluation.
run_counter = 0

In [6]:
# Hyperparameter search space handed to fmin(). Entries are positional:
# run_test() unpacks them as (max_depth, max_features, min_samples_leaf) and
# truncates the first and the last one to integers with int().
space = tuple(
    hp.uniform(label, low, high)
    for label, low, high in (
        ('max_depth', 5, 30),
        ('max_features', 0.1, 1),
        ('min_samples_leaf', 1, 10),
    )
)

### Function run_wrapper

A thin wrapper for counting and logging experiments; it is the function called by hyperopt.fmin(). It in turn calls run_test(), the function that actually executes the experiment for the input parameters.

In [7]:
def run_wrapper(params):
    global run_counter
    global o_f
    
    run_counter += 1
    print "=============================="
    print "run", run_counter
    
    s = time()
    r2score = run_test(params)
    
    print
    print "1 - R2 score:", r2score
    print "elapsed: {}s \n".format(int(round(time() - s)))
    
    writer.writerow([r2score] + list(params))
    o_f.flush()
    return r2score

### Function run_test

The function that receives the parameters, executes the test and returns the result. I am using cross-validation to obtain the score; because hyperopt.fmin() minimizes its objective, the function returns 1 - score rather than the score itself.

In [None]:
def run_test(params):
    max_depth, max_features, min_samples_leaf = params
    max_depth = int(max_depth)
    min_samples_leaf = int(min_samples_leaf)
    
    print "max_depth:", max_depth
    print "max_features:", max_features
    print "min_samples_leaf:", min_samples_leaf
    
    et = ExtraTreesRegressor(max_depth=max_depth,
                             max_features=max_features,
                             min_samples_leaf=min_samples_leaf,
                             n_estimators=100,
                             random_state=66)
    
    cvs = cross_val_score(et, X, y,
                          cv=num_cv,
                          n_jobs=n_jobs)
    
    return 1 - np.median(cvs)

### Main function

Just some logging and a call to hyperopt.fmin(). In the end, it obtains the best parameters -- the ones that minimize the objective function.

In [None]:
if __name__ == '__main__':
    
    headers = 'r2score max_depth max_features min_samples_leaf'.split()
    o_f = open(output_log_file, 'wb')
    writer = csv.writer(o_f)
    writer.writerow(headers)
    
    start_time = time()
    best = fmin(run_wrapper, space, algo=tpe.suggest, max_evals=max_evals)
    end_time = time()
    
    print "Seconds passed:", int(round(end_time - start_time))
    print best

run 1
max_depth: 23
max_features: 0.148196167325
min_samples_leaf: 3

1 - R2 score: 0.140337621005
elapsed: 2s 

run 2
max_depth: 6
max_features: 0.896749788191
min_samples_leaf: 8

1 - R2 score: 0.155901346291
elapsed: 3s 

run 3
max_depth: 20
max_features: 0.519160108947
min_samples_leaf: 5

1 - R2 score: 0.126570695644
elapsed: 3s 

run 4
max_depth: 24
max_features: 0.937054774532
min_samples_leaf: 8

1 - R2 score: 0.132147137434
elapsed: 4s 

run 5
max_depth: 22
max_features: 0.645266070546
min_samples_leaf: 6
