In [1]:
import pandas as pd
import numpy as np
import hyperopt
import os
import sys
import csv

from time import time

from hyperopt import hp, fmin, tpe
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_val_score

### Loading data

In [2]:
# Read the training set into a DataFrame; the path is relative to the
# directory the notebook is run from.
df = pd.read_csv('data/manipulated_train.csv')

In [3]:
# Separate the regression target from the predictor columns.
y = df['SalePrice']
X = df.drop('SalePrice', axis=1)

In [4]:
# Sanity check: (n_rows, n_columns) of the predictors once 'SalePrice' is removed.
X.shape

(1460, 79)

### Setting parameters

In [5]:
# Parameters
output_file = '/tmp/hyperopt-et.csv'  # CSV log, one row per evaluated sample
n_jobs = -1       # forwarded to cross_val_score as n_jobs
num_cv = 10       # forwarded to cross_val_score as cv
max_evals = 200   # evaluation budget handed to fmin
run_counter = 0   # incremented by run_wrapper on every call

In [6]:
# Hyperparameters space
# Search space handed to fmin: three uniform distributions. The order of the
# tuple matters because run_test unpacks the sample positionally as
# (max_depth, max_features, min_samples_leaf).
space = (
    hp.uniform('max_depth', 5, 30),  # truncated with int() inside run_test
    hp.uniform('max_features', 0.1, 1),  # passed to the estimator unchanged (float)
    hp.uniform('min_samples_leaf', 1, 10)  # truncated with int() inside run_test
)

In [7]:
def run_wrapper(params):
    """Objective function for fmin: evaluate one sample and log the result.

    Increments the global ``run_counter``, delegates the evaluation to
    ``run_test``, prints progress information, and appends a
    ``[score] + params`` row to the CSV log through the module-level
    ``writer`` / ``o_f`` objects. Both must already exist when this function
    is called (they are created in the ``__main__`` cell).

    Parameters
    ----------
    params : sequence
        One point sampled from ``space``.

    Returns
    -------
    The value returned by ``run_test(params)``; this is the quantity that
    ``fmin`` minimizes.
    """
    # Only run_counter is rebound here. writer and o_f are merely read, so
    # they need no ``global`` declaration.
    global run_counter

    run_counter += 1
    # Single-argument print(...) calls emit the same text on Python 2 and
    # Python 3, unlike the Python-2-only print statement.
    print("==============================")
    print("run {}".format(run_counter))

    started = time()
    r2score = run_test(params)

    print("")
    print("1 - R2 score: {}".format(r2score))
    print("elapsed: {}s \n".format(int(round(time() - started))))

    # The raw sampled values are logged, i.e. before run_test's int() casts.
    writer.writerow([r2score] + list(params))
    # Flush after every run so partial results are on disk if the search
    # is interrupted.
    o_f.flush()
    return r2score

In [None]:
def run_test(params):
    """Cross-validate an ExtraTreesRegressor for one hyperparameter sample.

    Reads the module-level ``X``, ``y``, ``num_cv`` and ``n_jobs``.

    Parameters
    ----------
    params : sequence of three numbers
        ``(max_depth, max_features, min_samples_leaf)``, in that order.
        ``max_depth`` and ``min_samples_leaf`` are truncated with ``int()``
        before use; ``max_features`` is passed through unchanged.

    Returns
    -------
    ``1 - median`` of the cross-validation scores, so lower is better.
    """
    max_depth, max_features, min_samples_leaf = params
    # The search space samples floats; the estimator is given integers for
    # these two settings.
    max_depth = int(max_depth)
    min_samples_leaf = int(min_samples_leaf)

    # Single-argument print(...) calls emit the same text on Python 2 and
    # Python 3, unlike the Python-2-only print statement.
    print("max_depth: {}".format(max_depth))
    print("max_features: {}".format(max_features))
    print("min_samples_leaf: {}".format(min_samples_leaf))

    # Fixed random_state keeps runs with identical settings comparable.
    et = ExtraTreesRegressor(max_depth=max_depth,
                             max_features=max_features,
                             min_samples_leaf=min_samples_leaf,
                             n_estimators=100,
                             random_state=66)

    cvs = cross_val_score(et, X, y,
                          cv=num_cv,
                          n_jobs=n_jobs)

    # fmin minimizes, so turn the score into a loss.
    return 1 - np.median(cvs)

In [None]:
# Driver: open the CSV log, run the hyperparameter search, report the result.
if __name__ == '__main__':

    headers = 'r2score max_depth max_features min_samples_leaf'.split()

    # The csv module expects a binary-mode file on Python 2 and a text-mode
    # file opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        o_f = open(output_file, 'wb')
    else:
        o_f = open(output_file, 'w', newline='')

    # o_f and writer stay module-level names because run_wrapper reads them.
    # The with-block guarantees the file is closed (and its buffer written)
    # even if the search raises or is interrupted.
    with o_f:
        writer = csv.writer(o_f)
        writer.writerow(headers)

        start_time = time()
        best = fmin(run_wrapper, space, algo=tpe.suggest, max_evals=max_evals)
        end_time = time()

    print("Seconds passed: {}".format(int(round(end_time - start_time))))
    print(best)

run 1
max_depth: 18
max_features: 0.450672366906
min_samples_leaf: 9

1 - R2 score: 0.140563998661
elapsed: 3s 

run 2
max_depth: 26
max_features: 0.167890719192
min_samples_leaf: 5

1 - R2 score: 0.148530592998
elapsed: 2s 

run 3
max_depth: 29
max_features: 0.207695530626
min_samples_leaf: 1

1 - R2 score: 0.124597278261
elapsed: 3s 

run 4
max_depth: 10
max_features: 0.219453289314
min_samples_leaf: 4

1 - R2 score: 0.142600488575
elapsed: 3s 

run 5
max_depth: 19
max_features: 0.903475449478
min_samples_leaf: 7

1 - R2 score: 0.132304528994
elapsed: 4s 

run 6
max_depth: 27
max_features: 0.519161251125
min_samples_leaf: 8

1 - R2 score: 0.134312913956
elapsed: 3s 

run 7
max_depth: 10
max_features: 0.384339372256
min_samples_leaf: 5

1 - R2 score: 0.133854877073
elapsed: 3s 

run 8
max_depth: 14
max_features: 0.254889862933
min_samples_leaf: 5

1 - R2 score: 0.136357662722
elapsed: 2s 

run 9
max_depth: 14
max_features: 0.233587823608
min_samples_leaf: 1

1 - R2 score: 0.1270275629