# Hyperopt Estimator Selection

By passing `any_regressor` as the `regressor` argument of `HyperoptEstimator`, we find that the [ExtraTreesRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html) is the best-performing regressor among the cross-validated candidate regressors.

The hyperparameter search returned the following optimized parameters for this regressor:

    (bootstrap=True, criterion='mse', max_depth=None,
        max_features='sqrt', max_leaf_nodes=None,
        min_impurity_decrease=0.0, min_impurity_split=None,
        min_samples_leaf=1, min_samples_split=2,
        min_weight_fraction_leaf=0.0, n_estimators=121, n_jobs=1,
        oob_score=False, random_state=0, verbose=False, warm_start=False)
        
The preprocessing search, which tested several algorithms, selected `StandardScaler` with the following parameters:

    (copy=True, with_mean=False, with_std=True)

In [10]:
from hpsklearn import HyperoptEstimator, any_regressor, any_preprocessing
from sklearn.model_selection import train_test_split
from hyperopt import tpe
import pandas as pd
import numpy as np

# Load the assessment data, forcing the 'suite' column to be parsed as text.
property_assess = pd.read_csv('assessment_per_capita.csv', dtype={'suite': str})

# Keep the target ('value') together with the predictor columns used below.
ml = property_assess[['value', 'nb_id', 'garage', 'zoning', 'lot_size', 'year_built', 'crime_per_capita']]

# One-hot encode the categorical predictors; drop_first=True omits the first
# level of each encoded column.
dummy_cols = ['nb_id', 'garage', 'zoning']
df = pd.get_dummies(ml, columns=dummy_cols, drop_first=True)

# Cast the feature matrix to float explicitly. pandas >= 2.0 emits boolean
# dummy columns, and a mixed bool/float frame would otherwise convert to an
# object array on which np.isnan / np.nanmean raise.
X = df.drop('value', axis=1).to_numpy(dtype=float)

# The target stays one-dimensional. The former bare `y.reshape(-1,1)` call
# discarded its result and therefore never changed y, so it was removed.
y = df['value'].values

print(type(X))

<class 'numpy.ndarray'>


In [11]:
# Mean of every column, ignoring missing (NaN) entries.
# NOTE(review): the means are computed over the full data set before the
# train/test split performed later, so held-out rows contribute to the
# imputed values -- confirm this is acceptable.
col_mean = np.nanmean(X, axis=0)

# Find the (row, column) indices of the entries that need to be replaced.
inds = np.where(np.isnan(X))

# Place the column means in those positions. inds[1] holds the column index
# of each missing entry, so np.take aligns one mean with each of them.
X[inds] = np.take(col_mean, inds[1])

# Verify that no NaN is left. `np.nan in X` cannot be used for this: NaN
# never compares equal to itself, so that expression is always False.
print(np.isnan(X).any())
print(X.shape)
print(X)

False
(297554, 321)
[[5.57000000e+02 1.96400000e+03 1.29259694e-02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [5.66000000e+02 1.96200000e+03 1.29259694e-02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [8.86000000e+02 1.96800000e+03 1.29259694e-02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [3.94000000e+02 2.01500000e+03 1.30726355e-02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [3.57000000e+02 2.01700000e+03 1.30726355e-02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [4.26000000e+02 2.01500000e+03 1.30726355e-02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]


In [12]:
# Hold out 40% of the rows for testing. The seed is fixed so that the split,
# and therefore the reported score, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0
)

# Instantiate a HyperoptEstimator that searches over any regressor and any
# preprocessing step using the TPE algorithm, with at most 100 evaluations.
# NOTE(review): trial_timeout is presumably in seconds -- confirm against the
# hpsklearn documentation.
estim = HyperoptEstimator(regressor=any_regressor('my_clf'),
                          preprocessing=any_preprocessing('my_pre'),
                          algo=tpe.suggest,
                          max_evals=100,
                          trial_timeout=120)

# Search the hyperparameter space based on the training data.
estim.fit(X_train, y_train)

# Show the results: the score on the held-out data, then the selected model.
print(estim.score(X_test, y_test))

print(estim.best_model())



0.3545132305361983
{'learner': ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=None,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=121, n_jobs=1,
          oob_score=False, random_state=0, verbose=False, warm_start=False), 'preprocs': (StandardScaler(copy=True, with_mean=False, with_std=True),), 'ex_preprocs': ()}
