# Optimization
a) Based on Model Selection exercise, we choose Decision Tree model to optimize with Bayesian Optimization


In [1]:
import numpy as np
import pandas as pd
import sklearn.metrics
from pathlib import Path

import autosklearn.classification
SEED = 42 # for reproducibility

### Retrieve Data

In [2]:
df = pd.read_csv(Path("../data/data.csv"))
X = df.loc[:, df.columns.drop(['timestamp', 'label'])]
y = df.loc[:, 'label']

In [3]:
X_train, X_test, y_train, y_test = \
    sklearn.model_selection.train_test_split(X, y, stratify=y, test_size=0.2, random_state=SEED)

### Parameters to optimize

In [4]:
params = {
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : np.arange(1, 50),
    'criterion' :['gini', 'entropy']
    }

In [34]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from skopt import BayesSearchCV

# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=SEED)
# define the search
search = BayesSearchCV(estimator=DecisionTreeClassifier(random_state=SEED), search_spaces=params, n_jobs=-1, cv=cv)
# perform the search
search.fit(X_train, y_train)
# report the best result
print(search.best_score_)
print(search.best_params_)
print(search.best_estimator_)



0.9560606060606062
OrderedDict([('criterion', 'entropy'), ('max_depth', 20), ('max_features', 'log2')])
DecisionTreeClassifier(criterion='entropy', max_depth=20, max_features='log2',
                       random_state=42)


In [35]:
# evaluate on test
predictions = search.best_estimator_.predict(X_test)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))

Accuracy score 1.0


In [36]:
# check by training from scratch using best_params_
clf = DecisionTreeClassifier(criterion='entropy', max_depth=20, max_features='log2',
                       random_state=42)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))

Accuracy score 1.0


## b) Compare against automl

In [43]:
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    disable_evaluator_output=False,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 10},
)
automl.fit(X_train, y_train)

AutoSklearnClassifier(per_run_time_limit=30, resampling_strategy='cv',
                      resampling_strategy_arguments={'folds': 10},
                      time_left_for_this_task=120)

In [44]:
predictions = automl.predict(X_test)
print("Accuracy score CV", sklearn.metrics.accuracy_score(y_test, predictions))

Accuracy score CV 1.0


## Garbage

In [4]:
params = dict()
params['C'] = (1e-6, 100.0, 'log-uniform')
params['gamma'] = (1e-6, 100.0, 'log-uniform')
params['degree'] = (1,5)
params['kernel'] = ['linear', 'poly', 'rbf', 'sigmoid']

In [32]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
params = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [9]:
from numpy import mean
from sklearn.datasets import make_blobs
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from skopt.space import Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
from sklearn.svm import SVC

# define the model
model = SVC()
# define the space of hyperparameters to search
search_space = dict()
search_space['C'] = (1e-6, 100.0, 'log-uniform')
search_space['gamma'] = (1e-6, 100.0, 'log-uniform')
search_space['degree'] = (1,5)
search_space['kernel'] = ['linear', 'poly', 'rbf', 'sigmoid']

# define the function used to evaluate a given configuration
@use_named_args(params)
def evaluate_model(**params):
	# something
	model.set_params(**params)
	# calculate 5-fold cross validation
	result = cross_val_score(model, X, y, cv=5, n_jobs=-1, scoring='accuracy')
	# calculate the mean of the scores
	estimate = mean(result)
	return 1.0 - estimate

# perform optimization
result = gp_minimize(evaluate_model, search_space)
# summarizing finding:
print('Best Accuracy: %.3f' % (1.0 - result.fun))
print('Best Parameters: n_neighbors=%d, p=%d' % (result.x[0], result.x[1]))

ValueError: All elements in list must be instances of <class 'skopt.space.space.Dimension'>, but found: ['n_estimators', 'max_features', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'bootstrap']