[View in Colaboratory](https://colab.research.google.com/github/thundercat95/AutoML/blob/master/SMAC%20preprocessor.ipynb)

## Installation

In [0]:
!pip install openml
!apt-get install build-essential swig
!pip install smac
!pip install xgboost

## Imports

In [0]:
import numpy as np
import pandas as pd
import time
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import normalize
from sklearn.preprocessing import Normalizer
from sklearn import decomposition
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from xgboost import XGBClassifier
import openml as oml
apikey = '2f6c58f4f6d62ea93dd32764ec88089c'
oml.config.apikey = apikey

# Import ConfigSpace and different types of parameters
from smac.configspace import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformFloatHyperparameter, UniformIntegerHyperparameter
from ConfigSpace.conditions import *

# Import SMAC-utilities
from smac.tae.execute_func import ExecuteTAFuncDict
from smac.scenario.scenario import Scenario
from smac.facade.smac_facade import SMAC

## Test with Splice Dataset (OpenML id : 46)

In [0]:
# We load Splice dataset from openml
splice = oml.datasets.get_dataset(46)

#Preprocessor normalize hyper parameter
norm = CategoricalHyperparameter("norm", ["l1", "l2", "max"], default_value="l2")

# Build Configuration Space for Model
cs = ConfigurationSpace()
max_depth = UniformIntegerHyperparameter("max_depth", 1, 10, default_value=3)
learning_rate = UniformFloatHyperparameter("learning_rate", 0.001, 1.0, default_value=0.1)
n_estimators = UniformIntegerHyperparameter("n_estimators", 10, 500, default_value=100)
gamma = UniformFloatHyperparameter("gamma", 0.0, 1.0, default_value=0.0)
min_child_weight = UniformIntegerHyperparameter("min_child_weight", 1, 50, default_value=1)
max_delta_step  = UniformIntegerHyperparameter("max_delta_step ", 0, 50, default_value=0)
subsample = UniformFloatHyperparameter("subsample", 0.0, 1.0, default_value=1.0)
cs.add_hyperparameters([max_depth, learning_rate, n_estimators, gamma, min_child_weight, max_delta_step, subsample,norm])


def model_with_pre(cfg):
    print(cfg)
    cfg = {k : cfg[k] for k in cfg if cfg[k]}
    pre_cfg ={}
    X, y = splice.get_data(target=splice.default_target_attribute)
    pre_cfg['norm'] = cfg['norm']
    print(pre_cfg)
    pre_cfg = {k : pre_cfg[k] for k in pre_cfg if pre_cfg[k]}
    X_new = normalize(X, **pre_cfg)
    del cfg['norm']
    xgbc = XGBClassifier(**cfg, random_state=42)
    scores = cross_val_score(xgbc, X_new, y, cv=5)
    print(np.mean(scores))
    return 1-np.mean(scores)  # Minimize!
  
# Scenario object
scenario = Scenario({"run_obj": "quality",   # we optimize quality (alternatively runtime)
                     "runcount-limit": 5,  # maximum function evaluations
                     "cs": cs,               # configuration space
                     "deterministic": "true"
                     })

# Optimize, using a SMAC-object
print("Optimizing! Depending on your machine, this might take a few minutes.")
smac = SMAC(scenario=scenario, rng=np.random.RandomState(42),
        tae_runner=model_with_pre)
start = time.time()
incumbent = smac.optimize()
inc_value = model_with_pre(incumbent)
end = time.time()
print("Optimized Value: %.5f" % (1-inc_value))
print("Best parameter : ", incumbent.get_dictionary())
print("Time required in seconds :", (end - start))




Optimizing! Depending on your machine, this might take a few minutes.
Configuration:
  gamma, Value: 0.0
  learning_rate, Value: 0.1
  max_delta_step , Value: 0
  max_depth, Value: 3
  min_child_weight, Value: 1
  n_estimators, Value: 100
  norm, Value: 'l2'
  subsample, Value: 1.0

{'norm': 'l2'}


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.9564272318105663
Configuration:
  gamma, Value: 0.7359525285652845
  learning_rate, Value: 0.8117657600950038
  max_delta_step , Value: 48
  max_depth, Value: 5
  min_child_weight, Value: 12
  n_estimators, Value: 103
  norm, Value: 'max'
  subsample, Value: 0.46610295102568133

{'norm': 'max'}


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.9448260081924135
Configuration:
  gamma, Value: 0.6108620947042035
  learning_rate, Value: 0.8363471290456959
  max_delta_step , Value: 5
  max_depth, Value: 7
  min_child_weight, Value: 23
  n_estimators, Value: 68
  norm, Value: 'max'
  subsample, Value: 0.18550419634224635

{'norm': 'max'}


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.9319782232153008
Configuration:
  gamma, Value: 0.4372298697996926
  learning_rate, Value: 0.4332483856090999
  max_delta_step , Value: 13
  max_depth, Value: 3
  min_child_weight, Value: 36
  n_estimators, Value: 331
  norm, Value: 'l1'
  subsample, Value: 0.7658859038977468

{'norm': 'l1'}


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.9482757619509163
Configuration:
  gamma, Value: 0.9187416722392869
  learning_rate, Value: 0.6433665259425442
  max_delta_step , Value: 11
  max_depth, Value: 6
  min_child_weight, Value: 4
  n_estimators, Value: 434
  norm, Value: 'l2'
  subsample, Value: 0.5109722934233317

{'norm': 'l2'}


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.9564301799020456
Configuration:
  gamma, Value: 0.9187416722392869
  learning_rate, Value: 0.6433665259425442
  max_delta_step , Value: 11
  max_depth, Value: 6
  min_child_weight, Value: 4
  n_estimators, Value: 434
  norm, Value: 'l2'
  subsample, Value: 0.5109722934233317

{'norm': 'l2'}


  if diff:
  if diff:
  if diff:
  if diff:


0.9564301799020456
Optimized Value: 0.95643
Best parameter :  {'gamma': 0.9187416722392869, 'learning_rate': 0.6433665259425442, 'max_delta_step ': 11, 'max_depth': 6, 'min_child_weight': 4, 'n_estimators': 434, 'norm': 'l2', 'subsample': 0.5109722934233317}
Time required in seconds : 136.17632913589478


  if diff:


## Pipeline

In [0]:
# We load Splice dataset from openml
splice = oml.datasets.get_dataset(46)

#Preprocessor normalize hyper parameter
norm = CategoricalHyperparameter("norm", ["l1", "l2", "max"], default_value="l2")

# Build Configuration Space for Model
cs = ConfigurationSpace()
max_depth = UniformIntegerHyperparameter("max_depth", 1, 10, default_value=3)
learning_rate = UniformFloatHyperparameter("learning_rate", 0.001, 1.0, default_value=0.1)
n_estimators = UniformIntegerHyperparameter("n_estimators", 10, 500, default_value=100)
gamma = UniformFloatHyperparameter("gamma", 0.0, 1.0, default_value=0.0)
min_child_weight = UniformIntegerHyperparameter("min_child_weight", 1, 50, default_value=1)
max_delta_step  = UniformIntegerHyperparameter("max_delta_step ", 0, 50, default_value=0)
subsample = UniformFloatHyperparameter("subsample", 0.0, 1.0, default_value=1.0)
cs.add_hyperparameters([max_depth, learning_rate, n_estimators, gamma, min_child_weight, max_delta_step, subsample,norm])


def model_with_pre(cfg):
    print(cfg)
    cfg = {k : cfg[k] for k in cfg if cfg[k]}
    pre_cfg ={}
    X, y = splice.get_data(target=splice.default_target_attribute)
    pre_cfg['norm'] = cfg['norm']
    print(pre_cfg)
    pre_cfg = {k : pre_cfg[k] for k in pre_cfg if pre_cfg[k]}
    del cfg['norm']
    print(norm)
    final = Pipeline([('normalize', Normalizer(**pre_cfg)), ('Xgboost', XGBClassifier(**cfg))])
    scores = cross_val_score(final, X, y, cv=5)
    print(np.mean(scores))
    return 1-np.mean(scores)  # Minimize!
  
# Scenario object
scenario = Scenario({"run_obj": "quality",   # we optimize quality (alternatively runtime)
                     "runcount-limit": 5,  # maximum function evaluations
                     "cs": cs,               # configuration space
                     "deterministic": "true"
                     })

# Optimize, using a SMAC-object
print("Optimizing! Depending on your machine, this might take a few minutes.")
smac = SMAC(scenario=scenario, rng=np.random.RandomState(42),
        tae_runner=model_with_pre)
start = time.time()
incumbent = smac.optimize()
inc_value = model_with_pre(incumbent)
end = time.time()
print("Optimized Value: %.5f" % (1-inc_value))  
print("Best parameter : ", incumbent.get_dictionary())
print("Time required in seconds :", (end - start))

Optimizing! Depending on your machine, this might take a few minutes.
Configuration:
  gamma, Value: 0.0
  learning_rate, Value: 0.1
  max_delta_step , Value: 0
  max_depth, Value: 3
  min_child_weight, Value: 1
  n_estimators, Value: 100
  norm, Value: 'l2'
  subsample, Value: 1.0

{'norm': 'l2'}
Normalizer(copy=True, norm='l1')


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.9589321222499603
Configuration:
  gamma, Value: 0.8527391249355921
  learning_rate, Value: 0.45037628925500317
  max_delta_step , Value: 41
  max_depth, Value: 1
  min_child_weight, Value: 19
  n_estimators, Value: 388
  norm, Value: 'max'
  subsample, Value: 0.6982551360049958

{'norm': 'max'}
Normalizer(copy=True, norm='l1')


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.9498431569894832
Configuration:
  gamma, Value: 0.6108620947042035
  learning_rate, Value: 0.8363471290456959
  max_delta_step , Value: 5
  max_depth, Value: 7
  min_child_weight, Value: 23
  n_estimators, Value: 68
  norm, Value: 'max'
  subsample, Value: 0.18550419634224635

{'norm': 'max'}
Normalizer(copy=True, norm='l1')


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.9216265090475873
Configuration:
  gamma, Value: 0.5183345126020776
  learning_rate, Value: 0.4075297598687496
  max_delta_step , Value: 15
  max_depth, Value: 6
  min_child_weight, Value: 5
  n_estimators, Value: 60
  norm, Value: 'max'
  subsample, Value: 0.22370237127274972

{'norm': 'max'}
Normalizer(copy=True, norm='l1')


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.9551762552456371
Configuration:
  gamma, Value: 0.9187416722392869
  learning_rate, Value: 0.6433665259425442
  max_delta_step , Value: 11
  max_depth, Value: 6
  min_child_weight, Value: 4
  n_estimators, Value: 434
  norm, Value: 'l2'
  subsample, Value: 0.5109722934233317

{'norm': 'l2'}
Normalizer(copy=True, norm='l1')


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.9511029809094058
Configuration:
  gamma, Value: 0.0
  learning_rate, Value: 0.1
  max_delta_step , Value: 0
  max_depth, Value: 3
  min_child_weight, Value: 1
  n_estimators, Value: 100
  norm, Value: 'l2'
  subsample, Value: 1.0

{'norm': 'l2'}
Normalizer(copy=True, norm='l1')


  if diff:
  if diff:
  if diff:
  if diff:


0.9589321222499603
Optimized Value: 0.95893
Best parameter :  {'gamma': 0.0, 'learning_rate': 0.1, 'max_delta_step ': 0, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'norm': 'l2', 'subsample': 1.0}
Time required in seconds : 79.83850502967834


  if diff:


## Test

In [0]:
b = dict()
b['C'] = 0.1
BaseEstimator.get_params(LogisticRegression(),deep=True)

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'ovr',
 'n_jobs': 1,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [42]:
final = Pipeline([('normalize', Normalizer()), ('Xgboost', XGBClassifier())])
z = final.steps[1][1]
cs1 = ConfigurationSpace()
cs2 = ConfigurationSpace()
cs3 = ConfigurationSpace()
max_depth = UniformIntegerHyperparameter("max_depth", 1, 10, default_value=3)
cs1.add_hyperparameters([max_depth])
cs1.get_all_unconditional_hyperparameters()

['max_depth']

In [43]:
# Build Configuration Space which defines all parameters and their ranges
cs = ConfigurationSpace()

# We define a few possible types of SVM-kernels and add them as "kernel" to our cs
kernel = CategoricalHyperparameter("kernel", ["linear", "rbf", "poly", "sigmoid"], default_value="poly")
cs.add_hyperparameter(kernel)

# There are some hyperparameters shared by all kernels
C = UniformFloatHyperparameter("C", 0.001, 1000.0, default_value=1.0)
shrinking = CategoricalHyperparameter("shrinking", ["true", "false"], default_value="true")
cs.add_hyperparameters([C, shrinking])

# Others are kernel-specific, so we can add conditions to limit the searchspace
degree = UniformIntegerHyperparameter("degree", 1, 5, default_value=3)     # Only used by kernel poly
coef0 = UniformFloatHyperparameter("coef0", 0.0, 10.0, default_value=0.0)  # poly, sigmoid
cs.add_hyperparameters([degree, coef0])
use_degree = InCondition(child=degree, parent=kernel, values=["poly"])
use_coef0 = InCondition(child=coef0, parent=kernel, values=["poly", "sigmoid"])
cs.add_conditions([use_degree, use_coef0])

# This also works for parameters that are a mix of categorical and values from a range of numbers
# For example, gamma can be either "auto" or a fixed float
gamma = CategoricalHyperparameter("gamma", ["auto", "value"], default_value="auto")  # only rbf, poly, sigmoid
gamma_value = UniformFloatHyperparameter("gamma_value", 0.0001, 8, default_value=1)
cs.add_hyperparameters([gamma, gamma_value])

# We only activate gamma_value if gamma is set to "value"
cs.add_condition(InCondition(child=gamma_value, parent=gamma, values=["value"]))
# And again we can restrict the use of gamma in general to the choice of the kernel
cs.add_condition(InCondition(child=gamma, parent=kernel, values=["rbf", "poly", "sigmoid"]))

def convert_cs_to_dictionary(cfg, num):
  cfg_dict = dict()
  list_unconditional = cfg.get_all_unconditional_hyperparameters()
  list_confitional = cfg.get_all_conditional_hyperparameters()
  list_all = list_unconditional + list_confitional
  for hyperparameter in list_all:
    cfg_dict[""]

cs_to_dict = convert_cs_to_dictionary(cs)

gamma | kernel in {'rbf', 'poly', 'sigmoid'}