In [1]:
import numpy as np
import pandas as pd
import sklearn.gaussian_process as gp
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from numpy.random import RandomState

from util.defs import ParamType
from util.util import random_x_sample
from acquisition.acquisition_functions import ProbabilityOfImprovement, ExpectedImprovement
from acquisition.acquisition_optimizers import RandomAcquisitionOpt, ConstrainedAcquisitionOpt
from hp_optimizers.hp_optimizer import GPROptimizer, RandomSearchOptimizer

# Set random seed

In [2]:
# One seeded generator shared by the data shuffle, the forest, the GP and both optimizers
rand = RandomState(123)

# Grab the dataset

In [3]:
iris = "../data/iris.csv"
df = pd.read_csv(iris)
# Copy the first five columns into an array, then shuffle its rows in place
# with the seeded generator
data = np.array(df.iloc[:, :5])
rand.shuffle(data)
# Every column but the last is a feature; the last column is the target
X, y = data[:, :-1], data[:, -1]

# Define: bounds for hyperparameters, parameter data types, objective function (lower is better)

In [4]:
# Search space: one [low, high] row of `bounds` per tuned hyperparameter.
# x0 = n_estimators [10, 1000] -- int
# x1 = max_depth [1, 5] -- int
# x2 = min_samples_split [2, 10] -- int
# NOTE: max_features is not tuned -- bounds, p_types and objective() only cover x0..x2.
bounds = np.array([
    [10, 1000],
    [1, 5],
    [2, 10]
])

# One parameter type per row of `bounds`, in the same order
# (ParamType.Disc presumably marks a discrete/integer parameter -- confirm in util.defs).
p_types = [ParamType.Disc, ParamType.Disc, ParamType.Disc]

def objective(x):
    """Return the 5-fold cross-validated error (1 - mean test score) of a random forest.

    x holds the hyperparameters in the order (n_estimators, max_depth,
    min_samples_split); each entry is converted with int(). An input with more
    than one dimension, such as a (1, p) array, is reduced to its first row.
    Lower return values are better.
    """
    params = x[0] if x.ndim > 1 else x
    n_estimators, max_depth, min_samples_split = (int(params[i]) for i in range(3))
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        n_jobs=-1,
        random_state=rand,
    )
    # cross_validate reports one test score per fold under "test_score"
    fold_scores = cross_validate(forest, X, y, cv=5)["test_score"]
    return 1 - fold_scores.mean()

# Define the Bayesian Optimizer

In [5]:
# Surrogate model: a GP regressor with a default RBF kernel. `alpha` is the
# absolute value of one normal draw (scale 0.01) taken from the shared generator.
kernel = gp.kernels.RBF()
gpr_alpha = np.abs(rand.normal(scale=.01))
gpr = gp.GaussianProcessRegressor(
    kernel=kernel,
    alpha=gpr_alpha,
    n_restarts_optimizer=10,
    random_state=rand,
)

# Acquisition function; ProbabilityOfImprovement() is imported as an alternative.
acquisition = ExpectedImprovement()
opt_acquisition = ConstrainedAcquisitionOpt(gpr, acquisition, rand)

# Optimizer built from the surrogate, the acquisition optimizer and the
# objective / bounds / parameter types defined earlier in the notebook.
gpr_opt = GPROptimizer(
    gpr,
    opt_acquisition,
    objective,
    bounds,
    p_types,
    rand,
    initial_samples=3,
    fit=True,
)

# Run the optimization

In [6]:
# Run the GP-based search with per-iteration logging. 10 is presumably the
# iteration budget and `thresh` an early-stopping target on the objective
# value -- confirm in GPROptimizer.optimize.
res1 = gpr_opt.optimize(10, thresh=0.03, verbose=True)

Optimization iteration 1
Selected next parameter sample from acquisition optimizer: [493.   3.   5.]
Objective value at sample: 0.04

Optimization iteration 2
Selected next parameter sample from acquisition optimizer: [677.   3.   8.]
Objective value at sample: 0.04

Optimization iteration 3
Selected next parameter sample from acquisition optimizer: [558.   1.   8.]
Objective value at sample: 0.0467

Optimization iteration 4
Selected next parameter sample from acquisition optimizer: [568.   2.   7.]
Objective value at sample: 0.04

Optimization iteration 5
Selected next parameter sample from acquisition optimizer: [635.   2.   9.]
Objective value at sample: 0.0467

Optimization iteration 6
Selected next parameter sample from acquisition optimizer: [584.   4.   5.]
Objective value at sample: 0.04

Optimization iteration 7
Selected next parameter sample from acquisition optimizer: [632.   3.   3.]
Objective value at sample: 0.04

Optimization iteration 8
Selected next parameter sample fr

In [7]:
# Show the GP-based result dict (keys: 'argmin', 'minimizer', 'minimum')
res1

{'argmin': 12,
 'minimizer': array([574.,   3.,   3.]),
 'minimum': array([0.03333333])}

# Try Random Search for comparison

In [8]:
# Baseline: random search over the same objective, bounds and parameter types.
# 13 evaluations presumably matches the GP run's budget
# (3 initial samples + 10 iterations) -- confirm.
rand_opt = RandomSearchOptimizer(objective, bounds, p_types, rand, parallel=True)
res2 = rand_opt.optimize(13, verbose=False)
res2

{'argmin': 1,
 'minimizer': array([631.,   3.,   9.]),
 'minimum': array([0.03333333])}