In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import time

In [None]:
# Load the breast-cancer dataset as a (features, labels) pair.
X, y = load_breast_cancer(return_X_y=True)

In [None]:
# Dimensions of the feature matrix.
X.shape

# 1. Specifying parameter space

In [None]:
# Hyperparameter values shared by every framework section below.
n_estimators_vals = [2, 10, 50]        # candidate forest sizes
criterion_vals = ["gini", "entropy"]   # candidate split criteria
max_features_vals = [0.1, 0.9]         # [lower, upper] bound for max_features

In [None]:
# Run settings shared by the framework sections below.
num_trials = 20      # number of hyperparameter configurations each search evaluates
num_concurrent = 1   # concurrency setting passed to the optimisers that accept one
cv_value = 5         # value passed as `cv` to cross_val_score

# 2. Experimenting with different frameworks

## Sherpa

```
conda install -c sherpa sherpa
```

**TODO:** How can the number of CPU cores used by Sherpa be limited?

In [None]:
import sherpa
import sherpa.algorithms.bayesian_optimization as bayesian_optimization

In [None]:
# Sherpa search space, built from the shared value lists.
# n_estimators is declared as a Choice so Sherpa draws from exactly the listed
# values, matching the categorical treatment in the Optuna and HyperOpt
# sections. sherpa.Discrete expects a [low, high] integer range instead, so a
# three-element list would not be searched as intended.
parameters = [sherpa.Choice('n_estimators', n_estimators_vals),
              sherpa.Choice('criterion', criterion_vals),
              sherpa.Continuous('max_features', max_features_vals)]

# Option A: Bayesian optimisation through Sherpa's GPyOpt wrapper.
# NOTE(review): `algorithm` is rebound to RandomSearch further down, so on a
# top-to-bottom run this object is never handed to a Study.
algorithm = bayesian_optimization.GPyOpt(max_concurrent=num_concurrent,
                                         model_type='GP_MCMC',
                                         acquisition_type='EI_MCMC',
                                         max_num_trials=num_trials)

# NOTE(review): the Repeat wrapper built here is not assigned to anything, so
# it has no effect on `algorithm`; assign it if repeated trials are intended.
sherpa.algorithms.Repeat(algorithm, num_times=5, wait_for_completion=False, agg=False)

# Option B: random search. On a linear run this is the `algorithm` the Study receives.
algorithm = sherpa.algorithms.RandomSearch(max_num_trials=num_trials)

%%time
# Drive the Sherpa study: every suggested trial is scored by the mean
# cross-validation score of a random forest and reported as one observation.
study = sherpa.Study(parameters=parameters,
                     algorithm=algorithm,
                     lower_is_better=False)  # the objective is a score to maximise
for trial in study:
    trial_params = trial.parameters
    print("Trial ", trial.id, " with parameters ", trial_params)
    clf = RandomForestClassifier(random_state=0,
                                 n_estimators=trial_params['n_estimators'],
                                 criterion=trial_params['criterion'],
                                 max_features=trial_params['max_features'])
    scores = cross_val_score(clf, X, y, cv=cv_value)
    mean_score = scores.mean()  # computed once, then printed and reported
    print("Score: ", mean_score)
    study.add_observation(trial, iteration=1, objective=mean_score)
    study.finalize(trial)

CPU times: user 5h 47min 53s, sys: 50min 53s, total: 6h 38min 47s
Wall time: 14min 1s

# Show the best result recorded by the Sherpa study.
print(study.get_best_result())

## Optuna

In [None]:
import optuna
from optuna.samplers import TPESampler, RandomSampler, GridSampler

In [None]:
def objective(trial):
    """Optuna objective: mean cross-validation score of a random forest.

    Draws ``n_estimators`` and ``criterion`` as categorical choices and
    ``max_features`` as a float between the first and last entries of
    ``max_features_vals``, then scores the resulting classifier on ``X``/``y``
    with ``cv=cv_value``.

    Args:
        trial: the Optuna trial used to draw the hyperparameters.

    Returns:
        The mean of the cross-validation scores.
    """
    low, high = max_features_vals[0], max_features_vals[-1]
    parameters = dict(
        n_estimators=trial.suggest_categorical('n_estimators', n_estimators_vals),
        criterion=trial.suggest_categorical('criterion', criterion_vals),
        max_features=trial.suggest_float("max_features", low, high),
    )
    print("Trial with parameters ", parameters)
    # The dict keys are exactly the RandomForestClassifier keyword names.
    clf = RandomForestClassifier(random_state=0, **parameters)
    return cross_val_score(clf, X, y, cv=cv_value).mean()

#### Optuna's default sampler is `TPESampler`

In [None]:
# Sampler handed to optuna.create_study below; swap in the commented-out
# RandomSampler line to use random sampling instead.
algo = TPESampler()
#algo = RandomSampler()

In [None]:
%%time
# Maximise the objective with the sampler selected above.
# The worker count comes from the notebook-wide `num_concurrent` setting rather
# than a hard-coded value, so the timing is comparable with the other
# frameworks and parallelism is controlled in one place.
study = optuna.create_study(direction="maximize", sampler=algo)
study.optimize(objective, n_trials=num_trials, n_jobs=num_concurrent)
print(study.best_trial)

In [None]:
# Hyperparameter values of the best trial in the Optuna study.
study.best_params

In [None]:
# Objective value (mean cross-validation score) achieved by the best trial.
study.best_value

In [None]:
import joblib

In [None]:
# Persist the finished study so it can be inspected later without re-running the search.
joblib.dump(study, "study.pkl")

In [None]:
# Reload the persisted study. joblib.load unpickles the file, so only use it
# on files from a trusted source.
study_loaded = joblib.load("study.pkl")

In [None]:
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice

In [None]:
# Optimisation history of all trials in the reloaded study.
plot_optimization_history(study_loaded)

In [None]:
# Empirical distribution function of the objective values.
plot_edf(study_loaded)

In [None]:
# Parallel-coordinate view relating parameter values to the objective.
plot_parallel_coordinate(study_loaded)

In [None]:
# Estimated importance of each hyperparameter for the objective.
plot_param_importances(study_loaded)

## HyperOpt

In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials , space_eval

In [None]:
def hyperopt_train_test(params):
    """Return the mean cross-validation score of a random forest built from ``params``.

    Args:
        params: dict of ``RandomForestClassifier`` keyword arguments, as
            sampled from the HyperOpt search space.

    Returns:
        Mean of the ``cross_val_score`` results on ``X``/``y`` with
        ``cv=cv_value``.
    """
    # Fix the forest's seed, as the Sherpa and Optuna sections do, so all
    # frameworks evaluate identical models and reruns are repeatable.
    # An explicit ``random_state`` in ``params`` still takes precedence.
    clf = RandomForestClassifier(**{'random_state': 0, **params})
    score = cross_val_score(clf, X, y, cv=cv_value)
    return score.mean()

# HyperOpt search space, built from the shared value lists: the two list-valued
# parameters are choices, max_features is uniform between the list's first and
# last entries.
space = dict(
    n_estimators=hp.choice('n_estimators', n_estimators_vals),
    criterion=hp.choice('criterion', criterion_vals),
    max_features=hp.uniform('max_features', max_features_vals[0], max_features_vals[-1]),
)

# Highest mean score seen so far by `f`; updated as the search progresses.
best = 0


def f(params):
    """HyperOpt objective wrapper around ``hyperopt_train_test``.

    Evaluates ``params``, prints them whenever they beat the module-level
    ``best`` score (updating it), and returns the negated score as the loss
    because HyperOpt minimises.

    Args:
        params: hyperparameter dict sampled from the search space.

    Returns:
        dict with ``'loss'`` (negative mean score) and ``'status'``.
    """
    global best
    accuracy = hyperopt_train_test(params)
    is_improvement = accuracy > best
    if is_improvement:
        best = accuracy
        print( 'new best:', best, params)
    return dict(loss=-accuracy, status=STATUS_OK)

In [None]:
%%time
# Run HyperOpt's TPE search for `num_trials` evaluations, recording each
# evaluation in `trials`.
trials = Trials()
best_config = fmin(
    fn=f,
    space=space,
    algo=tpe.suggest,
    max_evals=num_trials,
    trials=trials,
)

In [None]:
# fmin reports hp.choice parameters as indices into their option lists;
# space_eval maps the result back to the actual hyperparameter values.
print(space_eval(space, best_config))

In [None]:
# Highest mean cross-validation score recorded by `f` during the search.
best