## Single Pipeline hyperparameter optimization
Before adding a hyperparameter search option to the Experiment Graph, we perform a simple analysis: we select a single pipeline and, for every recorded execution of it, add the hyperparameters and the final accuracy to the Trials object of hyperopt. Afterwards, we run a search with a predefined budget to find the best set of parameters.
We compare the result with the vanilla version, where the Trials object starts out empty, and report the quality reached and the time needed to reach a given level of quality.

In [1]:
from openml import datasets, tasks, runs, flows, setups, config, evaluations
from workloadoptimization.essentials import Component, ExperimentObject, ExperimentGraph, ExperimentParser
from workloadoptimization.hyperopt_helper import TrialConverter
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import uuid
import networkx as nx
import sklearn
from networkx.drawing.nx_agraph import graphviz_layout
from openmlstudy14.preprocessing import ConditionalImputer
%matplotlib inline


# NOTE(review): an API key is committed in this notebook. It should be rotated
# and supplied through the OPENML_APIKEY environment variable instead; the
# literal is kept only as a fallback so the notebook still runs when the
# variable is not set.
config.apikey = os.environ.get('OPENML_APIKEY', '8e2079508005379adb9a60798696134e')
config.server = 'https://www.openml.org/api/v1'
# Cache OpenML downloads under the user's home directory.
config.set_cache_directory(os.path.expanduser('~/openml-cache'))

In [2]:
# Fetch OpenML flow 7707 and task 31 (the ids also used in the cache file name below).
flow = flows.get_flow(flow_id=7707)
task = tasks.get_task(task_id=31)
# `pipeline` is the object tuned later through set_params / fit / score.
pipeline = flows.flow_to_sklearn(flow)

In [3]:
import pickle
def getTopRuns(numberOfRuns, pipeline, task, setupBatchSize=500):
    """Collect the best OpenML runs of one flow on one task, with their setups.

    Args:
        numberOfRuns: how many of the highest-accuracy runs to keep; a value
            <= 0 keeps every run.
        pipeline: OpenML flow id (despite the name it is passed as ``flow=``).
        task: OpenML task id.
        setupBatchSize: maximum number of setup ids requested per
            ``setups.list_setups`` call.

    Returns:
        DataFrame with the columns run_id, task_id, flow_id, accuracy and
        setup, one row per run whose setup was returned by OpenML.
    """
    openMlEvaluations = evaluations.list_evaluations(
        'predictive_accuracy', task=[task], flow=[pipeline])
    # from_dict(orient='index') yields a single column (named 0) holding the
    # evaluation objects; pull the fields we need into real columns.
    evaluationData = pd.DataFrame.from_dict(openMlEvaluations, orient='index')
    evaluationData['accuracy'] = evaluationData.apply(lambda eva: eva.values[0].value, axis=1)
    evaluationData['run_id'] = evaluationData.apply(lambda eva: eva.values[0].run_id, axis=1)

    # Keep the top `numberOfRuns` runs by accuracy (all of them when <= 0).
    topRuns = evaluationData.sort_values('accuracy', ascending=False)
    if numberOfRuns > 0:
        topRuns = topRuns[0:numberOfRuns]

    # Attach the run metadata (task, flow and setup ids) to the selected runs.
    openMLRuns = runs.list_runs(task=[task], flow=[pipeline])
    experiments = pd.DataFrame.from_dict(openMLRuns, orient='index')
    Experiment = experiments.merge(topRuns, on='run_id').drop(columns=['uploader', 0])

    # Request the setups in batches. Previously only the first 500 setup ids
    # were requested, so the inner merge below silently dropped every run
    # whose setup was not among them.
    setupIds = Experiment.setup_id.drop_duplicates().tolist()
    openMLSetups = {}
    for start in range(0, len(setupIds), setupBatchSize):
        batch = setupIds[start:start + setupBatchSize]
        openMLSetups.update(setups.list_setups(setup=batch, size=setupBatchSize))
    Setup = pd.DataFrame.from_dict(openMLSetups, orient='index').reset_index()
    Setup.columns = ['id', 'setup']

    merged = pd.merge(Setup, Experiment, how='inner', left_on='id', right_on='setup_id')
    return merged.drop(columns=['id', 'setup_id'])[['run_id', 'task_id', 'flow_id', 'accuracy', 'setup']]
def extractExperiments(filePath, taskIds, flowIds):
    """Return the runs for every (task, flow) pair, cached on disk at filePath.

    Args:
        filePath: pickle file used as cache. If it exists it is loaded and
            returned as is; taskIds and flowIds are then ignored.
        taskIds: iterable of OpenML task ids.
        flowIds: iterable of OpenML flow ids.

    Returns:
        DataFrame concatenating ``getTopRuns`` for every task/flow pair, with a
        fresh 0..n-1 index. When there is no pair to query, an empty frame
        with the same columns is returned and nothing is written to disk.
    """
    if os.path.isfile(filePath):
        # NOTE(review): unpickling can execute arbitrary code; only point this
        # at cache files created by this notebook.
        return pd.read_pickle(filePath)

    # 100000 is used as an "effectively all runs" limit.
    frames = [getTopRuns(100000, f, t) for t in taskIds for f in flowIds]
    if not frames:
        # pd.concat raises on an empty list; return an empty result instead.
        return pd.DataFrame(columns=['run_id', 'task_id', 'flow_id', 'accuracy', 'setup'])

    Experiments = pd.concat(frames).reset_index(drop=True)
    # to_pickle does not create missing parent directories (e.g. 'meta/').
    directory = os.path.dirname(filePath)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)
    Experiments.to_pickle(filePath)
    return Experiments

# Fetching the runs is time consuming, so the result is persisted to disk and
# reloaded on later executions.
# The cache is looked up by file name only: when changing the task or flow ids,
# change the file name as well, otherwise the previously cached frame is returned.
Experiments = extractExperiments('meta/hyper-opt-experiment-31-7707', [31], [7707])


In [4]:
# Download every flow referenced by the experiments. A flow that cannot be
# fetched is reported and skipped so the remaining ones are still parsed.
OPENML_FLOWS = {}
FLOW_IDS = Experiments.flow_id.unique()
for f in FLOW_IDS:
    try:
        OPENML_FLOWS[f] = flows.get_flow(f)
    except Exception as err:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts the
        # cell, and report the cause instead of hiding it.
        print('error for {}: {}'.format(f, err))
# Turn the run table plus the downloaded flows into experiment objects.
parser = ExperimentParser()
experimentObjects = parser.extractOpenMLFlows(Experiments, OPENML_FLOWS)

In [53]:
# Collect, for every hyperparameter name, the list of values it takes across
# all experiments; these observed values drive the search-space definition.
param_range = {}
for e in experimentObjects:
    # setdefault/items replace the Python-2-only has_key/iteritems.
    for k, v in e.extractParams().items():
        param_range.setdefault(k, []).append(v)

In [54]:
# Split the hyperparameters into those with a single observed value (set once
# on the pipeline and left out of the search space) and those that vary.
CONSTANT_PARAMS = {}
DYNAMIC_PARAMS = []
for k, values in param_range.items():
    try:
        distinct = len(set(values))
    except TypeError:
        # Unhashable values (e.g. lists) cannot be put in a set. They are
        # treated as constant and the first observed value is used.
        # NOTE(review): this does not check that the values are actually equal.
        CONSTANT_PARAMS[k] = values[0]
        continue
    if distinct == 1:
        CONSTANT_PARAMS[k] = values[0]
    else:
        DYNAMIC_PARAMS.append(k)
print('Dynamic hyperparameters: {}'.format(DYNAMIC_PARAMS))

Dynamic hyperparameters: ['classifier__tol', 'classifier__gamma', 'classifier__C', 'imputation__strategy', 'classifier__degree', 'classifier__coef0', 'classifier__shrinking', 'classifier__kernel']


Based on the parameter values observed in the experiments, these are the feasible ranges (or categories) of the parameters.
- classifier__tol 1.0509652110524482e-05 0.09706102908291375
- classifier__gamma 3.122280314190532e-05 7.998532268538166
- classifier__C 0.03213680700039348 32547.418063576853
- imputation__strategy {u'"mean"', u'"median"', u'"most_frequent"'}
- classifier__degree {u'1', u'2', u'3', u'4', u'5'}
- classifier__coef0 -0.9942534412466477 0.9975887639931769
- classifier__shrinking True, False

In [55]:
# construct the search space
from hyperopt import hp

# Search space: one hyperopt expression per tuned hyperparameter. The observed
# ranges quoted below are the ones listed in the text above.
# NOTE(review): 'classifier__kernel' is reported as dynamic in the output above
# but has no entry here - confirm this is intended.
space = {
    # observed between 1.0509652110524482e-05 and 0.09706102908291375
    'classifier__tol': hp.lognormal('classifier__tol', -7, 1),
    # observed between 3.122280314190532e-05 and 7.998532268538166
    'classifier__gamma': hp.lognormal('classifier__gamma', 0.0001, 1.3),
    # observed between 0.03213680700039348 and 32547.418063576853
    'classifier__C': hp.lognormal('classifier__C', 2.5, 3),
    # categorical: imputation strategy
    'imputation__strategy': hp.choice('imputation__strategy', [u'mean', u'median', u'most_frequent']),
    # categorical: polynomial degree
    'classifier__degree': hp.choice('classifier__degree', [1, 2, 3, 4, 5]),
    # observed between -0.9942534412466477 and 0.9975887639931769
    'classifier__coef0': hp.uniform('classifier__coef0', -1, 1),
    # categorical: True or False
    'classifier__shrinking': hp.choice('classifier__shrinking', [True, False]),
}

In [56]:
# Hyperparameters that never vary are applied to the pipeline once, up front.
pipeline.set_params(**CONSTANT_PARAMS)


def objective(params):
    """Loss minimised by the search: 1 - pipeline score for ``params``.

    Applies ``params`` to the shared module-level ``pipeline``, fits it on
    (X_train, y_train) and scores it on (X_test, y_test).
    """
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    return 1 - pipeline.score(X_test, y_test)

In [57]:
# Load the dataset behind the task and split it with the indices the task provides.
dataset = datasets.get_dataset(dataset_id=task.dataset_id)
data = dataset.get_data()
train_indices, test_indices = task.get_train_test_split_indices()
X, y, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute,
    return_attribute_names=True,
)
X_train, y_train = X[train_indices], y[train_indices]
X_test, y_test = X[test_indices], y[test_indices]

In [101]:
# Order the experiments by their loss (1 - quality).
# NOTE(review): reverse=True puts the highest loss first - confirm that
# worst-first is the order the trial conversion below is meant to receive.
sortedObjects = sorted(experimentObjects, key=lambda eo: 1 - eo.quality, reverse=True)

In [102]:
# Convert the past experiments into the trials object that is handed to fmin
# below as `trials=` to warm-start the search.
trialConverter = TrialConverter()
warmTrials = trialConverter.trialsFromExperimentObjects(space, sortedObjects)

In [103]:
# Warm start: run the search with the trials database already filled from the
# past experiments.
from hyperopt import fmin, tpe, Trials, rand

# Run the hyperparameter search using the tpe algorithm.
# NOTE(review): hyperopt appears to count trials already present in `trials`
# towards max_evals - confirm that 1500 leaves the intended number of new
# evaluations (the cold run below uses 1000).
best = fmin(objective,
            space,
            algo=tpe.suggest,
            max_evals=1500,
            trials=warmTrials)

In [None]:
# Cold start: run the same search, but from an empty trials database.
from hyperopt import fmin, tpe, Trials, rand

coldTrials = Trials()
# Run the hyperparameter search using the tpe algorithm.
# Note that `best` from the warm-started run is overwritten here.
best = fmin(objective,
            space,
            algo=tpe.suggest,
            max_evals=1000,
            trials=coldTrials)

In [None]:
def drawPlots(targetTrial):
    """Scatter the loss against five hyperparameters and against trial order.

    Draws a 10x10 figure: one scatter plot per hyperparameter in the upper
    part of the grid and, across the bottom row, the loss of every trial in
    the order the trials are stored.

    Args:
        targetTrial: trials object exposing ``.trials`` (documents whose
            sampled values are read from ``['misc']['vals'][name][0]``) and
            ``.losses()``.
    """
    # Compute the losses once instead of once per subplot.
    losses = np.array(targetTrial.losses())
    plottedParams = [
        'classifier__tol',
        'classifier__gamma',
        'classifier__C',
        # NOTE(review): for hp.choice parameters the stored value looks like
        # the index of the chosen option, not the option itself - confirm.
        'classifier__degree',
        'classifier__coef0',
    ]

    plt.figure(figsize=(10, 10))
    for position, name in enumerate(plottedParams, start=1):
        plt.subplot(3, 3, position)
        values = np.array([t['misc']['vals'][name][0] for t in targetTrial.trials])
        ax = sns.regplot(x=values, y=losses, fit_reg=False)
        ax.set_title(name)

    # Bottom row: loss per trial in stored order. x/y are passed by keyword,
    # like the calls above, because newer seaborn rejects positional data.
    plt.subplot(3, 1, 3)
    ax = sns.regplot(x=np.arange(len(losses)), y=losses, fit_reg=False)
    ax.set_title('Trials vs Quality over time')

## Cold vs Warm

In [None]:
# Compare the per-trial losses of the warm-started and the cold search.
# NOTE(review): the [500:] slice presumably skips the trials that were injected
# from the experiment database - confirm that count.
trials = Trials()
trials.insert_trial_docs(warmTrials.trials[500:])
trials.refresh()
warmLosses = np.array(trials.losses())
coldLosses = np.array(coldTrials.losses())
plt.figure(figsize=(10, 10))
# x/y are passed by keyword because newer seaborn rejects positional data.
plt.subplot(2, 1, 1)
ax = sns.regplot(x=np.arange(len(warmLosses)), y=warmLosses, fit_reg=False)
ax.set_title('Warm Trials')
plt.subplot(2, 1, 2)
ax = sns.regplot(x=np.arange(len(coldLosses)), y=coldLosses, fit_reg=False)
ax.set_title('Cold Trials')

## Using Experiment Database to warmstart the Trials

In [None]:
# Plot only trials 500-599 of the warm-started run.
# NOTE(review): presumably these are the first 100 trials proposed by the
# search after the injected ones - confirm the offset of 500.
trials = Trials()
trials.insert_trial_docs(warmTrials.trials[500:600])
trials.refresh()
drawPlots(trials)

## No Warmstarting

In [None]:
# Same plots for the search that started from an empty Trials object.
drawPlots(coldTrials)

In [94]:
# Show the best trial document of the cold-start search.
coldTrials.best_trial

{'book_time': datetime.datetime(2018, 7, 1, 11, 18, 0, 93000),
 'exp_key': None,
 'misc': {'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'idxs': {'classifier__C': [20],
   'classifier__coef0': [20],
   'classifier__degree': [20],
   'classifier__gamma': [20],
   'classifier__shrinking': [20],
   'classifier__tol': [20],
   'imputation__strategy': [20]},
  'tid': 20,
  'vals': {'classifier__C': [0.6923023790257264],
   'classifier__coef0': [0.4490707362102988],
   'classifier__degree': [4],
   'classifier__gamma': [0.008142826362265525],
   'classifier__shrinking': [0],
   'classifier__tol': [0.00024120800848078465],
   'imputation__strategy': [2]},
  'workdir': None},
 'owner': None,
 'refresh_time': datetime.datetime(2018, 7, 1, 11, 18, 0, 409000),
 'result': {'loss': 0.18999999999999995, 'status': 'ok'},
 'spec': None,
 'state': 2,
 'tid': 20,
 'version': 0}

In [95]:
# Show the best trial document of the warm-started search.
warmTrials.best_trial

{'book_time': datetime.datetime(2018, 7, 1, 11, 28, 41, 638000),
 'exp_key': None,
 'misc': {'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'idxs': {'classifier__C': [621],
   'classifier__coef0': [621],
   'classifier__degree': [621],
   'classifier__gamma': [621],
   'classifier__shrinking': [621],
   'classifier__tol': [621],
   'imputation__strategy': [621]},
  'tid': 621,
  'vals': {'classifier__C': [7.75292203371408],
   'classifier__coef0': [0.7731936758562243],
   'classifier__degree': [4],
   'classifier__gamma': [0.0011317768583867197],
   'classifier__shrinking': [1],
   'classifier__tol': [0.0018070574299404],
   'imputation__strategy': [0]},
  'workdir': None},
 'owner': None,
 'refresh_time': datetime.datetime(2018, 7, 1, 11, 28, 41, 913000),
 'result': {'loss': 0.19999999999999996, 'status': 'ok'},
 'spec': None,
 'state': 2,
 'tid': 621,
 'version': 0}

## Conclusion
This certainly shows some promising results. I have to investigate a bit more, because the search seems to get stuck at some local point: the objective value of most of the proposed hyperparameter settings is constant (0.3), as you can see from the figure above.
Moreover, the behaviour is a bit random, sometimes s