# Model fitting and hyperopt

## Imports and general setup

In [1]:
from bofire.benchmarks.api import Himmelblau, hyperoptimize
from bofire.data_models.surrogates.api import SingleTaskGPSurrogate
import bofire.surrogates.api as surrogates 
from bofire.surrogates.feature_importance import permutation_importance_hook, combine_permutation_importances
from bofire.plot.feature_importance import plot_feature_importance_by_feature_plotly
from pydantic import BaseModel
from typing import Literal
from bofire.surrogates.diagnostics import CvResults2CrossValidationValues, CrossValidationValues
from bofire.data_models.enum import RegressionMetricsEnum
import random
import json

# Benchmark problem: provides the domain (inputs/outputs) and the function `f` to evaluate.
benchmark = Himmelblau()

# Draw 25 points from the input space and evaluate the benchmark on them.
sampled_inputs = benchmark.domain.inputs.sample(25)
experiments = benchmark.f(sampled_inputs, return_complete=True)

# Data model for a single-task GP surrogate defined on the benchmark's inputs and outputs.
surrogate_data = SingleTaskGPSurrogate(
    inputs=benchmark.domain.inputs,
    outputs=benchmark.domain.outputs,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class TestMethod(BaseModel):
    """Base model for a test method description, discriminated by its `type` field."""

    # Identifier of the concrete test method.
    type: str


class CrossValidation(TestMethod):
    """Test method describing a cross-validation with a given number of folds."""

    # Fixed discriminator value for this test method.
    type: Literal["CrossValidation"] = "CrossValidation"
    # Number of folds used in the cross-validation.
    foldCount: int

## With hyperopt

In [3]:
# Cross-validation settings; the seed is drawn anew on every execution of this cell.
FOLDS = 5
RANDOM_STATE = random.randint(1, 1000)

# Drop every experiment that does not have valid values for all outputs,
# so that only usable rows enter the cross-validation.
cleaned_experiments = surrogate_data.outputs.preprocess_experiments_all_valid_outputs(
    experiments=experiments
)

# Hyperparameter search: yields the optimized surrogate data model and the metrics table.
opt_surrogate_data, metrics = hyperoptimize(
    surrogate_data=surrogate_data,
    training_data=cleaned_experiments,
    folds=FOLDS,
    random_state=RANDOM_STATE,
)
metrics = metrics.reset_index(drop=True)

# Cross-validate the optimized surrogate explicitly (same folds and seed as above);
# this becomes necessary once more sophisticated splitting schemes are supported.
surrogate = surrogates.map(opt_surrogate_data)
cv_train, cv_test, pi = surrogate.cross_validate(
    cleaned_experiments,
    folds=FOLDS,
    hooks={"pemutation_imprtance": permutation_importance_hook},
    random_state=RANDOM_STATE,
)

# Objects describing the test method and the cross-validation results.
testMethod = CrossValidation(foldCount=FOLDS)
cvResultsTrain = CvResults2CrossValidationValues(cv_train)
cvResultsTest = CvResults2CrossValidationValues(cv_test)

# Mean of every metric, keyed by the first (only) output of the surrogate.
# Train metrics are computed per fold, test metrics on the combined folds.
output_key = surrogate.outputs[0].key
train_metric_means = cv_train.get_metrics(combine_folds=False).describe().loc["mean"]
test_metric_means = cv_test.get_metrics(combine_folds=True).describe().loc["mean"]
metricsTrain = {output_key: train_metric_means.to_dict()}
metricsTest = {output_key: test_metric_means.to_dict()}

# Final model: fit on the complete dataset and serialize it.
surrogate.fit(experiments=experiments)
dump = surrogate.dumps()

# Mean and std of the permutation importances collected by the hook, per regression metric.
permutation_importance = {
    m.name: combine_permutation_importances(pi["pemutation_imprtance"], m)
    .describe()
    .loc[["mean", "std"]]
    .to_dict()
    for m in RegressionMetricsEnum
}

# The misc field bundles the hyperopt details and the feature importances as JSON strings.
misc = {
    "hyperparameter_optimization": {
        "domain": surrogate_data.hyperconfig.domain.json(),
        "metrics": json.dumps(metrics.to_dict()),
    },
    "feature_importance": {
        "permutation_feature_importance": json.dumps(permutation_importance),
    },
}

# Objects to be saved to the backend:
# - opt_surrogate_data
# - dump
# - testMethod
# - cvResultsTrain
# - cvResultsTest
# - metricsTrain
# - metricsTest






## Without hyperopt

In [4]:
# Cross-validation settings; the seed is drawn anew on every execution of this cell.
FOLDS = 5
RANDOM_STATE = random.randint(1, 1000)

# Drop every experiment that does not have valid values for all outputs,
# so that only usable rows enter the cross-validation.
cleaned_experiments = surrogate_data.outputs.preprocess_experiments_all_valid_outputs(
    experiments=experiments
)

# No hyperparameter search here: the surrogate is built directly from the
# original `surrogate_data` and cross-validated as is.
surrogate = surrogates.map(surrogate_data)
cv_train, cv_test, pi = surrogate.cross_validate(
    cleaned_experiments,
    folds=FOLDS,
    hooks={"pemutation_imprtance": permutation_importance_hook},
    random_state=RANDOM_STATE,
)

# Objects describing the test method and the cross-validation results.
testMethod = CrossValidation(foldCount=FOLDS)
cvResultsTrain = CvResults2CrossValidationValues(cv_train)
cvResultsTest = CvResults2CrossValidationValues(cv_test)

# Mean of every metric, keyed by the first (only) output of the surrogate.
# Train metrics are computed per fold, test metrics on the combined folds.
output_key = surrogate.outputs[0].key
train_metric_means = cv_train.get_metrics(combine_folds=False).describe().loc["mean"]
test_metric_means = cv_test.get_metrics(combine_folds=True).describe().loc["mean"]
metricsTrain = {output_key: train_metric_means.to_dict()}
metricsTest = {output_key: test_metric_means.to_dict()}

# Final model: fit on the complete dataset and serialize it.
surrogate.fit(experiments=experiments)
dump = surrogate.dumps()

# Mean and std of the permutation importances collected by the hook, per regression metric.
permutation_importance = {
    m.name: combine_permutation_importances(pi["pemutation_imprtance"], m)
    .describe()
    .loc[["mean", "std"]]
    .to_dict()
    for m in RegressionMetricsEnum
}

# Without hyperopt the misc field only carries the feature importances (as a JSON string).
misc = {
    "feature_importance": {
        "permutation_feature_importance": json.dumps(permutation_importance),
    },
}

# Objects to be saved to the backend:
# - surrogate_data
# - dump
# - testMethod
# - cvResultsTrain
# - cvResultsTest
# - metricsTrain
# - metricsTest


## Combined

In [5]:
# Switch between the two workflows above; the seed is drawn anew on every execution.
HYPEROPTIMIZE = False
FOLDS = 5
RANDOM_STATE = random.randint(1, 1000)

# Drop every experiment that does not have valid values for all outputs,
# so that only usable rows enter the cross-validation.
cleaned_experiments = surrogate_data.outputs.preprocess_experiments_all_valid_outputs(
    experiments=experiments
)

# Either keep the surrogate data as provided or replace it by the hyperoptimized one.
if not HYPEROPTIMIZE:
    opt_surrogate_data = surrogate_data
else:
    opt_surrogate_data, metrics = hyperoptimize(
        surrogate_data=surrogate_data,
        training_data=cleaned_experiments,
        folds=FOLDS,
        random_state=RANDOM_STATE,
    )
    metrics = metrics.reset_index(drop=True)

# Cross-validate the selected surrogate explicitly; this becomes necessary
# once more sophisticated splitting schemes are supported.
surrogate = surrogates.map(opt_surrogate_data)
cv_train, cv_test, pi = surrogate.cross_validate(
    cleaned_experiments,
    folds=FOLDS,
    hooks={"pemutation_imprtance": permutation_importance_hook},
    random_state=RANDOM_STATE,
)

# Objects describing the test method and the cross-validation results.
testMethod = CrossValidation(foldCount=FOLDS)
cvResultsTrain = CvResults2CrossValidationValues(cv_train)
cvResultsTest = CvResults2CrossValidationValues(cv_test)

# Mean of every metric, keyed by the first (only) output of the surrogate.
# Train metrics are computed per fold, test metrics on the combined folds.
output_key = surrogate.outputs[0].key
train_metric_means = cv_train.get_metrics(combine_folds=False).describe().loc["mean"]
test_metric_means = cv_test.get_metrics(combine_folds=True).describe().loc["mean"]
metricsTrain = {output_key: train_metric_means.to_dict()}
metricsTest = {output_key: test_metric_means.to_dict()}

# Final model: fit on the complete dataset and serialize it.
surrogate.fit(experiments=experiments)
dump = surrogate.dumps()

# Mean and std of the permutation importances collected by the hook, per regression metric.
permutation_importance = {
    m.name: combine_permutation_importances(pi["pemutation_imprtance"], m)
    .describe()
    .loc[["mean", "std"]]
    .to_dict()
    for m in RegressionMetricsEnum
}

# The misc field always holds the feature importances; the hyperopt details
# are only added when the optimization was actually run.
misc = {
    "feature_importance": {
        "permutation_feature_importance": json.dumps(permutation_importance),
    },
}
if HYPEROPTIMIZE:
    misc.update(
        {
            "hyperparameter_optimization": {
                "domain": surrogate_data.hyperconfig.domain.json(),
                "metrics": json.dumps(metrics.to_dict()),
            }
        }
    )
