# Model Building with BoFire

This notebook shows how to set up and analyze models trained with BoFire. It is still a work in progress (WIP).

## Imports

In [1]:
import json

import bofire.surrogates.api as surrogates
from bofire.benchmarks.multi import CrossCoupling
from bofire.benchmarks.single import Himmelblau
from bofire.data_models.domain.api import Inputs, Outputs
from bofire.data_models.enum import CategoricalEncodingEnum
from bofire.data_models.features.api import (
    ContinuousInput,
    ContinuousOutput,
)
from bofire.data_models.surrogates.api import (
    AnySurrogate,
    MixedSingleTaskGPSurrogate,
    RandomForestSurrogate,
    SingleTaskGPSurrogate,
)
from pydantic import ValidationError, parse_obj_as

  from .autonotebook import tqdm as notebook_tqdm


## Problem Setup

For didactic purposes, we sample data from a Himmelblau benchmark function and use them to train a SingleTaskGP.

In [4]:
# Instantiate the Himmelblau benchmark; its domain and its function `f` are used below.
benchmark = Himmelblau()
# Draw 50 points from the benchmark's input space.
# NOTE(review): no seed is passed here, so the rows previewed below presumably
# differ between runs -- confirm whether `sample` accepts a seed.
samples = benchmark.domain.inputs.sample(n=50)
# Evaluate the benchmark at the sampled points. The resulting table (previewed
# below) holds the inputs x_1, x_2 next to the output y and a valid_y column.
experiments = benchmark.f(samples, return_complete=True)

# Preview the first ten experiments.
experiments.head(10)

Unnamed: 0,x_1,x_2,y,valid_y
0,-0.503564,-2.530529,177.487428,1
1,-1.524027,-3.366221,152.92894,1
2,0.642145,2.663061,63.337918,1
3,-2.909026,3.811943,22.985851,1
4,0.096535,-3.248111,216.042024,1
5,2.994917,-0.071841,20.419082,1
6,-1.949467,-1.314282,124.644996,1
7,0.7303,1.864153,81.81312,1
8,-3.196244,-3.284751,16.906965,1
9,-2.487994,3.537119,10.759791,1


## Model Fitting


In [5]:
# Keep handles to the benchmark's input and output features; the surrogate
# data models in the following cells are built from them.
input_features = benchmark.domain.input_features
output_features = benchmark.domain.output_features

In [6]:
# Show the input features serialized as a JSON string.
input_features.json()

'{"type": "Inputs", "features": [{"type": "ContinuousInput", "key": "x_1", "lower_bound": -4.0, "upper_bound": 4.0}, {"type": "ContinuousInput", "key": "x_2", "lower_bound": -4.0, "upper_bound": 4.0}]}'

In [7]:
# Show the output features serialized as a JSON string.
output_features.json()

'{"type": "Outputs", "features": [{"type": "ContinuousOutput", "key": "y", "objective": {"type": "MaximizeObjective", "w": 1.0, "lower_bound": 0, "upper_bound": 1}}]}'

### Single Task GP

Generate the json spec

In [8]:
# We set up the data model, here a Single Task GP, from the benchmark's
# input and output features.
surrogate_data = SingleTaskGPSurrogate(
    input_features=input_features,
    output_features=output_features
)

# We generate the JSON spec of the data model.
jspec = surrogate_data.json()

# Display the spec. Besides the features it also lists kernel and scaler
# settings that were not passed explicitly above.
jspec

'{"type": "SingleTaskGPSurrogate", "input_features": {"type": "Inputs", "features": [{"type": "ContinuousInput", "key": "x_1", "lower_bound": -4.0, "upper_bound": 4.0}, {"type": "ContinuousInput", "key": "x_2", "lower_bound": -4.0, "upper_bound": 4.0}]}, "output_features": {"type": "Outputs", "features": [{"type": "ContinuousOutput", "key": "y", "objective": {"type": "MaximizeObjective", "w": 1.0, "lower_bound": 0, "upper_bound": 1}}]}, "input_preprocessing_specs": {}, "kernel": {"type": "ScaleKernel", "base_kernel": {"type": "MaternKernel", "ard": true, "nu": 2.5, "lengthscale_prior": {"type": "GammaPrior", "concentration": 3.0, "rate": 6.0}}, "outputscale_prior": {"type": "GammaPrior", "concentration": 2.0, "rate": 0.15}}, "scaler": "NORMALIZE"}'

Load it from the spec

In [42]:
# Decode the JSON spec and let pydantic parse it as AnySurrogate, so the concrete
# surrogate class does not have to be named here (presumably it is selected via
# the "type" field of the spec -- confirm).
surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec))

Map it 

In [32]:
# Map the data model to the surrogate object that is fitted and used for
# prediction in the following cells.
surrogate = surrogates.map(surrogate_data)

Fit it. This step is not fully finished yet: in the future, `hyperfit` will be called here, which will also return the cross-validation (CV) results etc. For now, ignore this and just call `fit`.

In [8]:
# Train the surrogate on the sampled experiments.
surrogate.fit(experiments=experiments)

Dump it.

In [9]:
# Dump the fitted surrogate; the dump is handed to `surrogate.loads` further
# below to restore it without fitting again.
dump = surrogate.dumps()

Make predictions.

In [10]:
# Predict with the fitted surrogate on the same experiments it was trained on.
df_predictions = surrogate.predict(experiments)
# Transform the raw predictions into the prediction spec format.
predictions = surrogate.to_predictions(predictions=df_predictions)

Load the surrogate again from the spec and the dump, and make predictions with it.

In [34]:
# Recreate a fresh surrogate from the JSON spec (fit is not called on it) ...
surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec))
surrogate = surrogates.map(surrogate_data)
# ... and load the previously created dump into it instead.
surrogate.loads(dump)

# Predict with the restored surrogate on the same experiments.
df_predictions2 = surrogate.predict(experiments)
# Transform the raw predictions into the prediction spec format.
predictions2 = surrogate.to_predictions(predictions=df_predictions2)

# Check for equality: the restored surrogate is expected to reproduce the
# predictions of the originally fitted one.
predictions==predictions2


### Random Forest

Generate the json spec

In [48]:
# We set up the data model, here a Random Forest, from the same input and
# output features; random_state is fixed to 42 (presumably to make the fit
# reproducible -- confirm).
surrogate_data = RandomForestSurrogate(
    input_features=input_features,
    output_features=output_features,
    random_state=42
)

# We generate the JSON spec of the data model.
jspec = surrogate_data.json()

# Display the spec. Besides the features and random_state it also lists
# forest settings that were not passed explicitly above.
jspec

'{"type": "RandomForestSurrogate", "input_features": {"type": "Inputs", "features": [{"type": "ContinuousInput", "key": "x_1", "lower_bound": -4.0, "upper_bound": 4.0}, {"type": "ContinuousInput", "key": "x_2", "lower_bound": -4.0, "upper_bound": 4.0}]}, "output_features": {"type": "Outputs", "features": [{"type": "ContinuousOutput", "key": "y", "objective": {"type": "MaximizeObjective", "w": 1.0, "lower_bound": 0, "upper_bound": 1}}]}, "input_preprocessing_specs": {}, "n_estimators": 100, "criterion": "squared_error", "max_depth": null, "min_samples_split": 2, "min_samples_leaf": 1, "min_weight_fraction_leaf": 0.0, "max_features": 1.0, "max_leaf_nodes": null, "min_impurity_decrease": 0.0, "bootstrap": true, "oob_score": false, "random_state": 42, "ccp_alpha": 0.0, "max_samples": null}'

In [49]:
# Same round trip as for the Single Task GP above, now with the Random Forest spec.
# Load the data model from the JSON spec.
surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec))
# Map the data model to a surrogate object.
surrogate = surrogates.map(surrogate_data)
# Fit it on the sampled experiments.
surrogate.fit(experiments=experiments)
# Dump the fitted surrogate so that it can be restored in the next cell.
dump = surrogate.dumps()
# Predict with it on the training experiments.
df_predictions = surrogate.predict(experiments)
# Transform the raw predictions into the prediction spec format.
predictions = surrogate.to_predictions(predictions=df_predictions)

In [50]:
# Recreate a fresh Random Forest surrogate from the JSON spec (fit is not called on it) ...
surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec))
surrogate = surrogates.map(surrogate_data)
# ... and load the previously created dump into it instead.
surrogate.loads(dump)

# Predict with the restored surrogate on the same experiments.
df_predictions2 = surrogate.predict(experiments)
# Transform the raw predictions into the prediction spec format.
predictions2 = surrogate.to_predictions(predictions=df_predictions2)

# Check for equality with the predictions of the originally fitted surrogate.
predictions==predictions2

True

### Mixed GP

Generate the json spec

In [9]:
# We try to set up the data model, here a Mixed Single Task GP, on the
# Himmelblau features. This is expected to be rejected: the validation message
# shown below states that at least one one-hot encoded categorical feature is
# required, whereas the features used here are all continuous inputs.
try:
    surrogate_data = MixedSingleTaskGPSurrogate(
        input_features=input_features,
        output_features=output_features,
    )
except ValidationError as err:
    # Display the error instead of letting it abort the notebook, so that
    # "Restart Kernel -> Run All" continues with the cells below.
    print(f"{type(err).__name__}: {err}")
else:
    # Only reached if validation unexpectedly passes:
    # we generate the json spec and show it.
    jspec = surrogate_data.json()
    print(jspec)

ValidationError: 1 validation error for MixedSingleTaskGPSurrogate
input_preprocessing_specs
  MixedSingleTaskGPSurrogate can only be used if at least one one-hot encoded categorical feature is present. (type=value_error)

As expected, this fails because we do not have any categorical feature in the dataset. So we have to set up another problem.

In [10]:
# Switch to the CrossCoupling benchmark. As the preview below shows, its data
# contains the categorical columns base and catalyst next to continuous ones,
# plus the two outputs yield and cost.
benchmark = CrossCoupling()
# Draw 50 points from the benchmark's input space.
# NOTE(review): no seed is passed here, so the rows previewed below presumably
# differ between runs -- confirm whether `sample` accepts a seed.
samples = benchmark.domain.inputs.sample(n=50)
# Evaluate the benchmark at the sampled points; this replaces the Himmelblau
# `experiments` used in the previous sections.
experiments = benchmark.f(samples, return_complete=True)

# Preview the first ten experiments.
experiments.head(10)

Unnamed: 0,base_eq,t_res,temperature,base,catalyst,yield,cost,valid_cost,valid_yield
0,1.108878,561.134791,58.990544,DBU,tBuXPhos,0.412899,0.24945,1,1
1,1.119897,1638.6267,38.141564,TEA,tBuBrettPhos,0.114862,0.278678,1,1
2,1.025228,1152.919137,33.574324,DBU,AlPhos,0.931872,0.420131,1,1
3,2.398088,276.503577,82.572749,BTMG,tBuBrettPhos,1.003118,0.383019,1,1
4,2.108768,716.139937,55.254079,TMG,tBuXPhos,0.114621,0.248316,1,1
5,1.131651,383.878986,35.294437,BTMG,AlPhos,0.955243,0.468442,1,1
6,1.422897,1476.426426,44.983435,TMG,AlPhos,0.194928,0.419063,1,1
7,1.784293,972.424187,85.569375,BTMG,tBuBrettPhos,1.030528,0.356208,1,1
8,2.259056,1325.443448,83.484574,TMG,AlPhos,0.724812,0.419093,1,1
9,1.811428,264.567154,47.612658,TMG,tBuBrettPhos,-0.022759,0.278336,1,1


In [13]:
# We set up the data model, here a Mixed Single Task GP, on the CrossCoupling
# features. Only the first output feature of the benchmark is modelled (the
# spec below shows it is "yield"), and "catalyst" is declared as one-hot
# encoded, which addresses the validation error raised for the Himmelblau setup.
surrogate_data = MixedSingleTaskGPSurrogate(
    input_features=benchmark.domain.input_features,
    output_features=Outputs(features=[benchmark.domain.output_features.features[0]]),
    input_preprocessing_specs={"catalyst": CategoricalEncodingEnum.ONE_HOT}
)

# We generate the JSON spec of the data model.
jspec = surrogate_data.json()

# Display the spec.
jspec

'{"type": "MixedSingleTaskGPSurrogate", "input_features": {"type": "Inputs", "features": [{"type": "CategoricalDescriptorInput", "key": "catalyst", "categories": ["tBuXPhos", "tBuBrettPhos", "AlPhos"], "allowed": [true, true, true], "descriptors": ["area_cat", "M2_cat"], "values": [[460.7543, 67.2057], [518.8408, 89.8738], [819.933, 129.0808]]}, {"type": "CategoricalDescriptorInput", "key": "base", "categories": ["TEA", "TMG", "BTMG", "DBU"], "allowed": [true, true, true, true], "descriptors": ["area", "M2"], "values": [[162.2992, 25.8165], [165.5447, 81.4847], [227.3523, 30.554], [192.4693, 59.8367]]}, {"type": "ContinuousInput", "key": "base_eq", "lower_bound": 1.0, "upper_bound": 2.5}, {"type": "ContinuousInput", "key": "temperature", "lower_bound": 30.0, "upper_bound": 100.0}, {"type": "ContinuousInput", "key": "t_res", "lower_bound": 60.0, "upper_bound": 1800.0}]}, "output_features": {"type": "Outputs", "features": [{"type": "ContinuousOutput", "key": "yield", "objective": {"type"

In [14]:
# Same round trip as in the previous sections, now with the Mixed GP spec and
# the CrossCoupling experiments.
# Load the data model from the JSON spec.
surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec))
# Map the data model to a surrogate object.
surrogate = surrogates.map(surrogate_data)
# Fit it on the sampled experiments.
surrogate.fit(experiments=experiments)
# Dump the fitted surrogate so that it can be restored in the next cell.
dump = surrogate.dumps()
# Predict with it on the training experiments.
df_predictions = surrogate.predict(experiments)
# Transform the raw predictions into the prediction spec format.
predictions = surrogate.to_predictions(predictions=df_predictions)

In [15]:
# Recreate a fresh Mixed GP surrogate from the JSON spec (fit is not called on it) ...
surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec))
surrogate = surrogates.map(surrogate_data)
# ... and load the previously created dump into it instead.
surrogate.loads(dump)

# Predict with the restored surrogate on the same experiments.
df_predictions2 = surrogate.predict(experiments)
# Transform the raw predictions into the prediction spec format.
predictions2 = surrogate.to_predictions(predictions=df_predictions2)

# Check for equality with the predictions of the originally fitted surrogate.
predictions==predictions2

True