# Model Building with BoFire

This notebook shows how to set up and analyze models trained with BoFire. It is still a work in progress (WIP).

## Imports

In [1]:
from bofire.data_models.domain.api import Outputs
from bofire.data_models.surrogates.api import SingleTaskGPSurrogate, RandomForestSurrogate, MixedSingleTaskGPSurrogate, AnySurrogate, RandomForestSurrogate, EmpiricalSurrogate, MLPEnsemble
from bofire.benchmarks.single import Himmelblau
from bofire.benchmarks.multi import CrossCoupling
import bofire.surrogates.api as surrogates
import json
from bofire.data_models.enum import CategoricalEncodingEnum

from pydantic import parse_obj_as

## Problem Setup

For didactic purposes, we sample data from a Himmelblau benchmark function and use them to train a SingleTaskGP.

In [2]:
# instantiate the Himmelblau benchmark
benchmark = Himmelblau()
# draw 50 samples from the input features of the benchmark's domain
samples = benchmark.domain.inputs.sample(n=50)
# evaluate the benchmark on the samples to obtain the experiments
experiments = benchmark.f(samples, return_complete=True)

# show the first 10 experiments
experiments.head(10)

Unnamed: 0,x_1,x_2,y,valid_y
0,5.010799,-0.612165,184.746878,1
1,-1.779981,-0.137665,140.265892,1
2,-5.063193,-3.811183,123.236129,1
3,5.114825,5.41927,1178.897832,1
4,-2.921467,-2.808005,31.952544,1
5,0.90609,4.477183,227.148355,1
6,3.319714,-2.211923,6.272053,1
7,3.629923,-0.748149,9.937792,1
8,-1.612215,4.45189,141.192994,1
9,0.242512,4.767127,293.096581,1


## Model Fitting


In [3]:
# the input and output features are taken from the benchmark's domain
input_features = benchmark.domain.inputs
output_features = benchmark.domain.outputs

In [4]:
# show the input features serialized as json
input_features.json()

'{"type": "Inputs", "features": [{"type": "ContinuousInput", "key": "x_1", "unit": null, "bounds": [-6.0, 6.0], "stepsize": null}, {"type": "ContinuousInput", "key": "x_2", "unit": null, "bounds": [-6.0, 6.0], "stepsize": null}]}'

In [5]:
# show the output features serialized as json
output_features.json()

'{"type": "Outputs", "features": [{"type": "ContinuousOutput", "key": "y", "unit": null, "objective": {"type": "MinimizeObjective", "w": 1.0, "bounds": [0, 1]}}]}'

### Single Task GP

Generate the json spec

In [6]:
# we set up the data model, here a Single Task GP
surrogate_data = SingleTaskGPSurrogate(
    inputs=input_features,
    outputs=output_features
)

# we generate the json spec
jspec = surrogate_data.json()

# display the json spec
jspec

'{"type": "SingleTaskGPSurrogate", "inputs": {"type": "Inputs", "features": [{"type": "ContinuousInput", "key": "x_1", "unit": null, "bounds": [-6.0, 6.0], "stepsize": null}, {"type": "ContinuousInput", "key": "x_2", "unit": null, "bounds": [-6.0, 6.0], "stepsize": null}]}, "outputs": {"type": "Outputs", "features": [{"type": "ContinuousOutput", "key": "y", "unit": null, "objective": {"type": "MinimizeObjective", "w": 1.0, "bounds": [0, 1]}}]}, "input_preprocessing_specs": {}, "dump": null, "kernel": {"type": "ScaleKernel", "base_kernel": {"type": "MaternKernel", "ard": true, "nu": 2.5, "lengthscale_prior": {"type": "GammaPrior", "concentration": 3.0, "rate": 6.0}}, "outputscale_prior": {"type": "GammaPrior", "concentration": 2.0, "rate": 0.15}}, "noise_prior": {"type": "GammaPrior", "concentration": 1.1, "rate": 0.05}, "scaler": "NORMALIZE"}'

Load it from the spec

In [7]:
# parse the json spec back into a surrogate data model
surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec))

Map the data model to the surrogate.

In [8]:
# map the data model to the surrogate object that is fitted and used for predictions
surrogate = surrogates.map(surrogate_data)

Fit it. This part is not 100% finished: in the future, we will call `hyperfit` here, which will return the CV results etc. This still has to be finished, so ignore it for now and just call `fit`.

In [9]:
# fit the surrogate on the sampled experiments
surrogate.fit(experiments=experiments)

Dump it.

In [10]:
# dump the fitted surrogate; the dump is loaded again further below
dump = surrogate.dumps()

Make predictions.

In [11]:
# predict with the fitted surrogate on the experiments
df_predictions = surrogate.predict(experiments)
# transform the prediction dataframe to the prediction spec
predictions = surrogate.to_predictions(predictions=df_predictions)

Load the surrogate again from the spec and the dump, and make predictions.

In [12]:
# rebuild the surrogate from the json spec and load the dump into it
surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec))
surrogate = surrogates.map(surrogate_data)
surrogate.loads(dump)

# predict with the reloaded surrogate
df_predictions2 = surrogate.predict(experiments)
# transform the prediction dataframe to the prediction spec
predictions2 = surrogate.to_predictions(predictions=df_predictions2)

# check that the reloaded surrogate gives the same predictions as the original one
predictions==predictions2


True

### Random Forest

Generate the json spec

In [13]:
# we set up the data model, here a Random Forest
surrogate_data = RandomForestSurrogate(
    inputs=input_features,
    outputs=output_features,
    random_state=42
)

# we generate the json spec
jspec = surrogate_data.json()

# display the json spec
jspec

'{"type": "RandomForestSurrogate", "inputs": {"type": "Inputs", "features": [{"type": "ContinuousInput", "key": "x_1", "unit": null, "bounds": [-6.0, 6.0], "stepsize": null}, {"type": "ContinuousInput", "key": "x_2", "unit": null, "bounds": [-6.0, 6.0], "stepsize": null}]}, "outputs": {"type": "Outputs", "features": [{"type": "ContinuousOutput", "key": "y", "unit": null, "objective": {"type": "MinimizeObjective", "w": 1.0, "bounds": [0, 1]}}]}, "input_preprocessing_specs": {}, "dump": null, "n_estimators": 100, "criterion": "squared_error", "max_depth": null, "min_samples_split": 2, "min_samples_leaf": 1, "min_weight_fraction_leaf": 0.0, "max_features": 1.0, "max_leaf_nodes": null, "min_impurity_decrease": 0.0, "bootstrap": true, "oob_score": false, "random_state": 42, "ccp_alpha": 0.0, "max_samples": null}'

In [14]:
# Load the data model from the json spec
surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec))
# Map the data model to the surrogate
surrogate = surrogates.map(surrogate_data)
# Fit it on the experiments
surrogate.fit(experiments=experiments)
# Dump the fitted surrogate
dump = surrogate.dumps()
# Predict with it
df_predictions = surrogate.predict(experiments)
# Transform the prediction dataframe to the prediction spec
predictions = surrogate.to_predictions(predictions=df_predictions)

In [15]:
# rebuild the surrogate from the json spec and load the dump into it
surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec))
surrogate = surrogates.map(surrogate_data)
surrogate.loads(dump)

# predict with the reloaded surrogate
df_predictions2 = surrogate.predict(experiments)
# transform the prediction dataframe to the prediction spec
predictions2 = surrogate.to_predictions(predictions=df_predictions2)

# check that the reloaded surrogate gives the same predictions as the original one
predictions==predictions2

True

### MLP Ensemble

Generate the json spec

In [16]:
# we set up the data model, here an MLP Ensemble with two estimators
surrogate_data = MLPEnsemble(
    inputs=input_features,
    outputs=output_features,
    n_estimators=2
)

# we generate the json spec
jspec = surrogate_data.json()

# display the json spec
jspec

'{"type": "MLPEnsemble", "inputs": {"type": "Inputs", "features": [{"type": "ContinuousInput", "key": "x_1", "unit": null, "bounds": [-6.0, 6.0], "stepsize": null}, {"type": "ContinuousInput", "key": "x_2", "unit": null, "bounds": [-6.0, 6.0], "stepsize": null}]}, "outputs": {"type": "Outputs", "features": [{"type": "ContinuousOutput", "key": "y", "unit": null, "objective": {"type": "MinimizeObjective", "w": 1.0, "bounds": [0, 1]}}]}, "input_preprocessing_specs": {}, "dump": null, "n_estimators": 2, "hidden_layer_sizes": [100], "activation": "relu", "dropout": 0.0, "batch_size": 10, "n_epochs": 200, "lr": 0.0001, "weight_decay": 0.0, "subsample_fraction": 1.0, "shuffle": true, "scaler": "NORMALIZE"}'

In [None]:
# Load the data model from the json spec
surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec))
# Map the data model to the surrogate
surrogate = surrogates.map(surrogate_data)
# Fit it on the experiments
surrogate.fit(experiments=experiments)
# Dump the fitted surrogate
dump = surrogate.dumps()
# Predict with it
df_predictions = surrogate.predict(experiments)
# Transform the prediction dataframe to the prediction spec
predictions = surrogate.to_predictions(predictions=df_predictions)

In [19]:
# rebuild the surrogate from the json spec and load the dump into it
surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec))
surrogate = surrogates.map(surrogate_data)
surrogate.loads(dump)

# predict with the reloaded surrogate
df_predictions2 = surrogate.predict(experiments)
# transform the prediction dataframe to the prediction spec
predictions2 = surrogate.to_predictions(predictions=df_predictions2)

# check that the reloaded surrogate gives the same predictions as the original one
predictions==predictions2

True

### Empirical Surrogate

The empirical model is special: by default it has no fit, and you need cloudpickle. There can be empirical models which implement a fit, but for this they also have to inherit from `Trainable`. The current example is the default one without any fit functionality.

In [20]:
from botorch.models.deterministic import DeterministicModel
from torch import Tensor

class HimmelblauModel(DeterministicModel):
    """Deterministic botorch model that evaluates the Himmelblau function."""

    def __init__(self):
        """Initialize the base model and declare a single output."""
        super().__init__()
        self._num_outputs = 1

    def forward(self, X: Tensor) -> Tensor:
        """Evaluate the Himmelblau function on the last dimension of ``X``.

        Args:
            X: Tensor whose last dimension holds the two inputs; index 0 and
                index 1 of that dimension are used.

        Returns:
            Tensor with the function values and an added trailing dimension.
        """
        x_1 = X[..., 0]
        x_2 = X[..., 1]
        first_term = (x_1 ** 2 + x_2 - 11.0) ** 2
        second_term = (x_1 + x_2 ** 2 - 7.0) ** 2
        # unsqueeze appends the trailing output dimension
        return (first_term + second_term).unsqueeze(-1)

In [21]:
# we set up the data model, here an Empirical Surrogate
surrogate_data = EmpiricalSurrogate(
    inputs=input_features,
    outputs=output_features,
)

# we generate the json spec
jspec = surrogate_data.json()

# display the json spec
jspec

'{"type": "EmpiricalSurrogate", "inputs": {"type": "Inputs", "features": [{"type": "ContinuousInput", "key": "x_1", "unit": null, "bounds": [-6.0, 6.0], "stepsize": null}, {"type": "ContinuousInput", "key": "x_2", "unit": null, "bounds": [-6.0, 6.0], "stepsize": null}]}, "outputs": {"type": "Outputs", "features": [{"type": "ContinuousOutput", "key": "y", "unit": null, "objective": {"type": "MinimizeObjective", "w": 1.0, "bounds": [0, 1]}}]}, "input_preprocessing_specs": {}, "dump": null}'

In [22]:
# Load the data model from the json spec
surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec))
# Map the data model to the surrogate
surrogate = surrogates.map(surrogate_data)
# Attach the actual model to it (no fit is called in this example)
surrogate.model = HimmelblauModel()
# Dump the surrogate
dump = surrogate.dumps()
# Predict with it
df_predictions = surrogate.predict(experiments)
# Transform the prediction dataframe to the prediction spec
predictions = surrogate.to_predictions(predictions=df_predictions)

In [23]:
# rebuild the surrogate from the json spec and load the dump into it
surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec))
surrogate = surrogates.map(surrogate_data)
surrogate.loads(dump)

# predict with the reloaded surrogate
df_predictions2 = surrogate.predict(experiments)
# transform the prediction dataframe to the prediction spec
predictions2 = surrogate.to_predictions(predictions=df_predictions2)

# check that the reloaded surrogate gives the same predictions as the original one
predictions==predictions2

True

### Mixed GP

Generate data for a mixed problem.

In [24]:
# instantiate the CrossCoupling benchmark
# NOTE: this rebinds `benchmark`, `samples` and `experiments` from the Himmelblau section
benchmark = CrossCoupling()
# draw 50 samples from the input features of the benchmark's domain
samples = benchmark.domain.inputs.sample(n=50)
# evaluate the benchmark on the samples to obtain the experiments
experiments = benchmark.f(samples, return_complete=True)

# show the first 10 experiments
experiments.head(10)

Unnamed: 0,base_eq,t_res,temperature,base,catalyst,yield,cost,valid_cost,valid_yield
0,2.042351,647.316445,52.297576,TMG,AlPhos,0.093265,0.419085,1,1
1,1.044297,690.01172,86.55915,DBU,AlPhos,0.952935,0.420151,1,1
2,1.258711,144.332565,92.814988,TEA,tBuXPhos,0.041249,0.248697,1,1
3,2.495915,1116.115238,85.396238,BTMG,AlPhos,0.930243,0.528033,1,1
4,1.219549,1764.319528,72.869934,TEA,tBuXPhos,0.135403,0.248683,1,1
5,1.010881,1544.723259,37.30969,TEA,tBuBrettPhos,0.118655,0.278638,1,1
6,2.197348,1678.508461,68.74229,BTMG,tBuXPhos,0.954217,0.344219,1,1
7,1.08108,1330.517549,30.354525,DBU,tBuXPhos,0.214738,0.24942,1,1
8,2.326009,994.826769,89.459671,TEA,tBuXPhos,0.099964,0.249086,1,1
9,1.899661,1712.027463,58.522522,TEA,AlPhos,0.136984,0.419703,1,1


In [25]:
# we set up the data model, here a Mixed Single Task GP
# only the first output feature of the benchmark is modeled,
# and a ONE_HOT encoding is specified for the "catalyst" input
surrogate_data = MixedSingleTaskGPSurrogate(
    inputs=benchmark.domain.inputs,
    outputs=Outputs(features=[benchmark.domain.outputs.features[0]]),
    input_preprocessing_specs={"catalyst": CategoricalEncodingEnum.ONE_HOT}
)

# we generate the json spec
jspec = surrogate_data.json()

# display the json spec
jspec

'{"type": "MixedSingleTaskGPSurrogate", "inputs": {"type": "Inputs", "features": [{"type": "CategoricalDescriptorInput", "key": "catalyst", "categories": ["tBuXPhos", "tBuBrettPhos", "AlPhos"], "allowed": [true, true, true], "descriptors": ["area_cat", "M2_cat"], "values": [[460.7543, 67.2057], [518.8408, 89.8738], [819.933, 129.0808]]}, {"type": "CategoricalDescriptorInput", "key": "base", "categories": ["TEA", "TMG", "BTMG", "DBU"], "allowed": [true, true, true, true], "descriptors": ["area", "M2"], "values": [[162.2992, 25.8165], [165.5447, 81.4847], [227.3523, 30.554], [192.4693, 59.8367]]}, {"type": "ContinuousInput", "key": "base_eq", "unit": null, "bounds": [1.0, 2.5], "stepsize": null}, {"type": "ContinuousInput", "key": "temperature", "unit": null, "bounds": [30.0, 100.0], "stepsize": null}, {"type": "ContinuousInput", "key": "t_res", "unit": null, "bounds": [60.0, 1800.0], "stepsize": null}]}, "outputs": {"type": "Outputs", "features": [{"type": "ContinuousOutput", "key": "yi

In [28]:
# Load the data model from the json spec
surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec))
# Map the data model to the surrogate
surrogate = surrogates.map(surrogate_data)
# Fit it on the experiments
surrogate.fit(experiments=experiments)
# Dump the fitted surrogate
dump = surrogate.dumps()
# Predict with it
df_predictions = surrogate.predict(experiments)
# Transform the prediction dataframe to the prediction spec
predictions = surrogate.to_predictions(predictions=df_predictions)

In [29]:
# rebuild the surrogate from the json spec and load the dump into it
surrogate_data = parse_obj_as(AnySurrogate, json.loads(jspec))
surrogate = surrogates.map(surrogate_data)
surrogate.loads(dump)

# predict with the reloaded surrogate
df_predictions2 = surrogate.predict(experiments)
# transform the prediction dataframe to the prediction spec
predictions2 = surrogate.to_predictions(predictions=df_predictions2)

# check that the reloaded surrogate gives the same predictions as the original one
predictions==predictions2

True