In [None]:
import pandas as pd
import torch
import numpy as np
from aepsych.benchmark import (
    Benchmark,
    LSEProblemWithEdgeLogging,
)

# Use one seed value for both PyTorch and NumPy so re-running the notebook
# starts from the same random state.
SEED = 1
torch.manual_seed(SEED)
np.random.seed(SEED)  # kept last: it returns None, so the cell shows no output

  from .autonotebook import tqdm as notebook_tqdm


# Define the problem
We have example_problems defined in benchmark/example_problems.py. Problems are
subclasses of the Problem class. Below is an example of a very simple Problem that is
subclassed from an existing example problem.

In [2]:
# Modified DiscrimLowDim in benchmark/example_problems.py
class Discrim2D(LSEProblemWithEdgeLogging):
    """Two-dimensional discrimination problem on the unit square.

    The latent function is ``3 * x1 * x2``, so it is exactly 0 wherever either
    coordinate is 0. In addition to the metrics produced by the parent class,
    ``evaluate`` reports the proportion of post-initialization points that have
    at least one coordinate close to 0.
    """

    name = "discrim_2D"
    bounds = torch.tensor([[0, 0], [1, 1]], dtype=torch.double)

    # A point counts as "near the constraint" when any coordinate is below this.
    near_constraint_tol = 0.05

    def __init__(self, thresholds: float | list | torch.Tensor | None = None) -> None:
        """Create the problem.

        Args:
            thresholds: Threshold(s) forwarded to the parent constructor.
                Falls back to 0.75 when None.
        """
        thresholds = 0.75 if thresholds is None else thresholds
        super().__init__(thresholds=thresholds)

    def f(self, x: torch.Tensor) -> torch.Tensor:
        """Evaluate the latent function.

        Args:
            x: Points to evaluate; the product is taken over dim 1, so each row
                is one point (presumably shape (n, 2) given ``bounds``).

        Returns:
            A double tensor holding 3 times the product of each row's
            coordinates. Within ``bounds`` this lies in [0, 3] and is 0
            whenever either coordinate is 0.
        """
        return torch.prod(x, dim=1).to(torch.double) * 3.0

    def evaluate(self, strat):
        """Compute the parent's metrics plus ``prop_near_constraint``.

        Args:
            strat: Strategy being benchmarked. This method reads ``strat.x``
                and ``strat.strat_list[0].x``.

        Returns:
            The metrics mapping from the parent class with an added
            ``"prop_near_constraint"`` float entry: the fraction of points
            after the first strategy's trials that have any coordinate below
            ``near_constraint_tol`` (0.0 when there are no such points yet).
        """
        metrics = super().evaluate(strat)

        # Skip the points produced by the first strategy in the list
        # (presumably the initialization strategy -- confirm against the config).
        n_init_trials = len(strat.strat_list[0].x)
        later_points = strat.x[n_init_trials:, :]

        # Cast the boolean flags to double so that mean() is defined.
        near_constraint = torch.any(
            later_points < self.near_constraint_tol, dim=1
        ).double()

        # Always report a float; an empty selection would otherwise average to NaN.
        metrics["prop_near_constraint"] = (
            near_constraint.mean().item() if len(near_constraint) != 0 else 0.0
        )

        return metrics

In [3]:
# Instantiate the benchmark problem with a threshold of 0.75.
problem = Discrim2D(thresholds=0.75)

For this benchmarking problem, we want to see whether adding mixed likelihood
constraints to a threshold-finding GPClassification experiment can help make the model
more effective.

We care whether the overall performance is better, whether model fitting/generating
takes longer, and whether we are actually removing points near the constraints. Most of
these are already evaluated by the superclass, so we just add an additional metric to
the evaluate method to measure the proportion of points near the constraints.

In [4]:
# Each section is built separately and then assembled in the same key order.
common_section = dict(
    stimuli_per_trial=1,
    outcome_types=["binary"],
    strategy_names="[init_strat, opt_strat]",
)
init_strat_section = dict(
    min_total_tells=10,
    generator="SobolGenerator",
)
opt_strat_section = dict(
    min_total_tells=40,
    model="GPClassificationModel",
    generator="OptimizeAcqfGenerator",
)
constraint_factory_section = dict(
    constraint_lower="[[0, 0], [0, 0]]",
    constraint_upper="[[0, 1], [1, 0]]",
    constraint_values="[0.5, 0.5]",
    # The only multi-valued list leaf: one value is used per benchmark run.
    points_per_dim=[0, 3, 5, 10],
)

config = {
    "common": common_section,
    "init_strat": init_strat_section,
    "opt_strat": opt_strat_section,
    "OptimizeAcqfGenerator": dict(acqf="EAVC"),
    "EAVC": dict(target=0.75),
    "GPClassificationModel": dict(constraint_factory="constraint_factory"),
    "constraint_factory": constraint_factory_section,
}

benchmark = Benchmark(
    problems=[problem],
    configs=config,
    seed=1,
    n_reps=1,
    log_every=3,
)

To set up the Benchmark, we need to create a config for the problem we want to test. The
config is a dictionary in the usual Config form. However, every leaf node that is a list
is treated as a set of alternative values, and the benchmark runs the Cartesian product
of all such lists. In the above example, points_per_dim is the only list with more than
one value, so there will be 4 runs, each with a different points_per_dim.

We can run the benchmarks and get the resulting data as a pandas dataframe.

In [5]:
# Execute all configured runs, then collect the logged results as a pandas DataFrame.
benchmark.run_benchmarks()
df = benchmark.pandas()

[0m[0;37m2025-03-24 16:30:45,310 [INFO   ] Starting fitting (no warm start)...
[0m[0;37m2025-03-24 16:30:46,047 [INFO   ] Fitting done, took 0.7369418144226074
[0m[0;37m2025-03-24 16:30:46,049 [INFO   ] Starting gen...
[0m[0;37m2025-03-24 16:30:46,217 [INFO   ] Gen done, time=0.1677844524383545
[0m[0;37m2025-03-24 16:30:46,219 [INFO   ] Starting fitting (no warm start)...
[0m[0;37m2025-03-24 16:30:46,813 [INFO   ] Fitting done, took 0.5946550369262695
[0m[0;37m2025-03-24 16:30:46,815 [INFO   ] Starting gen...
[0m[0;37m2025-03-24 16:30:46,966 [INFO   ] Gen done, time=0.1503310203552246
[0m[0;37m2025-03-24 16:30:46,967 [INFO   ] Starting fitting (no warm start)...
[0m[0;37m2025-03-24 16:30:47,768 [INFO   ] Fitting done, took 0.80037522315979
[0m[0;37m2025-03-24 16:30:47,769 [INFO   ] Starting gen...
[0m[0;37m2025-03-24 16:30:47,916 [INFO   ] Gen done, time=0.14638733863830566
[0m[0;37m2025-03-24 16:30:47,918 [INFO   ] Starting fitting (no warm start)...
[0m[0;

In [6]:
# Show which columns were logged.
df.columns

Index(['fit_time', 'cum_fit_time', 'gen_time', 'cum_gen_time', 'trial_id',
       'rep', 'seed', 'final', 'strat_idx', 'problem_name',
       'problem_thresholds', 'init_strat_min_total_tells',
       'init_strat_generator', 'init_strat_stimuli_per_trial',
       'init_strat_outcome_types', 'init_strat_strategy_names',
       'init_strat_lb', 'init_strat_ub', 'init_strat_dim',
       'init_strat_parnames', 'opt_strat_min_total_tells', 'opt_strat_model',
       'opt_strat_generator', 'opt_strat_stimuli_per_trial',
       'opt_strat_outcome_types', 'opt_strat_strategy_names', 'opt_strat_lb',
       'opt_strat_ub', 'opt_strat_dim', 'opt_strat_parnames',
       'OptimizeAcqfGenerator_acqf', 'OptimizeAcqfGenerator_stimuli_per_trial',
       'OptimizeAcqfGenerator_outcome_types',
       'OptimizeAcqfGenerator_strategy_names', 'OptimizeAcqfGenerator_lb',
       'OptimizeAcqfGenerator_ub', 'OptimizeAcqfGenerator_dim',
       'OptimizeAcqfGenerator_parnames', 'EAVC_target',
       'EAVC_stimuli

In [7]:
# Columns that identify a run and the setting being varied.
run_cols = ["seed", "trial_id", "strat_idx", "constraint_factory_points_per_dim"]

# Fitting and generation timings.
timing_cols = ["fit_time", "cum_fit_time", "gen_time", "cum_gen_time"]

# Performance and sampling-location metrics.
metric_cols = [
    "brier",
    "misclass_on_thresh_0.75",
    "prop_edge_sampling_mean",
    "prop_edge_sampling_err",
    "prop_near_constraint",
]

# Concatenation keeps the original column order.
cols = run_cols + timing_cols + metric_cols

Notice that we have a lot of metrics that we can look at as well as the config settings
for a particular run of the experiment. We filter only for the columns we care about.

It turns out that adding constraints improves performance, though not by removing
points near the constraint (which were already gone); instead, it seems to push the
model towards exploiting information about the threshold faster. However, it seems to
take about twice as long to fit the models with a constraint. Interestingly, the number
of constraint points does not affect performance or runtime. Perhaps in a more difficult
problem, this would matter more.

In [8]:
# Rows whose constraint_factory_points_per_dim equals "0" (compared as a string).
is_selected = df["constraint_factory_points_per_dim"].to_numpy() == "0"

# Lift the row limit only while printing so that no row is elided.
with pd.option_context("display.max_rows", None):
    print(df.loc[is_selected, cols])

    seed  trial_id  strat_idx constraint_factory_points_per_dim  fit_time  \
20     3        12          1                                 0  0.189529   
21     3        15          1                                 0  0.198996   
22     3        18          1                                 0  0.268287   
23     3        21          1                                 0  0.272965   
24     3        24          1                                 0  0.201023   
25     3        27          1                                 0  0.299139   
26     3        30          1                                 0  0.186038   
27     3        33          1                                 0  0.295722   
28     3        36          1                                 0  0.216405   
29     3        39          1                                 0  0.211605   

    cum_fit_time  gen_time  cum_gen_time     brier  misclass_on_thresh_0.75  \
20      0.523002  0.274712      1.004995  0.024569                 0.3365

In [9]:
# Rows whose constraint_factory_points_per_dim equals "3" (compared as a string).
is_selected = df["constraint_factory_points_per_dim"].to_numpy() == "3"

# Lift the row limit only while printing so that no row is elided.
with pd.option_context("display.max_rows", None):
    print(df.loc[is_selected, cols])

   seed  trial_id  strat_idx constraint_factory_points_per_dim  fit_time  \
0     1        12          1                                 3  0.710992   
1     1        15          1                                 3  0.778970   
2     1        18          1                                 3  0.699226   
3     1        21          1                                 3  0.541484   
4     1        24          1                                 3  0.558986   
5     1        27          1                                 3  0.557222   
6     1        30          1                                 3  0.586382   
7     1        33          1                                 3  0.526090   
8     1        36          1                                 3  0.566061   
9     1        39          1                                 3  0.525740   

   cum_fit_time  gen_time  cum_gen_time     brier  misclass_on_thresh_0.75  \
0      2.107579  0.148588      1.214064  0.044257                 0.321367   
1      

In [10]:
# Rows whose constraint_factory_points_per_dim equals "5" (compared as a string).
is_selected = df["constraint_factory_points_per_dim"].to_numpy() == "5"

# Lift the row limit only while printing so that no row is elided.
with pd.option_context("display.max_rows", None):
    print(df.loc[is_selected, cols])

    seed  trial_id  strat_idx constraint_factory_points_per_dim  fit_time  \
10     2        12          1                                 5  1.029102   
11     2        15          1                                 5  0.590700   
12     2        18          1                                 5  0.577923   
13     2        21          1                                 5  0.572699   
14     2        24          1                                 5  0.636850   
15     2        27          1                                 5  0.636860   
16     2        30          1                                 5  0.623660   
17     2        33          1                                 5  0.543972   
18     2        36          1                                 5  0.620366   
19     2        39          1                                 5  0.692879   

    cum_fit_time  gen_time  cum_gen_time     brier  misclass_on_thresh_0.75  \
10      1.874613  0.212459      1.324383  0.098363                 0.4298

In [11]:
# Rows whose constraint_factory_points_per_dim equals "10" (compared as a string).
is_selected = df["constraint_factory_points_per_dim"].to_numpy() == "10"

# Lift the row limit only while printing so that no row is elided.
with pd.option_context("display.max_rows", None):
    print(df.loc[is_selected, cols])

    seed  trial_id  strat_idx constraint_factory_points_per_dim  fit_time  \
30     4        12          1                                10  0.751844   
31     4        15          1                                10  0.937644   
32     4        18          1                                10  0.654186   
33     4        21          1                                10  0.702457   
34     4        24          1                                10  0.665254   
35     4        27          1                                10  0.710754   
36     4        30          1                                10  0.696480   
37     4        33          1                                10  0.642701   
38     4        36          1                                10  0.668211   
39     4        39          1                                10  0.716791   

    cum_fit_time  gen_time  cum_gen_time     brier  misclass_on_thresh_0.75  \
30      2.355779  0.243305      1.461005  0.132360                 0.4401