Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 32 additions & 14 deletions ax/analysis/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,21 +84,44 @@ def compute(
# (3) experiment data does not have has_step_column=True (data with a
# progression doesn't support relativization due to time-series step
# alignment complexities.)
# (4) no preference metric objectives -- preference metrics (e.g.,
# pairwise_pref_query) have binary 0/1 labels with SQ mean near zero,
# causing relativization to crash with "mean_control too small."
data = experiment.lookup_data(trial_indices=self.trial_indices)
has_preference_objective = experiment.optimization_config is not None and any(
is_preference_metric(n)
for n in experiment.optimization_config.objective.metric_names
)
should_relativize = (
len(experiment.metrics) > 0
and experiment.status_quo is not None
and not data.has_step_column
and not has_preference_objective
)

# When relativizing, scope to non-preference metrics only. Preference
# metrics (e.g., pairwise_pref_query) have binary 0/1 labels with SQ
# mean near zero, causing relativization to crash with "mean_control
# too small." Non-preference tracking metrics are relativized normally.
non_preference_metric_names: list[str] | None = None
if should_relativize:
non_preference_metric_names = [
name for name in experiment.metrics if not is_preference_metric(name)
]

df = experiment.to_df(
trial_indices=self.trial_indices,
omit_empty_columns=self.omit_empty_columns,
trial_statuses=self.trial_statuses,
relativize=should_relativize,
metric_names_to_relativize=non_preference_metric_names,
)

# Drop preference metric columns (e.g., pairwise_pref_query) -- their
# binary 0/1 values are not informative in a summary table. Then drop
# rows where all remaining metric columns are empty, which removes
# labeling-only trials that have no tracking metric data.
preference_cols = [col for col in df.columns if is_preference_metric(col)]
if preference_cols:
df = df.drop(columns=preference_cols)
remaining_metric_cols = [
col for col in df.columns if col in experiment.metrics
]
if remaining_metric_cols:
df = df.dropna(subset=remaining_metric_cols, how="all")

return self._create_analysis_card(
title=(
"Summary for "
Expand All @@ -112,10 +135,5 @@ def compute(
"Metric results are relativized against status quo."
)
),
df=experiment.to_df(
trial_indices=self.trial_indices,
omit_empty_columns=self.omit_empty_columns,
trial_statuses=self.trial_statuses,
relativize=should_relativize,
),
df=df,
)
94 changes: 78 additions & 16 deletions ax/analysis/tests/test_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
from ax.analysis.summary import Summary
from ax.api.client import Client
from ax.api.configs import RangeParameterConfig
from ax.core.arm import Arm
from ax.core.base_trial import TrialStatus
from ax.core.data import Data
from ax.core.experiment import Experiment
from ax.core.metric import Metric
from ax.core.objective import MultiObjective, Objective
from ax.core.optimization_config import MultiObjectiveOptimizationConfig
Expand Down Expand Up @@ -224,35 +224,97 @@ def test_trial_status_filter(self) -> None:
self.assertIn(0, card.df["trial_index"].values)
self.assertIn(1, card.df["trial_index"].values)

def test_compute_with_preference_objective_skips_relativization(self) -> None:
"""Summary should skip relativization when a preference metric is an
objective, since binary 0/1 labels have SQ mean near zero which causes
'mean_control too small' errors."""
def _attach_binary_pairwise_data(
self, experiment: Experiment, pairwise_name: str
) -> None:
"""Attach binary 0/1 pairwise data to every trial, with the status-quo
(control) arm set to 0. A near-zero control mean is exactly what makes
relativizing this metric crash with 'mean_control too small', so this
deterministically reproduces the crash unless the preference metric is
scoped out of relativization."""
status_quo_name = none_throws(experiment.status_quo).name
for trial in experiment.trials.values():
arm_names = [arm.name for arm in trial.arms]
# Control arm gets 0.0 so |mean_control| is below the relativization
# epsilon; all other arms get 1.0.
means = [0.0 if name == status_quo_name else 1.0 for name in arm_names]
experiment.attach_data(
Data(
df=pd.DataFrame(
{
"arm_name": arm_names,
"metric_name": [pairwise_name] * len(arm_names),
"mean": means,
"sem": [0.0] * len(arm_names),
"trial_index": [trial.index] * len(arm_names),
"metric_signature": [pairwise_name] * len(arm_names),
}
)
)
)

def test_compute_with_preference_objective_per_metric_relativization(
self,
) -> None:
"""Summary with a preference metric objective should relativize only
non-preference metrics. The preference metric (pairwise_pref_query)
has binary 0/1 labels with SQ mean near zero -- relativizing it would
crash with 'mean_control too small'. Non-preference metrics should
be relativized normally."""
pairwise_name = Keys.PAIRWISE_PREFERENCE_QUERY.value

# Use Client to set up experiment with SQ and data
client = self.client
client.configure_optimization(objective="foo")
experiment = client._experiment
experiment.status_quo = Arm(parameters={"x1": 0.5, "x2": 0.5})
# Use an experiment with BatchTrials and SQ data, which triggers
# relativization in the Summary. get_branin_experiment_with_status_quo_trials
# creates BatchTrials with a SQ arm, so data.relativize() has SQ data.
experiment = get_branin_experiment_with_status_quo_trials()

# Add pairwise_pref_query as an additional objective
# Add pairwise_pref_query as an additional objective alongside branin.
experiment.add_tracking_metric(Metric(name=pairwise_name))
experiment.optimization_config = MultiObjectiveOptimizationConfig(
objective=MultiObjective(
objectives=[
Objective(metric=Metric(name="foo"), minimize=True),
Objective(metric=experiment.metrics["branin"], minimize=True),
Objective(metric=Metric(name=pairwise_name), minimize=False),
]
)
)

client.get_next_trials(max_trials=1)
client.complete_trial(trial_index=0, raw_data={"foo": 1.0, pairwise_name: 0.0})
self._attach_binary_pairwise_data(experiment, pairwise_name)

# Should succeed without "mean_control too small" error
# Should succeed without "mean_control too small" crash
card = Summary().compute(experiment=experiment)
self.assertNotIn("relativized", card.subtitle)

# Subtitle should indicate relativization (non-preference metrics)
self.assertIn("relativized", card.subtitle)

# Preference metric column should be dropped from the summary
self.assertNotIn(pairwise_name, card.df.columns)

def test_compute_with_preference_tracking_metric_and_no_optimization_config(
self,
) -> None:
"""A preference metric attached as a tracking metric (with a status quo
but no optimization_config) must still be scoped out of relativization.
Relativization is gated only on metrics/status_quo/step data, not on the
optimization_config, so without scoping the binary 0/1 preference metric
(SQ mean ~0) would crash with 'mean_control too small'."""
pairwise_name = Keys.PAIRWISE_PREFERENCE_QUERY.value

experiment = get_branin_experiment_with_status_quo_trials()
# Preference metric is tracking-only; there is no optimization_config.
experiment.add_tracking_metric(Metric(name=pairwise_name))
experiment._optimization_config = None

self._attach_binary_pairwise_data(experiment, pairwise_name)

# Should succeed without "mean_control too small" crash even though
# there is no optimization_config.
card = Summary().compute(experiment=experiment)

# branin is still relativized (non-preference metric).
self.assertIn("relativized", card.subtitle)
# Preference metric column should be dropped from the summary.
self.assertNotIn(pairwise_name, card.df.columns)

def test_default_excludes_stale_trials(self) -> None:
"""Test that Summary defaults to excluding STALE trials."""
Expand Down
53 changes: 53 additions & 0 deletions ax/analysis/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,59 @@ def test_relativize_df_with_sq_multiple_trials(self) -> None:
decimal=1,
)

def test_relativize_df_with_sq_skips_metrics_missing_from_sq(self) -> None:
"""When model predictions include metrics not in the status quo df
(e.g., pairwise_pref_query from a preference model), relativization
should skip those metrics and leave their columns untouched."""
df = pd.DataFrame(
{
"trial_index": [0, 0],
"arm_name": ["status_quo", "arm1"],
"foo_mean": [10.0, 12.0],
"foo_sem": [1.0, 1.2],
"pairwise_pref_query_mean": [0.5, 0.8],
"pairwise_pref_query_sem": [0.1, 0.2],
}
)
# Status quo df only has foo, not pairwise_pref_query
status_quo_df = pd.DataFrame(
{
"trial_index": [0],
"arm_name": ["status_quo"],
"foo_mean": [10.0],
"foo_sem": [1.0],
}
)

rel_df = _relativize_df_with_sq(
df=df,
status_quo_df=status_quo_df,
status_quo_name="status_quo",
)

with self.subTest("foo is relativized"):
np.testing.assert_almost_equal(
rel_df.loc[rel_df["arm_name"] == "arm1", "foo_mean"].iloc[0],
0.2,
decimal=1,
)

with self.subTest("pairwise_pref_query is untouched"):
np.testing.assert_almost_equal(
rel_df.loc[
rel_df["arm_name"] == "arm1", "pairwise_pref_query_mean"
].iloc[0],
0.8,
decimal=5,
)
np.testing.assert_almost_equal(
rel_df.loc[
rel_df["arm_name"] == "arm1", "pairwise_pref_query_sem"
].iloc[0],
0.2,
decimal=5,
)

def test_is_preference_metric(self) -> None:
self.assertTrue(is_preference_metric("pairwise_pref_query"))
self.assertFalse(is_preference_metric("branin"))
Expand Down
9 changes: 8 additions & 1 deletion ax/analysis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -845,7 +845,14 @@ def _relativize_df_with_sq(
and 'METRIC_NAME_sem' columns relativized to the status quo arm for each metric
within each trial.
"""
metric_names = [name[:-5] for name in df.columns if name.endswith("_mean")]
# Only relativize metrics present in both the data df and the status quo
# df. Model predictions may include metrics (e.g., pairwise_pref_query)
# that the status quo df doesn't have columns for.
metric_names = [
name[:-5]
for name in df.columns
if name.endswith("_mean") and name in status_quo_df.columns
]

rel_df = df.copy()

Expand Down
34 changes: 32 additions & 2 deletions ax/core/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import itertools
import math
from bisect import bisect_right
from collections.abc import Iterable
from collections.abc import Iterable, Sequence
from copy import deepcopy
from functools import cached_property
from io import StringIO
Expand Down Expand Up @@ -439,6 +439,7 @@ def relativize(
include_sq: bool = False,
bias_correction: bool = True,
control_as_constant: bool = False,
metric_names: Sequence[str] | None = None,
) -> Data:
"""Relativize a data object w.r.t. a status_quo arm.

Expand All @@ -453,6 +454,9 @@ def relativize(
ax.utils.stats.math_utils.relativize for more details.
control_as_constant: If true, control is treated as a constant.
bias_correction is ignored when this is true.
metric_names: If provided, only relativize these metrics. Other
metrics are passed through with raw values. If None, all
metrics are relativized.

Returns:
The new data object with the relativized metrics (excluding the
Expand All @@ -471,6 +475,7 @@ def relativize(
include_sq=include_sq,
bias_correction=bias_correction,
control_as_constant=control_as_constant,
metric_names=metric_names,
)
return self.__class__(df=df_rel)

Expand Down Expand Up @@ -698,6 +703,7 @@ def relativize_dataframe(
include_sq: bool = False,
bias_correction: bool = True,
control_as_constant: bool = False,
metric_names: Sequence[str] | None = None,
) -> pd.DataFrame:
"""Relativize a dataframe w.r.t. a status_quo arm.

Expand All @@ -712,6 +718,9 @@ def relativize_dataframe(
ax.utils.stats.math_utils.relativize for more details.
control_as_constant: If true, control is treated as a constant.
bias_correction is ignored when this is true.
metric_names: If provided, only relativize these metrics. Other
metrics are passed through with raw values. If None, all
metrics are relativized.

Returns:
The new dataframe with the relativized metrics (excluding the
Expand All @@ -721,11 +730,26 @@ def relativize_dataframe(
grp_cols = list(
{"trial_index", "metric_name", "random_split"}.intersection(df.columns.values)
)
metric_names_set = set(metric_names) if metric_names is not None else None

grouped_df = df.groupby(grp_cols)
dfs = []
for grp in grouped_df.groups.keys():
subgroup_df = grouped_df.get_group(grp)

# If metric scoping is requested, skip relativization for excluded
# metrics and pass through raw data (with or without SQ row).
if metric_names_set is not None and "metric_name" in grp_cols:
grp_metric = (
grp if isinstance(grp, str) else grp[grp_cols.index("metric_name")]
)
if grp_metric not in metric_names_set:
if include_sq:
dfs.append(subgroup_df)
else:
dfs.append(subgroup_df[subgroup_df["arm_name"] != status_quo_name])
continue

is_sq = subgroup_df["arm_name"] == status_quo_name

# Check if status quo exists in this subgroup (trial)
Expand Down Expand Up @@ -758,7 +782,13 @@ def relativize_dataframe(
dfs.append(subgroup_df.assign(mean=means_rel, sem=sems_rel))
df_rel = pd.concat(dfs, axis=0)
if include_sq:
df_rel.loc[df_rel["arm_name"] == status_quo_name, "sem"] = 0.0
# Zero SEM only for metrics that were actually relativized.
# Non-relativized metrics (excluded via metric_names scoping)
# should retain their original SEM values.
sq_mask = df_rel["arm_name"] == status_quo_name
if metric_names_set is not None and "metric_name" in df_rel.columns:
sq_mask = sq_mask & df_rel["metric_name"].isin(metric_names_set)
df_rel.loc[sq_mask, "sem"] = 0.0
df_rel.reset_index(inplace=True, drop=True)
# Reorder columns to match expected order (reuses Data class logic)
df_rel = Data._get_df_with_cols_in_expected_order(df_rel)
Expand Down
Loading
Loading