diff --git a/ax/analysis/summary.py b/ax/analysis/summary.py index 1de06ce245f..083733710f5 100644 --- a/ax/analysis/summary.py +++ b/ax/analysis/summary.py @@ -84,21 +84,44 @@ def compute( # (3) experiment data does not have has_step_column=True (data with a # progression doesn't support relativization due to time-series step # alignment complexities.) - # (4) no preference metric objectives -- preference metrics (e.g., - # pairwise_pref_query) have binary 0/1 labels with SQ mean near zero, - # causing relativization to crash with "mean_control too small." data = experiment.lookup_data(trial_indices=self.trial_indices) - has_preference_objective = experiment.optimization_config is not None and any( - is_preference_metric(n) - for n in experiment.optimization_config.objective.metric_names - ) should_relativize = ( len(experiment.metrics) > 0 and experiment.status_quo is not None and not data.has_step_column - and not has_preference_objective ) + # When relativizing, scope to non-preference metrics only. Preference + # metrics (e.g., pairwise_pref_query) have binary 0/1 labels with SQ + # mean near zero, causing relativization to crash with "mean_control + # too small." Non-preference tracking metrics are relativized normally. + non_preference_metric_names: list[str] | None = None + if should_relativize: + non_preference_metric_names = [ + name for name in experiment.metrics if not is_preference_metric(name) + ] + + df = experiment.to_df( + trial_indices=self.trial_indices, + omit_empty_columns=self.omit_empty_columns, + trial_statuses=self.trial_statuses, + relativize=should_relativize, + metric_names_to_relativize=non_preference_metric_names, + ) + + # Drop preference metric columns (e.g., pairwise_pref_query) -- their + # binary 0/1 values are not informative in a summary table. Then drop + # rows where all remaining metric columns are empty, which removes + # labeling-only trials that have no tracking metric data. + preference_cols = [col for col in df.columns if is_preference_metric(col)] + if preference_cols: + df = df.drop(columns=preference_cols) + remaining_metric_cols = [ + col for col in df.columns if col in experiment.metrics + ] + if remaining_metric_cols: + df = df.dropna(subset=remaining_metric_cols, how="all") + return self._create_analysis_card( title=( "Summary for " @@ -112,10 +135,5 @@ def compute( "Metric results are relativized against status quo." ) ), - df=experiment.to_df( - trial_indices=self.trial_indices, - omit_empty_columns=self.omit_empty_columns, - trial_statuses=self.trial_statuses, - relativize=should_relativize, - ), + df=df, ) diff --git a/ax/analysis/tests/test_summary.py b/ax/analysis/tests/test_summary.py index 338ab9183d1..301ef5284cb 100644 --- a/ax/analysis/tests/test_summary.py +++ b/ax/analysis/tests/test_summary.py @@ -10,9 +10,9 @@ from ax.analysis.summary import Summary from ax.api.client import Client from ax.api.configs import RangeParameterConfig -from ax.core.arm import Arm from ax.core.base_trial import TrialStatus from ax.core.data import Data +from ax.core.experiment import Experiment from ax.core.metric import Metric from ax.core.objective import MultiObjective, Objective from ax.core.optimization_config import MultiObjectiveOptimizationConfig @@ -224,35 +224,97 @@ def test_trial_status_filter(self) -> None: self.assertIn(0, card.df["trial_index"].values) self.assertIn(1, card.df["trial_index"].values) - def test_compute_with_preference_objective_skips_relativization(self) -> None: - """Summary should skip relativization when a preference metric is an - objective, since binary 0/1 labels have SQ mean near zero which causes - 'mean_control too small' errors.""" + def _attach_binary_pairwise_data( + self, experiment: Experiment, pairwise_name: str + ) -> None: + """Attach binary 0/1 pairwise data to every trial, with the status-quo + (control) arm set to 0. A near-zero control mean is exactly what makes + relativizing this metric crash with 'mean_control too small', so this + deterministically reproduces the crash unless the preference metric is + scoped out of relativization.""" + status_quo_name = none_throws(experiment.status_quo).name + for trial in experiment.trials.values(): + arm_names = [arm.name for arm in trial.arms] + # Control arm gets 0.0 so |mean_control| is below the relativization + # epsilon; all other arms get 1.0. + means = [0.0 if name == status_quo_name else 1.0 for name in arm_names] + experiment.attach_data( + Data( + df=pd.DataFrame( + { + "arm_name": arm_names, + "metric_name": [pairwise_name] * len(arm_names), + "mean": means, + "sem": [0.0] * len(arm_names), + "trial_index": [trial.index] * len(arm_names), + "metric_signature": [pairwise_name] * len(arm_names), + } + ) + ) + ) + + def test_compute_with_preference_objective_per_metric_relativization( + self, + ) -> None: + """Summary with a preference metric objective should relativize only + non-preference metrics. The preference metric (pairwise_pref_query) + has binary 0/1 labels with SQ mean near zero -- relativizing it would + crash with 'mean_control too small'. Non-preference metrics should + be relativized normally.""" pairwise_name = Keys.PAIRWISE_PREFERENCE_QUERY.value - # Use Client to set up experiment with SQ and data - client = self.client - client.configure_optimization(objective="foo") - experiment = client._experiment - experiment.status_quo = Arm(parameters={"x1": 0.5, "x2": 0.5}) + # Use an experiment with BatchTrials and SQ data, which triggers + # relativization in the Summary. get_branin_experiment_with_status_quo_trials + # creates BatchTrials with a SQ arm, so data.relativize() has SQ data. + experiment = get_branin_experiment_with_status_quo_trials() - # Add pairwise_pref_query as an additional objective + # Add pairwise_pref_query as an additional objective alongside branin. experiment.add_tracking_metric(Metric(name=pairwise_name)) experiment.optimization_config = MultiObjectiveOptimizationConfig( objective=MultiObjective( objectives=[ - Objective(metric=Metric(name="foo"), minimize=True), + Objective(metric=experiment.metrics["branin"], minimize=True), Objective(metric=Metric(name=pairwise_name), minimize=False), ] ) ) - client.get_next_trials(max_trials=1) - client.complete_trial(trial_index=0, raw_data={"foo": 1.0, pairwise_name: 0.0}) + self._attach_binary_pairwise_data(experiment, pairwise_name) - # Should succeed without "mean_control too small" error + # Should succeed without "mean_control too small" crash card = Summary().compute(experiment=experiment) - self.assertNotIn("relativized", card.subtitle) + + # Subtitle should indicate relativization (non-preference metrics) + self.assertIn("relativized", card.subtitle) + + # Preference metric column should be dropped from the summary + self.assertNotIn(pairwise_name, card.df.columns) + + def test_compute_with_preference_tracking_metric_and_no_optimization_config( + self, + ) -> None: + """A preference metric attached as a tracking metric (with a status quo + but no optimization_config) must still be scoped out of relativization. + Relativization is gated only on metrics/status_quo/step data, not on the + optimization_config, so without scoping the binary 0/1 preference metric + (SQ mean ~0) would crash with 'mean_control too small'.""" + pairwise_name = Keys.PAIRWISE_PREFERENCE_QUERY.value + + experiment = get_branin_experiment_with_status_quo_trials() + # Preference metric is tracking-only; there is no optimization_config. + experiment.add_tracking_metric(Metric(name=pairwise_name)) + experiment._optimization_config = None + + self._attach_binary_pairwise_data(experiment, pairwise_name) + + # Should succeed without "mean_control too small" crash even though + # there is no optimization_config. + card = Summary().compute(experiment=experiment) + + # branin is still relativized (non-preference metric). + self.assertIn("relativized", card.subtitle) + # Preference metric column should be dropped from the summary. + self.assertNotIn(pairwise_name, card.df.columns) def test_default_excludes_stale_trials(self) -> None: """Test that Summary defaults to excluding STALE trials.""" diff --git a/ax/analysis/tests/test_utils.py b/ax/analysis/tests/test_utils.py index 398c6ec9eeb..eeef3fde5a3 100644 --- a/ax/analysis/tests/test_utils.py +++ b/ax/analysis/tests/test_utils.py @@ -721,6 +721,59 @@ def test_relativize_df_with_sq_multiple_trials(self) -> None: decimal=1, ) + def test_relativize_df_with_sq_skips_metrics_missing_from_sq(self) -> None: + """When model predictions include metrics not in the status quo df + (e.g., pairwise_pref_query from a preference model), relativization + should skip those metrics and leave their columns untouched.""" + df = pd.DataFrame( + { + "trial_index": [0, 0], + "arm_name": ["status_quo", "arm1"], + "foo_mean": [10.0, 12.0], + "foo_sem": [1.0, 1.2], + "pairwise_pref_query_mean": [0.5, 0.8], + "pairwise_pref_query_sem": [0.1, 0.2], + } + ) + # Status quo df only has foo, not pairwise_pref_query + status_quo_df = pd.DataFrame( + { + "trial_index": [0], + "arm_name": ["status_quo"], + "foo_mean": [10.0], + "foo_sem": [1.0], + } + ) + + rel_df = _relativize_df_with_sq( + df=df, + status_quo_df=status_quo_df, + status_quo_name="status_quo", + ) + + with self.subTest("foo is relativized"): + np.testing.assert_almost_equal( + rel_df.loc[rel_df["arm_name"] == "arm1", "foo_mean"].iloc[0], + 0.2, + decimal=1, + ) + + with self.subTest("pairwise_pref_query is untouched"): + np.testing.assert_almost_equal( + rel_df.loc[ + rel_df["arm_name"] == "arm1", "pairwise_pref_query_mean" + ].iloc[0], + 0.8, + decimal=5, + ) + np.testing.assert_almost_equal( + rel_df.loc[ + rel_df["arm_name"] == "arm1", "pairwise_pref_query_sem" + ].iloc[0], + 0.2, + decimal=5, + ) + def test_is_preference_metric(self) -> None: self.assertTrue(is_preference_metric("pairwise_pref_query")) self.assertFalse(is_preference_metric("branin")) diff --git a/ax/analysis/utils.py b/ax/analysis/utils.py index f9e871da06b..79353958684 100644 --- a/ax/analysis/utils.py +++ b/ax/analysis/utils.py @@ -845,7 +845,14 @@ def _relativize_df_with_sq( and 'METRIC_NAME_sem' columns relativized to the status quo arm for each metric within each trial. """ - metric_names = [name[:-5] for name in df.columns if name.endswith("_mean")] + # Only relativize metrics present in both the data df and the status quo + # df. Model predictions may include metrics (e.g., pairwise_pref_query) + # that the status quo df doesn't have columns for. + metric_names = [ + name[:-5] + for name in df.columns + if name.endswith("_mean") and name in status_quo_df.columns + ] rel_df = df.copy() diff --git a/ax/core/data.py b/ax/core/data.py index 2939143ff79..dc9b84650de 100644 --- a/ax/core/data.py +++ b/ax/core/data.py @@ -11,7 +11,7 @@ import itertools import math from bisect import bisect_right -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from copy import deepcopy from functools import cached_property from io import StringIO @@ -439,6 +439,7 @@ def relativize( include_sq: bool = False, bias_correction: bool = True, control_as_constant: bool = False, + metric_names: Sequence[str] | None = None, ) -> Data: """Relativize a data object w.r.t. a status_quo arm. @@ -453,6 +454,9 @@ def relativize( ax.utils.stats.math_utils.relativize for more details. control_as_constant: If true, control is treated as a constant. bias_correction is ignored when this is true. + metric_names: If provided, only relativize these metrics. Other + metrics are passed through with raw values. If None, all + metrics are relativized. Returns: The new data object with the relativized metrics (excluding the @@ -471,6 +475,7 @@ def relativize( include_sq=include_sq, bias_correction=bias_correction, control_as_constant=control_as_constant, + metric_names=metric_names, ) return self.__class__(df=df_rel) @@ -698,6 +703,7 @@ def relativize_dataframe( include_sq: bool = False, bias_correction: bool = True, control_as_constant: bool = False, + metric_names: Sequence[str] | None = None, ) -> pd.DataFrame: """Relativize a dataframe w.r.t. a status_quo arm. @@ -712,6 +718,9 @@ def relativize_dataframe( ax.utils.stats.math_utils.relativize for more details. control_as_constant: If true, control is treated as a constant. bias_correction is ignored when this is true. + metric_names: If provided, only relativize these metrics. Other + metrics are passed through with raw values. If None, all + metrics are relativized. Returns: The new dataframe with the relativized metrics (excluding the @@ -721,11 +730,26 @@ def relativize_dataframe( grp_cols = list( {"trial_index", "metric_name", "random_split"}.intersection(df.columns.values) ) + metric_names_set = set(metric_names) if metric_names is not None else None grouped_df = df.groupby(grp_cols) dfs = [] for grp in grouped_df.groups.keys(): subgroup_df = grouped_df.get_group(grp) + + # If metric scoping is requested, skip relativization for excluded + # metrics and pass through raw data (with or without SQ row). + if metric_names_set is not None and "metric_name" in grp_cols: + grp_metric = ( + grp if isinstance(grp, str) else grp[grp_cols.index("metric_name")] + ) + if grp_metric not in metric_names_set: + if include_sq: + dfs.append(subgroup_df) + else: + dfs.append(subgroup_df[subgroup_df["arm_name"] != status_quo_name]) + continue + is_sq = subgroup_df["arm_name"] == status_quo_name # Check if status quo exists in this subgroup (trial) @@ -758,7 +782,13 @@ def relativize_dataframe( dfs.append(subgroup_df.assign(mean=means_rel, sem=sems_rel)) df_rel = pd.concat(dfs, axis=0) if include_sq: - df_rel.loc[df_rel["arm_name"] == status_quo_name, "sem"] = 0.0 + # Zero SEM only for metrics that were actually relativized. + # Non-relativized metrics (excluded via metric_names scoping) + # should retain their original SEM values. + sq_mask = df_rel["arm_name"] == status_quo_name + if metric_names_set is not None and "metric_name" in df_rel.columns: + sq_mask = sq_mask & df_rel["metric_name"].isin(metric_names_set) + df_rel.loc[sq_mask, "sem"] = 0.0 df_rel.reset_index(inplace=True, drop=True) # Reorder columns to match expected order (reuses Data class logic) df_rel = Data._get_df_with_cols_in_expected_order(df_rel) diff --git a/ax/core/experiment.py b/ax/core/experiment.py index d5c3fabd732..d73a9187488 100644 --- a/ax/core/experiment.py +++ b/ax/core/experiment.py @@ -2303,6 +2303,7 @@ def to_df( trial_statuses: Sequence[TrialStatus] | None = None, omit_empty_columns: bool = True, relativize: bool = False, + metric_names_to_relativize: Sequence[str] | None = None, ) -> pd.DataFrame: """ High-level summary of the Experiment with one row per arm. Any values missing at @@ -2329,6 +2330,9 @@ def to_df( * experiment has a status quo on all of its ``BatchTrial``-s * OR a status quo trial among its ``Trial``-s, , relativize metrics against the status quo. + metric_names_to_relativize: If provided alongside ``relativize=True``, + only relativize these specific metrics. Other metrics are passed + through with raw values. If None, all metrics are relativized. """ records = [] @@ -2347,6 +2351,7 @@ def to_df( status_quo_name=self.status_quo.name, as_percent=True, include_sq=True, + metric_names=metric_names_to_relativize, ).df else: data_df = data.df @@ -2410,9 +2415,14 @@ def to_df( df = df.loc[:, df.notnull().any()] # Format metric columns as percentages with 4 significant figures when - # relativized + # relativized. Only format metrics that were actually relativized. if relativize: - for metric_name in self.metrics.keys(): + metrics_to_format = ( + metric_names_to_relativize + if metric_names_to_relativize is not None + else list(self.metrics.keys()) + ) + for metric_name in metrics_to_format: if metric_name in df.columns: df[metric_name] = df[metric_name].apply( lambda x: ( diff --git a/ax/core/tests/test_data.py b/ax/core/tests/test_data.py index 2b432e7ae9b..df83f770ec6 100644 --- a/ax/core/tests/test_data.py +++ b/ax/core/tests/test_data.py @@ -837,3 +837,68 @@ def test_relativize_data_no_sem(self) -> None: self.assertEqual( expected_relativized_data_with_sq, actual_relativized_data_with_sq ) + + def test_relativize_with_metric_names_preserves_unscoped_sem(self) -> None: + """When metric_names scoping is used with include_sq=True, + non-relativized metrics should retain their original SEM values.""" + df = pd.DataFrame( + [ + { + "trial_index": 0, + "mean": 2.0, + "sem": 0.5, + "metric_name": "to_relativize", + "metric_signature": "to_relativize", + "arm_name": "status_quo", + }, + { + "trial_index": 0, + "mean": 4.0, + "sem": 1.0, + "metric_name": "to_relativize", + "metric_signature": "to_relativize", + "arm_name": "0_0", + }, + { + "trial_index": 0, + "mean": 3.0, + "sem": 0.8, + "metric_name": "not_relativized", + "metric_signature": "not_relativized", + "arm_name": "status_quo", + }, + { + "trial_index": 0, + "mean": 6.0, + "sem": 1.2, + "metric_name": "not_relativized", + "metric_signature": "not_relativized", + "arm_name": "0_0", + }, + ] + ) + data = Data(df=df) + result = data.relativize(include_sq=True, metric_names=["to_relativize"]) + result_df = result.df + + with self.subTest("relativized metric SQ SEM is zero"): + sq_rel = result_df[ + (result_df["arm_name"] == "status_quo") + & (result_df["metric_name"] == "to_relativize") + ] + self.assertEqual(sq_rel["sem"].iloc[0], 0.0) + + with self.subTest("non-relativized metric SQ SEM is preserved"): + sq_raw = result_df[ + (result_df["arm_name"] == "status_quo") + & (result_df["metric_name"] == "not_relativized") + ] + self.assertEqual(sq_raw["sem"].iloc[0], 0.8) + + with self.subTest("non-relativized metric values are raw"): + raw_arm = result_df[ + (result_df["arm_name"] == "0_0") + & (result_df["metric_name"] == "not_relativized") + ] + self.assertEqual(raw_arm["mean"].iloc[0], 6.0) + self.assertEqual(raw_arm["sem"].iloc[0], 1.2)