ENH added agg argument to equalized odds difference and ratio to support "average odds" #960

Open
wants to merge 7 commits into base: main
42 changes: 34 additions & 8 deletions fairlearn/metrics/_fairness_metrics.py
@@ -100,7 +100,8 @@


def equalized_odds_difference(
y_true, y_pred, *, sensitive_features, method="between_groups",
sample_weight=None, agg="worst_case"
) -> float:
"""Calculate the equalized odds difference.

@@ -123,27 +124,40 @@
Predicted labels :math:`h(X)` returned by the classifier.

sensitive_features :
The sensitive features over which equalized odds should be assessed

method : str
How to compute the differences.
See :func:`fairlearn.metrics.MetricFrame.difference` for details.

sample_weight : array-like
The sample weights

agg : str
The aggregation method. One of `"worst_case"` or `"mean"`.
If `"worst_case"`, the greater of the false positive rate
difference and the true positive rate difference is returned.
If `"mean"`, the mean of the two differences is returned.

Returns
-------
float
The equalized odds difference
"""
if agg not in ["worst_case", "mean"]:
    raise ValueError(f"agg must be one of 'worst_case' or 'mean', got {agg}")

eo = _get_eo_frame(y_true, y_pred, sensitive_features, sample_weight)

if agg == "worst_case":
    return max(eo.difference(method=method))
else:
    return eo.difference(method=method).mean()


def equalized_odds_ratio(
y_true, y_pred, *, sensitive_features, method="between_groups",
sample_weight=None, agg="worst_case"
) -> float:
"""Calculate the equalized odds ratio.

@@ -166,7 +180,7 @@
Predicted labels :math:`h(X)` returned by the classifier.

sensitive_features :
The sensitive features over which equalized odds should be assessed

method : str
How to compute the differences. See :func:`fairlearn.metrics.MetricFrame.ratio`
@@ -175,14 +189,26 @@
sample_weight : array-like
The sample weights

agg : str
The aggregation method. One of `"worst_case"` or `"mean"`.
If `"worst_case"`, the smaller of the false positive rate ratio
and the true positive rate ratio is returned.
If `"mean"`, the mean of the two ratios is returned.

Returns
-------
float
The equalized odds ratio
"""
if agg not in ["worst_case", "mean"]:
    raise ValueError(f"agg must be one of 'worst_case' or 'mean', got {agg}")

eo = _get_eo_frame(y_true, y_pred, sensitive_features, sample_weight)

if agg == "worst_case":
    return min(eo.ratio(method=method))
else:
    return eo.ratio(method=method).mean()


def _get_eo_frame(y_true, y_pred, sensitive_features, sample_weight) -> MetricFrame:
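For context, a minimal usage sketch of the new `agg` argument. The toy labels and group assignments below are invented for illustration, and `agg` is only available on this PR's branch, not in released fairlearn.

# Minimal usage sketch of the new `agg` argument; toy data is made up for illustration.
from fairlearn.metrics import equalized_odds_difference

y_true = [0, 1, 1, 0, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 1, 1, 0]
group = ["a", "a", "a", "a", "b", "b", "b", "b"]

# Default aggregation: the larger of the TPR and FPR differences between groups.
worst = equalized_odds_difference(y_true, y_pred, sensitive_features=group)

# New option from this PR: the mean of the two differences ("average odds difference").
avg = equalized_odds_difference(
    y_true, y_pred, sensitive_features=group, agg="mean"
)
print(worst, avg)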
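And a sketch of the hand-computed counterpart for the ratio, using MetricFrame directly; this mirrors what the updated tests assert. The toy data is again invented, and the mapping of `agg` values to min/mean is taken from this PR's changes.

# Hand-computed counterpart of equalized_odds_ratio(..., agg="worst_case"/"mean").
from fairlearn.metrics import MetricFrame, false_positive_rate, true_positive_rate

y_true = [0, 1, 1, 0, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 1, 1, 0]
group = ["a", "a", "a", "a", "b", "b", "b", "b"]

frame = MetricFrame(
    metrics={"tpr": true_positive_rate, "fpr": false_positive_rate},
    y_true=y_true,
    y_pred=y_pred,
    sensitive_features=group,
)

ratios = frame.ratio(method="between_groups")
worst_case = ratios.min()     # what agg="worst_case" returns
average_odds = ratios.mean()  # what agg="mean" returns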
48 changes: 32 additions & 16 deletions test/unit/metrics/test_fairness_metrics.py
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft Corporation and Fairlearn contributors.
# Licensed under the MIT License.

from itertools import product
import pytest

from fairlearn.metrics import (
@@ -17,6 +18,9 @@
from .data_for_test import g_1, s_w, y_p, y_t

_aggregate_methods = ["between_groups", "to_overall"]
_agg_options = ["worst_case", "mean"]
# cartesian product of the two lists
_agg_combinations = list(product(_aggregate_methods, _agg_options))


@pytest.mark.parametrize("agg_method", _aggregate_methods)
@@ -79,23 +83,26 @@ def test_demographic_parity_ratio_weighted(agg_method):
assert actual == gm.ratio(method=agg_method)


@pytest.mark.parametrize("agg_method", _aggregate_methods)
def test_equalized_odds_difference(agg_method):
@pytest.mark.parametrize("agg_method, agg", _agg_combinations)
def test_equalized_odds_difference(agg_method, agg):
actual = equalized_odds_difference(
y_t, y_p, sensitive_features=g_1, method=agg_method, agg=agg
)

metrics = {"tpr": true_positive_rate, "fpr": false_positive_rate}
gm = MetricFrame(metrics=metrics, y_true=y_t, y_pred=y_p, sensitive_features=g_1)

diffs = gm.difference(method=agg_method)
if agg == "worst_case":
    assert actual == diffs.max()
else:
    assert actual == diffs.mean()

Review thread on the `else:` branch:

Contributor: Nitpick: for future maintainability it might be nicer to replace else by an explicit if agg == "mean".

Member: I'm testing the options above, so if it's anything else we can't get here. Are you suggesting I don't check at the beginning but only here at the end? I guess I preferred the early return/raise pattern over doing all the calculations just to let users know that we won't in fact allow other inputs for agg.

Contributor (@hildeweerts, Oct 17, 2023): Fair enough. I always like it when tests are very explicit so I don't have to scroll up to see if it makes sense, and it's a bit less error prone in case we ever decide to add another agg option. But it's truly a nitpick :)

Edit: I see now that I accidentally requested changes instead of approve, which makes it seem like I feel much more strongly about this than I do lol.

Member: I want to wait for @MiroDudik's thoughts as well since you both had thoughts on the initial PR and perhaps he feels strongly either way 😎


@pytest.mark.parametrize("agg_method", _aggregate_methods)
def test_equalized_odds_difference_weighted(agg_method):
@pytest.mark.parametrize("agg_method, agg", _agg_combinations)
def test_equalized_odds_difference_weighted(agg_method, agg):
actual = equalized_odds_difference(
y_t, y_p, sensitive_features=g_1, method=agg_method, sample_weight=s_w, agg=agg
)

metrics = {"tpr": true_positive_rate, "fpr": false_positive_rate}
@@ -110,24 +117,30 @@ def test_equalized_odds_difference_weighted(agg_method):
)

diffs = gm.difference(method=agg_method)
if agg == "worst_case":
    assert actual == diffs.max()
else:
    assert actual == diffs.mean()


@pytest.mark.parametrize("agg_method", _aggregate_methods)
def test_equalized_odds_ratio(agg_method):
actual = equalized_odds_ratio(y_t, y_p, method=agg_method, sensitive_features=g_1)
@pytest.mark.parametrize("agg_method, agg", _agg_combinations)
def test_equalized_odds_ratio(agg_method, agg):
actual = equalized_odds_ratio(y_t, y_p, method=agg_method, sensitive_features=g_1, agg=agg)

metrics = {"tpr": true_positive_rate, "fpr": false_positive_rate}
gm = MetricFrame(metrics=metrics, y_true=y_t, y_pred=y_p, sensitive_features=g_1)

ratios = gm.ratio(method=agg_method)
if agg == "worst_case":
    assert actual == ratios.min()
else:
    assert actual == ratios.mean()


@pytest.mark.parametrize("agg_method", _aggregate_methods)
def test_equalized_odds_ratio_weighted(agg_method):
@pytest.mark.parametrize("agg_method, agg", _agg_combinations)
def test_equalized_odds_ratio_weighted(agg_method, agg):
actual = equalized_odds_ratio(
y_t, y_p, method=agg_method, sensitive_features=g_1, sample_weight=s_w, agg=agg
)

metrics = {"tpr": true_positive_rate, "fpr": false_positive_rate}
@@ -142,4 +155,7 @@ def test_equalized_odds_ratio_weighted(agg_method):
)

ratios = gm.ratio(method=agg_method)
if agg == "worst_case":
    assert actual == ratios.min()
else:
    assert actual == ratios.mean()