ENH Add ranking metrics #974

Open
wants to merge 19 commits into base: main
Changes from 9 commits
3 changes: 3 additions & 0 deletions docs/user_guide/assessment.rst
@@ -277,6 +277,9 @@ Base metric :code:`group_min` :code:`group_m
:func:`.selection_rate` . . Y Y
:func:`.true_negative_rate` . . Y Y
:func:`.true_positive_rate` . . Y Y
:func:`.exposure` . . Y Y
:func:`.utility` . . Y Y
Member
utility seems a bit too generic. (I have a feeling @MiroDudik feels the same way...)
Perhaps ranking_utility?
To some extent I'm wondering if these should be grouped with the other metrics at all, or whether this deserves its own section with ranking metrics.

Contributor Author
Since exposure and utility only work for rankings, I also think it is better to call them ranking_exposure and ranking_utility.

Member
I second the proposed names.

Member (@MiroDudik), May 11, 2022
I'm fine with ranking_exposure. Another option would be dcg_exposure. This would allow us to introduce, for example, rbp_exposure in future, see Eq. (2) here:

My impression is that utility or even ranking_utility is not the best naming choice for the objective we are calculating and it is also not very standard, because in most contexts, utility is just a synonym for score so I would expect that it refers to things like dcg_score or ndcg_score. That is actually how they use the word utility even in the "Fairness of Exposure in Rankings" paper. So, I'd be in favor of using some variation of relevance. Maybe, average_relevance or mean_relevance?
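
A small sketch of the distinction being drawn here (it reuses the data from the example added later in this PR; `ndcg_score` is scikit-learn's ranking score, while treating utility as a plain per-group mean relevance follows this PR's description of the metric):

```python
import numpy as np
from sklearn.metrics import ndcg_score

relevance = np.array([0.82, 0.81, 0.80, 0.79, 0.78, 0.77])
ranking = np.array([1, 2, 3, 4, 5, 6])  # position of each item, 1 = top
sex = np.array(['Male', 'Male', 'Male', 'Female', 'Female', 'Female'])

# "utility" in the usual ranking-metrics sense: a score of the whole ranking
print(ndcg_score(relevance.reshape(1, -1), (-ranking).reshape(1, -1)))

# "utility" as used in this PR: the mean relevance of the items in each group
for group in np.unique(sex):
    print(group, relevance[sex == group].mean())
```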

:func:`.exposure_utility_ratio` . . Y Y
:func:`sklearn.metrics.accuracy_score` Y . Y Y
:func:`sklearn.metrics.balanced_accuracy_score` Y . . .
:func:`sklearn.metrics.f1_score` Y . . .
23 changes: 22 additions & 1 deletion docs/user_guide/fairness_in_machine_learning.rst
@@ -194,6 +194,24 @@ group loss primarily seeks to mitigate quality-of-service harms. Equalized
odds and equal opportunity can be used as a diagnostic for both allocation
harms as well as quality-of-service harms.
Contributor
We might want to move this paragraph further down and add the ranking metrics as representing allocation / quality-of-service harm.


*Ranking*:

Fairlearn includes two constraints for rankings, both based on exposure: a measure of the amount
of attention an item is expected to receive, given its position in the ranking. Exposure is
Contributor
Is this meant to be agnostic to the use case for ranking? How would the expectations for how much attention an instance might receive change in situations where rankings are bounded at particular intervals (e.g., a limited number of search results returned per page)?

computed as a logarithmic discount :math:`\frac{1}{\log_2(1+i)}` for each position :math:`i`, as
used in discounted cumulative gain (DCG).
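
A minimal sketch of this per-position computation (plain ``numpy``, for illustration only):

.. code-block:: python

    import numpy as np

    # exposure of ranking positions 1..6 under the DCG discount above
    positions = np.arange(1, 7)
    exposure_per_position = 1 / np.log2(1 + positions)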

* *Allocation harm*: We try to allocate the exposure that the items receive fairly across the
  groups. A ranking :math:`\tau` has a fair exposure allocation under a distribution over
  :math:`(X,A,Y)` if the exposure in :math:`\tau(X)` is statistically independent of the
  sensitive feature :math:`A`. [#6]_

* *Quality-of-service harm*: We try to keep the exposure that each item receives proportional to its
  "ground-truth" relevance. Otherwise, small differences in relevance can lead to large differences
Contributor
Are you able to say anything more about how this ground truth relevance is typically determined? i.e., is this something that data scientists would have access to a priori, or if not, is there guidance in the paper this is adapted from on how to determine this?

  in exposure. A ranking :math:`\tau` satisfies parity in quality-of-service under a distribution
  over :math:`(X,A,Y)` if the exposure in :math:`\tau(X)` is proportional to :math:`Y`,
  independently of the sensitive feature :math:`A`. [#6]_
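
Written out under the definitions above (a paraphrase rather than the exact notation of [#6]_),
with :math:`\mathrm{Exp}(a)` denoting the average exposure of the items in group :math:`a` and
:math:`U(a)` their average relevance, fair exposure allocation requires
:math:`\mathrm{Exp}(a) = \mathrm{Exp}(b)` for all groups :math:`a` and :math:`b`, while parity in
quality-of-service requires :math:`\mathrm{Exp}(a)/U(a) = \mathrm{Exp}(b)/U(b)`.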

Disparity metrics, group metrics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

@@ -420,4 +438,7 @@ the algorithm may impact the intended outcomes of a given model.
<https://arxiv.org/pdf/1912.05511.pdf>`_, FAccT, 2021.

.. [#5] Obermeyer, Powers, Vogeli, Mullainathan `"Dissecting racial bias in an algorithm used to manage the health of populations"
<https://science.sciencemag.org/content/366/6464/447>`_, Science, 2019.
<https://science.sciencemag.org/content/366/6464/447>`_, Science, 2019.

.. [#6] Singh, Joachims `"Fairness of Exposure in Rankings"
<https://dl.acm.org/doi/10.1145/3219819.3220088>`_, KDD, 2018.
117 changes: 117 additions & 0 deletions examples/plot_ranking.py
@@ -0,0 +1,117 @@
# Copyright (c) Fairlearn contributors.
Member
I'm a bit surprised we're adding an example, but not a user guide section. Sometimes the latter can borrow from the former through literalinclude, but in my mind the first step is always the user guide. That said, I don't think we're making that distinction particularly clear at the moment, which is probably something to discuss on the community call again (topic: "structure of documentation" with a particular focus on user guide vs. examples)

Member
This example looks quite okay to me, we can just make sure we have links to it from the right places in the user guide and the API guides.

# Licensed under the MIT License.

"""
=========================================
Ranking
=========================================
"""

from fairlearn.metrics import exposure, utility, exposure_utility_ratio
from fairlearn.metrics import MetricFrame

# %%
# This notebook shows how to use Fairlearn with rankings. We showcase the example "Fairly
# Allocating Economic Opportunity" from the paper "Fairness of Exposure in Rankings" by Singh and
# Joachims (2018).
# The example demonstrates how small differences in item relevance can lead to large differences
# in exposure.
#
# Consider a web-service that connects employers (users) to potential employees (items).
# The web-service uses a ranking-based system to present a set of 6 applicants, of which 3 are
# male and 3 are female. The male applicants have relevances of 0.82, 0.81 and 0.80 for the
# employer, while the female applicants have relevances of 0.79, 0.78 and 0.77.
# In this setting, a relevance of 0.77 means that 77% of all employers issuing the query found
# the applicant relevant.
#
# The Probability Ranking Principle suggests ranking the applicants in decreasing order of
# relevance. What does this mean for the exposure of the two groups?

ranking_pred = [1, 2, 3, 4, 5, 6] # ranking
sex = ['Male', 'Male', 'Male', 'Female', 'Female', 'Female']
y_true = [0.82, 0.81, 0.80, 0.79, 0.78, 0.77]

# %%
# Here we define which metrics we want to analyze.
#
# - The `exposure` metric shows the average exposure that each group receives, based on the
#   positions its items occupy. Exposure is the value assigned to every position in the ranking,
#   computed with the standard drop-off of :math:`1/\log_2(1+j)` for position :math:`j`, as used
#   in Discounted Cumulative Gain (DCG), to account for position bias. If there are large
#   differences in exposure we could say that there is allocation harm in the data, i.e. males
#   are on average ranked much higher than females by the web-service.
#
# - The `utility` metric shows the average relevance of each group.
#
# - The `exposure_utility_ratio` metric shows quality-of-service harm in the data, since it
#   compares the average exposure of each group to its average relevance. If there are large
#   differences in this metric we could say that the exposure of some sensitive groups is not
#   proportional to their utility.
#
# A short check of the per-position exposure values follows below.
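
# %%
# As a quick, illustrative check of the drop-off described above, we can compute the
# per-position exposure values by hand with plain numpy; the group-level `exposure` values
# reported by `MetricFrame` below should correspond to the averages of these values within
# each group.

import numpy as np

positions = np.arange(1, 7)  # ranking positions 1 through 6
print(1 / np.log2(1 + positions))  # exposure decreases as we move down the ranking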

metrics = {
    'exposure (allocation harm)': exposure,
    'average utility': utility,
    'exposure/utility (quality-of-service)': exposure_utility_ratio
}

mf = MetricFrame(metrics=metrics,
                 y_true=y_true,
                 y_pred=ranking_pred,
                 sensitive_features={'sex': sex})

# Customize the plot
mf.by_group.plot(
kind="bar",
subplots=True,
layout=[1, 3],
legend=False,
figsize=(12, 4)
)

# Show the between-group ratio of each metric: 1 indicates parity, values close to 0 indicate a large disparity.
mf.ratio()

# %%
# The first plot shows that men receive significantly more exposure than women in this ranking,
# even though the second plot shows that the utility of the female group is comparable to that
# of the male group.
# Therefor we can say that the ranking contains quality-of-service harm against women, since the
Contributor
Suggested change:
- # Therefor we can say that the ranking contains quality-of-service harm against women, since the
+ # Therefore, we can say that the ranking shows evidence of quality-of-service harm against women, since the

# exposure/utility ratio is not equal (plot 3)

# %%
# How can we fix this? A simple solution is to rerank the items, in such a way that females get
Contributor
Suggested change:
- # How can we fix this? A simple solution is to rerank the items, in such a way that females get
+ # How can we fix this? A simple solution is to rerank the items, in such a way that women get

# more exposure and males get less exposure. For example we can switch the top male with the top
# female applicant and remeasure the quality-of-service harm.
Contributor
Suggested change:
- # female applicant and remeasure the quality-of-service harm.
+ # woman and remeasure the quality-of-service harm.


ranking_pred = [1, 2, 3, 4, 5, 6] # ranking
sex = ['Female', 'Male', 'Male', 'Male', 'Female', 'Female']
y_true = [0.79, 0.81, 0.80, 0.82, 0.78, 0.77] # Continuous relevance score

# Sanity check: the ranking, sensitive feature and relevance lists must have equal lengths
print(len(ranking_pred), len(sex), len(y_true))

# Analyze metrics using MetricFrame
# Note that, in contrast to classification problems, y_pred now requires a ranking
metrics = {
    'exposure (allocation harm)': exposure,
    'average utility': utility,
    'exposure/utility (quality-of-service)': exposure_utility_ratio
}

mf = MetricFrame(metrics=metrics,
                 y_true=y_true,
                 y_pred=ranking_pred,
                 sensitive_features={'sex': sex})

# Customize the plot
mf.by_group.plot(
kind="bar",
subplots=True,
layout=[1, 3],
legend=False,
figsize=(12, 4)
)

# Show the between-group ratio of each metric: 1 indicates parity, values close to 0 indicate a large disparity.
mf.ratio()

# %%
# The new plots show that the exposure and the exposure/utility ratio are now much more equal
# between the two groups.
25 changes: 23 additions & 2 deletions fairlearn/metrics/__init__.py
@@ -48,6 +48,16 @@
    _mean_underprediction,
    count)

from ._exposure import ( # noqa: F401
    exposure,
    utility,
    exposure_utility_ratio,
    allocation_harm_in_ranking_difference,
Contributor
Perhaps we can come up with a shorter name for these! How about simply exposure_difference/exposure_ratio for exposure? I'm not sure yet what to call exposure_utility though...

Contributor Author
I also don't like the names yet. exposure_utility_ratio_ratio? 😂

Contributor
@fairlearn/fairlearn-maintainers do you have any ideas for naming these?

Member
The doc says "Calculate the difference in exposure allocation"; based on that, the name for me would be exposure_allocation_difference.

    allocation_harm_in_ranking_ratio,
    quality_of_service_harm_in_ranking_difference,
    quality_of_service_harm_in_ranking_ratio
)


# Add the generated metrics of the form
# `<metric>_{difference,ratio,group_min,group_max}`
@@ -67,7 +77,11 @@
"demographic_parity_difference",
"demographic_parity_ratio",
"equalized_odds_difference",
"equalized_odds_ratio"
"equalized_odds_ratio",
"allocation_harm_in_ranking_difference",
"allocation_harm_in_ranking_ratio",
"quality_of_service_harm_in_ranking_difference",
"quality_of_service_harm_in_ranking_ratio"
]

_extra_metrics = [
@@ -80,4 +94,11 @@
"count"
]

__all__ = _core + _disparities + _extra_metrics + list(sorted(_generated_metric_dict.keys()))
_ranking_metrics = [
"exposure",
"utility",
"exposure_utility_ratio"
]

__all__ = _core + _disparities + _extra_metrics + _ranking_metrics \
    + list(sorted(_generated_metric_dict.keys()))