WIP: threshold optimizer with relaxed fairness constraint fulfillment #1248

Draft · wants to merge 13 commits into base: main
11 changes: 10 additions & 1 deletion docs/refs.bib
@@ -425,4 +425,13 @@ @article{yeh2009comparisons
@misc{uscode2011title15chapter41subchapteriv,
title={United States Code 2011 Edition - Title 15 Commerce and Trade - Chapter 41 Consumer Credit Protection - Subchapter IV—Equal Credit Opportunity},
url={https://www.govinfo.gov/content/pkg/USCODE-2011-title15/html/USCODE-2011-title15-chap41-subchapIV.htm}
}
}

@misc{cruz2023unprocessing,
title={Unprocessing Seven Years of Algorithmic Fairness},
author={Andr\'{e} F. Cruz and Moritz Hardt},
year={2023},
eprint={2306.07261},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
191 changes: 191 additions & 0 deletions examples/plot_relaxed_equalized_odds.py
@@ -0,0 +1,191 @@
# Copyright (c) Fairlearn contributors.
# Licensed under the MIT License.

"""
==========================================
RelaxedThresholdOptimizer with Census Data
==========================================
"""

# %%
# Load and preprocess the data set
# --------------------------------
# We download the data set using the `fetch_adult` function from
# `fairlearn.datasets`. We start by importing the various modules we're going
# to use:
#

import numpy as np
import pandas as pd
from sklearn import metrics as skm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

from fairlearn.datasets import fetch_adult
from fairlearn.metrics import (
    MetricFrame,
    equalized_odds_difference,
    true_positive_rate,
    false_positive_rate,
    count,
    plot_model_comparison,
)

# %%
# We can now load and inspect the data by using the `fairlearn.datasets` module:

data = fetch_adult()
X_raw = data.data
Y = (data.target == ">50K") * 1
X_raw

# %%
# We are going to treat the sex of each individual as a sensitive feature
# (where 0 indicates female and 1 indicates male). In this particular case we
# separate this feature out into its own variable. We then perform some
# standard data preprocessing steps to convert the data into a format
# suitable for the ML algorithms:

A = X_raw["sex"]
X = pd.get_dummies(X_raw)

sc = StandardScaler()
X_scaled = sc.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

le = LabelEncoder()
Y = le.fit_transform(Y)  # Y is already encoded as 0/1, so this is effectively a no-op

# %%
# Finally, we split the data into training (60%), validation (20%), and test (20%) sets:
X_train, X_other, Y_train, Y_other, A_train, A_other = train_test_split(
    X_scaled, Y, A, test_size=0.4, random_state=0, stratify=Y,
)

# Split (X_other, Y_other, A_other) into validation and test
X_test, X_val, Y_test, Y_val, A_test, A_val = train_test_split(
    X_other, Y_other, A_other, test_size=0.5, random_state=0, stratify=Y_other,
)

# Work around an indexing bug: reset indices so rows align across splits
X_train = X_train.reset_index(drop=True)
A_train = A_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
A_test = A_test.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
A_val = A_val.reset_index(drop=True)

# %%
# Training a fairness-unaware predictor
# -------------------------------------
# To show the effect of Fairlearn we will first train a standard ML predictor
# that does not incorporate fairness. We use a
# :class:`sklearn.ensemble.GradientBoostingClassifier`:

unmitigated_predictor = GradientBoostingClassifier(n_estimators=500)

# %%time
unmitigated_predictor.fit(X_train, Y_train)

# %%
# Compute continuous scores and binarized predictions on the test set
y_test_pred_scores = unmitigated_predictor.predict_proba(X_test)[:, -1]
y_test_pred_binary = y_test_pred_scores >= 0.5  # thresholding at 0.5 matches `predict`

# %%
# We can start to assess the predictor's fairness using the `MetricFrame`:
metric_frame = MetricFrame(
    metrics={
        "accuracy": skm.accuracy_score,
        "true_positive_rate": true_positive_rate,
        "false_positive_rate": false_positive_rate,
        "count": count,
    },
    sensitive_features=A_test,
    y_true=Y_test,
    y_pred=y_test_pred_binary,
)
print(metric_frame.overall)
print(metric_frame.by_group)
metric_frame.by_group.plot.bar(
    subplots=True,
    layout=[4, 1],
    legend=False,
    figsize=[12, 8],
    title="Accuracy and error rates by group",
)


# %%
unmitigated_equalized_odds_diff = equalized_odds_difference(
    y_true=Y_test, y_pred=y_test_pred_binary, sensitive_features=A_test,
)

print(f"Equalized odds difference for unmitigated classifier: {unmitigated_equalized_odds_diff:.3}")

# %%
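# Postprocessing with a relaxed fairness constraint
# -------------------------------------------------
# ``_RelaxedThresholdOptimizer`` (currently a private API, hence the
# underscore import) chooses group-specific decision thresholds subject to an
# equalized odds constraint; the ``tolerance`` parameter controls how
# strictly the constraint must hold. With ``tolerance=0`` the constraint is
# enforced exactly.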
from fairlearn.postprocessing._cvxpy_threshold_optimizer import _RelaxedThresholdOptimizer

fair_clf = _RelaxedThresholdOptimizer(
    # predictor=unmitigated_predictor,  # TODO: use this when we no longer rely on callables
    # predict_method="predict_proba",
    predictor=lambda *args, **kwargs: unmitigated_predictor.predict(*args, **kwargs),
    predict_method="__call__",
    constraint="equalized_odds",
    tolerance=0,
)

# %%
fair_clf.fit(X_val, Y_val, sensitive_features=A_val)
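
# %%
# Note that the thresholds are fit on the validation split rather than the
# training data, so the postprocessing step does not reuse the examples the
# base predictor was trained on.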


# %%
y_test_pred_postprocessed = fair_clf.predict(X_test, sensitive_features=A_test)

# %%
postprocessed_equalized_odds_diff = equalized_odds_difference(
    y_true=Y_test, y_pred=y_test_pred_postprocessed, sensitive_features=A_test,
)

print(f"Equalized odds difference after postprocessing: {postprocessed_equalized_odds_diff:.3}")

# %%
# Add the unconstrained/unmitigated classifier predictions
all_model_predictions = {"unconstrained": y_test_pred_binary}


# Helper to get thresholdings for different tolerance values; note that these
# models are fit on the train split (hence the "train tolerance" labels below)
def compute_test_predictions_with_relaxed_constraints(tolerance: float) -> np.ndarray:
    # Instantiate
    clf = _RelaxedThresholdOptimizer(
        predictor=lambda *args, **kwargs: unmitigated_predictor.predict(*args, **kwargs),
        predict_method="__call__",
        constraint="equalized_odds",
        tolerance=tolerance,
        random_state=23,
    )

    # Fit
    clf.fit(X_train, Y_train, sensitive_features=A_train)

    return clf.predict(X_test, sensitive_features=A_test)


# Compute predictions at different levels of tolerance, from 0 up to the
# unmitigated equalized odds difference, in steps of 0.01
all_model_predictions.update({
    f"train tolerance={tol:.1}": compute_test_predictions_with_relaxed_constraints(tol)
    for tol in np.arange(0, unmitigated_equalized_odds_diff, 1e-2)
})

# %%
# Plot all models in the fairness-accuracy landscape
plot_model_comparison(
    x_axis_metric=skm.accuracy_score,
    y_axis_metric=equalized_odds_difference,
    y_true=Y_test,
    y_preds=all_model_predictions,
    sensitive_features=A_test,
    point_labels=True,
    show_plot=True,
)
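
# %%
# Each point in the plot is one model: accuracy on the x-axis (higher is
# better) and equalized odds difference on the y-axis (lower is fairer).
# Larger training tolerances are expected to trade constraint fulfillment
# for accuracy.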
4 changes: 4 additions & 0 deletions fairlearn/postprocessing/_constants.py
@@ -13,6 +13,10 @@
"Please make sure to install fairlearn[customplots] to use "
"the postprocessing plots."
)
_CVXPY_IMPORT_ERROR_MESSAGE = (
"Please make sure to install `cvxpy` to use postprocessing with relaxed "
"fairness constraint fulfillment."
)
BASE_ESTIMATOR_NONE_ERROR_MESSAGE = "The base estimator cannot be `None`."
BASE_ESTIMATOR_NOT_FITTED_WARNING = (
"The value of `prefit` is `True`, but `check_is_fitted` raised `NotFittedError` on"