From 5cec48007dd063919c2e6feefeefda1efd7e7454 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?In=C3=AAs=20Silva?=
Date: Mon, 22 Jan 2024 14:41:59 +0000
Subject: [PATCH 1/5] Fixed label flipping docstring and disparity calculation

---
 .../methods/preprocessing/label_flipping.py  | 59 +++++++++++--------
 1 file changed, 36 insertions(+), 23 deletions(-)

diff --git a/src/aequitas/flow/methods/preprocessing/label_flipping.py b/src/aequitas/flow/methods/preprocessing/label_flipping.py
index 88f21765..b5a94710 100644
--- a/src/aequitas/flow/methods/preprocessing/label_flipping.py
+++ b/src/aequitas/flow/methods/preprocessing/label_flipping.py
@@ -15,7 +15,7 @@ class LabelFlipping(PreProcessing):

     def __init__(
         self,
-        flip_rate: float = 0.1,
+        max_flip_rate: float = 0.1,
         disparity_target: Optional[float] = None,
         score_threshold: Optional[float] = None,
         bagging_max_samples: float = 0.5,
@@ -34,8 +34,17 @@ def __init__(

         Parameters
         ----------
-        flip_rate : float, optional
+        max_flip_rate : float, optional
             Maximum fraction of the training data to flip, by default 0.1
+        disparity_target : float, optional
+            The target disparity between the groups (difference between the
+            prevalence of a group and the group with the highest prevalence). By
+            default None, which means the method will attempt to equalize the
+            prevalence of the groups.
+        score_threshold : float, optional
+            The threshold above which the labels are flipped. By default None,
+            which means the method will flip the labels of the instances with
+            a score value higher than 0.
         bagging_max_samples : float, optional
             The number of samples to draw from X to train each base estimator of
             the bagging classifier (with replacement).
@@ -45,17 +54,17 @@ def __init__(
         bagging_n_estimators : int, optional
             The number of base estimators in the ensemble, by default 10.
         fair_ordering : bool, optional
-            Whether to take additional fairness criteria into account when flipping 
+            Whether to take additional fairness criteria into account when flipping
             labels, only modifying the labels that contribute to equalizing the
             prevalence of the groups. By default True.
         ordering_method : str, optional
-            The method used to calculate the margin of the base estimator. If 
-            "ensemble_margin", calculates the ensemble margins based on the binary 
-            predictions of the classifiers. If "residuals", oreders the missclafied 
-            instances based on the average residuals of the classifiers predictions. By 
+            The method used to calculate the margin of the base estimator. If
+            "ensemble_margin", calculates the ensemble margins based on the binary
+            predictions of the classifiers. If "residuals", oreders the missclafied
+            instances based on the average residuals of the classifiers predictions. By
             default "ensemble_margin".
         unawareness_features : list, optional
-            The sensitive attributes (or proxies) to ignore when fitting the ensemble 
+            The sensitive attributes (or proxies) to ignore when fitting the ensemble
             to enable fairness through unawareness.
         seed : int, optional
             The seed to use when fitting the ensemble.
@@ -67,17 +76,17 @@ def __init__(
         >>> from aequitas.preprocessing import LabelFlipping
         >>> from sklearn.tree import DecisionTreeClassifier
         >>> from sklearn.datasets import make_classification
-        >>> X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, 
+        >>> X, y = make_classification(n_samples=1000, n_features=10, n_informative=5,
            n_redundant=0, random_state=42)
-        >>> lf = LabelFlipping(bagging_base_estimator=DecisionTreeClassifier, 
-            flip_rate=0.1, max_depth=3)
+        >>> lf = LabelFlipping(bagging_base_estimator=DecisionTreeClassifier,
+            max_flip_rate=0.1, max_depth=3)
         >>> lf.fit(X, y)
         >>> X_transformed, y_transformed = lf.transform(X, y)
         """

         self.logger = create_logger("methods.preprocessing.LabelFlipping")
         self.logger.info("Instantiating a LabelFlipping preprocessing method.")
-        self.flip_rate = flip_rate
+        self.max_flip_rate = max_flip_rate

         if disparity_target is not None:
             if disparity_target < 0 or disparity_target > 1:
@@ -114,7 +123,7 @@ def __init__(
         self.bagging_base_estimator = bagging_base_estimator(**args)
         self.logger.info(
             f"Created base estimator {self.bagging_base_estimator} with params {args}, "
-            F"discarded args:{list(set(base_estimator_args.keys()) - set(args.keys()))}"
+            f"discarded args:{list(set(base_estimator_args.keys()) - set(args.keys()))}"
         )

         self.bagging_n_estimators = bagging_n_estimators
@@ -159,8 +168,8 @@ def fit(self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series]) -> None:
     def _score_instances(self, X: pd.DataFrame, y: pd.Series) -> pd.Series:
         """Scores the instances based on the predictions of the ensemble of classifiers.

-        If the ordering method is "ensemble_margin", the scores are the ensemble 
-        margins. If the ordering method is "residuals", the scores are the average 
+        If the ordering method is "ensemble_margin", the scores are the ensemble
+        margins. If the ordering method is "residuals", the scores are the average
         residuals of the classifiers predictions.

         Parameters
@@ -203,9 +212,13 @@ def _score_instances(self, X: pd.DataFrame, y: pd.Series) -> pd.Series:
         return scores

     def _calculate_prevalence_disparity(self, y: pd.Series, s: pd.Series):
-        prevalence = y.mean()
-        group_prevalence = y.groupby(s).mean().to_dict()
-        group_disparity = {k: v - prevalence for k, v in group_prevalence.items()}
+        prevalences = y.groupby(s).mean()
+        max_prevalence = max(prevalences)
+        group_prevalences = prevalences.to_dict()
+        group_disparity = {
+            k: (max_prevalence - v) / max_prevalence
+            for k, v in group_prevalences.items()
+        }

         return group_disparity

@@ -214,7 +227,7 @@ def _label_flipping(self, y: pd.Series, s: Optional[pd.Series], scores: pd.Serie
         If fair_ordering is True, only flips the labels of the instances that
         contribute to equalizing the prevalence of the groups.
-        Otherwise, the labels of the instances with the largest score values are 
+        Otherwise, the labels of the instances with the largest score values are
         flipped.

         Parameters
         ----------
@@ -236,7 +249,7 @@ def _label_flipping(self, y: pd.Series, s: Optional[pd.Series], scores: pd.Serie
                 ascending=(self.ordering_method == "ensemble_margin")
             ).index
         )
-        n_flip = int(self.flip_rate * len(y))
+        n_flip = int(self.max_flip_rate * len(y))

         if self.fair_ordering:
             disparity = self._calculate_prevalence_disparity(y_flipped, s)
@@ -251,8 +264,8 @@ def _label_flipping(self, y: pd.Series, s: Optional[pd.Series], scores: pd.Serie
                 if abs(scores.loc[i]) < self.score_threshold:
                     break

-                if (disparity[s.loc[i]] > self.disparity_target and y.loc[i] == 1) or (
-                    disparity[s.loc[i]] < self.disparity_target and y.loc[i] == 0
+                if (disparity[s.loc[i]] > self.disparity_target and y.loc[i] == 0) or (
+                    disparity[s.loc[i]] == 0 and y.loc[i] == 1
                 ):
                     y_flipped.loc[i] = 1 - y.loc[i]
                     disparity = self._calculate_prevalence_disparity(y_flipped, s)
@@ -295,7 +308,7 @@ def transform(

         if s is None and self.fair_ordering:
             raise ValueError(
-                "Sensitive Attribute `s` not passed. Must be passed if `fair_ordering` " 
+                "Sensitive Attribute `s` not passed. Must be passed if `fair_ordering` "
                 "is True."
             )


From a31868559bcae5f033cd8ac028f668f6bef697ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?In=C3=AAs=20Silva?=
Date: Tue, 23 Jan 2024 14:36:31 +0000
Subject: [PATCH 2/5] Fixed spelling mistakes

---
 src/aequitas/flow/methods/preprocessing/label_flipping.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/aequitas/flow/methods/preprocessing/label_flipping.py b/src/aequitas/flow/methods/preprocessing/label_flipping.py
index b5a94710..b81937c5 100644
--- a/src/aequitas/flow/methods/preprocessing/label_flipping.py
+++ b/src/aequitas/flow/methods/preprocessing/label_flipping.py
@@ -60,7 +60,7 @@ def __init__(
         ordering_method : str, optional
             The method used to calculate the margin of the base estimator. If
             "ensemble_margin", calculates the ensemble margins based on the binary
-            predictions of the classifiers. If "residuals", oreders the missclafied
+            predictions of the classifiers. If "residuals", orders the misclassified
             instances based on the average residuals of the classifiers predictions. By
             default "ensemble_margin".
         unawareness_features : list, optional
@@ -295,7 +295,9 @@ def transform(
         Parameters
         ----------
         X : pd.DataFrame
-            Feature[s.loc[i]]ector.
+            Feature matrix.
+        y : pd.Series
+            Label vector.
         s : pd.Series, optional
             Protected attribute vector.


From 981ca66da493f409d221d7655a915ab4ddb25b7a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?In=C3=AAs=20Silva?=
Date: Wed, 24 Jan 2024 11:48:25 +0000
Subject: [PATCH 3/5] Fixed max prevalence to mean prevalence

---
 .../flow/methods/preprocessing/label_flipping.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/aequitas/flow/methods/preprocessing/label_flipping.py b/src/aequitas/flow/methods/preprocessing/label_flipping.py
index b81937c5..20750b2c 100644
--- a/src/aequitas/flow/methods/preprocessing/label_flipping.py
+++ b/src/aequitas/flow/methods/preprocessing/label_flipping.py
@@ -37,10 +37,9 @@ def __init__(
         max_flip_rate : float, optional
             Maximum fraction of the training data to flip, by default 0.1
         disparity_target : float, optional
-            The target disparity between the groups (difference between the
-            prevalence of a group and the group with the highest prevalence). By
-            default None, which means the method will attempt to equalize the
-            prevalence of the groups.
+            The target disparity between the groups (difference between the prevalence
+            of a group and the mean prevalence). By default None, which means the
+            method will attempt to equalize the prevalence of the groups.
         score_threshold : float, optional
             The threshold above which the labels are flipped. By default None,
             which means the method will flip the labels of the instances with
@@ -213,10 +212,10 @@ def _score_instances(self, X: pd.DataFrame, y: pd.Series) -> pd.Series:

     def _calculate_prevalence_disparity(self, y: pd.Series, s: pd.Series):
         prevalences = y.groupby(s).mean()
-        max_prevalence = max(prevalences)
+        mean_prevalence = y.mean()
         group_prevalences = prevalences.to_dict()
         group_disparity = {
-            k: (max_prevalence - v) / max_prevalence
+            k: (v - mean_prevalence) / mean_prevalence
             for k, v in group_prevalences.items()
         }

@@ -264,8 +263,8 @@ def _label_flipping(self, y: pd.Series, s: Optional[pd.Series], scores: pd.Serie
                 if abs(scores.loc[i]) < self.score_threshold:
                     break

-                if (disparity[s.loc[i]] > self.disparity_target and y.loc[i] == 0) or (
-                    disparity[s.loc[i]] == 0 and y.loc[i] == 1
+                if (disparity[s.loc[i]] > self.disparity_target and y.loc[i] == 1) or (
+                    disparity[s.loc[i]] < -self.disparity_target and y.loc[i] == 0
                 ):
                     y_flipped.loc[i] = 1 - y.loc[i]
                     disparity = self._calculate_prevalence_disparity(y_flipped, s)

From 4d047268334c0f98635369f2a7aae694bebbf456 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?In=C3=AAs=20Silva?=
Date: Thu, 25 Jan 2024 15:06:57 +0000
Subject: [PATCH 4/5] Faster disparity fair ordering

---
 .../methods/preprocessing/label_flipping.py   | 30 +++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/src/aequitas/flow/methods/preprocessing/label_flipping.py b/src/aequitas/flow/methods/preprocessing/label_flipping.py
index 20750b2c..c0f75c1a 100644
--- a/src/aequitas/flow/methods/preprocessing/label_flipping.py
+++ b/src/aequitas/flow/methods/preprocessing/label_flipping.py
@@ -5,6 +5,7 @@
 import inspect

 import pandas as pd
+import math
 from typing import Optional, Tuple, Literal, Union, Callable
 import numpy as np
 from sklearn.ensemble import BaggingClassifier
@@ -16,7 +17,7 @@ class LabelFlipping(PreProcessing):
     def __init__(
         self,
         max_flip_rate: float = 0.1,
-        disparity_target: Optional[float] = None,
+        disparity_target: Optional[float] = 0.05,
         score_threshold: Optional[float] = None,
         bagging_max_samples: float = 0.5,
         bagging_base_estimator: Union[
@@ -221,6 +222,22 @@ def _calculate_prevalence_disparity(self, y: pd.Series, s: pd.Series):

         return group_disparity

+    def _calculate_group_flips(self, y: pd.Series, s: pd.Series):
+        prevalence = y.mean()
+        group_prevalences = y.groupby(s).mean()
+
+        min_prevalence = prevalence - self.disparity_target * prevalence
+        max_prevalence = prevalence + self.disparity_target * prevalence
+
+        group_flips = {
+            group: math.ceil(min_prevalence * len(y[s == group])) - y[s == group].sum()
+            if group_prevalences[group] < min_prevalence
+            else math.floor(max_prevalence * len(y[s == group])) - y[s == group].sum()
+            for group in group_prevalences.index
+        }
+
+        return group_flips
+
     def _label_flipping(self, y: pd.Series, s: Optional[pd.Series], scores: pd.Series):
         """Flips the labels of the desired fraction of the training data.

@@ -251,7 +268,7 @@ def _label_flipping(self, y: pd.Series, s: Optional[pd.Series], scores: pd.Serie
         n_flip = int(self.max_flip_rate * len(y))

         if self.fair_ordering:
-            disparity = self._calculate_prevalence_disparity(y_flipped, s)
+            group_flips = self._calculate_group_flips(y_flipped, s)
             flip_index = (
                 y_flipped.index
                 if self.ordering_method == "residuals"
@@ -263,12 +280,15 @@ def _label_flipping(self, y: pd.Series, s: Optional[pd.Series], scores: pd.Serie
                 if abs(scores.loc[i]) < self.score_threshold:
                     break

-                if (disparity[s.loc[i]] > self.disparity_target and y.loc[i] == 1) or (
-                    disparity[s.loc[i]] < -self.disparity_target and y.loc[i] == 0
+                if (group_flips[s.loc[i]] > 0 and y.loc[i] == 0) or (
+                    group_flips[s.loc[i]] < 0 and y.loc[i] == 1
                 ):
                     y_flipped.loc[i] = 1 - y.loc[i]
-                    disparity = self._calculate_prevalence_disparity(y_flipped, s)
                     flip_count += 1
+                    if group_flips[s.loc[i]] > 0:
+                        group_flips[s.loc[i]] -= 1
+                    else:
+                        group_flips[s.loc[i]] += 1

                 if flip_count == n_flip:
                     break

From 5e59d53ebb8eb3667d1727255e47cbcc5c425308 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?In=C3=AAs=20Silva?=
Date: Thu, 25 Jan 2024 16:17:55 +0000
Subject: [PATCH 5/5] Deleted old code

---
 .../flow/methods/preprocessing/label_flipping.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/src/aequitas/flow/methods/preprocessing/label_flipping.py b/src/aequitas/flow/methods/preprocessing/label_flipping.py
index c0f75c1a..0e339393 100644
--- a/src/aequitas/flow/methods/preprocessing/label_flipping.py
+++ b/src/aequitas/flow/methods/preprocessing/label_flipping.py
@@ -211,17 +211,6 @@ def _score_instances(self, X: pd.DataFrame, y: pd.Series) -> pd.Series:

         return scores

-    def _calculate_prevalence_disparity(self, y: pd.Series, s: pd.Series):
-        prevalences = y.groupby(s).mean()
-        mean_prevalence = y.mean()
-        group_prevalences = prevalences.to_dict()
-        group_disparity = {
-            k: (v - mean_prevalence) / mean_prevalence
-            for k, v in group_prevalences.items()
-        }
-
-        return group_disparity
-
     def _calculate_group_flips(self, y: pd.Series, s: pd.Series):
         prevalence = y.mean()
         group_prevalences = y.groupby(s).mean()
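
For reference, the per-group flip budget introduced in PATCH 4/5 (`_calculate_group_flips`) can be reproduced outside the class as a small standalone sketch. The function name `calculate_group_flips`, the toy data, and the `disparity_target` value below are illustrative assumptions, not aequitas API. A positive budget for a group means that many labels should be flipped from 0 to 1; a negative budget means flips from 1 to 0; `_label_flipping` then walks the margin-ordered instances and decrements the matching budget after each flip, stopping at `max_flip_rate * len(y)` flips.

# Standalone sketch of the group-flip budget from PATCH 4/5 (illustrative only;
# the function name and example data are assumptions, not aequitas API).
import math

import pandas as pd


def calculate_group_flips(y: pd.Series, s: pd.Series, disparity_target: float) -> dict:
    # Global prevalence and per-group prevalence of the positive label.
    prevalence = y.mean()
    group_prevalences = y.groupby(s).mean()

    # Allowed prevalence band around the global prevalence.
    min_prevalence = prevalence - disparity_target * prevalence
    max_prevalence = prevalence + disparity_target * prevalence

    # Positive budget: flip that many 0s to 1s; negative budget: flip 1s to 0s.
    return {
        group: math.ceil(min_prevalence * len(y[s == group])) - y[s == group].sum()
        if group_prevalences[group] < min_prevalence
        else math.floor(max_prevalence * len(y[s == group])) - y[s == group].sum()
        for group in group_prevalences.index
    }


if __name__ == "__main__":
    y = pd.Series([1, 1, 1, 0, 0, 0, 1, 0, 0, 0])
    s = pd.Series(["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"])
    # Group "a" sits above the band (prevalence 0.6), group "b" below it (0.2),
    # so the budgets come out as {"a": -1, "b": 1}.
    print(calculate_group_flips(y, s, disparity_target=0.05))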