From 5cec48007dd063919c2e6feefeefda1efd7e7454 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?In=C3=AAs=20Silva?=
Date: Mon, 22 Jan 2024 14:41:59 +0000
Subject: [PATCH 1/5] Fixed label flipping docstring and disparity calculation

---
 .../methods/preprocessing/label_flipping.py  | 59 +++++++++++--------
 1 file changed, 36 insertions(+), 23 deletions(-)

diff --git a/src/aequitas/flow/methods/preprocessing/label_flipping.py b/src/aequitas/flow/methods/preprocessing/label_flipping.py
index 88f21765..b5a94710 100644
--- a/src/aequitas/flow/methods/preprocessing/label_flipping.py
+++ b/src/aequitas/flow/methods/preprocessing/label_flipping.py
@@ -15,7 +15,7 @@ class LabelFlipping(PreProcessing):

     def __init__(
         self,
-        flip_rate: float = 0.1,
+        max_flip_rate: float = 0.1,
         disparity_target: Optional[float] = None,
         score_threshold: Optional[float] = None,
         bagging_max_samples: float = 0.5,
@@ -34,8 +34,17 @@ def __init__(

         Parameters
         ----------
-        flip_rate : float, optional
+        max_flip_rate : float, optional
             Maximum fraction of the training data to flip, by default 0.1
+        disparity_target : float, optional
+            The target disparity between the groups (difference between the
+            prevalence of a group and the group with the highest prevalence). By
+            default None, which means the method will attempt to equalize the
+            prevalence of the groups.
+        score_threshold : float, optional
+            The threshold above which the labels are flipped. By default None,
+            which means the method will flip the labels of the instances with
+            a score value higher than 0.
         bagging_max_samples : float, optional
             The number of samples to draw from X to train each base estimator of
             the bagging classifier (with replacement).
@@ -45,17 +54,17 @@ def __init__(
         bagging_n_estimators : int, optional
             The number of base estimators in the ensemble, by default 10.
         fair_ordering : bool, optional
-            Whether to take additional fairness criteria into account when flipping 
+            Whether to take additional fairness criteria into account when flipping
             labels, only modifying the labels that contribute to equalizing the
             prevalence of the groups. By default True.
         ordering_method : str, optional
-            The method used to calculate the margin of the base estimator. If 
-            "ensemble_margin", calculates the ensemble margins based on the binary 
-            predictions of the classifiers. If "residuals", oreders the missclafied 
-            instances based on the average residuals of the classifiers predictions. By 
+            The method used to calculate the margin of the base estimator. If
+            "ensemble_margin", calculates the ensemble margins based on the binary
+            predictions of the classifiers. If "residuals", oreders the missclafied
+            instances based on the average residuals of the classifiers predictions. By
             default "ensemble_margin".
         unawareness_features : list, optional
-            The sensitive attributes (or proxies) to ignore when fitting the ensemble 
+            The sensitive attributes (or proxies) to ignore when fitting the ensemble
             to enable fairness through unawareness.
         seed : int, optional
             The seed to use when fitting the ensemble.
@@ -67,17 +76,17 @@ def __init__(
         >>> from aequitas.preprocessing import LabelFlipping
         >>> from sklearn.tree import DecisionTreeClassifier
         >>> from sklearn.datasets import make_classification
-        >>> X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, 
+        >>> X, y = make_classification(n_samples=1000, n_features=10, n_informative=5,
            n_redundant=0, random_state=42)
-        >>> lf = LabelFlipping(bagging_base_estimator=DecisionTreeClassifier, 
-            flip_rate=0.1, max_depth=3)
+        >>> lf = LabelFlipping(bagging_base_estimator=DecisionTreeClassifier,
+            max_flip_rate=0.1, max_depth=3)
         >>> lf.fit(X, y)
         >>> X_transformed, y_transformed = lf.transform(X, y)
         """

         self.logger = create_logger("methods.preprocessing.LabelFlipping")
         self.logger.info("Instantiating a LabelFlipping preprocessing method.")
-        self.flip_rate = flip_rate
+        self.max_flip_rate = max_flip_rate

         if disparity_target is not None:
             if disparity_target < 0 or disparity_target > 1:
@@ -114,7 +123,7 @@ def __init__(
         self.bagging_base_estimator = bagging_base_estimator(**args)
         self.logger.info(
             f"Created base estimator {self.bagging_base_estimator} with params {args}, "
-            F"discarded args:{list(set(base_estimator_args.keys()) - set(args.keys()))}"
+            f"discarded args:{list(set(base_estimator_args.keys()) - set(args.keys()))}"
         )

         self.bagging_n_estimators = bagging_n_estimators
@@ -159,8 +168,8 @@ def fit(self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series]) -> None:
     def _score_instances(self, X: pd.DataFrame, y: pd.Series) -> pd.Series:
         """Scores the instances based on the predictions of the ensemble of classifiers.

-        If the ordering method is "ensemble_margin", the scores are the ensemble 
-        margins. If the ordering method is "residuals", the scores are the average 
+        If the ordering method is "ensemble_margin", the scores are the ensemble
+        margins. If the ordering method is "residuals", the scores are the average
         residuals of the classifiers predictions.

         Parameters
@@ -203,9 +212,13 @@ def _score_instances(self, X: pd.DataFrame, y: pd.Series) -> pd.Series:
         return scores

     def _calculate_prevalence_disparity(self, y: pd.Series, s: pd.Series):
-        prevalence = y.mean()
-        group_prevalence = y.groupby(s).mean().to_dict()
-        group_disparity = {k: v - prevalence for k, v in group_prevalence.items()}
+        prevalences = y.groupby(s).mean()
+        max_prevalence = max(prevalences)
+        group_prevalences = prevalences.to_dict()
+        group_disparity = {
+            k: (max_prevalence - v) / max_prevalence
+            for k, v in group_prevalences.items()
+        }

         return group_disparity

@@ -214,7 +227,7 @@ def _label_flipping(self, y: pd.Series, s: Optional[pd.Series], scores: pd.Serie
         If fair_ordering is True, only flips the labels of the instances that
         contribute to equalizing the prevalence of the groups.
-        Otherwise, the labels of the instances with the largest score values are 
+        Otherwise, the labels of the instances with the largest score values are
         flipped.

         Parameters
         ----------
@@ -236,7 +249,7 @@ def _label_flipping(self, y: pd.Series, s: Optional[pd.Series], scores: pd.Serie
                 ascending=(self.ordering_method == "ensemble_margin")
             ).index
         )
-        n_flip = int(self.flip_rate * len(y))
+        n_flip = int(self.max_flip_rate * len(y))

         if self.fair_ordering:
             disparity = self._calculate_prevalence_disparity(y_flipped, s)
@@ -251,8 +264,8 @@ def _label_flipping(self, y: pd.Series, s: Optional[pd.Series], scores: pd.Serie
                 if abs(scores.loc[i]) < self.score_threshold:
                     break

-                if (disparity[s.loc[i]] > self.disparity_target and y.loc[i] == 1) or (
-                    disparity[s.loc[i]] < self.disparity_target and y.loc[i] == 0
+                if (disparity[s.loc[i]] > self.disparity_target and y.loc[i] == 0) or (
+                    disparity[s.loc[i]] == 0 and y.loc[i] == 1
                 ):
                     y_flipped.loc[i] = 1 - y.loc[i]
                     disparity = self._calculate_prevalence_disparity(y_flipped, s)
@@ -295,7 +308,7 @@ def transform(

         if s is None and self.fair_ordering:
             raise ValueError(
-                "Sensitive Attribute `s` not passed. Must be passed if `fair_ordering` " 
+                "Sensitive Attribute `s` not passed. Must be passed if `fair_ordering` "
                 "is True."
             )


From a31868559bcae5f033cd8ac028f668f6bef697ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?In=C3=AAs=20Silva?=
Date: Tue, 23 Jan 2024 14:36:31 +0000
Subject: [PATCH 2/5] Fixed spelling mistakes

---
 src/aequitas/flow/methods/preprocessing/label_flipping.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/aequitas/flow/methods/preprocessing/label_flipping.py b/src/aequitas/flow/methods/preprocessing/label_flipping.py
index b5a94710..b81937c5 100644
--- a/src/aequitas/flow/methods/preprocessing/label_flipping.py
+++ b/src/aequitas/flow/methods/preprocessing/label_flipping.py
@@ -60,7 +60,7 @@ def __init__(
         ordering_method : str, optional
             The method used to calculate the margin of the base estimator. If
             "ensemble_margin", calculates the ensemble margins based on the binary
-            predictions of the classifiers. If "residuals", oreders the missclafied
+            predictions of the classifiers. If "residuals", orders the misclassified
             instances based on the average residuals of the classifiers predictions. By
             default "ensemble_margin".
         unawareness_features : list, optional
@@ -295,7 +295,9 @@ def transform(
         Parameters
         ----------
         X : pd.DataFrame
-            Feature[s.loc[i]]ector.
+            Feature matrix.
+        y : pd.Series
+            Label vector.
         s : pd.Series, optional
             Protected attribute vector.


From 981ca66da493f409d221d7655a915ab4ddb25b7a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?In=C3=AAs=20Silva?=
Date: Wed, 24 Jan 2024 11:48:25 +0000
Subject: [PATCH 3/5] Fixed max prevalence to mean prevalence

---
 .../flow/methods/preprocessing/label_flipping.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/aequitas/flow/methods/preprocessing/label_flipping.py b/src/aequitas/flow/methods/preprocessing/label_flipping.py
index b81937c5..20750b2c 100644
--- a/src/aequitas/flow/methods/preprocessing/label_flipping.py
+++ b/src/aequitas/flow/methods/preprocessing/label_flipping.py
@@ -37,10 +37,9 @@ def __init__(
         max_flip_rate : float, optional
             Maximum fraction of the training data to flip, by default 0.1
         disparity_target : float, optional
-            The target disparity between the groups (difference between the
-            prevalence of a group and the group with the highest prevalence). By
-            default None, which means the method will attempt to equalize the
-            prevalence of the groups.
+            The target disparity between the groups (difference between the prevalence
+            of a group and the mean prevalence). By default None, which means the
+            method will attempt to equalize the prevalence of the groups.
         score_threshold : float, optional
             The threshold above which the labels are flipped. By default None,
             which means the method will flip the labels of the instances with
@@ -213,10 +212,10 @@ def _score_instances(self, X: pd.DataFrame, y: pd.Series) -> pd.Series:

     def _calculate_prevalence_disparity(self, y: pd.Series, s: pd.Series):
         prevalences = y.groupby(s).mean()
-        max_prevalence = max(prevalences)
+        mean_prevalence = y.mean()
         group_prevalences = prevalences.to_dict()
         group_disparity = {
-            k: (max_prevalence - v) / max_prevalence
+            k: (v - mean_prevalence) / mean_prevalence
             for k, v in group_prevalences.items()
         }

@@ -264,8 +263,8 @@ def _label_flipping(self, y: pd.Series, s: Optional[pd.Series], scores: pd.Serie
                 if abs(scores.loc[i]) < self.score_threshold:
                     break

-                if (disparity[s.loc[i]] > self.disparity_target and y.loc[i] == 0) or (
-                    disparity[s.loc[i]] == 0 and y.loc[i] == 1
+                if (disparity[s.loc[i]] > self.disparity_target and y.loc[i] == 1) or (
+                    disparity[s.loc[i]] < -self.disparity_target and y.loc[i] == 0
                 ):
                     y_flipped.loc[i] = 1 - y.loc[i]
                     disparity = self._calculate_prevalence_disparity(y_flipped, s)

From 4d047268334c0f98635369f2a7aae694bebbf456 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?In=C3=AAs=20Silva?=
Date: Thu, 25 Jan 2024 15:06:57 +0000
Subject: [PATCH 4/5] Faster disparity fair ordering

---
 .../methods/preprocessing/label_flipping.py   | 30 +++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/src/aequitas/flow/methods/preprocessing/label_flipping.py b/src/aequitas/flow/methods/preprocessing/label_flipping.py
index 20750b2c..c0f75c1a 100644
--- a/src/aequitas/flow/methods/preprocessing/label_flipping.py
+++ b/src/aequitas/flow/methods/preprocessing/label_flipping.py
@@ -5,6 +5,7 @@
 import inspect

 import pandas as pd
+import math
 from typing import Optional, Tuple, Literal, Union, Callable
 import numpy as np
 from sklearn.ensemble import BaggingClassifier
@@ -16,7 +17,7 @@ class LabelFlipping(PreProcessing):
     def __init__(
         self,
         max_flip_rate: float = 0.1,
-        disparity_target: Optional[float] = None,
+        disparity_target: Optional[float] = 0.05,
         score_threshold: Optional[float] = None,
         bagging_max_samples: float = 0.5,
         bagging_base_estimator: Union[
@@ -221,6 +222,22 @@ def _calculate_prevalence_disparity(self, y: pd.Series, s: pd.Series):

         return group_disparity

+    def _calculate_group_flips(self, y: pd.Series, s: pd.Series):
+        prevalence = y.mean()
+        group_prevalences = y.groupby(s).mean()
+
+        min_prevalence = prevalence - self.disparity_target * prevalence
+        max_prevalence = prevalence + self.disparity_target * prevalence
+
+        group_flips = {
+            group: math.ceil(min_prevalence * len(y[s == group])) - y[s == group].sum()
+            if group_prevalences[group] < min_prevalence
+            else math.floor(max_prevalence * len(y[s == group])) - y[s == group].sum()
+            for group in group_prevalences.index
+        }
+
+        return group_flips
+
     def _label_flipping(self, y: pd.Series, s: Optional[pd.Series], scores: pd.Series):
         """Flips the labels of the desired fraction of the training data.

@@ -251,7 +268,7 @@ def _label_flipping(self, y: pd.Series, s: Optional[pd.Series], scores: pd.Serie
         n_flip = int(self.max_flip_rate * len(y))

         if self.fair_ordering:
-            disparity = self._calculate_prevalence_disparity(y_flipped, s)
+            group_flips = self._calculate_group_flips(y_flipped, s)
             flip_index = (
                 y_flipped.index
                 if self.ordering_method == "residuals"
@@ -263,12 +280,15 @@ def _label_flipping(self, y: pd.Series, s: Optional[pd.Series], scores: pd.Serie
                 if abs(scores.loc[i]) < self.score_threshold:
                     break

-                if (disparity[s.loc[i]] > self.disparity_target and y.loc[i] == 1) or (
-                    disparity[s.loc[i]] < -self.disparity_target and y.loc[i] == 0
+                if (group_flips[s.loc[i]] > 0 and y.loc[i] == 0) or (
+                    group_flips[s.loc[i]] < 0 and y.loc[i] == 1
                 ):
                     y_flipped.loc[i] = 1 - y.loc[i]
-                    disparity = self._calculate_prevalence_disparity(y_flipped, s)
                     flip_count += 1
+                    if group_flips[s.loc[i]] > 0:
+                        group_flips[s.loc[i]] -= 1
+                    else:
+                        group_flips[s.loc[i]] += 1

                 if flip_count == n_flip:
                     break

From 5e59d53ebb8eb3667d1727255e47cbcc5c425308 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?In=C3=AAs=20Silva?=
Date: Thu, 25 Jan 2024 16:17:55 +0000
Subject: [PATCH 5/5] Deleted old code

---
 .../flow/methods/preprocessing/label_flipping.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/src/aequitas/flow/methods/preprocessing/label_flipping.py b/src/aequitas/flow/methods/preprocessing/label_flipping.py
index c0f75c1a..0e339393 100644
--- a/src/aequitas/flow/methods/preprocessing/label_flipping.py
+++ b/src/aequitas/flow/methods/preprocessing/label_flipping.py
@@ -211,17 +211,6 @@ def _score_instances(self, X: pd.DataFrame, y: pd.Series) -> pd.Series:

         return scores

-    def _calculate_prevalence_disparity(self, y: pd.Series, s: pd.Series):
-        prevalences = y.groupby(s).mean()
-        mean_prevalence = y.mean()
-        group_prevalences = prevalences.to_dict()
-        group_disparity = {
-            k: (v - mean_prevalence) / mean_prevalence
-            for k, v in group_prevalences.items()
-        }
-
-        return group_disparity
-
     def _calculate_group_flips(self, y: pd.Series, s: pd.Series):
         prevalence = y.mean()
         group_prevalences = y.groupby(s).mean()
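
For reference, the per-group flip budget introduced in PATCH 4/5 (`_calculate_group_flips`) can be reproduced outside the class as a small standalone sketch. The function name `calculate_group_flips`, the toy data, and the `disparity_target` value below are illustrative assumptions, not aequitas API. A positive budget for a group means that many labels should be flipped from 0 to 1; a negative budget means flips from 1 to 0; `_label_flipping` then walks the margin-ordered instances and decrements the matching budget after each flip, stopping at `max_flip_rate * len(y)` flips.

# Standalone sketch of the group-flip budget from PATCH 4/5 (illustrative only;
# the function name and example data are assumptions, not aequitas API).
import math

import pandas as pd


def calculate_group_flips(y: pd.Series, s: pd.Series, disparity_target: float) -> dict:
    # Global prevalence and per-group prevalence of the positive label.
    prevalence = y.mean()
    group_prevalences = y.groupby(s).mean()

    # Allowed prevalence band around the global prevalence.
    min_prevalence = prevalence - disparity_target * prevalence
    max_prevalence = prevalence + disparity_target * prevalence

    # Positive budget: flip that many 0s to 1s; negative budget: flip 1s to 0s.
    return {
        group: math.ceil(min_prevalence * len(y[s == group])) - y[s == group].sum()
        if group_prevalences[group] < min_prevalence
        else math.floor(max_prevalence * len(y[s == group])) - y[s == group].sum()
        for group in group_prevalences.index
    }


if __name__ == "__main__":
    y = pd.Series([1, 1, 1, 0, 0, 0, 1, 0, 0, 0])
    s = pd.Series(["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"])
    # Group "a" sits above the band (prevalence 0.6), group "b" below it (0.2),
    # so the budgets come out as {"a": -1, "b": 1}.
    print(calculate_group_flips(y, s, disparity_target=0.05))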