/
massaging.py
109 lines (90 loc) · 3.75 KB
/
massaging.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from typing import Optional, Union, Callable
import pandas as pd
import math
from ...utils import create_logger
from ...utils.imports import instantiate_object
from .preprocessing import PreProcessing
class Massaging(PreProcessing):
    """Label-massaging preprocessing method.

    A ranker is fitted on the training data and its scores are used to choose
    which labels to flip: negative instances of groups whose prevalence is
    below the overall prevalence are promoted to positive, and positive
    instances of groups whose prevalence is above it are demoted to negative.
    """

    def __init__(
        self,
        classifier: Union[str, Callable] = "sklearn.naive_bayes.GaussianNB",
        **classifier_args,
    ):
        """
        Instantiates a Massaging preprocessing method.

        Flips selected labels to reduce disparity between groups.

        Parameters
        ----------
        classifier : Union[str, Callable], optional
            Ranker specification handed to ``instantiate_object``, by default
            "sklearn.naive_bayes.GaussianNB". The resulting object must
            provide ``fit`` and ``predict_proba``.
        **classifier_args
            Keyword arguments handed to ``instantiate_object`` together with
            ``classifier``.
        """
        self.logger = create_logger("methods.preprocessing.Massaging")
        self.logger.info("Instantiating a Massaging preprocessing method.")
        self.classifier = instantiate_object(classifier, **classifier_args)
        self.logger.info(f"Created base estimator {self.classifier}")
        # NOTE(review): presumably tells the surrounding pipeline that this
        # step is skipped at inference time -- confirm against PreProcessing.
        self.used_in_inference = False

    def _rank(
        self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series]
    ) -> tuple[pd.Index, pd.Index]:
        """Scores every instance and builds the ordered candidate lists.

        The classifier is fitted on the one-hot encoded concatenation of ``X``
        and ``s``; the score of an instance is column 1 of ``predict_proba``
        (presumably the probability of the positive class -- this assumes
        binary labels encoded as 0/1).

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix.
        y : pandas.Series
            Label vector.
        s : pandas.Series
            Protected attribute vector.

        Returns
        -------
        tuple[pandas.Index, pandas.Index]
            Promotion candidates (label 0, group prevalence below the overall
            prevalence) ordered by descending score, and demotion candidates
            (label 1, group prevalence above the overall prevalence) ordered
            by ascending score.
        """
        features = pd.get_dummies(pd.concat([X, s], axis=1))
        ranker = self.classifier.fit(features, y)
        scores = pd.Series(ranker.predict_proba(features)[:, 1], index=X.index)

        # The overall prevalence does not change between groups; compute once.
        overall_prevalence = y.mean()
        pr = []
        dem = []
        for g in s.unique():
            in_group = s == g
            prevalence = y[in_group].mean()
            if prevalence < overall_prevalence:
                pr += list(X.loc[in_group & (y == 0)].index)
            elif prevalence > overall_prevalence:
                dem += list(X.loc[in_group & (y == 1)].index)

        # Most confident negatives are promoted first, least confident
        # positives are demoted first.
        pr = scores.loc[pr].sort_values(ascending=False).index
        dem = scores.loc[dem].sort_values(ascending=True).index
        return pr, dem

    def fit(self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series]) -> None:
        """Fits a classifier to the data and orders the instances by the predictions.

        Promotion candidates are the instances with negative label in the
        groups with prevalence below the overall prevalence and demotion
        candidates are the instances with positive label in the groups with
        prevalence above it. The number of instances to be flipped is
        calculated to equalize the prevalences of the groups. When no group
        deviates from the overall prevalence, nothing is flipped.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix.
        y : pandas.Series
            Label vector.
        s : pandas.Series
            Protected attribute vector. Required, despite the ``Optional``
            annotation kept for interface compatibility.

        Raises
        ------
        ValueError
            If ``s`` is None.
        """
        self.logger.info("Fitting Massaging preprocessing method.")
        if s is None:
            raise ValueError(
                "Massaging requires the protected attribute vector `s` to be fitted."
            )

        self.pr, self.dem = self._rank(X, y, s)

        if len(self.pr) == 0 or len(self.dem) == 0:
            # Without candidates the group means below would be NaN and
            # math.ceil would raise; balanced data simply needs no flips.
            self.m = 0
            self.logger.info(
                "No prevalence disparity between groups; no labels will be flipped."
            )
        else:
            in_pr_groups = s.isin(s.loc[self.pr].unique())
            in_dem_groups = s.isin(s.loc[self.dem].unique())
            d_b = y.loc[in_pr_groups].mean()
            d_w = y.loc[in_dem_groups].mean()
            d = d_w - d_b
            self.m = math.ceil(
                (d * int(in_pr_groups.sum()) * int(in_dem_groups.sum()))
                / y.shape[0]
            )
        self.logger.info("Massaging preprocessing method fitted.")

    def transform(
        self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series] = None
    ) -> tuple[pd.DataFrame, pd.Series, pd.Series]:
        """Transforms the data by flipping the calculated number of labels of the
        top candidates in the promotion and the demotion groups.

        The candidates are the index labels stored by ``fit``, so ``y`` must
        be indexed by those same labels.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix.
        y : pd.Series
            Label vector.
        s : pd.Series, optional
            Protected attribute vector.

        Returns
        -------
        tuple[pd.DataFrame, pd.Series, pd.Series]
            The transformed input, X, y, and s. Only ``y`` is modified, on a
            copy; ``X`` and ``s`` are returned as given.
        """
        self.logger.info("Transforming data with Massaging preprocessing method.")
        y_corrected = y.copy()
        if self.m > 0:
            y_corrected.loc[self.pr[: self.m]] = 1
            y_corrected.loc[self.dem[: self.m]] = 0
        self.logger.info("Data transformed with Massaging preprocessing method.")
        return X, y_corrected, s