This repository has been archived by the owner on Dec 18, 2023. It is now read-only.
/
base_data.py
224 lines (192 loc) · 7.42 KB
/
base_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
"""Abstract class for the data artifacts used by `Lens`"""
# Data is a lightweight wrapper that stores data
import itertools
from abc import ABC, abstractmethod
from typing import Optional, Union
import pandas as pd
from credoai.utils.common import ValidationError
from credoai.utils.model_utils import type_of_target
from copy import deepcopy
class Data(ABC):
    """Class wrapper around data-to-be-assessed

    Data is passed to Lens for certain assessments.
    Data serves as an adapter between datasets
    and the evaluators in Lens.

    Parameters
    -------------
    type : str
        Type of the dataset
    name : str
        Label of the dataset
    X : to-be-defined by children
        Dataset
    y : to-be-defined by children
        Outcome
    sensitive_features : pd.Series, pd.DataFrame, optional
        Sensitive Features, which will be used for disaggregating performance
        metrics. This can be the columns you want to perform segmentation analysis on, or
        a feature related to fairness like 'race' or 'gender'
    sensitive_intersections : bool, list
        Whether to add intersections of sensitive features. If True, add all possible
        intersections. If list, only create intersections from specified sensitive features.
        If False, no intersections will be created. Defaults False
    """

    def __init__(
        self,
        # NOTE(review): `type` shadows the builtin and is not stored on self in
        # this class; presumably consumed by subclasses — confirm before renaming.
        type: str,
        name: str,
        X=None,
        y=None,
        sensitive_features=None,
        sensitive_intersections: Union[bool, list] = False,
    ):
        if isinstance(name, str):
            self.name = name
        else:
            # BUG FIX: the original message was the literal "{Name} must be a
            # string" — a brace placeholder that was never interpolated.
            # `name.__class__` is used instead of `type(name)` because the
            # builtin `type` is shadowed by the parameter above.
            raise ValidationError(
                f"name must be a string, got {name.__class__.__name__}"
            )
        self.X = X
        self.y = y
        self.sensitive_features = sensitive_features
        # Validate raw inputs, normalize them, then validate the processed form.
        self._validate_inputs()
        self._process_inputs(sensitive_intersections)
        self._validate_processing()
        self._active_sensitive_feature: Optional[str] = None

    @property
    def active_sens_feat(self):
        """
        Defines which sensitive feature an evaluator will be working on.

        In combination with the property sensitive_feature this effectively creates
        a view of a specific artifact. Defaults lazily to the first sensitive
        feature column.
        """
        if self._active_sensitive_feature is None:
            self._active_sensitive_feature = self.sensitive_features.columns[0]
        return self._active_sensitive_feature

    @active_sens_feat.setter
    def active_sens_feat(self, value: str):
        """
        Sets the active_sens_feat value.

        Parameters
        ----------
        value : str
            Name of the sensitive feature column an evaluator has to operate on.
        """
        self._active_sensitive_feature = value

    @property
    def sensitive_feature(self):
        """
        Reveals the sensitive feature defined by active_sens_feat.

        This is generally called from within an evaluator, when it is working
        on a single sensitive feature.
        """
        return self.sensitive_features[self.active_sens_feat]

    @property
    def y_type(self):
        """Target type of ``y`` as inferred by `type_of_target`."""
        return type_of_target(self.y)

    @property
    def data(self):
        """Dict view of the wrapped artifacts: X, y and sensitive_features."""
        data = {"X": self.X, "y": self.y, "sensitive_features": self.sensitive_features}
        return data

    def _process_inputs(self, sensitive_intersections):
        """Normalize X, y and sensitive features; only non-None inputs are touched."""
        if self.X is not None:
            self.X = self._process_X(self.X)
        if self.y is not None:
            self.y = self._process_y(self.y)
        if self.sensitive_features is not None:
            # deepcopy so processing never mutates the user's original object
            self.sensitive_features = self._process_sensitive(
                deepcopy(self.sensitive_features), sensitive_intersections
            )

    def _process_sensitive(self, sensitive_features, sensitive_intersections):
        """
        Formats sensitive features

        Parameters
        ----------
        sensitive_features :
            Sensitive features as provided by a user. Any format that can be constrained
            in a dataframe is accepted.
        sensitive_intersections : bool, list
            Indicates whether to create intersections among sensitive features.

        Returns
        -------
        pd.DataFrame
            Sensitive features as a dataframe, with one additional column per
            requested intersection (labelled by joining the member names with "_").
        """
        df = pd.DataFrame(sensitive_features)
        # An unnamed Series produces a single integer column label; replace it
        # with a placeholder so downstream code can rely on string labels.
        if len(df.columns) == 1 and isinstance(df.columns[0], int):
            df.columns = ["NA"]
        # add intersections if asked for
        features = df.columns
        if sensitive_intersections is False or len(features) == 1:
            return df
        elif sensitive_intersections is True:
            sensitive_intersections = features
        intersections = []
        for i in range(2, len(features) + 1):
            intersections += list(itertools.combinations(sensitive_intersections, i))
        for intersection in intersections:
            # BUG FIX: cast the first column to str as well; the original cast
            # only the subsequent columns, so a numeric first sensitive feature
            # crashed on `.str.cat`.
            tmp = df[intersection[0]].astype(str)
            for col in intersection[1:]:
                tmp = tmp.str.cat(df[col].astype(str), sep="_")
            label = "_".join(intersection)
            df[label] = tmp
        return df

    def _process_X(self, X):
        """Hook for subclasses; base implementation returns X unchanged."""
        return X

    def _process_y(self, y):
        """Hook for subclasses; base implementation returns y unchanged."""
        return y

    def _validate_inputs(self):
        """Basic input validation"""
        if self.X is not None:
            self._validate_X()
        if self.y is not None:
            self._validate_y()
        if self.sensitive_features is not None:
            self._validate_sensitive()

    def _validate_sensitive(self):
        """Sensitive features validation

        Raises
        ------
        ValidationError
            If sensitive features are not a pd.Series/pd.DataFrame, or do not
            match X in length or index.
        """
        # Validate the types
        if not isinstance(self.sensitive_features, (pd.Series, pd.DataFrame)):
            raise ValidationError(
                "Sensitive_feature type is '"
                + type(self.sensitive_features).__name__
                + "' but the required type is either pd.DataFrame or pd.Series"
            )
        if self.X is not None:
            if len(self.X) != len(self.sensitive_features):
                # BUG FIX: the reported length used len(self.y) instead of
                # len(self.sensitive_features).
                raise ValidationError(
                    "X and sensitive_features are not the same length. "
                    + f"X Length: {len(self.X)}, sensitive_features Length: {len(self.sensitive_features)}"
                )
            if isinstance(self.X, (pd.Series, pd.DataFrame)) and not self.X.index.equals(
                self.sensitive_features.index
            ):
                raise ValidationError("X and sensitive features must have the same index")
        # NOTE: an unnamed pd.Series is deliberately allowed — _process_sensitive
        # renames its column to "NA". The original `hasattr(..., "name")` check
        # here was dead code (every Series has a `name` attribute, possibly None)
        # and has been removed.

    @abstractmethod
    def _validate_X(self):
        pass

    @abstractmethod
    def _validate_y(self):
        pass

    def _validate_processing(self):
        """Validation of processed data"""
        if self.X is not None:
            self._validate_processed_X()
        if self.y is not None:
            self._validate_processed_y()
        if self.sensitive_features is not None:
            self._validate_processed_sensitive()

    def _validate_processed_X(self):
        """Hook for subclasses; no base validation of processed X."""
        pass

    def _validate_processed_y(self):
        """Hook for subclasses; no base validation of processed y."""
        pass

    def _validate_processed_sensitive(self):
        """Validation of processed sensitive features

        Raises
        ------
        ValidationError
            If any sensitive feature column is constant (a single unique value
            cannot be disaggregated on).
        """
        # `items` replaces `iteritems`, which was removed in pandas 2.0.
        for col_name, col in self.sensitive_features.items():
            unique_values = col.unique()
            if len(unique_values) == 1:
                raise ValidationError(
                    f"Sensitive Feature column {col_name} must have more "
                    f"than one unique value. Only found one value: {unique_values[0]}"
                )