
ACF model handles single-valued X
This revision adds functionality to the ACF
model so that it can handle single-valued
variables in the X matrix. This matters
because when learning the residual model for
each feature, a binary variable needs both
of its values present in the training set.
cosmicBboy committed Dec 17, 2017
1 parent 3cf63f2 commit c0c392d
Showing 3 changed files with 44 additions and 9 deletions.
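
For context, the failure mode this commit guards against is easy to reproduce: a per-feature residual model is a classifier trained to predict that feature from the protected attribute, and a classifier cannot be fit on a target column that contains only one class. A minimal sketch of the error, assuming a scikit-learn classifier such as LogisticRegression stands in for the binary residual estimator (the actual estimator class is not shown in this diff):

import numpy as np
from sklearn.linear_model import LogisticRegression

s = np.random.randint(0, 2, size=(100, 1))  # protected attribute
x_i = np.ones(100)                          # single-valued binary column of X

# With only one class present in the target, scikit-learn classifiers
# refuse to fit, which is what previously broke fitting the ACF model.
try:
    LogisticRegression().fit(s, x_i)
except ValueError as exc:
    print(exc)  # "This solver needs samples of at least 2 classes ..."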
22 changes: 22 additions & 0 deletions tests/test_counterfactually_fair_models.py
@@ -45,6 +45,28 @@ def test_fit_predict(random_X_data):
    assert min(lin_acf_pred_proba) > 0


def test_binary_single_class(random_X_data):
    """Linear ACF can handle training data with a single-valued column."""
    X = create_random_X(random_X_data)
    X = np.concatenate([
        X, np.ones((X.shape[0], 1))
    ], axis=1)
    s = create_s()
    y = create_y()
    for residual_type in ["pearson", "deviance", "absolute"]:
        lin_acf = counterfactually_fair_models.LinearACFClassifier(
            binary_residual_type=residual_type)
        lin_acf.fit(X, y, s)
        lin_acf_pred_proba = lin_acf.predict_proba(X, s)[:, 1]
        assert (lin_acf.fit_residuals_ ==
                lin_acf._compute_residuals_on_predict(X, s)).all()
        assert is_binary(lin_acf.predict(X, s))
        assert is_continuous(lin_acf_pred_proba)
        assert max(lin_acf_pred_proba) < 1
        assert min(lin_acf_pred_proba) > 0


def test_predict_value_error(random_X_data):
    """Raise ValueError if X doesn't have expected number of variables."""
    X = create_random_X(random_X_data)
23 changes: 17 additions & 6 deletions themis_ml/linear_model/counterfactually_fair_models.py
@@ -112,7 +112,11 @@ def fit(self, X, y, s):

        # fit residual estimators and compute residuals
        for i in range(self.n_input_variables_):
            if i in binary_index_set and len(set(X[:, i])) == 1:
                # if a binary variable only contains one of the classes
                # in the training set, then no residual can be computed.
                estimator, compute_residual_func = None, None
            elif i in continuous_index_set:
                estimator = clone(self.continuous_estimator)
                compute_residual_func = _compute_absolute_residuals
            elif i in binary_index_set:
@@ -123,9 +127,13 @@
            else:
                raise ValueError(
                    "index %s is not in continuous_index_ or binary_index_"
                    % i)
            # fit residual estimator and compute residuals
            if estimator and compute_residual_func:
                estimator.fit(residual_input, X[:, i])
                self.fit_residuals_[:, i] = compute_residual_func(
                    estimator, residual_input, X[:, i])
            else:
                self.fit_residuals_[:, i] = 0
            self.compute_residual_funcs_.append(compute_residual_func)
            self.residual_estimators_.append(estimator)

@@ -138,8 +146,11 @@ def _compute_residuals_on_predict(self, X, s):
        residual_input = s.reshape(-1, 1)
        for i, (estimator, compute_residual_func) in enumerate(
                zip(self.residual_estimators_, self.compute_residual_funcs_)):
            if estimator and compute_residual_func:
                predict_residuals[:, i] = compute_residual_func(
                    estimator, residual_input, X[:, i])
            else:
                predict_residuals[:, i] = self.fit_residuals_[:, i]
        return predict_residuals

    def _check_fitted(self, X):
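
Taken together, the two changes above mean that a degenerate (single-valued) binary column contributes a constant zero residual at fit time, and predict time falls back to those stored residuals instead of calling a nonexistent estimator. A minimal end-to-end sketch, assuming the import path implied by the file layout (the random data here is illustrative, not the test fixtures):

import numpy as np
from themis_ml.linear_model import counterfactually_fair_models

X = np.hstack([np.random.randn(50, 2), np.ones((50, 1))])  # last column is single-valued
s = np.random.randint(0, 2, size=50)
y = np.random.randint(0, 2, size=50)

lin_acf = counterfactually_fair_models.LinearACFClassifier()
lin_acf.fit(X, y, s)  # no longer raises on the degenerate column

# the degenerate column's residuals are the zeros stored at fit time
assert (lin_acf.fit_residuals_[:, -1] == 0).all()
assert (lin_acf._compute_residuals_on_predict(X, s)[:, -1] == 0).all()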
8 changes: 5 additions & 3 deletions themis_ml/metrics.py
@@ -23,7 +23,7 @@ def mean_differences_ci(y, s, ci=DEFAULT_CI):
    :param array-like y: shape (n, ) containing binary target variable, where
        1 is the desirable outcome and 0 is the undesirable outcome.
    :param array-like s: shape (n, ) containing binary protected class
        variable where 0 is the advantaged group and 1 is the disadvantaged
        group.
    :param float ci: confidence level used to compute the interval. Default:
        97.5, which yields the two-sided 95% interval based on the t-statistic
        and its associated degrees of freedom.
@@ -107,8 +107,10 @@ def normalized_mean_difference(y, s, norm_y=None, ci=DEFAULT_CI):
    Therefore the normalized mean difference will report a higher score than
    mean difference in two cases:
    - if there are fewer positive examples than there are advantaged
      observations.
    - if there are fewer negative examples than there are disadvantaged
      observations.
    Reference:
    Zliobaite, I. (2015). A survey on measuring indirect discrimination in
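
The two cases in the docstring above correspond to the normalizer d_max = min(p(y=1)/p(s=0), p(y=0)/p(s=1)) dropping below 1 in Zliobaite's formulation, which scales the raw mean difference up. A minimal sketch of that normalization, assuming this formulation; the module's actual implementation (including the confidence-interval machinery) is not shown in this diff:

import numpy as np

def normalized_mean_difference_sketch(y, s):
    """Zliobaite (2015) normalized difference, without confidence intervals."""
    y, s = np.asarray(y), np.asarray(s)
    # raw mean difference: p(y=1 | advantaged) - p(y=1 | disadvantaged)
    md = y[s == 0].mean() - y[s == 1].mean()
    # d_max < 1 exactly when positives are scarcer than advantaged
    # observations, or negatives are scarcer than disadvantaged observations
    d_max = min(y.mean() / (s == 0).mean(), (1 - y).mean() / (s == 1).mean())
    return md / d_max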
