
ACF model handles single-valued X
This revision adds functionality to the ACF
model so that it can handle single-valued
variables in the X matrix. This matters
because when learning the residual model for
each feature, a binary variable needs both
of its values present in the training set.
cosmicBboy committed Dec 17, 2017
1 parent 3cf63f2 commit c0c392d
Showing 3 changed files with 44 additions and 9 deletions.
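
For context, the failure mode this commit guards against is easy to reproduce: a per-feature residual model is a classifier trained to predict that feature from the protected attribute, and a classifier cannot be fit on a target column that contains only one class. A minimal sketch of the error, assuming a scikit-learn classifier such as LogisticRegression stands in for the binary residual estimator (the actual estimator class is not shown in this diff):

import numpy as np
from sklearn.linear_model import LogisticRegression

s = np.random.randint(0, 2, size=(100, 1))  # protected attribute
x_i = np.ones(100)                          # single-valued binary column of X

# With only one class present in the target, scikit-learn classifiers
# refuse to fit, which is what previously broke fitting the ACF model.
try:
    LogisticRegression().fit(s, x_i)
except ValueError as exc:
    print(exc)  # "This solver needs samples of at least 2 classes ..."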
22 changes: 22 additions & 0 deletions tests/test_counterfactually_fair_models.py
@@ -45,6 +45,28 @@ def test_fit_predict(random_X_data):
    assert min(lin_acf_pred_proba) > 0


def test_binary_single_class(random_X_data):
    """Linear ACF can handle training data with a single-valued column."""
    X = create_random_X(random_X_data)
    X = np.concatenate([
        X, np.ones((X.shape[0], 1))
    ], axis=1)
    s = create_s()
    y = create_y()
    for residual_type in ["pearson", "deviance", "absolute"]:
        lin_acf = counterfactually_fair_models.LinearACFClassifier(
            binary_residual_type=residual_type)
        lin_acf.fit(X, y, s)
        lin_acf_pred_proba = lin_acf.predict_proba(X, s)[:, 1]
        assert (lin_acf.fit_residuals_ ==
                lin_acf._compute_residuals_on_predict(X, s)).all()
        assert is_binary(lin_acf.predict(X, s))
        assert is_continuous(lin_acf_pred_proba)
        assert max(lin_acf_pred_proba) < 1
        assert min(lin_acf_pred_proba) > 0


def test_predict_value_error(random_X_data):
    """Raise ValueError if X doesn't have expected number of variables."""
    X = create_random_X(random_X_data)
23 changes: 17 additions & 6 deletions themis_ml/linear_model/counterfactually_fair_models.py
@@ -112,7 +112,11 @@ def fit(self, X, y, s):

        # fit residual estimators and compute residuals
        for i in range(self.n_input_variables_):
            if i in binary_index_set and len(set(X[:, i])) == 1:
                # if a binary variable only contains one of the classes
                # in the training set, then no residual can be computed.
                estimator, compute_residual_func = None, None
            elif i in continuous_index_set:
                estimator = clone(self.continuous_estimator)
                compute_residual_func = _compute_absolute_residuals
            elif i in binary_index_set:
@@ -123,9 +127,13 @@
            else:
                raise ValueError(
                    "index %s is not in continuous_index_ or binary_index_"
                    % i)
            # fit residual estimator and compute residuals
            if estimator and compute_residual_func:
                estimator.fit(residual_input, X[:, i])
                self.fit_residuals_[:, i] = compute_residual_func(
                    estimator, residual_input, X[:, i])
            else:
                self.fit_residuals_[:, i] = 0
            self.compute_residual_funcs_.append(compute_residual_func)
            self.residual_estimators_.append(estimator)

@@ -138,8 +146,11 @@ def _compute_residuals_on_predict(self, X, s):
        residual_input = s.reshape(-1, 1)
        for i, (estimator, compute_residual_func) in enumerate(
                zip(self.residual_estimators_, self.compute_residual_funcs_)):
            if estimator and compute_residual_func:
                predict_residuals[:, i] = compute_residual_func(
                    estimator, residual_input, X[:, i])
            else:
                predict_residuals[:, i] = self.fit_residuals_[:, i]
        return predict_residuals

    def _check_fitted(self, X):
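
Taken together, the two changes above mean that a degenerate (single-valued) binary column contributes a constant zero residual at fit time, and predict time falls back to those stored residuals instead of calling a nonexistent estimator. A minimal end-to-end sketch, assuming the import path implied by the file layout (the random data here is illustrative, not the test fixtures):

import numpy as np
from themis_ml.linear_model import counterfactually_fair_models

X = np.hstack([np.random.randn(50, 2), np.ones((50, 1))])  # last column is single-valued
s = np.random.randint(0, 2, size=50)
y = np.random.randint(0, 2, size=50)

lin_acf = counterfactually_fair_models.LinearACFClassifier()
lin_acf.fit(X, y, s)  # no longer raises on the degenerate column

# the degenerate column's residuals are the zeros stored at fit time
assert (lin_acf.fit_residuals_[:, -1] == 0).all()
assert (lin_acf._compute_residuals_on_predict(X, s)[:, -1] == 0).all()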
8 changes: 5 additions & 3 deletions themis_ml/metrics.py
@@ -23,7 +23,7 @@ def mean_differences_ci(y, s, ci=DEFAULT_CI):
    :param array-like y: shape (n, ) containing binary target variable, where
        1 is the desirable outcome and 0 is the undesirable outcome.
    :param array-like s: shape (n, ) containing binary protected class
        variable where 0 is the advantaged group and 1 is the disadvantaged
        group.
    :param float ci: confidence level used to compute the interval. Default:
        97.5, which yields the two-sided 95% interval based on the t-statistic
        and its associated degrees of freedom.
@@ -107,8 +107,10 @@ def normalized_mean_difference(y, s, norm_y=None, ci=DEFAULT_CI):
    Therefore the normalized mean difference will report a higher score than
    mean difference in two cases:
    - if there are fewer positive examples than there are advantaged
      observations.
    - if there are fewer negative examples than there are disadvantaged
      observations.
    Reference:
    Zliobaite, I. (2015). A survey on measuring indirect discrimination in
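
The two cases in the docstring above correspond to the normalizer d_max = min(p(y=1)/p(s=0), p(y=0)/p(s=1)) dropping below 1 in Zliobaite's formulation, which scales the raw mean difference up. A minimal sketch of that normalization, assuming this formulation; the module's actual implementation (including the confidence-interval machinery) is not shown in this diff:

import numpy as np

def normalized_mean_difference_sketch(y, s):
    """Zliobaite (2015) normalized difference, without confidence intervals."""
    y, s = np.asarray(y), np.asarray(s)
    # raw mean difference: p(y=1 | advantaged) - p(y=1 | disadvantaged)
    md = y[s == 0].mean() - y[s == 1].mean()
    # d_max < 1 exactly when positives are scarcer than advantaged
    # observations, or negatives are scarcer than disadvantaged observations
    d_max = min(y.mean() / (s == 0).mean(), (1 - y).mean() / (s == 1).mean())
    return md / d_max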
