
Commit

Merge pull request #87 from ealcobaca/new-one-hot-enc
One-hot encoding option with only 'k-1' features
ealcobaca committed Jul 1, 2020
2 parents f477dc1 + b8f6cf6 commit b2d0f50
Showing 3 changed files with 90 additions and 27 deletions.
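
In short: for a categorical attribute with `k` distinct values, the `one-hot` option now produces `k-1` encoded columns, while the new `one-hot-full` option keeps all `k`. A minimal sketch of the difference on toy data (the `sparse` keyword matches the scikit-learn API targeted by this diff; newer releases name it `sparse_output`):

    import numpy as np
    import sklearn.preprocessing

    # One categorical attribute with k = 3 distinct values.
    col = np.array(["a", "b", "c", "b"]).reshape(-1, 1)

    # Traditional one-hot encoding ("one-hot-full"): k columns.
    full = sklearn.preprocessing.OneHotEncoder(sparse=False).fit_transform(col)

    # k-1 encoding ("one-hot" after this commit): the first column is dropped.
    km1 = sklearn.preprocessing.OneHotEncoder(
        drop="first", sparse=False).fit_transform(col)

    print(full.shape)  # (4, 3)
    print(km1.shape)   # (4, 2)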
28 changes: 21 additions & 7 deletions pymfe/_internal.py
@@ -150,6 +150,7 @@
 VALID_TRANSFORM_CAT = (
     "gray",
     "one-hot",
+    "one-hot-full",
 )

 _RESCALE_SCALERS = {
@@ -1432,21 +1433,34 @@ def transform_cat_gray(data_categoric: np.ndarray) -> t.Optional[np.ndarray]:
     return np.asarray(patsy.dmatrix(formula, named_data))


-def transform_cat_onehot(data_categoric: np.ndarray) -> t.Optional[np.ndarray]:
+def transform_cat_onehot(
+        data_categoric: np.ndarray,
+        use_all_columns: bool = True) -> t.Optional[np.ndarray]:
     """Transform categorical data using one-hot encoding."""
     if data_categoric.size == 0:
         return None

     _, num_col = data_categoric.shape

-    ohe = sklearn.preprocessing.OneHotEncoder(sparse=False)
+    _drop = None if use_all_columns else "first"

-    one_cat_attrs = np.hstack([
-        ohe.fit_transform(data_categoric[:, attr_ind, np.newaxis])
-        for attr_ind in np.arange(num_col)
-    ])
+    ohe = sklearn.preprocessing.OneHotEncoder(drop=_drop, sparse=False)

-    return one_cat_attrs
+    one_cat_attrs = []  # type: t.List[np.ndarray]
+
+    for attr_ind in np.arange(num_col):
+        cur_attr = data_categoric[:, attr_ind, np.newaxis]
+
+        if not use_all_columns and np.unique(cur_attr).size <= 1:
+            raise ValueError("This type of one-hot encoding does not "
+                             "support features with 1 or less distinct "
+                             "values. Drop the {}th categorical feature "
+                             "or select another encoding strategy.".format(
+                                 attr_ind + 1))
+
+        one_cat_attrs.append(ohe.fit_transform(cur_attr))
+
+    return np.hstack(one_cat_attrs)


 def _equal_freq_discretization(data: np.ndarray,
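
A usage sketch of the new helper on toy data (the import path assumes the installed pymfe package; the shapes follow from the per-attribute column counts described above):

    import numpy as np
    from pymfe import _internal

    # Two categorical attributes: k = 3 and k = 2 distinct values.
    cat = np.array([["a", "x"],
                    ["b", "y"],
                    ["c", "x"]])

    # Default ("one-hot-full" path): k columns per attribute -> 3 + 2 = 5.
    print(_internal.transform_cat_onehot(cat).shape)  # (3, 5)

    # k-1 path ("one-hot"): (3 - 1) + (2 - 1) = 3 columns.
    print(_internal.transform_cat_onehot(cat, use_all_columns=False).shape)  # (3, 3)

    # A constant attribute would get k - 1 = 0 columns, so the k-1 path rejects it.
    const = np.array([["a", "1"],
                      ["b", "1"],
                      ["c", "1"]])
    _internal.transform_cat_onehot(const, use_all_columns=False)  # raises ValueError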
67 changes: 48 additions & 19 deletions pymfe/mfe.py
@@ -737,26 +737,31 @@ def _set_data_categoric(self, transform_num: bool,

     def _set_data_numeric(
             self,
-            transform_cat: str,
+            transform_cat: str = None,
             rescale: t.Optional[str] = None,
             rescale_args: t.Optional[t.Dict[str, t.Any]] = None) -> np.ndarray:
         """Returns numeric data from the fitted dataset.

         Parameters
         ----------
-        transform_cat: :obj:`bool`
+        transform_cat: :obj:`str`, optional
             If `gray`, then all categoric-type data will be binarized with a
             model matrix strategy. If `one-hot`, then all categoric-type
-            data will be transformed using the one-hot encoding strategy.
-            If None, then categorical attributes are not transformed.
+            data will be transformed using the k-1 one-hot encoding strategy
+            (the traditional one-hot encoding with its first column dropped).
+            If `one-hot-full`, the strategy used is the one-hot encoding
+            with all encoded features (`k` features for an attribute with `k`
+            unique values; not recommended, as the `dummy variable trap` may
+            cause multicollinearity problems). If None, then the categorical
+            attributes are not transformed.

         rescale : :obj:`str`, optional
-            Check ``fit`` documentation for more information about this
-            parameter.
+            Check the documentation of the method ``fit`` for more information
+            about this.

         rescale_args : :obj:`dict`, optional
-            Check ``fit`` documentation for more information about this
-            parameter.
+            Check the documentation of the method ``fit`` for more information
+            about this.

         Returns
         -------
@@ -775,8 +780,8 @@ def _set_data_numeric(
             this method.

         ValueError
-            If `transform_cat` is neither None nor a value among `one-hot` and
-            `gray`.
+            If `transform_cat` is not in the set {None, `one-hot`, `gray`,
+            `one-hot-full`}.
         """
         if self.X is None:
             raise TypeError("It is necessary to fit valid data into the "
@@ -802,8 +807,11 @@ def _set_data_numeric(
                     self.X[:, self._attr_indexes_cat])

             else:
+                _use_all_ohe_columns = transform_cat == "one-hot-full"
+
                 categorical_dummies = _internal.transform_cat_onehot(
-                    self.X[:, self._attr_indexes_cat])
+                    self.X[:, self._attr_indexes_cat],
+                    use_all_columns=_use_all_ohe_columns)

             if categorical_dummies is not None:
                 data_num = np.concatenate((data_num, categorical_dummies),
@@ -859,16 +867,37 @@ def fit(self,
             binarized ones.

             If `one-hot`, categorical attributes are binarized using one-hot
-            encoding.
+            encoding with `k-1` features for a categorical attribute with `k`
+            distinct values. This algorithm works as follows:

-            If `gray`, categorical attributes are binarized using a model
-            matrix.
+            For each categorical attribute C:
+            1. Encode C with traditional one-hot encoding.
+            2. Arbitrarily drop the first column of the encoding result.

-            The formula used for this transformation is just the union (+) of
-            all categoric attributes using formula language from ``patsy``
-            package API, removing the intercept terms:
-            ``~ 0 + A_1 + ... + A_n``, where ``n`` is the number of attributes
-            and A_i is the ith categoric attribute, 1 <= i <= n.
+            The unique value previously represented by the k-length vector
+            [1, 0, ..., 0] will now be represented by the (k-1)-length vector
+            [0, 0, ..., 0]. Note that all other unique values will also now be
+            represented by (k-1)-length vectors (the first `0` is dropped out).
+
+            This algorithm avoids the `dummy variable trap`, which may cause
+            multicollinearity problems due to the unnecessary extra feature.
+            Note that the decision of dropping the very first encoded feature
+            is arbitrary, as any other encoded feature could have been dropped
+            instead.
+
+            If `gray`, categorical attributes are binarized using a model
+            matrix. The formula used for this transformation is just the union
+            (+) of all categoric attributes using formula language from the
+            `patsy` package API, removing the intercept terms:
+            `~ 0 + A_1 + ... + A_n`, where `n` is the number of attributes and
+            `A_i` is the ith categoric attribute, 1 <= i <= n.
+
+            If `one-hot-full`, categorical attributes are binarized using
+            one-hot encoding with `k` features for a categorical attribute
+            with `k` distinct values. This option is not recommended due to
+            the `dummy variable trap`, which may cause multicollinearity
+            problems due to an extra unnecessary variable (a label can be
+            encoded using the null vector [0, ..., 0]^T).

             If None, then categorical attributes are not transformed.
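
A fit-level sketch of the strategies documented above, on hypothetical toy data (`cat_cols` is used here to mark the categorical column explicitly, assuming the existing pymfe `fit` signature):

    import numpy as np
    from pymfe.mfe import MFE

    # One numeric column and one categorical column with k = 3 distinct values.
    X = np.array([[1.0, 0],
                  [2.0, 1],
                  [3.0, 2],
                  [4.0, 0]])
    y = np.array([0, 1, 0, 1])

    # "one-hot": the numeric data gains k - 1 = 2 encoded columns.
    MFE().fit(X, y, transform_cat="one-hot", cat_cols=[1])

    # "one-hot-full": the numeric data gains k = 3 encoded columns instead.
    MFE().fit(X, y, transform_cat="one-hot-full", cat_cols=[1])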
22 changes: 21 additions & 1 deletion tests/test_architecture.py
@@ -284,11 +284,20 @@ def test_one_hot_encoding_01(self):
         mfe = MFE()
         mfe.fit(X.values, y.values, transform_cat="one-hot")

-        exp_value = np.sum([np.unique(attr).size for attr in X.values.T])
+        exp_value = np.sum([np.unique(attr).size - 1 for attr in X.values.T])

         assert mfe._custom_args_ft["N"].shape[1] == exp_value

     def test_one_hot_encoding_02(self):
+        X, y = utils.load_xy(1)
+        mfe = MFE()
+        mfe.fit(X.values, y.values, transform_cat="one-hot-full")
+
+        exp_value = np.sum([np.unique(attr).size for attr in X.values.T])
+
+        assert mfe._custom_args_ft["N"].shape[1] == exp_value
+
+    def test_one_hot_encoding_03(self):
         X, y = utils.load_xy(2)
         mfe = MFE()
         mfe.fit(X.values, y.values, transform_cat="one-hot")
@@ -297,6 +306,16 @@ def test_one_hot_encoding_02(self):

         assert mfe._custom_args_ft["N"].shape[1] == exp_value

+    def test_one_hot_encoding_04(self):
+        X, y = utils.load_xy(2)
+        mfe = MFE()
+
+        X = np.hstack((X.values, np.ones((y.size, 1), dtype=str)))
+        y = y.values
+
+        with pytest.raises(ValueError):
+            mfe.fit(X=X, y=y, transform_cat="one-hot")
+
     @pytest.mark.parametrize("confidence", (0.95, 0.99))
     def test_extract_with_confidence(self, confidence):
         X, y = utils.load_xy(2)
@@ -462,6 +481,7 @@ def test_extract_from_model_invalid4(self):
         with pytest.raises(ValueError):
             MFE(groups="general").extract_from_model(model)

+
 class TestArchitectureWarnings:
     def test_feature_warning1(self):
         """Test exception handling of feature extraction."""
