Skip to content

Commit

Permalink
Merge pull request #74 from ealcobaca/one-hot-encoding
Browse files Browse the repository at this point in the history
Categorical attributes one-hot encoding option
  • Loading branch information
ealcobaca committed Mar 6, 2020
2 parents c5f0045 + a118a17 commit 44344a5
Show file tree
Hide file tree
Showing 6 changed files with 92 additions and 16 deletions.
24 changes: 23 additions & 1 deletion pymfe/_internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,11 @@
"total_summ",
)

# Accepted strategies for the 'transform_cat' argument: how categorical
# attributes are encoded into numeric columns (see MFE.fit).
VALID_TRANSFORM_CAT = (
    "gray",
    "one-hot",
)

_RESCALE_SCALERS = {
"standard": sklearn.preprocessing.StandardScaler,
"min-max": sklearn.preprocessing.MinMaxScaler,
Expand Down Expand Up @@ -1312,7 +1317,7 @@ def timeit(func: t.Callable, *args) -> t.Tuple[t.Any, float]:
return ret_val, time_total


def transform_cat(data_categoric: np.ndarray) -> t.Optional[np.ndarray]:
def transform_cat_gray(data_categoric: np.ndarray) -> t.Optional[np.ndarray]:
"""Transform categorical data using a model matrix.
The formula used for this transformation is just the union (+) of all cat-
Expand Down Expand Up @@ -1342,6 +1347,23 @@ def transform_cat(data_categoric: np.ndarray) -> t.Optional[np.ndarray]:
return np.asarray(patsy.dmatrix(formula, named_data))


def transform_cat_onehot(data_categoric: np.ndarray) -> t.Optional[np.ndarray]:
    """Transform categorical data using one-hot encoding.

    Parameters
    ----------
    data_categoric : :obj:`np.ndarray`
        Categorical data as a 2-D array (instances x attributes).

    Returns
    -------
    :obj:`np.ndarray` or :obj:`NoneType`
        Dense array containing one binary column per distinct category
        of every attribute, in attribute order. :obj:`NoneType` if
        ``data_categoric`` is empty.
    """
    if data_categoric.size == 0:
        return None

    # OneHotEncoder natively handles 2-D input: it encodes every column
    # and concatenates the dummy columns in attribute order, which is
    # exactly equivalent to (but cheaper than) fitting one encoder per
    # column and horizontally stacking the results.
    ohe = sklearn.preprocessing.OneHotEncoder(sparse=False)

    return ohe.fit_transform(data_categoric)


def _equal_freq_discretization(data: np.ndarray,
num_bins: int,
tol: float = 1e-8) -> np.ndarray:
Expand Down
48 changes: 36 additions & 12 deletions pymfe/mfe.py
Original file line number Diff line number Diff line change
Expand Up @@ -727,16 +727,18 @@ def _set_data_categoric(self, transform_num: bool,

def _set_data_numeric(
self,
transform_cat: bool,
transform_cat: str,
rescale: t.Optional[str] = None,
rescale_args: t.Optional[t.Dict[str, t.Any]] = None) -> np.ndarray:
"""Returns numeric data from the fitted dataset.
Parameters
----------
transform_cat: :obj:`str`, optional
If True, then all categoric-type data will be binarized with a
model matrix strategy.
If `gray`, then all categoric-type data will be binarized with a
model matrix strategy. If `one-hot`, then all categoric-type
data will be transformed using the one-hot encoding strategy.
If None, then categorical attributes are not transformed.
rescale : :obj:`str`, optional
Check ``fit`` documentation for more information about this
Expand All @@ -761,6 +763,10 @@ def _set_data_numeric(
:obj:`NoneType`. This can be avoided passing valid data to fit and
first calling ``_fill_col_ind_by_type`` instance method before
this method.
ValueError
If `transform_cat` is neither None nor a value among `one-hot` and
`gray`.
"""
if self.X is None:
raise TypeError("It is necessary to fit valid data into the "
Expand All @@ -772,11 +778,22 @@ def _set_data_numeric(
"attributes. Please be sure to call method "
'"_fill_col_ind_by_type" before this method.')

if (transform_cat is not None and
transform_cat not in _internal.VALID_TRANSFORM_CAT):
raise ValueError("Invalid 'transform_cat' value ('{}'). Must be "
"a value in {}.".format(
transform_cat, _internal.VALID_TRANSFORM_CAT))

data_num = self.X[:, self._attr_indexes_num]

if transform_cat:
categorical_dummies = _internal.transform_cat(
self.X[:, self._attr_indexes_cat])
if transform_cat == "gray":
categorical_dummies = _internal.transform_cat_gray(
self.X[:, self._attr_indexes_cat])

else:
categorical_dummies = _internal.transform_cat_onehot(
self.X[:, self._attr_indexes_cat])

if categorical_dummies is not None:
data_num = np.concatenate((data_num, categorical_dummies),
Expand All @@ -792,7 +809,7 @@ def fit(self,
X: t.Sequence,
y: t.Sequence,
transform_num: bool = True,
transform_cat: bool = True,
transform_cat: str = "gray",
rescale: t.Optional[str] = None,
rescale_args: t.Optional[t.Dict[str, t.Any]] = None,
cat_cols: t.Optional[t.Union[str, t.Iterable[int]]] = "auto",
Expand Down Expand Up @@ -820,19 +837,26 @@ def fit(self,
discretized ones. If False, then numeric attributes are ignored for
categorical-only meta-features.
transform_cat : :obj:`bool`, optional
If True, categorical attributes are binarized using a model matrix
to use when alongside numerical data while extracting numeric-only
metafeatures. Note that categoric-only features still uses the
original categoric values, not the binarized ones. If False, then
categorical attributes are ignored for numeric-only metafeatures.
transform_cat : :obj:`str`, optional
Transform categorical data to use alongside numerical data while
extracting numeric-only metafeatures. Note that categoric-only
features still uses the original categoric values, and not the
binarized ones.
If `one-hot`, categorical attributes are binarized using one-hot
encoding.
If `gray`, categorical attributes are binarized using a model
matrix.
The formula used for this transformation is just the union (+) of
all categoric attributes using formula language from ``patsy``
package API, removing the intercept terms:
``~ 0 + A_1 + ... + A_n``, where ``n`` is the number of attributes
and A_i is the ith categoric attribute, 1 <= i <= n.
If None, then categorical attributes are not transformed.
rescale : :obj:`str`, optional
If :obj:`NoneType`, the model keeps all numeric data with its
original values. Otherwise, this argument can assume one of the
Expand Down
4 changes: 2 additions & 2 deletions pymfe/statistical.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,8 +215,8 @@ def _calc_can_cors(
At most min(num_classes, num_attr) canonical correlations are
kept.
"""
y_bin = sklearn.preprocessing.OneHotEncoder().fit_transform(
y.reshape(-1, 1)).todense()
y_bin = sklearn.preprocessing.OneHotEncoder(
sparse=False).fit_transform(y.reshape(-1, 1))

num_classes, num_attr = y_bin.shape[1], N.shape[1]
# Note: 'n_components' is a theoretical upper bound, so it is not
Expand Down
24 changes: 24 additions & 0 deletions tests/test_architecture.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,3 +222,27 @@ def test_parse_valid_metafeatures(self, groups):
names, _ = mfe.parse_by_group(groups, res)

assert not set(names).symmetric_difference(target_mtf)

def test_no_cat_transformation(self):
    """Numeric data must be empty when transform_cat is None.

    NOTE(review): presumably dataset 1 has only categorical
    attributes, so disabling the transformation leaves no numeric
    data at all -- confirm against the fixture.
    """
    X, y = load_xy(1)
    model = MFE().fit(X.values, y.values, transform_cat=None)

    assert model._custom_args_ft["N"].size == 0

def test_one_hot_encoding_01(self):
    """One-hot encoding must produce one column per distinct category."""
    X, y = load_xy(1)
    model = MFE().fit(X.values, y.values, transform_cat="one-hot")

    expected_num_cols = sum(
        np.unique(column).size for column in X.values.T)

    assert model._custom_args_ft["N"].shape[1] == expected_num_cols

def test_one_hot_encoding_02(self):
    """One-hot encoding of dataset 2 must keep the original column count."""
    X, y = load_xy(2)
    model = MFE().fit(X.values, y.values, transform_cat="one-hot")

    expected_num_cols = X.values.shape[1]

    assert model._custom_args_ft["N"].shape[1] == expected_num_cols
6 changes: 6 additions & 0 deletions tests/test_errors_warnings.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,3 +293,9 @@ def test_error__set_data_numeric(self):
mfe = MFE()
mfe.X = np.array([])
mfe._set_data_numeric(True)

def test_invalid_cat_transf(self):
    """fit must raise ValueError for an unknown 'transform_cat' value."""
    X, y = load_xy(0)

    # Construct the model outside the 'raises' context so that only
    # the 'fit' call can satisfy the expected exception; otherwise a
    # ValueError raised by the constructor would make the test pass
    # without exercising the validation under test.
    mfe = MFE()

    with pytest.raises(ValueError):
        mfe.fit(X.values, y.values, transform_cat="invalid")
2 changes: 1 addition & 1 deletion tests/test_scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_output_lengths_2(self, dt_id, scaler, exp_mean, exp_var, exp_min,
exp_max):
X, y = load_xy(dt_id)
model = MFE().fit(
X=X.values, y=y.values, rescale=scaler, transform_cat=False)
X=X.values, y=y.values, rescale=scaler, transform_cat=None)

numeric_data = model._custom_args_ft["N"]

Expand Down

0 comments on commit 44344a5

Please sign in to comment.