Merge pull request #89 from ealcobaca/missing-vals

Handling more special cases in data encoding
ealcobaca · Jul 2, 2020 · 45530ec · 45530ec
2 parents 80a0fc8 + 573475c
commit 45530ec
Show file tree

Hide file tree

Showing 3 changed files with 56 additions and 20 deletions.
diff --git a/pymfe/_internal.py b/pymfe/_internal.py
@@ -1430,7 +1430,15 @@ def transform_cat_gray(data_categoric: np.ndarray) -> t.Optional[np.ndarray]:
 
     formula = "~ 0 + {}".format(" + ".join(dummy_attr_names))
 
-    return np.asarray(patsy.dmatrix(formula, named_data))
+    try:
+        enc_data = patsy.dmatrix(formula, named_data, NA_action="raise")
+        return np.asarray(enc_data, dtype=float)
+
+    except patsy.PatsyError:
+        raise ValueError("Categorical data encoding of type 'gray' has no "
+                         "support for missing values. Please handle the "
+                         "missing data manually before fitting it into the "
+                         "MFE model.")
 
 
 def transform_cat_onehot(
@@ -1451,14 +1459,21 @@ def transform_cat_onehot(
     for attr_ind in np.arange(num_col):
         cur_attr = data_categoric[:, attr_ind, np.newaxis]
 
-        if not use_all_columns and np.unique(cur_attr).size <= 1:
+        if not use_all_columns and len(set(cur_attr.ravel())) <= 1:
             raise ValueError("This type of one-hot encoding does not "
                              "support features with 1 or less distinct "
                              "values. Drop the {}th categorical feature "
                              "or select another encoding strategy.".format(
                                  attr_ind + 1))
 
-        one_cat_attrs.append(ohe.fit_transform(cur_attr))
+        try:
+            one_cat_attrs.append(ohe.fit_transform(cur_attr))
+
+        except ValueError:
+            raise ValueError("Categorical data encoding of type 'one-hot' has "
+                             "no support for missing values. Please handle the"
+                             " missing data manually before fitting it into "
+                             "the MFE model.")
 
     return np.hstack(one_cat_attrs)
 
@@ -1467,23 +1482,23 @@ def _equal_freq_discretization(data: np.ndarray,
                                num_bins: int,
                                tol: float = 1e-8) -> np.ndarray:
     """Discretize a 1-D numeric array into an equal-frequency histogram."""
-    perc_interval = 100.0 / num_bins
-    perc_range = np.arange(perc_interval, 100, perc_interval)
-    hist_divs = np.percentile(data, perc_range)
+    hist_divs = np.quantile(data, np.linspace(0, 1, num_bins + 1)[1:])
 
     # Sometimes 'hist_divs' is not appropriate.
     # For example, when all values are constant, 'hist_divs' contains
     # repeated values.
     # To avoid partitions with the same value, we check that all partitions
     # are different. Unfortunately, this leads to a non-equal-frequency
     # discretization.
-    aux = len(hist_divs)
-    diffs = np.append(True, np.diff(hist_divs))
-    hist_divs = hist_divs[diffs > tol]
-    if aux != len(hist_divs):
+    prev_size = hist_divs.size
+
+    hist_divs = hist_divs[np.append(True, np.diff(hist_divs) > tol)]
+
+    if prev_size != hist_divs.size:
         warnings.warn("It is not possible make equal discretization")
 
     hist_divs = np.unique(hist_divs)
+
     return np.digitize(x=data, bins=hist_divs, right=True)
 
 

diff --git a/pymfe/mfe.py b/pymfe/mfe.py
@@ -726,12 +726,11 @@ def _set_data_categoric(self, transform_num: bool,
         data_cat = self.X[:, self._attr_indexes_cat]
 
         if transform_num:
-            data_num_discretized = _internal.transform_num(
+            data_num_disc = _internal.transform_num(
                 self.X[:, self._attr_indexes_num], num_bins=num_bins)
 
-            if data_num_discretized is not None:
-                data_cat = np.concatenate((data_cat, data_num_discretized),
-                                          axis=1)
+            if data_num_disc is not None and data_num_disc.size > 0:
+                data_cat = np.hstack((data_cat, data_num_disc))
 
         return data_cat
 
@@ -803,19 +802,18 @@ def _set_data_numeric(
 
         if transform_cat:
             if transform_cat == "gray":
-                categorical_dummies = _internal.transform_cat_gray(
+                cat_dummies = _internal.transform_cat_gray(
                     self.X[:, self._attr_indexes_cat])
 
             else:
                 _use_all_ohe_columns = transform_cat == "one-hot-full"
 
-                categorical_dummies = _internal.transform_cat_onehot(
+                cat_dummies = _internal.transform_cat_onehot(
                     self.X[:, self._attr_indexes_cat],
                     use_all_columns=_use_all_ohe_columns)
 
-            if categorical_dummies is not None:
-                data_num = np.concatenate((data_num, categorical_dummies),
-                                          axis=1).astype(float)
+            if cat_dummies is not None and cat_dummies.size > 0:
+                data_num = np.hstack((data_num, cat_dummies)).astype(float)
 
         if rescale:
             data_num = _internal.rescale_data(
@@ -984,7 +982,6 @@ def fit(self,
         TypeError
             If X or y (or both) is neither a :obj:`list` nor a
             :obj:`np.ndarray` object.
-
         """
         if verbose >= 2:
             print("Fitting data into model... ", end="")

diff --git a/tests/test_architecture.py b/tests/test_architecture.py
@@ -279,6 +279,18 @@ def test_no_cat_transformation(self):
         mfe.fit(X.values, y.values, transform_cat=None)
         assert mfe._custom_args_ft["N"].size == 0
 
+    def test_gray_encoding_missing_value(self):
+        X, y = utils.load_xy(1)
+        mfe = MFE()
+
+        X = np.copy(X.values)
+        y = y.values
+
+        X[5, 0] = np.nan
+
+        with pytest.raises(ValueError):
+            mfe.fit(X, y, transform_cat="gray")
+
     def test_one_hot_encoding_01(self):
         X, y = utils.load_xy(1)
         mfe = MFE()
@@ -316,6 +328,18 @@ def test_one_hot_encoding_04(self):
         with pytest.raises(ValueError):
             mfe.fit(X=X, y=y, transform_cat="one-hot")
 
+    def test_ohe_full_encoding_missing_value(self):
+        X, y = utils.load_xy(1)
+        mfe = MFE()
+
+        X = np.copy(X.values)
+        y = y.values
+
+        X[5, 0] = np.nan
+
+        with pytest.raises(ValueError):
+            mfe.fit(X, y, transform_cat="one-hot")
+
     @pytest.mark.parametrize("confidence", (0.95, 0.99))
     def test_extract_with_confidence(self, confidence):
         X, y = utils.load_xy(2)