Merge pull request #107 from ealcobaca/extract-mtf-names

Method to extract meta-feature names before any extraction
ealcobaca · Aug 25, 2020 · 29e3527 · 29e3527
2 parents c8f5d4a + 07c85d0
commit 29e3527
Show file tree

Hide file tree

Showing 3 changed files with 227 additions and 8 deletions.
diff --git a/pymfe/complexity.py b/pymfe/complexity.py
@@ -813,7 +813,7 @@ def ft_f2(
         y: np.ndarray,
         ovo_comb: t.Optional[np.ndarray] = None,
         cls_inds: t.Optional[np.ndarray] = None,
-    ) -> float:
+    ) -> np.ndarray:
         """Volume of the overlapping region.
 
         This measure calculates the overlap of the distributions of
@@ -840,8 +840,8 @@ def ft_f2(
 
         Returns
         -------
-        float
-            Volume of the overlapping region.
+        :obj:`np.ndarray`
+            Volume of the overlapping region for each OVO combination.
 
         References
         ----------
@@ -864,7 +864,7 @@ def ft_f2(
             ovo_comb = sub_dic["ovo_comb"]
             cls_inds = sub_dic["cls_inds"]
 
-        f4 = np.zeros(ovo_comb.shape[0], dtype=float)
+        f2 = np.zeros(ovo_comb.shape[0], dtype=float)
 
         for ind, (cls_id_1, cls_id_2) in enumerate(ovo_comb):
             N_cls_1 = N[cls_inds[cls_id_1], :]
@@ -875,11 +875,11 @@ def ft_f2(
             minmax = cls._calc_minmax(N_cls_1, N_cls_2)
             maxmin = cls._calc_maxmin(N_cls_1, N_cls_2)
 
-            f4[ind] = np.prod(
+            f2[ind] = np.prod(
                 np.maximum(0.0, minmax - maxmin) / (maxmax - minmin)
             )
 
-        return f4
+        return f2
 
     @classmethod
     def ft_f3(
@@ -2499,7 +2499,7 @@ def ft_cls_coef(
         cls_inds: t.Optional[np.ndarray] = None,
         N_scaled: t.Optional[np.ndarray] = None,
         norm_dist_mat: t.Optional[np.ndarray] = None,
-    ) -> np.ndarray:
+    ) -> float:
         """Clustering coefficient.
 
         The clustering coefficient of a vertex `v_i` is given by the
@@ -2552,7 +2552,7 @@ class to both be considered neighbors of each other. Note that
 
         Returns
         -------
-        :obj:`np.ndarray`
+        float
             Clustering coefficient of given data.
 
         References

diff --git a/pymfe/mfe.py b/pymfe/mfe.py
@@ -1345,6 +1345,129 @@ def extract(
 
         return res_names, res_vals
 
+    def extract_metafeature_names(
+        self, supervised: bool = True
+    ) -> t.Tuple[str, ...]:
+        """Extract the pre-configured meta-feature names.
+
+        Parameters
+        ----------
+        supervised : bool, optional
+            If True, extract the meta-feature names assuming that `y` (data
+            labels) is given alongside `X` (independent attributes).
+
+            If data has already been fitted into the MFE model, this method
+            checks whether `y` was fitted or not. Therefore, setting
+            `supervised=True` after fitting only `X` has no effect, and only
+            unsupervised meta-feature names will be returned.
+
+        Returns
+        -------
+        tuple
+            Sorted tuple with the names of the meta-features to be extracted.
+        """
+        if self.X is not None:
+            custom_args_ft = self._custom_args_ft
+            precomp_args_ft = self._precomp_args_ft
+            postprocess_args_ft = self._postprocess_args_ft
+
+        else:
+            # No data fitted yet: assume this set of arguments is available.
+            given_arguments = {
+                "X",
+                "N",
+                "C",
+                "num_cv_folds",
+                "shuffle_cv_folds",
+                "lm_sample_frac",
+                "score",
+                "random_state",
+                "cat_cols",
+                "hypparam_model_dt",
+            }
+
+            if supervised:
+                given_arguments.add("y")
+                given_arguments.add("dt_model")
+
+            postprocess_args_ft = {
+                "inserted_group_dep": self.inserted_group_dep,
+            }
+
+            custom_args_ft = dict.fromkeys(given_arguments, None)
+            precomp_args_ft = {}
+
+        metafeat_names = []  # type: t.List[str]
+
+        for cur_metadata in self._metadata_mtd_ft:
+            (
+                ft_mtd_name,
+                ft_mtd_callable,
+                ft_mtd_args,
+                ft_mandatory,
+            ) = cur_metadata
+
+            ft_name_without_prefix = _internal.remove_prefix(
+                value=ft_mtd_name, prefix=_internal.MTF_PREFIX
+            )
+
+            try:
+                # Test if meta-feature can be extracted.
+                _internal.build_mtd_kwargs(
+                    mtd_name=ft_name_without_prefix,
+                    mtd_args=ft_mtd_args,
+                    mtd_mandatory=ft_mandatory,
+                    user_custom_args=None,
+                    inner_custom_args=custom_args_ft,
+                    precomp_args=precomp_args_ft,
+                    suppress_warnings=True,
+                )
+
+            except RuntimeError:
+                continue
+
+            ft_has_length = _internal.array_is_returned(ft_mtd_callable)
+
+            if self._metadata_mtd_sm and ft_has_length:
+                for cur_metadata_sm in self._metadata_mtd_sm:
+                    sm_mtd_name, sm_mtd_callable, _, _ = cur_metadata_sm
+
+                    try:
+                        summarized_val_len = len(sm_mtd_callable([0]))
+
+                    except TypeError:
+                        summarized_val_len = 0
+
+                    if summarized_val_len > 0:
+                        metafeat_names += [
+                            ".".join(
+                                (ft_name_without_prefix, sm_mtd_name, str(i))
+                            )
+                            for i in range(summarized_val_len)
+                        ]
+
+                    else:
+                        metafeat_names.append(
+                            ".".join((ft_name_without_prefix, sm_mtd_name))
+                        )
+
+            else:
+                metafeat_names.append(ft_name_without_prefix)
+
+        fake_vals = len(metafeat_names) * [0]
+
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore")
+
+            _internal.post_processing(
+                results=(metafeat_names, fake_vals, fake_vals),
+                groups=self.groups,
+                suppress_warnings=True,
+                **postprocess_args_ft,
+            )
+
+        return tuple(sorted(metafeat_names))
+
     def _extract_with_bootstrap(
         self,
         extractor: "MFE",

diff --git a/tests/test_architecture.py b/tests/test_architecture.py
@@ -243,6 +243,102 @@ def test_default_alias_groups(self):
         assert (len(res) == len(_internal.VALID_GROUPS)
                 and not set(res).symmetric_difference(_internal.VALID_GROUPS))
 
+    @pytest.mark.parametrize("groups, summary", [
+        ("statistical", "all"),
+        ("general", "all"),
+        ("landmarking", "all"),
+        ("relative", "all"),
+        ("model-based", "all"),
+        ("info-theory", "all"),
+        ("statistical", ("mean", "sd")),
+        ("general", ("mean", "sd")),
+        ("landmarking", ("mean", "sd")),
+        ("model-based", ("mean", "sd")),
+        ("general", ("mean", "histogram")),
+        ("landmarking", ("mean", "histogram")),
+        ("model-based", ("mean", "histogram")),
+        ("general", ("quantiles", "histogram")),
+        ("landmarking", ("quantiles", "histogram")),
+        ("model-based", ("quantiles", "histogram")),
+        (["general", "relative"], ("mean", "sd")),
+        (["general", "relative"], ("quantiles", "histogram")),
+        (["landmarking", "relative"], ("mean", "sd")),
+        (["landmarking", "relative"], ("quantiles", "histogram")),
+        (["statistical", "landmarking", "relative"], ("mean", "sd")),
+        ("all", "all"),
+    ])
+    def test_extract_metafeature_names_supervised(self, groups, summary):
+        """Test .extract_metafeature_names method."""
+        X, y = utils.load_xy(0)
+
+        mfe = MFE(groups=groups, summary=summary)
+
+        mtf_names_1 = mfe.extract_metafeature_names(supervised=True)
+        mtf_names_2 = mfe.fit(X.values, y.values).extract(suppress_warnings=True)[0]
+
+        assert mtf_names_1 == tuple(mtf_names_2)
+
+    @pytest.mark.parametrize("groups, summary", [
+        ("statistical", "all"),
+        ("general", "all"),
+        ("landmarking", "all"),
+        ("relative", "all"),
+        ("model-based", "all"),
+        ("info-theory", "all"),
+        ("statistical", ("mean", "sd")),
+        ("general", ("mean", "sd")),
+        ("landmarking", ("mean", "sd")),
+        ("model-based", ("mean", "sd")),
+        ("general", ("mean", "histogram")),
+        ("landmarking", ("mean", "histogram")),
+        ("model-based", ("mean", "histogram")),
+        ("general", ("quantiles", "histogram")),
+        ("landmarking", ("quantiles", "histogram")),
+        ("model-based", ("quantiles", "histogram")),
+        (["general", "relative"], ("mean", "sd")),
+        (["general", "relative"], ("quantiles", "histogram")),
+        (["landmarking", "relative"], ("mean", "sd")),
+        (["landmarking", "relative"], ("quantiles", "histogram")),
+        (["statistical", "landmarking", "relative"], ("mean", "sd")),
+        ("all", "all"),
+    ])
+    def test_extract_metafeature_names_unsupervised_01(self, groups, summary):
+        """Test .extract_metafeature_names method."""
+        X, _ = utils.load_xy(0)
+
+        mfe = MFE(groups=groups, summary=summary)
+
+        mtf_names_1 = mfe.extract_metafeature_names(supervised=False)
+        mtf_names_2 = mfe.fit(X.values).extract(suppress_warnings=True)[0]
+
+        assert mtf_names_1 == tuple(mtf_names_2)
+
+    @pytest.mark.parametrize("groups, summary", [
+        ("general", "all"),
+        ("statistical", ("mean", "sd")),
+        (["general", "relative"], ("mean", "sd")),
+        (["general", "relative"], ("quantiles", "histogram")),
+        (["landmarking", "relative"], ("mean", "sd")),
+        (["landmarking", "relative"], ("quantiles", "histogram")),
+        (["statistical", "landmarking", "relative"], ("mean", "sd")),
+        ("all", "all"),
+    ])
+    def test_extract_metafeature_names_unsupervised_02(self, groups, summary):
+        """Test .extract_metafeature_names method."""
+        X, _ = utils.load_xy(0)
+
+        mfe = MFE(groups=groups, summary=summary)
+
+        mtf_names_1 = mfe.fit(X.values).extract(suppress_warnings=True)[0]
+        # Note: by default, .extract_metafeature_names should check whether
+        # 'y' was fitted or not if .fit was called before. Therefore, here,
+        # supervised=True is expected to be ignored and behave like
+        # supervised=False.
+        mtf_names_2 = mfe.extract_metafeature_names(supervised=True)
+        mtf_names_3 = mfe.extract_metafeature_names(supervised=False)
+
+        assert tuple(mtf_names_1) == mtf_names_2 == mtf_names_3
+
     @pytest.mark.parametrize("groups", [
         "statistical",
         "general",