Skip to content

Commit

Permalink
Merge pull request #107 from ealcobaca/extract-mtf-names
Browse files Browse the repository at this point in the history
Method to extract meta-feature names before any extraction
  • Loading branch information
ealcobaca committed Aug 25, 2020
2 parents c8f5d4a + 07c85d0 commit 29e3527
Show file tree
Hide file tree
Showing 3 changed files with 227 additions and 8 deletions.
16 changes: 8 additions & 8 deletions pymfe/complexity.py
Original file line number Diff line number Diff line change
Expand Up @@ -813,7 +813,7 @@ def ft_f2(
y: np.ndarray,
ovo_comb: t.Optional[np.ndarray] = None,
cls_inds: t.Optional[np.ndarray] = None,
) -> float:
) -> np.ndarray:
"""Volume of the overlapping region.
This measure calculates the overlap of the distributions of
Expand All @@ -840,8 +840,8 @@ def ft_f2(
Returns
-------
float
Volume of the overlapping region.
:obj:`np.ndarray`
Volume of the overlapping region for each OVO combination.
References
----------
Expand All @@ -864,7 +864,7 @@ def ft_f2(
ovo_comb = sub_dic["ovo_comb"]
cls_inds = sub_dic["cls_inds"]

f4 = np.zeros(ovo_comb.shape[0], dtype=float)
f2 = np.zeros(ovo_comb.shape[0], dtype=float)

for ind, (cls_id_1, cls_id_2) in enumerate(ovo_comb):
N_cls_1 = N[cls_inds[cls_id_1], :]
Expand All @@ -875,11 +875,11 @@ def ft_f2(
minmax = cls._calc_minmax(N_cls_1, N_cls_2)
maxmin = cls._calc_maxmin(N_cls_1, N_cls_2)

f4[ind] = np.prod(
f2[ind] = np.prod(
np.maximum(0.0, minmax - maxmin) / (maxmax - minmin)
)

return f4
return f2

@classmethod
def ft_f3(
Expand Down Expand Up @@ -2499,7 +2499,7 @@ def ft_cls_coef(
cls_inds: t.Optional[np.ndarray] = None,
N_scaled: t.Optional[np.ndarray] = None,
norm_dist_mat: t.Optional[np.ndarray] = None,
) -> np.ndarray:
) -> float:
"""Clustering coefficient.
The clustering coefficient of a vertex `v_i` is given by the
Expand Down Expand Up @@ -2552,7 +2552,7 @@ class to both be considered neighbors of each other. Note that
Returns
-------
:obj:`np.ndarray`
float
Clustering coefficient of given data.
References
Expand Down
123 changes: 123 additions & 0 deletions pymfe/mfe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1345,6 +1345,129 @@ def extract(

return res_names, res_vals

def extract_metafeature_names(
    self, supervised: bool = True
) -> t.Tuple[str, ...]:
    """Extract the pre-configured meta-feature names.

    No meta-feature value is actually computed: each configured
    meta-feature method is only checked for whether its arguments can be
    built, and each configured summary function is probed with a dummy
    input to find out how many values it produces.

    Parameters
    ----------
    supervised : bool, optional
        If True, extract the meta-feature names assuming that `y` (data
        labels) is given alongside `X` (independent attributes).

        If there is some data fit into the MFE model, this method checks
        whether `y` was fitted or not. Therefore, setting
        `supervised=True` while fitting only `X` has no effect, and only
        unsupervised meta-feature names will be returned.

    Returns
    -------
    tuple
        Sorted tuple with the meta-feature names to be extracted.
    """
    if self.X is not None:
        # Data was already fit: reuse the arguments stored in the model,
        # so the `supervised` flag is not consulted in this branch.
        custom_args_ft = self._custom_args_ft
        precomp_args_ft = self._precomp_args_ft
        postprocess_args_ft = self._postprocess_args_ft

    else:
        # No data fit, assume given arguments.
        given_arguments = {
            "X",
            "N",
            "C",
            "num_cv_folds",
            "shuffle_cv_folds",
            "lm_sample_frac",
            "score",
            "random_state",
            "cat_cols",
            "hypparam_model_dt",
        }

        if supervised:
            given_arguments.add("y")
            given_arguments.add("dt_model")

        postprocess_args_ft = {
            "inserted_group_dep": self.inserted_group_dep,
        }

        # Only the presence of each argument name matters here, hence
        # every placeholder value is None.
        custom_args_ft = dict.fromkeys(given_arguments, None)
        precomp_args_ft = {}

    metafeat_names = []  # type: t.List[str]

    for cur_metadata in self._metadata_mtd_ft:
        (
            ft_mtd_name,
            ft_mtd_callable,
            ft_mtd_args,
            ft_mandatory,
        ) = cur_metadata

        ft_name_without_prefix = _internal.remove_prefix(
            value=ft_mtd_name, prefix=_internal.MTF_PREFIX
        )

        try:
            # Test if meta-feature can be extracted.
            _internal.build_mtd_kwargs(
                mtd_name=ft_name_without_prefix,
                mtd_args=ft_mtd_args,
                mtd_mandatory=ft_mandatory,
                user_custom_args=None,
                inner_custom_args=custom_args_ft,
                precomp_args=precomp_args_ft,
                suppress_warnings=True,
            )

        except RuntimeError:
            # Arguments could not be built, so this meta-feature is
            # skipped and contributes no name.
            continue

        ft_has_length = _internal.array_is_returned(ft_mtd_callable)

        if self._metadata_mtd_sm and ft_has_length:
            for cur_metadata_sm in self._metadata_mtd_sm:
                sm_mtd_name, sm_mtd_callable, _, _ = cur_metadata_sm

                try:
                    # Probe the summary function with a dummy input to
                    # find out how many values it returns.
                    summarized_val_len = len(sm_mtd_callable([0]))

                except TypeError:
                    # E.g. len() of a scalar result: treated as a
                    # single-valued summary.
                    summarized_val_len = 0

                if summarized_val_len > 0:
                    # Multi-valued summary: one indexed name per value.
                    metafeat_names.extend(
                        ".".join(
                            (ft_name_without_prefix, sm_mtd_name, str(i))
                        )
                        for i in range(summarized_val_len)
                    )

                else:
                    metafeat_names.append(
                        ".".join((ft_name_without_prefix, sm_mtd_name))
                    )

        else:
            metafeat_names.append(ft_name_without_prefix)

    # Placeholder values and times matching the names collected so far.
    # Two independent lists are used on purpose: the return value of the
    # post-processing call is not used, so it is expected to update the
    # given lists in place, and sharing a single list object between two
    # slots would make every in-place edit hit that list twice.
    fake_vals = [0] * len(metafeat_names)
    fake_times = [0] * len(metafeat_names)

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")

        _internal.post_processing(
            results=(metafeat_names, fake_vals, fake_times),
            groups=self.groups,
            suppress_warnings=True,
            **postprocess_args_ft,
        )

    return tuple(sorted(metafeat_names))

def _extract_with_bootstrap(
self,
extractor: "MFE",
Expand Down
96 changes: 96 additions & 0 deletions tests/test_architecture.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,102 @@ def test_default_alias_groups(self):
assert (len(res) == len(_internal.VALID_GROUPS)
and not set(res).symmetric_difference(_internal.VALID_GROUPS))

@pytest.mark.parametrize(
    ("groups", "summary"),
    [
        ("statistical", "all"),
        ("general", "all"),
        ("landmarking", "all"),
        ("relative", "all"),
        ("model-based", "all"),
        ("info-theory", "all"),
        ("statistical", ("mean", "sd")),
        ("general", ("mean", "sd")),
        ("landmarking", ("mean", "sd")),
        ("model-based", ("mean", "sd")),
        ("general", ("mean", "histogram")),
        ("landmarking", ("mean", "histogram")),
        ("model-based", ("mean", "histogram")),
        ("general", ("quantiles", "histogram")),
        ("landmarking", ("quantiles", "histogram")),
        ("model-based", ("quantiles", "histogram")),
        (["general", "relative"], ("mean", "sd")),
        (["general", "relative"], ("quantiles", "histogram")),
        (["landmarking", "relative"], ("mean", "sd")),
        (["landmarking", "relative"], ("quantiles", "histogram")),
        (["statistical", "landmarking", "relative"], ("mean", "sd")),
        ("all", "all"),
    ],
)
def test_extract_metafeature_names_supervised(self, groups, summary):
    """Names predicted before fitting must match a supervised extraction.

    The names are requested from an unfitted model with `supervised=True`
    and compared against the names returned by fitting both `X` and `y`
    and running a real extraction.
    """
    X, y = utils.load_xy(0)

    extractor = MFE(groups=groups, summary=summary)

    # Must be queried before .fit, so the "no data fit" path is exercised.
    predicted_names = extractor.extract_metafeature_names(supervised=True)

    extractor.fit(X.values, y.values)
    extracted_names = extractor.extract(suppress_warnings=True)[0]

    assert predicted_names == tuple(extracted_names)

@pytest.mark.parametrize(
    ("groups", "summary"),
    [
        ("statistical", "all"),
        ("general", "all"),
        ("landmarking", "all"),
        ("relative", "all"),
        ("model-based", "all"),
        ("info-theory", "all"),
        ("statistical", ("mean", "sd")),
        ("general", ("mean", "sd")),
        ("landmarking", ("mean", "sd")),
        ("model-based", ("mean", "sd")),
        ("general", ("mean", "histogram")),
        ("landmarking", ("mean", "histogram")),
        ("model-based", ("mean", "histogram")),
        ("general", ("quantiles", "histogram")),
        ("landmarking", ("quantiles", "histogram")),
        ("model-based", ("quantiles", "histogram")),
        (["general", "relative"], ("mean", "sd")),
        (["general", "relative"], ("quantiles", "histogram")),
        (["landmarking", "relative"], ("mean", "sd")),
        (["landmarking", "relative"], ("quantiles", "histogram")),
        (["statistical", "landmarking", "relative"], ("mean", "sd")),
        ("all", "all"),
    ],
)
def test_extract_metafeature_names_unsupervised_01(self, groups, summary):
    """Names predicted before fitting must match an unsupervised extraction.

    The names are requested from an unfitted model with `supervised=False`
    and compared against the names returned by fitting only `X` and
    running a real extraction.
    """
    X, _ = utils.load_xy(0)

    extractor = MFE(groups=groups, summary=summary)

    # Must be queried before .fit, so the "no data fit" path is exercised.
    predicted_names = extractor.extract_metafeature_names(supervised=False)

    extractor.fit(X.values)
    extracted_names = extractor.extract(suppress_warnings=True)[0]

    assert predicted_names == tuple(extracted_names)

@pytest.mark.parametrize(
    ("groups", "summary"),
    [
        ("general", "all"),
        ("statistical", ("mean", "sd")),
        (["general", "relative"], ("mean", "sd")),
        (["general", "relative"], ("quantiles", "histogram")),
        (["landmarking", "relative"], ("mean", "sd")),
        (["landmarking", "relative"], ("quantiles", "histogram")),
        (["statistical", "landmarking", "relative"], ("mean", "sd")),
        ("all", "all"),
    ],
)
def test_extract_metafeature_names_unsupervised_02(self, groups, summary):
    """After fitting only `X`, the `supervised` flag must make no difference.

    Once .fit has been called, .extract_metafeature_names is expected to
    check whether `y` was fitted, so `supervised=True` should be ignored
    here and behave exactly like `supervised=False`.
    """
    X, _ = utils.load_xy(0)

    extractor = MFE(groups=groups, summary=summary)

    extractor.fit(X.values)
    extracted_names = extractor.extract(suppress_warnings=True)[0]

    # Both queries happen after the unsupervised fit.
    names_flag_on = extractor.extract_metafeature_names(supervised=True)
    names_flag_off = extractor.extract_metafeature_names(supervised=False)

    assert tuple(extracted_names) == names_flag_on == names_flag_off

@pytest.mark.parametrize("groups", [
"statistical",
"general",
Expand Down

0 comments on commit 29e3527

Please sign in to comment.