Skip to content

Commit

Permalink
Merge pull request #56 from ealcobaca/itemset
Browse files Browse the repository at this point in the history
Itemset
  • Loading branch information
FelSiq committed Dec 10, 2019
2 parents 7caad1c + 1a3051f commit 4b475c9
Show file tree
Hide file tree
Showing 4 changed files with 291 additions and 7 deletions.
23 changes: 20 additions & 3 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -119,13 +119,30 @@ This is the full API documentation of the `pymfe` toolbox.
.. autosummary::
:toctree: generated/

concept.MFEConcept
clustering.MFEClustering


.. _itemset_ref:

:mod:`pymfe.itemset`: Itemset Meta-features
===================================================

.. automodule:: pymfe.itemset
:no-members:
:no-inherited-members:

.. currentmodule:: pymfe

.. autosummary::
:toctree: generated/

itemset.MFEItemset


.. _concept_ref:

:mod:`pymfe.concept`: Concept Meta-features
===========================================
===================================================

.. automodule:: pymfe.concept
:no-members:
Expand All @@ -136,4 +153,4 @@ This is the full API documentation of the `pymfe` toolbox.
.. autosummary::
:toctree: generated/

concept.MFEConcept
concept.MFEConcept
8 changes: 4 additions & 4 deletions pymfe/_internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@
import pymfe.clustering as clustering
import pymfe.model_based as model_based
import pymfe.complexity as complexity
# import pymfe.itemset as itemset
import pymfe.itemset as itemset
import pymfe.concept as concept
import pymfe.scoring as scoring

Expand All @@ -106,7 +106,7 @@
"relative",
"clustering",
"complexity",
# "itemset",
"itemset",
"concept"
) # type: t.Tuple[str, ...]

Expand All @@ -119,7 +119,7 @@
"landmarking",
None,
None,
# None,
None,
None
) # type: t.Tuple[t.Optional[str], ...]

Expand All @@ -132,7 +132,7 @@
relative.MFERelativeLandmarking,
clustering.MFEClustering,
complexity.MFEComplexity,
# itemset.MFEItemset,
itemset.MFEItemset,
concept.MFEConcept
) # type: t.Tuple

Expand Down
187 changes: 187 additions & 0 deletions pymfe/itemset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
"""Module dedicated to extraction of Itemset Metafeatures."""

import typing as t
import numpy as np


class MFEItemset:
    """Keep methods for metafeatures of ``Itemset`` group.

    The convention adopted for metafeature extraction related methods is to
    always start with ``ft_`` prefix to allow automatic method detection. This
    prefix is predefined within ``_internal`` module.

    All method signature follows the conventions and restrictions listed below:

    1. For independent attribute data, ``X`` means ``every type of attribute``,
       ``N`` means ``Numeric attributes only`` and ``C`` stands for
       ``Categorical attributes only``. It is important to note that the
       categorical attribute sets between ``X`` and ``C`` and the numerical
       attribute sets between ``X`` and ``N`` may differ due to data
       transformations, performed while fitting data into MFE model,
       enabled by, respectively, ``transform_num`` and ``transform_cat``
       arguments from ``fit`` (MFE method).

    2. Only arguments in MFE ``_custom_args_ft`` attribute (set up inside
       ``fit`` method) are allowed to be required method arguments. All other
       arguments must be strictly optional (i.e., has a predefined default
       value).

    3. The initial assumption is that the user can change any optional
       argument, without any previous verification of argument value or its
       type, via kwargs argument of ``extract`` method of MFE class.

    4. The return value of all feature extraction methods should be a single
       value or a generic Sequence (preferably a :obj:`np.ndarray`) type with
       numeric values.

    There is another type of method adopted for automatic detection. It is
    adopted the prefix ``precompute_`` for automatic detection of these
    methods. These methods run while fitting some data into an MFE model
    automatically, and their objective is to precompute some common value
    shared between more than one feature extraction method. This strategy is a
    trade-off between more system memory consumption and speed up of feature
    extraction. Their return value must always be a dictionary whose keys are
    possible extra arguments for both feature extraction methods and other
    precomputation methods. Note that there is a share of precomputed values
    between all valid feature-extraction modules (e.g., ``class_freqs``
    computed in module ``statistical`` can freely be used for any
    precomputation or feature extraction method of module ``landmarking``).
    """

    @classmethod
    def precompute_binary_matrix(cls, C: t.Optional[np.ndarray],
                                 **kwargs) -> t.Dict[str, t.Any]:
        """Precompute the binary representation of attributes.

        Parameters
        ----------
        C : :obj:`np.ndarray`, optional
            Attributes from fitted data.

        **kwargs
            Additional arguments. May have previously precomputed before this
            method from other precomputed methods, so they can help speed up
            this precomputation.

        Returns
        -------
        :obj:`dict`
            With following precomputed items:
                - ``itemset_binary_matrix`` (:obj:`list`): Binary
                  representation of the attributes.

            The dictionary is empty when ``C`` is None or when
            ``itemset_binary_matrix`` is already present in ``kwargs``.
        """
        precomp_vals = {}  # type: t.Dict[str, t.Any]

        if C is not None and "itemset_binary_matrix" not in kwargs:
            precomp_vals["itemset_binary_matrix"] = cls._matrix_to_binary(C)

        return precomp_vals

    @staticmethod
    def _array_to_binary(array: np.ndarray) -> np.ndarray:
        """Convert an array to its binary representation.

        The result is a boolean matrix with one row per element of ``array``
        and one column per distinct value (in the order given by
        ``np.unique``); an entry is True where the element equals the value
        of that column.
        """
        values = np.unique(array)
        res = np.zeros((array.shape[0], values.shape[0]), dtype=bool)
        for i, val in enumerate(values):
            res[:, i] = array == val
        return res

    @staticmethod
    def _matrix_to_binary(C: np.ndarray) -> t.List[np.ndarray]:
        """Convert a matrix to its binary representation.

        Returns a list with one boolean matrix (see ``_array_to_binary``)
        for each column of ``C``.
        """
        return [MFEItemset._array_to_binary(col) for col in C.T]

    @classmethod
    def ft_two_itemset(
            cls,
            C: np.ndarray,
            itemset_binary_matrix: t.Optional[t.List[np.ndarray]] = None,
    ) -> np.ndarray:
        """Computes the two itemset meta-feature.

        The two itemset meta-feature can be seen as the correlation
        information of each pair of attribute values in binary format. For
        every pair of distinct attributes, and for every pair of binary
        columns (one from each attribute), it is the number of instances
        where exactly one of the two values occurs (logical XOR), divided
        by the number of instances.

        Parameters
        ----------
        C : :obj:`np.ndarray`
            Attributes from fitted data.

        itemset_binary_matrix : :obj:`list`, optional
            Binary representation of the attributes. Each list value has a
            binary representation of each attributes in the dataset. It is
            computed from ``C`` if not given. This argument is never
            modified by this method.

        Returns
        -------
        :obj:`np.ndarray`
            An array with the twoitem for each pair of attribute values.
            It is empty if there are fewer than two attributes.

        References
        ----------
        .. [1] Song, Q., Wang, G., & Wang, C. (2012). Automatic recommendation
           of classification algorithms based on data set characteristics.
           Pattern recognition, 45(7), 2672-2689.
        """
        if itemset_binary_matrix is None:
            sub_dic = cls.precompute_binary_matrix(C)
            itemset_binary_matrix = sub_dic["itemset_binary_matrix"]

        result = []  # type: t.List[int]

        # Walk over every unordered pair of attributes without consuming the
        # input list: it may be the precomputed value shared with the other
        # feature extraction methods, so it must be left untouched.
        for ind, Bi in enumerate(itemset_binary_matrix):
            for Bj in itemset_binary_matrix[ind + 1:]:
                result.extend(
                    np.sum(np.logical_xor(i, j)) for i in Bi.T for j in Bj.T)

        twoitem_by_attr = np.array(result) / C.shape[0]

        return twoitem_by_attr

    @classmethod
    def ft_one_itemset(
            cls,
            C: np.ndarray,
            itemset_binary_matrix: t.Optional[t.List[np.ndarray]] = None,
    ) -> np.ndarray:
        """Computes the one itemset meta-feature.

        The one itemset is the individual frequency of each attribute value
        in binary format, i.e., the number of instances holding that value
        divided by the number of instances.

        Parameters
        ----------
        C : :obj:`np.ndarray`
            Attributes from fitted data.

        itemset_binary_matrix : :obj:`list`, optional
            Binary representation of the attributes. Each list value has a
            binary representation of each attributes in the dataset. It is
            computed from ``C`` if not given.

        Returns
        -------
        :obj:`np.ndarray`
            An array with the oneitem for each attribute value. It is empty
            if there is no attribute.

        References
        ----------
        .. [1] Song, Q., Wang, G., & Wang, C. (2012). Automatic recommendation
           of classification algorithms based on data set characteristics.
           Pattern recognition, 45(7), 2672-2689.
        """
        if itemset_binary_matrix is None:
            sub_dic = cls.precompute_binary_matrix(C)
            itemset_binary_matrix = sub_dic["itemset_binary_matrix"]

        # ``np.concatenate`` raises on an empty sequence; with no attribute
        # there is simply no frequency to report.
        if len(itemset_binary_matrix) == 0:
            return np.array([])

        B = np.concatenate(itemset_binary_matrix, axis=1)

        oneitem_by_attr = np.sum(B, axis=0) / C.shape[0]

        return oneitem_by_attr
80 changes: 80 additions & 0 deletions tests/test_itemset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""Test module for Itemset metafeatures."""
import pytest
import numpy as np

from pymfe.mfe import MFE
from tests.utils import load_xy
from pymfe.itemset import MFEItemset


GNAME = "itemset"


class TestLandmarking():
    """Test suite for the metafeatures of the Itemset group.

    NOTE(review): the class name looks like a leftover from the landmarking
    test module; it is kept unchanged here.
    """

    @pytest.mark.parametrize(
        "dt_id, ft_name, exp_value, precompute",
        [
            # ----- Mixed data -----
            (0, "one_itemset", [0.24999999, 0.0669328], True),
            (0, "two_itemset", [0.38297877, 0.10911008], True),
            (0, "one_itemset", [0.24999999, 0.0669328], False),
            (0, "two_itemset", [0.38297877, 0.10911008], False),
            # ----- Categorical data -----
            (1, "one_itemset", [0.49315068, 0.34882316], True),
            (1, "two_itemset", [0.5, 0.24335141], True),
            (1, "one_itemset", [0.49315068, 0.34882316], False),
            (1, "two_itemset", [0.5, 0.24335141], False),
            # ----- Numerical data -----
            (2, "one_itemset", [0.2, 0.049322903], True),
            (2, "two_itemset", [0.32, 0.084694475], True),
            (2, "one_itemset", [0.2, 0.049322903], False),
            (2, "two_itemset", [0.32, 0.084694475], False),
        ])
    def test_ft_methods_itemset(self, dt_id, ft_name, exp_value, precompute):
        """Check each itemset metafeature against its expected value.

        Every feature is extracted both with and without the precomputation
        of the itemset group enabled.
        """
        X, y = load_xy(dt_id)

        extractor = MFE(
            groups=[GNAME],
            features=[ft_name],
            random_state=1234)

        extractor.fit(
            X.values,
            y.values,
            precomp_groups=GNAME if precompute else None)

        extracted = extractor.extract()
        value = extracted[1]

        if exp_value is not np.nan:
            assert np.allclose(value, exp_value, equal_nan=True)
        else:
            assert value[0] is exp_value

    def test_itemset_using_author_dataset(self):
        """Reproduce the toy dataset and results used in the authors' paper.
        """
        toy_data = np.array([
            [0, 2, 3],
            [2, 5, 0],
            [1, 4, 1],
            [0, 2, 2],
            [3, 3, 3],
            [3, 2, 3],
            [0, 2, 0],
            [1, 3, 1],
            [2, 4, 3],
            [1, 5, 2],
        ])

        # One itemset: frequency of every attribute value.
        one_item = MFEItemset.ft_one_itemset(C=toy_data)
        expected_one_item = [0.3, 0.3, 0.2, 0.2, 0.4, 0.2,
                             0.2, 0.2, 0.2, 0.2, 0.2, 0.4]
        assert np.allclose(one_item, expected_one_item, equal_nan=True)

        # Two itemset: only the first three and the last two values are
        # checked.
        two_item = MFEItemset.ft_two_itemset(C=toy_data)

        head_inds = [0, 1, 2]
        assert np.allclose(
            two_item[head_inds], [0.1, 0.5, 0.5], equal_nan=True)

        tail_inds = [-2, -1]
        assert np.allclose(two_item[tail_inds], [0.2, 0.6], equal_nan=True)

0 comments on commit 4b475c9

Please sign in to comment.