Merge pull request #56 from ealcobaca/itemset

Itemset
ealcobaca · Dec 10, 2019 · 4b475c9 · 4b475c9
2 parents 7caad1c + 1a3051f
commit 4b475c9
Show file tree

Hide file tree

Showing 4 changed files with 291 additions and 7 deletions.
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -119,13 +119,30 @@ This is the full API documentation of the `pymfe` toolbox.
 .. autosummary::
    :toctree: generated/
 
-   concept.MFEConcept
+   clustering.MFEClustering
+
+
+.. _itemset_ref:
+
+:mod:`pymfe.itemset`: Itemset Meta-features
+===================================================
+
+.. automodule:: pymfe.itemset
+   :no-members:
+   :no-inherited-members:
+
+.. currentmodule:: pymfe
+
+.. autosummary::
+   :toctree: generated/
+
+   itemset.MFEItemset
 
 
 .. _concept_ref:
 
 :mod:`pymfe.concept`: Concept Meta-features
-===========================================
+===================================================
 
 .. automodule:: pymfe.concept
    :no-members:
@@ -136,4 +153,4 @@ This is the full API documentation of the `pymfe` toolbox.
 .. autosummary::
    :toctree: generated/
 
-   concept.MFEConcept
+   concept.MFEConcept
diff --git a/pymfe/_internal.py b/pymfe/_internal.py
@@ -83,7 +83,7 @@
 import pymfe.clustering as clustering
 import pymfe.model_based as model_based
 import pymfe.complexity as complexity
-# import pymfe.itemset as itemset
+import pymfe.itemset as itemset
 import pymfe.concept as concept
 import pymfe.scoring as scoring
 
@@ -106,7 +106,7 @@
     "relative",
     "clustering",
     "complexity",
-    # "itemset",
+    "itemset",
     "concept"
 )  # type: t.Tuple[str, ...]
 
@@ -119,7 +119,7 @@
     "landmarking",
     None,
     None,
-    # None,
+    None,
     None
 )  # type: t.Tuple[t.Optional[str], ...]
 
@@ -132,7 +132,7 @@
     relative.MFERelativeLandmarking,
     clustering.MFEClustering,
     complexity.MFEComplexity,
-    # itemset.MFEItemset,
+    itemset.MFEItemset,
     concept.MFEConcept
 )  # type: t.Tuple
 

diff --git a/pymfe/itemset.py b/pymfe/itemset.py
@@ -0,0 +1,187 @@
+"""Module dedicated to extraction of Itemset Metafeatures."""
+
+import typing as t
+import numpy as np
+
+
+class MFEItemset:
+    """Keep methods for metafeatures of ``Itemset`` group.
+
+    The convention adopted for metafeature extraction related methods is to
+    always start with ``ft_`` prefix to allow automatic method detection. This
+    prefix is predefined within ``_internal`` module.
+
+    All method signatures follow the conventions and restrictions listed below:
+
+    1. For independent attribute data, ``X`` means ``every type of attribute``,
+       ``N`` means ``Numeric attributes only`` and ``C`` stands for
+       ``Categorical attributes only``. It is important to note that the
+       categorical attribute sets between ``X`` and ``C`` and the numerical
+       attribute sets between ``X`` and ``N`` may differ due to data
+       transformations, performed while fitting data into MFE model,
+       enabled by, respectively, ``transform_num`` and ``transform_cat``
+       arguments from ``fit`` (MFE method).
+
+    2. Only arguments in MFE ``_custom_args_ft`` attribute (set up inside
+       ``fit`` method) are allowed to be required method arguments. All other
+       arguments must be strictly optional (i.e., have a predefined default
+       value).
+
+    3. The initial assumption is that the user can change any optional
+       argument, without any previous verification of argument value or its
+       type, via kwargs argument of ``extract`` method of MFE class.
+
+    4. The return value of all feature extraction methods should be a single
+       value or a generic Sequence (preferably a :obj:`np.ndarray`) type with
+       numeric values.
+
+    There is another type of method adopted for automatic detection. The
+    prefix ``precompute_`` is adopted for automatic detection of these
+    methods. These methods run automatically while fitting some data into an
+    MFE model, and their objective is to precompute some common value
+    shared between more than one feature extraction method. This strategy is a
+    trade-off between more system memory consumption and faster feature
+    extraction. Their return value must always be a dictionary whose keys are
+    possible extra arguments for both feature extraction methods and other
+    precomputation methods. Note that precomputed values are shared
+    between all valid feature-extraction modules (e.g., ``class_freqs``
+    computed in module ``statistical`` can freely be used for any
+    precomputation or feature extraction method of module ``landmarking``).
+    """
+
+    @classmethod
+    def precompute_binary_matrix(cls, C: t.Optional[np.ndarray],
+                                 **kwargs) -> t.Dict[str, t.Any]:
+        """Precompute the binary representation of attributes.
+
+        Parameters
+        ----------
+        C : :obj:`np.ndarray`, optional
+            Categorical attributes from fitted data.
+
+        **kwargs
+            Additional arguments. May hold values previously precomputed by
+            other precomputation methods, so they can help speed up
+            this precomputation.
+
+        Returns
+        -------
+        :obj:`dict`
+            With following precomputed items:
+                - ``itemset_binary_matrix`` (:obj:`list`): One boolean
+                  matrix (instances x distinct values) per attribute.
+        """
+        precomp_vals = {}  # type: t.Dict[str, t.Any]
+
+        if C is not None and "itemset_binary_matrix" not in kwargs:
+            itemset_binary_matrix = cls._matrix_to_binary(C)
+            precomp_vals["itemset_binary_matrix"] = itemset_binary_matrix
+
+        return precomp_vals
+
+    @staticmethod
+    def _array_to_binary(array: np.ndarray) -> np.ndarray:
+        """One-hot encode an array: a boolean column per distinct value."""
+        values = np.unique(array)
+        res = np.zeros((array.shape[0], values.shape[0]), dtype=bool)
+        for i, val in enumerate(values):
+            res[:, i] = array == val
+        return res
+
+    @staticmethod
+    def _matrix_to_binary(C: np.ndarray) -> t.List[np.ndarray]:
+        """Convert a matrix to its binary representation, column by column."""
+        return [MFEItemset._array_to_binary(col) for col in C.T]
+
+    @classmethod
+    def ft_two_itemset(
+            cls,
+            C: np.ndarray,
+            itemset_binary_matrix: t.Optional[t.List[np.ndarray]] = None,
+    ) -> np.ndarray:
+        """Compute the two itemset meta-feature.
+
+        The two itemset is the correlation information of each pair of values
+        from two distinct attributes, in binary format (XOR frequency).
+
+        Parameters
+        ----------
+        C : :obj:`np.ndarray`
+            Attributes from fitted data.
+
+        itemset_binary_matrix : :obj:`list`, optional
+            Binary representation of the attributes, one boolean matrix per
+            attribute in the dataset. Computed from ``C`` if not given.
+
+        Returns
+        -------
+        :obj:`np.ndarray`
+            An array with the twoitem for each pair of attribute values.
+
+        References
+        ----------
+        .. [1] Song, Q., Wang, G., & Wang, C. (2012). Automatic recommendation
+           of classification algorithms based on data set characteristics.
+           Pattern recognition, 45(7), 2672-2689.
+        """
+        if itemset_binary_matrix is None:
+            sub_dic = cls.precompute_binary_matrix(C)
+            itemset_binary_matrix = sub_dic["itemset_binary_matrix"]
+
+        B = itemset_binary_matrix
+
+        result = []  # type: t.List[float]
+        # Visit each unordered pair of attributes once. The list is only
+        # read (never consumed), because it may be a precomputed value
+        # shared with other feature extraction methods.
+        for ind, Bi in enumerate(B):
+            for Bj in B[ind + 1:]:
+                result.extend(
+                    np.sum(np.logical_xor(i, j)) for i in Bi.T for j in Bj.T
+                )
+
+        twoitem_by_attr = np.array(result) / C.shape[0]
+
+        return twoitem_by_attr
+
+    @classmethod
+    def ft_one_itemset(
+            cls,
+            C: np.ndarray,
+            itemset_binary_matrix: t.Optional[t.List[np.ndarray]] = None,
+    ) -> np.ndarray:
+        """Compute the one itemset meta-feature.
+
+        The one itemset meta-feature is the individual frequency of each
+        attribute value in binary format, i.e., the fraction of instances
+        holding that value.
+
+        Parameters
+        ----------
+        C : :obj:`np.ndarray`
+            Attributes from fitted data.
+
+        itemset_binary_matrix : :obj:`list`, optional
+            Binary representation of the attributes, one boolean matrix per
+            attribute in the dataset. Computed from ``C`` if not given.
+
+        Returns
+        -------
+        :obj:`np.ndarray`
+            An array with the oneitem for each attribute value.
+
+        References
+        ----------
+        .. [1] Song, Q., Wang, G., & Wang, C. (2012). Automatic recommendation
+           of classification algorithms based on data set characteristics.
+           Pattern recognition, 45(7), 2672-2689.
+        """
+        if itemset_binary_matrix is None:
+            sub_dic = cls.precompute_binary_matrix(C)
+            itemset_binary_matrix = sub_dic["itemset_binary_matrix"]
+
+        B = np.concatenate(itemset_binary_matrix, axis=1)
+
+        oneitem_by_attr = np.sum(B, axis=0) / C.shape[0]
+
+        return oneitem_by_attr
diff --git a/tests/test_itemset.py b/tests/test_itemset.py
@@ -0,0 +1,80 @@
+"""Test module for Itemset metafeatures."""
+import pytest
+import numpy as np
+
+from pymfe.mfe import MFE
+from tests.utils import load_xy
+from pymfe.itemset import MFEItemset
+
+
+GNAME = "itemset"
+
+
+class TestLandmarking():
+    """Tests for Itemset metafeatures (NOTE: misnamed TestLandmarking)."""
+
+    @pytest.mark.parametrize(
+        "dt_id, ft_name, exp_value, precompute",
+        [
+            ###################
+            # Mixed data
+            ###################
+            (0, 'one_itemset', [0.24999999, 0.0669328], True),
+            (0, 'two_itemset', [0.38297877, 0.10911008], True),
+            (0, 'one_itemset', [0.24999999, 0.0669328], False),
+            (0, 'two_itemset', [0.38297877, 0.10911008], False),
+            ###################
+            # Categorical data
+            ###################
+            (1, 'one_itemset', [0.49315068, 0.34882316], True),
+            (1, 'two_itemset', [0.5, 0.24335141], True),
+            (1, 'one_itemset', [0.49315068, 0.34882316], False),
+            (1, 'two_itemset', [0.5, 0.24335141], False),
+            ###################
+            # Numerical data
+            ###################
+            (2, 'one_itemset', [0.2, 0.049322903], True),
+            (2, 'two_itemset', [0.32, 0.084694475], True),
+            (2, 'one_itemset', [0.2, 0.049322903], False),
+            (2, 'two_itemset', [0.32, 0.084694475], False),
+        ])
+    def test_ft_methods_itemset(self, dt_id, ft_name, exp_value, precompute):
+        """Check each itemset meta-feature against its expected value.
+        """
+        precomp_group = GNAME if precompute else None
+
+        X, y = load_xy(dt_id)
+        mfe = MFE(
+            groups=[GNAME],
+            features=[ft_name],
+            random_state=1234)
+
+        mfe.fit(X.values, y.values, precomp_groups=precomp_group)
+
+        value = mfe.extract()[1]
+        # NOTE(review): exp_value is always a list above; this branch is dead.
+        if exp_value is np.nan:
+            assert value[0] is exp_value
+        else:
+            assert np.allclose(value, exp_value, equal_nan=True)
+
+    def test_itemset_using_author_dataset(self):
+        """Check both itemset meta-features against the toy dataset and results
+        used in the authors' paper.
+        """
+        C = np.array([[0, 2, 3], [2, 5, 0], [1, 4, 1], [0, 2, 2], [3, 3, 3],
+                      [3, 2, 3], [0, 2, 0], [1, 3, 1], [2, 4, 3], [1, 5, 2]])
+
+        value = MFEItemset.ft_one_itemset(C=C)
+        exp_value = [0.3, 0.3, 0.2, 0.2, 0.4, 0.2,
+                     0.2, 0.2, 0.2, 0.2, 0.2, 0.4]
+
+        assert np.allclose(value, exp_value, equal_nan=True)
+
+        value = MFEItemset.ft_two_itemset(C=C)
+        exp_value = [0.1, 0.5, 0.5]
+
+        assert np.allclose(value[[0, 1, 2]], exp_value, equal_nan=True)
+
+        exp_value = [0.2, 0.6]
+        assert np.allclose(value[[-2, -1]], exp_value, equal_nan=True)