Merge pull request #53 from ealcobaca/clustering-updates

Clustering updates
ealcobaca · Dec 10, 2019 · f93ad2b · f93ad2b
2 parents ccd4870 + d25468b
commit f93ad2b
Show file tree

Hide file tree

Showing 3 changed files with 56 additions and 28 deletions.
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -103,3 +103,19 @@ This is the full API documentation of the `pymfe` toolbox.
    :toctree: generated/
 
    landmarking.MFELandmarking
+
+.. _clustering_ref:
+
+:mod:`pymfe.clustering`: Clustering Meta-features
+===================================================
+
+.. automodule:: pymfe.clustering
+   :no-members:
+   :no-inherited-members:
+
+.. currentmodule:: pymfe
+
+.. autosummary::
+   :toctree: generated/
+
+   clustering.MFEClustering
diff --git a/pymfe/clustering.py b/pymfe/clustering.py
@@ -161,8 +161,15 @@ def precompute_group_distances(cls,
                     classes=classes,
                     get_max_dist=False))
 
-            precomp_vals["intraclass_dists"] = (
-                precomp_vals["pairwise_intraclass_dists"].max(axis=1))
+            if precomp_vals["pairwise_intraclass_dists"].ndim == 2:
+                precomp_vals["intraclass_dists"] = (
+                    precomp_vals["pairwise_intraclass_dists"].max(axis=1))
+
+            else:
+                precomp_vals["intraclass_dists"] = np.array([
+                    np.max(class_arr)
+                    for class_arr in precomp_vals["pairwise_intraclass_dists"]
+                ])
 
         return precomp_vals
 
@@ -302,10 +309,7 @@ class (effectively holding the same result as if the argument
         if not {"representative"}.issubset(kwargs):
             precomp_vals["representative"] = (
                 MFEClustering._get_class_representatives(
-                    N=N,
-                    y=y,
-                    representative=representative,
-                    classes=classes))
+                    N=N, y=y, representative=representative, classes=classes))
 
         return precomp_vals
 
@@ -590,14 +594,12 @@ def ft_int(
         return pairwise_norm_interclass_dist.sum() * norm_factor
 
     @classmethod
-    def ft_sil(
-            cls,
-            N: np.ndarray,
-            y: np.ndarray,
-            dist_metric: str = "euclidean",
-            sample_size: t.Optional[int] = None,
-            random_state: t.Optional[int] = None
-    ) -> float:
+    def ft_sil(cls,
+               N: np.ndarray,
+               y: np.ndarray,
+               dist_metric: str = "euclidean",
+               sample_frac: t.Optional[int] = None,
+               random_state: t.Optional[int] = None) -> float:
         """Calculate the mean silhouette value from ``N``.
 
         Metric range is -1 to +1 (both inclusive).
@@ -611,12 +613,12 @@ def ft_sil(
             instances. Check `distmetric`_ for a full list of valid
             distance metrics.
 
-        sample_size : :obj:`int`, optional
-            Sample size used to compute the silhouette coefficient. If
-            None is used, then all data is used.
+        sample_frac : :obj:`float`, optional
+            Fraction of the instances (e.g. 0.5 for half of them) sampled to
+            compute the silhouette coefficient. If None, all data is used.
 
         random_state : :obj:`int`, optional
-            Used if ``sample_size`` is not None. Random seed used while
+            Used if ``sample_frac`` is not None. Random seed used while
             sampling the data.
 
         Returns
@@ -631,9 +633,10 @@ def ft_sil(
             .. _distmetric: :obj:`sklearn.neighbors.DistanceMetric`
                 documentation.
         """
+        sample_size = N.shape[0]
 
-        if sample_size is not None:
-            sample_size = int(sample_size*len(N))
+        if sample_frac is not None:
+            sample_size = int(sample_frac * sample_size)
 
         silhouette = sklearn.metrics.silhouette_score(
             X=N,
@@ -684,10 +687,7 @@ def ft_pb(
         return correlation
 
     @classmethod
-    def ft_ch(
-            cls,
-            N: np.ndarray,
-            y: np.ndarray) -> float:
+    def ft_ch(cls, N: np.ndarray, y: np.ndarray) -> float:
         """Calinski and Harabasz index.
         Check `cahascore`_ for more information.
 
@@ -733,7 +733,7 @@ def ft_sc(cls,
               y: np.ndarray,
               size: int = 15,
               class_freqs: t.Optional[np.ndarray] = None,
-              normalize: bool = False) -> t.Union[int]:
+              normalize: bool = False) -> int:
         """Number of clusters with size smaller than ``size``.
 
         Parameters

diff --git a/tests/test_clustering.py b/tests/test_clustering.py
@@ -73,9 +73,10 @@ class TestClustering():
             (2, 'vdb', 0.7517428073901388, True),
             (2, 'vdu', 2.3392212797698888e-05, True),
         ])
-    def test_ft_methods_general(self, dt_id, ft_name, exp_value, precompute):
-        """Function to test each meta-feature belongs to general group.
-        """
+    def test_ft_methods_clustering(self, dt_id, ft_name, exp_value,
+                                   precompute):
+        """Test each meta-feature that belongs to the clustering group."""
+
         precomp_group = GNAME if precompute else None
         X, y = load_xy(dt_id)
         mfe = MFE(
@@ -89,6 +90,17 @@ def test_ft_methods_general(self, dt_id, ft_name, exp_value, precompute):
         else:
             assert np.allclose(value, exp_value)
 
+    @pytest.mark.parametrize("precompute", [False, True])
+    def test_silhouette_subsampling(self, precompute):
+        X, y = load_xy(0)
+        precomp_group = GNAME if precompute else None
+        mfe = MFE(
+            features="sil", random_state=1234).fit(
+                X.values, y.values, precomp_groups=precomp_group)
+        value = mfe.extract(sil={"sample_frac": 0.5})[1]
+
+        assert np.allclose(value, -0.07137712254830314)
+
     @staticmethod
     def test_precompute_nearest_neighbors():
         N = np.array([[1, 2, 3], [4, 5, 6]])