Skip to content

Commit

Permalink
Merge pull request #53 from ealcobaca/clustering-updates
Browse files Browse the repository at this point in the history
Clustering updates
  • Loading branch information
ealcobaca committed Dec 10, 2019
2 parents ccd4870 + d25468b commit f93ad2b
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 28 deletions.
16 changes: 16 additions & 0 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,19 @@ This is the full API documentation of the `pymfe` toolbox.
:toctree: generated/

landmarking.MFELandmarking

.. _clustering_ref:

:mod:`pymfe.clustering`: Clustering Meta-features
===================================================

.. automodule:: pymfe.clustering
:no-members:
:no-inherited-members:

.. currentmodule:: pymfe

.. autosummary::
:toctree: generated/

clustering.MFEClustering
50 changes: 25 additions & 25 deletions pymfe/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,15 @@ def precompute_group_distances(cls,
classes=classes,
get_max_dist=False))

precomp_vals["intraclass_dists"] = (
precomp_vals["pairwise_intraclass_dists"].max(axis=1))
if precomp_vals["pairwise_intraclass_dists"].ndim == 2:
precomp_vals["intraclass_dists"] = (
precomp_vals["pairwise_intraclass_dists"].max(axis=1))

else:
precomp_vals["intraclass_dists"] = np.array([
np.max(class_arr)
for class_arr in precomp_vals["pairwise_intraclass_dists"]
])

return precomp_vals

Expand Down Expand Up @@ -302,10 +309,7 @@ class (effectively holding the same result as if the argument
if not {"representative"}.issubset(kwargs):
precomp_vals["representative"] = (
MFEClustering._get_class_representatives(
N=N,
y=y,
representative=representative,
classes=classes))
N=N, y=y, representative=representative, classes=classes))

return precomp_vals

Expand Down Expand Up @@ -590,14 +594,12 @@ def ft_int(
return pairwise_norm_interclass_dist.sum() * norm_factor

@classmethod
def ft_sil(
cls,
N: np.ndarray,
y: np.ndarray,
dist_metric: str = "euclidean",
sample_size: t.Optional[int] = None,
random_state: t.Optional[int] = None
) -> float:
def ft_sil(cls,
N: np.ndarray,
y: np.ndarray,
dist_metric: str = "euclidean",
sample_frac: t.Optional[int] = None,
random_state: t.Optional[int] = None) -> float:
"""Calculate the mean silhouette value from ``N``.
Metric range is -1 to +1 (both inclusive).
Expand All @@ -611,12 +613,12 @@ def ft_sil(
instances. Check `distmetric`_ for a full list of valid
distance metrics.
sample_size : :obj:`int`, optional
Sample size used to compute the silhouette coefficient. If
None is used, then all data is used.
sample_frac : :obj:`float`, optional
Fraction of the instances in ``N`` sampled to compute the silhouette
coefficient (e.g. 0.5 uses half of the instances). If None is given,
then all data is used.
random_state : :obj:`int`, optional
Used if ``sample_size`` is not None. Random seed used while
Used if ``sample_frac`` is not None. Random seed used while
sampling the data.
Returns
Expand All @@ -631,9 +633,10 @@ def ft_sil(
.. _distmetric: :obj:`sklearn.neighbors.DistanceMetric`
documentation.
"""
sample_size = N.shape[0]

if sample_size is not None:
sample_size = int(sample_size*len(N))
if sample_frac is not None:
sample_size = int(sample_frac * sample_size)

silhouette = sklearn.metrics.silhouette_score(
X=N,
Expand Down Expand Up @@ -684,10 +687,7 @@ def ft_pb(
return correlation

@classmethod
def ft_ch(
cls,
N: np.ndarray,
y: np.ndarray) -> float:
def ft_ch(cls, N: np.ndarray, y: np.ndarray) -> float:
"""Calinski and Harabasz index.
Check `cahascore`_ for more information.
Expand Down Expand Up @@ -733,7 +733,7 @@ def ft_sc(cls,
y: np.ndarray,
size: int = 15,
class_freqs: t.Optional[np.ndarray] = None,
normalize: bool = False) -> t.Union[int]:
normalize: bool = False) -> int:
"""Number of clusters with size smaller than ``size``.
Parameters
Expand Down
18 changes: 15 additions & 3 deletions tests/test_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,10 @@ class TestClustering():
(2, 'vdb', 0.7517428073901388, True),
(2, 'vdu', 2.3392212797698888e-05, True),
])
def test_ft_methods_general(self, dt_id, ft_name, exp_value, precompute):
"""Function to test each meta-feature belongs to general group.
"""
def test_ft_methods_clustering(self, dt_id, ft_name, exp_value,
precompute):
"""Function to test each meta-feature belongs to clustering group."""

precomp_group = GNAME if precompute else None
X, y = load_xy(dt_id)
mfe = MFE(
Expand All @@ -89,6 +90,17 @@ def test_ft_methods_general(self, dt_id, ft_name, exp_value, precompute):
else:
assert np.allclose(value, exp_value)

@pytest.mark.parametrize("precompute", [False, True])
def test_silhouette_subsampling(self, precompute):
    """Check the ``sil`` meta-feature when extracted with ``sample_frac``.

    Runs once without and once with ``precomp_groups`` set to ``GNAME``
    and compares the extracted value against a pinned reference number.
    """
    X, y = load_xy(0)
    precomp_group = GNAME if precompute else None
    # Fixed random_state so the pinned expected value below is reproducible
    # (presumably it seeds the subsampling -- confirm against MFE).
    mfe = MFE(
        features="sil", random_state=1234).fit(
            X.values, y.values, precomp_groups=precomp_group)
    # Per-feature arguments are passed to extract() keyed by feature name;
    # index [1] presumably selects the values part of the result -- TODO
    # confirm against the MFE.extract return format.
    value = mfe.extract(sil={"sample_frac": 0.5})[1]

    assert np.allclose(value, -0.07137712254830314)

@staticmethod
def test_precompute_nearest_neighbors():
N = np.array([[1, 2, 3], [4, 5, 6]])
Expand Down

0 comments on commit f93ad2b

Please sign in to comment.