Merge pull request #56 from gmrukwa/develop
Release 2.5.0
gmrukwa committed Mar 16, 2020
2 parents 106dad1 + b79ead6 commit 0fc7dc5
Showing 11 changed files with 287 additions and 45 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/deploy.yml
@@ -11,8 +11,8 @@ on:

env:
  MAJOR: ${{ 2 }}
  MINOR: ${{ 4 }}
  FIXUP: ${{ 8 }}
  MINOR: ${{ 5 }}
  FIXUP: ${{ 0 }}
  PACKAGE_INIT_FILE: ${{ 'divik/__init__.py' }}
  PACKAGE_INIT_FILE_VERSION_LINE: ${{ 1 }}
  PACKAGE_SETUP_FILE: ${{ 'setup.py' }}
14 changes: 3 additions & 11 deletions README.md
@@ -40,14 +40,14 @@ docker pull gmrukwa/divik
To install a specific version, you can specify it in the command, e.g.:

```bash
docker pull gmrukwa/divik:2.4.8
docker pull gmrukwa/divik:2.5.0
```

## Python package

Prerequisites for installation of base package:

- Python 3.6 / 3.7
- Python 3.6 / 3.7 / 3.8
- compiler capable of compiling the native C code, with OpenMP support

#### Installation of OpenMP for Ubuntu / Debian
@@ -79,7 +79,7 @@ pip install divik
or any stable tagged version, e.g.:

```bash
pip install divik==2.4.8
pip install divik==2.5.0
```

If you want to have compatibility with
@@ -92,14 +92,6 @@ pip install divik[gin]

**Note:** Remember to escape `[` and `]` with `\` in the `zsh` shell.

## Known Issues

### Mac OS & Numba

Compilation of certain code is not supported on Mac OS, as it started to
freeze. It is therefore disabled, and the code runs slower. You can try it
yourself by installing the `numba` package, but I had no luck up to this point.

# References

This software is part of contribution made by [Data Mining Group of Silesian
2 changes: 1 addition & 1 deletion divik/__init__.py
@@ -1,4 +1,4 @@
__version__ = '2.4.8'
__version__ = '2.5.0'

from ._summary import plot, reject_split

87 changes: 80 additions & 7 deletions divik/cluster/_kmeans/_dunn.py
@@ -9,7 +9,10 @@

from divik.core import configurable
from ._core import KMeans
from divik.score import dunn
from divik.score import (
    dunn,
    sampled_dunn,
)
from divik.core import maybe_pool, share


@@ -35,6 +38,33 @@ class DunnSearch(BaseEstimator, ClusterMixin, TransformerMixin):
    min_clusters: int, default: 2
        The minimal number of clusters to form and score.
    method: {'full', 'sampled', 'auto'}, default: 'full'
        Whether to run full computations or an approximation.
        - full - always computes the full Dunn's index, without sampling
        - sampled - samples the clusters to reduce computational overhead
        - auto - switches between the above methods to provide the best
          performance-quality trade-off
    inter : {'centroid', 'closest'}, default: 'centroid'
        How the distance between clusters is computed. For more details see
        `dunn`.
    intra : {'avg', 'furthest'}, default: 'avg'
        How the cluster internal distance is computed. For more details see
        `dunn`.
    sample_size : int, default: 1000
        Size of the sample used to compute the Dunn index in the `auto` or
        `sampled` scenario.
    n_trials : int, default: 10
        Number of trials to use when computing the Dunn index in the `auto`
        or `sampled` scenario.
    seed : int, default: 42
        Random seed for the reproducibility of subset draws in the Dunn
        `auto` or `sampled` scenario.
    n_jobs: int, default: 1
        The number of jobs to use for the computation. This works by
        computing each of the clustering & scoring runs in parallel.
@@ -56,8 +86,8 @@ class DunnSearch(BaseEstimator, ClusterMixin, TransformerMixin):
    estimators_: List[KMeans]
        KMeans instances for n_clusters in range [min_clusters, max_clusters].
    scores_: array, [max_clusters - min_clusters + 1, ?]
        Array with scores for each estimator in each row.
    scores_: array, [max_clusters - min_clusters + 1,]
        Array with scores for each estimator.
    n_clusters_: int
        Estimated optimal number of clusters.
@@ -69,25 +99,68 @@ class DunnSearch(BaseEstimator, ClusterMixin, TransformerMixin):
        The optimal estimator.
    """
    def __init__(self, kmeans: KMeans,
                 max_clusters: int, min_clusters: int = 2,
                 n_jobs: int = 1, drop_unfit: bool = False,
    def __init__(self,
                 kmeans: KMeans,
                 max_clusters: int,
                 min_clusters: int = 2,
                 method='full',
                 inter='centroid',
                 intra='avg',
                 sample_size=1000,
                 n_trials=10,
                 seed=42,
                 n_jobs: int = 1,
                 drop_unfit: bool = False,
                 verbose: bool = False):
        super().__init__()
        assert min_clusters <= max_clusters
        self.kmeans = kmeans
        self.min_clusters = min_clusters
        self.max_clusters = max_clusters
        self.method = method
        self.inter = inter
        self.intra = intra
        self.sample_size = sample_size
        self.n_trials = n_trials
        self.seed = seed
        self.n_jobs = n_jobs
        self.drop_unfit = drop_unfit
        self.verbose = verbose

    def _n_ops(self, data):
        if self.inter == 'closest' or self.intra == 'furthest':
            n_ops_full = data.shape[0] ** 2
            n_ops_sampled = self.n_trials * self.sample_size ** 2
        else:
            n_ops_full = data.shape[0]
            n_ops_sampled = self.n_trials * self.sample_size
        return n_ops_full, n_ops_sampled

    def _sampled_dunn(self, kmeans, data, inter, intra):
        return sampled_dunn(kmeans, data, inter=inter, intra=intra,
                            sample_size=self.sample_size, n_jobs=self.n_jobs,
                            seed=self.seed, n_trials=self.n_trials)

    def _dunn(self, kmeans, data):
        n_ops_full, n_ops_sampled = self._n_ops(data)
        if self.method == 'full':
            dunn_ = dunn
        elif self.method == 'sampled':
            dunn_ = self._sampled_dunn
        elif self.method == 'auto' and n_ops_full <= n_ops_sampled:
            dunn_ = dunn
        elif self.method == 'auto' and n_ops_full > n_ops_sampled:
            dunn_ = self._sampled_dunn
        else:
            raise ValueError(f"Unknown Dunn method {self.method}")
        return dunn_(kmeans, data, inter=self.inter, intra=self.intra)

    def _fit_kmeans(self, n_clusters, data_ref):
        data = _DATA[data_ref].value
        kmeans = clone(self.kmeans)
        kmeans.n_clusters = n_clusters
        kmeans.fit(data)
        d = dunn(kmeans, data)
        d = self._dunn(kmeans, data)
        return kmeans, d

    def fit(self, X, y=None):
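The `auto` method above switches between the exact and the sampled Dunn's index by comparing the operation counts estimated in `_n_ops`. For illustration, here is a minimal sketch of how the new search parameters might be used together; it assumes `divik.cluster` re-exports `KMeans` and `DunnSearch` with the constructor from this diff, and all sizes are made up:

```python
import numpy as np

from divik.cluster import DunnSearch, KMeans

X = np.random.default_rng(0).normal(size=(20_000, 10))

# Template estimator; DunnSearch clones it and overrides n_clusters.
kmeans = KMeans(n_clusters=2)

search = DunnSearch(
    kmeans,
    max_clusters=10,
    min_clusters=2,
    method='auto',     # choose full vs. sampled Dunn by estimated cost
    inter='closest',   # pairwise variants cost ~ data.shape[0] ** 2, so
    intra='furthest',  # here 20_000 ** 2 > 10 * 1_000 ** 2 -> sampled wins
    sample_size=1_000,
    n_trials=10,
    seed=42,
).fit(X)

print(search.n_clusters_)  # estimated optimal number of clusters
print(search.scores_)      # one Dunn score per candidate estimator
```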
2 changes: 1 addition & 1 deletion divik/score/__init__.py
@@ -1,3 +1,3 @@
from ._dunn import dunn
from ._dunn import dunn, sampled_dunn
from ._gap import gap
from ._sampled_gap import sampled_gap
113 changes: 103 additions & 10 deletions divik/score/_dunn.py
@@ -1,14 +1,67 @@
from functools import partial
from typing import Union

import numpy as np
import pandas as pd
from scipy.spatial import distance as dist

from divik.core import Data
from divik.core import configurable, Data, maybe_pool
from divik.sampler import BaseSampler, StratifiedSampler


KMeans = 'divik.cluster.KMeans'
_BIG_PRIME = 49277


def _inter_centroid(kmeans: KMeans, data: Data, labels=None):
    d = dist.pdist(kmeans.cluster_centers_, kmeans.distance)
    return np.min(d[d != 0])


def _inter_closest(kmeans: KMeans, data: Data, labels=None):
    if labels is None:
        labels = kmeans.labels_
    d = np.inf
    for label in np.arange(kmeans.n_clusters - 1):
        grp = label == labels
        non_grp = label < labels
        dst = dist.cdist(data[grp], data[non_grp], metric=kmeans.distance)
        d = np.minimum(d, dst.min())
    return d


def _intra_avg(kmeans: KMeans, data: Data, labels=None):
    if labels is None:
        labels = kmeans.labels_
    clusters = pd.DataFrame(data).groupby(labels).apply(np.asarray)
    return np.max([
        np.mean(dist.cdist(cluster, centroid.reshape(1, -1), kmeans.distance))
        for cluster, centroid in zip(clusters, kmeans.cluster_centers_)
    ])


KMeans = 'divik.KMeans'
def _intra_furthest(kmeans: KMeans, data: Data, labels=None):
    def max_distance(group):
        group = np.asarray(group)
        d = dist.pdist(group, metric=kmeans.distance)
        return np.max(d)
    if labels is None:
        labels = kmeans.labels_
    return pd.DataFrame(data).groupby(labels).apply(max_distance).max()


def dunn(kmeans: KMeans, data: Data) -> float:
_INTER = {
    'centroid': _inter_centroid,
    'closest': _inter_closest,
}
_INTRA = {
    'avg': _intra_avg,
    'furthest': _intra_furthest,
}


@configurable
def dunn(kmeans: KMeans, data: Data, inter='centroid', intra='avg') -> float:
"""Compute Dunn's index for the clustering
Parameters
@@ -19,19 +72,59 @@ def dunn(kmeans: KMeans, data: Data) -> float:
    data : array, shape (n_samples, n_features)
        Clustered data
    inter : {'centroid', 'closest'}
        Method of computing intercluster distance
        - centroid - uses distances between centroids
        - closest - uses distance between closest members of separate clusters
    intra : {'avg', 'furthest'}
        Method of computing intracluster distance
        - avg - uses average distance to the centroid
        - furthest - uses distance between the furthest cluster members

    Returns
    -------
    dunn_index : float
        Value of Dunn's index for the clustering of data
    """
    if kmeans.cluster_centers_.shape[0] == 1:
        return -np.inf
    clusters = pd.DataFrame(data).groupby(kmeans.labels_).apply(np.asarray)
    intercluster = dist.pdist(kmeans.cluster_centers_, kmeans.distance)
    intercluster = np.min(intercluster[intercluster != 0])
    intracluster = np.max([
        np.mean(dist.cdist(cluster, centroid.reshape(1, -1), kmeans.distance))
        for cluster, centroid in zip(clusters, kmeans.cluster_centers_)
    ])
    if inter not in _INTER:
        raise ValueError(f'Unsupported intercluster distance {inter}. '
                         f'Supported: {list(_INTER.keys())}')
    if intra not in _INTRA:
        raise ValueError(f'Unsupported intracluster distance {intra}. '
                         f'Supported: {list(_INTRA.keys())}')
    intercluster = _INTER[inter](kmeans, data)
    intracluster = _INTRA[intra](kmeans, data)
    score = intercluster / intracluster
    return score


def _sample_distances(seed: int, sampler: BaseSampler, kmeans: KMeans,
                      inter='centroid', intra='avg'):
    data = sampler.get_sample(seed)
    labels = kmeans.predict(data)
    inter_ = _INTER[inter](kmeans, data, labels)
    intra_ = _INTRA[intra](kmeans, data, labels)
    return inter_, intra_


@configurable
def sampled_dunn(kmeans: KMeans, data: Data,
                 sample_size: Union[int, float] = 1000,
                 n_jobs: int = None,
                 seed: int = 0,
                 n_trials: int = 10,
                 inter='closest', intra='furthest') -> float:
    data_ = StratifiedSampler(n_rows=sample_size, n_samples=n_trials
                              ).fit(data, kmeans.labels_)
    seeds = list(seed + np.arange(n_trials) * _BIG_PRIME)
    with data_.parallel() as d, maybe_pool(n_jobs, initializer=d.initializer,
                                           initargs=d.initargs) as pool:
        distances = partial(_sample_distances, sampler=d, kmeans=kmeans,
                            inter=inter, intra=intra)
        inter_, intra_ = np.array(pool.map(distances, seeds)).T
    v_inter = inter_.var()
    v_intra = intra_.var()
    return (inter_.min() - v_inter) / (intra_.max() + v_intra)
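Taken together, `dunn` computes the ratio of the smallest intercluster distance to the largest intracluster distance, while `sampled_dunn` estimates it from `n_trials` stratified subsamples and penalizes the estimate by the variance across trials. A minimal sketch of calling both directly, assuming a divik `KMeans` fitted beforehand (the functions rely on its `distance`, `labels_`, `cluster_centers_` and `predict` members):

```python
import numpy as np

from divik.cluster import KMeans
from divik.score import dunn, sampled_dunn

X = np.random.default_rng(1).normal(size=(5_000, 8))
kmeans = KMeans(n_clusters=3).fit(X)

# Exact index with the default distances: centroid-to-centroid between
# clusters, average distance to the centroid within each cluster.
exact = dunn(kmeans, X, inter='centroid', intra='avg')

# Sampled estimate of the stricter variant; 'closest'/'furthest' are
# quadratic in the number of points, so subsampling keeps them tractable.
estimate = sampled_dunn(kmeans, X, sample_size=500, n_trials=5,
                        seed=0, inter='closest', intra='furthest')
```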
2 changes: 1 addition & 1 deletion divik/score/_sampled_gap.py
@@ -9,7 +9,7 @@
from divik.score._gap import _sampled_dispersion as _dispersion


KMeans = 'divik.KMeans'
KMeans = 'divik.cluster.KMeans'
_BIG_PRIME = 40013


6 changes: 3 additions & 3 deletions docs/instructions/installation.rst
@@ -14,14 +14,14 @@ To install latest stable version use::

To install a specific version, you can specify it in the command, e.g.::

docker pull gmrukwa/divik:2.4.8
docker pull gmrukwa/divik:2.5.0

Python package
--------------

Prerequisites for installation of base package:

- Python 3.6 / 3.7
- Python 3.6 / 3.7 / 3.8
- compiler capable of compiling the native C code

Having prerequisites installed, one can install latest base version of the
@@ -31,7 +31,7 @@ package::

or any stable tagged version, e.g.::

pip install divik==2.4.8
pip install divik==2.5.0

If you want to have compatibility with
`gin-config <https://github.com/google/gin-config>`_, you can install
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@
import sys
import numpy

__version__ = '2.4.8'
__version__ = '2.5.0'

LINUX_OPTS = {
    'extra_link_args': [
