Skip to content

Commit

Permalink
Merge pull request #49 from gmrukwa/develop
Browse files Browse the repository at this point in the history
Release 2.4.4

- `DiviK` class was not configurable. Now it's fixed.
- Unfit estimators may be dropped from auto-tuning KMeans implementations
  • Loading branch information
gmrukwa committed Feb 21, 2020
2 parents 3e8bc55 + 4894a30 commit 037ae46
Show file tree
Hide file tree
Showing 11 changed files with 66 additions and 12 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ on:
env:
MAJOR: ${{ 2 }}
MINOR: ${{ 4 }}
FIXUP: ${{ 3 }}
FIXUP: ${{ 4 }}
PACKAGE_INIT_FILE: ${{ 'divik/__init__.py' }}
PACKAGE_INIT_FILE_VERSION_LINE: ${{ 1 }}
PACKAGE_SETUP_FILE: ${{ 'setup.py' }}
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ docker pull gmrukwa/divik
To install specific version, you can specify it in the command, e.g.:

```bash
docker pull gmrukwa/divik:2.4.3
docker pull gmrukwa/divik:2.4.4
```

## Python package
Expand Down Expand Up @@ -79,7 +79,7 @@ pip install divik
or any stable tagged version, e.g.:

```bash
pip install divik==2.4.3
pip install divik==2.4.4
```

If you want to have compatibility with
Expand Down
2 changes: 1 addition & 1 deletion divik/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '2.4.3'
__version__ = '2.4.4'

from ._summary import plot, reject_split

Expand Down
17 changes: 15 additions & 2 deletions divik/cluster/_divik/_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,30 @@
from sklearn.utils.validation import check_is_fitted

from divik import _summary as summary, feature_selection as fs
from divik.core import context_if, DivikResult, normalize_rows, maybe_pool
from divik.core import (
configurable,
context_if,
DivikResult,
normalize_rows,
maybe_pool,
)
from ._backend import divik
from ._report import DivikReporter


# TODO: Describe the kmeans and fast_kmeans parameters
@configurable
class DiviK(BaseEstimator, ClusterMixin, TransformerMixin):
"""DiviK clustering
Parameters
----------
kmeans: AutoKMeans
A self-tuning KMeans estimator for the purpose of clustering
fast_kmeans: GAPSearch, optional, default: None
A self-tuning KMeans estimator for the purpose of stop condition
check. If None, the `kmeans` parameter is assumed to be the
`GAPSearch` instance.
distance: str, optional, default: 'correlation'
The distance metric between points, centroids and for GAP index
Expand Down
5 changes: 4 additions & 1 deletion divik/cluster/_kmeans/_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
distance : str, optional, default: 'euclidean'
Distance measure. One of the distances supported by scipy package.
init : {'percentile' or 'extreme'}
init : {'percentile', 'extreme' or 'kdtree'}
Method for initialization, defaults to 'percentile':
'percentile' : selects initial cluster centers for k-mean
Expand All @@ -185,6 +185,9 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
clustering starting from the furthest points to already specified
clusters
'kdtree': selects initial cluster centers for k-mean clustering
starting from centroids of KD-Tree boxes
percentile : float, default: 95.0
Specifies the starting percentile for 'percentile' initialization.
Must be within range [0.0, 100.0]. At 100.0 it is equivalent to
Expand Down
9 changes: 8 additions & 1 deletion divik/cluster/_kmeans/_dunn.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ class DunnSearch(BaseEstimator, ClusterMixin, TransformerMixin):
The number of jobs to use for the computation. This works by computing
each of the clustering & scoring runs in parallel.
drop_unfit: bool, default: False
If True, drops the estimators that did not fit the data.
verbose: bool, default: False
If True, shows progress with tqdm.
Expand Down Expand Up @@ -68,13 +71,15 @@ class DunnSearch(BaseEstimator, ClusterMixin, TransformerMixin):
"""
def __init__(self, kmeans: KMeans,
max_clusters: int, min_clusters: int = 2,
n_jobs: int = 1, verbose: bool = False):
n_jobs: int = 1, drop_unfit: bool = False,
verbose: bool = False):
super().__init__()
assert min_clusters <= max_clusters
self.kmeans = kmeans
self.min_clusters = min_clusters
self.max_clusters = max_clusters
self.n_jobs = n_jobs
self.drop_unfit = drop_unfit
self.verbose = verbose

def _fit_kmeans(self, n_clusters, data_ref):
Expand Down Expand Up @@ -121,6 +126,8 @@ def fit(self, X, y=None):
self.best_ = self.estimators_[best]
self.labels_ = self.best_.labels_
self.cluster_centers_ = self.best_.cluster_centers_
if self.drop_unfit:
self.estimators_ = None

return self

Expand Down
9 changes: 8 additions & 1 deletion divik/cluster/_kmeans/_gap.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ class GAPSearch(BaseEstimator, ClusterMixin, TransformerMixin):
Size of the sample used for GAP statistic computation. Used only if
introduces speedup.
drop_unfit: bool, default: False
If True, drops the estimators that did not fit the data.
verbose: bool, default: False
If True, shows progress with tqdm.
Expand Down Expand Up @@ -73,7 +76,8 @@ class GAPSearch(BaseEstimator, ClusterMixin, TransformerMixin):
def __init__(self, kmeans: KMeans,
max_clusters: int, min_clusters: int = 1,
n_jobs: int = 1, seed: int = 0, n_trials: int = 10,
sample_size: int = 1000, verbose: bool = False):
sample_size: int = 1000, drop_unfit: bool = False,
verbose: bool = False):
super().__init__()
assert min_clusters <= max_clusters
self.kmeans = kmeans
Expand All @@ -83,6 +87,7 @@ def __init__(self, kmeans: KMeans,
self.seed = seed
self.n_trials = n_trials
self.sample_size = sample_size
self.drop_unfit = drop_unfit
self.verbose = verbose

def _should_sample(self, data):
Expand Down Expand Up @@ -148,6 +153,8 @@ def fit(self, X, y=None):
self.n_clusters_ = self.best_.n_clusters
self.labels_ = self.best_.labels_
self.cluster_centers_ = self.best_.cluster_centers_
if self.drop_unfit:
self.estimators_ = None
else:
self.best_ = None
self.best_score_ = None
Expand Down
4 changes: 2 additions & 2 deletions docs/instructions/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ To install latest stable version use::

To install specific version, you can specify it in the command, e.g.::

docker pull gmrukwa/divik:2.4.3
docker pull gmrukwa/divik:2.4.4

Python package
--------------
Expand All @@ -31,7 +31,7 @@ package::

or any stable tagged version, e.g.::

pip install divik==2.4.3
pip install divik==2.4.4

If you want to have compatibility with
`gin-config <https://github.com/google/gin-config>`_, you can install
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import sys
import numpy

__version__ = '2.4.3'
__version__ = '2.4.4'

LINUX_OPTS = {
'extra_link_args': [
Expand Down
11 changes: 11 additions & 0 deletions test/cluster/kmeans/test_dunn.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,17 @@ def test_works_with_dunn(self, _, n_clusters):
rand = adjusted_rand_score(y, kmeans.labels_)
self.assertEqual(kmeans.n_clusters_, n_clusters)
self.assertGreater(rand, 0.75)

def test_works_with_unfit_removal(self):
n_clusters = 3
X, y = data(n_clusters)
single_kmeans = KMeans(n_clusters=2)
kmeans = DunnSearch(
single_kmeans, max_clusters=10, drop_unfit=True).fit(X)
rand = adjusted_rand_score(y, kmeans.labels_)
self.assertEqual(kmeans.n_clusters_, n_clusters)
self.assertGreater(rand, 0.75)
self.assertIsNone(kmeans.estimators_)


if __name__ == '__main__':
Expand Down
13 changes: 13 additions & 0 deletions test/cluster/kmeans/test_gap.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,19 @@ def test_works_with_sampled_gap(self, _, n_clusters):
self.assertLessEqual(kmeans.n_clusters_ - 1, n_clusters)
self.assertGreater(rand, 0.75)

def test_works_with_unfit_removal(self):
n_clusters = 3
X, y = data(n_clusters)
single_kmeans = KMeans(n_clusters=2)
kmeans = GAPSearch(
single_kmeans, max_clusters=10, drop_unfit=True).fit(X)
rand = adjusted_rand_score(y, kmeans.labels_)
# allow for misidentification of 1 cluster
self.assertGreaterEqual(kmeans.n_clusters_ + 1, n_clusters)
self.assertLessEqual(kmeans.n_clusters_ - 1, n_clusters)
self.assertGreater(rand, 0.75)
self.assertIsNone(kmeans.estimators_)


if __name__ == '__main__':
unittest.main()

0 comments on commit 037ae46

Please sign in to comment.