Skip to content

Commit

Permalink
Merge pull request #49 from gmrukwa/develop
Browse files Browse the repository at this point in the history
Release 2.4.4

- `DiviK` class was not configurable. Now it's fixed.
- Unfit estimators may be dropped from auto-tuning KMeans implementations
  • Loading branch information
gmrukwa committed Feb 21, 2020
2 parents 3e8bc55 + 4894a30 commit 037ae46
Show file tree
Hide file tree
Showing 11 changed files with 66 additions and 12 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ on:
env:
MAJOR: ${{ 2 }}
MINOR: ${{ 4 }}
FIXUP: ${{ 3 }}
FIXUP: ${{ 4 }}
PACKAGE_INIT_FILE: ${{ 'divik/__init__.py' }}
PACKAGE_INIT_FILE_VERSION_LINE: ${{ 1 }}
PACKAGE_SETUP_FILE: ${{ 'setup.py' }}
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ docker pull gmrukwa/divik
To install specific version, you can specify it in the command, e.g.:

```bash
docker pull gmrukwa/divik:2.4.3
docker pull gmrukwa/divik:2.4.4
```

## Python package
Expand Down Expand Up @@ -79,7 +79,7 @@ pip install divik
or any stable tagged version, e.g.:

```bash
pip install divik==2.4.3
pip install divik==2.4.4
```

If you want to have compatibility with
Expand Down
2 changes: 1 addition & 1 deletion divik/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '2.4.3'
__version__ = '2.4.4'

from ._summary import plot, reject_split

Expand Down
17 changes: 15 additions & 2 deletions divik/cluster/_divik/_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,30 @@
from sklearn.utils.validation import check_is_fitted

from divik import _summary as summary, feature_selection as fs
from divik.core import context_if, DivikResult, normalize_rows, maybe_pool
from divik.core import (
configurable,
context_if,
DivikResult,
normalize_rows,
maybe_pool,
)
from ._backend import divik
from ._report import DivikReporter


# TODO: Describe the kmeans and fast_kmeans parameters
@configurable
class DiviK(BaseEstimator, ClusterMixin, TransformerMixin):
"""DiviK clustering
Parameters
----------
kmeans: AutoKMeans
A self-tuning KMeans estimator for the purpose of clustering
fast_kmeans: GAPSearch, optional, default: None
A self-tuning KMeans estimator for the purpose of stop condition
check. If None, the `kmeans` parameter is assumed to be the
`GAPSearch` instance.
distance: str, optional, default: 'correlation'
The distance metric between points, centroids and for GAP index
Expand Down
5 changes: 4 additions & 1 deletion divik/cluster/_kmeans/_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
distance : str, optional, default: 'euclidean'
Distance measure. One of the distances supported by scipy package.
init : {'percentile' or 'extreme'}
init : {'percentile', 'extreme' or 'kdtree'}
Method for initialization, defaults to 'percentile':
'percentile' : selects initial cluster centers for k-mean
Expand All @@ -185,6 +185,9 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
clustering starting from the furthest points to already specified
clusters
'kdtree': selects initial cluster centers for k-mean clustering
starting from centroids of KD-Tree boxes
percentile : float, default: 95.0
Specifies the starting percentile for 'percentile' initialization.
Must be within range [0.0, 100.0]. At 100.0 it is equivalent to
Expand Down
9 changes: 8 additions & 1 deletion divik/cluster/_kmeans/_dunn.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ class DunnSearch(BaseEstimator, ClusterMixin, TransformerMixin):
The number of jobs to use for the computation. This works by computing
each of the clustering & scoring runs in parallel.
drop_unfit: bool, default: False
If True, drops the estimators that did not fit the data.
verbose: bool, default: False
If True, shows progress with tqdm.
Expand Down Expand Up @@ -68,13 +71,15 @@ class DunnSearch(BaseEstimator, ClusterMixin, TransformerMixin):
"""
def __init__(self, kmeans: KMeans,
max_clusters: int, min_clusters: int = 2,
n_jobs: int = 1, verbose: bool = False):
n_jobs: int = 1, drop_unfit: bool = False,
verbose: bool = False):
super().__init__()
assert min_clusters <= max_clusters
self.kmeans = kmeans
self.min_clusters = min_clusters
self.max_clusters = max_clusters
self.n_jobs = n_jobs
self.drop_unfit = drop_unfit
self.verbose = verbose

def _fit_kmeans(self, n_clusters, data_ref):
Expand Down Expand Up @@ -121,6 +126,8 @@ def fit(self, X, y=None):
self.best_ = self.estimators_[best]
self.labels_ = self.best_.labels_
self.cluster_centers_ = self.best_.cluster_centers_
if self.drop_unfit:
self.estimators_ = None

return self

Expand Down
9 changes: 8 additions & 1 deletion divik/cluster/_kmeans/_gap.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ class GAPSearch(BaseEstimator, ClusterMixin, TransformerMixin):
Size of the sample used for GAP statistic computation. Used only if
introduces speedup.
drop_unfit: bool, default: False
If True, drops the estimators that did not fit the data.
verbose: bool, default: False
If True, shows progress with tqdm.
Expand Down Expand Up @@ -73,7 +76,8 @@ class GAPSearch(BaseEstimator, ClusterMixin, TransformerMixin):
def __init__(self, kmeans: KMeans,
max_clusters: int, min_clusters: int = 1,
n_jobs: int = 1, seed: int = 0, n_trials: int = 10,
sample_size: int = 1000, verbose: bool = False):
sample_size: int = 1000, drop_unfit: bool = False,
verbose: bool = False):
super().__init__()
assert min_clusters <= max_clusters
self.kmeans = kmeans
Expand All @@ -83,6 +87,7 @@ def __init__(self, kmeans: KMeans,
self.seed = seed
self.n_trials = n_trials
self.sample_size = sample_size
self.drop_unfit = drop_unfit
self.verbose = verbose

def _should_sample(self, data):
Expand Down Expand Up @@ -148,6 +153,8 @@ def fit(self, X, y=None):
self.n_clusters_ = self.best_.n_clusters
self.labels_ = self.best_.labels_
self.cluster_centers_ = self.best_.cluster_centers_
if self.drop_unfit:
self.estimators_ = None
else:
self.best_ = None
self.best_score_ = None
Expand Down
4 changes: 2 additions & 2 deletions docs/instructions/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ To install latest stable version use::

To install specific version, you can specify it in the command, e.g.::

docker pull gmrukwa/divik:2.4.3
docker pull gmrukwa/divik:2.4.4

Python package
--------------
Expand All @@ -31,7 +31,7 @@ package::

or any stable tagged version, e.g.::

pip install divik==2.4.3
pip install divik==2.4.4

If you want to have compatibility with
`gin-config <https://github.com/google/gin-config>`_, you can install
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import sys
import numpy

__version__ = '2.4.3'
__version__ = '2.4.4'

LINUX_OPTS = {
'extra_link_args': [
Expand Down
11 changes: 11 additions & 0 deletions test/cluster/kmeans/test_dunn.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,17 @@ def test_works_with_dunn(self, _, n_clusters):
rand = adjusted_rand_score(y, kmeans.labels_)
self.assertEqual(kmeans.n_clusters_, n_clusters)
self.assertGreater(rand, 0.75)

def test_works_with_unfit_removal(self):
n_clusters = 3
X, y = data(n_clusters)
single_kmeans = KMeans(n_clusters=2)
kmeans = DunnSearch(
single_kmeans, max_clusters=10, drop_unfit=True).fit(X)
rand = adjusted_rand_score(y, kmeans.labels_)
self.assertEqual(kmeans.n_clusters_, n_clusters)
self.assertGreater(rand, 0.75)
self.assertIsNone(kmeans.estimators_)


if __name__ == '__main__':
Expand Down
13 changes: 13 additions & 0 deletions test/cluster/kmeans/test_gap.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,19 @@ def test_works_with_sampled_gap(self, _, n_clusters):
self.assertLessEqual(kmeans.n_clusters_ - 1, n_clusters)
self.assertGreater(rand, 0.75)

def test_works_with_unfit_removal(self):
n_clusters = 3
X, y = data(n_clusters)
single_kmeans = KMeans(n_clusters=2)
kmeans = GAPSearch(
single_kmeans, max_clusters=10, drop_unfit=True).fit(X)
rand = adjusted_rand_score(y, kmeans.labels_)
# allow for misidentification of 1 cluster
self.assertGreaterEqual(kmeans.n_clusters_ + 1, n_clusters)
self.assertLessEqual(kmeans.n_clusters_ - 1, n_clusters)
self.assertGreater(rand, 0.75)
self.assertIsNone(kmeans.estimators_)


if __name__ == '__main__':
unittest.main()

0 comments on commit 037ae46

Please sign in to comment.