Merge pull request #56 from gmrukwa/develop
Release 2.5.0
gmrukwa committed Mar 16, 2020
2 parents 106dad1 + b79ead6 commit 0fc7dc5
Showing 11 changed files with 287 additions and 45 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/deploy.yml
@@ -11,8 +11,8 @@ on:

env:
  MAJOR: ${{ 2 }}
  MINOR: ${{ 4 }}
  FIXUP: ${{ 8 }}
  MINOR: ${{ 5 }}
  FIXUP: ${{ 0 }}
  PACKAGE_INIT_FILE: ${{ 'divik/__init__.py' }}
  PACKAGE_INIT_FILE_VERSION_LINE: ${{ 1 }}
  PACKAGE_SETUP_FILE: ${{ 'setup.py' }}
14 changes: 3 additions & 11 deletions README.md
@@ -40,14 +40,14 @@ docker pull gmrukwa/divik
To install a specific version, you can specify it in the command, e.g.:

```bash
docker pull gmrukwa/divik:2.4.8
docker pull gmrukwa/divik:2.5.0
```

## Python package

Prerequisites for installation of base package:

- Python 3.6 / 3.7
- Python 3.6 / 3.7 / 3.8
- compiler capable of compiling the native C code, with OpenMP support

#### Installation of OpenMP for Ubuntu / Debian
@@ -79,7 +79,7 @@ pip install divik
or any stable tagged version, e.g.:

```bash
pip install divik==2.4.8
pip install divik==2.5.0
```

If you want to have compatibility with
@@ -92,14 +92,6 @@ pip install divik[gin]

**Note:** Remember to escape `[` and `]` with `\` in the `zsh` shell.

## Known Issues

### Mac OS & Numba

Compilation of certain code is not supported on Mac OS, as it started to
freeze. It is therefore disabled, and the code runs slower. You can try it
yourself by installing the `numba` package, but I had no luck up to this point.

# References

This software is part of contribution made by [Data Mining Group of Silesian
2 changes: 1 addition & 1 deletion divik/__init__.py
@@ -1,4 +1,4 @@
__version__ = '2.4.8'
__version__ = '2.5.0'

from ._summary import plot, reject_split

87 changes: 80 additions & 7 deletions divik/cluster/_kmeans/_dunn.py
@@ -9,7 +9,10 @@

from divik.core import configurable
from ._core import KMeans
from divik.score import dunn
from divik.score import (
    dunn,
    sampled_dunn,
)
from divik.core import maybe_pool, share


@@ -35,6 +38,33 @@ class DunnSearch(BaseEstimator, ClusterMixin, TransformerMixin):
    min_clusters: int, default: 2
        The minimal number of clusters to form and score.
    method: {'full', 'sampled', 'auto'}, default: 'full'
        Whether to run full computations or an approximation.
        - full - always computes the full Dunn's index, without sampling
        - sampled - samples the clusters to reduce computational overhead
        - auto - switches between the above methods to provide the best
          performance-quality trade-off
    inter : {'centroid', 'closest'}, default: 'centroid'
        How the distance between clusters is computed. For more details see
        `dunn`.
    intra : {'avg', 'furthest'}, default: 'avg'
        How the cluster internal distance is computed. For more details see
        `dunn`.
    sample_size : int, default: 1000
        Size of the sample used to compute the Dunn index in the `auto` or
        `sampled` scenario.
    n_trials : int, default: 10
        Number of trials to use when computing the Dunn index in the `auto`
        or `sampled` scenario.
    seed : int, default: 42
        Random seed for the reproducibility of subset draws in the Dunn
        `auto` or `sampled` scenario.
    n_jobs: int, default: 1
        The number of jobs to use for the computation. This works by
        computing each of the clustering & scoring runs in parallel.
@@ -56,8 +86,8 @@ class DunnSearch(BaseEstimator, ClusterMixin, TransformerMixin):
    estimators_: List[KMeans]
        KMeans instances for n_clusters in range [min_clusters, max_clusters].
    scores_: array, [max_clusters - min_clusters + 1, ?]
        Array with scores for each estimator in each row.
    scores_: array, [max_clusters - min_clusters + 1,]
        Array with scores for each estimator.
    n_clusters_: int
        Estimated optimal number of clusters.
@@ -69,25 +99,68 @@ class DunnSearch(BaseEstimator, ClusterMixin, TransformerMixin):
        The optimal estimator.
    """
    def __init__(self, kmeans: KMeans,
                 max_clusters: int, min_clusters: int = 2,
                 n_jobs: int = 1, drop_unfit: bool = False,
    def __init__(self,
                 kmeans: KMeans,
                 max_clusters: int,
                 min_clusters: int = 2,
                 method='full',
                 inter='centroid',
                 intra='avg',
                 sample_size=1000,
                 n_trials=10,
                 seed=42,
                 n_jobs: int = 1,
                 drop_unfit: bool = False,
                 verbose: bool = False):
        super().__init__()
        assert min_clusters <= max_clusters
        self.kmeans = kmeans
        self.min_clusters = min_clusters
        self.max_clusters = max_clusters
        self.method = method
        self.inter = inter
        self.intra = intra
        self.sample_size = sample_size
        self.n_trials = n_trials
        self.seed = seed
        self.n_jobs = n_jobs
        self.drop_unfit = drop_unfit
        self.verbose = verbose

    def _n_ops(self, data):
        if self.inter == 'closest' or self.intra == 'furthest':
            n_ops_full = data.shape[0] ** 2
            n_ops_sampled = self.n_trials * self.sample_size ** 2
        else:
            n_ops_full = data.shape[0]
            n_ops_sampled = self.n_trials * self.sample_size
        return n_ops_full, n_ops_sampled

    def _sampled_dunn(self, kmeans, data, inter, intra):
        return sampled_dunn(kmeans, data, inter=inter, intra=intra,
                            sample_size=self.sample_size, n_jobs=self.n_jobs,
                            seed=self.seed, n_trials=self.n_trials)

    def _dunn(self, kmeans, data):
        n_ops_full, n_ops_sampled = self._n_ops(data)
        if self.method == 'full':
            dunn_ = dunn
        elif self.method == 'sampled':
            dunn_ = self._sampled_dunn
        elif self.method == 'auto' and n_ops_full <= n_ops_sampled:
            dunn_ = dunn
        elif self.method == 'auto' and n_ops_full > n_ops_sampled:
            dunn_ = self._sampled_dunn
        else:
            raise ValueError(f"Unknown Dunn method {self.method}")
        return dunn_(kmeans, data, inter=self.inter, intra=self.intra)

    def _fit_kmeans(self, n_clusters, data_ref):
        data = _DATA[data_ref].value
        kmeans = clone(self.kmeans)
        kmeans.n_clusters = n_clusters
        kmeans.fit(data)
        d = dunn(kmeans, data)
        d = self._dunn(kmeans, data)
        return kmeans, d

    def fit(self, X, y=None):
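The `auto` method above switches between the exact and the sampled Dunn's index by comparing the operation counts estimated in `_n_ops`. For illustration, here is a minimal sketch of how the new search parameters might be used together; it assumes `divik.cluster` re-exports `KMeans` and `DunnSearch` with the constructor from this diff, and all sizes are made up:

```python
import numpy as np

from divik.cluster import DunnSearch, KMeans

X = np.random.default_rng(0).normal(size=(20_000, 10))

# Template estimator; DunnSearch clones it and overrides n_clusters.
kmeans = KMeans(n_clusters=2)

search = DunnSearch(
    kmeans,
    max_clusters=10,
    min_clusters=2,
    method='auto',     # choose full vs. sampled Dunn by estimated cost
    inter='closest',   # pairwise variants cost ~ data.shape[0] ** 2, so
    intra='furthest',  # here 20_000 ** 2 > 10 * 1_000 ** 2 -> sampled wins
    sample_size=1_000,
    n_trials=10,
    seed=42,
).fit(X)

print(search.n_clusters_)  # estimated optimal number of clusters
print(search.scores_)      # one Dunn score per candidate estimator
```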
2 changes: 1 addition & 1 deletion divik/score/__init__.py
@@ -1,3 +1,3 @@
from ._dunn import dunn
from ._dunn import dunn, sampled_dunn
from ._gap import gap
from ._sampled_gap import sampled_gap
113 changes: 103 additions & 10 deletions divik/score/_dunn.py
@@ -1,14 +1,67 @@
from functools import partial
from typing import Union

import numpy as np
import pandas as pd
from scipy.spatial import distance as dist

from divik.core import Data
from divik.core import configurable, Data, maybe_pool
from divik.sampler import BaseSampler, StratifiedSampler


KMeans = 'divik.cluster.KMeans'
_BIG_PRIME = 49277


def _inter_centroid(kmeans: KMeans, data: Data, labels=None):
    d = dist.pdist(kmeans.cluster_centers_, kmeans.distance)
    return np.min(d[d != 0])


def _inter_closest(kmeans: KMeans, data: Data, labels=None):
    if labels is None:
        labels = kmeans.labels_
    d = np.inf
    for label in np.arange(kmeans.n_clusters - 1):
        grp = label == labels
        non_grp = label < labels
        dst = dist.cdist(data[grp], data[non_grp], metric=kmeans.distance)
        d = np.minimum(d, dst.min())
    return d


def _intra_avg(kmeans: KMeans, data: Data, labels=None):
    if labels is None:
        labels = kmeans.labels_
    clusters = pd.DataFrame(data).groupby(labels).apply(np.asarray)
    return np.max([
        np.mean(dist.cdist(cluster, centroid.reshape(1, -1), kmeans.distance))
        for cluster, centroid in zip(clusters, kmeans.cluster_centers_)
    ])


KMeans = 'divik.KMeans'
def _intra_furthest(kmeans: KMeans, data: Data, labels=None):
    def max_distance(group):
        group = np.asarray(group)
        d = dist.pdist(group, metric=kmeans.distance)
        return np.max(d)
    if labels is None:
        labels = kmeans.labels_
    return pd.DataFrame(data).groupby(labels).apply(max_distance).max()


def dunn(kmeans: KMeans, data: Data) -> float:
_INTER = {
    'centroid': _inter_centroid,
    'closest': _inter_closest,
}
_INTRA = {
    'avg': _intra_avg,
    'furthest': _intra_furthest,
}


@configurable
def dunn(kmeans: KMeans, data: Data, inter='centroid', intra='avg') -> float:
"""Compute Dunn's index for the clustering
Parameters
@@ -19,19 +72,59 @@ def dunn(kmeans: KMeans, data: Data) -> float:
    data : array, shape (n_samples, n_features)
        Clustered data
    inter : {'centroid', 'closest'}
        Method of computing intercluster distance
        - centroid - uses distances between centroids
        - closest - uses distance between closest members of separate clusters
    intra : {'avg', 'furthest'}
        Method of computing intracluster distance
        - avg - uses average distance to the centroid
        - furthest - uses distance between the furthest cluster members

    Returns
    -------
    dunn_index : float
        Value of Dunn's index for the clustering of data
    """
    if kmeans.cluster_centers_.shape[0] == 1:
        return -np.inf
    clusters = pd.DataFrame(data).groupby(kmeans.labels_).apply(np.asarray)
    intercluster = dist.pdist(kmeans.cluster_centers_, kmeans.distance)
    intercluster = np.min(intercluster[intercluster != 0])
    intracluster = np.max([
        np.mean(dist.cdist(cluster, centroid.reshape(1, -1), kmeans.distance))
        for cluster, centroid in zip(clusters, kmeans.cluster_centers_)
    ])
    if inter not in _INTER:
        raise ValueError(f'Unsupported intercluster distance {inter}. '
                         f'Supported: {list(_INTER.keys())}')
    if intra not in _INTRA:
        raise ValueError(f'Unsupported intracluster distance {intra}. '
                         f'Supported: {list(_INTRA.keys())}')
    intercluster = _INTER[inter](kmeans, data)
    intracluster = _INTRA[intra](kmeans, data)
    score = intercluster / intracluster
    return score


def _sample_distances(seed: int, sampler: BaseSampler, kmeans: KMeans,
                      inter='centroid', intra='avg'):
    data = sampler.get_sample(seed)
    labels = kmeans.predict(data)
    inter_ = _INTER[inter](kmeans, data, labels)
    intra_ = _INTRA[intra](kmeans, data, labels)
    return inter_, intra_


@configurable
def sampled_dunn(kmeans: KMeans, data: Data,
                 sample_size: Union[int, float] = 1000,
                 n_jobs: int = None,
                 seed: int = 0,
                 n_trials: int = 10,
                 inter='closest', intra='furthest') -> float:
    data_ = StratifiedSampler(n_rows=sample_size, n_samples=n_trials
                              ).fit(data, kmeans.labels_)
    seeds = list(seed + np.arange(n_trials) * _BIG_PRIME)
    with data_.parallel() as d, maybe_pool(n_jobs, initializer=d.initializer,
                                           initargs=d.initargs) as pool:
        distances = partial(_sample_distances, sampler=d, kmeans=kmeans,
                            inter=inter, intra=intra)
        inter_, intra_ = np.array(pool.map(distances, seeds)).T
    v_inter = inter_.var()
    v_intra = intra_.var()
    return (inter_.min() - v_inter) / (intra_.max() + v_intra)
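Taken together, `dunn` computes the ratio of the smallest intercluster distance to the largest intracluster distance, while `sampled_dunn` estimates it from `n_trials` stratified subsamples and penalizes the estimate by the variance across trials. A minimal sketch of calling both directly, assuming a divik `KMeans` fitted beforehand (the functions rely on its `distance`, `labels_`, `cluster_centers_` and `predict` members):

```python
import numpy as np

from divik.cluster import KMeans
from divik.score import dunn, sampled_dunn

X = np.random.default_rng(1).normal(size=(5_000, 8))
kmeans = KMeans(n_clusters=3).fit(X)

# Exact index with the default distances: centroid-to-centroid between
# clusters, average distance to the centroid within each cluster.
exact = dunn(kmeans, X, inter='centroid', intra='avg')

# Sampled estimate of the stricter variant; 'closest'/'furthest' are
# quadratic in the number of points, so subsampling keeps them tractable.
estimate = sampled_dunn(kmeans, X, sample_size=500, n_trials=5,
                        seed=0, inter='closest', intra='furthest')
```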
2 changes: 1 addition & 1 deletion divik/score/_sampled_gap.py
@@ -9,7 +9,7 @@
from divik.score._gap import _sampled_dispersion as _dispersion


KMeans = 'divik.KMeans'
KMeans = 'divik.cluster.KMeans'
_BIG_PRIME = 40013


6 changes: 3 additions & 3 deletions docs/instructions/installation.rst
@@ -14,14 +14,14 @@ To install latest stable version use::

To install a specific version, you can specify it in the command, e.g.::

docker pull gmrukwa/divik:2.4.8
docker pull gmrukwa/divik:2.5.0

Python package
--------------

Prerequisites for installation of base package:

- Python 3.6 / 3.7
- Python 3.6 / 3.7 / 3.8
- compiler capable of compiling the native C code

Having prerequisites installed, one can install latest base version of the
@@ -31,7 +31,7 @@ package::

or any stable tagged version, e.g.::

pip install divik==2.4.8
pip install divik==2.5.0

If you want to have compatibility with
`gin-config <https://github.com/google/gin-config>`_, you can install
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@
import sys
import numpy

__version__ = '2.4.8'
__version__ = '2.5.0'

LINUX_OPTS = {
    'extra_link_args': [
