Skip to content

Commit

Permalink
Merge pull request #53 from gmrukwa/kdtree-percentile-init
Browse files Browse the repository at this point in the history
Kdtree percentile init
  • Loading branch information
gmrukwa committed Feb 21, 2020
2 parents b7fb4f4 + ac6f09d commit e62447f
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ on:
env:
MAJOR: ${{ 2 }}
MINOR: ${{ 4 }}
FIXUP: ${{ 6 }}
FIXUP: ${{ 7 }}
PACKAGE_INIT_FILE: ${{ 'divik/__init__.py' }}
PACKAGE_INIT_FILE_VERSION_LINE: ${{ 1 }}
PACKAGE_SETUP_FILE: ${{ 'setup.py' }}
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ docker pull gmrukwa/divik
To install specific version, you can specify it in the command, e.g.:

```bash
docker pull gmrukwa/divik:2.4.6
docker pull gmrukwa/divik:2.4.7
```

## Python package
Expand Down Expand Up @@ -79,7 +79,7 @@ pip install divik
or any stable tagged version, e.g.:

```bash
pip install divik==2.4.6
pip install divik==2.4.7
```

If you want to have compatibility with
Expand Down
2 changes: 1 addition & 1 deletion divik/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '2.4.6'
__version__ = '2.4.7'

from ._summary import plot, reject_split

Expand Down
9 changes: 8 additions & 1 deletion divik/cluster/_kmeans/_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
ExtremeInitialization,
PercentileInitialization,
KDTreeInitialization,
KDTreePercentileInitialization,
)
from divik.core import (
normalize_rows,
Expand Down Expand Up @@ -157,6 +158,8 @@ def _parse_initialization(name: str, distance: str,
return ExtremeInitialization(distance)
if name == 'kdtree':
return KDTreeInitialization(distance, leaf_size)
if name == 'kdtree_percentile':
return KDTreePercentileInitialization(distance, leaf_size, percentile)
raise ValueError('Unknown initialization: {0}'.format(name))


Expand All @@ -174,7 +177,7 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
distance : str, optional, default: 'euclidean'
Distance measure. One of the distances supported by scipy package.
init : {'percentile', 'extreme' or 'kdtree'}
init : {'percentile', 'extreme', 'kdtree', 'kdtree_percentile'}
Method for initialization, defaults to 'percentile':
'percentile' : selects initial cluster centers for k-mean
Expand All @@ -188,6 +191,10 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
'kdtree': selects initial cluster centers for k-mean clustering
starting from centroids of KD-Tree boxes
'kdtree_percentile': selects initial cluster centers for k-means
clustering starting from centroids of KD-Tree boxes located at the
specified percentile. This should be more robust against outliers.
percentile : float, default: 95.0
Specifies the starting percentile for 'percentile' and 'kdtree_percentile' initialization.
Must be within range [0.0, 100.0]. At 100.0 it is equivalent to
Expand Down
46 changes: 46 additions & 0 deletions divik/cluster/_kmeans/_initialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,3 +214,49 @@ def __call__(self, data: Data, number_of_centroids: int) -> Centroids:
centroids[i] = box_centroids[np.argmax(distances)]

return centroids


class KDTreePercentileInitialization(Initialization):
    """Initializes k-means by picking KDTree boxes at a weighted percentile.

    Candidate centroids are the centroids of KD-Tree leaves. Each leaf is
    weighted by its ``count``, and a candidate is selected as the first
    leaf (in order of increasing distance) at which the cumulative share
    of the weights reaches ``percentile`` percent.

    Parameters
    ----------
    distance : str
        Name of the distance metric handed to ``dist.cdist``.
    leaf_size : int or float, optional, default: 0.01
        Passed to ``make_tree``. A float must be within [0, 1] and is
        converted to ``max(int(leaf_size * n_rows), 1)``; an int is used
        as given.
    percentile : float, optional, default: 99.
        Percentile, on a 0-100 scale, of the weighted distance
        distribution at which a box is selected.
    """
    def __init__(self, distance: str, leaf_size: Union[int, float] = 0.01,
                 percentile: float = 99.):
        # NOTE(review): assert is stripped under ``python -O``; kept so the
        # exception type seen by existing callers does not change.
        assert 0 <= percentile <= 100, percentile
        self.distance = distance
        self.leaf_size = leaf_size
        self.percentile = percentile

    def _get_percentile_idx(self, distances, weights) -> int:
        """Return index of the box at the weighted percentile of distances.

        Parameters
        ----------
        distances : one-dimensional array of per-box distances.
        weights : one-dimensional array of per-box weights, normalized so
            that they sum to 1 (as prepared in ``__call__``).

        Returns
        -------
        Index (into ``distances``) of the first box, in order of increasing
        distance, whose cumulative weight reaches ``percentile / 100``.
        """
        order = np.argsort(distances)
        cumulative_share = np.cumsum(weights[order])
        # Weights are fractions summing to 1 while the percentile is
        # expressed on a 0-100 scale, so bring both to the same scale.
        threshold = self.percentile / 100.
        reached = np.flatnonzero(cumulative_share >= threshold)
        if reached.size == 0:
            # Floating-point rounding may leave the cumulative total
            # marginally below the threshold (e.g. percentile == 100);
            # the most distant box is the right answer in that case.
            return order[-1]
        return order[reached[0]]

    def __call__(self, data: Data, number_of_centroids: int) -> Centroids:
        """Generate initial centroids for k-means algorithm

        Parameters
        ----------
        data : two-dimensional array, one observation per row.
        number_of_centroids : number of centroids to generate.

        Returns
        -------
        Array of shape ``(number_of_centroids, data.shape[1])`` whose rows
        are centroids of the selected KD-Tree boxes.

        Raises
        ------
        ValueError
            When ``leaf_size`` is a float outside of [0, 1].
        """
        _validate(data, number_of_centroids)
        leaf_size = self.leaf_size
        if isinstance(leaf_size, float):
            if 0 <= leaf_size <= 1:
                # Fractional leaf size is relative to the number of rows,
                # but a leaf must hold at least one observation.
                leaf_size = max(int(leaf_size * data.shape[0]), 1)
            else:
                raise ValueError('leaf_size must be between 0 and 1 when float')
        tree = make_tree(data, leaf_size=leaf_size)
        leaves = get_leaves(tree)
        box_centroids = np.vstack([leaf.centroid for leaf in leaves])
        box_weights = np.array([leaf.count for leaf in leaves])
        normalized_weights = box_weights / np.sum(box_weights)

        # First centroid: box at the requested percentile of the residuals
        # returned by _find_residuals (helper defined elsewhere in the file).
        residuals = _find_residuals(box_centroids, box_weights)
        centroids = np.full((number_of_centroids, data.shape[1]), np.nan)
        idx = self._get_percentile_idx(residuals, normalized_weights)
        centroids[0] = box_centroids[idx]

        # Every next centroid: box at the requested percentile of the
        # distance to the nearest centroid selected so far.
        distances = np.full((box_centroids.shape[0], ), np.inf)
        for i in range(1, number_of_centroids):
            current_distance = dist.cdist(
                box_centroids, centroids[np.newaxis, i - 1], self.distance)
            distances[:] = np.minimum(current_distance.ravel(), distances)
            idx = self._get_percentile_idx(distances, normalized_weights)
            centroids[i] = box_centroids[idx]

        return centroids
4 changes: 2 additions & 2 deletions docs/instructions/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ To install latest stable version use::

To install specific version, you can specify it in the command, e.g.::

docker pull gmrukwa/divik:2.4.6
docker pull gmrukwa/divik:2.4.7

Python package
--------------
Expand All @@ -31,7 +31,7 @@ package::

or any stable tagged version, e.g.::

pip install divik==2.4.6
pip install divik==2.4.7

If you want to have compatibility with
`gin-config <https://github.com/google/gin-config>`_, you can install
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import sys
import numpy

__version__ = '2.4.6'
__version__ = '2.4.7'

LINUX_OPTS = {
'extra_link_args': [
Expand Down

0 comments on commit e62447f

Please sign in to comment.