Release v2.5.9
Merge pull request #65 from gmrukwa/fixup/memory-issues
Fixup memory issues and improve logging
gmrukwa committed Aug 9, 2020
2 parents 6d96295 + 25c1410 commit 735f9fd
Showing 12 changed files with 80 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/deploy.yml
@@ -12,7 +12,7 @@ on:
env:
MAJOR: ${{ 2 }}
MINOR: ${{ 5 }}
-FIXUP: ${{ 8 }}
+FIXUP: ${{ 9 }}
PACKAGE_INIT_FILE: ${{ 'divik/__init__.py' }}
PACKAGE_INIT_FILE_VERSION_LINE: ${{ 1 }}
PACKAGE_SETUP_FILE: ${{ 'setup.py' }}
4 changes: 2 additions & 2 deletions README.md
@@ -40,7 +40,7 @@ docker pull gmrukwa/divik
To install specific version, you can specify it in the command, e.g.:

```bash
-docker pull gmrukwa/divik:2.5.8
+docker pull gmrukwa/divik:2.5.9
```

## Python package
@@ -79,7 +79,7 @@ pip install divik
or any stable tagged version, e.g.:

```bash
-pip install divik==2.5.8
+pip install divik==2.5.9
```

If you want to have compatibility with
2 changes: 1 addition & 1 deletion divik/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '2.5.8'
+__version__ = '2.5.9'

from ._summary import plot, reject_split

30 changes: 25 additions & 5 deletions divik/cluster/_kmeans/_core.py
@@ -1,6 +1,9 @@
import logging
from typing import Tuple, Union

+import dask_distance as ddst
+import dask.array as da
+import dask.dataframe as dd
import numpy as np
import scipy.spatial.distance as dst
from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
@@ -43,8 +46,16 @@ def __call__(self, data: Data, centroids: Centroids) -> IntLabels:
f"Was {data.shape[1]} and {centroids.shape[1]}"
logging.error(msg)
raise ValueError(msg)
-distances = dst.cdist(data, centroids, self.distance_metric)
-return np.argmin(distances, axis=1)
+
+if data.shape[0] > 10000 or data.shape[1] > 1000:
+X1 = da.from_array(data)
+X2 = da.from_array(centroids)
+distances = ddst.cdist(X1, X2, self.distance_metric)
+labels = da.argmin(distances, axis=1).compute()
+else:
+distances = dst.cdist(data, centroids, self.distance_metric)
+labels = np.argmin(distances, axis=1)
+return labels


def redefine_centroids(data: Data, labeling: IntLabels,
@@ -62,9 +73,14 @@ def redefine_centroids(data: Data, labeling: IntLabels,
f"number of observations: {data.shape[0]}."
logging.error(msg)
raise ValueError(msg)
-centroids = np.nan * np.zeros((len(label_set), data.shape[1]))
-for label in label_set:
-centroids[label] = np.mean(data[labeling == label], axis=0)
+if data.shape[0] > 10000 or data.shape[1] > 1000:
+X = dd.from_array(data)
+y = dd.from_array(labeling)
+centroids = X.groupby(y).mean().compute().values
+else:
+centroids = np.nan * np.zeros((len(label_set), data.shape[1]))
+for label in label_set:
+centroids[label] = np.mean(data[labeling == label], axis=0)
return centroids


@@ -139,14 +155,18 @@ def __call__(self, data: Data, number_of_clusters: int) \
_validate_normalizable(data)
data = normalize_rows(data)
label_set = np.arange(number_of_clusters)
+logging.debug('Initializing KMeans centroids.')
centroids = self.initialize(data, number_of_clusters)
+logging.debug('First centroids found.')
old_labels = np.nan * np.zeros((data.shape[0],))
labels = self.labeling(data, centroids)
+logging.debug('Labels assigned.')
for _ in range(self.number_of_iterations):
if np.unique(labels).size != number_of_clusters:
centroids, labels = self._fix_labels(
data, centroids, labels, number_of_clusters)
if np.all(labels == old_labels):
+logging.debug('Stability achieved.')
break
old_labels = labels
centroids = redefine_centroids(data, old_labels, label_set)
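
Both memory fixes in this file follow the same pattern: above a size threshold (more than 10 000 rows or 1 000 columns) the dense NumPy/SciPy path is replaced by a lazy dask equivalent, so the full distance matrix or group-by intermediate never has to be materialized at once. A minimal, self-contained sketch of that dispatch is below; the helper names (`is_big`, `assign_labels`, `group_means`) and the toy data are illustrative, not part of the divik API.

```python
import dask.array as da
import dask.dataframe as dd
import dask_distance as ddst
import numpy as np
import scipy.spatial.distance as dst


def is_big(X):
    # Same thresholds as introduced in the diff above.
    return X.shape[0] > 10000 or X.shape[1] > 1000


def assign_labels(data, centroids, metric='euclidean'):
    """Index of the closest centroid for every row of `data`."""
    if is_big(data):
        # The distance matrix is built lazily in chunks instead of as one
        # dense (n_rows x n_centroids) NumPy array.
        distances = ddst.cdist(
            da.from_array(data), da.from_array(centroids), metric)
        return da.argmin(distances, axis=1).compute()
    distances = dst.cdist(data, centroids, metric)
    return np.argmin(distances, axis=1)


def group_means(data, labels):
    """Per-cluster means of `data` rows, i.e. the redefined centroids."""
    if is_big(data):
        # Group-by mean over a dask dataframe streams through partitions
        # instead of holding every group in memory at once.
        X = dd.from_array(data)
        y = dd.from_array(labels)
        return X.groupby(y).mean().compute().values
    label_set = np.unique(labels)
    centroids = np.full((len(label_set), data.shape[1]), np.nan)
    for i, label in enumerate(label_set):
        centroids[i] = data[labels == label].mean(axis=0)
    return centroids


data = np.random.rand(500, 8)
centroids = np.random.rand(3, 8)
labels = assign_labels(data, centroids)
print(group_means(data, labels).shape)
```

The small-input branches stay on SciPy/NumPy on purpose: below the threshold, dask's graph-construction overhead would outweigh the memory savings.
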
5 changes: 5 additions & 0 deletions divik/cluster/_kmeans/_gap.py
@@ -98,8 +98,10 @@ def _should_sample(self, data):

def _gap(self, data, kmeans):
if self._should_sample(data):
logging.debug("Selecting sampled GAP.")
score = partial(sampled_gap, sample_size=self.sample_size)
else:
logging.debug("Selecting full GAP.")
score = gap
return score(data, kmeans, n_jobs=self.n_jobs,
seed=self.seed + _BIG_PRIME * kmeans.n_clusters,
@@ -108,8 +110,11 @@ def _gap(self, data, kmeans):
def _fit_kmeans(self, n_clusters, data):
kmeans = clone(self.kmeans)
kmeans.n_clusters = n_clusters
logging.debug(f"Fitting kmeans for {n_clusters} clusters.")
kmeans.fit(data)
logging.debug(f"Fitted kmeans for {n_clusters} clusters.")
idx, std = self._gap(data, kmeans)
logging.debug(f"GAP index: {idx}; std: {std}.")
return kmeans, idx, std

def fit(self, X, y=None):
25 changes: 15 additions & 10 deletions divik/cluster/_kmeans/_initialization.py
@@ -130,7 +130,8 @@ class Node(NamedTuple):
right: KDTree = None


-def make_tree(X, leaf_size: int, _feature_idx: int = 0) -> KDTree:
+def make_tree(X, leaf_size: int, _feature_idx: int = 0, selector=None) \
+-> KDTree:
"""Make KDTree out of the data
Construct a KDTree out of data using mean as a pivoting element.
@@ -151,19 +152,23 @@ def make_tree(X, leaf_size: int, _feature_idx: int = 0) -> KDTree:
tree : KDTree
Lightweight KD-Tree over the data
"""
-if X.shape[0] < 2 * leaf_size:
-centroid = X.mean(axis=0, keepdims=True)
+if selector is None:
+selector = np.ones((X.shape[0],), dtype=bool)
+if selector.sum() < 2 * leaf_size:
+centroid = X[selector, :].mean(axis=0, keepdims=True)
return Leaf(centroid, X.shape[0])
-feature = X[:, _feature_idx]
+feature = X[selector, _feature_idx]
thr = np.mean(feature)
-left_idx = feature < thr
-right_idx = np.logical_not(left_idx)
-left = np.compress(left_idx, X, axis=0)
-right = np.compress(right_idx, X, axis=0)
+left_idx = selector.copy()
+left_idx[selector] = feature < thr
+right_idx = selector.copy()
+right_idx[selector] = np.logical_not(left_idx[selector])
next_feature = (_feature_idx + 1) % X.shape[1]
return Node(
-left=make_tree(left, leaf_size=leaf_size, _feature_idx=next_feature),
-right=make_tree(right, leaf_size=leaf_size, _feature_idx=next_feature),
+left=make_tree(X, leaf_size=leaf_size,
+_feature_idx=next_feature, selector=left_idx),
+right=make_tree(X, leaf_size=leaf_size,
+_feature_idx=next_feature, selector=right_idx),
)


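
The memory saving here comes from never copying `X`: instead of materializing `left` and `right` sub-arrays with `np.compress` at every recursion level, the tree now threads a boolean `selector` through the calls, so each level only allocates two masks of `n` booleans. A small sketch of that split step in isolation; the `split_by_mean` helper and the toy data are illustrative, not part of the library.

```python
import numpy as np


def split_by_mean(X, selector, feature_idx):
    """Split the selected rows of X around the mean of one feature,
    returning two boolean masks over the full array instead of copies."""
    feature = X[selector, feature_idx]
    thr = feature.mean()
    left = selector.copy()
    left[selector] = feature < thr        # selected rows below the pivot
    right = selector.copy()
    right[selector] = ~left[selector]     # the remaining selected rows
    return left, right


X = np.random.rand(8, 3)
sel = np.ones(8, dtype=bool)
left, right = split_by_mean(X, sel, 0)
assert not np.any(left & right)           # the halves are disjoint
assert np.array_equal(left | right, sel)  # together they cover the selection
```

The recursion then passes `X` unchanged together with `left_idx` or `right_idx`, exactly as the rewritten `make_tree` above does.
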
16 changes: 13 additions & 3 deletions divik/score/_gap.py
@@ -1,4 +1,5 @@
from functools import partial
+import logging

import numpy as np
import pandas as pd
@@ -25,12 +26,21 @@ def _dispersion(data: Data, kmeans: KMeans) -> float:
]))


-def _sampled_dispersion(seed: int, sampler: BaseSampler, kmeans: KMeans) \
--> float:
+def _sampled_dispersion(seed: int, sampler: BaseSampler, kmeans: KMeans,
+fit: bool=True) -> float:
+logging.debug(f"Sampling with seed {seed}.")
X = sampler.get_sample(seed)
logging.debug(f"Sample shape {X.shape}")
if kmeans.normalize_rows:
logging.debug("Normalizing rows.")
X = normalize_rows(X)
-y = kmeans.fit_predict(X)
+if fit:
+logging.debug("Fitting kmeans for sample.")
+y = kmeans.fit_predict(X)
+else:
+logging.debug("Predicting labels for sample.")
+y = kmeans.predict(X)
+logging.debug("Computing dispersion for clustered sample.")
clusters = pd.DataFrame(X).groupby(y)
return float(np.mean([
np.mean(dist.pdist(cluster_members.values, kmeans.distance))
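
For context on what the new `fit` flag and the debug messages wrap: `_sampled_dispersion` draws a sample, either refits the estimator (`fit=True`) or only predicts with the already-fitted one (`fit=False`), and returns the within-cluster dispersion, i.e. the mean over clusters of the mean pairwise distance inside each cluster. A toy re-implementation of that last step, with a plain Euclidean metric standing in for `kmeans.distance`:

```python
import numpy as np
import pandas as pd
import scipy.spatial.distance as dist


def dispersion(X, y, metric='euclidean'):
    """Mean over clusters of the mean pairwise distance within each cluster."""
    clusters = pd.DataFrame(X).groupby(y)
    return float(np.mean([
        np.mean(dist.pdist(members.values, metric))
        for _, members in clusters
    ]))


X = np.random.rand(120, 4)
y = np.random.randint(0, 3, size=120)  # illustrative labels
print(dispersion(X, y))
```
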
10 changes: 9 additions & 1 deletion divik/score/_sampled_gap.py
@@ -1,4 +1,5 @@
from functools import partial
+import logging
from typing import Union

import numpy as np
@@ -25,20 +26,27 @@ def sampled_gap(data: Data, kmeans: KMeans,
n_trials: int = 100,
return_deviation: bool = False) -> float:
# TODO: Docs
logging.debug("Creating samplers.")
data_ = StratifiedSampler(n_rows=sample_size, n_samples=n_trials
).fit(data, kmeans.labels_)
reference_ = UniformSampler(n_rows=sample_size, n_samples=n_trials
).fit(data)
kmeans_ = clone(kmeans)
seeds = list(seed + np.arange(n_trials) * _BIG_PRIME)
logging.debug(f"Generated seeds: {seeds}.")
logging.debug(f"Entering parallel context with n_jobs={n_jobs}.")
with data_.parallel() as d, reference_.parallel() as r:
initializer = partial(_pool_initialize, [d, r])
with maybe_pool(n_jobs, initializer=initializer,
initargs=(d.initargs, r.initargs)) as pool:
logging.debug("Computing reference dispersion.")
compute_disp = partial(_dispersion, sampler=r, kmeans=kmeans_)
ref_disp = pool.map(compute_disp, seeds)
-compute_disp = partial(_dispersion, sampler=d, kmeans=kmeans_)
+logging.debug("Computing data dispersion.")
+compute_disp = partial(_dispersion, sampler=d, kmeans=kmeans,
+fit=False)
data_disp = pool.map(compute_disp, seeds)
logging.debug("Left parallel context.")
ref_disp = np.log(ref_disp)
data_disp = np.log(data_disp)
gap = np.mean(ref_disp) - np.mean(data_disp)
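
Besides the logging, the substantive change is on the data side: its dispersion is now computed with the estimator that was already fitted on the full data (`kmeans` with `fit=False`, so only `predict` runs per trial), while the reference side still refits the clone `kmeans_` on each uniform sample. For reference, the per-trial seeds are spread apart by a large prime so every draw is reproducible and independent, and the final statistic is the difference of mean log-dispersions. A sketch with illustrative numbers; the `_BIG_PRIME` value below is assumed, not taken from the source.

```python
import numpy as np

_BIG_PRIME = 40013  # assumed for illustration; the library defines its own constant
seed, n_trials = 42, 4

# Deterministic, well-separated seeds, one per trial.
seeds = list(seed + np.arange(n_trials) * _BIG_PRIME)

# GAP aggregation from per-trial dispersions (hypothetical values).
ref_disp = np.log([1.9, 2.1, 2.0, 1.8])   # uniform reference samples
data_disp = np.log([1.2, 1.1, 1.3, 1.2])  # stratified data samples
gap = np.mean(ref_disp) - np.mean(data_disp)

print(seeds)
print(gap)
```
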
4 changes: 2 additions & 2 deletions docs/instructions/installation.rst
@@ -14,7 +14,7 @@ To install latest stable version use::

To install specific version, you can specify it in the command, e.g.::

-docker pull gmrukwa/divik:2.5.8
+docker pull gmrukwa/divik:2.5.9

Python package
--------------
@@ -31,7 +31,7 @@ package::

or any stable tagged version, e.g.::

-pip install divik==2.5.8
+pip install divik==2.5.9

If you want to have compatibility with
`gin-config <https://github.com/google/gin-config>`_, you can install
2 changes: 2 additions & 0 deletions requirements-base.txt
@@ -4,6 +4,8 @@ dash-html-components==0.13.4
dash-core-components==0.42.0
dash-renderer==0.17.0
dash-table==3.1.11
+dask-distance
+dask[dataframe]
gin-config
h5py
kneed
2 changes: 2 additions & 0 deletions requirements.txt
@@ -8,6 +8,8 @@ dash-core-components==0.42.0
dash-html-components==0.13.4
dash-renderer==0.17.0
dash-table==3.1.11
+dask-distance==0.2.0
+dask[dataframe]==2.14.0
decorator==4.4.1
Flask==1.1.1
Flask-Compress==1.4.0
4 changes: 3 additions & 1 deletion setup.py
@@ -6,7 +6,7 @@
import sys
import numpy

-__version__ = '2.5.8'
+__version__ = '2.5.9'

LINUX_OPTS = {
'extra_link_args': [
@@ -93,6 +93,8 @@
'dash-html-components==0.13.4',
'dash-core-components==0.42.0',
'dash-table==3.1.11',
+'dask[dataframe]>=2.14.0',
+'dask-distance>=0.2.0',
'h5py>=2.8.0',
'kneed>=0.5.1',
'numpy>=0.12.1',
