Skip to content

Commit

Permalink
Merge pull request #31 from gmrukwa/develop
Browse files Browse the repository at this point in the history
Release v2.3.11
  • Loading branch information
gmrukwa committed Jan 9, 2020
2 parents da7740b + d4258b3 commit 92b8427
Show file tree
Hide file tree
Showing 36 changed files with 1,060 additions and 134 deletions.
30 changes: 30 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"name": "divik:dev",
"context": "..",
"dockerFile": "../docker/development.dockerfile",

"runArgs": [
"-v", "${env:HOME}${env:USERPROFILE}/.ssh:/root/.ssh-localhost:ro"
],

"postCreateCommand": "cp -r ~/.ssh-localhost ~/.ssh && chmod 700 ~/.ssh && chmod 600 ~/.ssh/* && rm -f ~/.ssh/config",

"settings": {
"terminal.integrated.shell.linux": "/bin/bash",
"python.pythonPath": "/usr/local/bin/python",
"python.linting.pylintEnabled": true,
"python.linting.pylintPath": "/usr/local/bin/pylint",
"python.linting.enabled": true
},

"extensions": [
"eamodio.gitlens",
"mutantdino.resourcemonitor",
"ms-python.python",
"angelo-breuer.clock",
"donjayamanne.githistory",
"tabnine.tabnine-vscode",
"redhat.vscode-yaml",
"formulahendry.code-runner"
]
}
2 changes: 1 addition & 1 deletion .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ on:
env:
MAJOR: ${{ 2 }}
MINOR: ${{ 3 }}
FIXUP: ${{ 10 }}
FIXUP: ${{ 11 }}
PACKAGE_INIT_FILE: ${{ 'divik/__init__.py' }}
DOCKER_REPO: ${{ 'gmrukwa/divik' }}
IS_ALPHA: ${{ github.event_name == 'pull_request' }}
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/unittest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@ jobs:
uses: actions/checkout@v1
- name: Run unit tests
run: docker build . --file docker/unittest.dockerfile --tag unittest
env:
ENABLE_SLOW_TESTS: True
12 changes: 12 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"python.pythonPath": "/usr/local/bin/python",
"python.linting.pylintEnabled": true,
"python.linting.enabled": true,
"python.linting.pylintPath": "/usr/local/bin/pylint",
"python.testing.unittestArgs": [
"discover"
],
"python.testing.unittestEnabled": true,
"python.testing.nosetestsEnabled": false,
"python.testing.pytestEnabled": false
}
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ docker pull gmrukwa/divik
To install specific version, you can specify it in the command, e.g.:

```bash
docker pull gmrukwa/divik:2.3.10
docker pull gmrukwa/divik:2.3.11
```

## Python package
Expand All @@ -59,7 +59,7 @@ pip install divik
or any stable tagged version, e.g.:

```bash
pip install divik==2.3.10
pip install divik==2.3.11
```

# References
Expand Down
4 changes: 3 additions & 1 deletion divik/__init__.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
__version__ = '2.3.10'
__version__ = '2.3.11'

from ._seeding import seeded
from ._utils import DivikResult
from divik import feature_selection
from divik import feature_extraction
from divik import cluster
from divik import sampler
from ._summary import plot, reject_split

__all__ = [
"__version__",
"cluster",
"feature_selection",
"feature_extraction",
"sampler",
"seeded",
'DivikResult',
"plot", "reject_split",
Expand Down
21 changes: 9 additions & 12 deletions divik/_score/__init__.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,16 @@
from ._dunn import DunnPicker
from ._gap import gap, GapPicker
from ._sampled_gap import sampled_gap, SamplingGapPicker
from ._picker import Picker


def make_picker(method, n_jobs: int = 1, gap=None):
    """Create a cluster-quality picker for the given scoring method.

    Parameters
    ----------
    method : str
        One of ``'dunn'``, ``'gap'`` or ``'sampled_gap'``.
    n_jobs : int, optional
        Number of parallel jobs forwarded to the picker.
    gap : dict, optional
        Extra keyword arguments forwarded to ``GapPicker`` /
        ``SamplingGapPicker`` (e.g. ``max_iter``, ``seed``, ``n_trials``).
        Ignored for ``'dunn'``.

    Returns
    -------
    Picker
        The configured picker instance.

    Raises
    ------
    ValueError
        If ``method`` is not a known quality measure.
    """
    if gap is None:
        gap = {}
    if method == 'dunn':
        return DunnPicker(n_jobs=n_jobs)
    if method == 'gap':
        return GapPicker(n_jobs=n_jobs, **gap)
    if method == 'sampled_gap':
        return SamplingGapPicker(n_jobs=n_jobs, **gap)
    raise ValueError('Unknown quality measure {0}'.format(method))
5 changes: 2 additions & 3 deletions divik/_score/_dunn.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from functools import partial
from multiprocessing.pool import Pool
from typing import List, Optional
import uuid

Expand All @@ -8,7 +7,7 @@
from scipy.spatial import distance as dist

from divik._score._picker import Picker
from divik._utils import Data, get_n_jobs
from divik._utils import Data, maybe_pool


KMeans = 'divik.KMeans'
Expand Down Expand Up @@ -40,7 +39,7 @@ def score(self, data: Data, estimators: List[KMeans]) -> np.ndarray:
global _DATA
_DATA[ref] = data
score = partial(_dunn, data=ref)
with Pool(get_n_jobs(self.n_jobs)) as pool:
with maybe_pool(self.n_jobs) as pool:
scores = pool.map(score, estimators)
del _DATA[ref]
else:
Expand Down
95 changes: 39 additions & 56 deletions divik/_score/_gap.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
from __future__ import division
from functools import partial
import gc
from multiprocessing import Pool
from typing import List, Optional, Tuple
from typing import List, Optional

import numpy as np
import pandas as pd
import scipy.spatial.distance as dist
from sklearn.base import clone

from divik._score._picker import Picker
from divik._utils import Data, context_if, get_n_jobs, normalize_rows
from divik._utils import Data, normalize_rows, maybe_pool
from divik._seeding import seeded
from divik.sampler import BaseSampler, UniformSampler


KMeans = 'divik.KMeans'
Expand All @@ -29,59 +27,43 @@ def _dispersion(data: Data, kmeans: KMeans) -> float:
]))


def _dispersion_of_random_sample(seed: int,
                                 shape: Tuple[int, int],
                                 minima: np.ndarray,
                                 ranges: np.ndarray,
                                 kmeans: KMeans) -> float:
    """Fit ``kmeans`` on a seeded uniform reference sample and score it.

    Draws a ``shape``-sized sample uniformly from the axis-aligned box
    spanned by ``minima`` and ``minima + ranges`` and returns the
    within-cluster dispersion of the model fitted to that sample.
    """
    np.random.seed(seed)
    reference = minima + ranges * np.random.random_sample(shape)
    result = _dispersion(reference, kmeans.fit(reference))
    # Reference samples may be large — drop them eagerly to limit peak memory.
    del reference
    gc.collect()
    return result
def _sampled_dispersion(seed: int, sampler: BaseSampler, kmeans: KMeans) \
        -> float:
    """Cluster one seeded sample and return its mean within-cluster dispersion.

    A sample is drawn from ``sampler`` for the given ``seed``, optionally
    row-normalized (mirroring the estimator's own preprocessing flag),
    partitioned by ``kmeans``, and scored as the average over clusters of
    the mean pairwise distance between cluster members.
    """
    sample = sampler.get_sample(seed)
    if kmeans.normalize_rows:
        sample = normalize_rows(sample)
    labels = kmeans.fit_predict(sample)
    grouped = pd.DataFrame(sample).groupby(labels)
    within = [
        np.mean(dist.pdist(members.values, kmeans.distance))
        for _, members in grouped
    ]
    return float(np.mean(within))


@seeded(wrapped_requires_seed=True)
def gap(data: Data, kmeans: KMeans,
        n_jobs: int = None,
        seed: int = 0,
        n_trials: int = 100,
        return_deviation: bool = False,
        max_iter: int = 10) -> float:
    """Compute the gap statistic of ``kmeans`` for the clustered ``data``.

    Log-dispersion of the actual clustering is compared against the mean
    log-dispersion over ``n_trials`` uniform reference samples drawn from
    the bounding box of ``data``; the larger the gap, the better the
    clustering captures real structure.

    Parameters
    ----------
    data : Data
        The clustered observations.
    kmeans : KMeans
        Fitted estimator to evaluate; reference fits use a clone capped
        at ``max_iter`` iterations (cheap fits suffice for references).
    n_jobs : int, optional
        Parallelism for reference-sample evaluation (``maybe_pool``).
    seed, n_trials : int
        Seeds ``seed .. seed + n_trials - 1`` generate the references.
    return_deviation : bool
        When True, also return the standard deviation of the estimate.

    Returns
    -------
    tuple
        ``(gap,)`` or ``(gap, std)``.
        NOTE(review): annotated ``-> float`` but a tuple is returned —
        consider fixing the annotation.
    """
    reference_ = UniformSampler(n_rows=None, n_samples=n_trials).fit(data)
    kmeans_ = clone(kmeans)
    kmeans_.max_iter = max_iter
    with reference_.parallel() as r, maybe_pool(n_jobs) as pool:
        compute_disp = partial(_sampled_dispersion, sampler=r, kmeans=kmeans_)
        ref_disp = pool.map(compute_disp, range(seed, seed + n_trials))
    ref_disp = np.log(ref_disp)
    data_disp = np.log(_dispersion(data, kmeans))
    # Renamed from `gap` to avoid shadowing this function's own name.
    gap_value = np.mean(ref_disp) - data_disp
    result = (gap_value,)
    if return_deviation:
        # Tibshirani et al.'s correction: s_k = sd * sqrt(1 + 1/B).
        std = np.sqrt(1 + 1 / n_trials) * np.std(ref_disp)
        result += (std,)
    return result


class pipe:
    """Compose callables left-to-right.

    ``pipe(f, g, h)(x)`` evaluates ``h(g(f(x)))``: the first callable
    receives the original call arguments, each subsequent one receives
    the previous result.
    """

    def __init__(self, *functions):
        self.functions = functions

    def __call__(self, *args, **kwargs):
        head, tail = self.functions[0], self.functions[1:]
        result = head(*args, **kwargs)
        for stage in tail:
            result = stage(result)
        return result


class GapPicker(Picker):
def __init__(self, max_iter: int = 10, seed: int = 0, n_trials: int = 10,
correction: bool = True, n_jobs: int = 1):
Expand All @@ -92,12 +74,13 @@ def __init__(self, max_iter: int = 10, seed: int = 0, n_trials: int = 10,
self.correction = correction

def score(self, data: Data, estimators: List[KMeans]) -> np.ndarray:
n_jobs = get_n_jobs(self.n_jobs)
with context_if(self.n_jobs != 1, Pool, n_jobs) as pool:
gap_ = partial(gap, data, seed=self.seed, n_trials=self.n_trials,
return_deviation=True, pool=pool,
max_iter=self.max_iter)
scores = [gap_(kmeans=estimator) for estimator in estimators]
gap_ = partial(gap, data,
n_jobs=self.n_jobs,
seed=self.seed,
n_trials=self.n_trials,
return_deviation=True,
max_iter=self.max_iter)
scores = [gap_(kmeans=estimator) for estimator in estimators]
return np.array(scores)

def select(self, scores: np.ndarray) -> Optional[int]:
Expand Down
66 changes: 66 additions & 0 deletions divik/_score/_sampled_gap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from functools import partial
from typing import List, Union

import numpy as np
from sklearn.base import clone

from divik._utils import Data, maybe_pool
from divik._seeding import seeded
from divik.sampler import UniformSampler, StratifiedSampler
from divik._score._gap import _sampled_dispersion as _dispersion, GapPicker


KMeans = 'divik.KMeans'


@seeded(wrapped_requires_seed=True)
def sampled_gap(data: Data, kmeans: KMeans,
                sample_size: Union[int, float] = 1000,
                n_jobs: int = None,
                seed: int = 0,
                n_trials: int = 100,
                return_deviation: bool = False,
                max_iter: int = 10) -> float:
    """Estimate the gap statistic of ``kmeans`` from sub-samples of ``data``.

    Instead of clustering the full dataset, ``n_trials`` stratified
    samples of the clustered data (stratified by ``kmeans.labels_``) are
    compared against ``n_trials`` uniform reference samples, each with
    ``sample_size`` rows.  The gap is the difference of mean
    log-dispersions between references and data samples.

    NOTE(review): annotated ``-> float`` but returns ``(gap,)`` or
    ``(gap, std)`` depending on ``return_deviation`` — confirm and fix
    the annotation.
    """
    # TODO: Tests
    data_ = StratifiedSampler(n_rows=sample_size, n_samples=n_trials
                              ).fit(data, kmeans.labels_)
    reference_ = UniformSampler(n_rows=sample_size, n_samples=n_trials
                                ).fit(data)
    # Reference fits use a clone capped at max_iter — cheap fits suffice.
    kmeans_ = clone(kmeans)
    kmeans_.max_iter = max_iter
    # Both samplers expose parallel-safe handles while the pool is alive.
    with data_.parallel() as d, reference_.parallel() as r, \
            maybe_pool(n_jobs) as pool:
        compute_disp = partial(_dispersion, sampler=r, kmeans=kmeans_)
        ref_disp = pool.map(compute_disp, range(seed, seed + n_trials))
        compute_disp = partial(_dispersion, sampler=d, kmeans=kmeans_)
        data_disp = pool.map(compute_disp, range(seed, seed + n_trials))
    ref_disp = np.log(ref_disp)
    data_disp = np.log(data_disp)
    gap = np.mean(ref_disp) - np.mean(data_disp)
    result = (gap,)
    if return_deviation:
        # NOTE(review): `gap` in _gap.py scales std by sqrt(1 + 1/n_trials);
        # here the summed variances are divided by n_trials (not its square
        # root) — confirm this is the intended standard-error form.
        std = np.sqrt(np.var(ref_disp) + np.var(data_disp)) / n_trials
        result += (std,)
    return result


class SamplingGapPicker(GapPicker):
    """Gap-statistic picker that scores on sub-samples of the data.

    Behaves like ``GapPicker`` but estimates dispersions from
    ``sample_size``-row samples (via ``sampled_gap``) instead of the
    full dataset.
    """

    def __init__(self, sample_size: int = 1000, max_iter: int = 10,
                 seed: int = 0, n_trials: int = 10,
                 correction: bool = True, n_jobs: int = 1):
        super().__init__(max_iter=max_iter,
                         seed=seed,
                         n_trials=n_trials,
                         correction=correction,
                         n_jobs=n_jobs)
        self.sample_size = sample_size

    def score(self, data: Data, estimators: List[KMeans]) -> np.ndarray:
        """Return gap scores (value and deviation) for each estimator."""
        evaluate = partial(sampled_gap, data,
                           sample_size=self.sample_size,
                           n_jobs=self.n_jobs,
                           seed=self.seed,
                           n_trials=self.n_trials,
                           return_deviation=True,
                           max_iter=self.max_iter)
        return np.array([evaluate(kmeans=estimator)
                         for estimator in estimators])
27 changes: 27 additions & 0 deletions divik/_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from contextlib import contextmanager
from multiprocessing import Pool
from typing import Callable, Tuple, NamedTuple, List, Optional

import numpy as np
Expand Down Expand Up @@ -54,3 +55,29 @@ def context_if(condition, context, *args, **kwargs):
yield c
else:
yield None


class DummyPool:
    """In-process, sequential stand-in for ``multiprocessing.Pool``.

    Executes everything synchronously in the calling process, so callers
    can use a single Pool-like interface regardless of whether
    parallelism is enabled.  Constructor arguments are accepted (for
    signature compatibility with ``Pool``) and ignored.
    """

    def __init__(self, *args, **kwargs):
        pass

    def apply(self, func, args=(), kwds=None):
        # Defaults mirror multiprocessing.Pool.apply(func, args=(), kwds={});
        # None is used as a sentinel to avoid a shared mutable default.
        return func(*args, **({} if kwds is None else kwds))

    # noinspection PyUnusedLocal
    def map(self, func, iterable, chunksize=None):
        # chunksize is accepted for Pool compatibility and ignored.
        return [func(v) for v in iterable]

    # noinspection PyUnusedLocal
    def starmap(self, func, iterable, chunksize=None):
        # chunksize is accepted for Pool compatibility and ignored.
        return [func(*v) for v in iterable]


@contextmanager
def maybe_pool(processes: int = None, *args, **kwargs):
    """Yield a worker pool, or an inline ``DummyPool`` when not worth it.

    ``processes`` is resolved through ``get_n_jobs``; when the resolved
    job count is 0 or 1, no worker processes are spawned and a
    sequential ``DummyPool`` with the same interface is yielded instead.
    Extra arguments are forwarded to whichever pool is created.
    """
    n_jobs = get_n_jobs(processes)
    if n_jobs in (0, 1):
        # Sequential fallback — identical interface, no process overhead.
        yield DummyPool(n_jobs, *args, **kwargs)
    else:
        with Pool(n_jobs, *args, **kwargs) as pool:
            yield pool

0 comments on commit 92b8427

Please sign in to comment.