Skip to content

Commit

Permalink
Merge pull request #62 from gmrukwa/develop
Browse files Browse the repository at this point in the history
Release v2.5.5 introduces:
- pipeline saving
- fixes for SpectralClustering
- EXIMS feature selector
  • Loading branch information
gmrukwa committed Apr 26, 2020
2 parents f951d93 + a4631e8 commit a7e60cd
Show file tree
Hide file tree
Showing 17 changed files with 439 additions and 60 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ on:
env:
MAJOR: ${{ 2 }}
MINOR: ${{ 5 }}
FIXUP: ${{ 4 }}
FIXUP: ${{ 5 }}
PACKAGE_INIT_FILE: ${{ 'divik/__init__.py' }}
PACKAGE_INIT_FILE_VERSION_LINE: ${{ 1 }}
PACKAGE_SETUP_FILE: ${{ 'setup.py' }}
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ docker pull gmrukwa/divik
To install a specific version, you can specify it in the command, e.g.:

```bash
docker pull gmrukwa/divik:2.5.4
docker pull gmrukwa/divik:2.5.5
```

## Python package
Expand Down Expand Up @@ -79,7 +79,7 @@ pip install divik
or any stable tagged version, e.g.:

```bash
pip install divik==2.5.4
pip install divik==2.5.5
```

If you want to have compatibility with
Expand Down
2 changes: 1 addition & 1 deletion divik/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '2.5.4'
__version__ = '2.5.5'

from ._summary import plot, reject_split

Expand Down
38 changes: 38 additions & 0 deletions divik/_cli/_model_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline

from divik.core import configurable, DivikResult

Expand Down Expand Up @@ -104,3 +105,40 @@ def save_cluster_paths(model, destination, **kwargs):
'path': rev,
'cluster_number': list(model.reverse_paths_.values())
}).to_csv(os.path.join(destination, 'paths.csv'))

@saver
def save_pipeline(model, destination, **kwargs):
    """Save a scikit-learn ``Pipeline`` whose last step is the clustering.

    Nothing is done unless ``model`` is a ``Pipeline``. Every step except the
    last one is pickled as a single object and the last step is handed over
    to ``save``:

    - when the last step is itself a ``Pipeline``, the leading steps are
      written to ``feature_pre_extractor.pkl`` and the result of ``save`` is
      returned;
    - otherwise the leading steps are written to ``feature_selector.pkl`` and,
      after ``save`` has run, ``summary.json``, ``final_partition.npy`` (with
      its ``.csv`` twin) and the merged partition image are produced from the
      last step's ``labels_`` / ``n_clusters_`` whenever the corresponding
      file is still missing in ``destination``.

    Parameters
    ----------
    model
        Candidate object to save.
    destination
        Directory that receives all files.
    **kwargs
        Forwarded to ``save``; the optional ``xy`` entry is also passed to
        ``save_merged``.
    """
    if not isinstance(model, Pipeline):
        return

    leading_steps = model[:-1]
    last_step = model[-1]

    def in_destination(filename):
        return os.path.join(destination, filename)

    if isinstance(last_step, Pipeline):
        logging.info('Saving pre-extractor pickle.')
        with open(in_destination('feature_pre_extractor.pkl'), 'wb') as pkl:
            pickle.dump(leading_steps, pkl)
        return save(last_step, destination, **kwargs)

    logging.info('Saving model pickle.')
    with open(in_destination('feature_selector.pkl'), 'wb') as pkl:
        pickle.dump(leading_steps, pkl)
    save(last_step, destination, **kwargs)

    summary_path = in_destination('summary.json')
    if not os.path.exists(summary_path):
        logging.info("Saving JSON summary.")
        with open(summary_path, 'w') as smr:
            summary = {
                "depth": 1,
                "number_of_clusters": int(last_step.n_clusters_),
                "mean_cluster_size":
                    last_step.labels_.size / float(last_step.n_clusters_),
            }
            json.dump(summary, smr)

    partition_path = in_destination('final_partition.npy')
    if not os.path.exists(partition_path):
        logging.info("Saving final partition.")
        np.save(partition_path, last_step.labels_)
        np.savetxt(in_destination('final_partition.csv'), last_step.labels_,
                   delimiter=', ', fmt='%i')

    if not os.path.exists(in_destination('partition-0.png')):
        # Imported here, as in the original, rather than at module level.
        from .divik import save_merged
        save_merged(
            destination,
            last_step.labels_.reshape(-1, 1),
            xy=kwargs.get('xy', None),
        )
6 changes: 5 additions & 1 deletion divik/_cli/fit_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def load_xy(path=None):
@gin.configurable
def experiment(
model=gin.REQUIRED,
steps_that_require_xy = None,
destination: str = 'result',
omit_datetime: bool = False,
verbose: bool = False,
Expand All @@ -39,7 +40,10 @@ def experiment(
xy = load_xy()
# repeated dump just because the dataset locations are not tracked
dump_gin_args(destination)
model.fit(data)
if steps_that_require_xy is None:
steps_that_require_xy = []
kwargs = {f'{step}__xy': xy for step in steps_that_require_xy}
model.fit(data, **kwargs)
save(model, destination, xy=xy)


Expand Down
71 changes: 20 additions & 51 deletions divik/feature_extraction/_spectral.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,47 +9,6 @@
from divik.core import configurable, Data


def locally_adjusted_affinity(X: Data, d: str, neighbors: int = 7) -> Data:
    """Calculate affinity with local density correction.

    Calculate the affinity matrix based on the input coordinates matrix and
    the number of nearest neighbors. Local scaling is applied based on the
    distance to the k-th nearest neighbor of each sample.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        Training instances to cluster.
    d : str
        Measure of distance between points, passed to
        ``scipy.spatial.distance.pdist`` as ``metric``.
    neighbors : int
        The number of neighbors considered a local neighborhood. Must be
        smaller than ``n_samples``.

    Returns
    -------
    affinity : array, shape [n_samples, n_samples]
        Adjusted affinity matrix with a zero diagonal.

    References
    ----------
    https://towardsdatascience.com/spectral-graph-clustering-and-optimal-number-of-clusters-estimation-32704189afbe
    https://papers.nips.cc/paper/2619-self-tuning-spectral-clustering.pdf
    """
    # pdist returns a condensed 1-D vector; the per-sample neighbour lookup
    # and the diagonal fill below need the full square matrix.
    distances = dist.squareform(dist.pdist(X, metric=d))
    # Row `neighbors` of the column-wise sorted matrix is the distance from
    # each sample to its k-th nearest neighbour (row 0 is the sample itself).
    knn_distances = np.sort(distances, axis=0)[neighbors].reshape(-1, 1)
    local_scale = knn_distances.dot(knn_distances.T)
    # A zero local scale yields 0/0 (NaN, zeroed below) or -inf (exp -> 0);
    # both are handled, so the floating-point warnings are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        affinity = - distances ** 2 / local_scale
    affinity[np.isnan(affinity)] = 0
    affinity = np.exp(affinity)
    np.fill_diagonal(affinity, 0)
    return affinity


@configurable
class LocallyAdjustedRbfSpectralEmbedding(BaseEstimator):
"""Spectral embedding for non-linear dimensionality reduction.
Expand Down Expand Up @@ -143,18 +102,22 @@ def fit(self, X, y=None):
Returns the instance itself.
"""
logging.debug('Computing locally adjusted affinities.')
affinity_matrix_ = locally_adjusted_affinity(
X, self.distance, self.n_neighbors)
d = dist.squareform(dist.pdist(X, metric=self.distance))

if 0 <= self.n_components <= 1:
n_components = max(int(self.n_components * X.shape[1]), 1)
else:
n_components = self.n_components

logging.debug('Computing embedding of affinities.')
embedder = SpectralEmbedding(n_components=self.n_components,
affinity='precomputed',
gamma=None,
random_state=self.random_state,
eigen_solver=self.eigen_solver,
n_neighbors=self.n_neighbors,
n_jobs=self.n_jobs)
self.embedding_ = embedder.fit_transform(affinity_matrix_)
embedder = SpectralEmbedding(n_components=n_components,
affinity='precomputed_nearest_neighbors',
gamma=None,
random_state=self.random_state,
eigen_solver=self.eigen_solver,
n_neighbors=self.n_neighbors,
n_jobs=self.n_jobs)
self.embedding_ = embedder.fit_transform(d)
return self

def fit_transform(self, X, y=None):
Expand All @@ -173,6 +136,12 @@ def fit_transform(self, X, y=None):
X_new : array-like, shape (n_samples, n_components)
"""
return self.fit(X).embedding_

def transform(self, X, y=None):
    """Return the stored embedding, fitting on ``X`` first when needed.

    ``fit`` is called when no ``embedding_`` has been computed yet or when
    the stored embedding has a different number of rows than ``X``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : Ignored by this method; forwarded to ``fit``.

    Returns
    -------
    X_new : array-like
        The ``embedding_`` attribute.
    """
    already_fitted = hasattr(self, 'embedding_')
    if not already_fitted or self.embedding_.shape[0] != X.shape[0]:
        self.fit(X, y)
    return self.embedding_

def save(self, destination: str):
"""Save embedding to a directory
Expand Down
9 changes: 8 additions & 1 deletion divik/feature_selection/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
"""Unsupervised feature selection methods"""
from ._stat_selector_mixin import StatSelectorMixin, NoSelector
from ._stat_selector_mixin import (
SelectorMixin,
StatSelectorMixin,
NoSelector,
)
from ._exims import EximsSelector
from ._gmm_selector import GMMSelector
from ._outlier import (
huberta_outliers,
Expand All @@ -14,8 +19,10 @@


__all__ = [
'SelectorMixin',
'StatSelectorMixin',
'NoSelector',
'EximsSelector',
'GMMSelector',
'huberta_outliers',
'OutlierSelector',
Expand Down
1 change: 1 addition & 0 deletions divik/feature_selection/_exims/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from ._sklearn import EximsSelector
99 changes: 99 additions & 0 deletions divik/feature_selection/_exims/_exims.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
from functools import partial
from multiprocessing import Pool
from typing import Callable, Tuple

import numpy as np
from tqdm import tqdm

from divik.feature_selection._exims._structness import structness


class pipe:
    """Left-to-right composition of callables.

    The first callable receives the call arguments as given; every following
    callable receives the single result of its predecessor.
    """

    def __init__(self, *functions):
        self.functions = functions

    def __call__(self, *args, **kwargs):
        entry = self.functions[0]
        value = entry(*args, **kwargs)
        for stage in self.functions[1:]:
            value = stage(value)
        return value


def progress_bar(description: str = None):
    """Build a ``tqdm`` wrapper with ``desc`` preset to ``description``."""
    described_tqdm = partial(tqdm, desc=description)
    return described_tqdm


def pmap(func, collection, **kwargs):
    """Map ``func`` over ``collection`` using a ``multiprocessing.Pool``.

    The pool is created with default settings and is closed when the call
    finishes. Extra keyword arguments are passed on to ``Pool.map``.
    """
    with Pool() as workers:
        mapped = workers.map(func, collection, **kwargs)
    return mapped


def apply(func, collection):
    """Eagerly call ``func`` on every element and return the results as a list."""
    results = []
    for item in collection:
        results.append(func(item))
    return results


def for_each(func, lazy: bool = True, parallel: bool = False, **kwargs):
    """Return a callable that maps ``func`` over a collection.

    ``parallel`` takes precedence and selects ``pmap`` (with ``kwargs``
    bound to it). Otherwise ``lazy`` chooses between the built-in ``map``
    and the eager ``apply``; ``kwargs`` are not used in those two cases.
    """
    if parallel:
        return partial(pmap, func, **kwargs)
    mapper = map if lazy else apply
    return partial(mapper, func)


def as_image(data: np.ndarray, x: np.ndarray, y: np.ndarray, default=-1) -> \
        np.ndarray:
    """Scatter per-point values onto a dense (rows, columns, channels) cube.

    Coordinates are cast to ``int`` and shifted so that the smallest ``x``
    and ``y`` land at index 0. One-dimensional ``data`` is treated as a
    single channel. Cells that no ``(x, y)`` pair points at keep ``default``.
    """
    column_idx = x.astype(int)
    row_idx = y.astype(int)
    column_idx = column_idx - np.min(column_idx)
    row_idx = row_idx - np.min(row_idx)
    height = int(np.max(row_idx) + 1)
    width = int(np.max(column_idx) + 1)
    if len(data.shape) < 2:
        data = data.reshape((data.shape[0], 1))
    image = default * np.ones((height, width, data.shape[1]))
    image[row_idx, column_idx] = data
    return image


# Value that `_feature_processor` passes to `as_image` as `default` and to
# `structness` inside its `ignored` list.
_IGNORED = -1
# Type aliases used by the annotations in this module.
_Feature = np.ndarray
_Structness = float
_FeatureProcessor = Callable[[_Feature], Tuple[_Structness, _Structness]]
# `np.squeeze` preset to remove axis 2 of its argument.
_remove_channel_dimension = partial(np.squeeze, axis=2)


def _feature_processor(x: np.ndarray, y: np.ndarray) -> _FeatureProcessor:
    """Compose the per-feature chain: image -> squeeze axis 2 -> structness.

    ``x`` and ``y`` are bound to ``as_image``; ``_IGNORED`` is used both as
    the image fill value and as the value ``structness`` is told to ignore.
    """
    to_image = partial(as_image, x=x, y=y, default=_IGNORED)
    measure = partial(structness, ignored=[_IGNORED])
    # noinspection PyTypeChecker
    return pipe(to_image, _remove_channel_dimension, measure)


def _normalize_columns(matrix) -> np.ndarray:
    """Divide every column of a 2-D matrix by that column's maximum.

    The input is copied to a float array and shifted by machine epsilon
    before the division. Anything that is not two-dimensional trips the
    assertion.
    """
    shifted = np.array(matrix, dtype=float)
    shifted += np.finfo(float).eps
    assert len(shifted.shape) == 2
    column_peaks = np.max(shifted, axis=0)
    return shifted / column_peaks


# Descriptive aliases for the steps that `_estimator` chains together.
_as_features = np.transpose
_normalize_structness_by_kind = _normalize_columns
# Sums along axis 1, then flattens the result with `np.ravel`.
_sumarize_structness_by_feature = pipe(partial(np.sum, axis=1), np.ravel)
# Type aliases used by the annotations below.
FeaturesStructness = np.ndarray
_StructnessEstimator = Callable[[np.ndarray], FeaturesStructness]


def _estimator(structness_: _FeatureProcessor) -> _StructnessEstimator:
    """Assemble the full estimator around a per-feature processor.

    The returned ``pipe`` transposes its input, wraps it in a progress bar,
    maps ``structness_`` over it in parallel, normalizes the columns of the
    collected results and finally sums each row.
    """
    stages = (
        _as_features,
        progress_bar('feature structness'),
        for_each(structness_, parallel=True),
        _normalize_structness_by_kind,
        _sumarize_structness_by_feature,
    )
    # noinspection PyTypeChecker
    return pipe(*stages)


def exims(data: np.ndarray, x: np.ndarray, y: np.ndarray) -> FeaturesStructness:
    """Run the estimator built for coordinates ``x``, ``y`` on ``data``."""
    per_feature = _feature_processor(x, y)
    return _estimator(per_feature)(data)
58 changes: 58 additions & 0 deletions divik/feature_selection/_exims/_matlab_alike.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import numpy as np


def quantile(values, quantiles):
    """Compute MATLAB-alike quantiles.

    The i-th smallest of ``n`` values (counting from 1) is treated as the
    ``(i - 0.5) / n`` quantile, values in between are interpolated linearly
    and requests outside ``[0.5 / n, (n - 0.5) / n]`` are clamped to the
    smallest / largest value.

    ``np.percentile`` places the first element at 0 and the last at 1, so
    the requested quantile ``x`` is remapped with ``y = a * x + b`` where::

        0 = a * 0.5 / n + b        # first element
        1 = a * (n - 0.5) / n + b  # last element

    which gives ``a = n / (n - 1)`` and ``b = -0.5 / (n - 1)``.

    Parameters
    ----------
    values : array-like
        Input array or object that can be converted to an array.
    quantiles : float or array-like of float
        Quantile(s) to compute, each between 0 and 1 inclusive.

    Returns
    -------
    Scalar or ndarray shaped like ``quantiles``.
    """
    values = np.asarray(values)
    quantiles = np.asarray(quantiles)
    n = float(values.size)
    if n == 1:
        # The remapping divides by (n - 1); a single value is every quantile
        # of itself, so ask for the 0th percentile for each request.
        positions = np.zeros_like(quantiles, dtype=float)
    else:
        a = n / (n - 1)
        b = - 0.5 / (n - 1)
        positions = np.clip(a * quantiles + b, a_min=0.0, a_max=1.0)
    return np.percentile(values, q=100. * positions)


def n_quantiles(values, N, unbiased=True, backend=quantile):
    """Compute ``N`` quantiles of ``values`` at levels ``i / (N + unbiased)``.

    ``i`` runs from 1 to ``N``; ``unbiased`` is converted with ``int`` and
    added to the denominator. The levels are evaluated by ``backend``.
    """
    denominator = N + int(unbiased)
    levels = np.arange(1, N + 1, dtype=float) / denominator
    return backend(values, levels)


def iqr(values, rng=(25, 75)):
    """Return the spread between two percentiles of ``values``.

    ``rng`` holds the lower and upper percentile (in percent); both are
    evaluated with ``quantile`` and the lower result is subtracted from the
    upper one.
    """
    lower, upper = quantile(values, .01 * np.array(rng))
    return upper - lower

0 comments on commit a7e60cd

Please sign in to comment.