Merge pull request #62 from gmrukwa/develop

Release v2.5.5 introduces:
- pipeline saving
- fixes for SpectralClustering
- EXIMS feature selector
gmrukwa · Apr 26, 2020 · a7e60cd · a7e60cd
2 parents f951d93 + a4631e8
commit a7e60cd
Show file tree

Hide file tree

Showing 17 changed files with 439 additions and 60 deletions.
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -12,7 +12,7 @@ on:
 env:
   MAJOR: ${{ 2 }}
   MINOR: ${{ 5 }}
-  FIXUP: ${{ 4 }}
+  FIXUP: ${{ 5 }}
   PACKAGE_INIT_FILE: ${{ 'divik/__init__.py' }}
   PACKAGE_INIT_FILE_VERSION_LINE: ${{ 1 }}
   PACKAGE_SETUP_FILE: ${{ 'setup.py' }}

diff --git a/README.md b/README.md
@@ -40,7 +40,7 @@ docker pull gmrukwa/divik
 To install specific version, you can specify it in the command, e.g.:
 
 ```bash
-docker pull gmrukwa/divik:2.5.4
+docker pull gmrukwa/divik:2.5.5
 ```
 
 ## Python package
@@ -79,7 +79,7 @@ pip install divik
 or any stable tagged version, e.g.:
 
 ```bash
-pip install divik==2.5.4
+pip install divik==2.5.5
 ```
 
 If you want to have compatibility with

diff --git a/divik/__init__.py b/divik/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '2.5.4'
+__version__ = '2.5.5'
 
 from ._summary import plot, reject_split
 

diff --git a/divik/_cli/_model_io.py b/divik/_cli/_model_io.py
@@ -5,6 +5,7 @@
 
 import numpy as np
 import pandas as pd
+from sklearn.pipeline import Pipeline
 
 from divik.core import configurable, DivikResult
 
@@ -104,3 +105,40 @@ def save_cluster_paths(model, destination, **kwargs):
         'path': rev,
         'cluster_number': list(model.reverse_paths_.values())
     }).to_csv(os.path.join(destination, 'paths.csv'))
+
+@saver
+def save_pipeline(model, destination, **kwargs):
+    if not isinstance(model, Pipeline):
+        return
+    feature_selector = model[:-1]
+    clustering = model[-1]
+    if isinstance(clustering, Pipeline):
+        logging.info('Saving pre-extractor pickle.')
+        with open(os.path.join(destination, 'feature_pre_extractor.pkl'), 'wb') as pkl:
+            pickle.dump(feature_selector, pkl)
+        return save(clustering, destination, **kwargs)
+    logging.info('Saving model pickle.')
+    with open(os.path.join(destination, 'feature_selector.pkl'), 'wb') as pkl:
+        pickle.dump(feature_selector, pkl)
+    save(clustering, destination, **kwargs)
+    if not os.path.exists(os.path.join(destination, 'summary.json')):
+        logging.info("Saving JSON summary.")
+        with open(os.path.join(destination, 'summary.json'), 'w') as smr:
+            json.dump({
+                "depth": 1,
+                "number_of_clusters": int(clustering.n_clusters_),
+                "mean_cluster_size": \
+                    clustering.labels_.size / float(clustering.n_clusters_)
+            }, smr)
+    if not os.path.exists(os.path.join(destination, 'final_partition.npy')):
+        logging.info("Saving final partition.")
+        np.save(os.path.join(destination, 'final_partition.npy'), clustering.labels_)
+        np.savetxt(os.path.join(destination, 'final_partition.csv'), clustering.labels_,
+                delimiter=', ', fmt='%i')
+    if not os.path.exists(os.path.join(destination, 'partition-0.png')):
+        from .divik import save_merged
+        save_merged(
+            destination,
+            clustering.labels_.reshape(-1, 1),
+            xy=kwargs.get('xy', None)
+        )
diff --git a/divik/_cli/fit_clusters.py b/divik/_cli/fit_clusters.py
@@ -25,6 +25,7 @@ def load_xy(path=None):
 @gin.configurable
 def experiment(
     model=gin.REQUIRED,
+    steps_that_require_xy = None,
     destination: str = 'result',
     omit_datetime: bool = False,
     verbose: bool = False,
@@ -39,7 +40,10 @@ def experiment(
     xy = load_xy()
     # repeated dump just because the dataset locations are not tracked
     dump_gin_args(destination)
-    model.fit(data)
+    if steps_that_require_xy is None:
+        steps_that_require_xy = []
+    kwargs = {f'{step}__xy': xy for step in steps_that_require_xy}
+    model.fit(data, **kwargs)
     save(model, destination, xy=xy)
 
 

diff --git a/divik/feature_extraction/_spectral.py b/divik/feature_extraction/_spectral.py
@@ -9,47 +9,6 @@
 from divik.core import configurable, Data
 
 
-def locally_adjusted_affinity(X: Data, d: str, neighbors: int = 7) -> Data:
-    """Calculate affinity with local density correction
-
-    Calculate affinity matrix based on input coordinates matrix and the number
-    of nearest neighbors.
-
-    Apply local scaling based on the k nearest neighbor
-
-    Parameters
-    ----------
-    X : array-like or sparse matrix, shape=(n_samples, n_features)
-        Training instances to cluster.
-
-    d : str
-        Measure of distance between points.
-
-    neighbors : int
-        The number of neighbors considered a local neighborhood.
-
-    Returns
-    -------
-
-    affinity : array, shape [n_samples, n_samples]
-        Adjusted affinity matrix.
-
-    References:
-    ----------
-    https://towardsdatascience.com/spectral-graph-clustering-and-optimal-number-of-clusters-estimation-32704189afbe
-    https://papers.nips.cc/paper/2619-self-tuning-spectral-clustering.pdf
-
-    """
-    distances = dist.pdist(X, metric=d)
-    knn_distances = np.sort(distances, axis=0)[neighbors].reshape(-1, 1)
-    local_scale = knn_distances.dot(knn_distances.T)
-    affinity = - distances ** 2 / local_scale
-    affinity[np.isnan(affinity)] = 0
-    affinity = np.exp(affinity)
-    np.fill_diagonal(affinity, 0)
-    return affinity
-
-
 @configurable
 class LocallyAdjustedRbfSpectralEmbedding(BaseEstimator):
     """Spectral embedding for non-linear dimensionality reduction.
@@ -143,18 +102,22 @@ def fit(self, X, y=None):
             Returns the instance itself.
         """
         logging.debug('Computing locally adjusted affinities.')
-        affinity_matrix_ = locally_adjusted_affinity(
-            X, self.distance, self.n_neighbors)
+        d = dist.squareform(dist.pdist(X, metric=self.distance))
+
+        if 0 <= self.n_components <= 1:
+            n_components = max(int(self.n_components * X.shape[1]), 1)
+        else:
+            n_components = self.n_components
 
         logging.debug('Computing embedding of affinities.')
-        embedder = SpectralEmbedding(n_components=self.n_components,
-                                     affinity='precomputed',
-                                     gamma=None,
-                                     random_state=self.random_state,
-                                     eigen_solver=self.eigen_solver,
-                                     n_neighbors=self.n_neighbors,
-                                     n_jobs=self.n_jobs)
-        self.embedding_ = embedder.fit_transform(affinity_matrix_)
+        embedder = SpectralEmbedding(n_components=n_components,
+                                    affinity='precomputed_nearest_neighbors',
+                                    gamma=None,
+                                    random_state=self.random_state,
+                                    eigen_solver=self.eigen_solver,
+                                    n_neighbors=self.n_neighbors,
+                                    n_jobs=self.n_jobs)
+        self.embedding_ = embedder.fit_transform(d)
         return self
 
     def fit_transform(self, X, y=None):
@@ -173,6 +136,12 @@ def fit_transform(self, X, y=None):
         X_new : array-like, shape (n_samples, n_components)
         """
         return self.fit(X).embedding_
+
+    def transform(self, X, y=None):
+        if not hasattr(self, 'embedding_') \
+                or self.embedding_.shape[0] != X.shape[0]:
+            self.fit(X, y)
+        return self.embedding_
 
     def save(self, destination: str):
         """Save embedding to a directory

diff --git a/divik/feature_selection/__init__.py b/divik/feature_selection/__init__.py
@@ -1,5 +1,10 @@
 """Unsupervised feature selection methods"""
-from ._stat_selector_mixin import StatSelectorMixin, NoSelector
+from ._stat_selector_mixin import (
+    SelectorMixin,
+    StatSelectorMixin,
+    NoSelector,
+)
+from ._exims import EximsSelector
 from ._gmm_selector import GMMSelector
 from ._outlier import (
     huberta_outliers,
@@ -14,8 +19,10 @@
 
 
 __all__ = [
+    'SelectorMixin',
     'StatSelectorMixin',
     'NoSelector',
+    'EximsSelector',
     'GMMSelector',
     'huberta_outliers',
     'OutlierSelector',

diff --git a/divik/feature_selection/_exims/__init__.py b/divik/feature_selection/_exims/__init__.py
@@ -0,0 +1 @@
+from ._sklearn import EximsSelector
diff --git a/divik/feature_selection/_exims/_exims.py b/divik/feature_selection/_exims/_exims.py
@@ -0,0 +1,99 @@
+from functools import partial
+from multiprocessing import Pool
+from typing import Callable, Tuple
+
+import numpy as np
+from tqdm import tqdm
+
+from divik.feature_selection._exims._structness import structness
+
+
+class pipe:
+    def __init__(self, *functions):
+        self.functions = functions
+
+    def __call__(self, *args, **kwargs):
+        result = self.functions[0](*args, **kwargs)
+        for func in self.functions[1:]:
+            result = func(result)
+        return result
+
+
+def progress_bar(description: str=None):
+    return partial(tqdm, desc=description)
+
+
+def pmap(func, collection, **kwargs):
+    with Pool() as pool:
+        return pool.map(func, collection, **kwargs)
+
+
+def apply(func, collection):
+    return [func(element) for element in collection]
+
+
+def for_each(func, lazy: bool=True, parallel: bool=False, **kwargs):
+    if parallel:
+        return partial(pmap, func, **kwargs)
+    if lazy:
+        return partial(map, func)
+    else:
+        return partial(apply, func)
+
+
+def as_image(data: np.ndarray, x: np.ndarray, y: np.ndarray, default=-1) -> \
+        np.ndarray:
+    x, y = x.astype(int), y.astype(int)
+    translated_x, translated_y = x - np.min(x), y - np.min(y)
+    rows, columns = int(np.max(translated_y) + 1), int(np.max(translated_x) + 1)
+    if len(data.shape) < 2:
+        data = data.reshape((data.shape[0], 1))
+    cube = default * np.ones((rows, columns, data.shape[1]))
+    cube[translated_y, translated_x] = data
+    return cube
+
+
+_IGNORED = -1
+_Feature = np.ndarray
+_Structness = float
+_FeatureProcessor = Callable[[_Feature], Tuple[_Structness, _Structness]]
+_remove_channel_dimension = partial(np.squeeze, axis=2)
+
+
+def _feature_processor(x: np.ndarray, y: np.ndarray) -> _FeatureProcessor:
+    # noinspection PyTypeChecker
+    return pipe(
+        partial(as_image, x=x, y=y, default=_IGNORED),
+        _remove_channel_dimension,
+        partial(structness, ignored=[_IGNORED])
+    )
+
+
+def _normalize_columns(matrix) -> np.ndarray:
+    matrix = np.array(matrix, dtype=float)
+    matrix += np.finfo(float).eps
+    assert len(matrix.shape) == 2
+    return matrix / np.max(matrix, axis=0)
+
+
+_as_features = np.transpose
+_normalize_structness_by_kind = _normalize_columns
+_sumarize_structness_by_feature = pipe(partial(np.sum, axis=1), np.ravel)
+FeaturesStructness = np.ndarray
+_StructnessEstimator = Callable[[np.ndarray], FeaturesStructness]
+
+
+def _estimator(structness_: _FeatureProcessor) -> _StructnessEstimator:
+    # noinspection PyTypeChecker
+    return pipe(
+        _as_features,
+        progress_bar('feature structness'),
+        for_each(structness_, parallel=True),
+        _normalize_structness_by_kind,
+        _sumarize_structness_by_feature
+    )
+
+
+def exims(data: np.ndarray, x: np.ndarray, y: np.ndarray) -> FeaturesStructness:
+    structness_estimator = _estimator(_feature_processor(x, y))
+    return structness_estimator(data)
diff --git a/divik/feature_selection/_exims/_matlab_alike.py b/divik/feature_selection/_exims/_matlab_alike.py
@@ -0,0 +1,58 @@
+import numpy as np
+
+
+def quantile(values, quantiles):
+    """Compute MATLAB-alike quantiles
+
+    Arguments:
+        values - (np.ndarray) Input array or object that can be converted to an
+        array.
+        quantiles - (float, np.ndarray) float in range of [0,1] (or sequence
+        of floats). Quantile to compute, which must be between 0 and 1
+        inclusive.
+
+
+    location of first element as a quantile -> 0.5 / n
+    location of last element as a quantile -> (n - 0.5) / n
+
+         ^
+         |     (n-0.5) / n    1.0
+     1.0 ----------------|---|
+         |              /
+         |             /
+         |            /
+         |           /                  <- this is how quantiles look in MATLAB
+         |          /
+         |         /
+         |        /
+         |       /
+     0.0 ----|---|------------->
+         | 0.0   0.5 / n
+
+
+    y = ax + b
+
+
+    0 = a * 0.5 / n + b         <- 0th element is treated as 0.5 / n quantile
+    1 = a * (n - 0.5) / n + b   <- last element is treated as (n - 0.5) / n quantile
+
+    a = n / (n - 1)
+    b = - 0.5 / (n - 1)
+
+    """
+    values = np.array(values)
+    n = float(values.size)
+    a = n / (n - 1)
+    b = - 0.5 / (n - 1)
+    quantiles = np.array(quantiles)
+    matlab_alike_quantiles = np.clip(a * quantiles + b, a_min=0.0, a_max=1.0)
+    return np.percentile(values, q=100. * matlab_alike_quantiles)
+
+
+def n_quantiles(values, N, unbiased=True, backend=quantile):
+    return backend(values, np.arange(1, N+1, dtype=float) / (N + int(unbiased)))
+
+
+def iqr(values, rng=(25, 75)):
+    q1, q3 = quantile(values, .01 * np.array(rng))
+    return q3 - q1