Commit
Merge pull request #34 from gmrukwa/develop
Release v2.3.13
gmrukwa committed Jan 11, 2020
2 parents 7664288 + 0ed3cf5 commit ce2ceab
Showing 38 changed files with 1,392 additions and 961 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/deploy.yml
@@ -12,7 +12,7 @@ on:
env:
MAJOR: ${{ 2 }}
MINOR: ${{ 3 }}
FIXUP: ${{ 12 }}
FIXUP: ${{ 13 }}
PACKAGE_INIT_FILE: ${{ 'divik/__init__.py' }}
DOCKER_REPO: ${{ 'gmrukwa/divik' }}
IS_ALPHA: ${{ github.event_name == 'pull_request' }}
4 changes: 2 additions & 2 deletions README.md
@@ -39,7 +39,7 @@ docker pull gmrukwa/divik
To install a specific version, you can specify it in the command, e.g.:

```bash
docker pull gmrukwa/divik:2.3.12
docker pull gmrukwa/divik:2.3.13
```

## Python package
@@ -59,7 +59,7 @@ pip install divik
or any stable tagged version, e.g.:

```bash
pip install divik==2.3.12
pip install divik==2.3.13
```

# References
2 changes: 1 addition & 1 deletion divik/__init__.py
@@ -1,4 +1,4 @@
__version__ = '2.3.12'
__version__ = '2.3.13'

from ._seeding import seeded
from ._utils import DivikResult
22 changes: 6 additions & 16 deletions divik/_cli/auto_kmeans.py
@@ -6,32 +6,25 @@
from typing import List, Tuple

import numpy as np
import pandas as pd
import skimage.io as sio

from divik.cluster import AutoKMeans
import divik._score
from divik.cluster import GAPSearch, KMeans
import divik._cli._utils as scr
import divik._utils as u


Segmentations = List[Tuple[u.IntLabels, u.Centroids]]


def get_segmentations(kmeans: AutoKMeans) -> Segmentations:
def get_segmentations(kmeans: GAPSearch) -> Segmentations:
return [(est.labels_, est.cluster_centers_) for est in kmeans.estimators_]


def make_segmentations_matrix(kmeans: AutoKMeans) -> np.ndarray:
def make_segmentations_matrix(kmeans: GAPSearch) -> np.ndarray:
return np.hstack([e.labels_.reshape(-1, 1) for e in kmeans.estimators_])


def make_scores_report(kmeans: AutoKMeans, n_jobs: int = 1) -> pd.DataFrame:
picker = divik._score.make_picker(kmeans.method, n_jobs, kmeans.gap)
return picker.report(kmeans.estimators_, kmeans.scores_)


def save(kmeans: AutoKMeans, destination: str, xy: np.ndarray=None):
def save(kmeans: GAPSearch, destination: str, xy: np.ndarray=None):
logging.info("Saving result.")

logging.info("Saving model.")
@@ -61,15 +54,12 @@ def save(kmeans: AutoKMeans, destination: str, xy: np.ndarray=None):
sio.imsave(fname('partitions.{0}.png').format(kmeans.n_clusters_),
visualization)

logging.info("Saving scores.")
report = make_scores_report(kmeans, n_jobs=-1)
report.to_csv(fname('scores.csv'))


def main():
data, config, destination, xy = scr.initialize()
try:
kmeans = AutoKMeans(**config)
single_kmeans = KMeans(**config['kmeans'])
kmeans = GAPSearch(single_kmeans, **config['gap'])
kmeans.fit(data)
except Exception as ex:
logging.error("Failed with exception.")
7 changes: 5 additions & 2 deletions divik/_cli/divik.json
@@ -5,11 +5,14 @@
"distance": "correlation",
"minimal_size": 16,
"rejection_size": 2,
"rejection_percentage": null,
"minimal_features_percentage": 0.01,
"fast_kmeans_iter": 10,
"k_max": 10,
"features_percentage": 0.05,
"k_max": 50,
"sample_size": 1000,
"normalize_rows": true,
"use_logfilters": true,
"filter_type": "gmm",
"n_jobs": -1,
"random_seed": 0,
"verbose": true
49 changes: 32 additions & 17 deletions divik/_cli/divik.md
@@ -83,9 +83,11 @@ Configuration file should be a JSON file as follows:
"distance": "correlation",
"minimal_size": 16,
"rejection_size": 2,
"rejection_percentage": null,
"minimal_features_percentage": 0.01,
"fast_kmeans_iter": 10,
"features_percentage": 0.05,
"k_max": 10,
"sample_size": 10000,
"normalize_rows": true,
"use_logfilters": true,
"filter_type": "gmm",
@@ -104,8 +106,11 @@ for computation of GAP index. Default `10`.

#### `distance_percentile`

Distance percentile, at which algorithm should look for initial cluster centers.
Should be between `0.0` and `100.0`.
The percentile of the distance between points and their closest
centroid. `100.0` would simply select the point furthest from all the
centroids found so far. A lower value provides better robustness against
outliers, while too low a value reduces the capability to detect centroid
candidates during initialization. Should be between `0.0` and `100.0`.
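The effect of the percentile choice can be illustrated with a short numpy sketch; the synthetic data and the `argsort`-based pick are an illustration, not divik's initialization code:

```python
import numpy as np

rng = np.random.default_rng(0)
points = rng.normal(size=(200, 2))
points = np.vstack([points, [[50.0, 50.0]]])  # one extreme outlier
centroids = points[:1]                        # one centroid chosen already

# distance of each point to its nearest already-chosen centroid
d = np.min(np.linalg.norm(points[:, None] - centroids[None], axis=-1), axis=1)

furthest = points[np.argmax(d)]                           # percentile 100.0
robust = points[np.argsort(d)[int(0.99 * (len(d) - 1))]]  # percentile 99.0

print(furthest)  # the outlier itself
print(robust)    # a regular point from the bulk of the data
```

With `distance_percentile` at `100.0` the single outlier is always picked as the next centroid candidate; a slightly lower percentile skips it.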

#### `max_iter`

@@ -116,7 +121,9 @@ not be less.

Distance measure, defaults to `euclidean`. For Mass Spectrometry Imaging
purposes `correlation` metric may be more useful. These are the distances
supported by `scipy` package. All supported values:
supported by
[`scipy` package](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html).
All supported values:

- `braycurtis`
- `canberra`
@@ -149,31 +156,44 @@ smaller.

#### `rejection_size`

Size under which split will be rejected - if a cluster appears in the
split that is below rejection_size, the split is considered improper
and discarded. This may be useful for some domains (like there is no
justification for a 3-cells cluster in biological data). By default,
no segmentation is discarded, as careful post-processing provides the
same advantage.

If a cluster of size less than or equal to this number appears, the segmentation
will be rejected. Default `2`. To disable this mechanism, set it to `0`.

#### `rejection_percentage`

An alternative to ``rejection_size``, with the same behavior, but expressed
as a percentage of the training data size. By default, no segmentation is
discarded.
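How the two rejection parameters might interact can be sketched as follows; the threshold logic, and the 0–100 scale assumed for the percentage, are illustrative assumptions, not divik's exact source:

```python
import numpy as np

def split_is_rejected(labels, rejection_size=None, rejection_percentage=None):
    """Reject a split if any resulting cluster is too small.

    Assumes rejection_percentage is on a 0-100 scale (an assumption for
    illustration; check the actual divik source for the real convention).
    """
    sizes = np.bincount(labels)
    if rejection_size is not None and np.any(sizes <= rejection_size):
        return True
    if rejection_percentage is not None:
        threshold = rejection_percentage / 100.0 * labels.size
        if np.any(sizes <= threshold):
            return True
    return False

labels = np.array([0] * 97 + [1] * 3)  # a 3-element cluster appeared
print(split_is_rejected(labels, rejection_size=2))          # 3 > 2 -> kept
print(split_is_rejected(labels, rejection_percentage=5.0))  # 3 <= 5% of 100 -> rejected
```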

#### `minimal_features_percentage`

Percent of features that are enforced to be preserved after each filtering.
Default `0.01` (corresponding to `1%`).
The minimal percentage of features that must be preserved after
GMM-based feature selection. By default, at least 1% of features are
preserved in the filtration process. Default `0.01` (corresponding to `1%`).

#### `features_percentage`

The target percentage of features used by the fallback percentage
filter when the `'outlier'` filter is selected.

#### `fast_kmeans_iter`

Number of k-means iterations performed during each GAP trial. Default `10`.
In most cases this is sufficient.

#### `k_max`

Maximal number of clusters. Default `10`, since Dunn's index for selection of
Maximal number of clusters. Default `50`, since Dunn's index for selection of
optimal number of clusters favors a low number of clusters. If there is a
suspicion that it is not enough, it may be increased, but this will slow down
computations.

#### `sample_size`

Size of the sample used for GAP statistic computation.
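A minimal sketch of the role of `sample_size` in GAP computation, assuming the reference distribution is drawn uniformly from the bounding box of the subsample (an assumption based on the standard GAP formulation, not divik's exact code):

```python
import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(size=(100_000, 3))  # full dataset: too big to use directly

sample_size = 1000
sample = data[rng.choice(len(data), size=sample_size, replace=False)]

# uniform reference drawn from the bounding box of the subsample;
# both sample and reference feed the GAP dispersion comparison
mins, maxs = sample.min(axis=0), sample.max(axis=0)
reference = rng.uniform(mins, maxs, size=sample.shape)

print(sample.shape, reference.shape)
```

Subsampling keeps the GAP computation tractable on large datasets at the cost of a noisier dispersion estimate.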

#### `normalize_rows`

Specifies whether rows should be centered and their norm set to `1.0`.
@@ -202,11 +222,6 @@ filtering is applied.
the dimensionality. When more than 250 features are present, 'gmm' is chosen.
- `'none'` - feature selection is disabled

#### `keep_outlier`

When `filter_type` is `'outlier'`, this will switch feature selection
to outliers-preserving mode (inlier features are removed).

#### `n_jobs`

The number of jobs to use for the computation. This works by computing each of
16 changes: 0 additions & 16 deletions divik/_score/__init__.py

This file was deleted.

62 changes: 0 additions & 62 deletions divik/_score/_dunn.py

This file was deleted.
