Release v2.5.11
Merge pull request #70 from gmrukwa/develop
gmrukwa committed Oct 20, 2020
2 parents 9f8d695 + 9cb6f05 commit 0aa7958
Showing 9 changed files with 89 additions and 35 deletions.
1 change: 1 addition & 0 deletions .bettercodehub.yml
@@ -4,3 +4,4 @@ languages:
exclude:
- /divik/core/gin_sklearn_configurables\.py
- /divik/core/_gin_bugfix\.py
- /gamred_native
2 changes: 1 addition & 1 deletion .github/workflows/deploy.yml
@@ -12,7 +12,7 @@ on:
env:
MAJOR: ${{ 2 }}
MINOR: ${{ 5 }}
FIXUP: ${{ 10 }}
FIXUP: ${{ 11 }}
PACKAGE_INIT_FILE: ${{ 'divik/__init__.py' }}
PACKAGE_INIT_FILE_VERSION_LINE: ${{ 1 }}
PACKAGE_SETUP_FILE: ${{ 'setup.py' }}
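These environment variables presumably drive the version stamping later in the workflow. A purely hypothetical shell step (not taken from `deploy.yml`, whose actual steps are not shown in this excerpt) combining them could look like:

```bash
# Hypothetical illustration only -- the real steps are elsewhere in deploy.yml.
VERSION="${MAJOR}.${MINOR}.${FIXUP}"    # -> 2.5.11 for this release
# Overwrite the version line pointed at by PACKAGE_INIT_FILE_VERSION_LINE.
sed -i "${PACKAGE_INIT_FILE_VERSION_LINE}s/.*/__version__ = '${VERSION}'/" "${PACKAGE_INIT_FILE}"
sed -i "s/^__version__ = .*/__version__ = '${VERSION}'/" "${PACKAGE_SETUP_FILE}"
```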
39 changes: 19 additions & 20 deletions README.md
@@ -40,7 +40,7 @@ docker pull gmrukwa/divik
To install a specific version, you can specify it in the command, e.g.:

```bash
docker pull gmrukwa/divik:2.5.10
docker pull gmrukwa/divik:2.5.11
```

## Python package
@@ -79,7 +79,7 @@ pip install divik
or any stable tagged version, e.g.:

```bash
pip install divik==2.5.10
pip install divik==2.5.11
```

If you want to have compatibility with
@@ -92,27 +92,26 @@ pip install divik[gin]

**Note:** Remember to put `\` before `[` and `]` in the `zsh` shell.
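For example, installing the `gin` extra under `zsh` would then read:

```bash
pip install divik\[gin\]
```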

# High-Volume Data Considerations

If you are using DiviK to run an analysis that may not fit in the RAM of your
computer, consider disabling the default parallelism and switching to
[dask](https://dask.org/). This is easy to achieve through configuration:

- set all parameters named `n_jobs` to `1`;
- set all parameters named `allow_dask` to `True`.

Never set `n_jobs>1` and `allow_dask=True` at the same time; the computations
will freeze due to how `multiprocessing` and `dask` handle parallelism.
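For illustration, a minimal sketch of such a configuration (assuming the `KMeans` wrapper from this release is importable as `divik.cluster.KMeans`; any estimator exposing `n_jobs`, e.g. DiviK, should additionally receive `n_jobs=1`):

```python
# A minimal sketch, assuming divik.cluster.KMeans exposes the allow_dask flag.
import numpy as np
from divik.cluster import KMeans

X = np.random.rand(20000, 50)  # large enough to trigger the dask code path

kmeans = KMeans(
    n_clusters=5,
    distance='euclidean',
    allow_dask=True,  # enable the dask backend for the heavy computations
)
labels = kmeans.fit_predict(X)
```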

# References

This software is part of a contribution made by the [Data Mining Group of Silesian
University of Technology](http://www.zaed.polsl.pl/), the rest of which is
published [here](https://github.com/ZAEDPolSl).

+ [P. Widlak, G. Mrukwa, M. Kalinowska, M. Pietrowska, M. Chekan, J. Wierzgon, M.
Gawin, G. Drazek and J. Polanska, "Detection of molecular signatures of oral
squamous cell carcinoma and normal epithelium - application of a novel
methodology for unsupervised segmentation of imaging mass spectrometry data,"
Proteomics, vol. 16, no. 11-12, pp. 1613-21, 2016][1]
+ [M. Pietrowska, H. C. Diehl, G. Mrukwa, M. Kalinowska-Herok, M. Gawin, M.
Chekan, J. Elm, G. Drazek, A. Krawczyk, D. Lange, H. E. Meyer, J. Polanska, C.
Henkel, P. Widlak, "Molecular profiles of thyroid cancer subtypes:
Classification based on features of tissue revealed by mass spectrometry
imaging," Biochimica et Biophysica Acta (BBA)-Proteins and Proteomics, 2016][2]
+ [G. Mrukwa, G. Drazek, M. Pietrowska, P. Widlak and J. Polanska, "A Novel
Divisive iK-Means Algorithm with Region-Driven Feature Selection as a Tool for
Automated Detection of Tumour Heterogeneity in MALDI IMS Experiments," in
International Conference on Bioinformatics and Biomedical Engineering, 2016][3]

[1]: http://onlinelibrary.wiley.com/doi/10.1002/pmic.201500458/pdf
[2]: http://www.sciencedirect.com/science/article/pii/S1570963916302175
[3]: http://link.springer.com/chapter/10.1007/978-3-319-31744-1_11
+ [Mrukwa, G. and Polanska, J., 2020. DiviK: Divisive intelligent K-means for
hands-free unsupervised clustering in biological big data. *arXiv preprint
arXiv:2009.10706.*][1]

[1]: https://arxiv.org/abs/2009.10706
2 changes: 1 addition & 1 deletion divik/__init__.py
@@ -1,4 +1,4 @@
__version__ = '2.5.10'
__version__ = '2.5.11'

from ._summary import plot, reject_split

33 changes: 24 additions & 9 deletions divik/cluster/_kmeans/_core.py
@@ -28,11 +28,13 @@

class Labeling(object):
"""Labels observations by closest centroids"""
def __init__(self, distance_metric: str):
def __init__(self, distance_metric: str, allow_dask: bool=False):
"""
@param distance_metric: distance metric for estimation of closest
@param allow_dask: should be False if `multiprocessing.Pool` is spawned
"""
self.distance_metric = distance_metric
self.allow_dask = allow_dask

def __call__(self, data: Data, centroids: Centroids) -> IntLabels:
"""Find closest centroids
@@ -47,7 +49,7 @@ def __call__(self, data: Data, centroids: Centroids) -> IntLabels:
logging.error(msg)
raise ValueError(msg)

if data.shape[0] > 10000 or data.shape[1] > 1000:
if self.allow_dask and (data.shape[0] > 10000 or data.shape[1] > 1000):
X1 = da.from_array(data)
X2 = da.from_array(centroids)
distances = ddst.cdist(X1, X2, self.distance_metric)
@@ -59,12 +61,13 @@ def __call__(self, data: Data, centroids: Centroids) -> IntLabels:


def redefine_centroids(data: Data, labeling: IntLabels,
label_set: IntLabels) -> Centroids:
label_set: IntLabels, allow_dask: bool=False) -> Centroids:
"""Recompute centroids in data for given labeling
@param data: observations
@param labeling: partition of dataset into groups
@param label_set: set of labels used for partitioning
@param allow_dask: should be False if `multiprocessing.Pool` is spawned
@return: centroids
"""
if data.shape[0] != labeling.size:
@@ -73,7 +76,7 @@ def redefine_centroids(data: Data, labeling: IntLabels,
f"number of observations: {data.shape[0]}."
logging.error(msg)
raise ValueError(msg)
if data.shape[0] > 10000 or data.shape[1] > 1000:
if allow_dask and (data.shape[0] > 10000 or data.shape[1] > 1000):
X = dd.from_array(data)
y = dd.from_array(labeling)
centroids = X.groupby(y).mean().compute().values
@@ -106,17 +109,20 @@ def _validate_normalizable(data):
class _KMeans(SegmentationMethod):
"""K-means clustering"""
def __init__(self, labeling: Labeling, initialize: Initialization,
number_of_iterations: int=100, normalize_rows: bool=False):
number_of_iterations: int=100, normalize_rows: bool=False,
allow_dask: bool = False):
"""
@param labeling: labeling method
@param initialize: initialization method
@param number_of_iterations: number of iterations
@param normalize_rows: sets mean of row to 0 and norm to 1
@param allow_dask: should be False if `multiprocessing.Pool` is spawned
"""
self.labeling = labeling
self.initialize = initialize
self.number_of_iterations = number_of_iterations
self.normalize_rows = normalize_rows
self.allow_dask = allow_dask

def _fix_labels(self, data, centroids, labels, n_clusters, retries=10):
logging.debug('A label vanished - fixing')
@@ -169,7 +175,8 @@ def __call__(self, data: Data, number_of_clusters: int) \
logging.debug('Stability achieved.')
break
old_labels = labels
centroids = redefine_centroids(data, old_labels, label_set)
centroids = redefine_centroids(
data, old_labels, label_set, self.allow_dask)
labels = self.labeling(data, centroids)
return labels, centroids

@@ -238,6 +245,11 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
normalize_rows : bool, default: False
If True, rows are translated to mean of 0.0 and scaled to norm of 1.0.
allow_dask : bool, default: False
If True, automatically selects dask as the computation backend whenever
reasonable. Defaults to `False`, since it cannot be used together with
`multiprocessing.Pool`; when enabled, all `n_jobs` parameters must be set to `1`.
Attributes
----------
@@ -253,7 +265,8 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
def __init__(self, n_clusters: int, distance: str = 'euclidean',
init: str = 'percentile', percentile: float = 95.,
leaf_size : Union[int, float] = 0.01,
max_iter: int = 100, normalize_rows: bool = False):
max_iter: int = 100, normalize_rows: bool = False,
allow_dask: bool = False):
super().__init__()
self.n_clusters = n_clusters
self.distance = distance
@@ -262,6 +275,7 @@ def __init__(self, n_clusters: int, distance: str = 'euclidean',
self.leaf_size = leaf_size
self.max_iter = max_iter
self.normalize_rows = normalize_rows
self.allow_dask = allow_dask

def fit(self, X, y=None):
"""Compute k-means clustering.
@@ -280,10 +294,11 @@ def fit(self, X, y=None):
initialize = _parse_initialization(
self.init, self.distance, self.percentile, self.leaf_size)
kmeans = _KMeans(
labeling=Labeling(self.distance),
labeling=Labeling(self.distance, allow_dask=self.allow_dask),
initialize=initialize,
number_of_iterations=self.max_iter,
normalize_rows=self.normalize_rows
normalize_rows=self.normalize_rows,
allow_dask=self.allow_dask,
)
X = np.asanyarray(X)
self.labels_, self.cluster_centers_ = kmeans(
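Taken together, the `_core.py` changes gate every dask code path behind the new `allow_dask` flag. A condensed, standalone sketch of that pattern (the helper name `assign_labels` is illustrative, not part of divik's public API):

```python
# Standalone sketch of the guarded backend selection introduced above.
import numpy as np
import dask.array as da
import dask_distance as ddst
from scipy.spatial import distance as dst


def assign_labels(data, centroids, metric='euclidean', allow_dask=False):
    """Label each observation with the index of its closest centroid."""
    if allow_dask and (data.shape[0] > 10000 or data.shape[1] > 1000):
        # dask backend: used only when explicitly allowed and the problem
        # is large enough to amortize the overhead
        distances = ddst.cdist(da.from_array(data), da.from_array(centroids), metric)
        return da.argmin(distances, axis=1).compute()
    # default backend: plain scipy, safe inside a multiprocessing.Pool
    return np.argmin(dst.cdist(data, centroids, metric), axis=1)
```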
4 changes: 2 additions & 2 deletions docs/instructions/installation.rst
@@ -14,7 +14,7 @@ To install latest stable version use::

To install a specific version, you can specify it in the command, e.g.::

docker pull gmrukwa/divik:2.5.10
docker pull gmrukwa/divik:2.5.11

Python package
--------------
@@ -31,7 +31,7 @@ package::

or any stable tagged version, e.g.::

pip install divik==2.5.10
pip install divik==2.5.11

If you want to have compatibility with
`gin-config <https://github.com/google/gin-config>`_, you can install
1 change: 1 addition & 0 deletions requirements-base.txt
@@ -14,6 +14,7 @@ networkx
numpy
pandas
parameterized
polyaxon==1.1.9
pylint
scikit-image
scikit-learn
32 changes: 31 additions & 1 deletion requirements.txt
@@ -1,20 +1,27 @@
absl-py==0.9.0
astroid==2.3.3
attrs==19.3.0
cachetools==4.1.1
certifi==2020.6.20
chardet==3.0.4
Click==7.0
click-completion==0.5.2
cycler==0.10.0
dash==0.34.0
dash-core-components==0.42.0
dash-html-components==0.13.4
dash-renderer==0.17.0
dash-table==3.1.11
dask==2.14.0
dask-distance==0.2.0
dask[dataframe]==2.14.0
decorator==4.4.1
Flask==1.1.1
Flask-Compress==1.4.0
fsspec==0.8.4
gin-config==0.3.0
google-auth==1.22.1
h5py==2.8.0
idna==2.10
imageio==2.6.1
importlib-metadata==1.3.0
isort==4.3.21
@@ -23,35 +30,58 @@ Jinja2==2.10.3
joblib==0.14.1
kiwisolver==1.1.0
kneed==0.5.1
kubernetes==12.0.0
lazy-object-proxy==1.4.3
locket==0.2.0
MarkupSafe==1.1.1
marshmallow==3.7.1
matplotlib==3.1.2
mccabe==0.6.1
more-itertools==8.0.2
networkx==2.4
numpy==1.18.0
nvidia-ml-py3==7.352.0
oauthlib==3.1.0
packaging==19.2
pandas==0.25.3
parameterized==0.7.1
partd==1.1.0
Pillow==6.2.1
plotly==4.4.1
pluggy==0.13.1
polyaxon==1.1.9
polyaxon-sdk==1.1.9
psutil==5.7.2
py==1.8.0
pyasn1==0.4.8
pyasn1-modules==0.2.8
pybind11==2.4.3
pylint==2.4.4
pyparsing==2.4.5
pytest==5.3.2
python-dateutil==2.8.1
pytz==2019.3
PyWavelets==1.1.1
PyYAML==5.3.1
requests==2.24.0
requests-oauthlib==1.3.0
requests-toolbelt==0.9.1
retrying==1.3.3
rsa==4.6
scikit-image==0.16.2
scikit-learn==0.22
scipy==1.4.1
sentry-sdk==0.19.1
shellingham==1.3.2
six==1.13.0
tabulate==0.8.7
toolz==0.11.1
tqdm==4.41.0
typed-ast==1.4.0
ujson==4.0.1
urllib3==1.25.11
wcwidth==0.1.7
websocket-client==0.57.0
Werkzeug==0.16.0
wrapt==1.11.2
zipp==0.6.0
10 changes: 9 additions & 1 deletion setup.py
@@ -6,7 +6,7 @@
import sys
import numpy

__version__ = '2.5.10'
__version__ = '2.5.11'

LINUX_OPTS = {
'extra_link_args': [
@@ -108,10 +108,18 @@
'numpy>=0.12.1',
],
extras_require={
'all': [
'absl-py',
'gin-config',
'polyaxon',
],
'gin': [
"absl-py",
"gin-config",
],
'polyaxon': [
"polyaxon",
],
},
python_requires='>=3.6',
package_data={
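With these extras in place, the optional dependencies can be pulled in at install time, e.g.:

```bash
pip install divik[all]        # gin-config and polyaxon integrations
pip install divik[polyaxon]   # polyaxon integration only
```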
