Merge pull request #15 from gmrukwa/develop

Release 2.2.0
gmrukwa · Dec 8, 2019 · d271321 · d271321
2 parents aefc29f + 44c4626
commit d271321
Show file tree

Hide file tree

Showing 22 changed files with 707 additions and 398 deletions.
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -11,8 +11,8 @@ on:
 
 env:
   MAJOR: ${{ 2 }}
-  MINOR: ${{ 1 }}
-  FIXUP: ${{ 9 }}
+  MINOR: ${{ 2 }}
+  FIXUP: ${{ 0 }}
   PACKAGE_INIT_FILE: ${{ 'divik/__init__.py' }}
   DOCKER_REPO: ${{ 'gmrukwa/divik' }}
   IS_ALPHA: ${{ github.event_name == 'pull_request' }}

diff --git a/divik/__init__.py b/divik/__init__.py
@@ -1,15 +1,28 @@
-__version__ = '2.1.8'
+__version__ = '2.2.0'
 
 from ._seeding import seeded
 from ._sklearn import DiviK
 from ._kmeans import AutoKMeans, KMeans
-from ._feature_selection import GMMSelector, HighAbundanceAndVarianceSelector
+from ._feature_selection import (
+    StatSelectorMixin,
+    NoSelector,
+    GMMSelector,
+    huberta_outliers,
+    OutlierSelector,
+    HighAbundanceAndVarianceSelector,
+    OutlierAbundanceAndVarianceSelector,
+)
 from ._summary import depth, plot, reject_split
 
 __all__ = [
+    "__version__",
     "seeded",
     "DiviK",
     "AutoKMeans", "KMeans",
+    "NoSelector",
+    "StatSelectorMixin",
     "GMMSelector", "HighAbundanceAndVarianceSelector",
+    'huberta_outliers', 'OutlierSelector',
+    'OutlierAbundanceAndVarianceSelector',
     "depth", "plot", "reject_split",
 ]
diff --git a/divik/_cli/_utils.py b/divik/_cli/_utils.py
@@ -48,7 +48,13 @@ def prepare_destination(destination: str, omit_datetime: bool = False) -> str:
     return destination
 
 
-def setup_logger(destination: str, verbose: bool=False):
+def setup_logger(destination: str, verbose: bool = False):
+    try:
+        import divik._matlab_legacy
+        logger = logging.getLogger(divik._matlab_legacy.__name__)
+        logger.setLevel(logging.CRITICAL)
+    except ImportError:
+        pass  # In environments without MATLAB this should work as well
     log_destination = os.path.join(destination, 'logs.txt')
     if verbose:
         log_format = '%(asctime)s [%(levelname)s] %(filename)40s:%(lineno)3s' \

diff --git a/divik/_cli/divik.md b/divik/_cli/divik.md
@@ -88,6 +88,8 @@ Configuration file should be a JSON file as follows:
   "k_max": 10,
   "normalize_rows": true,
   "use_logfilters": true,
+  "filter_type": "gmm",
+  "keep_outliers": false,
   "n_jobs": -1,
   "random_seed": 0,
   "verbose": true
@@ -182,6 +184,23 @@ variance) have to be positive for that - filtering will fail otherwise. This is
 useful for specific cases in biology where the distribution of data may actually
 require this option for any efficient filtering.
 
+#### `filter_type`
+
+Filtering procedure type, defaults to `'gmm'`.
+
+- `'gmm'` - usual Gaussian Mixture Model-based filtering, useful for
+high-dimensional cases
+- `'outlier'` - robust outlier detection-based filtering, useful for
+low-dimensional cases
+- `'auto'` - automatically selects between `'gmm'` and `'outlier'` based on
+the dimensionality. When more than 250 features are present, `'gmm'` is chosen;
+otherwise `'outlier'` is used.
+- `'none'` - feature selection is disabled
+
+#### `keep_outliers`
+
+When `filter_type` is `'outlier'`, this switches feature selection
+to outlier-preserving mode (inlier features are removed instead).
+
 #### `n_jobs`
 
 The number of jobs to use for the computation. This works by computing each of

diff --git a/divik/_divik.py b/divik/_divik.py
@@ -44,11 +44,12 @@ def _constant_rows(matrix: np.ndarray) -> List[int]:
     return np.where(is_constant)[0]
 
 
-class _Reporter:
-    def __init__(self, progress_reporter: tqdm.tqdm = None):
+class DivikReporter:
+    def __init__(self, progress_reporter: tqdm.tqdm = None,
+                 warn_const: bool = True):
         self.progress_reporter = progress_reporter
         self.paths_open = 1
-        self.warn_const = True
+        self.warn_const = warn_const
 
     def filter(self, subset):
         lg.info('Feature filtering.')
@@ -61,7 +62,7 @@ def filter(self, subset):
     def filtered(self, data):
         lg.debug('Shape after filtering: {0}'.format(data.shape))
         constant = _constant_rows(data)
-        if any(constant) and self.warn_const:
+        if self.warn_const and any(constant):
             msg = 'After feature filtering some rows are constant: {0}. ' \
                   'This may not work with specific configurations.'
             lg.warning(msg.format(constant))
@@ -98,11 +99,11 @@ def assemble(self):
 
 
 # @gmrukwa: I could not find more readable solution than recursion for now.
-def _divik_backend(data: Data, selection: np.ndarray,
-                   fast_kmeans: km.AutoKMeans, full_kmeans: km.AutoKMeans,
-                   feature_selector: fs.HighAbundanceAndVarianceSelector,
-                   minimal_size: int, rejection_size: int, report: _Reporter,
-                   pool: Pool = None) -> Optional[DivikResult]:
+def divik(data: Data, selection: np.ndarray,
+          fast_kmeans: km.AutoKMeans, full_kmeans: km.AutoKMeans,
+          feature_selector: fs.StatSelectorMixin,
+          minimal_size: int, rejection_size: int, report: DivikReporter,
+          pool: Pool = None) -> Optional[DivikResult]:
     subset = data[selection]
 
     if subset.shape[0] <= max(full_kmeans.max_clusters, minimal_size):
@@ -131,7 +132,7 @@ def _divik_backend(data: Data, selection: np.ndarray,
 
     report.recurring(len(counts))
     recurse = partial(
-        _divik_backend, data=data, fast_kmeans=fast_kmeans,
+        divik, data=data, fast_kmeans=fast_kmeans,
         full_kmeans=full_kmeans, feature_selector=feature_selector,
         minimal_size=minimal_size, rejection_size=rejection_size,
         report=report, pool=pool)
@@ -146,18 +147,3 @@ def _divik_backend(data: Data, selection: np.ndarray,
     report.assemble()
     return DivikResult(clustering=clusterer, feature_selector=feature_selector,
                        merged=partition, subregions=subregions)
-
-
-def divik(data: Data, fast_kmeans: km.AutoKMeans, full_kmeans: km.AutoKMeans,
-          feature_selector: fs.HighAbundanceAndVarianceSelector,
-          progress_reporter: tqdm.tqdm = None, minimal_size: int = 2,
-          rejection_size: int = 0, pool: Pool = None) -> Optional[DivikResult]:
-    if np.isnan(data).any():
-        raise ValueError("NaN values are not supported.")
-    report = _Reporter(progress_reporter)
-    select_all = np.ones(shape=(data.shape[0],), dtype=bool)
-    return _divik_backend(
-        data, selection=select_all, fast_kmeans=fast_kmeans,
-        full_kmeans=full_kmeans, feature_selector=feature_selector,
-        minimal_size=minimal_size, rejection_size=rejection_size,
-        report=report, pool=pool)