Skip to content

Commit

Permalink
Merge pull request #15 from gmrukwa/develop
Browse files Browse the repository at this point in the history
Release 2.2.0
  • Loading branch information
gmrukwa committed Dec 8, 2019
2 parents aefc29f + 44c4626 commit d271321
Show file tree
Hide file tree
Showing 22 changed files with 707 additions and 398 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ on:

env:
MAJOR: ${{ 2 }}
MINOR: ${{ 1 }}
FIXUP: ${{ 9 }}
MINOR: ${{ 2 }}
FIXUP: ${{ 0 }}
PACKAGE_INIT_FILE: ${{ 'divik/__init__.py' }}
DOCKER_REPO: ${{ 'gmrukwa/divik' }}
IS_ALPHA: ${{ github.event_name == 'pull_request' }}
Expand Down
17 changes: 15 additions & 2 deletions divik/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,28 @@
__version__ = '2.1.8'
__version__ = '2.2.0'

from ._seeding import seeded
from ._sklearn import DiviK
from ._kmeans import AutoKMeans, KMeans
from ._feature_selection import GMMSelector, HighAbundanceAndVarianceSelector
from ._feature_selection import (
StatSelectorMixin,
NoSelector,
GMMSelector,
huberta_outliers,
OutlierSelector,
HighAbundanceAndVarianceSelector,
OutlierAbundanceAndVarianceSelector,
)
from ._summary import depth, plot, reject_split

__all__ = [
"__version__",
"seeded",
"DiviK",
"AutoKMeans", "KMeans",
"NoSelector",
"StatSelectorMixin",
"GMMSelector", "HighAbundanceAndVarianceSelector",
'huberta_outliers', 'OutlierSelector',
'OutlierAbundanceAndVarianceSelector',
"depth", "plot", "reject_split",
]
8 changes: 7 additions & 1 deletion divik/_cli/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,13 @@ def prepare_destination(destination: str, omit_datetime: bool = False) -> str:
return destination


def setup_logger(destination: str, verbose: bool=False):
def setup_logger(destination: str, verbose: bool = False):
try:
import divik._matlab_legacy
logger = logging.getLogger(divik._matlab_legacy.__name__)
logger.setLevel(logging.CRITICAL)
except ImportError:
pass # In environments without MATLAB this should work as well
log_destination = os.path.join(destination, 'logs.txt')
if verbose:
log_format = '%(asctime)s [%(levelname)s] %(filename)40s:%(lineno)3s' \
Expand Down
19 changes: 19 additions & 0 deletions divik/_cli/divik.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ Configuration file should be a JSON file as follows:
"k_max": 10,
"normalize_rows": true,
"use_logfilters": true,
"filter_type": "gmm",
"keep_outliers": false,
"n_jobs": -1,
"random_seed": 0,
"verbose": true
Expand Down Expand Up @@ -182,6 +184,23 @@ variance) have to be positive for that - filtering will fail otherwise. This is
useful for specific cases in biology where the distribution of data may actually
require this option for any efficient filtering.

#### `filter_type`

Filtering procedure type, defaults to `'gmm'`.

- `'gmm'` - usual Gaussian Mixture Model-based filtering, useful for high
dimensional cases
- `'outlier'` - robust outlier detection-based filtering, useful for low
dimensional cases
- `'auto'` - automatically selects between `'gmm'` and `'outlier'` based on
the dimensionality. When more than 250 features are present, `'gmm'` is chosen;
otherwise `'outlier'` is used.
- `'none'` - feature selection is disabled

#### `keep_outliers`

When `filter_type` is `'outlier'`, setting this to `true` switches feature
selection to outlier-preserving mode (inlier features are removed instead).

#### `n_jobs`

The number of jobs to use for the computation. This works by computing each of
Expand Down
36 changes: 11 additions & 25 deletions divik/_divik.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,12 @@ def _constant_rows(matrix: np.ndarray) -> List[int]:
return np.where(is_constant)[0]


class _Reporter:
def __init__(self, progress_reporter: tqdm.tqdm = None):
class DivikReporter:
def __init__(self, progress_reporter: tqdm.tqdm = None,
warn_const: bool = True):
self.progress_reporter = progress_reporter
self.paths_open = 1
self.warn_const = True
self.warn_const = warn_const

def filter(self, subset):
lg.info('Feature filtering.')
Expand All @@ -61,7 +62,7 @@ def filter(self, subset):
def filtered(self, data):
lg.debug('Shape after filtering: {0}'.format(data.shape))
constant = _constant_rows(data)
if any(constant) and self.warn_const:
if self.warn_const and any(constant):
msg = 'After feature filtering some rows are constant: {0}. ' \
'This may not work with specific configurations.'
lg.warning(msg.format(constant))
Expand Down Expand Up @@ -98,11 +99,11 @@ def assemble(self):


# @gmrukwa: I could not find more readable solution than recursion for now.
def _divik_backend(data: Data, selection: np.ndarray,
fast_kmeans: km.AutoKMeans, full_kmeans: km.AutoKMeans,
feature_selector: fs.HighAbundanceAndVarianceSelector,
minimal_size: int, rejection_size: int, report: _Reporter,
pool: Pool = None) -> Optional[DivikResult]:
def divik(data: Data, selection: np.ndarray,
fast_kmeans: km.AutoKMeans, full_kmeans: km.AutoKMeans,
feature_selector: fs.StatSelectorMixin,
minimal_size: int, rejection_size: int, report: DivikReporter,
pool: Pool = None) -> Optional[DivikResult]:
subset = data[selection]

if subset.shape[0] <= max(full_kmeans.max_clusters, minimal_size):
Expand Down Expand Up @@ -131,7 +132,7 @@ def _divik_backend(data: Data, selection: np.ndarray,

report.recurring(len(counts))
recurse = partial(
_divik_backend, data=data, fast_kmeans=fast_kmeans,
divik, data=data, fast_kmeans=fast_kmeans,
full_kmeans=full_kmeans, feature_selector=feature_selector,
minimal_size=minimal_size, rejection_size=rejection_size,
report=report, pool=pool)
Expand All @@ -146,18 +147,3 @@ def _divik_backend(data: Data, selection: np.ndarray,
report.assemble()
return DivikResult(clustering=clusterer, feature_selector=feature_selector,
merged=partition, subregions=subregions)


def divik(data: Data, fast_kmeans: km.AutoKMeans, full_kmeans: km.AutoKMeans,
feature_selector: fs.HighAbundanceAndVarianceSelector,
progress_reporter: tqdm.tqdm = None, minimal_size: int = 2,
rejection_size: int = 0, pool: Pool = None) -> Optional[DivikResult]:
if np.isnan(data).any():
raise ValueError("NaN values are not supported.")
report = _Reporter(progress_reporter)
select_all = np.ones(shape=(data.shape[0],), dtype=bool)
return _divik_backend(
data, selection=select_all, fast_kmeans=fast_kmeans,
full_kmeans=full_kmeans, feature_selector=feature_selector,
minimal_size=minimal_size, rejection_size=rejection_size,
report=report, pool=pool)

0 comments on commit d271321

Please sign in to comment.