Skip to content

Commit

Permalink
Merge pull request #19 from gmrukwa/fixup/outlier-filter
Browse files Browse the repository at this point in the history
Fixup: outlier filter
  • Loading branch information
gmrukwa committed Dec 11, 2019
2 parents 4039548 + 3b21542 commit 4944dc2
Show file tree
Hide file tree
Showing 7 changed files with 56 additions and 117 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ on:
env:
MAJOR: ${{ 2 }}
MINOR: ${{ 3 }}
FIXUP: ${{ 3 }}
FIXUP: ${{ 4 }}
PACKAGE_INIT_FILE: ${{ 'divik/__init__.py' }}
DOCKER_REPO: ${{ 'gmrukwa/divik' }}
IS_ALPHA: ${{ github.event_name == 'pull_request' }}
Expand Down
5 changes: 2 additions & 3 deletions divik/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '2.3.3'
__version__ = '2.3.4'

from ._seeding import seeded
from ._sklearn import DiviK
Expand All @@ -9,7 +9,6 @@
GMMSelector,
huberta_outliers,
OutlierSelector,
OutlierOrTopSelector,
PercentageSelector,
HighAbundanceAndVarianceSelector,
OutlierAbundanceAndVarianceSelector,
Expand All @@ -24,7 +23,7 @@
"NoSelector",
"StatSelectorMixin",
"GMMSelector", "HighAbundanceAndVarianceSelector",
'huberta_outliers', 'OutlierSelector', 'OutlierOrTopSelector',
'huberta_outliers', 'OutlierSelector',
'PercentageSelector',
'OutlierAbundanceAndVarianceSelector',
"depth", "plot", "reject_split",
Expand Down
1 change: 0 additions & 1 deletion divik/_feature_selection/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from ._outlier import (
huberta_outliers,
OutlierSelector,
OutlierOrTopSelector,
)
from ._percentage_selector import PercentageSelector
from ._specialized import (
Expand Down
84 changes: 0 additions & 84 deletions divik/_feature_selection/_outlier.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,87 +97,3 @@ def fit(self, X, y=None):
else:
self.selected_ = outliers == False
return self


# noinspection PyAttributeOutsideInit
class OutlierOrTopSelector(BaseEstimator, StatSelectorMixin):
    """Select features via outlier filtering, with a top-percentage fallback.

    Unsupervised selector: it inspects only the features (X), never a target
    (y). Outlier-based filtering is attempted first; whenever that would keep
    fewer than ``min_features_rate`` of the features, a percentage-based
    selection of the best ``p`` features is used instead.

    Parameters
    ----------
    stat: {'mean', 'var'}
        Kind of statistic computed out of each feature.

    use_log: bool, optional, default: False
        Whether to use the logarithm of the feature characteristic instead
        of the characteristic itself. May improve filtering for some
        distributions, but requires all characteristics (mean, variance) to
        be positive - filtering will fail otherwise. Useful for specific
        cases in biology where the data distribution may actually require
        this option for any efficient filtering.

    keep_outliers: bool, optional, default: False
        When True, keeps outliers instead of inlier features or top features.

    p: float, optional, default: 0.2
        Rate of features to keep in the percentage-based fallback.

    min_features_rate: float, optional, default: 0.01
        Minimal rate of features to keep.

    Attributes
    ----------
    vals_: array, shape (n_features,)
        Computed characteristic of each feature.

    selected_: array, shape (n_features,)
        Vector of binary selections of the informative features.

    outlier_selector_: OutlierSelector
        Outlier-based feature selector.

    percentage_selector_: PercentageSelector
        Percentage-based feature selector; None when the fallback is unused.
    """
    def __init__(self, stat: str, use_log: bool = False,
                 keep_outliers: bool = False, p: float = 0.2,
                 min_features_rate: float = 0.01):
        self.stat = stat
        self.use_log = use_log
        self.keep_outliers = keep_outliers
        self.p = p
        self.min_features_rate = min_features_rate

    def fit(self, X, y=None):
        """Learn data-driven feature thresholds from X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Sample vectors from which to compute feature characteristic.

        y : any
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.

        Returns
        -------
        self
        """
        self.vals_ = self._to_characteristics(X)
        self.outlier_selector_ = OutlierSelector(
            stat=self.stat, use_log=self.use_log,
            keep_outliers=self.keep_outliers).fit(X)
        kept_rate = self.outlier_selector_.selected_.mean()
        if kept_rate < self.min_features_rate:
            # Outlier filtering was too aggressive - fall back to keeping a
            # fixed percentage of the best features instead.
            self.percentage_selector_ = PercentageSelector(
                stat=self.stat, use_log=self.use_log,
                keep_top=self.keep_outliers, p=self.p).fit(X)
            self.selected_ = self.percentage_selector_.selected_
        else:
            self.percentage_selector_ = None
            self.selected_ = self.outlier_selector_.selected_
        return self
66 changes: 44 additions & 22 deletions divik/_feature_selection/_specialized.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from sklearn.base import BaseEstimator
from sklearn.feature_selection.base import SelectorMixin
from ._gmm_selector import GMMSelector
from ._outlier import OutlierSelector, OutlierOrTopSelector
from ._outlier import OutlierSelector
from ._percentage_selector import PercentageSelector


Expand Down Expand Up @@ -122,12 +122,14 @@ def _get_support_mask(self):
return self.selected_


# Tolerance for detecting a selection rate saturated at 100%.
# NOTE(review): 10e-6 equals 1e-5 - confirm 1e-6 was not intended.
EPS = 10e-6


# noinspection PyAttributeOutsideInit
class OutlierAbundanceAndVarianceSelector(BaseEstimator, SelectorMixin):
    """Two-stage selector: outlier-based abundance filter, then variance filter.

    The merged diff text concatenated the old and new ``__init__`` signatures
    (the first ``def`` had no body - a SyntaxError) and assigned the removed
    ``keep_outliers`` parameter; only the newer signature is kept here.
    """

    def __init__(self, use_log: bool = False, min_features_rate: float = 0.01,
                 p: float = 0.2):
        # use_log: log-transform the feature characteristic before filtering
        # min_features_rate: minimal fraction of features to keep
        # p: fraction of features kept by the percentage-based fallback
        self.use_log = use_log
        self.min_features_rate = min_features_rate
        self.p = p

Expand All @@ -147,26 +149,46 @@ def fit(self, X, y=None):
-------
self
"""
self.abundance_selector_ = OutlierSelector(
stat='mean', use_log=self.use_log,
keep_outliers=False).fit(X)
if self.abundance_selector_.selected_.mean() < self.min_features_rate:
self.abundance_selector_ = PercentageSelector(
stat='mean', use_log=self.use_log, keep_top=True,
p=1.0 - self.p).fit(X)
filtered = self.abundance_selector_.transform(X)
self.selected_ = self.abundance_selector_.selected_.copy()

corrected = self.min_features_rate / self.selected_.mean()
self.variance_selector_ = OutlierOrTopSelector(
stat='var', use_log=self.use_log,
keep_outliers=self.keep_outliers,
min_features_rate=corrected,
p=self.p).fit(filtered)
self.selected_[self.selected_] = self.variance_selector_.selected_

self.abundance_selector_, a_selected = self._fit_abundance(X)
filtered = X[:, a_selected]
self.variance_selector_, v_selected = self._fit_variance(
filtered, a_selected)
self.selected_ = a_selected
self.selected_[a_selected] = v_selected
return self

def _fit_abundance(self, X):
    """Select features by outlier-filtering their mean abundance.

    Returns the fitted selector together with the boolean selection mask.
    """
    # Keep inlier (non-outlier) features by mean abundance.
    selector = OutlierSelector(stat='mean', use_log=self.use_log,
                               keep_outliers=False).fit(X)
    selected = selector.selected_
    # The first selected value acts as the inlier threshold; everything
    # above it is kept too. NOTE(review): assumes vals_ is sorted ascending
    # so vals_[selected][0] is the lowest inlier - confirm in OutlierSelector.
    inlier = selector.vals_[selected][0]
    over_inlier = selector.vals_ > inlier
    selected[over_inlier] = True
    p = selected.mean()
    # Fall back to a percentage-based cut when the outlier filter keeps too
    # few features, or (almost) all of them.
    if p < self.min_features_rate or p >= 1 - EPS:
        selector = PercentageSelector(stat='mean', use_log=self.use_log,
                                      keep_top=True, p=1.0 - self.p).fit(X)
        selected = selector.selected_
    return selector, selected

def _fit_variance(self, X, old_selected):
    """Select high-variance features among those kept by the abundance stage.

    Rates are corrected for the fraction already kept, so thresholds refer
    to the original feature count.
    """
    # X holds only old_selected.mean() of the original features - rescale.
    corrected_min = self.min_features_rate / old_selected.mean()
    corrected_p = self.p / old_selected.mean()

    # Keep outlier features by variance (high variance = informative).
    selector = OutlierSelector(stat='var', use_log=self.use_log,
                               keep_outliers=True).fit(X)
    selected = selector.selected_
    # The first non-selected value acts as the inlier threshold; anything
    # below it is dropped. NOTE(review): assumes vals_ is sorted ascending -
    # confirm against OutlierSelector.
    inlier = selector.vals_[selected == 0][0]
    under_inlier = selector.vals_ < inlier
    selected[under_inlier] = False
    p = selected.mean()

    # Fall back to a percentage cut when too few or (almost) all features
    # would be kept.
    if p < corrected_min or p >= 1 - EPS:
        selector = PercentageSelector(stat='var', use_log=self.use_log,
                                      keep_top=True, p=corrected_p).fit(X)
        selected = selector.selected_
    return selector, selected

def _get_support_mask(self):
"""
Get the boolean mask indicating which features are selected
Expand Down
2 changes: 1 addition & 1 deletion divik/_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ def _gmm_filter(self):

def _outlier_filter(self):
    """Build the outlier-based feature filter from DiviK configuration."""
    # The merged text passed use_log both positionally and by keyword
    # (TypeError) and referenced the removed keep_outliers parameter;
    # keyword-only construction matching the new signature is kept.
    return fs.OutlierAbundanceAndVarianceSelector(
        use_log=self.use_logfilters,
        min_features_rate=self.minimal_features_percentage,
        p=self.features_percentage)

Expand Down
13 changes: 8 additions & 5 deletions test/feature_selection/test_specialized.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,11 @@ def test_discards_outlier_variance(self):
TNR = (selector.selected_[self.high_var] == False).mean()
self.assertGreaterEqual(TNR, 0.95)

def test_preserves_inlier_features(self):
    """Features flagged neither as outlier-mean nor high-variance survive."""
    selector = fs.OutlierAbundanceAndVarianceSelector().fit(self.data)
    neither_flagged = (self.outlier_mean + self.high_var) == 0
    true_positive_rate = selector.selected_[neither_flagged].mean()
    self.assertGreaterEqual(true_positive_rate, 0.95)
def test_selects_percentage_of_features(self):
    """Requested rate p approximately matches the realized selection rate."""
    n_features = self.labels.size
    for kept_count in (5., 20.):
        rate = kept_count / n_features
        selector = fs.OutlierAbundanceAndVarianceSelector(p=rate).fit(
            self.data)
        self.assertAlmostEqual(selector.selected_.mean(), rate, places=2)

0 comments on commit 4944dc2

Please sign in to comment.