Skip to content

Commit

Permalink
Remove statsmodels from requirements
Browse files Browse the repository at this point in the history
  • Loading branch information
gmrukwa committed Dec 29, 2019
1 parent 052384a commit 15fbbad
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 5 deletions.
98 changes: 96 additions & 2 deletions divik/feature_selection/_outlier.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,103 @@
import numpy as np
from sklearn.base import BaseEstimator
from statsmodels.stats.stattools import medcouple

from ._stat_selector_mixin import StatSelectorMixin
from ._percentage_selector import PercentageSelector


def _medcouple_1d(y):
"""
Calculates the medcouple robust measure of skew.
Parameters
----------
y : array_like, 1-d
Data to compute use in the estimator.
Returns
-------
mc : float
The medcouple statistic
Notes
-----
The current algorithm requires a O(N**2) memory allocations, and so may
not work for very large arrays (N>10000).
.. [*] M. Huberta and E. Vandervierenb, "An adjusted boxplot for skewed
distributions" Computational Statistics & Data Analysis, vol. 52, pp.
5186-5201, August 2008.
"""

# Parameter changes the algorithm to the slower for large n

y = np.squeeze(np.asarray(y))
if y.ndim != 1:
raise ValueError("y must be squeezable to a 1-d array")

y = np.sort(y)

n = y.shape[0]
if n % 2 == 0:
mf = (y[n // 2 - 1] + y[n // 2]) / 2
else:
mf = y[(n - 1) // 2]

z = y - mf
lower = z[z <= 0.0]
upper = z[z >= 0.0]
upper = upper[:, None]
standardization = upper - lower
is_zero = np.logical_and(lower == 0.0, upper == 0.0)
standardization[is_zero] = np.inf
spread = upper + lower
h = spread / standardization
# GH5395
num_ties = np.sum(lower == 0.0)
if num_ties:
# Replacements has -1 above the anti-diagonal, 0 on the anti-diagonal,
# and 1 below the anti-diagonal
replacements = np.ones((num_ties, num_ties)) - np.eye(num_ties)
replacements -= 2 * np.triu(replacements)
# Convert diagonal to anti-diagonal
replacements = np.fliplr(replacements)
# Always replace upper right block
h[:num_ties, -num_ties:] = replacements

return np.median(h)


def medcouple(y, axis=0):
    """
    Compute the medcouple, a robust measure of skewness.

    Parameters
    ----------
    y : array_like
        Input data; it is converted to a double-precision array first.
    axis : {int, None}
        Axis along which the statistic is evaluated. With `None`, the
        input is flattened and a single statistic is computed over all
        of its values.

    Returns
    -------
    mc : ndarray
        The medcouple statistic, shaped like `y` with the given axis
        removed. When `axis` is `None` the single value produced by
        `_medcouple_1d` is returned instead.

    Notes
    -----
    The underlying 1-d routine allocates O(N**2) memory, so very large
    inputs (N>10000) may not be feasible.

    .. [*] M. Hubert and E. Vandervieren, "An adjusted boxplot for skewed
       distributions" Computational Statistics & Data Analysis, vol. 52, pp.
       5186-5201, August 2008.
    """
    # GH 4243: always operate on double-precision data.
    data = np.asarray(y, dtype=np.double)
    if axis is not None:
        # One 1-d slice at a time along the requested axis.
        return np.apply_along_axis(_medcouple_1d, axis, data)

    flattened = data.ravel()
    return _medcouple_1d(flattened)


def huberta_outliers(v):
Expand Down
1 change: 0 additions & 1 deletion requirements-base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,4 @@ parameterized
scikit-image
scikit-learn
scipy
statsmodels
tqdm
2 changes: 0 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ numpy==1.18.0
packaging==19.2
pandas==0.25.3
parameterized==0.7.1
patsy==0.5.1
Pillow==6.2.1
plotly==4.4.1
pluggy==0.13.1
Expand All @@ -40,7 +39,6 @@ scikit-image==0.16.2
scikit-learn==0.22
scipy==1.4.1
six==1.13.0
statsmodels==0.10.2
tqdm==4.41.0
wcwidth==0.1.7
Werkzeug==0.16.0
Expand Down

0 comments on commit 15fbbad

Please sign in to comment.