Remove statsmodels from requirements

gmrukwa · Dec 29, 2019 · 15fbbad · 15fbbad
1 parent 052384a
commit 15fbbad
Show file tree

Hide file tree

Showing 3 changed files with 96 additions and 5 deletions.
diff --git a/divik/feature_selection/_outlier.py b/divik/feature_selection/_outlier.py
@@ -1,9 +1,103 @@
 import numpy as np
 from sklearn.base import BaseEstimator
-from statsmodels.stats.stattools import medcouple
 
 from ._stat_selector_mixin import StatSelectorMixin
-from ._percentage_selector import PercentageSelector
+
+
+def _medcouple_1d(y):
+    """
+    Calculate the medcouple robust measure of skew for 1-d data.
+
+    Parameters
+    ----------
+    y : array_like, 1-d
+        Data to use in the estimator; must squeeze to exactly one dimension.
+
+    Returns
+    -------
+    mc : float
+        The medcouple statistic.
+
+    Notes
+    -----
+    The current algorithm requires O(N**2) memory allocations, and so may
+    not work for very large arrays (N>10000).
+
+    .. [*] M. Hubert and E. Vandervieren, "An adjusted boxplot for skewed
+       distributions" Computational Statistics & Data Analysis, vol. 52, pp.
+       5186-5201, August 2008.
+    """
+
+    # Accept any input that squeezes to 1-d (e.g. an (N, 1) column vector)
+
+    y = np.squeeze(np.asarray(y))
+    if y.ndim != 1:
+        raise ValueError("y must be squeezable to a 1-d array")
+
+    y = np.sort(y)
+
+    n = y.shape[0]
+    if n % 2 == 0:
+        mf = (y[n // 2 - 1] + y[n // 2]) / 2  # even n: mean of two middle values
+    else:
+        mf = y[(n - 1) // 2]  # odd n: the middle value
+
+    z = y - mf
+    lower = z[z <= 0.0]
+    upper = z[z >= 0.0]
+    upper = upper[:, None]  # column, so the ops below broadcast pairwise
+    standardization = upper - lower
+    is_zero = np.logical_and(lower == 0.0, upper == 0.0)
+    standardization[is_zero] = np.inf  # avoid 0/0 for median ties
+    spread = upper + lower
+    h = spread / standardization
+    # GH5395: entries where both values tie with the median become -1/0/+1
+    num_ties = np.sum(lower == 0.0)
+    if num_ties:
+        # Replacements has -1 above the anti-diagonal, 0 on the anti-diagonal,
+        # and 1 below the anti-diagonal
+        replacements = np.ones((num_ties, num_ties)) - np.eye(num_ties)
+        replacements -= 2 * np.triu(replacements)
+        # Convert diagonal to anti-diagonal
+        replacements = np.fliplr(replacements)
+        # Always replace upper right block
+        h[:num_ties, -num_ties:] = replacements
+
+    return np.median(h)
+
+
+def medcouple(y, axis=0):
+    """
+    Calculate the medcouple robust measure of skew.
+
+    Parameters
+    ----------
+    y : array_like
+        Data to use in the estimator; converted to a double-precision array.
+    axis : {int, None}
+        Axis along which the medcouple statistic is computed.  If `None`, the
+        entire array is used.
+
+    Returns
+    -------
+    mc : ndarray
+        The medcouple statistic with the same shape as `y`, with the specified
+        axis removed.
+
+    Notes
+    -----
+    The current algorithm requires O(N**2) memory allocations, and so may
+    not work for very large arrays (N>10000).
+
+    .. [*] M. Hubert and E. Vandervieren, "An adjusted boxplot for skewed
+       distributions" Computational Statistics & Data Analysis, vol. 52, pp.
+       5186-5201, August 2008.
+    """
+    y = np.asarray(y, dtype=np.double)  # GH 4243
+    if axis is None:
+        return _medcouple_1d(y.ravel())
+
+    return np.apply_along_axis(_medcouple_1d, axis, y)
 
 
 def huberta_outliers(v):

diff --git a/requirements-base.txt b/requirements-base.txt
@@ -12,5 +12,4 @@ parameterized
 scikit-image
 scikit-learn
 scipy
-statsmodels
 tqdm
diff --git a/requirements.txt b/requirements.txt
@@ -24,7 +24,6 @@ numpy==1.18.0
 packaging==19.2
 pandas==0.25.3
 parameterized==0.7.1
-patsy==0.5.1
 Pillow==6.2.1
 plotly==4.4.1
 pluggy==0.13.1
@@ -40,7 +39,6 @@ scikit-image==0.16.2
 scikit-learn==0.22
 scipy==1.4.1
 six==1.13.0
-statsmodels==0.10.2
 tqdm==4.41.0
 wcwidth==0.1.7
 Werkzeug==0.16.0