Skip to content

Commit

Permalink
Merge pull request #73 from ealcobaca/new-summaries
Browse files Browse the repository at this point in the history
New nan-resilient summary functions
  • Loading branch information
ealcobaca committed Mar 6, 2020
2 parents 44344a5 + 4e36190 commit 084aced
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 13 deletions.
6 changes: 1 addition & 5 deletions pymfe/_internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -925,11 +925,7 @@ def process_summary(
summary_func),
RuntimeWarning)
else:
try:
summary_mtd_args = _extract_mtd_args(summary_mtd_callable)

except ValueError:
summary_mtd_args = tuple()
summary_mtd_args = _extract_mtd_args(summary_mtd_callable)

summary_mtd_pack = (
summary_func,
Expand Down
104 changes: 96 additions & 8 deletions pymfe/_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,15 +92,28 @@ def sum_quantiles(values: TypeValList,
"(got {}).".format(valid_packges, package))

if package == "numpy":
return np.percentile(
values, (0, 25, 50, 75, 100), interpolation=numpy_interpolation)
return np.quantile(
values, (0.00, 0.25, 0.50, 0.75, 1.00),
interpolation=numpy_interpolation)

return scipy.stats.mstats.mquantiles(
values, (0.00, 0.25, 0.50, 0.75, 1.00),
alphap=scipy_alphap,
betap=scipy_betap)


def sum_nanquantiles(values: TypeValList,
                     numpy_interpolation: str = "linear") -> TypeValList:
    """Calculate the ``values`` quantiles, ignoring `nan` values.

    The quantiles calculated correspond to the minimum, first quartile,
    median, third quartile and maximum (probabilities 0.00, 0.25, 0.50,
    0.75 and 1.00), returned in this order.

    Args:
        values: sequence of numeric values, possibly containing `nan`.
        numpy_interpolation: strategy forwarded to ``numpy.nanquantile``
            to pick a value when a quantile lies between two data points
            (e.g. "linear", "lower", "higher", "midpoint" or "nearest").

    Returns:
        The five quantiles computed by ``numpy.nanquantile``.
    """
    probs = (0.00, 0.25, 0.50, 0.75, 1.00)

    try:
        # NumPy >= 1.22 renamed ``interpolation`` to ``method`` and
        # deprecated the old keyword, so prefer the new spelling.
        return np.nanquantile(values, probs, method=numpy_interpolation)

    except TypeError:
        # Older NumPy releases only accept the ``interpolation`` keyword.
        return np.nanquantile(
            values, probs, interpolation=numpy_interpolation)


def sum_skewness(values: TypeValList, method: int = 3,
bias: bool = True) -> float:
"""Calculate the skewness from ``values`` using ``method`` strategy.
Expand Down Expand Up @@ -163,7 +176,7 @@ def sum_skewness(values: TypeValList, method: int = 3,


def sum_kurtosis(values: TypeValList, method: int = 3,
bias: bool = True) -> TypeValList:
bias: bool = True) -> float:
"""Calculate the kurtosis of ``values`` using ``method`` strategy.
Args:
Expand Down Expand Up @@ -224,34 +237,109 @@ def sum_kurtosis(values: TypeValList, method: int = 3,
return kurt_val


def sum_std(values: TypeValList, ddof: int = 1) -> TypeValList:
def sum_nanstd(values: TypeValList, ddof: int = 1) -> float:
    """Standard deviation summary function ignoring `nan` values.

    Args:
        values: sequence of numeric values, possibly containing `nan`.
        ddof: delta degrees of freedom forwarded to ``numpy.nanstd``.

    Returns:
        The standard deviation of the non-`nan` values, or `nan` when the
        number of non-`nan` values is not larger than ``ddof``.
    """
    # Only valid (non-nan) entries contribute to the estimate, so the
    # degrees-of-freedom guard must count those rather than ``len(values)``;
    # otherwise ``np.nanstd`` is reached with a non-positive divisor and
    # emits a RuntimeWarning.
    valid_count = np.count_nonzero(~np.isnan(np.asarray(values, dtype=float)))

    if valid_count <= ddof:
        return np.nan

    return np.nanstd(values, ddof=ddof)


def sum_std(values: TypeValList, ddof: int = 1) -> float:
    """Standard deviation summary function.

    Args:
        values: sequence of numeric values.
        ddof: delta degrees of freedom forwarded to ``numpy.std``.

    Returns:
        The standard deviation of ``values``, or `nan` when ``values`` has
        no more than ``ddof`` elements.
    """
    has_enough_values = len(values) > ddof
    return np.std(values, ddof=ddof) if has_enough_values else np.nan


def sum_var(values: TypeValList, ddof: int = 1) -> TypeValList:
"""Standard deviation summary function."""
def sum_nanvar(values: TypeValList, ddof: int = 1) -> float:
    """Variance summary function ignoring `nan` values.

    Args:
        values: sequence of numeric values, possibly containing `nan`.
        ddof: delta degrees of freedom forwarded to ``numpy.nanvar``.

    Returns:
        The variance of the non-`nan` values, or `nan` when the number of
        non-`nan` values is not larger than ``ddof``.
    """
    # Only valid (non-nan) entries contribute to the estimate, so the
    # degrees-of-freedom guard must count those rather than ``len(values)``;
    # otherwise ``np.nanvar`` is reached with a non-positive divisor and
    # emits a RuntimeWarning.
    valid_count = np.count_nonzero(~np.isnan(np.asarray(values, dtype=float)))

    if valid_count <= ddof:
        return np.nan

    return np.nanvar(values, ddof=ddof)


def sum_var(values: TypeValList, ddof: int = 1) -> float:
    """Variance summary function.

    Args:
        values: sequence of numeric values.
        ddof: delta degrees of freedom forwarded to ``numpy.var``.

    Returns:
        The variance of ``values``, or `nan` when ``values`` has no more
        than ``ddof`` elements.
    """
    has_enough_values = len(values) > ddof
    return np.var(values, ddof=ddof) if has_enough_values else np.nan


def sum_nancount(values: TypeValList) -> int:
    """Count the elements of ``values`` that are not `nan`.

    The result is the length of ``values`` minus the number of `nan`
    entries found in it.
    """
    nan_total = np.count_nonzero(np.isnan(values))
    return len(values) - nan_total


def sum_naniq_range(values: TypeValList) -> float:
    """Inter-quartile range (IQR) ignoring `nan` values.

    Delegates to ``scipy.stats.iqr`` with ``nan_policy="omit"``.
    """
    iqr_value = scipy.stats.iqr(values, nan_policy="omit")
    return iqr_value


def sum_nanptp(values: TypeValList) -> float:
    """Calculate the peak-to-peak range (max - min) ignoring `nan` values."""
    highest = np.nanmax(values)
    lowest = np.nanmin(values)
    return highest - lowest


def sum_nanhistogram(values: TypeValList,
                     bins: int = 10,
                     normalize: bool = True) -> TypeValList:
    """Create a histogram ignoring `nan` values.

    Non-array inputs are converted to a float array, `nan` entries are
    dropped, and the remaining values are forwarded to ``sum_histogram``
    together with ``bins`` and ``normalize``.
    """
    if isinstance(values, np.ndarray):
        arr = values
    else:
        arr = np.asarray(values, dtype=float)

    valid_mask = ~np.isnan(arr)

    return sum_histogram(
        values=arr[valid_mask], bins=bins, normalize=normalize)


def sum_nankurtosis(values: TypeValList, method: int = 3,
                    bias: bool = True) -> float:
    """Estimate data kurtosis ignoring `nan` values.

    Non-array inputs are converted to a float array, `nan` entries are
    dropped, and the remaining values are forwarded to ``sum_kurtosis``
    together with ``method`` and ``bias``.
    """
    if isinstance(values, np.ndarray):
        arr = values
    else:
        arr = np.asarray(values, dtype=float)

    valid_mask = ~np.isnan(arr)

    return sum_kurtosis(values=arr[valid_mask], method=method, bias=bias)


def sum_nanskewness(values: TypeValList, method: int = 3,
                    bias: bool = True) -> float:
    """Estimate data skewness ignoring `nan` values.

    Non-array inputs are converted to a float array, `nan` entries are
    dropped, and the remaining values are forwarded to ``sum_skewness``
    together with ``method`` and ``bias``.
    """
    if isinstance(values, np.ndarray):
        arr = values
    else:
        arr = np.asarray(values, dtype=float)

    valid_mask = ~np.isnan(arr)

    return sum_skewness(values=arr[valid_mask], method=method, bias=bias)


# Ordered registry mapping each summary function name to the callable that
# implements it. Every regular summary is listed next to a `nan`-prefixed
# counterpart.
# NOTE(review): the keys "max" and "min" each appear twice below. When an
# OrderedDict is built from pairs, a repeated key keeps its first position
# but takes the last value, so the builtin ``max``/``min`` entries are
# overridden by ``np.max``/``np.min``. The duplicated lines look like
# leftovers of a rendered diff (old and new lines interleaved) - confirm
# against the real file before relying on the iteration order.
SUMMARY_METHODS = collections.OrderedDict((
("mean", np.mean),
("nanmean", np.nanmean),
("sd", sum_std),
("nansd", sum_nanstd),
("var", sum_var),
("nanvar", sum_nanvar),
("count", len),
("nancount", sum_nancount),
("histogram", sum_histogram),
("nanhistogram", sum_nanhistogram),
("iq_range", scipy.stats.iqr),
("naniq_range", sum_naniq_range),
("kurtosis", sum_kurtosis),
("max", max),
("nankurtosis", sum_nankurtosis),
("max", np.max),
("nanmax", np.nanmax),
("median", np.median),
("min", min),
("nanmedian", np.nanmedian),
("min", np.min),
("nanmin", np.nanmin),
("quantiles", sum_quantiles),
("nanquantiles", sum_nanquantiles),
("range", np.ptp),
("nanrange", sum_nanptp),
("skewness", sum_skewness),
("nanskewness", sum_nanskewness),
))
7 changes: 7 additions & 0 deletions pymfe/mfe.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,13 @@ def __init__(self,
12. ``skewness``: Describes the shape of the measure values
distribution in terms of symmetry.
You can concatenate `nan` with the desired summary function name
to use an alternative version of the same summary which ignores
`nan` values. For instance, `nanmean` is the `mean` summary
function which ignores all `nan` values, while `naniq_range`
is the interquartile range calculated only with valid (non-`nan`)
values.
If more than one summary function is selected, then all multivalued
extracted metafeatures are summarized with each summary function.
Expand Down
56 changes: 56 additions & 0 deletions tests/test_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,59 @@ def test_ddof():
assert np.isnan(pymfe._summary.sum_var(sing_val, ddof=1))
assert np.isnan(pymfe._summary.sum_std(sing_val, ddof=2))
assert np.isnan(pymfe._summary.sum_var(sing_val, ddof=2))
assert np.isnan(pymfe._summary.sum_nanstd(sing_val, ddof=1))
assert np.isnan(pymfe._summary.sum_nanvar(sing_val, ddof=1))
assert np.isnan(pymfe._summary.sum_nanstd(sing_val, ddof=2))
assert np.isnan(pymfe._summary.sum_nanvar(sing_val, ddof=2))


@pytest.mark.parametrize("summary_func", [
    "nanmean",
    "nansd",
    "nanvar",
    "nanhistogram",
    "naniq_range",
    "nankurtosis",
    "nanmax",
    "nanmedian",
    "nanmin",
    "nanquantiles",
    "nanrange",
    "nanskewness",
])
def test_nansummary(summary_func):
    """Check each `nan`-resilient summary against its regular version.

    The `nan` variant applied to data containing `nan` must match the
    regular summary (same name without the "nan" prefix) applied to the
    same data with the `nan` entries removed.
    """
    values = np.array([
        1, np.nan, np.nan, 2, -4, np.nan, 9, -11, 1, 5, 6.4, 2.3, 4.5,
        np.nan, 0,
    ])
    nan_mask = np.isnan(values)
    clean_values = values[~nan_mask]

    # Dropping the first three characters strips the "nan" prefix.
    summary_nan = pymfe._summary.SUMMARY_METHODS[summary_func]
    summary_reg = pymfe._summary.SUMMARY_METHODS[summary_func[3:]]

    result_nan = summary_nan(list(values))
    result_reg = summary_reg(list(clean_values))

    assert np.allclose(result_nan, result_reg)


def test_nancount():
    """Check that "nancount" equals "count" minus the number of `nan`s."""
    values = np.array([
        1, np.nan, np.nan, 2, -4, np.nan, 9, -11, 1, 5, 6.4, 2.3, 4.5,
        np.nan, 0,
    ])
    summary_nan = pymfe._summary.SUMMARY_METHODS["nancount"]
    summary_reg = pymfe._summary.SUMMARY_METHODS["count"]

    result_nan = summary_nan(list(values))
    total_count = summary_reg(list(values))
    nan_total = np.count_nonzero(np.isnan(values))

    assert np.allclose(result_nan, total_count - nan_total)

0 comments on commit 084aced

Please sign in to comment.