Skip to content

Commit

Permalink
Issue 350 (#398)
Browse files Browse the repository at this point in the history
* init issue interface

* issue-350

* lint fixes

* lint fixes

* lint fixes

* fixes feedback

* black formation

* black formation

* fixes signature

* blck

* description

* rm useless

* Update etna/analysis/outliers/rolling_statistics.py

Co-authored-by: Maxim Zherelo <60392282+brsnw250@users.noreply.github.com>

* Update etna/analysis/outliers/rolling_statistics.py

Co-authored-by: Maxim Zherelo <60392282+brsnw250@users.noreply.github.com>

* rm useless

* minor

* minor

* minor tests

---------

Co-authored-by: Maxim Zherelo <60392282+brsnw250@users.noreply.github.com>
  • Loading branch information
Polzovat123 and brsnw250 authored Jun 26, 2024
1 parent b419f9a commit 0b2cd55
Show file tree
Hide file tree
Showing 6 changed files with 318 additions and 1 deletion.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add `get_anomalies_isolation_forest` method for anomaly detection ([#375](https://github.com/etna-team/etna/pull/375))
- Add `IForestOutlierTransform` ([#381](https://github.com/etna-team/etna/pull/381))
- Add `IQROutlierTransform` ([#387](https://github.com/etna-team/etna/pull/387))
- Add `num_workers` parameter to `TS2VecEmbeddingModel` ([#396](https://github.com/etna-team/etna/pull/396))
- Add `num_workers` parameter to `TS2VecEmbeddingModel` ([#396](https://github.com/etna-team/etna/pull/396))
- Add `get_anomalies_mad` function for anomaly detection ([#398](https://github.com/etna-team/etna/pull/398))
- Add `TSDataset.features` property to get list of all features in a dataset ([#405](https://github.com/etna-team/etna/pull/405))
-

Expand Down
1 change: 1 addition & 0 deletions docs/source/api_reference/analysis.rst
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,4 @@ Outliers analysis utilities:
get_anomalies_prediction_interval
get_anomalies_iqr
get_anomalies_isolation_forest
get_anomalies_mad
1 change: 1 addition & 0 deletions etna/analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,4 @@
from etna.analysis.outliers.median_outliers import get_anomalies_median
from etna.analysis.outliers.prediction_interval_outliers import get_anomalies_prediction_interval
from etna.analysis.outliers.rolling_statistics import get_anomalies_iqr
from etna.analysis.outliers.rolling_statistics import get_anomalies_mad
1 change: 1 addition & 0 deletions etna/analysis/outliers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
from etna.analysis.outliers.plots import plot_anomalies_interactive
from etna.analysis.outliers.prediction_interval_outliers import get_anomalies_prediction_interval
from etna.analysis.outliers.rolling_statistics import get_anomalies_iqr
from etna.analysis.outliers.rolling_statistics import get_anomalies_mad
114 changes: 114 additions & 0 deletions etna/analysis/outliers/rolling_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import numpy as np
import pandas as pd
from numpy.lib.stride_tricks import sliding_window_view
from scipy.stats import median_abs_deviation
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.tsatools import freq_to_period

Expand Down Expand Up @@ -275,3 +276,116 @@ def get_anomalies_iqr(
)

return _outliers_per_segment(df=df, func=detection_func, index_only=index_only)


@sliding_window_decorator
def mad_method(
    series: pd.Series,
    indices: np.ndarray,
    mad_scale: float = 1.5,
    period: Optional[int] = None,
    trend: bool = False,
    seasonality: bool = False,
    stl_params: Optional[Dict[str, Any]] = None,
) -> np.ndarray:
    """
    Flag observations whose distance from the median exceeds a scaled median absolute deviation.

    Parameters
    ----------
    series:
        original series for the estimation.
    indices:
        positions of the observations in ``series`` that are used for the estimation.
    mad_scale:
        multiplier applied to the median absolute deviation to get the allowed distance from the median.
    period:
        periodicity of the sequence for STL.
    trend:
        whether to remove trend from the selected observations before the estimation.
    seasonality:
        whether to remove seasonality from the selected observations before the estimation.
    stl_params:
        other parameters for STL. See :py:class:`statsmodels.tsa.seasonal.STL`

    Returns
    -------
    :
        binary mask for each selected observation, indicating if it was estimated as an anomaly.

    Raises
    ------
    ValueError:
        if ``mad_scale`` is not positive.
    """
    if mad_scale <= 0:
        raise ValueError("Scaling parameter must be positive!")

    values = series[indices]

    # STL is applied only on request; statistics below are then computed on the decomposed values.
    if trend or seasonality:
        extra_stl_kwargs = {} if stl_params is None else stl_params
        values = _stl_decompose(
            series=values, period=period, trend=trend, seasonality=seasonality, **extra_stl_kwargs
        )

    center = np.median(values)
    spread = median_abs_deviation(values)

    # A point is anomalous when it lies farther from the median than ``mad_scale`` MADs.
    threshold = spread * mad_scale
    deviation = np.abs(values - center)
    return deviation > threshold


def get_anomalies_mad(
    ts: "TSDataset",
    in_column: str = "target",
    window_size: int = 10,
    stride: int = 1,
    mad_scale: float = 3,
    trend: bool = False,
    seasonality: bool = False,
    period: Optional[int] = None,
    stl_params: Optional[Dict[str, Any]] = None,
    index_only: bool = True,
) -> Dict[str, Union[List[pd.Timestamp], List[int], pd.Series]]:
    """
    Get point outliers in time series using median absolute deviation (MAD).

    Points are treated as outliers when they fall out of the range
    ``[median - mad_scale * mad; median + mad_scale * mad]``.

    Parameters
    ----------
    ts:
        TSDataset with timeseries data
    in_column:
        name of the column in which anomalies are searched for
    window_size:
        number of points in the window
    stride:
        offset between neighboring windows.
    mad_scale:
        scaling parameter of the estimated interval.
    trend:
        whether to remove trend from the series.
    seasonality:
        whether to remove seasonality from the series
    period:
        periodicity of the sequence for STL.
    stl_params:
        other parameters for STL. See :py:class:`statsmodels.tsa.seasonal.STL`
    index_only:
        whether to return only outliers indices. If `False` will return outliers series

    Returns
    -------
    :
        dict of outliers in format {segment: [outliers_timestamps]};
        outliers series are returned per segment instead if ``index_only=False``
    """
    # Everything the per-segment detector needs besides the series itself.
    detector_kwargs = {
        "window_size": window_size,
        "stride": stride,
        "mad_scale": mad_scale,
        "trend": trend,
        "seasonality": seasonality,
        "period": period,
        "stl_params": stl_params,
    }
    detection_func = functools.partial(mad_method, **detector_kwargs)

    # Keep only the requested column and drop the "feature" level from the columns.
    column_df = ts[..., in_column].droplevel(level="feature", axis=1)

    return _outliers_per_segment(df=column_df, func=detection_func, index_only=index_only)
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
import numpy as np
import pandas as pd
import pytest

from etna.analysis.outliers import get_anomalies_mad


def test_mad_outliers_missing_values(outliers_tsds):
    """Check that ``get_anomalies_mad`` raises a missing-values ``ValueError`` for ``outliers_tsds``."""
    expected_message = ".* contains missing values!"
    with pytest.raises(ValueError, match=expected_message):
        get_anomalies_mad(ts=outliers_tsds)


def test_mad_outliers_invalid_scale(outliers_df_with_two_columns):
    """Check that a negative ``mad_scale`` is rejected with ``ValueError``."""
    expected_message = "Scaling parameter must be positive!"
    with pytest.raises(ValueError, match=expected_message):
        get_anomalies_mad(ts=outliers_df_with_two_columns, mad_scale=-1.0)


@pytest.mark.parametrize(
    ("ts_name", "error"),
    [
        (
            "outliers_df_with_two_columns_int_timestamp",
            "Series must have inferable frequency to autodetect period for STL!",
        ),
        ("outliers_df_with_two_columns_minute_freq", "freq T not understood. Please report"),
    ],
)
def test_mad_outliers_stl_period_error(ts_name, error, request):
    """Check that seasonality removal without an explicit period fails with the expected message."""
    dataset = request.getfixturevalue(ts_name)
    with pytest.raises(ValueError, match=error):
        get_anomalies_mad(ts=dataset, seasonality=True)


@pytest.mark.parametrize(
    ("ts_name", "answer"),
    [
        (
            "outliers_df_with_two_columns",
            {
                "1": [np.datetime64("2021-01-11")],
                "2": [np.datetime64("2021-01-09"), np.datetime64("2021-01-16"), np.datetime64("2021-01-27")],
            },
        ),
        ("outliers_df_with_two_columns_int_timestamp", {"1": [10], "2": [8, 15, 26]}),
        (
            "outliers_df_with_two_columns_minute_freq",
            {
                "1": [np.datetime64("2021-01-01T00:10:00")],
                "2": [
                    np.datetime64("2021-01-01T00:08:00"),
                    np.datetime64("2021-01-01T00:15:00"),
                    np.datetime64("2021-01-01T00:26:00"),
                ],
            },
        ),
    ],
)
def test_mad_outliers_various_index(ts_name, answer, request):
    """Check detected outliers for datasets with daily, integer and minute indices."""
    dataset = request.getfixturevalue(ts_name)
    detected = get_anomalies_mad(ts=dataset, window_size=30, stride=1, mad_scale=3)
    assert detected == answer


@pytest.mark.parametrize(
    ("window_size", "mad_scale", "stride", "right_anomal"),
    [
        (
            20,
            3,
            1,
            {
                "1": [np.datetime64("2021-01-11")],
                "2": [np.datetime64("2021-01-09"), np.datetime64("2021-01-16"), np.datetime64("2021-01-27")],
            },
        ),
        (
            20,
            7,
            1,
            {"1": [np.datetime64("2021-01-11")], "2": [np.datetime64("2021-01-09"), np.datetime64("2021-01-27")]},
        ),
        (
            10,
            6,
            1,
            {"1": [np.datetime64("2021-01-11")], "2": [np.datetime64("2021-01-09"), np.datetime64("2021-01-27")]},
        ),
        (
            12,
            4,
            3,
            {"1": [np.datetime64("2021-01-11")], "2": [np.datetime64("2021-01-09"), np.datetime64("2021-01-27")]},
        ),
    ],
)
def test_mad_outliers(window_size, mad_scale, stride, right_anomal, outliers_df_with_two_columns):
    """Check detected outliers for several window size / scale / stride combinations."""
    detection_params = {"window_size": window_size, "stride": stride, "mad_scale": mad_scale}
    detected = get_anomalies_mad(ts=outliers_df_with_two_columns, **detection_params)
    assert detected == right_anomal


@pytest.mark.parametrize(
    ("window_size", "mad_scale", "stride", "right_anomal"),
    [
        (
            15,
            3.2,
            1,
            {"1": [np.datetime64("2021-01-11")], "2": [np.datetime64("2021-01-09"), np.datetime64("2021-01-27")]},
        ),
    ],
)
@pytest.mark.parametrize("period", [None, 2, 5])
def test_mad_outliers_with_seasonality(
    window_size, mad_scale, stride, period, right_anomal, outliers_df_with_two_columns
):
    """Run detection with seasonality removal and compare the number of reported segments."""
    detection_params = {
        "window_size": window_size,
        "stride": stride,
        "mad_scale": mad_scale,
        "period": period,
        "seasonality": True,
    }
    detected = get_anomalies_mad(ts=outliers_df_with_two_columns, **detection_params)
    # NOTE(review): only the number of segments is compared here, not the detected points themselves.
    assert len(detected) == len(right_anomal)


@pytest.mark.parametrize(
    ("window_size", "mad_scale", "stride", "right_anomal"),
    [
        (
            20,
            7,
            1,
            {"1": [np.datetime64("2021-01-11")], "2": [np.datetime64("2021-01-09"), np.datetime64("2021-01-27")]},
        ),
    ],
)
@pytest.mark.parametrize("period", [None, 5])
def test_mad_outliers_with_trend(window_size, mad_scale, stride, period, right_anomal, outliers_df_with_two_columns):
    """Check detected outliers when the trend is removed before the estimation."""
    detection_params = {
        "window_size": window_size,
        "stride": stride,
        "mad_scale": mad_scale,
        "period": period,
        "trend": True,
    }
    detected = get_anomalies_mad(ts=outliers_df_with_two_columns, **detection_params)
    assert detected == right_anomal


@pytest.mark.parametrize("true_params", [["1", "2"]])
@pytest.mark.parametrize("index_only", [True, False])
def test_interface_correct_args(true_params, index_only, outliers_df_with_two_columns):
    """Check the result container types for both values of ``index_only``."""
    detected = get_anomalies_mad(ts=outliers_df_with_two_columns, index_only=index_only)

    assert isinstance(detected, dict)
    assert sorted(detected.keys()) == sorted(true_params)

    # Lists of timestamps are expected for ``index_only=True``, series otherwise.
    expected_container = list if index_only else pd.Series
    for segment_outliers in detected.values():
        assert isinstance(segment_outliers, expected_container)
        if index_only:
            assert all(isinstance(point, np.datetime64) for point in segment_outliers)


def test_in_column(outliers_df_with_two_columns):
    """Check that outliers are searched in the column given by ``in_column``."""
    expected = {"1": [np.datetime64("2021-01-08")], "2": [np.datetime64("2021-01-26")]}
    detected = get_anomalies_mad(ts=outliers_df_with_two_columns, in_column="feature")
    for segment, expected_points in expected.items():
        assert segment in detected
        np.testing.assert_array_equal(detected[segment], expected_points)


@pytest.mark.parametrize(
    ("window_size", "mad_scale", "stride", "right_anomal"),
    [
        (
            30,
            5,
            1,
            {"1": [np.datetime64("2021-01-11")], "2": [np.datetime64("2021-01-09"), np.datetime64("2021-01-27")]},
        ),
    ],
)
@pytest.mark.parametrize("period", [4])
def test_mad_outliers_full_stl(window_size, mad_scale, stride, period, right_anomal, outliers_df_with_two_columns):
    """Check detected outliers when STL is applied with an explicit period."""
    # NOTE(review): despite the test name only ``seasonality=True`` is passed; ``trend`` keeps its default.
    detection_params = {
        "window_size": window_size,
        "stride": stride,
        "mad_scale": mad_scale,
        "period": period,
        "seasonality": True,
    }
    detected = get_anomalies_mad(ts=outliers_df_with_two_columns, **detection_params)
    assert detected == right_anomal

0 comments on commit 0b2cd55

Please sign in to comment.