Skip to content

Commit

Permalink
Add get_anomalies_iqr function (#374)
Browse files Browse the repository at this point in the history
* added iqr method

* added fixtures

* added tests

* updated tests

* updated typing

* added test

* review fixes
  • Loading branch information
brsnw250 committed Jun 6, 2024
1 parent 19f2ab2 commit dd5edd1
Show file tree
Hide file tree
Showing 8 changed files with 579 additions and 1 deletion.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased
### Added
- Add `get_anomalies_iqr` function for anomaly detection ([#374](https://github.com/etna-team/etna/pull/374))
- Add `get_anomalies_isolation_forest` method for anomaly detection ([#375](https://github.com/etna-team/etna/pull/375))
-
-
-
-
-
-

### Changed
-
Expand Down Expand Up @@ -76,6 +76,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add `TSDataset.create_from_misaligned` constructor ([#269](https://github.com/etna-team/etna/pull/269))
- Add tutorial about working with misaligned data ([#288](https://github.com/etna-team/etna/pull/288))
- Add in `OutliersTransform` possibilities use `ignore_flag_column` to skip values use ignore ([#291](https://github.com/etna-team/etna/pull/291))
- Add `get_anomalies_iqr` function for anomaly detection ([#374](https://github.com/etna-team/etna/pull/374))

### Changed
- Update glossary with terms related to working with misaligned data ([#288](https://github.com/etna-team/etna/pull/288))
Expand Down
1 change: 1 addition & 0 deletions docs/source/api_reference/analysis.rst
Original file line number Diff line number Diff line change
Expand Up @@ -101,4 +101,5 @@ Outliers analysis utilities:
get_anomalies_hist
get_anomalies_median
get_anomalies_prediction_interval
get_anomalies_iqr
get_anomalies_isolation_forest
1 change: 1 addition & 0 deletions etna/analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,4 @@
from etna.analysis.outliers.isolation_forest_outliers import get_anomalies_isolation_forest
from etna.analysis.outliers.median_outliers import get_anomalies_median
from etna.analysis.outliers.prediction_interval_outliers import get_anomalies_prediction_interval
from etna.analysis.outliers.rolling_statistics import get_anomalies_iqr
1 change: 1 addition & 0 deletions etna/analysis/outliers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
from etna.analysis.outliers.plots import plot_anomalies
from etna.analysis.outliers.plots import plot_anomalies_interactive
from etna.analysis.outliers.prediction_interval_outliers import get_anomalies_prediction_interval
from etna.analysis.outliers.rolling_statistics import get_anomalies_iqr
277 changes: 277 additions & 0 deletions etna/analysis/outliers/rolling_statistics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
import functools
from typing import TYPE_CHECKING
from typing import Any
from typing import Callable
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

import numpy as np
import pandas as pd
from numpy.lib.stride_tricks import sliding_window_view
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.tsatools import freq_to_period

if TYPE_CHECKING:
from etna.datasets import TSDataset


def _sliding_window(x: np.ndarray, window_size: int, stride: int = 1) -> np.ndarray:
"""Prepare windows of 1-d data, strided by given parameters."""
if window_size <= 0:
raise ValueError("Window size must be positive integer!")

if stride < 1:
raise ValueError("Stride must be integer greater or equal to 1!")

# get all sliding windows views
all_windows = sliding_window_view(x[::-1], window_size)

# select only windows, that match given stride
strided_windows = all_windows[::stride, ::-1]

# reverse back to match the original order
return strided_windows[::-1]


def sliding_window_decorator(func: Callable) -> Callable:
    """Wrap a function so that it is evaluated on sliding windows of 1-d data."""

    @functools.wraps(func)
    def wrapper(
        series: pd.Series, *, window_size: int, stride: int = 1, return_indices: bool = True, **kwargs
    ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
        """
        Apply the wrapped function to each sliding window with the given stride.

        Parameters
        ----------
        series:
            original series.
        window_size:
            size of the window to make computations on.
        stride:
            offset between neighboring windows.
        return_indices:
            whether to return original indices along with the results.
        **kwargs:
            additional arguments for the function.

        Returns
        -------
        :
            arrays with the results of applying function on the sliding windows.
        """
        # positions of observations, sliced into end-anchored windows
        position_windows = _sliding_window(x=np.arange(series.size), window_size=window_size, stride=stride)

        # bind the series and extra arguments, then evaluate row-wise:
        # each row of `position_windows` is passed as the window positions
        bound_func = functools.partial(func, series, **kwargs)
        window_results = np.apply_along_axis(bound_func, 1, position_windows)

        return (window_results, position_windows) if return_indices else window_results

    return wrapper


def _stl_decompose(
    series: pd.Series, period: Optional[int] = None, trend: bool = False, seasonality: bool = False, **kwargs
) -> pd.Series:
    """
    Remove the selected STL components (trend and/or seasonality) from the series.

    Parameters
    ----------
    series:
        series for detrending and seasonality removal.
    period:
        periodicity of the sequence; autodetected from the index frequency when ``None``.
    trend:
        whether to remove trend from the series.
    seasonality:
        whether to remove seasonality from the series.
    **kwargs:
        other parameters for decompositions. See :py:class:`statsmodels.tsa.seasonal.STL`

    Returns
    -------
    :
        series with removed seasonality/trend.

    Raises
    ------
    ValueError:
        if no component is selected or the period cannot be inferred.
    """
    if not trend and not seasonality:
        raise ValueError("At least one component must be set!")

    if period is None:
        # fall back to autodetection of the period from the index frequency
        inferred_freq = getattr(series.index, "inferred_freq", None)
        if inferred_freq is None:
            raise ValueError("Series must have inferable frequency to autodetect period for STL!")

        period = freq_to_period(inferred_freq)

    decomposition = STL(endog=series, period=period, **kwargs).fit()

    result = series
    if trend:
        result = result - decomposition.trend
    if seasonality:
        result = result - decomposition.seasonal

    return result


def _outliers_per_segment(
df: pd.DataFrame, func: Callable, index_only: bool = True
) -> Dict[str, Union[List[pd.Timestamp], List[int], pd.Series]]:
"""Run estimation function for each segment."""
outliers_per_segment = {}
for segment in df.columns:
series = df[segment]

series = series.loc[series.first_valid_index() :]

if np.any(series.isna()):
raise ValueError(f"Segment `{segment}` contains missing values!")

mask, indices = func(series)

# set index as outlier if it was marked as such at least in one window
outlier_indices = np.unique(indices[mask].reshape(-1))

if len(outlier_indices) > 0:
if index_only:
outliers = list(series.index[outlier_indices].values)
else:
outliers = series.iloc[outlier_indices]

outliers_per_segment[segment] = outliers

return outliers_per_segment


@sliding_window_decorator
def iqr_method(
    series: pd.Series,
    indices: np.ndarray,
    iqr_scale: float = 1.5,
    period: Optional[int] = None,
    trend: bool = False,
    seasonality: bool = False,
    stl_params: Optional[Dict[str, Any]] = None,
) -> np.ndarray:
    """
    Estimate anomalies using IQR statistics.

    Parameters
    ----------
    series:
        original series for the estimation.
    indices:
        positional indices of the observations to use for the estimation.
    iqr_scale:
        scaling parameter of the estimated interval.
    period:
        periodicity of the sequence for STL.
    trend:
        whether to remove trend from the series.
    seasonality:
        whether to remove seasonality from the series.
    stl_params:
        other parameters for STL. See :py:class:`statsmodels.tsa.seasonal.STL`

    Returns
    -------
    :
        binary mask for each observation, indicating if it was estimated as an anomaly.

    Raises
    ------
    ValueError:
        if ``iqr_scale`` is not positive.
    """
    if iqr_scale <= 0:
        raise ValueError("Scaling parameter must be positive!")

    # `indices` are positional (built from `np.arange` by the decorator), so
    # use `iloc`: plain `series[indices]` performs label-based lookup on
    # integer-indexed series (e.g. int timestamps) and can select wrong rows.
    window = series.iloc[indices]

    if trend or seasonality:
        if stl_params is None:
            stl_params = {}

        # remove the requested components before estimating the interval
        window = _stl_decompose(series=window, period=period, trend=trend, seasonality=seasonality, **stl_params)

    values = window.values

    # interval bounds from the scaled interquartile range
    first = np.quantile(values, q=0.25)
    third = np.quantile(values, q=0.75)

    iqr = iqr_scale * (third - first)

    maxval = third + iqr
    minval = first - iqr

    anom_mask = (values > maxval) | (values < minval)

    return anom_mask


def get_anomalies_iqr(
    ts: "TSDataset",
    in_column: str = "target",
    window_size: int = 10,
    stride: int = 1,
    iqr_scale: float = 1.5,
    trend: bool = False,
    seasonality: bool = False,
    period: Optional[int] = None,
    stl_params: Optional[Dict[str, Any]] = None,
    index_only: bool = True,
) -> Dict[str, Union[List[pd.Timestamp], List[int], pd.Series]]:
    """
    Get point outliers in time series using IQR statistics, estimated on a rolling window.

    Outliers are all points that fall outside the estimated interval.

    Parameters
    ----------
    ts:
        TSDataset with timeseries data
    in_column:
        name of the column in which the anomaly is searching
    window_size:
        number of points in the window
    stride:
        offset between neighboring windows.
    iqr_scale:
        scaling parameter of the estimated interval.
    trend:
        whether to remove trend from the series.
    seasonality:
        whether to remove seasonality from the series
    period:
        periodicity of the sequence for STL.
    stl_params:
        other parameters for STL. See :py:class:`statsmodels.tsa.seasonal.STL`
    index_only:
        whether to return only outliers indices. If `False` will return outliers series

    Returns
    -------
    :
        mapping from segment names to corresponding outliers.
    """
    # wide dataframe with one column per segment for the selected feature
    segments_df = ts[..., in_column].droplevel(level="feature", axis=1)

    # bind all estimation parameters; `_outliers_per_segment` supplies the series
    estimator = functools.partial(
        iqr_method,
        window_size=window_size,
        stride=stride,
        iqr_scale=iqr_scale,
        trend=trend,
        seasonality=seasonality,
        period=period,
        stl_params=stl_params,
    )

    return _outliers_per_segment(df=segments_df, func=estimator, index_only=index_only)
12 changes: 12 additions & 0 deletions tests/test_analysis/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from etna.datasets import TSDataset
from etna.datasets import duplicate_data
from tests.utils import convert_ts_index_to_freq
from tests.utils import convert_ts_to_int_timestamp


@pytest.fixture(autouse=True)
Expand Down Expand Up @@ -41,3 +43,13 @@ def exog_and_target_dfs():
df["no_cast"] = df["no_cast"].astype("category")
df_exog = duplicate_data(df, segments=["a", "b"])
return ts, df_exog


@pytest.fixture
def outliers_df_with_two_columns_int_timestamp(outliers_df_with_two_columns):
    """Return the two-column outliers dataset with its timestamp converted to an integer index."""
    return convert_ts_to_int_timestamp(ts=outliers_df_with_two_columns)


@pytest.fixture
def outliers_df_with_two_columns_minute_freq(outliers_df_with_two_columns):
    """Return the two-column outliers dataset with its index converted to minute frequency ("T")."""
    return convert_ts_index_to_freq(ts=outliers_df_with_two_columns, freq="T")
Loading

0 comments on commit dd5edd1

Please sign in to comment.