DensityOutliersTransform optimization (#231)
* `DensityOutliersTransform` optimization

* fixed tests

* removed `_save_original_values`

* reworked indexing

* updated docs

* support scalar functions

* updated transform class

* updated tests

* added comments

* updated tests

* minor changes

* updated changelog
brsnw250 committed Feb 21, 2024
1 parent 71b80d2 commit a8efed8
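
The core of the optimization, per the diff below, is to test each element against the union of its candidate windows once, using a vectorized distance and a cumulative sum, instead of re-testing every window from scratch. A minimal standalone sketch of the idea (names and indexing are ours, not the library code; no guards for series shorter than the window):

```python
import numpy as np


def max_close_neighbors(series: np.ndarray, idx: int, window_size: int, threshold: float) -> int:
    """Max number of close neighbors of series[idx] over all windows of
    length `window_size` that contain position `idx` (illustrative sketch)."""
    start = max(0, idx - window_size + 1)       # leftmost window start
    stop = min(idx, len(series) - window_size)  # rightmost window start
    span = series[start : stop + window_size]   # union of all candidate windows
    close = (np.abs(span - series[idx]) < threshold).astype(int)  # vectorized distance, computed once
    counts = np.cumsum(close)

    best = 0
    for shift in range(stop - start + 1):
        # number of close elements inside the window starting at start + shift
        n = counts[shift + window_size - 1] - (counts[shift - 1] if shift else 0)
        best = max(best, n - 1)  # do not count the element itself
    return best


x = np.array([1.0, 1.0, 1.0, 10.0, 1.0, 1.0])
print(max_close_neighbors(x, idx=3, window_size=3, threshold=3.0))  # 0 -> density outlier candidate
print(max_close_neighbors(x, idx=0, window_size=3, threshold=3.0))  # 2 -> enough close neighbors
```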
Showing 13 changed files with 303 additions and 95 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -23,9 +23,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Create page about internal datasets in documentation ([#175](https://github.com/etna-team/etna/pull/175))
 - Add usage example of internal datasets in `101-get_started.ipynb` and `305-classification.ipynb` tutorials ([#202](https://github.com/etna-team/etna/pull/202))
 - Add size method to `TSDataset` class ([#238](https://github.com/etna-team/etna/pull/238))
+- Add the `index_only` parameter to outlier analysis functions for return type control ([#231](https://github.com/etna-team/etna/pull/231))

 ### Changed
 - Add `relevance_aggregation_mode` and `redundancy_aggregation_mode` into `MRMRFeatureSelectionTransform.params_to_tune` ([#212](https://github.com/etna-team/etna/pull/212))
+- Optimized `DensityOutliersTransform` and removed `_save_original_values` from outlier transforms ([#231](https://github.com/etna-team/etna/pull/231))

 ### Fixed
 - Fix `traffic_2008` ([128](https://github.com/etna-team/etna/pull/128))
137 changes: 95 additions & 42 deletions etna/analysis/outliers/density_outliers.py
@@ -1,7 +1,11 @@
+from enum import Enum
+from itertools import islice
 from typing import TYPE_CHECKING
 from typing import Callable
 from typing import Dict
 from typing import List
+from typing import Literal
+from typing import Union

 import numpy as np
 import pandas as pd
@@ -10,7 +14,7 @@
 from etna.datasets import TSDataset


-def absolute_difference_distance(x: float, y: float) -> float:
+def absolute_difference_distance(x: np.ndarray, y: np.ndarray) -> np.ndarray:
     """Calculate distance for :py:func:`get_anomalies_density` function by taking absolute value of difference.

     Parameters
@@ -22,18 +26,36 @@ def absolute_difference_distance(x: float, y: float) -> float:
     Returns
     -------
-    result: float
+    result: np.ndarray
         absolute difference between values
     """
-    return abs(x - y)
+    return np.abs(x - y)
+
+
+class DistanceFunction(str, Enum):
+    """Enum for points distance measure functions."""
+
+    absolute_difference = "absolute_difference"
+
+    @classmethod
+    def _missing_(cls, value):
+        raise NotImplementedError(
+            f"{value} is not a valid {cls.__name__}. Only {', '.join([repr(m.value) for m in cls])} allowed"
+        )
+
+    def get_callable(self) -> Callable[[np.ndarray, np.ndarray], np.ndarray]:
+        if self.value == DistanceFunction.absolute_difference:
+            return absolute_difference_distance
+        else:
+            raise ValueError("Invalid distance function!")


 def get_segment_density_outliers_indices(
     series: np.ndarray,
     window_size: int = 7,
     distance_threshold: float = 10,
     n_neighbors: int = 3,
-    distance_func: Callable[[float, float], float] = absolute_difference_distance,
+    distance_func: Union[Literal["absolute_difference"], Callable[[float, float], float]] = "absolute_difference",
 ) -> List[int]:
     """Get indices of outliers for one series.
@@ -48,40 +70,60 @@ def get_segment_density_outliers_indices(
     n_neighbors:
         minimum number of close items an element must have in order not to be an outlier
     distance_func:
-        distance function
+        distance function. If a string is specified, a corresponding vectorized implementation will be used.
+        A custom callable will be applied as a scalar function, which results in worse performance.

     Returns
     -------
     :
         list of outliers' indices
     """
+    idxs = np.arange(len(series))
+    start_idxs = np.maximum(0, idxs - window_size)
+    end_idxs = np.maximum(0, np.minimum(idxs, len(series) - window_size)) + 1
+
+    max_shifts: np.ndarray = end_idxs - start_idxs
+
+    if isinstance(distance_func, str):
+        dist_func = DistanceFunction(distance_func).get_callable()
+
+        def _closeness_func(x, start, stop, y):
+            return (dist_func(x[start:stop], y) < distance_threshold).astype(int)
+
+    else:

-    def is_close(item1: float, item2: float) -> int:
-        """Return 1 if item1 is closer to item2 than distance_threshold according to distance_func, 0 otherwise."""
-        return int(distance_func(item1, item2) < distance_threshold)
+        def _closeness_func(x, start, stop, y):
+            return [int(distance_func(elem, y) < distance_threshold) for elem in islice(x, start, stop)]

     outliers_indices = []
-    for idx, item in enumerate(series):
-        is_outlier = True
-        left_start = max(0, idx - window_size)
-        left_stop = max(0, min(idx, len(series) - window_size))
-        closeness = None
-        n = 0
-        for i in range(left_start, left_stop + 1):
-            if closeness is None:
-                closeness = [is_close(item, series[j]) for j in range(i, min(i + window_size, len(series)))]
-                n = sum(closeness) - 1
-            else:
-                n -= closeness.pop(0)
-                new_element_is_close = is_close(item, series[i + window_size - 1])
-                closeness.append(new_element_is_close)
-                n += new_element_is_close
-            if n >= n_neighbors:
-                is_outlier = False
+    for idx, item, start_idx, max_shift in zip(idxs, series, start_idxs, max_shifts):
+        # compute which neighbours are close to the element in the given windows
+        closeness = _closeness_func(
+            x=series,
+            start=start_idx,
+            stop=start_idx + window_size + max_shift - 1,
+            y=item,
+        )
+
+        # compute the number of close neighbours before each index
+        num_close = np.cumsum(closeness)
+
+        outlier = True
+        for shift in range(max_shift):
+            # number of neighbours in a particular window
+            num_in_window = num_close[-max_shift + shift] - num_close[shift]
+            if (start_idx + shift) != idx:
+                # subtract the current element if it is not on the window border
+                num_in_window += closeness[shift] - 1
+
+            if num_in_window >= n_neighbors:
+                outlier = False
                 break
-        if is_outlier:
+
+        if outlier:
             outliers_indices.append(idx)
-    return list(outliers_indices)
+
+    return outliers_indices


 def get_anomalies_density(
@@ -90,8 +132,9 @@ def get_anomalies_density(
     window_size: int = 15,
     distance_coef: float = 3,
     n_neighbors: int = 3,
-    distance_func: Callable[[float, float], float] = absolute_difference_distance,
-) -> Dict[str, List[pd.Timestamp]]:
+    distance_func: Union[Literal["absolute_difference"], Callable[[float, float], float]] = "absolute_difference",
+    index_only: bool = True,
+) -> Dict[str, Union[List[pd.Timestamp], pd.Series]]:
     """Compute outliers according to density rule.

     For each element in the series, build all the windows of size ``window_size`` containing this point.
@@ -111,7 +154,10 @@ def get_anomalies_density(
     n_neighbors:
         minimum number of close neighbors for a point not to be an outlier
     distance_func:
-        distance function
+        distance function. If a string is specified, a corresponding vectorized implementation will be used.
+        A custom callable will be applied as a scalar function, which results in worse performance.
+    index_only:
+        whether to return only outlier indices. If `False`, will return outlier series.

     Returns
     -------
@@ -122,27 +168,34 @@ def get_anomalies_density(
     -----
     It is a variation of the distance-based (index) outlier detection method adapted for time series.
     """
-    segments = ts.segments
     outliers_per_segment = {}
-    for seg in segments:
+
+    segments_df = ts[..., in_column].droplevel("feature", axis=1)
+    stds = np.nanstd(segments_df.values, axis=0)
+
+    for series_std, (segment, series_df) in zip(stds, segments_df.items()):
         # TODO: dropna() is now responsible for removing NaNs at the end of the sequence and in the middle of it.
         # Maybe an error or warning should be raised in this case.
-        segment_df = ts[:, seg, :][seg].dropna().reset_index()
-        series = segment_df[in_column].values
-        timestamps = segment_df["timestamp"].values
-        series_std = np.std(series)
-        if series_std:
+        series = series_df.dropna()
+
+        if series_std > 0:
            outliers_idxs = get_segment_density_outliers_indices(
-                series=series,
+                series=series.values,
                window_size=window_size,
                distance_threshold=distance_coef * series_std,
                n_neighbors=n_neighbors,
                distance_func=distance_func,
            )
-            outliers = [timestamps[i] for i in outliers_idxs]
-            outliers_per_segment[seg] = outliers
-        else:
-            outliers_per_segment[seg] = []
+
+            if len(outliers_idxs):
+                if index_only:
+                    store_values = list(series.index.values[outliers_idxs])
+                else:
+                    store_values = series.iloc[outliers_idxs]
+
+                outliers_per_segment[segment] = store_values

     return outliers_per_segment
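
A usage sketch for the updated function, assuming an already constructed `TSDataset` named `ts`; the last call exercises the slower scalar fallback mentioned in the docstring:

```python
from etna.analysis import get_anomalies_density

# Default: vectorized "absolute_difference" distance, timestamps only.
outlier_timestamps = get_anomalies_density(ts, window_size=15, distance_coef=3, n_neighbors=3)

# New in this PR: return the outlier values themselves as pd.Series per segment.
outlier_series = get_anomalies_density(ts, index_only=False)

# Custom scalar distance function: still supported, but runs in a Python loop.
outlier_custom = get_anomalies_density(ts, distance_func=lambda x, y: abs(x - y))
```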
19 changes: 15 additions & 4 deletions etna/analysis/outliers/hist_outliers.py
@@ -2,6 +2,7 @@
 from copy import deepcopy
 from typing import TYPE_CHECKING
 from typing import List
+from typing import Union

 import numba
 import numpy as np
@@ -299,8 +300,8 @@ def hist(series: np.ndarray, bins_number: int) -> np.ndarray:


 def get_anomalies_hist(
-    ts: "TSDataset", in_column: str = "target", bins_number: int = 10
-) -> typing.Dict[str, List[pd.Timestamp]]:
+    ts: "TSDataset", in_column: str = "target", bins_number: int = 10, index_only: bool = True
+) -> typing.Dict[str, Union[List[pd.Timestamp], pd.Series]]:
     """
     Get point outliers in time series using histogram model.
@@ -315,6 +316,8 @@
         name of the column in which anomalies are searched for
     bins_number:
         number of bins
+    index_only:
+        whether to return only outlier indices. If `False`, will return outlier series.

     Returns
     -------
@@ -326,9 +329,17 @@
     for seg in segments:
         segment_df = ts.df[seg].reset_index()
         values = segment_df[in_column].values
-        timestamp = segment_df["timestamp"].values

         anomalies = hist(values, bins_number)

-        outliers_per_segment[seg] = [timestamp[i] for i in anomalies]
+        if len(anomalies):
+            store_values = segment_df.iloc[anomalies]
+
+            if index_only:
+                store_values = list(store_values["timestamp"].values)
+            else:
+                store_values = pd.Series(store_values[in_column].values, index=store_values["timestamp"])
+
+            outliers_per_segment[seg] = store_values

     return outliers_per_segment
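
The `index_only` conversion block above now repeats almost verbatim across the detectors; a shared helper along these lines (our illustration, not something this PR adds) could keep them in sync:

```python
from typing import List, Union

import pandas as pd


def format_outliers(timestamps, values, index_only: bool) -> Union[List[pd.Timestamp], pd.Series]:
    """Return outlier timestamps only, or a value series indexed by timestamp (hypothetical helper)."""
    if index_only:
        return list(timestamps)
    return pd.Series(values, index=timestamps)


# e.g. inside a detector:
# outliers_per_segment[seg] = format_outliers(
#     store_values["timestamp"], store_values[in_column].values, index_only
# )
```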
24 changes: 19 additions & 5 deletions etna/analysis/outliers/median_outliers.py
@@ -1,5 +1,8 @@
 import math
 import typing
+from typing import Dict
+from typing import List
+from typing import Union

 import numpy as np
 import pandas as pd
@@ -9,8 +12,8 @@


 def get_anomalies_median(
-    ts: "TSDataset", in_column: str = "target", window_size: int = 10, alpha: float = 3
-) -> typing.Dict[str, typing.List[pd.Timestamp]]:
+    ts: "TSDataset", in_column: str = "target", window_size: int = 10, alpha: float = 3, index_only: bool = True
+) -> Dict[str, Union[List[pd.Timestamp], pd.Series]]:
     """
     Get point outliers in time series using median model (estimation model-based method).
@@ -27,6 +30,8 @@ def get_anomalies_median(
         number of points in the window
     alpha:
         coefficient for determining the threshold
+    index_only:
+        whether to return only outlier indices. If `False`, will return outlier series.

     Returns
     -------
@@ -36,11 +41,10 @@
     outliers_per_segment = {}
     segments = ts.segments
     for seg in segments:
-        anomalies: typing.List[int] = []
+        anomalies: List[int] = []

         segment_df = ts.df[seg].reset_index()
         values = segment_df[in_column].values
-        timestamp = segment_df["timestamp"].values

         n_iter = math.ceil(len(values) / window_size)
         for i in range(n_iter):
@@ -50,5 +54,15 @@ def get_anomalies_median(
             std = np.std(values[left_border:right_border])
             diff = np.abs(values[left_border:right_border] - med)
             anomalies.extend(np.where(diff > std * alpha)[0] + left_border)
-        outliers_per_segment[seg] = [timestamp[i] for i in anomalies]
+
+        if len(anomalies):
+            store_values = segment_df.iloc[anomalies]
+
+            if index_only:
+                store_values = list(store_values["timestamp"].values)
+            else:
+                store_values = pd.Series(store_values[in_column].values, index=store_values["timestamp"])
+
+            outliers_per_segment[seg] = store_values

     return outliers_per_segment
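
A self-contained check of the median rule above on toy data: points farther than `alpha * std` from their window median get flagged.

```python
import numpy as np

values = np.array([1.0, 1.1, 0.9, 8.0, 1.0, 1.2, 0.8, 1.1])
window_size, alpha = 4, 2
anomalies = []
for left_border in range(0, len(values), window_size):
    window = values[left_border : left_border + window_size]
    med, std = np.median(window), np.std(window)
    diff = np.abs(window - med)
    anomalies.extend((np.where(diff > std * alpha)[0] + left_border).tolist())
print(anomalies)  # [3] -- only the 8.0 is far enough from its window median
```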
16 changes: 14 additions & 2 deletions etna/analysis/outliers/prediction_interval_outliers.py
@@ -66,8 +66,9 @@ def get_anomalies_prediction_interval(
     model: Union[Type["ProphetModel"], Type["SARIMAXModel"]],
     interval_width: float = 0.95,
     in_column: str = "target",
+    index_only: bool = True,
     **model_params,
-) -> Dict[str, List[pd.Timestamp]]:
+) -> Dict[str, Union[List[pd.Timestamp], pd.Series]]:
     """
     Get point outliers in time series using prediction intervals (estimation model-based method).
@@ -87,6 +88,8 @@
         * If it is set to "target", then all data will be used for prediction.
         * Otherwise, only column data will be used.
+    index_only:
+        whether to return only outlier indices. If `False`, will return outlier series.

     Returns
     -------
@@ -115,5 +118,14 @@
         anomalies_mask = (actual_segment_slice["target"] > predicted_segment_slice[f"target_{upper_p:.4g}"]) | (
             actual_segment_slice["target"] < predicted_segment_slice[f"target_{lower_p:.4g}"]
         )
-        outliers_per_segment[segment] = list(predicted_segment_slice[anomalies_mask].index.values)
+
+        if anomalies_mask.sum() >= 1:
+            store_values = actual_segment_slice[anomalies_mask]
+            if index_only:
+                store_values = list(store_values.index.values)
+            else:
+                store_values = pd.Series(store_values["target"], index=store_values.index)
+
+            outliers_per_segment[segment] = store_values

     return outliers_per_segment
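
A usage sketch for the updated function, assuming `prophet` is installed and `ts` is a constructed `TSDataset`:

```python
from etna.analysis import get_anomalies_prediction_interval
from etna.models import ProphetModel

# Timestamps falling outside the 95% prediction interval, per segment.
outlier_timestamps = get_anomalies_prediction_interval(ts, model=ProphetModel, interval_width=0.95)

# Same detection, but keep the anomalous target values as pd.Series.
outlier_values = get_anomalies_prediction_interval(
    ts, model=ProphetModel, interval_width=0.95, index_only=False
)
```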