DensityOutliersTransform optimization (#231)
* `DensityOutliersTransform` optimization

* fixed tests

* removed `_save_original_values`

* reworked indexing

* updated docs

* support scalar functions

* updated transform class

* updated tests

* added comments

* updated tests

* minor changes

* updated changelog
brsnw250 committed Feb 21, 2024
1 parent 71b80d2 commit a8efed8
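
The core of the optimization, per the diff below, is to test each element against the union of its candidate windows once, using a vectorized distance and a cumulative sum, instead of re-testing every window from scratch. A minimal standalone sketch of the idea (names and indexing are ours, not the library code; no guards for series shorter than the window):

```python
import numpy as np


def max_close_neighbors(series: np.ndarray, idx: int, window_size: int, threshold: float) -> int:
    """Max number of close neighbors of series[idx] over all windows of
    length `window_size` that contain position `idx` (illustrative sketch)."""
    start = max(0, idx - window_size + 1)       # leftmost window start
    stop = min(idx, len(series) - window_size)  # rightmost window start
    span = series[start : stop + window_size]   # union of all candidate windows
    close = (np.abs(span - series[idx]) < threshold).astype(int)  # vectorized distance, computed once
    counts = np.cumsum(close)

    best = 0
    for shift in range(stop - start + 1):
        # number of close elements inside the window starting at start + shift
        n = counts[shift + window_size - 1] - (counts[shift - 1] if shift else 0)
        best = max(best, n - 1)  # do not count the element itself
    return best


x = np.array([1.0, 1.0, 1.0, 10.0, 1.0, 1.0])
print(max_close_neighbors(x, idx=3, window_size=3, threshold=3.0))  # 0 -> density outlier candidate
print(max_close_neighbors(x, idx=0, window_size=3, threshold=3.0))  # 2 -> enough close neighbors
```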
Showing 13 changed files with 303 additions and 95 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -23,9 +23,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Create page about internal datasets in documentation ([#175](https://github.com/etna-team/etna/pull/175))
 - Add usage example of internal datasets in `101-get_started.ipynb` and `305-classification.ipynb` tutorials ([#202](https://github.com/etna-team/etna/pull/202))
 - Add size method to `TSDataset` class ([#238](https://github.com/etna-team/etna/pull/238))
+- Add the `index_only` parameter to outlier analysis functions for return type control ([#231](https://github.com/etna-team/etna/pull/231))

 ### Changed
 - Add `relevance_aggregation_mode` and `redundancy_aggregation_mode` into `MRMRFeatureSelectionTransform.params_to_tune` ([#212](https://github.com/etna-team/etna/pull/212))
+- Optimized `DensityOutliersTransform` and removed `_save_original_values` from outlier transforms ([#231](https://github.com/etna-team/etna/pull/231))

 ### Fixed
 - Fix `traffic_2008` ([128](https://github.com/etna-team/etna/pull/128))
137 changes: 95 additions & 42 deletions etna/analysis/outliers/density_outliers.py
@@ -1,7 +1,11 @@
+from enum import Enum
+from itertools import islice
 from typing import TYPE_CHECKING
 from typing import Callable
 from typing import Dict
 from typing import List
+from typing import Literal
+from typing import Union

 import numpy as np
 import pandas as pd
@@ -10,7 +14,7 @@
 from etna.datasets import TSDataset


-def absolute_difference_distance(x: float, y: float) -> float:
+def absolute_difference_distance(x: np.ndarray, y: np.ndarray) -> np.ndarray:
     """Calculate distance for :py:func:`get_anomalies_density` function by taking absolute value of difference.

     Parameters
@@ -22,18 +26,36 @@ def absolute_difference_distance(x: float, y: float) -> float:
     Returns
     -------
-    result: float
+    result: np.ndarray
         absolute difference between values
     """
-    return abs(x - y)
+    return np.abs(x - y)
+
+
+class DistanceFunction(str, Enum):
+    """Enum for points distance measure functions."""
+
+    absolute_difference = "absolute_difference"
+
+    @classmethod
+    def _missing_(cls, value):
+        raise NotImplementedError(
+            f"{value} is not a valid {cls.__name__}. Only {', '.join([repr(m.value) for m in cls])} allowed"
+        )
+
+    def get_callable(self) -> Callable[[np.ndarray, np.ndarray], np.ndarray]:
+        if self.value == DistanceFunction.absolute_difference:
+            return absolute_difference_distance
+        else:
+            raise ValueError("Invalid distance function!")


 def get_segment_density_outliers_indices(
     series: np.ndarray,
     window_size: int = 7,
     distance_threshold: float = 10,
     n_neighbors: int = 3,
-    distance_func: Callable[[float, float], float] = absolute_difference_distance,
+    distance_func: Union[Literal["absolute_difference"], Callable[[float, float], float]] = "absolute_difference",
 ) -> List[int]:
     """Get indices of outliers for one series.
@@ -48,40 +70,60 @@ def get_segment_density_outliers_indices(
     n_neighbors:
         minimum number of close items an element must have in order not to be an outlier
     distance_func:
-        distance function
+        distance function. If a string is specified, a corresponding vectorized implementation will be used.
+        A custom callable will be applied as a scalar function, which results in worse performance.

     Returns
     -------
     :
         list of outliers' indices
     """
+    idxs = np.arange(len(series))
+    start_idxs = np.maximum(0, idxs - window_size)
+    end_idxs = np.maximum(0, np.minimum(idxs, len(series) - window_size)) + 1
+
+    max_shifts: np.ndarray = end_idxs - start_idxs
+
+    if isinstance(distance_func, str):
+        dist_func = DistanceFunction(distance_func).get_callable()
+
+        def _closeness_func(x, start, stop, y):
+            return (dist_func(x[start:stop], y) < distance_threshold).astype(int)
+
+    else:

-    def is_close(item1: float, item2: float) -> int:
-        """Return 1 if item1 is closer to item2 than distance_threshold according to distance_func, 0 otherwise."""
-        return int(distance_func(item1, item2) < distance_threshold)
+        def _closeness_func(x, start, stop, y):
+            return [int(distance_func(elem, y) < distance_threshold) for elem in islice(x, start, stop)]

     outliers_indices = []
-    for idx, item in enumerate(series):
-        is_outlier = True
-        left_start = max(0, idx - window_size)
-        left_stop = max(0, min(idx, len(series) - window_size))
-        closeness = None
-        n = 0
-        for i in range(left_start, left_stop + 1):
-            if closeness is None:
-                closeness = [is_close(item, series[j]) for j in range(i, min(i + window_size, len(series)))]
-                n = sum(closeness) - 1
-            else:
-                n -= closeness.pop(0)
-                new_element_is_close = is_close(item, series[i + window_size - 1])
-                closeness.append(new_element_is_close)
-                n += new_element_is_close
-            if n >= n_neighbors:
-                is_outlier = False
+    for idx, item, start_idx, max_shift in zip(idxs, series, start_idxs, max_shifts):
+        # compute which neighbours are close to the element in the given windows
+        closeness = _closeness_func(
+            x=series,
+            start=start_idx,
+            stop=start_idx + window_size + max_shift - 1,
+            y=item,
+        )
+
+        # compute the number of close neighbours before each index
+        num_close = np.cumsum(closeness)
+
+        outlier = True
+        for shift in range(max_shift):
+            # number of neighbours in a particular window
+            num_in_window = num_close[-max_shift + shift] - num_close[shift]
+            if (start_idx + shift) != idx:
+                # subtract the current element if it is not on the window border
+                num_in_window += closeness[shift] - 1
+
+            if num_in_window >= n_neighbors:
+                outlier = False
                 break
-        if is_outlier:
+
+        if outlier:
             outliers_indices.append(idx)
-    return list(outliers_indices)
+
+    return outliers_indices


 def get_anomalies_density(
@@ -90,8 +132,9 @@ def get_anomalies_density(
     window_size: int = 15,
     distance_coef: float = 3,
     n_neighbors: int = 3,
-    distance_func: Callable[[float, float], float] = absolute_difference_distance,
-) -> Dict[str, List[pd.Timestamp]]:
+    distance_func: Union[Literal["absolute_difference"], Callable[[float, float], float]] = "absolute_difference",
+    index_only: bool = True,
+) -> Dict[str, Union[List[pd.Timestamp], pd.Series]]:
     """Compute outliers according to density rule.

     For each element in the series, build all the windows of size ``window_size`` containing this point.
@@ -111,7 +154,10 @@ def get_anomalies_density(
     n_neighbors:
         minimum number of close neighbors for a point not to be an outlier
     distance_func:
-        distance function
+        distance function. If a string is specified, a corresponding vectorized implementation will be used.
+        A custom callable will be applied as a scalar function, which results in worse performance.
+    index_only:
+        whether to return only outlier indices. If `False`, will return outlier series.

     Returns
     -------
@@ -122,27 +168,34 @@ def get_anomalies_density(
     -----
     It is a variation of the distance-based (index) outlier detection method adapted for time series.
     """
-    segments = ts.segments
     outliers_per_segment = {}
-    for seg in segments:
+
+    segments_df = ts[..., in_column].droplevel("feature", axis=1)
+    stds = np.nanstd(segments_df.values, axis=0)
+
+    for series_std, (segment, series_df) in zip(stds, segments_df.items()):
         # TODO: dropna() is now responsible for removing NaNs at the end of the sequence and in the middle of it.
         # Maybe an error or warning should be raised in this case.
-        segment_df = ts[:, seg, :][seg].dropna().reset_index()
-        series = segment_df[in_column].values
-        timestamps = segment_df["timestamp"].values
-        series_std = np.std(series)
-        if series_std:
+        series = series_df.dropna()
+
+        if series_std > 0:
            outliers_idxs = get_segment_density_outliers_indices(
-                series=series,
+                series=series.values,
                window_size=window_size,
                distance_threshold=distance_coef * series_std,
                n_neighbors=n_neighbors,
                distance_func=distance_func,
            )
-            outliers = [timestamps[i] for i in outliers_idxs]
-            outliers_per_segment[seg] = outliers
-        else:
-            outliers_per_segment[seg] = []
+
+            if len(outliers_idxs):
+                if index_only:
+                    store_values = list(series.index.values[outliers_idxs])
+                else:
+                    store_values = series.iloc[outliers_idxs]
+
+                outliers_per_segment[segment] = store_values

     return outliers_per_segment
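
A usage sketch for the updated function, assuming an already constructed `TSDataset` named `ts`; the last call exercises the slower scalar fallback mentioned in the docstring:

```python
from etna.analysis import get_anomalies_density

# Default: vectorized "absolute_difference" distance, timestamps only.
outlier_timestamps = get_anomalies_density(ts, window_size=15, distance_coef=3, n_neighbors=3)

# New in this PR: return the outlier values themselves as pd.Series per segment.
outlier_series = get_anomalies_density(ts, index_only=False)

# Custom scalar distance function: still supported, but runs in a Python loop.
outlier_custom = get_anomalies_density(ts, distance_func=lambda x, y: abs(x - y))
```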
19 changes: 15 additions & 4 deletions etna/analysis/outliers/hist_outliers.py
@@ -2,6 +2,7 @@
 from copy import deepcopy
 from typing import TYPE_CHECKING
 from typing import List
+from typing import Union

 import numba
 import numpy as np
@@ -299,8 +300,8 @@ def hist(series: np.ndarray, bins_number: int) -> np.ndarray:


 def get_anomalies_hist(
-    ts: "TSDataset", in_column: str = "target", bins_number: int = 10
-) -> typing.Dict[str, List[pd.Timestamp]]:
+    ts: "TSDataset", in_column: str = "target", bins_number: int = 10, index_only: bool = True
+) -> typing.Dict[str, Union[List[pd.Timestamp], pd.Series]]:
     """
     Get point outliers in time series using histogram model.
@@ -315,6 +316,8 @@
         name of the column in which anomalies are searched for
     bins_number:
         number of bins
+    index_only:
+        whether to return only outlier indices. If `False`, will return outlier series.

     Returns
     -------
@@ -326,9 +329,17 @@
     for seg in segments:
         segment_df = ts.df[seg].reset_index()
         values = segment_df[in_column].values
-        timestamp = segment_df["timestamp"].values

         anomalies = hist(values, bins_number)

-        outliers_per_segment[seg] = [timestamp[i] for i in anomalies]
+        if len(anomalies):
+            store_values = segment_df.iloc[anomalies]
+
+            if index_only:
+                store_values = list(store_values["timestamp"].values)
+            else:
+                store_values = pd.Series(store_values[in_column].values, index=store_values["timestamp"])
+
+            outliers_per_segment[seg] = store_values

     return outliers_per_segment
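
The `index_only` conversion block above now repeats almost verbatim across the detectors; a shared helper along these lines (our illustration, not something this PR adds) could keep them in sync:

```python
from typing import List, Union

import pandas as pd


def format_outliers(timestamps, values, index_only: bool) -> Union[List[pd.Timestamp], pd.Series]:
    """Return outlier timestamps only, or a value series indexed by timestamp (hypothetical helper)."""
    if index_only:
        return list(timestamps)
    return pd.Series(values, index=timestamps)


# e.g. inside a detector:
# outliers_per_segment[seg] = format_outliers(
#     store_values["timestamp"], store_values[in_column].values, index_only
# )
```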
24 changes: 19 additions & 5 deletions etna/analysis/outliers/median_outliers.py
@@ -1,5 +1,8 @@
 import math
 import typing
+from typing import Dict
+from typing import List
+from typing import Union

 import numpy as np
 import pandas as pd
@@ -9,8 +12,8 @@


 def get_anomalies_median(
-    ts: "TSDataset", in_column: str = "target", window_size: int = 10, alpha: float = 3
-) -> typing.Dict[str, typing.List[pd.Timestamp]]:
+    ts: "TSDataset", in_column: str = "target", window_size: int = 10, alpha: float = 3, index_only: bool = True
+) -> Dict[str, Union[List[pd.Timestamp], pd.Series]]:
     """
     Get point outliers in time series using median model (estimation model-based method).
@@ -27,6 +30,8 @@ def get_anomalies_median(
         number of points in the window
     alpha:
         coefficient for determining the threshold
+    index_only:
+        whether to return only outlier indices. If `False`, will return outlier series.

     Returns
     -------
@@ -36,11 +41,10 @@
     outliers_per_segment = {}
     segments = ts.segments
     for seg in segments:
-        anomalies: typing.List[int] = []
+        anomalies: List[int] = []

         segment_df = ts.df[seg].reset_index()
         values = segment_df[in_column].values
-        timestamp = segment_df["timestamp"].values

         n_iter = math.ceil(len(values) / window_size)
         for i in range(n_iter):
@@ -50,5 +54,15 @@ def get_anomalies_median(
             std = np.std(values[left_border:right_border])
             diff = np.abs(values[left_border:right_border] - med)
             anomalies.extend(np.where(diff > std * alpha)[0] + left_border)
-        outliers_per_segment[seg] = [timestamp[i] for i in anomalies]
+
+        if len(anomalies):
+            store_values = segment_df.iloc[anomalies]
+
+            if index_only:
+                store_values = list(store_values["timestamp"].values)
+            else:
+                store_values = pd.Series(store_values[in_column].values, index=store_values["timestamp"])
+
+            outliers_per_segment[seg] = store_values

     return outliers_per_segment
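
A self-contained check of the median rule above on toy data: points farther than `alpha * std` from their window median get flagged.

```python
import numpy as np

values = np.array([1.0, 1.1, 0.9, 8.0, 1.0, 1.2, 0.8, 1.1])
window_size, alpha = 4, 2
anomalies = []
for left_border in range(0, len(values), window_size):
    window = values[left_border : left_border + window_size]
    med, std = np.median(window), np.std(window)
    diff = np.abs(window - med)
    anomalies.extend((np.where(diff > std * alpha)[0] + left_border).tolist())
print(anomalies)  # [3] -- only the 8.0 is far enough from its window median
```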
16 changes: 14 additions & 2 deletions etna/analysis/outliers/prediction_interval_outliers.py
@@ -66,8 +66,9 @@ def get_anomalies_prediction_interval(
     model: Union[Type["ProphetModel"], Type["SARIMAXModel"]],
     interval_width: float = 0.95,
     in_column: str = "target",
+    index_only: bool = True,
     **model_params,
-) -> Dict[str, List[pd.Timestamp]]:
+) -> Dict[str, Union[List[pd.Timestamp], pd.Series]]:
     """
     Get point outliers in time series using prediction intervals (estimation model-based method).
@@ -87,6 +88,8 @@
         * If it is set to "target", then all data will be used for prediction.
         * Otherwise, only column data will be used.
+    index_only:
+        whether to return only outlier indices. If `False`, will return outlier series.

     Returns
     -------
@@ -115,5 +118,14 @@
         anomalies_mask = (actual_segment_slice["target"] > predicted_segment_slice[f"target_{upper_p:.4g}"]) | (
             actual_segment_slice["target"] < predicted_segment_slice[f"target_{lower_p:.4g}"]
         )
-        outliers_per_segment[segment] = list(predicted_segment_slice[anomalies_mask].index.values)
+
+        if anomalies_mask.sum() >= 1:
+            store_values = actual_segment_slice[anomalies_mask]
+            if index_only:
+                store_values = list(store_values.index.values)
+            else:
+                store_values = pd.Series(store_values["target"], index=store_values.index)
+
+            outliers_per_segment[segment] = store_values

     return outliers_per_segment
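
A usage sketch for the updated function, assuming `prophet` is installed and `ts` is a constructed `TSDataset`:

```python
from etna.analysis import get_anomalies_prediction_interval
from etna.models import ProphetModel

# Timestamps falling outside the 95% prediction interval, per segment.
outlier_timestamps = get_anomalies_prediction_interval(ts, model=ProphetModel, interval_width=0.95)

# Same detection, but keep the anomalous target values as pd.Series.
outlier_values = get_anomalies_prediction_interval(
    ts, model=ProphetModel, interval_width=0.95, index_only=False
)
```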