Add IForestOutlierTransform (#381)
alex-hse-repository committed Jun 6, 2024
1 parent dd5edd1 commit aacd89d
Showing 11 changed files with 215 additions and 6 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Add `get_anomalies_iqr` function for anomaly detection ([#374](https://github.com/etna-team/etna/pull/374))
- Add `get_anomalies_isolation_forest` method for anomaly detection ([#375](https://github.com/etna-team/etna/pull/375))
-
- Add `IForestOutlierTransform` ([#381](https://github.com/etna-team/etna/pull/381))
-
-
-
1 change: 1 addition & 0 deletions docs/source/api_reference/transforms.rst
@@ -97,6 +97,7 @@ Transforms to detect outliers:
DensityOutliersTransform
MedianOutliersTransform
PredictionIntervalOutliersTransform
IForestOutlierTransform

Transforms to work with time-related features:

1 change: 1 addition & 0 deletions etna/transforms/__init__.py
@@ -55,6 +55,7 @@
from etna.transforms.missing_values import ResampleWithDistributionTransform
from etna.transforms.missing_values import TimeSeriesImputerTransform
from etna.transforms.outliers import DensityOutliersTransform
from etna.transforms.outliers import IForestOutlierTransform
from etna.transforms.outliers import MedianOutliersTransform
from etna.transforms.outliers import PredictionIntervalOutliersTransform
from etna.transforms.timestamp import DateFlagsTransform
1 change: 1 addition & 0 deletions etna/transforms/outliers/__init__.py
@@ -1,4 +1,5 @@
from etna.transforms.outliers.base import OutliersTransform
from etna.transforms.outliers.point_outliers import DensityOutliersTransform
from etna.transforms.outliers.point_outliers import IForestOutlierTransform
from etna.transforms.outliers.point_outliers import MedianOutliersTransform
from etna.transforms.outliers.point_outliers import PredictionIntervalOutliersTransform
149 changes: 145 additions & 4 deletions etna/transforms/outliers/point_outliers.py
@@ -1,17 +1,19 @@
from typing import Callable
from typing import Dict
from typing import List
from typing import Optional
from typing import Sequence
from typing import Type
from typing import Union

import pandas as pd
from numpy.random import RandomState
from typing_extensions import Literal

from etna import SETTINGS
from etna.analysis import get_anomalies_density
from etna.analysis import get_anomalies_median
from etna.analysis import get_anomalies_prediction_interval
from etna.analysis.outliers import get_anomalies_isolation_forest
from etna.datasets import TSDataset
from etna.distributions import BaseDistribution
from etna.distributions import CategoricalDistribution
@@ -57,7 +59,7 @@ def __init__(
self.alpha = alpha
super().__init__(in_column=in_column, ignore_flag_column=ignore_flag_column)

def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
def detect_outliers(self, ts: TSDataset) -> Dict[str, pd.Series]:
"""Call :py:func:`~etna.analysis.outliers.median_outliers.get_anomalies_median` function with self parameters.
Parameters
@@ -132,7 +134,7 @@ def __init__(
self.distance_func = distance_func
super().__init__(in_column=in_column, ignore_flag_column=ignore_flag_column)

def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
def detect_outliers(self, ts: TSDataset) -> Dict[str, pd.Series]:
"""Call :py:func:`~etna.analysis.outliers.density_outliers.get_anomalies_density` function with self parameters.
Parameters
@@ -217,7 +219,7 @@ def _get_model_type(
return SARIMAXModel
return model

def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
def detect_outliers(self, ts: TSDataset) -> Dict[str, pd.Series]:
"""Call :py:func:`~etna.analysis.outliers.prediction_interval_outliers.get_anomalies_prediction_interval` function with self parameters.
Parameters
@@ -255,8 +257,147 @@ def params_to_tune(self) -> Dict[str, BaseDistribution]:
}


class IForestOutlierTransform(OutliersTransform):
"""Transform that uses :py:func:`~etna.analysis.outliers.isolation_forest_outliers.get_anomalies_isolation_forest` to find anomalies in data."""

def __init__(
self,
in_column: str,
ignore_flag_column: Optional[str] = None,
features_to_use: Optional[Sequence[str]] = None,
features_to_ignore: Optional[Sequence[str]] = None,
ignore_missing: bool = False,
n_estimators: int = 100,
max_samples: Union[int, float, Literal["auto"]] = "auto",
contamination: Union[float, Literal["auto"]] = "auto",
max_features: Union[int, float] = 1.0,
bootstrap: bool = False,
n_jobs: Optional[int] = None,
random_state: Optional[Union[int, RandomState]] = None,
verbose: int = 0,
):
"""Create instance of PredictionIntervalOutliersTransform.
Parameters
----------
in_column:
Name of the column in which to search for anomalies
ignore_flag_column:
Column name for skipping values from outlier check
features_to_use:
List of feature column names to use for anomaly detection
features_to_ignore:
List of feature column names to exclude from anomaly detection
ignore_missing:
Whether to ignore missing values inside a series
n_estimators:
The number of base estimators in the ensemble
max_samples:
The number of samples to draw from X to train each base estimator
* If int, then draw max_samples samples.
* If float, then draw max_samples * X.shape[0] samples.
* If "auto", then max_samples=min(256, n_samples).
If max_samples is larger than the number of samples provided, all samples will be used for all trees (no sampling).
contamination:
The amount of contamination of the data set, i.e. the proportion of outliers in the data set.
Used when fitting to define the threshold on the scores of the samples.
* If "auto", the threshold is determined as in the original paper.
* If float, the contamination should be in the range (0, 0.5].
max_features:
The number of features to draw from X to train each base estimator.
* If int, then draw max_features features.
* If float, then draw `max(1, int(max_features * n_features_in_))` features.
Note: using a float number less than 1.0 or an integer less than the number of features
will enable feature subsampling and lead to a longer runtime.
bootstrap:
* If True, individual trees are fit on random subsets of the training data sampled with replacement.
* If False, sampling without replacement is performed.
n_jobs:
The number of jobs to run in parallel for both fit and predict.
* None means 1 unless in a joblib.parallel_backend context.
* -1 means using all processors.
random_state:
Controls the pseudo-randomness of the selection of the feature and split values for
each branching step and each tree in the forest.
verbose:
Controls the verbosity of the tree building process.
Notes
-----
For more details on the parameters, see the documentation of the Isolation Forest algorithm:
`Documentation for Isolation Forest <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html>`_.
"""
self.features_to_use = features_to_use
self.features_to_ignore = features_to_ignore
self.ignore_missing = ignore_missing
self.n_estimators = n_estimators
self.max_samples = max_samples
self.contamination = contamination
self.max_features = max_features
self.bootstrap = bootstrap
self.n_jobs = n_jobs
self.random_state = random_state
self.verbose = verbose
super().__init__(in_column=in_column, ignore_flag_column=ignore_flag_column)

def detect_outliers(self, ts: TSDataset) -> Dict[str, pd.Series]:
"""Call :py:func:`~etna.analysis.outliers.isolation_forest_outliers.get_anomalies_isolation_forest` function with self parameters.
Parameters
----------
ts:
Dataset to process
Returns
-------
:
Dict of outliers in format {segment: series of outliers}
"""
return get_anomalies_isolation_forest(
ts=ts,
in_column=self.in_column,
features_to_use=self.features_to_use,
features_to_ignore=self.features_to_ignore,
ignore_missing=self.ignore_missing,
n_estimators=self.n_estimators,
max_samples=self.max_samples,
contamination=self.contamination,
max_features=self.max_features,
bootstrap=self.bootstrap,
n_jobs=self.n_jobs,
random_state=self.random_state,
verbose=self.verbose,
index_only=False,
)

def params_to_tune(self) -> Dict[str, BaseDistribution]:
"""Get default grid for tuning hyperparameters.
This grid tunes parameters: ``n_estimators``, ``max_samples``, ``contamination``, ``max_features``, ``bootstrap``. Other parameters are expected to be set by the user.
Returns
-------
:
Grid to tune.
"""
return {
"n_estimators": IntDistribution(low=10, high=1000),
"max_samples": FloatDistribution(low=0.1, high=1.0),
"contamination": FloatDistribution(low=0.1, high=0.5),
"max_features": FloatDistribution(low=0.1, high=1.0),
"bootstrap": CategoricalDistribution([True, False]),
}


__all__ = [
"MedianOutliersTransform",
"DensityOutliersTransform",
"PredictionIntervalOutliersTransform",
"IForestOutlierTransform",
]
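For orientation, here is a minimal usage sketch of the new transform (not part of the diff). The synthetic data and the contamination/random_state values are illustrative assumptions, and the NaN-masking behaviour is the one inherited from OutliersTransform: transform masks detected points, inverse_transform restores them.

import numpy as np
import pandas as pd

from etna.datasets import TSDataset
from etna.transforms import IForestOutlierTransform

# Build a small synthetic series with one injected spike (illustrative data only).
df = pd.DataFrame(
    {
        "timestamp": pd.date_range("2021-01-01", periods=100, freq="D"),
        "segment": "segment_1",
        "target": np.sin(np.arange(100) / 5),
    }
)
df.loc[50, "target"] = 10.0  # obvious outlier

ts = TSDataset(TSDataset.to_dataset(df), freq="D")

# Detect outliers in "target" with an isolation forest; parameter values are illustrative.
transform = IForestOutlierTransform(in_column="target", contamination=0.01, random_state=42)
ts = transform.fit_transform(ts=ts)

# Points flagged as outliers are masked with NaN in the transformed dataset.
n_masked = ts.to_pandas().loc[:, ("segment_1", "target")].isna().sum()
print(f"masked points: {n_masked}")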
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -246,7 +246,8 @@ filterwarnings = [
"ignore: Implicitly cleaning up <TemporaryDirectory",
"ignore: Call to deprecated class DeepARModel.",
"ignore: dropout option adds dropout after all but last recurrent layer",
"ignore: Call to deprecated class TFTModel."
"ignore: Call to deprecated class TFTModel.",
"ignore: X does not have valid feature names, but IsolationForest was fitted with feature names" # fixed here https://github.com/scikit-learn/scikit-learn/issues/25844
]
markers = [
"smoke"
Empty file.
Empty file.
@@ -26,6 +26,7 @@
from etna.transforms import FourierTransform
from etna.transforms import GaleShapleyFeatureSelectionTransform
from etna.transforms import HolidayTransform
from etna.transforms import IForestOutlierTransform
from etna.transforms import LabelEncoderTransform
from etna.transforms import LagTransform
from etna.transforms import LambdaTransform
@@ -463,6 +464,7 @@ def _test_inverse_transform_train(self, ts, transform, expected_changes):
"ts_with_outliers",
{"change": {"target"}},
),
(IForestOutlierTransform(in_column="target"), "ts_with_outliers", {"change": {"target"}}),
# timestamp
(
DateFlagsTransform(out_column="res"),
@@ -909,6 +911,7 @@ def test_inverse_transform_train_fail_resample(self, transform, dataset_name, ex
# outliers
(DensityOutliersTransform(in_column="target"), "ts_with_outliers", {"change": {"target"}}),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers", {"change": {"target"}}),
(IForestOutlierTransform(in_column="target"), "ts_with_outliers", {"change": {"target"}}),
# timestamp
(
DateFlagsTransform(out_column="res", in_column="external_timestamp"),
@@ -1236,6 +1239,7 @@ def _test_inverse_transform_train_subset_segments(self, ts, transform, segments)
(DensityOutliersTransform(in_column="target"), "ts_with_outliers"),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers"),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"),
(IForestOutlierTransform(in_column="target"), "ts_with_outliers"),
# timestamp
(DateFlagsTransform(), "regular_ts"),
(
@@ -1541,6 +1545,7 @@ def _test_inverse_transform_future_subset_segments(self, ts, transform, segments
(DensityOutliersTransform(in_column="target"), "ts_with_outliers"),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers"),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"),
(IForestOutlierTransform(in_column="target"), "ts_with_outliers"),
# timestamp
(DateFlagsTransform(), "regular_ts"),
(
@@ -1991,6 +1996,7 @@ def test_inverse_transform_train_new_segments(self, transform, dataset_name, exp
(DensityOutliersTransform(in_column="target"), "ts_with_outliers"),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers"),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"),
(IForestOutlierTransform(in_column="target"), "ts_with_outliers"),
# timestamp
(SpecialDaysTransform(), "regular_ts"),
(
@@ -2434,6 +2440,7 @@ def test_inverse_transform_future_new_segments(self, transform, dataset_name, ex
(DensityOutliersTransform(in_column="target"), "ts_with_outliers"),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers"),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"),
(IForestOutlierTransform(in_column="target"), "ts_with_outliers"),
# timestamp
(SpecialDaysTransform(), "regular_ts"),
(
@@ -2877,6 +2884,7 @@ def _test_inverse_transform_future_with_target(
(DensityOutliersTransform(in_column="target"), "ts_with_outliers", {}),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers", {}),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers", {}),
(IForestOutlierTransform(in_column="target"), "ts_with_outliers", {}),
# timestamp
(
DateFlagsTransform(out_column="res"),
@@ -3403,6 +3411,7 @@ def _test_inverse_transform_future_without_target(
(DensityOutliersTransform(in_column="target"), "ts_with_outliers", {}),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers", {}),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers", {}),
(IForestOutlierTransform(in_column="target"), "ts_with_outliers", {}),
# timestamp
(
DateFlagsTransform(out_column="res"),
9 changes: 9 additions & 0 deletions tests/test_transforms/test_inference/test_transform.py
@@ -26,6 +26,7 @@
from etna.transforms import FourierTransform
from etna.transforms import GaleShapleyFeatureSelectionTransform
from etna.transforms import HolidayTransform
from etna.transforms import IForestOutlierTransform
from etna.transforms import LabelEncoderTransform
from etna.transforms import LagTransform
from etna.transforms import LambdaTransform
@@ -426,6 +427,7 @@ def _test_transform_train(self, ts, transform, expected_changes):
"ts_with_outliers",
{"change": {"target"}},
),
(IForestOutlierTransform(in_column="target"), "ts_with_outliers", {"change": {"target"}}),
# timestamp
(
DateFlagsTransform(out_column="res"),
@@ -865,6 +867,7 @@ def test_transform_train_datetime_timestamp(self, transform, dataset_name, expec
# outliers
(DensityOutliersTransform(in_column="target"), "ts_with_outliers", {"change": {"target"}}),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers", {"change": {"target"}}),
(IForestOutlierTransform(in_column="target"), "ts_with_outliers", {"change": {"target"}}),
# timestamp
(
DateFlagsTransform(out_column="res", in_column="external_timestamp"),
@@ -1187,6 +1190,7 @@ def _test_transform_train_subset_segments(self, ts, transform, segments):
(DensityOutliersTransform(in_column="target"), "ts_with_outliers"),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers"),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"),
(IForestOutlierTransform(in_column="target"), "ts_with_outliers"),
# timestamp
(DateFlagsTransform(), "regular_ts"),
(
@@ -1479,6 +1483,7 @@ def _test_transform_future_subset_segments(self, ts, transform, segments, horizo
(DensityOutliersTransform(in_column="target"), "ts_with_outliers"),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers"),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"),
(IForestOutlierTransform(in_column="target"), "ts_with_outliers"),
# timestamp
(DateFlagsTransform(), "regular_ts"),
(
@@ -1894,6 +1899,7 @@ def test_transform_train_new_segments(self, transform, dataset_name, expected_ch
(DensityOutliersTransform(in_column="target"), "ts_with_outliers"),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers"),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"),
(IForestOutlierTransform(in_column="target"), "ts_with_outliers"),
# timestamp
(SpecialDaysTransform(), "regular_ts"),
(SpecialDaysTransform(in_column="external_timestamp"), "ts_with_external_timestamp"),
@@ -2327,6 +2333,7 @@ def test_transform_future_new_segments(self, transform, dataset_name, expected_c
(DensityOutliersTransform(in_column="target"), "ts_with_outliers"),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers"),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"),
(IForestOutlierTransform(in_column="target"), "ts_with_outliers"),
# timestamp
(SpecialDaysTransform(), "regular_ts"),
(
@@ -2686,6 +2693,7 @@ def _test_transform_future_with_target(self, ts, transform, expected_changes, ga
(DensityOutliersTransform(in_column="target"), "ts_with_outliers", {}),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers", {}),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers", {}),
(IForestOutlierTransform(in_column="target"), "ts_with_outliers", {}),
# timestamp
(
DateFlagsTransform(out_column="res"),
@@ -3182,6 +3190,7 @@ def _test_transform_future_without_target(self, ts, transform, expected_changes,
(DensityOutliersTransform(in_column="target"), "ts_with_outliers", {}),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers", {}),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers", {}),
(IForestOutlierTransform(in_column="target"), "ts_with_outliers", {}),
# timestamp
(
DateFlagsTransform(out_column="res"),
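The test changes above only register IForestOutlierTransform in the existing parametrized inference suites. A standalone sanity check could look roughly like the sketch below; the test name is hypothetical, ts_with_outliers stands in for a fixture like the one referenced in the parametrizations, and the mask/restore behaviour is assumed from the shared OutliersTransform base class rather than taken from this commit.

import pandas as pd

from etna.transforms import IForestOutlierTransform


def test_iforest_outlier_transform_masks_and_restores(ts_with_outliers):
    # Hypothetical sanity check, not part of this commit.
    original = ts_with_outliers.to_pandas().copy()

    transform = IForestOutlierTransform(in_column="target", random_state=42)
    transformed = transform.fit_transform(ts=ts_with_outliers)

    # At least one point should be flagged and masked with NaN.
    target = transformed.to_pandas().loc[:, pd.IndexSlice[:, "target"]]
    assert target.isna().any().any()

    # inverse_transform should put the original values back.
    restored = transform.inverse_transform(ts=transformed)
    pd.testing.assert_frame_equal(restored.to_pandas(), original)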