diff --git a/CHANGELOG.md b/CHANGELOG.md
index eb37d4f06..28e65af2f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 ### Added
-- Add `get_anomalies_isolation_forest` method for anomaly detection ([#375](https://github.com/etna-team/etna/pull/375))
+-
+-
+-
+-
 -
 -
 -
@@ -28,7 +31,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 -
 
 ### Fixed
--
+- Fix errors when importing modules without `torch` extras ([#382](https://github.com/etna-team/etna/pull/382))
 -
 -
 -
diff --git a/docs/source/api_reference/analysis.rst b/docs/source/api_reference/analysis.rst
index 3b562d724..c2cbc41c2 100644
--- a/docs/source/api_reference/analysis.rst
+++ b/docs/source/api_reference/analysis.rst
@@ -101,4 +101,3 @@ Outliers analysis utilities:
    get_anomalies_hist
    get_anomalies_median
    get_anomalies_prediction_interval
-   get_anomalies_isolation_forest
diff --git a/etna/analysis/__init__.py b/etna/analysis/__init__.py
index 682d4e090..ab2c4179a 100644
--- a/etna/analysis/__init__.py
+++ b/etna/analysis/__init__.py
@@ -40,6 +40,5 @@
 from etna.analysis.outliers.density_outliers import absolute_difference_distance
 from etna.analysis.outliers.density_outliers import get_anomalies_density
 from etna.analysis.outliers.hist_outliers import get_anomalies_hist
-from etna.analysis.outliers.isolation_forest_outliers import get_anomalies_isolation_forest
 from etna.analysis.outliers.median_outliers import get_anomalies_median
 from etna.analysis.outliers.prediction_interval_outliers import get_anomalies_prediction_interval
diff --git a/etna/analysis/outliers/__init__.py b/etna/analysis/outliers/__init__.py
index 27c1841ab..44d56ae44 100644
--- a/etna/analysis/outliers/__init__.py
+++ b/etna/analysis/outliers/__init__.py
@@ -1,7 +1,6 @@
 from etna.analysis.outliers.density_outliers import absolute_difference_distance
 from etna.analysis.outliers.density_outliers import get_anomalies_density
 from etna.analysis.outliers.hist_outliers import get_anomalies_hist
-from etna.analysis.outliers.isolation_forest_outliers import get_anomalies_isolation_forest
 from etna.analysis.outliers.median_outliers import get_anomalies_median
 from etna.analysis.outliers.plots import plot_anomalies
 from etna.analysis.outliers.plots import plot_anomalies_interactive
diff --git a/etna/analysis/outliers/isolation_forest_outliers.py b/etna/analysis/outliers/isolation_forest_outliers.py
deleted file mode 100644
index a74a33de2..000000000
--- a/etna/analysis/outliers/isolation_forest_outliers.py
+++ /dev/null
@@ -1,172 +0,0 @@
-from typing import Dict
-from typing import List
-from typing import Literal
-from typing import Optional
-from typing import Sequence
-from typing import Union
-
-import pandas as pd
-from numpy.random import RandomState
-from sklearn.ensemble import IsolationForest
-
-from etna.datasets import TSDataset
-
-
-def _select_features(
-    ts: TSDataset, in_column: str, features_to_use: Optional[Sequence[str]], features_to_ignore: Optional[Sequence[str]]
-) -> pd.DataFrame:
-    features = ts.columns.get_level_values("feature")
-    if in_column not in features:
-        raise ValueError(f"Feature {in_column} is not present in the dataset.")
-
-    if features_to_use is None and features_to_ignore is None:
-        return ts.to_pandas()
-
-    df = ts.to_pandas()
-    if features_to_use is not None and features_to_ignore is None:
-        if not set(features_to_use).issubset(features):
-            raise ValueError(f"Features {set(features_to_use) - set(features)} are not present in the dataset.")
-        features_to_ignore = list(set(features) - set(features_to_use))
-    elif features_to_ignore is not None and features_to_use is None:
-        if not set(features_to_ignore).issubset(features):
-            raise ValueError(f"Features {set(features_to_ignore) - set(features)} are not present in the dataset.")
-    else:
-        raise ValueError(
-            "Changing the defaults there should be exactly one option set: features_to_use or features_to_ignore"
-        )
-    features_to_ignore = list(set(features_to_ignore) - {in_column})
-    df = df.drop(columns=features_to_ignore, level="feature")
-    return df
-
-
-def _prepare_segment_df(df: pd.DataFrame, segment: str, ignore_missing: bool) -> pd.DataFrame:
-    df_segment = df[segment]
-    if ignore_missing:
-        return df_segment.dropna()
-
-    first_valid_index = df_segment.isna().any(axis=1).idxmin()
-    df_segment = df_segment.loc[first_valid_index:]
-    if df_segment.isna().any().any():
-        raise ValueError(
-            f"Series {segment} contains NaNs! Set `ignore_missing=True` to drop them or impute them appropriately!"
-        )
-    return df_segment
-
-
-def _get_anomalies_isolation_forest_segment(
-    df_segment: pd.DataFrame, model: IsolationForest, in_column: str, use_in_column: bool, index_only: bool
-) -> Union[List[pd.Timestamp], List[int], pd.Series]:
-    model.fit(X=df_segment if use_in_column else df_segment.drop(columns=[in_column]))
-    anomalies_flags = model.predict(X=df_segment if use_in_column else df_segment.drop(columns=[in_column])) == -1
-    anomalies_series = df_segment.loc[anomalies_flags, in_column]
-    if index_only:
-        return list(anomalies_series.index.values)
-    return anomalies_series
-
-
-def get_anomalies_isolation_forest(
-    ts: TSDataset,
-    in_column: str = "target",
-    features_to_use: Optional[Sequence[str]] = None,
-    features_to_ignore: Optional[Sequence[str]] = None,
-    ignore_missing: bool = False,
-    n_estimators: int = 100,
-    max_samples: Union[int, float, Literal["auto"]] = "auto",
-    contamination: Union[float, Literal["auto"]] = "auto",
-    max_features: Union[int, float] = 1.0,
-    bootstrap: bool = False,
-    n_jobs: Optional[int] = None,
-    random_state: Optional[Union[int, RandomState]] = None,
-    verbose: int = 0,
-    index_only: bool = True,
-) -> Dict[str, Union[List[pd.Timestamp], List[int], pd.Series]]:
-    """
-    Get point outliers in time series using Isolation Forest algorithm.
-
-    `Documentation for Isolation Forest <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html>`_.
-
-    Parameters
-    ----------
-    ts:
-        TSDataset with timeseries data
-    in_column:
-        Name of the column in which the anomaly is searching
-    features_to_use:
-        List of feature column names to use for anomaly detection
-    features_to_ignore:
-        List of feature column names to exclude from anomaly detection
-    ignore_missing:
-        Whether to ignore missing values inside a series
-    n_estimators:
-        The number of base estimators in the ensemble
-    max_samples:
-        The number of samples to draw from X to train each base estimator
-        * If int, then draw max_samples samples.
-
-        * If float, then draw max_samples * X.shape[0] samples.
-
-        * If “auto”, then max_samples=min(256, n_samples).
-
-        If max_samples is larger than the number of samples provided, all samples will be used for all trees (no sampling).
-    contamination:
-        The amount of contamination of the data set, i.e. the proportion of outliers in the data set.
-        Used when fitting to define the threshold on the scores of the samples.
-        * If ‘auto’, the threshold is determined as in the original paper.
-
-        * If float, the contamination should be in the range (0, 0.5].
-    max_features:
-        The number of features to draw from X to train each base estimator.
-        * If int, then draw max_features features.
-
-        * If float, then draw `max(1, int(max_features * n_features_in_))` features.
-        Note: using a float number less than 1.0 or integer less than number of features
-        will enable feature subsampling and leads to a longer runtime.
-    bootstrap:
-        * If True, individual trees are fit on random subsets of the training data sampled with replacement.
-        * If False, sampling without replacement is performed.
-    n_jobs:
-        The number of jobs to run in parallel for both fit and predict.
-        * None means 1 unless in a joblib.parallel_backend context.
-        * -1 means using all processors
-    random_state:
-        Controls the pseudo-randomness of the selection of the feature and split values for
-        each branching step and each tree in the forest.
-    verbose:
-        Controls the verbosity of the tree building process.
-    index_only:
-        whether to return only outliers indices. If `False` will return outliers series
-
-    Returns
-    -------
-    :
-        dict of outliers in format {segment: [outliers_timestamps]}
-    """
-    df = _select_features(
-        ts=ts, in_column=in_column, features_to_use=features_to_use, features_to_ignore=features_to_ignore
-    )
-    model = IsolationForest(
-        n_estimators=n_estimators,
-        max_samples=max_samples,
-        contamination=contamination,
-        max_features=max_features,
-        bootstrap=bootstrap,
-        n_jobs=n_jobs,
-        random_state=random_state,
-        verbose=verbose,
-        warm_start=False,
-    )
-
-    use_in_column = True
-    if features_to_ignore is not None and in_column in features_to_ignore:
-        use_in_column = False
-    if features_to_use is not None and in_column not in features_to_use:
-        use_in_column = False
-
-    outliers_per_segment = {}
-    for segment in ts.segments:
-        df_segment = _prepare_segment_df(df=df, segment=segment, ignore_missing=ignore_missing)
-        outliers_per_segment[segment] = _get_anomalies_isolation_forest_segment(
-            df_segment=df_segment, model=model, in_column=in_column, use_in_column=use_in_column, index_only=index_only
-        )
-
-    return outliers_per_segment
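The deletion above reverts the Isolation Forest detector added in [#375](https://github.com/etna-team/etna/pull/375). For orientation, here is a minimal sketch of the per-segment logic that `_get_anomalies_isolation_forest_segment` wrapped, using plain scikit-learn on toy data; the frame and variable names are illustrative, not etna API:

```python
# Minimal sketch of the removed per-segment detection step: fit sklearn's
# IsolationForest on a segment's feature frame and keep the timestamps
# where predict() returns -1 (the outlier label).
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

# Toy stand-in for the `df_segment` frame from the deleted code.
index = pd.date_range("2021-01-01", periods=30, freq="D")
df_segment = pd.DataFrame({"target": np.sin(np.arange(30))}, index=index)
df_segment.loc["2021-01-15", "target"] = 10.0  # injected point outlier

model = IsolationForest(n_estimators=100, contamination="auto", random_state=0)
model.fit(X=df_segment)
anomalies_flags = model.predict(X=df_segment) == -1
print(list(df_segment.index[anomalies_flags]))  # expect 2021-01-15 among them
```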
diff --git a/etna/models/nn/deepstate/__init__.py b/etna/models/nn/deepstate/__init__.py
index c1e1581ab..95184c6e4 100644
--- a/etna/models/nn/deepstate/__init__.py
+++ b/etna/models/nn/deepstate/__init__.py
@@ -1,8 +1,11 @@
-from etna.models.nn.deepstate.linear_dynamic_system import LDS
-from etna.models.nn.deepstate.state_space_model import CompositeSSM
-from etna.models.nn.deepstate.state_space_model import DaylySeasonalitySSM
-from etna.models.nn.deepstate.state_space_model import LevelSSM
-from etna.models.nn.deepstate.state_space_model import LevelTrendSSM
-from etna.models.nn.deepstate.state_space_model import SeasonalitySSM
-from etna.models.nn.deepstate.state_space_model import WeeklySeasonalitySSM
-from etna.models.nn.deepstate.state_space_model import YearlySeasonalitySSM
+from etna import SETTINGS
+
+if SETTINGS.torch_required:
+    from etna.models.nn.deepstate.linear_dynamic_system import LDS
+    from etna.models.nn.deepstate.state_space_model import CompositeSSM
+    from etna.models.nn.deepstate.state_space_model import DaylySeasonalitySSM
+    from etna.models.nn.deepstate.state_space_model import LevelSSM
+    from etna.models.nn.deepstate.state_space_model import LevelTrendSSM
+    from etna.models.nn.deepstate.state_space_model import SeasonalitySSM
+    from etna.models.nn.deepstate.state_space_model import WeeklySeasonalitySSM
+    from etna.models.nn.deepstate.state_space_model import YearlySeasonalitySSM
diff --git a/etna/models/nn/deepstate/deepstate.py b/etna/models/nn/deepstate/deepstate.py
index baeea95c3..d2f72d02a 100644
--- a/etna/models/nn/deepstate/deepstate.py
+++ b/etna/models/nn/deepstate/deepstate.py
@@ -13,13 +13,13 @@
 from etna.distributions import IntDistribution
 from etna.models.base import DeepBaseModel
 from etna.models.base import DeepBaseNet
-from etna.models.nn.deepstate import LDS
-from etna.models.nn.deepstate import CompositeSSM
 
 if SETTINGS.torch_required:
     import torch
     import torch.nn as nn
 
+    from etna.models.nn.deepstate import LDS
+    from etna.models.nn.deepstate import CompositeSSM
     from etna.models.nn.utils import MultiEmbedding
diff --git a/etna/models/nn/deepstate/linear_dynamic_system.py b/etna/models/nn/deepstate/linear_dynamic_system.py
index 867eab797..112c35290 100644
--- a/etna/models/nn/deepstate/linear_dynamic_system.py
+++ b/etna/models/nn/deepstate/linear_dynamic_system.py
@@ -1,9 +1,12 @@
 from typing import Tuple
 
-import torch
-from torch import Tensor
-from torch.distributions.multivariate_normal import MultivariateNormal
-from torch.distributions.normal import Normal
+from etna import SETTINGS
+
+if SETTINGS.torch_required:
+    import torch
+    from torch import Tensor
+    from torch.distributions.multivariate_normal import MultivariateNormal
+    from torch.distributions.normal import Normal
 
 from etna.core import BaseMixin
 
diff --git a/etna/models/nn/deepstate/state_space_model.py b/etna/models/nn/deepstate/state_space_model.py
index 1bfd2934b..dffeabea9 100644
--- a/etna/models/nn/deepstate/state_space_model.py
+++ b/etna/models/nn/deepstate/state_space_model.py
@@ -7,9 +7,13 @@
 
 import numpy as np
 import pandas as pd
-import torch
-from torch import Tensor
-from torch.nn.functional import one_hot
+
+from etna import SETTINGS
+
+if SETTINGS.torch_required:
+    import torch
+    from torch import Tensor
+    from torch.nn.functional import one_hot
 
 from etna.core import BaseMixin
diff --git a/etna/transforms/embeddings/models/__init__.py b/etna/transforms/embeddings/models/__init__.py
index f7e380218..0f52034cf 100644
--- a/etna/transforms/embeddings/models/__init__.py
+++ b/etna/transforms/embeddings/models/__init__.py
@@ -1,3 +1,6 @@
+from etna import SETTINGS
 from etna.transforms.embeddings.models.base import BaseEmbeddingModel
-from etna.transforms.embeddings.models.ts2vec import TS2VecEmbeddingModel
-from etna.transforms.embeddings.models.tstcc import TSTCCEmbeddingModel
+
+if SETTINGS.torch_required:
+    from etna.transforms.embeddings.models.ts2vec import TS2VecEmbeddingModel
+    from etna.transforms.embeddings.models.tstcc import TSTCCEmbeddingModel
diff --git a/etna/transforms/embeddings/models/ts2vec.py b/etna/transforms/embeddings/models/ts2vec.py
index 948ba3803..0dca64191 100644
--- a/etna/transforms/embeddings/models/ts2vec.py
+++ b/etna/transforms/embeddings/models/ts2vec.py
@@ -6,9 +6,12 @@
 
 import numpy as np
 
-from etna.libs.ts2vec import TS2Vec
+from etna import SETTINGS
 from etna.transforms.embeddings.models import BaseEmbeddingModel
 
+if SETTINGS.torch_required:
+    from etna.libs.ts2vec import TS2Vec
+
 
 class TS2VecEmbeddingModel(BaseEmbeddingModel):
     """TS2Vec embedding model.
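The recurring change in the hunks above and below is moving torch-dependent imports under a `SETTINGS.torch_required` guard, so importing the package no longer fails without the `torch` extras. A simplified sketch of how such a flag can be derived; this is an assumption-level illustration for readers, not etna's actual `SETTINGS` implementation:

```python
# Simplified sketch of an optional-dependency flag like SETTINGS.torch_required.
import importlib.util


def _module_available(name: str) -> bool:
    # find_spec checks whether the module is installed without paying
    # the (potentially heavy) import cost.
    return importlib.util.find_spec(name) is not None


class Settings:
    def __init__(self) -> None:
        self.torch_required: bool = _module_available("torch")


SETTINGS = Settings()

# Module-level usage mirroring the diffs: torch-only names are bound only
# when the extra is installed, so a torch-free environment can still import
# the surrounding module without an ImportError.
if SETTINGS.torch_required:
    import torch  # noqa: F401
```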
diff --git a/etna/transforms/embeddings/models/tstcc.py b/etna/transforms/embeddings/models/tstcc.py
index 863418893..b4b792db8 100644
--- a/etna/transforms/embeddings/models/tstcc.py
+++ b/etna/transforms/embeddings/models/tstcc.py
@@ -5,9 +5,12 @@
 
 import numpy as np
 
-from etna.libs.tstcc import TSTCC
+from etna import SETTINGS
 from etna.transforms.embeddings.models import BaseEmbeddingModel
 
+if SETTINGS.torch_required:
+    from etna.libs.tstcc import TSTCC
+
 
 class TSTCCEmbeddingModel(BaseEmbeddingModel):
     """TSTCC embedding model.
diff --git a/tests/test_analysis/test_outliers/test_isolation_forest_outliers.py b/tests/test_analysis/test_outliers/test_isolation_forest_outliers.py
deleted file mode 100644
index 2db6a2a2d..000000000
--- a/tests/test_analysis/test_outliers/test_isolation_forest_outliers.py
+++ /dev/null
@@ -1,216 +0,0 @@
-import numpy as np
-import pandas as pd
-import pytest
-from sklearn.ensemble import IsolationForest
-
-from etna.analysis.outliers.isolation_forest_outliers import _get_anomalies_isolation_forest_segment
-from etna.analysis.outliers.isolation_forest_outliers import _prepare_segment_df
-from etna.analysis.outliers.isolation_forest_outliers import _select_features
-from etna.analysis.outliers.isolation_forest_outliers import get_anomalies_isolation_forest
-from etna.datasets import TSDataset
-from etna.datasets import generate_ar_df
-
-
-@pytest.fixture
-def ts_with_features():
-    df = generate_ar_df(n_segments=2, periods=5, start_time="2000-01-01")
-    df["target"] = [np.NAN, np.NAN, 1, 20, 3] + [np.NAN, 10, np.NAN, 300, 40]
-    df["exog_1"] = [1.0] * 5 + [2.0] * 5
-    ts = TSDataset(df=df.drop(columns=["exog_1"]), freq="D", df_exog=df.drop(columns=["target"]))
-    return ts
-
-
-@pytest.fixture
-def df_segment_0():
-    df = pd.DataFrame(
-        {
-            "timestamp": [pd.Timestamp("2000-01-03"), pd.Timestamp("2000-01-04"), pd.Timestamp("2000-01-05")],
-            "segment": "segment_0",
-            "target": [1.0, 20.0, 3.0],
-            "exog_1": [1.0, 1.0, 1.0],
-        }
-    )
-    df = TSDataset(df=df, freq="D").df["segment_0"].dropna()
-    return df
-
-
-@pytest.fixture
-def df_segment_1():
-    df = pd.DataFrame(
-        {
-            "timestamp": [pd.Timestamp("2000-01-02"), pd.Timestamp("2000-01-04"), pd.Timestamp("2000-01-05")],
-            "segment": "segment_1",
-            "target": [10.0, 300.0, 40.0],
-            "exog_1": [2.0, 2.0, 2.0],
-        }
-    )
-    df = TSDataset(df=df, freq="D").df["segment_1"].dropna()
-    return df
-
-
-@pytest.mark.parametrize(
-    "in_column,features_to_use,features_to_ignore,expected_features",
-    [
-        ("target", None, None, ["target", "exog_1"]),
-        ("exog_1", None, None, ["target", "exog_1"]),
-        ("target", ["exog_1"], None, ["target", "exog_1"]),
-        ("exog_1", ["exog_1"], None, ["exog_1"]),
-        ("target", None, ["exog_1"], ["target"]),
-        ("exog_1", None, ["exog_1"], ["target", "exog_1"]),
-    ],
-)
-def test_select_features(ts_with_features, in_column, features_to_use, features_to_ignore, expected_features):
-    df = _select_features(
-        ts=ts_with_features, in_column=in_column, features_to_use=features_to_use, features_to_ignore=features_to_ignore
-    )
-    features = set(df.columns.get_level_values("feature"))
-    assert sorted(features) == sorted(expected_features)
-
-
-@pytest.mark.parametrize(
-    "in_column, features_to_use,features_to_ignore,expected_error",
-    [
-        ("exog_3", None, None, "Feature exog_3 is not present in the dataset."),
-        (
-            "target",
-            ["exog_1"],
-            ["exog_1"],
-            "Changing the defaults there should be exactly one option set: features_to_use or features_to_ignore",
-        ),
-        ("target", ["exog_2"], None, "Features {'exog_2'} are not present in the dataset."),
-        ("target", None, ["exog_2"], "Features {'exog_2'} are not present in the dataset."),
-    ],
-)
-def test_select_features_fails(ts_with_features, in_column, features_to_use, features_to_ignore, expected_error):
-    with pytest.raises(ValueError, match=expected_error):
-        _ = _select_features(
-            ts=ts_with_features,
-            in_column=in_column,
-            features_to_use=features_to_use,
-            features_to_ignore=features_to_ignore,
-        )
-
-
-@pytest.mark.parametrize(
-    "segment,ignore_missing, expected_df",
-    [
-        ("segment_0", True, "df_segment_0"),
-        ("segment_1", True, "df_segment_1"),
-        ("segment_0", False, "df_segment_0"),
-    ],
-)
-def test_prepare_segment_df(ts_with_features, segment, ignore_missing, expected_df, request):
-    expected_df = request.getfixturevalue(expected_df)
-    df = _prepare_segment_df(df=ts_with_features.to_pandas(), segment=segment, ignore_missing=ignore_missing)
-    pd.testing.assert_frame_equal(df.sort_index(axis=1), expected_df.sort_index(axis=1), check_names=False)
-
-
-def test_prepare_segment_df_fails(ts_with_features):
-    with pytest.raises(
-        ValueError,
-        match="Series segment_1 contains NaNs! Set `ignore_missing=True` to drop them or impute them appropriately!",
-    ):
-        _ = _prepare_segment_df(df=ts_with_features.to_pandas(), segment="segment_1", ignore_missing=False)
-
-
-@pytest.mark.parametrize(
-    "in_column, use_in_column, expected_anomalies",
-    [
-        ("target", True, [np.datetime64("2000-01-04")]),
-        ("target", False, []),
-        ("exog_1", True, [np.datetime64("2000-01-04")]),
-        ("exog_1", False, [np.datetime64("2000-01-04")]),
-    ],
-)
-def test_get_anomalies_isolation_forest_segment_index_only(df_segment_0, in_column, use_in_column, expected_anomalies):
-    model = IsolationForest(n_estimators=3)
-    anomalies = _get_anomalies_isolation_forest_segment(
-        df_segment=df_segment_0, model=model, in_column=in_column, use_in_column=use_in_column, index_only=True
-    )
-    assert anomalies == expected_anomalies
-
-
-@pytest.mark.parametrize(
-    "in_column,use_in_column,expected_anomalies",
-    [
-        (
-            "target",
-            True,
-            pd.Series(
-                data=[20.0],
-                index=pd.DatetimeIndex([np.datetime64("2000-01-04")], freq="D"),
-            ),
-        ),
-        (
-            "target",
-            False,
-            pd.Series(data=[], index=pd.DatetimeIndex([], freq="D"), dtype=float),
-        ),
-        (
-            "exog_1",
-            True,
-            pd.Series(
-                data=[1.0],
-                index=pd.DatetimeIndex([np.datetime64("2000-01-04")], freq="D"),
-            ),
-        ),
-        (
-            "exog_1",
-            False,
-            pd.Series(
-                data=[1.0],
-                index=pd.DatetimeIndex([np.datetime64("2000-01-04")], freq="D"),
-            ),
-        ),
-    ],
-)
-def test_get_anomalies_isolation_forest_segment_series(df_segment_0, in_column, use_in_column, expected_anomalies):
-    model = IsolationForest(n_estimators=3)
-    anomalies = _get_anomalies_isolation_forest_segment(
-        df_segment=df_segment_0, model=model, in_column=in_column, use_in_column=use_in_column, index_only=False
-    )
-    pd.testing.assert_series_equal(anomalies, expected_anomalies, check_names=False)
-
-
-def test_get_anomalies_isolation_forest_interface(ts_with_features):
-    anomalies = get_anomalies_isolation_forest(
-        ts=ts_with_features, features_to_use=["target", "exog_1"], ignore_missing=True, n_estimators=3
-    )
-    assert sorted(anomalies.keys()) == sorted(ts_with_features.segments)
-
-
-def test_get_anomalies_isolation_forest_dummy_case(outliers_df_with_two_columns):
-    anomalies = get_anomalies_isolation_forest(
-        ts=outliers_df_with_two_columns, in_column="feature", ignore_missing=True
-    )
-    expected = {
-        "1": [np.datetime64("2021-01-08"), np.datetime64("2021-01-11")],
-        "2": [
np.datetime64("2021-01-09"), - np.datetime64("2021-01-16"), - np.datetime64("2021-01-26"), - np.datetime64("2021-01-27"), - ], - } - for key in expected: - assert key in anomalies - np.testing.assert_array_equal(anomalies[key], expected[key]) - - -def test_get_anomalies_isolation_forest_not_use_in_column(ts_with_features): - expected_anomalies = { - "segment_0": pd.Series( - data=[1.0], - index=pd.DatetimeIndex([np.datetime64("2000-01-04")], freq="D"), - ), - "segment_1": pd.Series( - data=[2.0], - index=[np.datetime64("2000-01-04")], # Does not have freq due to missing values - ), - } - anomalies = get_anomalies_isolation_forest( - ts=ts_with_features, in_column="exog_1", features_to_use=["target"], ignore_missing=True, index_only=False - ) - assert sorted(expected_anomalies.keys()) == sorted(anomalies.keys()) - for segment in expected_anomalies.keys(): - pd.testing.assert_series_equal(anomalies[segment], expected_anomalies[segment], check_names=False)