Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create EventTransform #78

Merged
merged 12 commits into from
Sep 15, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add error page into documentation ([#57](https://github.com/etna-team/etna/pull/57))
- Add `LimitTransform` ([#63](https://github.com/etna-team/etna/pull/63))
- Add config for Codecov to control CI ([#80](https://github.com/etna-team/etna/pull/80))
- Add `EventTransform` ([#78](https://github.com/etna-team/etna/pull/78))

### Changed
-
Expand Down
1 change: 1 addition & 0 deletions docs/source/api_reference/transforms.rst
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ Transforms to work with time-related features:
SpecialDaysTransform
HolidayTransform
FourierTransform
EventTransform

Shift transforms:

Expand Down
1 change: 1 addition & 0 deletions etna/transforms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
from etna.transforms.outliers import MedianOutliersTransform
from etna.transforms.outliers import PredictionIntervalOutliersTransform
from etna.transforms.timestamp import DateFlagsTransform
from etna.transforms.timestamp import EventTransform
from etna.transforms.timestamp import FourierTransform
from etna.transforms.timestamp import HolidayTransform
from etna.transforms.timestamp import SpecialDaysTransform
Expand Down
1 change: 1 addition & 0 deletions etna/transforms/timestamp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from etna.transforms.timestamp.date_flags import DateFlagsTransform
from etna.transforms.timestamp.event import EventTransform
from etna.transforms.timestamp.fourier import FourierTransform
from etna.transforms.timestamp.holiday import HolidayTransform
from etna.transforms.timestamp.special_days import SpecialDaysTransform
Expand Down
188 changes: 188 additions & 0 deletions etna/transforms/timestamp/event.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
from enum import Enum
from typing import Dict
from typing import List
from typing import Optional

import numpy as np
import pandas as pd

from etna.datasets import TSDataset
from etna.distributions import BaseDistribution
from etna.distributions import CategoricalDistribution
from etna.distributions import IntDistribution
from etna.transforms.base import IrreversibleTransform


class ImputerMode(str, Enum):
    """Enum of the values accepted as ``mode``: ``'binary'`` and ``'distance'``."""

    binary = "binary"
    distance = "distance"

    @classmethod
    def _missing_(cls, value):
        """Reject a value that matches none of the members.

        Raises
        ------
        NotImplementedError:
            Always; the message lists every supported mode.
        """
        supported_modes = ", ".join(repr(member.value) for member in cls)
        raise NotImplementedError(f"{value} is not a valid {cls.__name__}. Supported modes: {supported_modes}")


class EventTransform(IrreversibleTransform):
    """EventTransform marks days before and after event depending on ``mode``.

    It creates two columns: ``'{out_column}_prev'`` marks days preceding an event and
    ``'{out_column}_post'`` marks days following an event.

    * In `'binary'` mode shows whether there will be or were events regarding current date.

    * In `'distance'` mode shows distance to the previous and future events regarding current date. Computed as :math:`1 / x`, where x is a distance to the nearest event.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from etna.datasets import TSDataset
    >>> from etna.transforms import EventTransform
    >>> periods = 5
    >>> df = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
    >>> df["segment"] = ["segment_1"] * periods
    >>> df["target"] = np.arange(periods)
    >>> df["holiday"] = np.array([0, 1, 1, 0, 0])
    >>> df = TSDataset.to_dataset(df)
    >>> tsds = TSDataset(df, freq="D")
    >>> transform = EventTransform(in_column='holiday', out_column='holiday')
    >>> transform.fit_transform(tsds)
    segment    segment_1
    feature      holiday holiday_post holiday_prev target
    timestamp
    2020-01-01         0          0.0          1.0      0
    2020-01-02         1          0.0          0.0      1
    2020-01-03         1          0.0          0.0      2
    2020-01-04         0          1.0          0.0      3
    2020-01-05         0          0.0          0.0      4
    """

    def __init__(
        self, in_column: str, out_column: str, n_pre: int = 1, n_post: int = 1, mode: str = ImputerMode.binary
    ):
        """
        Init EventTransform.

        Parameters
        ----------
        in_column:
            binary column with event indicator.
        out_column:
            base for creating out columns names for future and past - '{out_column}_prev' and '{out_column}_post'
        n_pre:
            number of days before the event to react.
        n_post:
            number of days after the event to react.
        mode:
            mode of marking events:

            - `'binary'`: whether there will be or were events regarding current date in binary type;
            - `'distance'`: distance to the previous and future events regarding current date;

        Raises
        ------
        NotImplementedError:
            Given ``mode`` value is not supported.

        Notes
        -----
        ``n_pre``, ``n_post`` and the content of ``in_column`` are not checked here;
        they are validated when the transform is applied to data.
        """
        super().__init__(required_features=[in_column])
        self.in_column = in_column
        self.out_column = out_column
        self.n_pre = n_pre
        self.n_post = n_post
        self.mode = mode
        # Unknown until ``fit`` is called; ``get_regressors_info`` relies on it.
        self.in_column_regressor: Optional[bool] = None
        # Validated enum form of ``mode``; raises NotImplementedError for unsupported values.
        self._mode = ImputerMode(mode)

    def fit(self, ts: TSDataset) -> "EventTransform":
        """Fit the transform.

        Remembers whether ``in_column`` is a regressor of ``ts`` before delegating to the base class.

        Parameters
        ----------
        ts:
            dataset to fit the transform on.

        Returns
        -------
        :
            fitted transform
        """
        self.in_column_regressor = self.in_column in ts.regressors
        super().fit(ts)
        return self

    def _fit(self, df: pd.DataFrame):
        """Fit method does nothing and is kept for compatibility.

        Parameters
        ----------
        df:
            dataframe with data.
        """
        pass

    def _compute_event_column(self, df: pd.DataFrame, column: str, max_distance: int) -> pd.DataFrame:
        """Compute event column.

        Parameters
        ----------
        df:
            dataframe with binary event indicators.
        column:
            suffix of the created column; ``"prev"`` marks rows before an upcoming event,
            any other value marks rows after a past event.
        max_distance:
            maximum number of rows between a row and the event for the row to be marked.

        Returns
        -------
        :
            dataframe shaped like ``df`` with ``in_column`` renamed to ``'{out_column}_{column}'``
        """
        # 1-based row position, repeated for every column of ``df``.
        indexes = df.copy()
        indexes[:] = np.repeat((np.arange(len(indexes)) + 1).reshape(-1, 1), len(indexes.columns), axis=1)

        # Keep the position only on event rows; everything else becomes missing.
        col = indexes.copy()
        col.mask(df != 1, None, inplace=True)
        if column == "prev":
            # Position of the next event at or after the row; rows with no such event
            # fall back to their own position and so get distance 0.
            col = col.bfill().fillna(indexes)
            col = col - indexes
        else:
            # Position of the latest event at or before the row, with the same fallback.
            col = col.ffill().fillna(indexes)
            col = indexes - col
        # Must be computed before the masking below overwrites the raw distances.
        distance = 1 if self._mode == ImputerMode.binary else 1 / col
        col.mask(col > max_distance, 0, inplace=True)
        col.mask((col >= 1) & (col <= max_distance), distance, inplace=True)

        col.rename(columns={self.in_column: f"{self.out_column}_{column}"}, inplace=True, level="feature")
        return col

    def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add marked days before and after event to dataset.

        Parameters
        ----------
        df:
            dataframe with data to transform.

        Returns
        -------
        :
            transformed dataframe

        Raises
        ------
        ValueError:
            Some ``in_column`` values are neither 0 nor 1.
        ValueError:
            ``n_pre`` or ``n_post`` values are less than one.
        """
        # Subset check: a window with no events (all zeros) or only events (all ones) is still binary.
        if not set(df.values.reshape(-1)) <= {0, 1}:
            raise ValueError("Input columns must be binary")
        if self.n_pre < 1 or self.n_post < 1:
            raise ValueError(f"`n_pre` and `n_post` must be greater than zero, given {self.n_pre} and {self.n_post}")

        prev = self._compute_event_column(df, column="prev", max_distance=self.n_pre)
        post = self._compute_event_column(df, column="post", max_distance=self.n_post)

        df = pd.concat([df, prev, post], axis=1)

        return df

    def get_regressors_info(self) -> List[str]:
        """Return the list with regressors created by the transform.

        Raises
        ------
        ValueError:
            The transform has not been fitted yet.
        """
        if self.in_column_regressor is None:
            raise ValueError("Fit the transform to get the correct regressors info!")
        if not self.in_column_regressor:
            return []
        # Names must match the columns produced by ``_transform``.
        return [f"{self.out_column}_prev", f"{self.out_column}_post"]

    def params_to_tune(self) -> Dict[str, BaseDistribution]:
        """Get default grid for tuning hyperparameters.

        This grid tunes parameters: ``n_pre``, ``n_post``, ``mode``.
        Other parameters are expected to be set by the user.

        Returns
        -------
        :
            Grid to tune.
        """
        return {
            "n_pre": IntDistribution(low=1, high=self.n_pre),
            "n_post": IntDistribution(low=1, high=self.n_post),
            "mode": CategoricalDistribution(["binary", "distance"]),
        }


__all__ = ["EventTransform"]
33 changes: 33 additions & 0 deletions tests/test_transforms/test_inference/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,39 @@
from etna.datasets import duplicate_data


@pytest.fixture
def ts_with_binary_exoc(random_seed) -> TSDataset:
    """Build a daily three-segment dataset with a random binary ``exoc`` column known 10 days ahead."""
    periods_df = 100
    periods_df_exoc = periods_df + 10
    segments = ["segment_1", "segment_2", "segment_3"]

    # Target part: identical ramp 0..periods_df-1 in every segment.
    target_frames = [
        pd.DataFrame(
            {
                "timestamp": pd.date_range("2020-01-01", periods=periods_df),
                "segment": [segment] * periods_df,
                "target": np.arange(periods_df),
            }
        )
        for segment in segments
    ]

    # Exogenous part: random draws are made segment by segment, in the order of ``segments``.
    exoc_frames = [
        pd.DataFrame(
            {
                "timestamp": pd.date_range("2020-01-01", periods=periods_df_exoc),
                "segment": [segment] * periods_df_exoc,
                "exoc": np.random.choice([0, 1], size=periods_df_exoc),
            }
        )
        for segment in segments
    ]

    wide_df = TSDataset.to_dataset(pd.concat(target_frames))
    wide_df_exoc = TSDataset.to_dataset(pd.concat(exoc_frames))
    return TSDataset(wide_df, freq="D", df_exog=wide_df_exoc, known_future=["exoc"])


@pytest.fixture
def regular_ts(random_seed) -> TSDataset:
periods = 100
Expand Down
13 changes: 13 additions & 0 deletions tests/test_transforms/test_inference/test_inverse_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from etna.transforms import DensityOutliersTransform
from etna.transforms import DeseasonalityTransform
from etna.transforms import DifferencingTransform
from etna.transforms import EventTransform
from etna.transforms import FilterFeaturesTransform
from etna.transforms import FourierTransform
from etna.transforms import GaleShapleyFeatureSelectionTransform
Expand Down Expand Up @@ -225,6 +226,8 @@ def _test_inverse_transform_train_subset_segments(self, ts, transform, segments)
(HolidayTransform(mode="category"), "regular_ts"),
(SpecialDaysTransform(), "regular_ts"),
(TimeFlagsTransform(), "regular_ts"),
(EventTransform(in_column="exoc", out_column="exoc"), "ts_with_binary_exoc"),
(EventTransform(in_column="exoc", out_column="exoc", mode="distance"), "ts_with_binary_exoc"),
],
)
def test_inverse_transform_train_subset_segments(self, transform, dataset_name, request):
Expand Down Expand Up @@ -436,6 +439,8 @@ def _test_inverse_transform_future_subset_segments(self, ts, transform, segments
(HolidayTransform(mode="category"), "regular_ts"),
(SpecialDaysTransform(), "regular_ts"),
(TimeFlagsTransform(), "regular_ts"),
(EventTransform(in_column="exoc", out_column="exoc"), "ts_with_binary_exoc"),
(EventTransform(in_column="exoc", out_column="exoc", mode="distance"), "ts_with_binary_exoc"),
],
)
def test_inverse_transform_future_subset_segments(self, transform, dataset_name, request):
Expand Down Expand Up @@ -669,6 +674,8 @@ def _test_inverse_transform_train_new_segments(self, ts, transform, train_segmen
"regular_ts",
{},
),
(EventTransform(in_column="exoc", out_column="exoc"), "ts_with_binary_exoc", {}),
(EventTransform(in_column="exoc", out_column="exoc", mode="distance"), "ts_with_binary_exoc", {}),
],
)
def test_inverse_transform_train_new_segments(self, transform, dataset_name, expected_changes, request):
Expand Down Expand Up @@ -1005,6 +1012,8 @@ def _test_inverse_transform_future_new_segments(self, ts, transform, train_segme
"regular_ts",
{},
),
(EventTransform(in_column="exoc", out_column="exoc"), "ts_with_binary_exoc", {}),
(EventTransform(in_column="exoc", out_column="exoc", mode="distance"), "ts_with_binary_exoc", {}),
],
)
def test_inverse_transform_future_new_segments(self, transform, dataset_name, expected_changes, request):
Expand Down Expand Up @@ -1493,6 +1502,8 @@ def _test_inverse_transform_future_with_target(
{},
),
(SpecialDaysTransform(), "regular_ts", {}),
(EventTransform(in_column="exoc", out_column="exoc"), "ts_with_binary_exoc", {}),
(EventTransform(in_column="exoc", out_column="exoc", mode="distance"), "ts_with_binary_exoc", {}),
],
)
def test_inverse_transform_future_with_target(self, transform, dataset_name, expected_changes, request):
Expand Down Expand Up @@ -1920,6 +1931,8 @@ def _test_inverse_transform_future_without_target(
{},
),
(SpecialDaysTransform(), "regular_ts", {}),
(EventTransform(in_column="exoc", out_column="exoc"), "ts_with_binary_exoc", {}),
(EventTransform(in_column="exoc", out_column="exoc", mode="distance"), "ts_with_binary_exoc", {}),
],
)
def test_inverse_transform_future_without_target(self, transform, dataset_name, expected_changes, request):
Expand Down