Skip to content

Commit

Permalink
Fix HolidayTransform to handle integer timestamp in days_count mo…
Browse files Browse the repository at this point in the history
…de (#285)
  • Loading branch information
d-a-bunin committed Apr 2, 2024
1 parent 0b2112f commit 495f846
Show file tree
Hide file tree
Showing 6 changed files with 317 additions and 84 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Expand Up @@ -107,7 +107,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fix indexing in `stl_plot`, `plot_periodogram`, `plot_holidays`, `plot_backtest`, `plot_backtest_interactive`, `ResampleWithDistributionTransform` ([#244](https://github.com/etna-team/etna/pull/244))
- Fix `DifferencingTransform` to handle integer timestamp on test ([#244](https://github.com/etna-team/etna/pull/244))
-
-
- Fix `HolidayTransform` to handle integer timestamp in `days_count` mode ([#285](https://github.com/etna-team/etna/pull/285))
-
-
-
Expand Down
45 changes: 40 additions & 5 deletions etna/transforms/timestamp/fourier.py
Expand Up @@ -29,8 +29,8 @@ class FourierTransform(IrreversibleTransform):
The features will be the same for each segment.
- As external column. In this case for each segment its ``in_column`` will be used to compute features.
It is expected that for each segment we have the same type of timestamp data (datetime or numeric)
and for datetime type only one frequency is used.
It is expected that for each segment we have the same type of timestamp data (datetime or numeric),
and for datetime type only one frequency is used for all the segments.
If we are working with external column, there is a difference in handling numeric and datetime data:
Expand Down Expand Up @@ -161,7 +161,25 @@ def get_regressors_info(self) -> List[str]:
return output_columns

def fit(self, ts: TSDataset) -> "FourierTransform":
"""Fit the transform."""
"""Fit the transform.
Parameters
----------
ts:
Dataset to fit the transform on.
Returns
-------
:
The fitted transform instance.
Raises
------
ValueError
if external timestamp doesn't have frequency
ValueError
if external timestamp doesn't have the same frequency for all segments
"""
if self.in_column is None:
self._freq = ts.freq
self.in_column_regressor = True
Expand Down Expand Up @@ -233,7 +251,15 @@ def _fit(self, df: pd.DataFrame) -> "FourierTransform":
Returns
-------
result:
:
The fitted transform instance.
Raises
------
ValueError
if external timestamp doesn't have frequency
ValueError
if external timestamp doesn't have the same frequency for all segments
"""
if self.in_column is None:
self._reference_timestamp = df.index[0]
Expand Down Expand Up @@ -293,8 +319,17 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
Returns
-------
result:
:
transformed dataframe
Raises
------
ValueError:
if transform isn't fitted
ValueError
if external timestamp doesn't have frequency
ValueError
if external timestamp doesn't have the same frequency for all segments
"""
if self._freq is _DEFAULT_FREQ:
raise ValueError("The transform isn't fitted!")
Expand Down
123 changes: 102 additions & 21 deletions etna/transforms/timestamp/holiday.py
@@ -1,6 +1,7 @@
from enum import Enum
from typing import List
from typing import Optional
from typing import cast

import holidays
import pandas as pd
Expand All @@ -14,22 +15,13 @@
from typing_extensions import assert_never

from etna.datasets import TSDataset
from etna.datasets.utils import determine_freq
from etna.transforms.base import IrreversibleTransform

_DEFAULT_FREQ = object()


# TODO: it shouldn't be called on freq=None, we should discuss this
def bigger_than_day(freq: Optional[str]):
"""Compare frequency with day."""
dt = "2000-01-01"
dates_day = pd.date_range(start=dt, periods=2, freq="D")
dates_freq = pd.date_range(start=dt, periods=2, freq=freq)
return dates_freq[-1] > dates_day[-1]


# TODO: it shouldn't be called on freq=None, we should discuss this
def define_period(offset: pd.tseries.offsets.BaseOffset, dt: pd.Timestamp, freq: Optional[str]):
def define_period(offset: pd.tseries.offsets.BaseOffset, dt: pd.Timestamp, freq: str):
"""Define start_date and end_date of period using dataset frequency."""
if isinstance(offset, Week) and offset.weekday == 6:
start_date = dt - pd.tseries.frequencies.to_offset("W") + pd.Timedelta(days=1)
Expand Down Expand Up @@ -70,18 +62,29 @@ def _missing_(cls, value):
)


# TODO: discuss conceptual problems with
class HolidayTransform(IrreversibleTransform):
"""
HolidayTransform generates series that indicates holidays in given dataset.
* In ``binary`` mode shows the presence of holiday in that day.
* In ``category`` mode shows the name of the holiday with value "NO_HOLIDAY" reserved for days without holidays.
* In ``binary`` mode shows the presence of holiday in a given timestamp.
* In ``category`` mode shows the name of the holiday in a given timestamp, the value "NO_HOLIDAY" is reserved for days without holidays.
* In ``days_count`` mode shows the frequency of holidays in a given period.
* If the frequency is weekly, then we count the proportion of holidays in a week (Monday-Sunday) that contains this day.
* If the frequency is monthly, then we count the proportion of holidays in a month that contains this day.
* If the frequency is yearly, then we count the proportion of holidays in a year that contains this day.
Transform can accept timestamp data in two forms:
- As index. In this case the dataset index is used to compute features.
The features will be the same for each segment.
- As external column. In this case for each segment its ``in_column`` will be used to compute features.
In ``days_count`` mode it is expected that for all segments only one frequency is used.
Notes
-----
During fitting int ``days_count`` mode the transform saves frequency. It is assumed to be the same during ``transform``.
"""

_no_holiday_name: str = "NO_HOLIDAY"
Expand Down Expand Up @@ -117,7 +120,7 @@ def __init__(
self.iso_code = iso_code
self.mode = mode
self._mode = HolidayTransformMode(mode)
self._freq: Optional[str] = _DEFAULT_FREQ # type: ignore
self._freq: str = _DEFAULT_FREQ # type: ignore
self.holidays = holidays.country_holidays(iso_code)
self.out_column = out_column
self.in_column = in_column
Expand Down Expand Up @@ -145,15 +148,73 @@ def fit(self, ts: TSDataset) -> "HolidayTransform":
-------
:
The fitted transform instance.
Raises
------
ValueError:
if index timestamp is integer and ``in_column`` isn't set
ValueError:
if external timestamp isn't datetime
ValueError
if in ``days_count`` mode external timestamp doesn't have frequency
ValueError
if in ``days_count`` mode external timestamp doesn't have the same frequency for all segments
"""
if self.in_column is None:
if self._mode is HolidayTransformMode.days_count:
if ts.freq is None:
raise ValueError("Transform can't work with integer index, parameter in_column should be set!")
self._freq = ts.freq
else:
# set some value that doesn't really matter
self._freq = object() # type: ignore
self.in_column_regressor = True
else:
self.in_column_regressor = self.in_column in ts.regressors
self._freq = ts.freq
super().fit(ts)
return self

def _validate_external_timestamps(self, df: pd.DataFrame):
df = df.droplevel("feature", axis=1)

# here we are assuming that every segment has the same timestamp dtype
timestamp_dtype = df.dtypes.iloc[0]
if not pd.api.types.is_datetime64_dtype(timestamp_dtype):
raise ValueError("Transform can work only with datetime external timestamp!")

if self._mode is HolidayTransformMode.binary or self._mode is HolidayTransformMode.category:
return

segments = df.columns.unique()
freq_values = set()
for segment in segments:
timestamps = df[segment]
timestamps = timestamps.loc[timestamps.first_valid_index() :]
if len(timestamps) >= 3:
cur_freq = pd.infer_freq(timestamps)
if cur_freq is None:
raise ValueError(
f"Invalid in_column values! Datetime values should be regular timestamps with some frequency. "
f"This doesn't hold for segment {segment}"
)
freq_values.add(cur_freq)

if len(freq_values) > 1:
raise ValueError(
f"Invalid in_column values! Datetime values should have the same frequency for every segment. "
f"Discovered frequencies: {freq_values}"
)

def _infer_external_freq(self, df: pd.DataFrame) -> str:
df = df.droplevel("feature", axis=1)
# here we are assuming that every segment has the same timestamp freq
sample_segment = df.columns[0]
sample_timestamps = df[sample_segment]
sample_timestamps = sample_timestamps.loc[sample_timestamps.first_valid_index() :]
result = determine_freq(sample_timestamps)
result = cast(str, result) # we can't get None here, because we checked dtype
return result

def _fit(self, df: pd.DataFrame) -> "HolidayTransform":
"""Fit the transform.
Expand All @@ -166,13 +227,26 @@ def _fit(self, df: pd.DataFrame) -> "HolidayTransform":
-------
:
The fitted transform instance.
Raises
------
ValueError:
if external timestamp isn't datetime
ValueError
if in ``days_count`` mode external timestamp doesn't have frequency
ValueError
if in ``days_count`` mode external timestamp doesn't have the same frequency for all segments
"""
if self.in_column is not None:
self._validate_external_timestamps(df)
if self._mode is HolidayTransformMode.days_count:
self._freq = self._infer_external_freq(df)
else:
# set some value that doesn't really matter
self._freq = object() # type: ignore
return self

def _compute_feature(self, timestamps: pd.Series) -> pd.Series:
if bigger_than_day(self._freq) and self._mode is not HolidayTransformMode.days_count:
raise ValueError("For binary and category modes frequency of data should be no more than daily.")

if self._mode is HolidayTransformMode.days_count:
date_offset = pd.tseries.frequencies.to_offset(self._freq)
values = []
Expand Down Expand Up @@ -222,9 +296,15 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
ValueError:
if transform isn't fitted
ValueError:
if the frequency is greater than daily and this is a ``binary`` or ``categorical`` mode
if the frequency is not weekly, monthly, quarterly or yearly in ``days_count`` mode
ValueError:
if index timestamp is integer and ``in_column`` isn't set
ValueError:
if the frequency is not weekly, monthly, quarterly or yearly and this is ``days_count`` mode
if external timestamp isn't datetime
ValueError
if in ``days_count`` mode external timestamp doesn't have frequency
ValueError
if in ``days_count`` mode external timestamp doesn't have the same frequency for all segments
"""
if self._freq is _DEFAULT_FREQ:
raise ValueError("Transform is not fitted")
Expand All @@ -243,6 +323,7 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
index=df.index,
)
else:
self._validate_external_timestamps(df=df)
features = TSDataset.to_flatten(df=df, features=[self.in_column])
features[out_column] = self._compute_feature(timestamps=features[self.in_column])
features.drop(columns=[self.in_column], inplace=True)
Expand Down
11 changes: 5 additions & 6 deletions tests/test_transforms/test_inference/test_inverse_transform.py
Expand Up @@ -864,12 +864,11 @@ def test_inverse_transform_train_fail_resample(self, transform, dataset_name, ex
"ts_with_external_timestamp",
{},
),
# TODO: fix after discussing conceptual problems
# (
# HolidayTransform(out_column="res", mode="days_count", in_column="external_timestamp"),
# "ts_with_external_timestamp_one_month",
# {},
# ),
(
HolidayTransform(out_column="res", mode="days_count", in_column="external_timestamp"),
"ts_with_external_timestamp_one_month",
{},
),
(
SpecialDaysTransform(in_column="external_timestamp"),
"ts_with_external_timestamp",
Expand Down
11 changes: 5 additions & 6 deletions tests/test_transforms/test_inference/test_transform.py
Expand Up @@ -810,12 +810,11 @@ def test_transform_train_datetime_timestamp(self, transform, dataset_name, expec
"ts_with_external_timestamp",
{"create": {"res"}},
),
# TODO: fix after discussing conceptual problems
# (
# HolidayTransform(out_column="res", mode="days_count", in_column="external_timestamp"),
# "ts_with_external_timestamp_one_month",
# {"create": {"res"}},
# ),
(
HolidayTransform(out_column="res", mode="days_count", in_column="external_timestamp"),
"ts_with_external_timestamp_one_month",
{"create": {"res"}},
),
(
SpecialDaysTransform(in_column="external_timestamp"),
"ts_with_external_timestamp",
Expand Down

0 comments on commit 495f846

Please sign in to comment.