diff --git a/CHANGELOG.md b/CHANGELOG.md index 50a94a658..db69f9bc5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -107,7 +107,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fix indexing in `stl_plot`, `plot_periodogram`, `plot_holidays`, `plot_backtest`, `plot_backtest_interactive`, `ResampleWithDistributionTransform` ([#244](https://github.com/etna-team/etna/pull/244)) - Fix `DifferencingTransform` to handle integer timestamp on test ([#244](https://github.com/etna-team/etna/pull/244)) - -- +- Fix `HolidayTransform` to handle integer timestamp in `days_count` mode ([#285](https://github.com/etna-team/etna/pull/285)) - - - diff --git a/etna/transforms/timestamp/fourier.py b/etna/transforms/timestamp/fourier.py index 981e2095c..8e0c552ee 100644 --- a/etna/transforms/timestamp/fourier.py +++ b/etna/transforms/timestamp/fourier.py @@ -29,8 +29,8 @@ class FourierTransform(IrreversibleTransform): The features will be the same for each segment. - As external column. In this case for each segment its ``in_column`` will be used to compute features. - It is expected that for each segment we have the same type of timestamp data (datetime or numeric) - and for datetime type only one frequency is used. + It is expected that for each segment we have the same type of timestamp data (datetime or numeric), + and for datetime type only one frequency is used for all the segments. If we are working with external column, there is a difference in handling numeric and datetime data: @@ -161,7 +161,25 @@ def get_regressors_info(self) -> List[str]: return output_columns def fit(self, ts: TSDataset) -> "FourierTransform": - """Fit the transform.""" + """Fit the transform. + + Parameters + ---------- + ts: + Dataset to fit the transform on. + + Returns + ------- + : + The fitted transform instance. 
+ + Raises + ------ + ValueError + if external timestamp doesn't have frequency + ValueError + if external timestamp doesn't have the same frequency for all segments + """ if self.in_column is None: self._freq = ts.freq self.in_column_regressor = True @@ -233,7 +251,15 @@ def _fit(self, df: pd.DataFrame) -> "FourierTransform": Returns ------- - result: + : + The fitted transform instance. + + Raises + ------ + ValueError + if external timestamp doesn't have frequency + ValueError + if external timestamp doesn't have the same frequency for all segments """ if self.in_column is None: self._reference_timestamp = df.index[0] @@ -293,8 +319,17 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame: Returns ------- - result: + : transformed dataframe + + Raises + ------ + ValueError: + if transform isn't fitted + ValueError + if external timestamp doesn't have frequency + ValueError + if external timestamp doesn't have the same frequency for all segments """ if self._freq is _DEFAULT_FREQ: raise ValueError("The transform isn't fitted!") diff --git a/etna/transforms/timestamp/holiday.py b/etna/transforms/timestamp/holiday.py index 91b99a278..b543c0329 100644 --- a/etna/transforms/timestamp/holiday.py +++ b/etna/transforms/timestamp/holiday.py @@ -1,6 +1,7 @@ from enum import Enum from typing import List from typing import Optional +from typing import cast import holidays import pandas as pd @@ -14,22 +15,13 @@ from typing_extensions import assert_never from etna.datasets import TSDataset +from etna.datasets.utils import determine_freq from etna.transforms.base import IrreversibleTransform _DEFAULT_FREQ = object() -# TODO: it shouldn't be called on freq=None, we should discuss this -def bigger_than_day(freq: Optional[str]): - """Compare frequency with day.""" - dt = "2000-01-01" - dates_day = pd.date_range(start=dt, periods=2, freq="D") - dates_freq = pd.date_range(start=dt, periods=2, freq=freq) - return dates_freq[-1] > dates_day[-1] - - -# TODO: it shouldn't be 
called on freq=None, we should discuss this -def define_period(offset: pd.tseries.offsets.BaseOffset, dt: pd.Timestamp, freq: Optional[str]): +def define_period(offset: pd.tseries.offsets.BaseOffset, dt: pd.Timestamp, freq: str): """Define start_date and end_date of period using dataset frequency.""" if isinstance(offset, Week) and offset.weekday == 6: start_date = dt - pd.tseries.frequencies.to_offset("W") + pd.Timedelta(days=1) @@ -70,18 +62,29 @@ def _missing_(cls, value): ) -# TODO: discuss conceptual problems with class HolidayTransform(IrreversibleTransform): """ HolidayTransform generates series that indicates holidays in given dataset. - * In ``binary`` mode shows the presence of holiday in that day. - * In ``category`` mode shows the name of the holiday with value "NO_HOLIDAY" reserved for days without holidays. + * In ``binary`` mode shows the presence of holiday in a given timestamp. + * In ``category`` mode shows the name of the holiday in a given timestamp, the value "NO_HOLIDAY" is reserved for days without holidays. * In ``days_count`` mode shows the frequency of holidays in a given period. * If the frequency is weekly, then we count the proportion of holidays in a week (Monday-Sunday) that contains this day. * If the frequency is monthly, then we count the proportion of holidays in a month that contains this day. * If the frequency is yearly, then we count the proportion of holidays in a year that contains this day. + + Transform can accept timestamp data in two forms: + + - As index. In this case the dataset index is used to compute features. + The features will be the same for each segment. + + - As external column. In this case for each segment its ``in_column`` will be used to compute features. + In ``days_count`` mode it is expected that for all segments only one frequency is used. + + Notes + ----- + During fitting in ``days_count`` mode the transform saves frequency. It is assumed to be the same during ``transform``. 
""" _no_holiday_name: str = "NO_HOLIDAY" @@ -117,7 +120,7 @@ def __init__( self.iso_code = iso_code self.mode = mode self._mode = HolidayTransformMode(mode) - self._freq: Optional[str] = _DEFAULT_FREQ # type: ignore + self._freq: str = _DEFAULT_FREQ # type: ignore self.holidays = holidays.country_holidays(iso_code) self.out_column = out_column self.in_column = in_column @@ -145,15 +148,73 @@ def fit(self, ts: TSDataset) -> "HolidayTransform": ------- : The fitted transform instance. + + Raises + ------ + ValueError: + if index timestamp is integer and ``in_column`` isn't set + ValueError: + if external timestamp isn't datetime + ValueError + if in ``days_count`` mode external timestamp doesn't have frequency + ValueError + if in ``days_count`` mode external timestamp doesn't have the same frequency for all segments """ if self.in_column is None: + if self._mode is HolidayTransformMode.days_count: + if ts.freq is None: + raise ValueError("Transform can't work with integer index, parameter in_column should be set!") + self._freq = ts.freq + else: + # set some value that doesn't really matter + self._freq = object() # type: ignore self.in_column_regressor = True else: self.in_column_regressor = self.in_column in ts.regressors - self._freq = ts.freq super().fit(ts) return self + def _validate_external_timestamps(self, df: pd.DataFrame): + df = df.droplevel("feature", axis=1) + + # here we are assuming that every segment has the same timestamp dtype + timestamp_dtype = df.dtypes.iloc[0] + if not pd.api.types.is_datetime64_dtype(timestamp_dtype): + raise ValueError("Transform can work only with datetime external timestamp!") + + if self._mode is HolidayTransformMode.binary or self._mode is HolidayTransformMode.category: + return + + segments = df.columns.unique() + freq_values = set() + for segment in segments: + timestamps = df[segment] + timestamps = timestamps.loc[timestamps.first_valid_index() :] + if len(timestamps) >= 3: + cur_freq = pd.infer_freq(timestamps) + if 
cur_freq is None: + raise ValueError( + f"Invalid in_column values! Datetime values should be regular timestamps with some frequency. " + f"This doesn't hold for segment {segment}" + ) + freq_values.add(cur_freq) + + if len(freq_values) > 1: + raise ValueError( + f"Invalid in_column values! Datetime values should have the same frequency for every segment. " + f"Discovered frequencies: {freq_values}" + ) + + def _infer_external_freq(self, df: pd.DataFrame) -> str: + df = df.droplevel("feature", axis=1) + # here we are assuming that every segment has the same timestamp freq + sample_segment = df.columns[0] + sample_timestamps = df[sample_segment] + sample_timestamps = sample_timestamps.loc[sample_timestamps.first_valid_index() :] + result = determine_freq(sample_timestamps) + result = cast(str, result) # we can't get None here, because we checked dtype + return result + def _fit(self, df: pd.DataFrame) -> "HolidayTransform": """Fit the transform. @@ -166,13 +227,26 @@ def _fit(self, df: pd.DataFrame) -> "HolidayTransform": ------- : The fitted transform instance. 
+ + Raises + ------ + ValueError: + if external timestamp isn't datetime + ValueError + if in ``days_count`` mode external timestamp doesn't have frequency + ValueError + if in ``days_count`` mode external timestamp doesn't have the same frequency for all segments """ + if self.in_column is not None: + self._validate_external_timestamps(df) + if self._mode is HolidayTransformMode.days_count: + self._freq = self._infer_external_freq(df) + else: + # set some value that doesn't really matter + self._freq = object() # type: ignore return self def _compute_feature(self, timestamps: pd.Series) -> pd.Series: - if bigger_than_day(self._freq) and self._mode is not HolidayTransformMode.days_count: - raise ValueError("For binary and category modes frequency of data should be no more than daily.") - if self._mode is HolidayTransformMode.days_count: date_offset = pd.tseries.frequencies.to_offset(self._freq) values = [] @@ -222,9 +296,15 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame: ValueError: if transform isn't fitted ValueError: - if the frequency is greater than daily and this is a ``binary`` or ``categorical`` mode + if the frequency is not weekly, monthly, quarterly or yearly in ``days_count`` mode + ValueError: + if index timestamp is integer and ``in_column`` isn't set ValueError: - if the frequency is not weekly, monthly, quarterly or yearly and this is ``days_count`` mode + if external timestamp isn't datetime + ValueError + if in ``days_count`` mode external timestamp doesn't have frequency + ValueError + if in ``days_count`` mode external timestamp doesn't have the same frequency for all segments """ if self._freq is _DEFAULT_FREQ: raise ValueError("Transform is not fitted") @@ -243,6 +323,7 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame: index=df.index, ) else: + self._validate_external_timestamps(df=df) features = TSDataset.to_flatten(df=df, features=[self.in_column]) features[out_column] = 
self._compute_feature(timestamps=features[self.in_column]) features.drop(columns=[self.in_column], inplace=True) diff --git a/tests/test_transforms/test_inference/test_inverse_transform.py b/tests/test_transforms/test_inference/test_inverse_transform.py index 9f47b5e2d..10f23bb51 100644 --- a/tests/test_transforms/test_inference/test_inverse_transform.py +++ b/tests/test_transforms/test_inference/test_inverse_transform.py @@ -864,12 +864,11 @@ def test_inverse_transform_train_fail_resample(self, transform, dataset_name, ex "ts_with_external_timestamp", {}, ), - # TODO: fix after discussing conceptual problems - # ( - # HolidayTransform(out_column="res", mode="days_count", in_column="external_timestamp"), - # "ts_with_external_timestamp_one_month", - # {}, - # ), + ( + HolidayTransform(out_column="res", mode="days_count", in_column="external_timestamp"), + "ts_with_external_timestamp_one_month", + {}, + ), ( SpecialDaysTransform(in_column="external_timestamp"), "ts_with_external_timestamp", diff --git a/tests/test_transforms/test_inference/test_transform.py b/tests/test_transforms/test_inference/test_transform.py index 96dd9bbf9..322bfc674 100644 --- a/tests/test_transforms/test_inference/test_transform.py +++ b/tests/test_transforms/test_inference/test_transform.py @@ -810,12 +810,11 @@ def test_transform_train_datetime_timestamp(self, transform, dataset_name, expec "ts_with_external_timestamp", {"create": {"res"}}, ), - # TODO: fix after discussing conceptual problems - # ( - # HolidayTransform(out_column="res", mode="days_count", in_column="external_timestamp"), - # "ts_with_external_timestamp_one_month", - # {"create": {"res"}}, - # ), + ( + HolidayTransform(out_column="res", mode="days_count", in_column="external_timestamp"), + "ts_with_external_timestamp_one_month", + {"create": {"res"}}, + ), ( SpecialDaysTransform(in_column="external_timestamp"), "ts_with_external_timestamp", diff --git a/tests/test_transforms/test_timestamp/test_holiday_transform.py 
b/tests/test_transforms/test_timestamp/test_holiday_transform.py index 9b2d30be5..e813aaa03 100644 --- a/tests/test_transforms/test_timestamp/test_holiday_transform.py +++ b/tests/test_transforms/test_timestamp/test_holiday_transform.py @@ -145,6 +145,40 @@ def two_segments_w_mon_int_timestamp(two_segments_w_mon: TSDataset): return ts +@pytest.fixture() +def two_segments_w_mon_external_int_timestamp(two_segments_w_mon_int_timestamp: TSDataset): + ts = two_segments_w_mon_int_timestamp + df = ts.raw_df + df_exog = ts.df_exog + external_int_timestamp = np.arange(len(df_exog)) + df_exog.loc[:, pd.IndexSlice["segment_1", "external_timestamp"]] = external_int_timestamp + df_exog.loc[:, pd.IndexSlice["segment_2", "external_timestamp"]] = external_int_timestamp + ts = TSDataset(df=df, df_exog=df_exog, freq=ts.freq) + return ts + + +@pytest.fixture() +def two_segments_w_mon_external_irregular_timestamp(two_segments_w_mon: TSDataset): + ts = two_segments_w_mon + df = ts.raw_df + df_exog = ts.df_exog + df_exog.loc[df_exog.index[3], pd.IndexSlice["segment_1", "external_timestamp"]] += pd.Timedelta("3H") + ts = TSDataset(df=df, df_exog=df_exog, freq=ts.freq) + return ts + + +@pytest.fixture() +def two_segments_w_mon_external_irregular_timestamp_different_freq(two_segments_w_mon: TSDataset): + ts = two_segments_w_mon + df = ts.raw_df + df_exog = ts.df_exog + df_exog.loc[:, pd.IndexSlice["segment_1", "external_timestamp"]] = pd.date_range( + start="2020-01-01", periods=len(df_exog), freq="W-SUN" + ) + ts = TSDataset(df=df, df_exog=df_exog, freq=ts.freq) + return ts + + @pytest.fixture() def two_segments_w_mon_with_nans(two_segments_w_mon: TSDataset): ts = two_segments_w_mon @@ -207,18 +241,6 @@ def two_segments_simple_ts_minute(simple_constant_df_minute): return ts -@pytest.fixture() -def uk_holiday_names_daily(): - values = ["New Year's Day"] + ["New Year Holiday [Scotland]"] + ["NO_HOLIDAY"] * 13 - return np.array(values) - - -@pytest.fixture() -def us_holiday_names_daily(): - 
values = ["New Year's Day"] + ["NO_HOLIDAY"] * 14 - return np.array(values) - - @pytest.mark.parametrize( "freq, timestamp, expected_result", ( @@ -241,11 +263,36 @@ def test_define_period_end(freq, timestamp, expected_result): assert (define_period(pd.tseries.frequencies.to_offset(freq), timestamp, freq))[1] == expected_result[1] -def test_holiday_with_regressors(simple_ts_with_regressors: TSDataset): - holiday = HolidayTransform(out_column="holiday") - new = holiday.fit_transform(simple_ts_with_regressors) - len_holiday = len([cols for cols in new.columns if cols[1] == "holiday"]) - assert len_holiday == len(np.unique(new.columns.get_level_values("segment"))) +def test_fit_days_count_fail_int_index(two_segments_w_mon_int_timestamp): + ts = two_segments_w_mon_int_timestamp + transform = HolidayTransform(out_column="holiday", mode="days_count") + with pytest.raises(ValueError, match="Transform can't work with integer index, parameter in_column should be set"): + transform.fit(ts=ts) + + +def test_fit_days_count_fail_external_timestamp_int(two_segments_w_mon_external_int_timestamp): + ts = two_segments_w_mon_external_int_timestamp + transform = HolidayTransform(in_column="external_timestamp", out_column="holiday", mode="days_count") + with pytest.raises(ValueError, match="Transform can work only with datetime external timestamp"): + transform.fit(ts=ts) + + +def test_fit_days_count_fail_irregular_timestamp(two_segments_w_mon_external_irregular_timestamp): + ts = two_segments_w_mon_external_irregular_timestamp + transform = HolidayTransform(in_column="external_timestamp", out_column="holiday", mode="days_count") + with pytest.raises( + ValueError, match="Invalid in_column values! 
Datetime values should be regular timestamps with some frequency" + ): + transform.fit(ts=ts) + + +def test_fit_days_count_fail_different_freq(two_segments_w_mon_external_irregular_timestamp_different_freq): + ts = two_segments_w_mon_external_irregular_timestamp_different_freq + transform = HolidayTransform(in_column="external_timestamp", out_column="holiday", mode="days_count") + with pytest.raises( + ValueError, match="Invalid in_column values! Datetime values should have the same frequency for every segment" + ): + transform.fit(ts=ts) @pytest.mark.parametrize( @@ -263,7 +310,7 @@ def test_holiday_with_regressors(simple_ts_with_regressors: TSDataset): ("US", np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])), ), ) -def test_holidays_binary_day(in_column: Optional[str], ts_name, iso_code: str, answer: np.array, request): +def test_transform_binary_day(in_column: Optional[str], ts_name, iso_code: str, answer: np.array, request): ts = request.getfixturevalue(ts_name) holidays_finder = HolidayTransform(iso_code=iso_code, mode="binary", out_column="holiday", in_column=in_column) ts = holidays_finder.fit_transform(ts) @@ -280,7 +327,7 @@ def test_holidays_binary_day(in_column: Optional[str], ts_name, iso_code: str, a ("US", np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])), ), ) -def test_holidays_binary_hour(iso_code: str, answer: np.array, two_segments_simple_ts_hour: TSDataset): +def test_transform_binary_hour(iso_code: str, answer: np.array, two_segments_simple_ts_hour: TSDataset): holidays_finder = HolidayTransform(iso_code=iso_code, mode="binary", out_column="holiday") df = holidays_finder.fit_transform(two_segments_simple_ts_hour).to_pandas() for segment in df.columns.get_level_values("segment").unique(): @@ -295,7 +342,7 @@ def test_holidays_binary_hour(iso_code: str, answer: np.array, two_segments_simp ("US", np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])), ), ) -def 
test_holidays_binary_minute(iso_code: str, answer: np.array, two_segments_simple_ts_minute): +def test_transform_binary_minute(iso_code: str, answer: np.array, two_segments_simple_ts_minute): holidays_finder = HolidayTransform(iso_code=iso_code, mode="binary", out_column="holiday") df = holidays_finder.fit_transform(two_segments_simple_ts_minute).to_pandas() for segment in df.columns.get_level_values("segment").unique(): @@ -303,7 +350,22 @@ def test_holidays_binary_minute(iso_code: str, answer: np.array, two_segments_si assert df[segment]["holiday"].dtype == "category" -def test_holidays_binary_day_with_nans(two_segments_simple_ts_daily_with_nans): +@pytest.mark.parametrize( + "iso_code,answer", + ( + ("RUS", np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])), + ("US", np.array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])), + ), +) +def test_transform_binary_w_mon(iso_code: str, answer: np.array, two_segments_w_mon): + holidays_finder = HolidayTransform(iso_code=iso_code, mode="binary", out_column="holiday") + df = holidays_finder.fit_transform(two_segments_w_mon).to_pandas() + for segment in df.columns.get_level_values("segment").unique(): + assert np.array_equal(df[segment]["holiday"].values, answer) + assert df[segment]["holiday"].dtype == "category" + + +def test_transform_binary_day_with_nans(two_segments_simple_ts_daily_with_nans): ts = two_segments_simple_ts_daily_with_nans holidays_finder = HolidayTransform( iso_code="RUS", mode="binary", out_column="holiday", in_column="external_timestamp" @@ -324,15 +386,14 @@ def test_holidays_binary_day_with_nans(two_segments_simple_ts_daily_with_nans): ], ) @pytest.mark.parametrize( - "iso_code, answer_name", + "iso_code,answer", [ - ("UK", "uk_holiday_names_daily"), - ("US", "us_holiday_names_daily"), + ("UK", np.array(["New Year's Day"] + ["New Year Holiday [Scotland]"] + ["NO_HOLIDAY"] * 13)), + ("US", np.array(["New Year's Day"] + ["NO_HOLIDAY"] * 14)), ], ) -def 
test_holidays_category_day(in_column, ts_name, iso_code, answer_name, request): +def test_transform_category_day(in_column, ts_name, iso_code, answer, request): ts = request.getfixturevalue(ts_name) - answer = request.getfixturevalue(answer_name) holidays_finder = HolidayTransform(iso_code=iso_code, mode="category", out_column="holiday", in_column=in_column) df = holidays_finder.fit_transform(ts).to_pandas() for segment in df.columns.get_level_values("segment").unique(): @@ -340,7 +401,30 @@ def test_holidays_category_day(in_column, ts_name, iso_code, answer_name, reques assert df[segment]["holiday"].dtype == "category" -def test_holidays_category_day_with_nans(two_segments_simple_ts_daily_with_nans): +@pytest.mark.parametrize( + "iso_code,answer", + ( + ("RUS", np.array(["NO_HOLIDAY"] * 18)), + ( + "US", + np.array( + ["NO_HOLIDAY", "Martin Luther King Jr. Day"] + + ["NO_HOLIDAY"] * 3 + + ["Washington's Birthday"] + + ["NO_HOLIDAY"] * 12 + ), + ), + ), +) +def test_transform_category_w_mon(iso_code: str, answer: np.array, two_segments_w_mon): + holidays_finder = HolidayTransform(iso_code=iso_code, mode="category", out_column="holiday") + df = holidays_finder.fit_transform(two_segments_w_mon).to_pandas() + for segment in df.columns.get_level_values("segment").unique(): + assert np.array_equal(df[segment]["holiday"].values, answer) + assert df[segment]["holiday"].dtype == "category" + + +def test_transform_category_day_with_nans(two_segments_simple_ts_daily_with_nans): ts = two_segments_simple_ts_daily_with_nans holidays_finder = HolidayTransform( iso_code="RUS", mode="category", out_column="holiday", in_column="external_timestamp" @@ -352,7 +436,6 @@ def test_holidays_category_day_with_nans(two_segments_simple_ts_daily_with_nans) assert df[segment]["holiday"].dtype == "category" -# TODO: fix after discussing conceptual problems @pytest.mark.xfail() @pytest.mark.parametrize( "in_column, ts_name", @@ -369,7 +452,7 @@ def 
test_holidays_category_day_with_nans(two_segments_simple_ts_daily_with_nans) ("US", np.array([0, 1 / 7, 0, 0, 0, 1 / 7] + 12 * [0])), ), ) -def test_holidays_days_count_w_mon(in_column, ts_name, iso_code, answer, request): +def test_transform_days_count_w_mon(in_column, ts_name, iso_code, answer, request): ts = request.getfixturevalue(ts_name) holidays_finder = HolidayTransform(iso_code=iso_code, mode="days_count", out_column="holiday", in_column=in_column) ts = holidays_finder.fit_transform(ts) @@ -378,7 +461,7 @@ def test_holidays_days_count_w_mon(in_column, ts_name, iso_code, answer, request assert np.array_equal(df[segment]["holiday"].values, answer) -def test_holidays_days_count_w_mon_with_nans(two_segments_w_mon_with_nans): +def test_transform_days_count_w_mon_with_nans(two_segments_w_mon_with_nans): ts = two_segments_w_mon_with_nans holidays_finder = HolidayTransform( iso_code="RUS", mode="days_count", out_column="holiday", in_column="external_timestamp" @@ -389,19 +472,62 @@ def test_holidays_days_count_w_mon_with_nans(two_segments_w_mon_with_nans): assert df[segment]["holiday"].isna().sum() == 3 -@pytest.mark.parametrize("ts_name", ("two_segments_w_mon", "two_segments_simple_ts_day_15min")) -@pytest.mark.parametrize("mode", ("binary", "category")) -def test_holidays_binary_category_failed_wrong_freq(ts_name, mode, request): - ts = request.getfixturevalue(ts_name) - holidays_finder = HolidayTransform(out_column="holiday", mode=mode) +@pytest.mark.parametrize( + "ts_name_fit, ts_name_transform, mode", + [ + ("two_segments_simple_ts_daily_int_timestamp", "two_segments_simple_ts_daily_int_timestamp", "binary"), + ("two_segments_simple_ts_daily_int_timestamp", "two_segments_simple_ts_daily_int_timestamp", "category"), + ("two_segments_w_mon", "two_segments_w_mon_int_timestamp", "days_count"), + ], +) +def test_transform_fail_int_index(ts_name_fit, ts_name_transform, mode, request): + ts_fit = request.getfixturevalue(ts_name_fit) + ts_transform = 
request.getfixturevalue(ts_name_transform) + transform = HolidayTransform(out_column="holiday", in_column=None) + transform.fit(ts_fit) + with pytest.raises(ValueError, match="Transform can't work with integer index, parameter in_column should be set"): + _ = transform.transform(ts_transform) + + +def test_transform_days_count_fail_external_timestamp_int( + two_segments_w_mon, two_segments_w_mon_external_int_timestamp +): + ts_fit = two_segments_w_mon + ts_transform = two_segments_w_mon_external_int_timestamp + transform = HolidayTransform(in_column="external_timestamp", out_column="holiday", mode="days_count") + transform.fit(ts_fit) + with pytest.raises(ValueError, match="Transform can work only with datetime external timestamp"): + transform.transform(ts=ts_transform) + + +def test_transform_days_count_fail_irregular_timestamp( + two_segments_w_mon, two_segments_w_mon_external_irregular_timestamp +): + ts_fit = two_segments_w_mon + ts_transform = two_segments_w_mon_external_irregular_timestamp + transform = HolidayTransform(in_column="external_timestamp", out_column="holiday", mode="days_count") + transform.fit(ts_fit) with pytest.raises( - ValueError, match="For binary and category modes frequency of data should be no more than daily." + ValueError, match="Invalid in_column values! Datetime values should be regular timestamps with some frequency" ): - _ = holidays_finder.fit_transform(ts) + transform.transform(ts=ts_transform) + + +def test_transform_days_count_fail_different_freq( + two_segments_w_mon, two_segments_w_mon_external_irregular_timestamp_different_freq +): + ts_fit = two_segments_w_mon + ts_transform = two_segments_w_mon_external_irregular_timestamp_different_freq + transform = HolidayTransform(in_column="external_timestamp", out_column="holiday", mode="days_count") + transform.fit(ts_fit) + with pytest.raises( + ValueError, match="Invalid in_column values! 
Datetime values should have the same frequency for every segment" + ): + transform.transform(ts=ts_transform) @pytest.mark.parametrize("ts_name", ("two_segments_simple_ts_daily", "two_segments_simple_ts_minute")) -def test_holidays_days_count_mode_failed(ts_name, request): +def test_transform_days_count_mode_fail_wrong_freq(ts_name, request): ts = request.getfixturevalue(ts_name) holidays_finder = HolidayTransform(out_column="holiday", mode="days_count") with pytest.raises( @@ -411,13 +537,6 @@ def test_holidays_days_count_mode_failed(ts_name, request): _ = holidays_finder.fit_transform(ts) -def test_transform_index_fail_int_timestamp(two_segments_simple_ts_daily_int_timestamp): - transform = HolidayTransform(out_column="holiday", in_column=None) - transform.fit(two_segments_simple_ts_daily_int_timestamp) - with pytest.raises(ValueError, match="Transform can't work with integer index, parameter in_column should be set"): - _ = transform.transform(two_segments_simple_ts_daily_int_timestamp) - - @pytest.mark.parametrize("mode", ["binary", "category", "days_count"]) def test_get_regressors_info_index(mode): transform = HolidayTransform(mode=mode, out_column="holiday")