From 7dc5665460c66e6c3e7c43a0c6be1225e99c0e38 Mon Sep 17 00:00:00 2001 From: Yakov Malyshev <38911542+ostreech1997@users.noreply.github.com> Date: Mon, 8 Apr 2024 09:25:39 +0300 Subject: [PATCH] Update internal datasets to work with unaligned data (#292) --- CHANGELOG.md | 2 +- docs/source/internal_datasets.rst | 56 +++--- etna/datasets/internal_datasets.py | 160 ++++++++++-------- tests/test_datasets/test_internal_datasets.py | 141 +++++++-------- 4 files changed, 192 insertions(+), 167 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ef58f9ec9..a3f9228e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -80,7 +80,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Update CLI to handle integer timestamp ([#246](https://github.com/etna-team/etna/pull/246)) - Update `ExogShiftTransform` to handle integer timestamp ([#254](https://github.com/etna-team/etna/pull/254)) - Extend base `TSDataset` constructor to handle long format dataframes, update documentation and tutorials with this change ([#266](https://github.com/etna-team/etna/pull/266)) -- +- Update internal datasets to work with unaligned data ([#292](https://github.com/etna-team/etna/pull/292)) - - - diff --git a/docs/source/internal_datasets.rst b/docs/source/internal_datasets.rst index dd31ba0d0..b345692c5 100644 --- a/docs/source/internal_datasets.rst +++ b/docs/source/internal_datasets.rst @@ -33,145 +33,145 @@ List of internal datasets * - :ref:`electricity_15T ` - 15 minutes - 140256 observations, 370 segments - - ("2011-01-01 00:15:00", "2015-01-01 00:00:00"), original + - ("2011-01-01 00:15:00", "2015-01-01 00:00:00") - No exog data - train, test, full * - :ref:`m3_monthly ` - monthly - 144 observations, 1428 segments - - ("2010-01-31 00:00:00", "2021-12-31 00:00:00"), synthetic + - int timestamp - Original timestamp column - train, test, full * - :ref:`m3_quarterly ` - quarterly - 72 observations, 756 segments - - ("2004-03-31 00:00:00", "2021-12-31 00:00:00"), synthetic + - int timestamp - Original timestamp column - train, test, full * - :ref:`m3_other ` - unknown, expected quarterly - 104 observations, 174 segments - - ("1996-03-31 00:00:00", "2021-12-31 00:00:00"), synthetic + - int timestamp - Original timestamp column - train, test, full * - :ref:`m3_yearly ` - yearly - 47 observations, 645 segments - - ("1975-12-31 00:00:00", "2021-12-31 00:00:00"), synthetic + - int timestamp - Original timestamp column - train, test, full * - :ref:`m4_hourly ` - hourly - 1008 observations, 414 segments - - ("2021-11-20 01:00:00", "2022-01-01 00:00:00"), synthetic + - int timestamp - No exog data - train, test, full * - :ref:`m4_daily ` - daily - 9933 observations, 4227 segments - - ("1994-10-23 00:00:00", "2022-01-01 00:00:00"), synthetic + - int timestamp - No exog data - train, test, full * - :ref:`m4_weekly ` - weekly - 2610 observations, 359 segments - - ("1971-12-27 00:00:00", "2021-12-27 00:00:00"), synthetic + - int timestamp - No exog data - train, test, full * - :ref:`m4_monthly ` - monthly - 2812 observations, 48000 segments - - ("1787-09-30 00:00:00", "2021-12-31 00:00:00"), synthetic + - int timestamp - No exog data - train, test, full * - :ref:`m4_quarterly ` - quarterly - 874 observations, 24000 segments - - ("1803-10-01 00:00:00", "2022-01-01 00:00:00"), synthetic + - int timestamp - No exog data - train, test, full * - :ref:`m4_yearly ` - daily - 47 observations, 23000 segments - - ("2019-09-14 00:00:00", "2022-01-01 00:00:00"), synthetic + - int timestamp - No exog data - 
train, test, full * - :ref:`traffic_2008_10T ` - 10 minutes - 65520 observations, 963 segments - - ("2008-01-01 00:00:00", "2009-03-30 23:50:00"), original + - ("2008-01-01 00:00:00", "2009-03-30 23:50:00") - No exog data - train, test, full * - :ref:`traffic_2008_hourly ` - hourly - 10920 observations, 963 segments - - ("2008-01-01 00:00:00", "2009-03-30 23:00:00"), original + - ("2008-01-01 00:00:00", "2009-03-30 23:00:00") - No exog data - train, test, full * - :ref:`traffic_2015_hourly ` - hourly - 17544 observations, 862 segments - - ("2015-01-01 00:00:00", "2016-12-31 23:00:00"), original + - ("2015-01-01 00:00:00", "2016-12-31 23:00:00") - No exog data - train, test, full * - :ref:`tourism_monthly ` - monthly - 333 observations, 366 segments - - ("1994-05-01 00:00:00", "2022-01-01 00:00:00"), synthetic + - int timestamp - Original timestamp column - train, test, full * - :ref:`tourism_quarterly ` - quarterly - 130 observations, 427 segments - - ("1989-09-30 00:00:00", "2021-12-31 00:00:00"), synthetic + - int timestamp - Original timestamp column - train, test, full * - :ref:`tourism_yearly ` - yearly - 47 observations, 518 segments - - ("1975-12-31 00:00:00", "2021-12-31 00:00:00"), synthetic + - int timestamp - Original timestamp column - train, test, full * - :ref:`weather_10T ` - 10 minutes - 52704 observations, 21 segments - - ("2020-01-01 00:10:00", "2021-01-01 00:00:00"), original + - ("2020-01-01 00:10:00", "2021-01-01 00:00:00") - No exog data - train, test, full * - :ref:`ETTm1 ` - 15 minutes - 69680 observations, 7 segments - - ("2016-07-01 00:00:00", "2018-06-26 19:45:00"), original + - ("2016-07-01 00:00:00", "2018-06-26 19:45:00") - No exog data - train, test, full * - :ref:`ETTm2 ` - 15 minutes - 69680 observations, 7 segments - - ("2016-07-01 00:00:00", "2018-06-26 19:45:00"), original + - ("2016-07-01 00:00:00", "2018-06-26 19:45:00") - No exog data - train, test, full * - :ref:`ETTh1 ` - hourly - 17420 observations, 7 segments - - ("2016-07-01 00:00:00", "2018-06-26 19:00:00"), original + - ("2016-07-01 00:00:00", "2018-06-26 19:00:00") - No exog data - train, test, full * - :ref:`ETTh2 ` - hourly - 17420 observations, 7 segments - - ("2016-07-01 00:00:00", "2018-06-26 19:00:00"), original + - ("2016-07-01 00:00:00", "2018-06-26 19:00:00") - No exog data - train, test, full * - :ref:`IHEPC_T ` - minute - 2075259 observations, 7 segments - - ("2006-12-16 17:24:00", "2010-11-26 21:02:00"), original + - ("2006-12-16 17:24:00", "2010-11-26 21:02:00") - No exog data - full * - :ref:`australian_wine_sales_monthly ` - monthly - 176 observations, 1 segments - - ("1980-01-01 00:00:00", "1994-08-01 00:00:00"), original + - ("1980-01-01 00:00:00", "1994-08-01 00:00:00") - No exog data - full @@ -199,8 +199,8 @@ Competition. The M3 dataset consists of time series of yearly, quarterly, monthl data originally does not have any particular frequency, but we assume it as a quarterly data. Each frequency mode has its own specific prediction horizon: 6 for yearly, 8 for quarterly, 18 for monthly, and 8 for other. -M3 dataset has series ending on different dates. As to the specificity of ``TSDataset`` we should add custom dates -to make series end on one date. Original dates are added as an exogenous data. For example, ``df_exog`` of train +The M3 dataset has series ending on different dates. Due to the specifics of ``TSDataset``, we use an integer index +to make all series end on the same timestamp. Original dates are added as exogenous data. 
For example, ``df_exog`` of train dataset has dates for train and test and ``df_exog`` of test dataset has dates only for test. Loading names: @@ -280,8 +280,8 @@ tourism bodies (such as Tourism Australia, the Hong Kong Tourism Board and Touri academics, who had used them in previous tourism forecasting studies. Each frequency mode has its own specific prediction horizon: 4 for yearly, 8 for quarterly, 24 for monthly. -Tourism dataset has series ending on different dates. As to the specificity of ``TSDataset`` we should add custom dates -to make series end on one date. Original dates are added as an exogenous data. For example, ``df_exog`` of train +The Tourism dataset has series ending on different dates. Due to the specifics of ``TSDataset``, we use an integer index +to make all series end on the same timestamp. Original dates are added as exogenous data. For example, ``df_exog`` of train dataset has dates for train and test and ``df_exog`` of test dataset has dates only for test. Loading names: diff --git a/etna/datasets/internal_datasets.py b/etna/datasets/internal_datasets.py index 8cb65c575..67cb74469 100644 --- a/etna/datasets/internal_datasets.py +++ b/etna/datasets/internal_datasets.py @@ -176,6 +176,11 @@ def load_dataset( index_col=[0], parse_dates=[0], ) + # For some datasets there are real dates that we cannot use directly, so we save them in exog data. When we + # load the dataset, we convert these dates into datetime so that the user can apply transforms to them. + if "exog_datetime_columns" in dataset_params: + dt_columns = [col for col in df_exog.columns if col[1] in dataset_params["exog_datetime_columns"]] + df_exog[dt_columns] = df_exog[dt_columns].astype("datetime64[ns]") ts = TSDataset(data, df_exog=df_exog, freq=freq) else: ts = TSDataset(data, freq=freq) @@ -247,7 +252,6 @@ def get_m4_dataset(dataset_dir: Path, dataset_freq: str) -> None: url_data = ( "https://raw.githubusercontent.com/Mcompetitions/M4-methods/6c1067e5a57161249b17289a565178dc7a3fb3ca/Dataset/" ) - end_date = "2022-01-01" freq = get_freq[dataset_freq] dataset_dir.mkdir(exist_ok=True, parents=True) @@ -257,9 +261,13 @@ def get_m4_dataset(dataset_dir: Path, dataset_freq: str) -> None: segments = data_test.index test_target = data_test.values + test_len = test_target.shape[1] + train_target = [x[~np.isnan(x)] for x in data_train.values] + + max_len = test_len + max([len(target) for target in train_target]) df_list = [] - test_timestamps = pd.date_range(end=end_date, freq=freq, periods=test_target.shape[1]) + test_timestamps = np.arange(start=max_len - test_len, stop=max_len) for segment, target in zip(segments, test_target): df_segment = pd.DataFrame({"target": target}) df_segment["segment"] = segment @@ -267,12 +275,11 @@ def get_m4_dataset(dataset_dir: Path, dataset_freq: str) -> None: df_list.append(df_segment) df_test = pd.concat(df_list, axis=0) - train_target = [x[~np.isnan(x)] for x in data_train.values] df_list = [] for segment, target in zip(segments, train_target): df_segment = pd.DataFrame({"target": target}) df_segment["segment"] = segment - df_segment["timestamp"] = pd.date_range(end=test_timestamps[0], freq=freq, periods=len(target) + 1)[:-1] + df_segment["timestamp"] = np.arange(start=max_len - test_len - len(target), stop=max_len - test_len) df_list.append(df_segment) df_train = pd.concat(df_list, axis=0) @@ -445,9 +452,9 @@ def get_m3_dataset(dataset_dir: Path, dataset_freq: str) -> None: data originally does not have any particular frequency, but we assume it as a quarterly data.
Each frequency mode has its own specific prediction horizon: 6 for yearly, 8 for quarterly, 18 for monthly, and 8 for other. - M3 dataset has series ending on different dates. As to the specificity of TSDataset we should add custom dates - to make series end on one date. Original dates are added as an exogenous data. For example, ``df_exog`` of train - dataset has dates for train and test and ``df_exog`` of test dataset has dates only for test. + The M3 dataset has series ending on different dates. Due to the specifics of TSDataset, we use an integer index + to make all series end on the same timestamp. Original dates are added as exogenous data. For example, ``df_exog`` + of train dataset has dates for train and test and ``df_exog`` of test dataset has dates only for test. Parameters ---------- @@ -461,16 +468,15 @@ def get_m3_dataset(dataset_dir: Path, dataset_freq: str) -> None: .. [1] https://forvis.github.io/datasets/m3-data/ .. [2] https://forecasters.org/resources/time-series-data/m3-competition/ """ - get_freq = {"monthly": "M", "quarterly": "Q-DEC", "yearly": "A-DEC", "other": "Q-DEC"} get_horizon = {"monthly": 18, "quarterly": 8, "yearly": 6, "other": 8} url_data = "https://forvis.github.io/data" - end_date = "2022-01-01" - freq = get_freq[dataset_freq] + horizon = get_horizon[dataset_freq] exog_dir = dataset_dir / EXOG_SUBDIRECTORY exog_dir.mkdir(exist_ok=True, parents=True) data = pd.read_csv(f"{url_data}/M3_{dataset_freq}_TSTS.csv") + max_len = data.groupby("series_id")["timestamp"].count().max() df_full = pd.DataFrame() df_train = pd.DataFrame() @@ -478,9 +484,8 @@ def get_m3_dataset(dataset_dir: Path, dataset_freq: str) -> None: df_full_exog = pd.DataFrame() df_test_exog = pd.DataFrame() - horizon = get_horizon[dataset_freq] for _, group in data.groupby("series_id"): - timestamps = pd.date_range(end=end_date, freq=freq, periods=group.shape[0]) + timestamps = np.arange(start=max_len - group.shape[0], stop=max_len) group.rename(columns={"timestamp": "origin_timestamp", "series_id": "segment", "value": "target"}, inplace=True) group["segment"] = group["segment"] + "_" + group["category"] group.drop(columns=["category"], inplace=True) @@ -500,6 +505,13 @@ def get_m3_dataset(dataset_dir: Path, dataset_freq: str) -> None: df_full_exog = pd.concat([df_full_exog, df_full_part_exog]) df_test_exog = pd.concat([df_test_exog, df_test_part_exog]) + if dataset_freq == "yearly": + df_full_exog["origin_timestamp"] = pd.to_datetime(df_full_exog["origin_timestamp"], format="%Y") + df_test_exog["origin_timestamp"] = pd.to_datetime(df_test_exog["origin_timestamp"], format="%Y") + elif dataset_freq != "other": + df_full_exog["origin_timestamp"] = pd.to_datetime(df_full_exog["origin_timestamp"]) + df_test_exog["origin_timestamp"] = pd.to_datetime(df_test_exog["origin_timestamp"]) + TSDataset.to_dataset(df_full).to_csv( dataset_dir / f"m3_{dataset_freq.lower()}_full.csv.gz", index=True, compression="gzip", float_format="%.8f" ) @@ -509,6 +521,7 @@ def get_m3_dataset(dataset_dir: Path, dataset_freq: str) -> None: TSDataset.to_dataset(df_test).to_csv( dataset_dir / f"m3_{dataset_freq.lower()}_test.csv.gz", index=True, compression="gzip", float_format="%.8f" ) + TSDataset.to_dataset(df_full_exog).to_csv( dataset_dir / EXOG_SUBDIRECTORY / f"m3_{dataset_freq.lower()}_full_exog.csv.gz", index=True, @@ -538,8 +551,8 @@ def get_tourism_dataset(dataset_dir: Path, dataset_freq: str) -> None: academics, who had used them in previous tourism forecasting studies.
Each frequency mode has its own specific prediction horizon: 4 for yearly, 8 for quarterly, 24 for monthly. - Tourism dataset has series ending on different dates. As to the specificity of TSDataset we should add custom dates - to make series end on one date. Original dates are added as an exogenous data. For example, ``df_exog`` of train + The Tourism dataset has series ending on different dates. Due to the specifics of TSDataset, we use an integer index + to make all series end on the same timestamp. Original dates are added as exogenous data. For example, ``df_exog`` of train dataset has dates for train and test and ``df_exog`` of test dataset has dates only for test. References ---------- .. [1] https://robjhyndman.com/publications/the-tourism-forecasting-competition/ """ get_freq = {"monthly": "MS", "quarterly": "Q-DEC", "yearly": "A-DEC"} start_index_target_rows = {"monthly": 3, "quarterly": 3, "yearly": 2} - end_date = "2022-01-01" freq = get_freq[dataset_freq] target_index = start_index_target_rows[dataset_freq] exog_dir = dataset_dir / EXOG_SUBDIRECTORY @@ -560,7 +572,7 @@ def get_tourism_dataset(dataset_dir: Path, dataset_freq: str) -> None: file_names=(f"{dataset_freq}_in.csv", f"{dataset_freq}_oos.csv"), read_functions=(partial(pd.read_csv, sep=","), partial(pd.read_csv, sep=",")), ) - + max_len = int(data_train.iloc[0].max() + data_test.iloc[0].max()) segments = data_train.columns df_full = pd.DataFrame() @@ -583,7 +595,7 @@ def get_tourism_dataset(dataset_dir: Path, dataset_freq: str) -> None: target_test = data_test_[target_index : target_index + test_size] target_full = np.concatenate([target_train, target_test]) - new_timestamps = pd.date_range(end=end_date, freq=freq, periods=len(target_full)) + new_timestamps = np.arange(start=max_len - len(target_full), stop=max_len) initial_timestamps = pd.date_range(start=initial_date, periods=len(target_full), freq=freq) df_full_ = pd.DataFrame( @@ -752,7 +764,7 @@ def get_ihepc_dataset(dataset_dir: Path) -> None: df_full.to_csv(dataset_dir / f"IHEPC_T_full.csv.gz", index=True, compression="gzip", float_format="%.8f") -def get_australian_wine_sales_daataset(dataset_dir: Path) -> None: +def get_australian_wine_sales_dataset(dataset_dir: Path) -> None: """ Download and save Australian total wine sales by wine makers in bottles. 
@@ -799,102 +811,105 @@ def list_datasets() -> List[str]: }, "m3_monthly": { "get_dataset_function": partial(get_m3_dataset, dataset_freq="monthly"), - "freq": "M", + "freq": None, + "exog_datetime_columns": ("origin_timestamp",), "parts": ("train", "test", "full"), "hash": { - "train": "cfa58e9c2caf28849f5397ba159887b2", - "test": "9d8f9871e418239f0efc23550dbe2e91", - "full": "d1a8bad4aba489d04063dd48cedb96a5", + "train": "36535626a98157ccbfe3d1f5b2d964ac", + "test": "09af36fa503b41ea5283db6ec6063ae1", + "full": "4babb773e580501b4918557555157f34", }, }, "m3_quarterly": { "get_dataset_function": partial(get_m3_dataset, dataset_freq="quarterly"), - "freq": "Q-DEC", + "freq": None, + "exog_datetime_columns": ("origin_timestamp",), "parts": ("train", "test", "full"), "hash": { - "train": "f944dd06aa47a495f18b40f0a1dab6a5", - "test": "d29138ea613c8a4945cbd421754254e0", - "full": "fdfdd5400dce06530d576f4136d13421", + "train": "fb4286f519a6aa9385937c47dde6ddf4", + "test": "a27614afc474472f842a152a6ceb95e6", + "full": "dba2451b2aac7fc397c1cff5ad32a3dd", }, }, "m3_yearly": { "get_dataset_function": partial(get_m3_dataset, dataset_freq="yearly"), - "freq": "A-DEC", + "freq": None, + "exog_datetime_columns": ("origin_timestamp",), "parts": ("train", "test", "full"), "hash": { - "train": "6eb14930144e2012d0132f0b809cf2d8", - "test": "15ad9304aa9d0a3acf6496e7e5e03176", - "full": "d41fadf624a61645c545847e2154c4a9", + "train": "1d14eb24b2dd7bc9796a5758c6b215f1", + "test": "ad83bafa0533557a65e124aed9b1c381", + "full": "62fc772fe16c1e0eb53401f088f82b6a", }, }, "m3_other": { "get_dataset_function": partial(get_m3_dataset, dataset_freq="other"), - "freq": "Q-DEC", + "freq": None, "parts": ("train", "test", "full"), "hash": { - "train": "9132a834a7edb7f7c10215f753c0d68c", - "test": "d489b43229c7498c937f38fa465e8734", - "full": "9b55fd0bc336120e3756e022f5beade3", + "train": "37316d0cc7eb45c653719aea0be53880", + "test": "a63258ce320d3f2e68c019c9f23767b1", + "full": "81b024a7ef1b6be31e748c47edb057be", }, }, "m4_hourly": { "get_dataset_function": partial(get_m4_dataset, dataset_freq="Hourly"), - "freq": "H", + "freq": None, "parts": ("train", "test", "full"), "hash": { - "train": "61dcfc17181fdeb67821fc3a9ff4b509", - "test": "53768f5aa63d5c99eb6841fbd14fa42f", - "full": "1bf6e9a9f5ae7e19261bb01a9a24da6f", + "train": "239f11e69086ee0ef9c39fcb0bb89286", + "test": "36cc4ae564342a361695c402e6812074", + "full": "fd299eaaa9ef3deadabb0197c37ba8b2", }, }, "m4_daily": { "get_dataset_function": partial(get_m4_dataset, dataset_freq="Daily"), - "freq": "D", + "freq": None, "parts": ("train", "test", "full"), "hash": { - "train": "dbf8a576d00f1e523f01f8a72af6c0da", - "test": "294ad20e7c6f0a1dddb4f749b7473dc0", - "full": "11e60a29e9ea7c4f9672e77bd107e4d8", + "train": "7878f1485a779da34848f900c58ca991", + "test": "e26d4a1bc0b45428a52f1ba8be3bf510", + "full": "7a1ce18e378fb8c69f02757547ccab4c", }, }, "m4_weekly": { "get_dataset_function": partial(get_m4_dataset, dataset_freq="Weekly"), - "freq": "W-MON", + "freq": None, "parts": ("train", "test", "full"), "hash": { - "train": "26821e9fd21cac965bbedc35a137f18a", - "test": "6798cae75181c5f0c1a608eb0e59e23f", - "full": "5bdbaff1a011ef8723f09a38e0266fcf", + "train": "6dedd34a04fefb7f6da626b37fcf0ad2", + "test": "69f807a621b864d7e2d51f6daca147d8", + "full": "9954d2341af9615472f58afcc9dae2fd", }, }, "m4_monthly": { "get_dataset_function": partial(get_m4_dataset, dataset_freq="Monthly"), - "freq": "M", + "freq": None, "parts": ("train", "test", "full"), "hash": { - "train": 
"f625bc066e42299132aaad2a79e54537", - "test": "9e2dc5262ca01b5d2c0a6d2993039735", - "full": "78a96c47ee4335bd59e33a1e7b26c3b3", + "train": "6c1f5212132429c24279c583d8350ec3", + "test": "8495595ea49766f94855e2275adf41e8", + "full": "69e4479c83174eddf22b9c125de086b8", }, }, "m4_quarterly": { "get_dataset_function": partial(get_m4_dataset, dataset_freq="Quarterly"), - "freq": "QS-JAN", + "freq": None, "parts": ("train", "test", "full"), "hash": { - "train": "540c397f52a13dd17f5158ab799a93f9", - "test": "8a145e44f9ce19ffe004d867ac7899d4", - "full": "745c6e679a600dcd96211c7717605d72", + "train": "a82abe6bb3d471ae23dc8de0c28d62c2", + "test": "2469cf58fea2468c30ffc4ad5891b67c", + "full": "bc076efa89d65cb5ce35d867b9bfcb3b", }, }, "m4_yearly": { "get_dataset_function": partial(get_m4_dataset, dataset_freq="Yearly"), - "freq": "D", + "freq": None, "parts": ("train", "test", "full"), "hash": { - "train": "67d73db6245af5c5551f38d315e290f9", - "test": "806d1f2257162fe95c98718db2f04ab7", - "full": "011bef4ab44721a99288d502ccb2bc98", + "train": "b44199b886507abd9118e0f756527af9", + "test": "676c705384b67d4ffad6d5b25873501e", + "full": "1ee536a16c9d505f5411de5fc8e0e265", }, }, "traffic_2008_10T": { @@ -929,32 +944,35 @@ def list_datasets() -> List[str]: }, "tourism_monthly": { "get_dataset_function": partial(get_tourism_dataset, dataset_freq="monthly"), - "freq": "MS", + "freq": None, + "exog_datetime_columns": ("origin_timestamp",), "parts": ("train", "test", "full"), "hash": { - "train": "2a32e030b783a0de3e74f9d412e6e70c", - "test": "c5d4f520692d000cd6517e1cd67f2345", - "full": "f1d8b9bf506d49f6c902c97624fe23bd", + "train": "eb65658979dcf20254df2e27793c4a2f", + "test": "4413d427fb1c7fd161a2ae896a9f2e17", + "full": "ccb8fd049488568af81c9fe341d05470", }, }, "tourism_quarterly": { "get_dataset_function": partial(get_tourism_dataset, dataset_freq="quarterly"), - "freq": "Q-DEC", + "freq": None, + "exog_datetime_columns": ("origin_timestamp",), "parts": ("train", "test", "full"), "hash": { - "train": "9840d4875899d81349321aae6f859c21", - "test": "17e193090a32c91fc482db9993f5db28", - "full": "645822fcb6a46dfe7375d2eb6f117ef2", + "train": "380fe61422a5333043b714c22bcb6725", + "test": "0cea851864a96c344778037d3baaedf5", + "full": "a58fd54e937182b52220c7e733b982ca", }, }, "tourism_yearly": { "get_dataset_function": partial(get_tourism_dataset, dataset_freq="yearly"), - "freq": "A-DEC", + "freq": None, + "exog_datetime_columns": ("origin_timestamp",), "parts": ("train", "test", "full"), "hash": { - "train": "d0781023602223cc9b9c2dca1981c0fb", - "test": "a5461b2fcbf6bac12591d657b1b930f9", - "full": "9032dbd5d0a7e0f696d6a5c005a493e0", + "train": "62ccbd0a636fd8797d20eab58d78e503", + "test": "52d826295bf39cca8ab067c04e0fb883", + "full": "33bc585db54a4b965149ff9b991c2def", }, }, "weather_10T": { @@ -1014,7 +1032,7 @@ def list_datasets() -> List[str]: "hash": {"full": "8909138462ea130b9809907e947ffae6"}, }, "australian_wine_sales_monthly": { - "get_dataset_function": get_australian_wine_sales_daataset, + "get_dataset_function": get_australian_wine_sales_dataset, "freq": "MS", "parts": ("full",), "hash": {"full": "2dd34b5306d5e5372727e4d610b713be"}, diff --git a/tests/test_datasets/test_internal_datasets.py b/tests/test_datasets/test_internal_datasets.py index 703a26070..de96d9992 100644 --- a/tests/test_datasets/test_internal_datasets.py +++ b/tests/test_datasets/test_internal_datasets.py @@ -79,7 +79,7 @@ def test_not_present_part(): @pytest.mark.parametrize( - "dataset_name, expected_shape, expected_min_date, 
expected_max_date, dataset_parts", + "dataset_name, expected_shape, expected_min_timestamp, expected_max_timestamp, dataset_parts", [ pytest.param( "electricity_15T", @@ -92,43 +92,45 @@ def test_not_present_part(): ( "m4_hourly", (960 + 48, 414), - pd.to_datetime("2021-11-20 01:00:00"), - pd.to_datetime("2022-01-01 00:00:00"), + 0, + 1007, ("train", "test"), ), - ( + pytest.param( "m4_daily", (9919 + 14, 4227), - pd.to_datetime("1994-10-23 00:00:00"), - pd.to_datetime("2022-01-01 00:00:00"), + 0, + 9932, ("train", "test"), + marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), ), ( "m4_weekly", (2597 + 13, 359), - pd.to_datetime("1971-12-27 00:00:00"), - pd.to_datetime("2021-12-27 00:00:00"), + 0, + 2609, ("train", "test"), ), - ( + pytest.param( "m4_monthly", (2794 + 18, 48000), - pd.to_datetime("1787-09-30 00:00:00"), - pd.to_datetime("2021-12-31 00:00:00"), + 0, + 2811, ("train", "test"), + marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), ), ( "m4_quarterly", (866 + 8, 24000), - pd.to_datetime("1803-10-01 00:00:00"), - pd.to_datetime("2022-01-01 00:00:00"), + 0, + 873, ("train", "test"), ), ( "m4_yearly", (835 + 6, 23000), - pd.to_datetime("2019-09-14 00:00:00"), - pd.to_datetime("2022-01-01 00:00:00"), + 0, + 840, ("train", "test"), ), pytest.param( @@ -158,50 +160,50 @@ def test_not_present_part(): ( "m3_monthly", (126 + 18, 2856), - pd.to_datetime("2010-01-31 00:00:00"), - pd.to_datetime("2021-12-31 00:00:00"), + 0, + 143, ("train", "test"), ), ( "m3_quarterly", (64 + 8, 1512), - pd.to_datetime("2004-03-31 00:00:00"), - pd.to_datetime("2021-12-31 00:00:00"), + 0, + 71, ("train", "test"), ), ( "m3_other", (96 + 8, 348), - pd.to_datetime("1996-03-31 00:00:00"), - pd.to_datetime("2021-12-31 00:00:00"), + 0, + 103, ("train", "test"), ), ( "m3_yearly", (41 + 6, 1290), - pd.to_datetime("1975-12-31 00:00:00"), - pd.to_datetime("2021-12-31 00:00:00"), + 0, + 46, ("train", "test"), ), ( "tourism_monthly", (309 + 24, 732), - pd.to_datetime("1994-05-01 00:00:00"), - pd.to_datetime("2022-01-01 00:00:00"), + 0, + 332, ("train", "test"), ), ( "tourism_quarterly", (122 + 8, 854), - pd.to_datetime("1989-09-30 00:00:00"), - pd.to_datetime("2021-12-31 00:00:00"), + 0, + 129, ("train", "test"), ), ( "tourism_yearly", (43 + 4, 1036), - pd.to_datetime("1975-12-31 00:00:00"), - pd.to_datetime("2021-12-31 00:00:00"), + 0, + 46, ("train", "test"), ), ( @@ -239,12 +241,13 @@ def test_not_present_part(): pd.to_datetime("2018-06-26 19:00:00"), ("train", "test"), ), - ( + pytest.param( "IHEPC_T", (2075259, 7), pd.to_datetime("2006-12-16 17:24:00"), pd.to_datetime("2010-11-26 21:02:00"), tuple(), + marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), ), ( "australian_wine_sales_monthly", @@ -255,11 +258,13 @@ def test_not_present_part(): ), ], ) -def test_dataset_statistics(dataset_name, expected_shape, expected_min_date, expected_max_date, dataset_parts): +def test_dataset_statistics( + dataset_name, expected_shape, expected_min_timestamp, expected_max_timestamp, dataset_parts +): ts_full = load_dataset(dataset_name, parts="full", rebuild_dataset=True) assert ts_full.df.shape == expected_shape - assert ts_full.index.min() == expected_min_date - assert ts_full.index.max() == expected_max_date + assert ts_full.index.min() == expected_min_timestamp + assert ts_full.index.max() == expected_max_timestamp if dataset_parts: ts_parts = load_dataset(dataset_name, parts=dataset_parts) @@ -268,15 +273,15 @@ def 
test_dataset_statistics(dataset_name, expected_shape, expected_min_date, exp @pytest.mark.parametrize( - "dataset_name, expected_df_exog_shapes, expected_df_exog_dates, dataset_parts", + "dataset_name, expected_df_exog_shapes, expected_df_exog_timestamps, dataset_parts", [ ( "m3_monthly", ((144, 1428), (144, 1428), (18, 1428)), ( - (pd.to_datetime("2010-01-31 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), - (pd.to_datetime("2010-01-31 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), - (pd.to_datetime("2020-07-31 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), + (0, 143), + (0, 143), + (126, 143), ), ("full", "train", "test"), ), @@ -284,19 +289,9 @@ def test_dataset_statistics(dataset_name, expected_shape, expected_min_date, exp "m3_quarterly", ((72, 756), (72, 756), (8, 756)), ( - (pd.to_datetime("2004-03-31 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), - (pd.to_datetime("2004-03-31 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), - (pd.to_datetime("2020-03-31 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), - ), - ("full", "train", "test"), - ), - ( - "m3_other", - ((104, 174), (104, 174), (8, 174)), - ( - (pd.to_datetime("1996-03-31 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), - (pd.to_datetime("1996-03-31 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), - (pd.to_datetime("2020-03-31 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), + (0, 71), + (0, 71), + (64, 71), ), ("full", "train", "test"), ), @@ -304,9 +299,9 @@ def test_dataset_statistics(dataset_name, expected_shape, expected_min_date, exp "m3_yearly", ((47, 645), (47, 645), (6, 645)), ( - (pd.to_datetime("1975-12-31 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), - (pd.to_datetime("1975-12-31 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), - (pd.to_datetime("2016-12-31 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), + (0, 46), + (0, 46), + (41, 46), ), ("full", "train", "test"), ), @@ -314,9 +309,9 @@ def test_dataset_statistics(dataset_name, expected_shape, expected_min_date, exp "tourism_monthly", ((333, 366), (333, 366), (24, 366)), ( - (pd.to_datetime("1994-05-01 00:00:00"), pd.to_datetime("2022-01-01 00:00:00")), - (pd.to_datetime("1994-05-01 00:00:00"), pd.to_datetime("2022-01-01 00:00:00")), - (pd.to_datetime("2020-02-01 00:00:00"), pd.to_datetime("2022-01-01 00:00:00")), + (0, 332), + (0, 332), + (309, 332), ), ("full", "train", "test"), ), @@ -324,9 +319,9 @@ def test_dataset_statistics(dataset_name, expected_shape, expected_min_date, exp "tourism_quarterly", ((130, 427), (130, 427), (8, 427)), ( - (pd.to_datetime("1989-09-30 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), - (pd.to_datetime("1989-09-30 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), - (pd.to_datetime("2020-03-31 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), + (0, 129), + (0, 129), + (122, 129), ), ("full", "train", "test"), ), @@ -334,9 +329,9 @@ def test_dataset_statistics(dataset_name, expected_shape, expected_min_date, exp "tourism_yearly", ((47, 518), (47, 518), (4, 518)), ( - (pd.to_datetime("1975-12-31 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), - (pd.to_datetime("1975-12-31 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), - (pd.to_datetime("2018-12-31 00:00:00"), pd.to_datetime("2021-12-31 00:00:00")), + (0, 46), + (0, 46), + (43, 46), ), ("full", "train", "test"), ), @@ -345,14 +340,17 @@ def test_dataset_statistics(dataset_name, expected_shape, expected_min_date, exp def test_df_exog_statistics( dataset_name, expected_df_exog_shapes, - expected_df_exog_dates, + 
expected_df_exog_dates, + expected_df_exog_timestamps, dataset_parts, ): ts_parts = load_dataset(dataset_name, parts=dataset_parts) for i, part in enumerate(ts_parts): assert part.df_exog.shape == expected_df_exog_shapes[i] for i, part in enumerate(ts_parts): - assert (part.df_exog.index.min(), part.df_exog.index.max()) == expected_df_exog_dates[i] + assert (part.df_exog.index.min(), part.df_exog.index.max()) == expected_df_exog_timestamps[i] + for i, part in enumerate(ts_parts): + exog_col_type = part.df_exog.dtypes.iloc[0] + assert pd.api.types.is_datetime64_dtype(exog_col_type) def test_list_datasets(): @@ -368,9 +366,15 @@ def test_list_datasets(): marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), ), "m4_hourly", - "m4_daily", + pytest.param( + "m4_daily", + marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), + ), "m4_weekly", - "m4_monthly", + pytest.param( + "m4_monthly", + marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), + ), "m4_quarterly", "m4_yearly", pytest.param( @@ -397,7 +401,10 @@ def test_list_datasets(): "ETTm2", "ETTh1", "ETTh2", - "IHEPC_T", + pytest.param( + "IHEPC_T", + marks=pytest.mark.skip(reason="Dataset is too large for testing in GitHub."), + ), "australian_wine_sales_monthly", ], )
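
Note on the alignment scheme used above: segments of different lengths are right-aligned on a shared integer index, so every segment ends at the same position and shorter segments simply start later. A minimal sketch of the arithmetic from get_m4_dataset (the numbers below are toy values, not taken from a real dataset; in the real code max_len is the test length plus the length of the longest train series):

import numpy as np

# Toy illustration of the right-aligned integer index (assumed values).
max_len = 10  # longest train series (8 points) + test window (2 points)
test_len = 2
train_target = np.array([3.0, 4.0, 5.0])  # one short train series, 3 points

# Train points occupy the positions just before the test window ...
train_timestamps = np.arange(start=max_len - test_len - len(train_target), stop=max_len - test_len)
# ... and the test window always occupies the last test_len positions.
test_timestamps = np.arange(start=max_len - test_len, stop=max_len)

print(train_timestamps)  # [5 6 7]
print(test_timestamps)   # [8 9]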
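
From the user side, a usage sketch of what this change means (the import path is assumed to be the library's public etna.datasets module, as in the tests above; the printed values are illustrative, not verified output):

from etna.datasets import load_dataset

# Unaligned datasets are now loaded with an integer timestamp index
# (freq=None) instead of synthetic dates.
ts_train, ts_test = load_dataset("m3_monthly", parts=("train", "test"))
print(ts_train.index.min(), ts_train.index.max())  # integer positions, e.g. 0 and 125

# Original dates are kept in df_exog (the "origin_timestamp" column) and are
# converted to datetime64[ns] on load via the "exog_datetime_columns" dataset parameter.
print(ts_train.df_exog.dtypes.iloc[0])  # datetime64[ns]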