etna-team · d-a-bunin · Feb 22, 2024 · Feb 20, 2024 · Feb 20, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -65,6 +65,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fix `FourierTransform` on integer index, add inference tests ([#230](https://github.com/etna-team/etna/pull/230))
 - Update outliers transforms to handle integer timestamp ([#229](https://github.com/etna-team/etna/pull/229))
 - Update pipelines to handle integer timestamp ([#241](https://github.com/etna-team/etna/pull/241))
+- Add `timestamp_range` and refactor code with it ([#244](https://github.com/etna-team/etna/pull/244))
 
 ### Fixed
 - 
@@ -74,8 +75,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - 
 - Fix `DeseasonalityTransform` fails to inverse transform short series ([#174](https://github.com/etna-team/etna/pull/174))
 - 
-- 
-- 
+- Fix indexing in `stl_plot`, `plot_periodogram`, `plot_holidays`, `plot_backtest`, `plot_backtest_interactive`, `ResampleWithDistributionTransform` ([#244](https://github.com/etna-team/etna/pull/244))
+- Fix `DifferencingTransform` to handle integer timestamp on test ([#244](https://github.com/etna-team/etna/pull/244))
 - 
 - 
 - 

diff --git a/etna/analysis/decomposition/plots.py b/etna/analysis/decomposition/plots.py
@@ -116,7 +116,7 @@
     Raises
     ------
     ValueError:
-        Datetime ``start`` or ``end`` is used for data with integer timestamp.
+        Incorrect type of ``start`` or ``end`` is used according to ``ts.freq``.
     """
     start, end = _get_borders_ts(ts, start, end)
 
@@ -206,7 +206,7 @@
     Raises
     ------
     ValueError:
-        Datetime ``start`` or ``end`` is used for data with integer timestamp.
+        Incorrect type of ``start`` or ``end`` is used according to ``ts.freq``.
 
     Examples
     --------
@@ -343,7 +343,7 @@
     df = ts.to_pandas()
     for i, segment in enumerate(segments):
         segment_df = df.loc[:, pd.IndexSlice[segment, :]][segment]
-        segment_df = segment_df[segment_df.first_valid_index() : segment_df.last_valid_index()]
+        segment_df = segment_df.loc[segment_df.first_valid_index() : segment_df.last_valid_index()]
         decompose_result = STL(endog=segment_df[in_column], period=period, **stl_kwargs).fit()
 
         # start plotting

diff --git a/etna/analysis/decomposition/utils.py b/etna/analysis/decomposition/utils.py
@@ -11,6 +11,7 @@
 import numpy as np
 import pandas as pd
 from typing_extensions import Literal
+from typing_extensions import assert_never
 
 if TYPE_CHECKING:
     from etna.datasets import TSDataset
@@ -260,6 +261,8 @@ def _prepare_seasonal_plot_df(
     in_column: str,
     segments: List[str],
 ):
+    from etna.datasets.utils import timestamp_range
+
     # for simplicity we will rename our column to target
     df = ts.to_pandas().loc[:, pd.IndexSlice[segments, in_column]]
     df.rename(columns={in_column: "target"}, inplace=True)
@@ -281,24 +284,21 @@ def _prepare_seasonal_plot_df(
         timestamp = df.index
         num_to_add = -len(timestamp) % cycle
 
-        if freq is None:
-            # if we want to align by the first value, then we should append NaNs to timestamp
-            to_add_index = None
-            if SeasonalPlotAlignment(alignment) == SeasonalPlotAlignment.first:
-                to_add_index = np.arange(timestamp.max() + 1, timestamp.max() + 1 + num_to_add)
-            # if we want to align by the last value, then we should prepend NaNs to timestamp
-            elif SeasonalPlotAlignment(alignment) == SeasonalPlotAlignment.last:
-                to_add_index = np.arange(timestamp.min() - num_to_add, timestamp.min())
+        alignment_enum = SeasonalPlotAlignment(alignment)
+        # if we want to align by the first value, then we should append NaNs to timestamp
+        if alignment_enum is SeasonalPlotAlignment.first:
+            to_add_index = timestamp_range(start=timestamp[-1], periods=num_to_add + 1, freq=freq)[1:]
+        # if we want to align by the last value, then we should prepend NaNs to timestamp
+        elif alignment_enum is SeasonalPlotAlignment.last:
+            to_add_index = timestamp_range(end=timestamp[0], periods=num_to_add + 1, freq=freq)[:-1]
         else:
-            # if we want to align by the first value, then we should append NaNs to timestamp
-            to_add_index = None
-            if SeasonalPlotAlignment(alignment) == SeasonalPlotAlignment.first:
-                to_add_index = pd.date_range(start=timestamp.max(), periods=num_to_add + 1, closed="right", freq=freq)
-            # if we want to align by the last value, then we should prepend NaNs to timestamp
-            elif SeasonalPlotAlignment(alignment) == SeasonalPlotAlignment.last:
-                to_add_index = pd.date_range(end=timestamp.min(), periods=num_to_add + 1, closed="left", freq=freq)
-
-        df = pd.concat((df, pd.DataFrame(None, index=to_add_index))).sort_index()
+            assert_never(alignment_enum)
+
+        new_index = df.index.append(to_add_index)
+        index_name = df.index.name
+        df = df.reindex(new_index)
+        df.index.name = index_name
+
     elif freq is None:
         raise ValueError("Setting non-integer cycle isn't supported for data with integer timestamp!")
 

diff --git a/etna/analysis/eda/plots.py b/etna/analysis/eda/plots.py
@@ -215,7 +215,7 @@
         _, ax = _prepare_axes(num_plots=len(segments), columns_num=columns_num, figsize=figsize)
         for i, segment in enumerate(segments):
             segment_df = df.loc[:, pd.IndexSlice[segment, "target"]]
-            segment_df = segment_df[segment_df.first_valid_index() : segment_df.last_valid_index()]
+            segment_df = segment_df.loc[segment_df.first_valid_index() : segment_df.last_valid_index()]
             if segment_df.isna().any():
                 raise ValueError(f"Periodogram can't be calculated on segment with NaNs inside: {segment}")
             frequencies, spectrum = periodogram(x=segment_df, fs=period, **periodogram_params)
@@ -233,7 +233,7 @@
         lengths_segments = []
         for segment in segments:
             segment_df = df.loc[:, pd.IndexSlice[segment, "target"]]
-            segment_df = segment_df[segment_df.first_valid_index() : segment_df.last_valid_index()]
+            segment_df = segment_df.loc[segment_df.first_valid_index() : segment_df.last_valid_index()]
             if segment_df.isna().any():
                 raise ValueError(f"Periodogram can't be calculated on segment with NaNs inside: {segment}")
             lengths_segments.append(len(segment_df))
@@ -244,7 +244,7 @@
         spectrums_segments = []
         for segment in segments:
             segment_df = df.loc[:, pd.IndexSlice[segment, "target"]]
-            segment_df = segment_df[segment_df.first_valid_index() : segment_df.last_valid_index()][-cut_length:]
+            segment_df = segment_df.loc[segment_df.first_valid_index() : segment_df.last_valid_index()][-cut_length:]
             frequencies, spectrum = periodogram(x=segment_df, fs=period, **periodogram_params)
             frequencies_segments.append(frequencies)
             spectrums_segments.append(spectrum)
@@ -314,7 +314,7 @@
     Raises
     ------
     ValueError:
-        Datetime ``start`` or ``end`` is used for data with integer timestamp.
+        Incorrect type of ``start`` or ``end`` is used according to ``ts.freq``.
     ValueError:
         If ``holidays`` nor ``pd.DataFrame`` or ``str``.
     ValueError:
@@ -341,7 +341,7 @@
 
     for i, segment in enumerate(segments):
         segment_df = df.loc[start:end, pd.IndexSlice[segment, "target"]]  # type: ignore
-        segment_df = segment_df[segment_df.first_valid_index() : segment_df.last_valid_index()]
+        segment_df = segment_df.loc[segment_df.first_valid_index() : segment_df.last_valid_index()]
 
         # plot target on segment
         target_plot = ax[i].plot(segment_df.index, segment_df)
@@ -713,7 +713,7 @@
     Raises
     ------
     ValueError:
-        Datetime ``start`` or ``end`` is used for data with integer timestamp.
+        Incorrect type of ``start`` or ``end`` is used according to ``ts.freq``.
     """
     start, end = _get_borders_ts(ts, start, end)
 

diff --git a/etna/analysis/forecast/plots.py b/etna/analysis/forecast/plots.py
@@ -31,6 +31,7 @@
 from etna.analysis.forecast.utils import get_residuals
 from etna.analysis.utils import _prepare_axes
 from etna.datasets.utils import match_target_components
+from etna.datasets.utils import timestamp_range
 
 if TYPE_CHECKING:
     from etna.datasets import TSDataset
@@ -304,7 +305,7 @@
         for fold_number in folds:
             start_fold = fold_numbers[fold_numbers == fold_number].index.min()
             end_fold = fold_numbers[fold_numbers == fold_number].index.max()
-            end_fold_exclusive = pd.date_range(start=end_fold, periods=2, freq=ts.freq)[1]
+            end_fold_exclusive = timestamp_range(start=end_fold, periods=2, freq=ts.freq)[-1]
 
             # draw test
             backtest_df_slice_fold = segment_backtest_df.loc[start_fold:end_fold_exclusive]
@@ -430,7 +431,7 @@
         for fold_number in folds:
             start_fold = fold_numbers[fold_numbers == fold_number].index.min()
             end_fold = fold_numbers[fold_numbers == fold_number].index.max()
-            end_fold_exclusive = pd.date_range(start=end_fold, periods=2, freq=ts.freq)[1]
+            end_fold_exclusive = timestamp_range(start=end_fold, periods=2, freq=ts.freq)[-1]
 
             # draw test
             backtest_df_slice_fold = segment_backtest_df.loc[start_fold:end_fold_exclusive]

diff --git a/etna/analysis/outliers/plots.py b/etna/analysis/outliers/plots.py
@@ -51,7 +51,7 @@ def plot_anomalies(
     Raises
     ------
     ValueError:
-        Datetime ``start`` or ``end`` is used for integer-indexed timestamp.
+        Incorrect type of ``start`` or ``end`` is used according to ``ts.freq``.
     """
     start, end = _get_borders_ts(ts, start, end)
 
@@ -115,7 +115,7 @@ def plot_anomalies_interactive(
     Raises
     ------
     ValueError:
-        Datetime ``start`` or ``end`` is used for data with integer timestamp.
+        Incorrect type of ``start`` or ``end`` is used according to ``ts.freq``.
 
     Examples
     --------

diff --git a/etna/datasets/datasets_generation.py b/etna/datasets/datasets_generation.py
@@ -9,20 +9,18 @@
 from statsmodels.tsa.arima_process import arma_generate_sample
 
 from etna.datasets.utils import _check_timestamp_param
+from etna.datasets.utils import timestamp_range
 
 
 def _create_timestamp(
     start_time: Optional[Union[pd.Timestamp, int, str]], freq: Optional[str], periods: int
 ) -> Sequence[Union[pd.Timestamp, int]]:
-    start_time = _check_timestamp_param(param=start_time, param_name="start_time", freq=freq)
-    if freq is None:
-        if start_time is None:
-            start_time = 0
-        return np.arange(start_time, start_time + periods)  # type: ignore
-    else:
-        if start_time is None:
-            start_time = pd.Timestamp("2000-01-01")
-        return pd.date_range(start=start_time, freq=freq, periods=periods)
+    if freq is None and start_time is None:
+        start_time = 0
+    if freq is not None and start_time is None:
+        start_time = pd.Timestamp("2000-01-01")
+    _check_timestamp_param(param=start_time, param_name="start_time", freq=freq)
+    return timestamp_range(start=start_time, periods=periods, freq=freq)
 
 
 def generate_ar_df(
@@ -57,7 +55,7 @@ def generate_ar_df(
     Raises
     ------
     ValueError:
-        Non-integer timestamp parameter is used for integer-indexed timestamp.
+        Incorrect type of ``start_time`` is used according to ``freq``.
     """
     if ar_coef is None:
         ar_coef = [1]
@@ -208,7 +206,7 @@ def generate_from_patterns_df(
     Raises
     ------
     ValueError:
-        Non-integer timestamp parameter is used for integer-indexed timestamp.
+        Incorrect type of ``start_time`` is used according to ``freq``.
     """
     n_segments = len(patterns)
     if add_noise:

diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py
@@ -26,6 +26,7 @@
 from etna.datasets.utils import _TorchDataset
 from etna.datasets.utils import get_level_dataframe
 from etna.datasets.utils import inverse_transform_target_components
+from etna.datasets.utils import timestamp_range
 from etna.loggers import tslogger
 
 if TYPE_CHECKING:
@@ -260,18 +261,11 @@ def __getitem__(self, item):
 
     @staticmethod
     def _expand_index(df: pd.DataFrame, freq: Optional[str], future_steps: int) -> pd.DataFrame:
-        if freq is None:
-            new_index = np.arange(df.index.min(), df.index.max() + future_steps + 1)
-        else:
-            with warnings.catch_warnings():
-                warnings.filterwarnings(action="ignore", message="Argument `closed` is deprecated")
-                future_dates = pd.date_range(start=df.index.max(), periods=future_steps + 1, freq=freq, closed="right")
-            new_index = df.index.append(future_dates)
-
+        to_add_index = timestamp_range(start=df.index[-1], periods=future_steps + 1, freq=freq)[1:]
+        new_index = df.index.append(to_add_index)
         index_name = df.index.name
         df = df.reindex(new_index)
         df.index.name = index_name
-
         return df
 
     def make_future(
@@ -612,7 +606,7 @@ def plot(
         Raises
         ------
         ValueError:
-            Datetime ``start`` or ``end`` is used for data with integer timestamp.
+            Incorrect type of ``start`` or ``end`` is used according to ``freq``.
         """
         if segments is None:
             segments = self.segments
@@ -1036,7 +1030,8 @@ def train_test_split(
         Raises
         ------
         ValueError:
-            Non-integer timestamp parameter is used for integer-indexed timestamp.
+            Incorrect type of ``train_start``, ``train_end``, ``test_start`` or ``test_end``
+            is used according to ``ts.freq``.
 
         Examples
         --------

diff --git a/etna/datasets/utils.py b/etna/datasets/utils.py
@@ -426,3 +426,61 @@ def determine_freq(timestamps: Union[pd.Series, pd.Index]) -> Optional[str]:
             raise ValueError("Can't determine frequency of a given dataframe")
 
         return freq
+
+
+def timestamp_range(
+    start: Union[pd.Timestamp, int, str, None] = None,
+    end: Union[pd.Timestamp, int, str, None] = None,
+    periods: Optional[int] = None,
+    freq: Optional[str] = None,
+) -> pd.Index:
+    """Create index with timestamps.
+
+    Parameters
+    ----------
+    start:
+        start of index
+    end:
+        end of index
+    periods:
+        length of the index
+    freq:
+        frequency of timestamps, possible values:
+
+        - `pandas offset aliases <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_ for datetime timestamp
+
+        - None for integer timestamp
+
+    Returns
+    -------
+    :
+        Created index
+
+    Raises
+    ------
+    ValueError:
+        Incorrect type of ``start`` or ``end`` is used according to ``freq``.
+    ValueError:
+        Exactly two of the three parameters ``start``, ``end``, ``periods`` must be specified.
+    """
+    start = _check_timestamp_param(param=start, param_name="start", freq=freq)
+    end = _check_timestamp_param(param=end, param_name="end", freq=freq)
+
+    num_set = 0
+    if start is not None:
+        num_set += 1
+    if end is not None:
+        num_set += 1
+    if periods is not None:
+        num_set += 1
+    if num_set != 2:
+        raise ValueError("Of the three parameters: start, end, periods, exactly two must be specified!")
+
+    if freq is None:
+        if start is None:
+            start = end - periods + 1  # type: ignore
+        if periods is None:
+            periods = end - start + 1  # type: ignore
+        return pd.Index(np.arange(start, start + periods))
+    else:
+        return pd.date_range(start=start, end=end, periods=periods, freq=freq)
diff --git a/etna/models/nn/utils.py b/etna/models/nn/utils.py
@@ -15,6 +15,7 @@
 from etna.core import BaseMixin
 from etna.datasets.tsdataset import TSDataset
 from etna.datasets.utils import determine_num_steps
+from etna.datasets.utils import timestamp_range
 from etna.loggers import tslogger
 from etna.models.base import log_decorator
 
@@ -275,11 +276,7 @@ def _is_in_sample_prediction(self, ts: TSDataset, horizon: int) -> bool:
 
     def _is_prediction_with_gap(self, ts: TSDataset, horizon: int) -> bool:
         first_prediction_timestamp = self._get_first_prediction_timestamp(ts=ts, horizon=horizon)
-        if pd.api.types.is_integer_dtype(ts.index.dtype):
-            first_timestamp_after_train = self._last_train_timestamp + 1
-        else:
-            first_timestamp_after_train = pd.date_range(self._last_train_timestamp, periods=2, freq=self._freq)[-1]
-
+        first_timestamp_after_train = timestamp_range(start=self._last_train_timestamp, periods=2, freq=self._freq)[-1]
         return first_prediction_timestamp > first_timestamp_after_train
 
     def _make_target_prediction(self, ts: TSDataset, horizon: int) -> Tuple[TSDataset, DataLoader]: