Implement SavePredictionIntervalsMixin (#87)
* added save/load mixin

* removed placeholders from base class

* moved fixtures

* added tests

* added tests

* updated changelog

* review fixes
brsnw250 committed Sep 23, 2023
1 parent a8fdd3c commit 4d87da9
Showing 6 changed files with 232 additions and 22 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -7,7 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased
### Added
- Base class `BasePredictionIntervals` for prediction intervals into experimental module. ([#86](https://github.com/etna-team/etna/pull/86))
- `SavePredictionIntervalsMixin` for the `BasePredictionIntervals` ([#87](https://github.com/etna-team/etna/pull/87))
- Base class `BasePredictionIntervals` for prediction intervals into experimental module ([#86](https://github.com/etna-team/etna/pull/86))
- Add `fit_params` parameter to `etna.models.sarimax.SARIMAXModel` ([#69](https://github.com/etna-team/etna/pull/69))
- Add `quickstart` notebook, add `mechanics_of_forecasting` notebook ([#1343](https://github.com/tinkoff-ai/etna/pull/1343))
- Add gallery of tutorials divided by level ([#46](https://github.com/etna-team/etna/pull/46))
13 changes: 2 additions & 11 deletions etna/experimental/prediction_intervals/base.py
@@ -1,4 +1,3 @@
import pathlib
from abc import abstractmethod
from typing import Dict
from typing import Optional
@@ -8,10 +7,11 @@

from etna.datasets import TSDataset
from etna.distributions import BaseDistribution
from etna.experimental.prediction_intervals.mixins import SavePredictionIntervalsMixin
from etna.pipeline.base import BasePipeline


class BasePredictionIntervals(BasePipeline):
class BasePredictionIntervals(SavePredictionIntervalsMixin, BasePipeline):
"""Base class for prediction intervals methods.
This class implements a wrapper interface for pipelines and ensembles that provides the ability to
@@ -113,15 +113,6 @@ def _forecast(self, ts: TSDataset, return_components: bool) -> TSDataset:
"""Make point forecasts using base pipeline or ensemble."""
return self.pipeline._forecast(ts=ts, return_components=return_components)

def save(self, path: pathlib.Path):
"""Implement in SavePredictionIntervalsMixin."""
pass

@classmethod
def load(cls, path: pathlib.Path):
"""Implement in SavePredictionIntervalsMixin."""
pass

def forecast(
self,
ts: Optional[TSDataset] = None,
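As a side note (not part of the diff), a minimal sketch of what this inheritance change means in practice: `save` and `load` now resolve to the mixin added in this PR rather than to the removed placeholder methods, so every `BasePredictionIntervals` subclass picks them up automatically.

```python
# Sketch only: check that save/load resolve to SavePredictionIntervalsMixin
# through the MRO after the placeholders were removed from the base class.
from etna.experimental.prediction_intervals.base import BasePredictionIntervals
from etna.experimental.prediction_intervals.mixins import SavePredictionIntervalsMixin

# `save` is a plain method, so the class attribute is the mixin's function itself
assert BasePredictionIntervals.save is SavePredictionIntervalsMixin.save

# `load` is a classmethod, so compare the underlying functions instead
assert BasePredictionIntervals.load.__func__ is SavePredictionIntervalsMixin.load.__func__
```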
92 changes: 92 additions & 0 deletions etna/experimental/prediction_intervals/mixins.py
@@ -0,0 +1,92 @@
import pathlib
import tempfile
import zipfile
from typing import Optional

from typing_extensions import Self

from etna.core import SaveMixin
from etna.core import load
from etna.datasets import TSDataset
from etna.pipeline import BasePipeline


class SavePredictionIntervalsMixin(SaveMixin):
"""Implementation of ``AbstractSaveable`` abstract class for prediction intervals with pipelines inside.
It saves the object to a zip archive with 3 entities:
* metadata.json: contains library version and class name.
* object.pkl: the object pickled without pipeline and ts.
* pipeline.zip: pipeline archive, saved with its own method.
"""

def save(self, path: pathlib.Path):
"""Save the object.
Parameters
----------
path:
Path to save object to.
"""
self.pipeline: BasePipeline

pipeline = self.pipeline

try:
# extract pipeline to save it with its own method later
delattr(self, "pipeline")

# save the remaining part
super().save(path=path)

finally:
self.pipeline = pipeline

with zipfile.ZipFile(path, "a") as archive:
with tempfile.TemporaryDirectory() as _temp_dir:
temp_dir = pathlib.Path(_temp_dir)

# save pipeline separately and add to the archive
pipeline_save_path = temp_dir / "pipeline.zip"
pipeline.save(path=pipeline_save_path)

archive.write(pipeline_save_path, "pipeline.zip")

@classmethod
def load(cls, path: pathlib.Path, ts: Optional[TSDataset] = None) -> Self:
"""Load an object.
Warning
-------
This method uses the :py:mod:`dill` module, which is not secure.
It is possible to construct malicious data which will execute arbitrary code during loading.
Never load data that could have come from an untrusted source, or that could have been tampered with.
Parameters
----------
path:
Path to load object from.
ts:
TSDataset to set into loaded pipeline.
Returns
-------
:
Loaded object.
"""
obj = super().load(path=path)

with zipfile.ZipFile(path, "r") as archive:
with tempfile.TemporaryDirectory() as _temp_dir:
temp_dir = pathlib.Path(_temp_dir)

archive.extractall(temp_dir)

# load pipeline and add to the object
pipeline_path = temp_dir / "pipeline.zip"
obj.pipeline = load(path=pipeline_path, ts=ts)

return obj
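For readers skimming the diff, a rough usage sketch of the new mixin follows. Assumptions: `DummyPredictionIntervals` and `get_naive_pipeline` are the test helpers from `tests/test_experimental/test_prediction_intervals/common.py` referenced below, and their signatures match how the tests call them.

```python
# Rough sketch of a save/load round trip with the new mixin; helper names are
# taken from the test modules in this PR, not from the library's public API.
import pathlib
import tempfile
import zipfile

from tests.test_experimental.test_prediction_intervals.common import DummyPredictionIntervals
from tests.test_experimental.test_prediction_intervals.common import get_naive_pipeline

intervals = DummyPredictionIntervals(pipeline=get_naive_pipeline(horizon=5))

with tempfile.TemporaryDirectory() as tmp:
    path = pathlib.Path(tmp) / "intervals.zip"
    intervals.save(path)

    # archive layout matches the class docstring
    with zipfile.ZipFile(path, "r") as archive:
        print(sorted(archive.namelist()))  # ['metadata.json', 'object.pkl', 'pipeline.zip']

    # load back; a TSDataset can optionally be re-attached via the `ts` argument
    restored = DummyPredictionIntervals.load(path)
```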
14 changes: 14 additions & 0 deletions tests/test_experimental/test_prediction_intervals/conftest.py
@@ -0,0 +1,14 @@
import pytest

from tests.test_experimental.test_prediction_intervals.common import get_naive_pipeline
from tests.test_experimental.test_prediction_intervals.common import get_naive_pipeline_with_transforms


@pytest.fixture()
def naive_pipeline():
return get_naive_pipeline(horizon=5)


@pytest.fixture()
def naive_pipeline_with_transforms():
return get_naive_pipeline_with_transforms(horizon=5)
34 changes: 24 additions & 10 deletions tests/test_experimental/test_prediction_intervals/test_base.py
@@ -22,6 +22,7 @@
from tests.test_experimental.test_prediction_intervals.common import get_naive_pipeline
from tests.test_experimental.test_prediction_intervals.common import get_naive_pipeline_with_transforms
from tests.test_experimental.test_prediction_intervals.utils import assert_sampling_is_valid
from tests.test_pipeline.utils import assert_pipeline_equals_loaded_original


def run_base_pipeline_compat_check(ts, pipeline, expected_columns):
@@ -35,16 +36,6 @@ def run_base_pipeline_compat_check(ts, pipeline, expected_columns):
assert np.sum(intervals_pipeline_pred.df.isna().values) == 0


@pytest.fixture()
def naive_pipeline():
return get_naive_pipeline(horizon=5)


@pytest.fixture()
def naive_pipeline_with_transforms():
return get_naive_pipeline_with_transforms(horizon=5)


def test_pipeline_ref_initialized(naive_pipeline):
intervals_pipeline = DummyPredictionIntervals(pipeline=naive_pipeline)

@@ -215,3 +206,26 @@ def test_default_params_to_tune_error(pipeline):

with pytest.raises(NotImplementedError, match=f"{pipeline.__class__.__name__} doesn't support"):
_ = intervals_pipeline.params_to_tune()


@pytest.mark.parametrize("load_ts", (True, False))
@pytest.mark.parametrize(
"pipeline",
(
Pipeline(model=LinearPerSegmentModel(), transforms=[DateFlagsTransform()]),
AutoRegressivePipeline(model=LinearPerSegmentModel(), transforms=[DateFlagsTransform()], horizon=1),
HierarchicalPipeline(
model=LinearPerSegmentModel(),
transforms=[DateFlagsTransform()],
horizon=1,
reconciliator=BottomUpReconciliator(target_level="total", source_level="market"),
),
DirectEnsemble(pipelines=[get_naive_pipeline(horizon=1), get_naive_pipeline_with_transforms(horizon=2)]),
VotingEnsemble(pipelines=[get_naive_pipeline(horizon=1), get_naive_pipeline_with_transforms(horizon=1)]),
StackingEnsemble(pipelines=[get_naive_pipeline(horizon=1), get_naive_pipeline_with_transforms(horizon=1)]),
),
)
def test_save_load(load_ts, pipeline, market_level_constant_hierarchical_ts_w_exog):
ts = market_level_constant_hierarchical_ts_w_exog
intervals_pipeline = DummyPredictionIntervals(pipeline=pipeline)
assert_pipeline_equals_loaded_original(pipeline=intervals_pipeline, ts=ts, load_ts=load_ts)
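The round-trip check delegated to `assert_pipeline_equals_loaded_original` (imported above) is not shown in this diff; a simplified sketch of the kind of comparison it performs is given below, for orientation only. The actual helper in `tests/test_pipeline/utils.py` may differ in details.

```python
# Simplified approximation of a fit -> save -> load -> forecast comparison;
# the real assert_pipeline_equals_loaded_original helper may differ in details.
import pathlib
import tempfile

import pandas as pd


def assert_roundtrip_forecasts_match(pipeline, ts):
    pipeline.fit(ts)
    forecast_before = pipeline.forecast().to_pandas()

    with tempfile.TemporaryDirectory() as tmp:
        path = pathlib.Path(tmp) / "pipeline.zip"
        pipeline.save(path)
        loaded = type(pipeline).load(path, ts=ts)

    forecast_after = loaded.forecast().to_pandas()
    pd.testing.assert_frame_equal(forecast_before, forecast_after)
```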
98 changes: 98 additions & 0 deletions tests/test_experimental/test_prediction_intervals/test_mixins.py
@@ -0,0 +1,98 @@
import pathlib
import pickle
import zipfile
from copy import deepcopy
from unittest.mock import patch

import pandas as pd
import pytest

from etna.models import NaiveModel
from etna.pipeline import Pipeline
from tests.test_experimental.test_prediction_intervals.common import DummyPredictionIntervals


@pytest.mark.parametrize("expected_filenames", ({"metadata.json", "object.pkl", "pipeline.zip"},))
def test_save(naive_pipeline_with_transforms, example_tsds, tmp_path, expected_filenames):
dummy = DummyPredictionIntervals(pipeline=naive_pipeline_with_transforms, width=4)

path = pathlib.Path(tmp_path) / "dummy.zip"

initial_dummy = deepcopy(dummy)
dummy.save(path)

with zipfile.ZipFile(path, "r") as archive:
files = archive.namelist()
assert set(files) == expected_filenames

with archive.open("object.pkl", "r") as file:
loaded_obj = pickle.load(file)
assert loaded_obj.width == dummy.width

# basic check that we didn't break dummy object itself
assert dummy.width == initial_dummy.width
assert pickle.dumps(dummy.ts) == pickle.dumps(initial_dummy.ts)
assert pickle.dumps(dummy.pipeline.model) == pickle.dumps(initial_dummy.pipeline.model)
assert pickle.dumps(dummy.pipeline.transforms) == pickle.dumps(initial_dummy.pipeline.transforms)


def test_load_file_not_found_error():
non_existent_path = pathlib.Path("archive.zip")
with pytest.raises(FileNotFoundError):
DummyPredictionIntervals.load(non_existent_path)


def test_load_with_ts(naive_pipeline_with_transforms, example_tsds, recwarn, tmp_path):
dummy = DummyPredictionIntervals(pipeline=naive_pipeline_with_transforms, width=4)

path = pathlib.Path(tmp_path) / "dummy.zip"
dummy.save(path)

loaded_obj = DummyPredictionIntervals.load(path=path, ts=example_tsds)

assert loaded_obj.width == dummy.width
assert loaded_obj.ts is not dummy.ts
pd.testing.assert_frame_equal(loaded_obj.ts.to_pandas(), example_tsds.to_pandas())
assert isinstance(loaded_obj.pipeline, Pipeline)
assert isinstance(loaded_obj.pipeline.model, NaiveModel)
assert len(loaded_obj.pipeline.transforms) == 2
assert len(recwarn) == 0


def test_load_without_ts(naive_pipeline_with_transforms, recwarn, tmp_path):
dummy = DummyPredictionIntervals(pipeline=naive_pipeline_with_transforms, width=4)

path = pathlib.Path(tmp_path) / "dummy.zip"
dummy.save(path)

loaded_obj = DummyPredictionIntervals.load(path=path)

assert loaded_obj.width == dummy.width
assert loaded_obj.ts is None
assert isinstance(loaded_obj.pipeline, Pipeline)
assert isinstance(loaded_obj.pipeline.model, NaiveModel)
assert len(loaded_obj.pipeline.transforms) == 2
assert len(recwarn) == 0


@pytest.mark.parametrize(
"save_version, load_version", [((1, 5, 0), (2, 5, 0)), ((2, 5, 0), (1, 5, 0)), ((1, 5, 0), (1, 3, 0))]
)
@patch("etna.core.mixins.get_etna_version")
def test_save_mixin_load_warning(
get_version_mock, naive_pipeline_with_transforms, save_version, load_version, tmp_path
):
dummy = DummyPredictionIntervals(pipeline=naive_pipeline_with_transforms, width=4)
path = pathlib.Path(tmp_path) / "dummy.zip"

get_version_mock.return_value = save_version
dummy.save(path)

save_version_str = ".".join(map(str, save_version))
load_version_str = ".".join(map(str, load_version))
with pytest.warns(
UserWarning,
match=f"The object was saved under etna version {save_version_str} but running version is {load_version_str}",
):
get_version_mock.return_value = load_version
_ = DummyPredictionIntervals.load(path=path)
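The warning exercised by the last test comes from the metadata that `SaveMixin` writes into the archive at save time. A small sketch of inspecting that metadata follows; the exact key names inside metadata.json are an assumption, not taken from this diff.

```python
# Sketch: inspect the metadata that drives the version-mismatch warning above.
# Key names inside metadata.json are assumed, not shown in this diff.
import json
import zipfile

with zipfile.ZipFile("dummy.zip", "r") as archive:
    with archive.open("metadata.json") as fh:
        metadata = json.load(fh)

# expected to contain at least the saving class name and the etna version
# recorded at save time, which load() compares against the running version
print(metadata)
```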
