Serialization and deserialization of TPOT (#188)

* Enable pickling of fitted_pipeline from foreshadow.
georgian-io-archive · Dec 18, 2019 · c013774 · c013774
1 parent 1693466
commit c013774
Show file tree

Hide file tree

Showing 2 changed files with 127 additions and 6 deletions.
diff --git a/foreshadow/foreshadow.py b/foreshadow/foreshadow.py
@@ -567,3 +567,31 @@ def set_processed_data_export_path(self, data_path: str) -> NoReturn:
         self.X_preparer.cache_manager["config"][
             ConfigKey.PROCESSED_DATA_EXPORT_PATH
         ] = data_path
+
+    def pickle_fitted_pipeline(self, path: str) -> NoReturn:
+        """Pickle the fitted pipeline with the best pipeline estimator.
+
+        Args:
+            path: the pickle file path
+
+        Raises:
+            ValueError: pipeline not fitted.
+
+        """
+        if not self.has_fitted:
+            logging.error("No pipeline has been fitted yet.")
+            raise ValueError("The pipeline has not been fitted yet.")
+
+        import pickle
+
+        if (
+            isinstance(self.estimator, AutoEstimator)
+            and self.estimator.estimator.fitted_pipeline_ is not None
+        ):
+            self.estimator = self.estimator.estimator.fitted_pipeline_
+            # updating the estimator above will not update the reference in
+            # the pipeline instance as it still points to the old object.
+            self.pipeline.steps[1] = ("estimator", self.estimator)
+
+        with open(path, "wb") as fopen:
+            pickle.dump(self.pipeline, fopen)
diff --git a/foreshadow/tests/test_foreshadow.py b/foreshadow/tests/test_foreshadow.py
@@ -1009,8 +1009,81 @@ def test_foreshadow_serialization_adults_small_classification():
     assertions.assertAlmostEqual(score1, score2, places=2)
 
 
+def test_foreshadow_pickling_and_unpickling_unfitted():
+    from foreshadow.foreshadow import Foreshadow
+    from foreshadow.estimators import AutoEstimator
+
+    estimator = AutoEstimator(
+        problem_type=ProblemType.CLASSIFICATION,
+        auto="tpot",
+        estimator_kwargs={"max_time_mins": 1},
+    )
+    shadow = Foreshadow(
+        estimator=estimator, problem_type=ProblemType.CLASSIFICATION
+    )
+    with pytest.raises(ValueError):
+        shadow.pickle_fitted_pipeline("fitted_pipeline.p")
+
+
+def test_foreshadow_pickling_and_unpickling_non_tpot():
+    from foreshadow.foreshadow import Foreshadow
+    import pandas as pd
+    import numpy as np
+    from sklearn.datasets import load_breast_cancer
+    from sklearn.model_selection import train_test_split
+
+    np.random.seed(1337)
+
+    cancer = load_breast_cancer()
+    cancerX_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
+    cancery_df = pd.DataFrame(cancer.target, columns=["target"])
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        cancerX_df, cancery_df, test_size=0.2
+    )
+
+    # TODO If we use the following dataset, the test may fail because the
+    #   processed data frame still contains NaN. This triggers TPOT's auto
+    #   imputation, but since that is not part of the fitted pipeline,
+    #   the unpickled pipeline may fail on prediction. We need to make sure
+    #   one of the existing PRs handles this by ensuring that the data
+    #   processed by foreshadow contains no NaN.
+    #
+    # adult = pd.read_csv("examples/42.csv")
+    # X_df = adult.loc[:, "date":"roots"]
+    # y_df = adult.loc[:, "target"]
+    #
+    # X_train, X_test, y_train, y_test = train_test_split(
+    #     X_df, y_df, test_size=0.2
+    # )
+
+    from sklearn.linear_model import LogisticRegression
+
+    shadow = Foreshadow(
+        estimator=LogisticRegression(), problem_type=ProblemType.CLASSIFICATION
+    )
+
+    shadow.fit(X_train, y_train)
+    shadow.pickle_fitted_pipeline("fitted_pipeline.p")
+
+    import pickle
+
+    with open("fitted_pipeline.p", "rb") as fopen:
+        pipeline = pickle.load(fopen)
+
+    pipeline.fit(X_train, y_train)
+
+    score1 = shadow.score(X_test, y_test)
+    score2 = pipeline.score(X_test, y_test)
+    # the unpickled pipeline is refit on the same training data, so
+    # the two scores are not required to be identical. The test here
+    # only checks that both the original foreshadow object and the
+    # unpickled pipeline produce a reasonable score.
+    assert score1 > 0.9 and score2 > 0.9
+
+
 @slow
-def test_foreshadow_serialization_tpot():
+def test_foreshadow_pickling_and_unpickling_tpot():
     from foreshadow.foreshadow import Foreshadow
     import pandas as pd
     import numpy as np
@@ -1027,25 +1100,45 @@ def test_foreshadow_serialization_tpot():
         cancerX_df, cancery_df, test_size=0.2
     )
 
+    # TODO If we use the following dataset, the test may fail because the
+    #   processed data frame still contains NaN. This triggers TPOT's auto
+    #   imputation, but since that is not part of the fitted pipeline,
+    #   the unpickled pipeline may fail on prediction. We need to make sure
+    #   one of the existing PRs handles this by ensuring that the data
+    #   processed by foreshadow contains no NaN.
+    #
+    # adult = pd.read_csv("examples/42.csv")
+    # X_df = adult.loc[:, "date":"roots"]
+    # y_df = adult.loc[:, "target"]
+    #
+    # X_train, X_test, y_train, y_test = train_test_split(
+    #     X_df, y_df, test_size=0.2
+    # )
+
     from foreshadow.estimators import AutoEstimator
 
     estimator = AutoEstimator(
-        problem_type=ProblemType.CLASSIFICATION, auto="tpot"
+        problem_type=ProblemType.CLASSIFICATION,
+        auto="tpot",
+        estimator_kwargs={"max_time_mins": 1},
     )
 
     shadow = Foreshadow(
         estimator=estimator, problem_type=ProblemType.CLASSIFICATION
     )
 
     shadow.fit(X_train, y_train)
+    shadow.pickle_fitted_pipeline("fitted_pipeline.p")
 
-    shadow.to_json("foreshadow_tpot.json")
+    import pickle
 
-    shadow2 = Foreshadow.from_json("foreshadow_tpot.json")
-    shadow2.fit(X_train, y_train)
+    with open("fitted_pipeline.p", "rb") as fopen:
+        pipeline = pickle.load(fopen)
+
+    pipeline.fit(X_train, y_train)
 
     score1 = shadow.score(X_test, y_test)
-    score2 = shadow2.score(X_test, y_test)
+    score2 = pipeline.score(X_test, y_test)
     # given the randomness of the tpot algorithm and the short run
     # time we configured, there is no guarantee the performance can
     # converge. The test here aims to evaluate if both cases have