This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

fixing the missing y variable preparation step in the pickled pipeline (
jzhang-gp committed Jan 8, 2020
1 parent c2abb8b commit 8e9f1db
Showing 2 changed files with 67 additions and 1 deletion.
4 changes: 3 additions & 1 deletion foreshadow/foreshadow.py
@@ -593,7 +593,9 @@ def pickle_fitted_pipeline(self, path: str) -> NoReturn:
         self.estimator = self.estimator.estimator.fitted_pipeline_
         # updating the estimator above will not update the reference in
         # the pipeline instance as it still points to the old object.
-        self.pipeline.steps[1] = ("estimator", self.estimator)
+        # Access the estimator wrapper object and update its
+        # estimator field.
+        self.pipeline.steps[1][1].estimator = self.estimator

         with open(path, "wb") as fopen:
             pickle.dump(self.pipeline, fopen)
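
The change above works around a Python aliasing pitfall: rebinding self.estimator to the fitted TPOT pipeline does not touch the tuple already stored in self.pipeline.steps, which still references the old wrapper object, so the fix mutates the wrapper's estimator field in place. A minimal sketch of the difference, using a stand-in Wrapper class rather than the actual foreshadow/scikit-learn objects:

    # Stand-in wrapper class; only illustrates why the reference held
    # inside `steps` must be mutated in place rather than rebound.
    class Wrapper:
        def __init__(self, estimator):
            self.estimator = estimator

    wrapper = Wrapper(estimator="old_model")
    steps = [("preparer", object()), ("estimator", wrapper)]

    # Rebinding the outer name does not change what `steps` points to.
    estimator = "fitted_model"
    assert steps[1][1].estimator == "old_model"

    # Mutating the wrapper that `steps` already references does propagate.
    steps[1][1].estimator = "fitted_model"
    assert steps[1][1].estimator == "fitted_model"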
64 changes: 64 additions & 0 deletions foreshadow/tests/test_foreshadow.py
@@ -1245,3 +1245,67 @@ def test_foreshadow_integration_data_cleaner_can_drop(
# produced a reasonable score and the difference is small.
# assert score1 > 0.76 and score2 > 0.76
assertions.assertAlmostEqual(score1, score2, places=2)


@pytest.mark.parametrize(
    "filename,problem_type,X_start,X_end,target",
    [
        (
            "adult_small.csv",
            ProblemType.CLASSIFICATION,
            "age",
            "workclass",
            "class",
        )
    ],
)
def test_foreshadow_integration_adult_small_pickling_unpickling(
    filename, problem_type, X_start, X_end, target, tmpdir
):
    from foreshadow.foreshadow import Foreshadow
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split

    np.random.seed(1337)

    local_file_folder = "examples"
    data = pd.read_csv("/".join([local_file_folder, filename]))
    X_df = data.loc[:, X_start:X_end]
    y_df = data.loc[:, target]

    X_train, X_test, y_train, y_test = train_test_split(
        X_df, y_df, test_size=0.2
    )

    from foreshadow.estimators import AutoEstimator

    estimator = AutoEstimator(
        problem_type=problem_type,
        auto="tpot",
        estimator_kwargs={"max_time_mins": 1},
    )

    shadow = Foreshadow(estimator=estimator, problem_type=problem_type)

    pickled_fitted_pipeline_location = tmpdir.join("fitted_pipeline.p")
    shadow.fit(X_train, y_train)
    shadow.pickle_fitted_pipeline(pickled_fitted_pipeline_location)

    import pickle

    with open(pickled_fitted_pipeline_location, "rb") as fopen:
        pipeline = pickle.load(fopen)

    score1 = shadow.score(X_test, y_test)
    score2 = pipeline.score(X_test, y_test)

    import unittest

    assertions = unittest.TestCase("__init__")
    # Given the randomness of the TPOT algorithm and the short run time
    # configured here, there is no guarantee that performance will converge.
    # This test checks that both runs produce a reasonable score and that
    # the difference between them is small.
    # assert score1 > 0.76 and score2 > 0.76
    assertions.assertAlmostEqual(score1, score2, places=2)
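
For reference, a hypothetical usage sketch of the pickled artifact this test produces. It assumes the unpickled object exposes the standard scikit-learn predict/score API (as the score call above suggests) and that the fit-time dependencies (foreshadow, tpot, scikit-learn) are importable when loading; the file and column names below mirror the test and are illustrative only.

    import pickle

    import pandas as pd

    # Load the fitted pipeline written by pickle_fitted_pipeline.
    with open("fitted_pipeline.p", "rb") as fopen:
        pipeline = pickle.load(fopen)

    # Predict on new data with the same feature columns used at fit time.
    new_X = pd.read_csv("examples/adult_small.csv").loc[:, "age":"workclass"]
    predictions = pipeline.predict(new_X)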
