This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

fixing the missing y variable preparation step in the pickled pipeline (
jzhang-gp committed Jan 8, 2020
1 parent c2abb8b commit 8e9f1db
Showing 2 changed files with 67 additions and 1 deletion.
4 changes: 3 additions & 1 deletion foreshadow/foreshadow.py
@@ -593,7 +593,9 @@ def pickle_fitted_pipeline(self, path: str) -> NoReturn:
         self.estimator = self.estimator.estimator.fitted_pipeline_
         # updating the estimator above will not update the reference in
         # the pipeline instance as it still points to the old object.
-        self.pipeline.steps[1] = ("estimator", self.estimator)
+        # Access the estimator wrapper object and update its
+        # estimator field.
+        self.pipeline.steps[1][1].estimator = self.estimator

         with open(path, "wb") as fopen:
             pickle.dump(self.pipeline, fopen)
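
The change above works around a Python aliasing pitfall: rebinding self.estimator to the fitted TPOT pipeline does not touch the tuple already stored in self.pipeline.steps, which still references the old wrapper object, so the fix mutates the wrapper's estimator field in place. A minimal sketch of the difference, using a stand-in Wrapper class rather than the actual foreshadow/scikit-learn objects:

    # Stand-in wrapper class; only illustrates why the reference held
    # inside `steps` must be mutated in place rather than rebound.
    class Wrapper:
        def __init__(self, estimator):
            self.estimator = estimator

    wrapper = Wrapper(estimator="old_model")
    steps = [("preparer", object()), ("estimator", wrapper)]

    # Rebinding the outer name does not change what `steps` points to.
    estimator = "fitted_model"
    assert steps[1][1].estimator == "old_model"

    # Mutating the wrapper that `steps` already references does propagate.
    steps[1][1].estimator = "fitted_model"
    assert steps[1][1].estimator == "fitted_model"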
64 changes: 64 additions & 0 deletions foreshadow/tests/test_foreshadow.py
@@ -1245,3 +1245,67 @@ def test_foreshadow_integration_data_cleaner_can_drop(
# produced a reasonable score and the difference is small.
# assert score1 > 0.76 and score2 > 0.76
assertions.assertAlmostEqual(score1, score2, places=2)


@pytest.mark.parametrize(
    "filename,problem_type,X_start,X_end,target",
    [
        (
            "adult_small.csv",
            ProblemType.CLASSIFICATION,
            "age",
            "workclass",
            "class",
        )
    ],
)
def test_foreshadow_integration_adult_small_pickling_unpickling(
    filename, problem_type, X_start, X_end, target, tmpdir
):
    from foreshadow.foreshadow import Foreshadow
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split

    np.random.seed(1337)

    local_file_folder = "examples"
    data = pd.read_csv("/".join([local_file_folder, filename]))
    X_df = data.loc[:, X_start:X_end]
    y_df = data.loc[:, target]

    X_train, X_test, y_train, y_test = train_test_split(
        X_df, y_df, test_size=0.2
    )

    from foreshadow.estimators import AutoEstimator

    estimator = AutoEstimator(
        problem_type=problem_type,
        auto="tpot",
        estimator_kwargs={"max_time_mins": 1},
    )

    shadow = Foreshadow(estimator=estimator, problem_type=problem_type)

    pickled_fitted_pipeline_location = tmpdir.join("fitted_pipeline.p")
    shadow.fit(X_train, y_train)
    shadow.pickle_fitted_pipeline(pickled_fitted_pipeline_location)

    import pickle

    with open(pickled_fitted_pipeline_location, "rb") as fopen:
        pipeline = pickle.load(fopen)

    score1 = shadow.score(X_test, y_test)
    score2 = pipeline.score(X_test, y_test)

    import unittest

    assertions = unittest.TestCase("__init__")
    # Given the randomness of the TPOT algorithm and the short run time
    # configured here, there is no guarantee that performance will converge.
    # This test checks that both runs produce a reasonable score and that
    # the difference between them is small.
    # assert score1 > 0.76 and score2 > 0.76
    assertions.assertAlmostEqual(score1, score2, places=2)
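
For reference, a hypothetical usage sketch of the pickled artifact this test produces. It assumes the unpickled object exposes the standard scikit-learn predict/score API (as the score call above suggests) and that the fit-time dependencies (foreshadow, tpot, scikit-learn) are importable when loading; the file and column names below mirror the test and are illustrative only.

    import pickle

    import pandas as pd

    # Load the fitted pipeline written by pickle_fitted_pipeline.
    with open("fitted_pipeline.p", "rb") as fopen:
        pipeline = pickle.load(fopen)

    # Predict on new data with the same feature columns used at fit time.
    new_X = pd.read_csv("examples/adult_small.csv").loc[:, "age":"workclass"]
    predictions = pipeline.predict(new_X)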
