Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
Serialization and deserialization of TPOT (#188)
Browse files Browse the repository at this point in the history
* Enable pickling of fitted_pipeline from foreshadow.
  • Loading branch information
jzhang-gp committed Dec 18, 2019
1 parent 1693466 commit c013774
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 6 deletions.
28 changes: 28 additions & 0 deletions foreshadow/foreshadow.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,3 +567,31 @@ def set_processed_data_export_path(self, data_path: str) -> NoReturn:
self.X_preparer.cache_manager["config"][
ConfigKey.PROCESSED_DATA_EXPORT_PATH
] = data_path

def pickle_fitted_pipeline(self, path: str) -> NoReturn:
    """Pickle the fitted pipeline, using the best pipeline estimator.

    The object written to ``path`` is ``self.pipeline``, not the whole
    foreshadow object. When the estimator is an ``AutoEstimator`` whose
    wrapped estimator exposes a non-None ``fitted_pipeline_``, that
    fitted pipeline replaces the estimator before dumping.

    Note:
        In that case this method mutates the instance: ``self.estimator``
        and ``self.pipeline.steps[1]`` are both rebound to the fitted
        pipeline.

    Args:
        path: the pickle file path

    Raises:
        ValueError: pipeline not fitted.

    """
    if not self.has_fitted:
        logging.error("No pipeline has been fitted yet.")
        raise ValueError("The pipeline has not been fitted yet.")

    import pickle

    if isinstance(self.estimator, AutoEstimator):
        # A wrapped estimator without ``fitted_pipeline_`` (presumably a
        # non-TPOT backend -- confirm) is pickled unchanged instead of
        # failing with AttributeError.
        best_pipeline = getattr(
            self.estimator.estimator, "fitted_pipeline_", None
        )
        if best_pipeline is not None:
            self.estimator = best_pipeline
            # Rebinding self.estimator does not update the reference held
            # by the pipeline instance, which still points to the old
            # object, so the step is replaced explicitly.
            self.pipeline.steps[1] = ("estimator", self.estimator)

    with open(path, "wb") as fopen:
        pickle.dump(self.pipeline, fopen)
105 changes: 99 additions & 6 deletions foreshadow/tests/test_foreshadow.py
Original file line number Diff line number Diff line change
Expand Up @@ -1009,8 +1009,81 @@ def test_foreshadow_serialization_adults_small_classification():
assertions.assertAlmostEqual(score1, score2, places=2)


def test_foreshadow_pickling_and_unpickling_unfitted():
    """Pickling a Foreshadow that was never fitted raises ValueError."""
    from foreshadow.foreshadow import Foreshadow
    from foreshadow.estimators import AutoEstimator

    # Built but deliberately never fitted.
    unfitted_shadow = Foreshadow(
        estimator=AutoEstimator(
            problem_type=ProblemType.CLASSIFICATION,
            auto="tpot",
            estimator_kwargs={"max_time_mins": 1},
        ),
        problem_type=ProblemType.CLASSIFICATION,
    )

    with pytest.raises(ValueError):
        unfitted_shadow.pickle_fitted_pipeline("fitted_pipeline.p")


def test_foreshadow_pickling_and_unpickling_non_tpot():
    """Round-trip a fitted pipeline with a plain sklearn estimator.

    Fits a Foreshadow wrapping ``LogisticRegression`` on the breast cancer
    dataset, pickles its fitted pipeline, loads it back and checks that
    both the original object and the reloaded pipeline score above 0.9 on
    the held-out split.
    """
    import os
    import pickle
    import tempfile

    from foreshadow.foreshadow import Foreshadow
    import pandas as pd
    import numpy as np
    from sklearn.datasets import load_breast_cancer
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    np.random.seed(1337)

    cancer = load_breast_cancer()
    cancerX_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
    cancery_df = pd.DataFrame(cancer.target, columns=["target"])

    X_train, X_test, y_train, y_test = train_test_split(
        cancerX_df, cancery_df, test_size=0.2
    )

    # TODO If we use the following dataset, it may fail the test as the
    #  processed data frame still contains nan. This triggers TPOT auto
    #  imputation but since it's not part of the fitted pipeline,
    #  the unpickled foreshadow may fail on prediction. We need to make
    #  sure one of the existing PRs handles this by ensuring the data
    #  processed by foreshadow contains no nan.
    #
    # adult = pd.read_csv("examples/42.csv")
    # X_df = adult.loc[:, "date":"roots"]
    # y_df = adult.loc[:, "target"]
    #
    # X_train, X_test, y_train, y_test = train_test_split(
    #     X_df, y_df, test_size=0.2
    # )

    shadow = Foreshadow(
        estimator=LogisticRegression(), problem_type=ProblemType.CLASSIFICATION
    )

    shadow.fit(X_train, y_train)

    # Write the pickle into a throwaway directory so the test leaves no
    # artifact behind in the current working directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pickle_path = os.path.join(tmp_dir, "fitted_pipeline.p")
        shadow.pickle_fitted_pipeline(pickle_path)

        with open(pickle_path, "rb") as fopen:
            pipeline = pickle.load(fopen)

    # NOTE(review): refitting here means the test does not check that the
    # unpickled pipeline can predict as loaded -- confirm this is intended.
    pipeline.fit(X_train, y_train)

    score1 = shadow.score(X_test, y_test)
    score2 = pipeline.score(X_test, y_test)
    # The reloaded pipeline is refit above, so the two scores are not
    # required to match exactly. The test only checks that both the
    # original object and the reloaded pipeline reach a reasonable score.
    assert score1 > 0.9 and score2 > 0.9


@slow
def test_foreshadow_serialization_tpot():
def test_foreshadow_pickling_and_unpickling_tpot():
from foreshadow.foreshadow import Foreshadow
import pandas as pd
import numpy as np
Expand All @@ -1027,25 +1100,45 @@ def test_foreshadow_serialization_tpot():
cancerX_df, cancery_df, test_size=0.2
)

# TODO If we use the following dataset, it may fail the test as the
# processed data frame still contains nan. This triggers TPOT auto
# imputation but since it's not part of the fitted pipeline,
# the unpickled foreshadow may fail on prediction. We need to make sure
# one of the existing PR handles this by making sure processed data by
# foreshadow contains no nan.
#
# adult = pd.read_csv("examples/42.csv")
# X_df = adult.loc[:, "date":"roots"]
# y_df = adult.loc[:, "target"]
#
# X_train, X_test, y_train, y_test = train_test_split(
# X_df, y_df, test_size=0.2
# )

from foreshadow.estimators import AutoEstimator

estimator = AutoEstimator(
problem_type=ProblemType.CLASSIFICATION, auto="tpot"
problem_type=ProblemType.CLASSIFICATION,
auto="tpot",
estimator_kwargs={"max_time_mins": 1},
)

shadow = Foreshadow(
estimator=estimator, problem_type=ProblemType.CLASSIFICATION
)

shadow.fit(X_train, y_train)
shadow.pickle_fitted_pipeline("fitted_pipeline.p")

shadow.to_json("foreshadow_tpot.json")
import pickle

shadow2 = Foreshadow.from_json("foreshadow_tpot.json")
shadow2.fit(X_train, y_train)
with open("fitted_pipeline.p", "rb") as fopen:
pipeline = pickle.load(fopen)

pipeline.fit(X_train, y_train)

score1 = shadow.score(X_test, y_test)
score2 = shadow2.score(X_test, y_test)
score2 = pipeline.score(X_test, y_test)
# given the randomness of the tpot algorithm and the short run
# time we configured, there is no guarantee the performance can
# converge. The test here aims to evaluate if both cases have
Expand Down

0 comments on commit c013774

Please sign in to comment.