Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
Merge pull request #142 from georgianpartners/serialization
Browse files Browse the repository at this point in the history
Serialization for ColumnSharer and Fixing a bug in serializer in the recursive call
  • Loading branch information
jzhang-gp committed Aug 15, 2019
2 parents b434e89 + c7719a8 commit 5690275
Show file tree
Hide file tree
Showing 7 changed files with 163 additions and 6 deletions.
37 changes: 36 additions & 1 deletion foreshadow/columnsharer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import pprint
from collections import MutableMapping, defaultdict

from foreshadow.serializers import ConcreteSerializerMixin


# TODO: Make this multiprocess-safe using managers

Expand All @@ -12,7 +14,7 @@ class PrettyDefaultDict(defaultdict):
__repr__ = dict.__repr__


class ColumnSharer(MutableMapping):
class ColumnSharer(MutableMapping, ConcreteSerializerMixin):
"""Main cache-class to be used as single-instance to share data.
Note:
Expand Down Expand Up @@ -42,6 +44,39 @@ def __init__(self, *args, **kwargs):
lambda: False, acceptable_keys
)

def dict_serialize(self, deep=True):
    """Build the dictionary form of this columnsharer.

    Only the internal store is exported. The acceptable keys are left out
    on purpose because they are not supposed to be exposed to the user.

    Args:
        deep (bool): Part of the serializer signature; this implementation
            does not use it.

    Returns:
        dict: A single-entry dictionary mapping "store" to the
            columnsharer's own store object (the same object, not a copy).

    """
    serialized = {}
    serialized["store"] = self.store
    return serialized

@classmethod
def dict_deserialize(cls, data):
    """Rebuild a columnsharer from its dictionary form.

    Args:
        data: Dictionary whose "store" entry maps each key to the value
            that should be assigned to that key on the new instance.

    Returns:
        object: A new instance of the class, populated entry by entry
            from the store.

    """
    rebuilt = cls()
    # Assign through item-setting rather than replacing the store wholesale,
    # so every entry goes through the instance's own __setitem__.
    for key, value in data["store"].items():
        rebuilt[key] = value

    return rebuilt

def __getitem__(self, key_list):
"""Override getitem to support multi key accessing simultaneously.
Expand Down
3 changes: 2 additions & 1 deletion foreshadow/parallelprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@
_transform_one,
)

from .serializers import ConcreteSerializerMixin
from foreshadow.base import BaseEstimator


class ParallelProcessor(FeatureUnion):
class ParallelProcessor(FeatureUnion, ConcreteSerializerMixin):
"""Class to support parallel operation on dataframes.
This class functions similarly to a FeatureUnion except it divides a given
Expand Down
17 changes: 14 additions & 3 deletions foreshadow/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,10 @@ def serialize(self, method=None, **kwargs):
"""
if method is None:
method = self.DEFAULT_OPTION
if "_method" in kwargs:
method = kwargs.pop("_method")
else:
method = self.DEFAULT_OPTION

if method in self.OPTIONS:
method_func = getattr(self, method + "_serialize")
Expand Down Expand Up @@ -290,8 +293,16 @@ def dict_deserialize(cls, data):
return pickle_class(**params)
else:
# Cannot use set_params since steps is a required init arg
# for Pipelines
return cls(**params)
# for Pipelines and therefore we cannot use default
# init method (assuming no required args) to initialize
# an instance then call set_params.
if issubclass(cls, PipelineSerializerMixin):
return cls(**params)
else:
ret_tf = cls()
ret_tf.set_params(**params)

return ret_tf

def inline_serialize(self):
"""Convert transformer to hex pickle form inline in a dictionary form.
Expand Down
3 changes: 2 additions & 1 deletion foreshadow/steps/preparerstep.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from foreshadow.pipeline import DynamicPipeline

from ..columnsharer import ColumnSharer
from ..serializers import ConcreteSerializerMixin


GroupProcess = namedtuple(
Expand Down Expand Up @@ -227,7 +228,7 @@ def _batch_parallelize(column_mapping):
return steps, list(all_cols)


class PreparerStep(BaseEstimator, TransformerMixin):
class PreparerStep(BaseEstimator, TransformerMixin, ConcreteSerializerMixin):
"""Base class for any pipeline step of DataPreparer.
This class automatically wraps the defined pipeline to make it
Expand Down
76 changes: 76 additions & 0 deletions foreshadow/tests/test_core/test_column_sharer.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,82 @@ def test_column_sharer_iter(store):
assert expected == cs.store


@pytest.mark.parametrize(
    "store",
    [
        {"domain": {}, "intent": {}, "metastat": {}},
        {"domain": {"column1": [0, 1, 2]}},
        {
            "domain": {"column1": [0, 1, 2]},
            "intent": {"column1": [1, 2, 3], "column2": [1, 4, 6]},
            "metastat": {},
            "registered_key": {},
            "another_registered": {"column1": [1, 2, 3], "column2": True},
        },
    ],
)
def test_column_sharer_dict_serialize(store):
    """Test that dict_serialize returns the ColumnSharer's store content.

    Args:
        store: the internal dictionary to use.

    """
    from foreshadow.columnsharer import ColumnSharer, PrettyDefaultDict

    sharer = ColumnSharer()
    for key, columns in store.items():
        sharer[key] = columns

    # Build the expected store by hand, mirroring the nested default dicts.
    expected_store = PrettyDefaultDict(lambda: PrettyDefaultDict(lambda: None))
    for key, columns in store.items():
        if not columns:
            # An empty entry is still expected to be present under its key.
            expected_store[key] = PrettyDefaultDict(lambda: None)
            continue
        for column, value in columns.items():
            expected_store[key][column] = value

    assert {"store": expected_store} == sharer.dict_serialize(deep=True)


@pytest.mark.parametrize(
    "store",
    [
        {"domain": {}, "intent": {}, "metastat": {}},
        {"domain": {"column1": [0, 1, 2]}},
        {
            "domain": {"column1": [0, 1, 2]},
            "intent": {"column1": [1, 2, 3], "column2": [1, 4, 6]},
            "metastat": {},
            "registered_key": {},
            "another_registered": {"column1": [1, 2, 3], "column2": True},
        },
    ],
)
def test_column_sharer_dict_deserialize(store):
    """Test that a ColumnSharer survives a dict serialization round trip.

    Args:
        store: the internal dictionary to use.

    """
    from foreshadow.columnsharer import ColumnSharer

    original = ColumnSharer()
    for key, columns in store.items():
        original[key] = columns

    # Serialize with the "dict" method, then rebuild from that payload.
    payload = original.serialize(method="dict")
    round_tripped = ColumnSharer.dict_deserialize(payload)

    assert round_tripped == original


@pytest.mark.parametrize(
"key,item_to_set,expected,warning",
[
Expand Down
31 changes: 31 additions & 0 deletions foreshadow/tests/test_preparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,34 @@ def test_data_preparer_get_params(deep):
assert "reducer_kwargs" in params
assert "y_var" in params
assert "steps" in params


@pytest.mark.parametrize("cleaner_kwargs", [{}, None])
def test_data_preparer_serialization(cleaner_kwargs):
    """Placeholder for the DataPreparer serialization test.

    The test currently performs no checks; the intended scenario is kept
    below as a commented-out draft.

    Args:
        cleaner_kwargs: kwargs to CleanerMapper step

    """
    # Draft of the intended test, currently disabled:
    #
    # from foreshadow.preparer import DataPreparer
    # from foreshadow.columnsharer import ColumnSharer
    # import pandas as pd
    #
    # boston_path = get_file_path("data", "boston_housing.csv")
    # data = pd.read_csv(boston_path)
    #
    # cs = ColumnSharer()
    # dp = DataPreparer(cs, cleaner_kwargs=cleaner_kwargs)
    # dp.fit(data)
    #
    # cs.to_json("column_sharer.json", deep=True)
    # cs2 = ColumnSharer.from_json("column_sharer.json")
    #
    # assert cs == cs2

    # dp.to_json("data_preparerer_deep_true3.json", deep=True)
    # dp.to_yaml("data_preparerer_deep_true2.yaml", deep=True)

    # NOTE(review): the draft writes "..._true3.json" but reads
    # "..._true2.json" below -- confirm the intended file name before enabling.
    # dp2 = DataPreparer.from_json("data_preparerer_deep_true2.json")
2 changes: 2 additions & 0 deletions foreshadow/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ def get_transformer(class_name, source_lib=None):
"foreshadow.concrete",
"foreshadow.smart",
"foreshadow.intents",
"foreshadow.steps",
"foreshadow.parallelprocessor",
]
)

Expand Down

0 comments on commit 5690275

Please sign in to comment.