Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
Data Exporter Before Training the Estimator (#180)
Browse files Browse the repository at this point in the history
* Setting a step to export the processed data before feeding it to the estimator
  • Loading branch information
jzhang-gp committed Dec 18, 2019
1 parent bed56bf commit 9f6b737
Show file tree
Hide file tree
Showing 9 changed files with 147 additions and 1 deletion.
11 changes: 11 additions & 0 deletions foreshadow/foreshadow.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,3 +556,14 @@ def configure_multiprocessing(self, n_job: int = 1) -> NoReturn:
"""
self.X_preparer.cache_manager["config"][ConfigKey.N_JOBS] = n_job

def set_processed_data_export_path(self, data_path: str) -> None:
    """Set path to export data before feeding the data to the estimator.

    The path is stored in the ``"config"`` section of the X preparer's
    cache manager under ``ConfigKey.PROCESSED_DATA_EXPORT_PATH``.

    Args:
        data_path: the data path string

    """
    # Annotated ``-> None`` rather than ``NoReturn``: this method returns
    # normally, whereas ``NoReturn`` marks a callable that never returns.
    config = self.X_preparer.cache_manager["config"]
    config[ConfigKey.PROCESSED_DATA_EXPORT_PATH] = data_path
6 changes: 6 additions & 0 deletions foreshadow/preparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from foreshadow.smart import CategoricalEncoder
from foreshadow.steps import (
CleanerMapper,
DataExporterMapper,
FeatureSummarizerMapper,
IntentMapper,
Preprocessor,
Expand Down Expand Up @@ -79,6 +80,7 @@ def __init__(
engineerer_kwargs=None,
preprocessor_kwargs=None,
reducer_kwargs=None,
exporter_kwargs=None,
problem_type=None,
y_var=None,
**kwargs
Expand All @@ -101,6 +103,9 @@ def __init__(
# reducer_kwargs_ = _none_to_dict(
# "reducer_kwargs", reducer_kwargs, cache_manager
# )
exporter_kwargs_ = _none_to_dict(
"exporter_kwargs", exporter_kwargs, cache_manager
)
if not y_var:
steps = [
("data_cleaner", CleanerMapper(**cleaner_kwargs_)),
Expand All @@ -115,6 +120,7 @@ def __init__(
# ),
("feature_preprocessor", Preprocessor(**preprocessor_kwargs_)),
# ("feature_reducer", FeatureReducerMapper(**reducer_kwargs_)),
("feature_exporter", DataExporterMapper(**exporter_kwargs_)),
]
else:
if problem_type == ProblemType.REGRESSION:
Expand Down
2 changes: 2 additions & 0 deletions foreshadow/smart/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
TextEncoder,
)
from foreshadow.smart.cleaner import Cleaner # noqa: F401
from foreshadow.smart.data_exporter import DataExporter # noqa: F401
from foreshadow.smart.feature_engineerer import FeatureEngineerer # noqa: F401
from foreshadow.smart.feature_reducer import FeatureReducer
from foreshadow.smart.feature_summarizer import FeatureSummarizer # noqa: F401
Expand All @@ -32,4 +33,5 @@
"FeatureReducer",
"FeatureEngineerer",
"FeatureSummarizer",
"DataExporter",
]
24 changes: 24 additions & 0 deletions foreshadow/smart/data_exporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""SmartSummarizer for FeatureExporterMapper step."""
from foreshadow.concrete.internals import NoTransform
from foreshadow.smart.smart import SmartTransformer


class DataExporter(SmartTransformer):
    """Smart transformer for the data exporter step.

    Regardless of the input columns, it always selects ``NoTransform``.
    """

    def __init__(self, check_wrapped=True, **kwargs):
        """Forward every argument to the ``SmartTransformer`` initializer.

        Args:
            check_wrapped: passed through unchanged to the parent class.
            **kwargs: additional keyword arguments for the parent class.

        """
        super().__init__(check_wrapped=check_wrapped, **kwargs)

    def pick_transformer(self, X, y=None, **fit_params):
        """Select the transformer for a given set of columns.

        Args:
            X: input DataFrame (not inspected)
            y: input labels (not inspected)
            **fit_params: fit_params (not inspected)

        Returns:
            A new ``NoTransform`` instance.

        """
        passthrough = NoTransform()
        return passthrough
2 changes: 2 additions & 0 deletions foreshadow/steps/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Steps for DataPreparer object."""

from .cleaner import CleanerMapper
from .data_exporter import DataExporterMapper
from .feature_engineerer import FeatureEngineererMapper
from .feature_reducer import FeatureReducerMapper
from .feature_summarizer import FeatureSummarizerMapper
Expand All @@ -17,4 +18,5 @@
"FeatureReducerMapper",
"FeatureSummarizerMapper",
"PreparerStep",
"DataExporterMapper",
]
61 changes: 61 additions & 0 deletions foreshadow/steps/data_exporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""PrepareStep that exports the processed data before sending to Estimator."""
from foreshadow.logging import logging
from foreshadow.smart import DataExporter
from foreshadow.utils import ConfigKey, DefaultConfig

from .autointentmap import AutoIntentMixin
from .preparerstep import PreparerStep


class DataExporterMapper(PreparerStep, AutoIntentMixin):
    """Preparer step that exports the processed data to disk as a csv file.

    Args:
        **kwargs: kwargs to PreparerStep initializer.

    """

    def __init__(self, **kwargs):
        """Initialize the step by delegating to the parent initializer.

        Args:
            **kwargs: kwargs to PreparerStep initializer.

        """
        super().__init__(**kwargs)

    def get_mapping(self, X):  # noqa
        """Build the column mapping with one ``DataExporter`` per column.

        Args:
            X: input DataFrame

        Returns:
            Whatever ``self.separate_cols`` returns for these transformers
            and ``X.columns``.

        """
        # One single-element transformer list for every column of X.
        exporters = [
            [DataExporter(cache_manager=self.cache_manager)] for _ in X
        ]
        return self.separate_cols(transformers=exporters, cols=X.columns)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit then transform this PreparerStep and export the result.

        Side effect: the transformed dataframe is written to disk as a csv
        file (without the index). The destination is the
        ``ConfigKey.PROCESSED_DATA_EXPORT_PATH`` entry of the cache
        manager's ``"config"`` section when that entry exists, otherwise
        ``DefaultConfig.PROCESSED_DATA_EXPORT_PATH``.

        Args:
            X: input DataFrame
            y: input labels
            **fit_params: kwarg params to fit

        Returns:
            The result of the parent ``fit_transform``, passed through.

        """
        Xt = super().fit_transform(X, y, **fit_params)

        path_key = ConfigKey.PROCESSED_DATA_EXPORT_PATH
        data_path = (
            self.cache_manager["config"][path_key]
            if path_key in self.cache_manager["config"]
            else DefaultConfig.PROCESSED_DATA_EXPORT_PATH
        )

        Xt.to_csv(data_path, index=False)
        message = "Exported processed data to {}".format(data_path)
        logging.info(message)
        return Xt
27 changes: 27 additions & 0 deletions foreshadow/tests/test_steps/test_data_exporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Test Data Exporter"""

from foreshadow.cachemanager import CacheManager
from foreshadow.steps import DataExporterMapper
from foreshadow.utils import ConfigKey


def test_data_exporter_fit_transform():
    """Check that fit_transform passes data through and exports it as csv.

    The export goes into a temporary directory so the test leaves no
    ``data_export.csv`` artifact behind in the current working directory.
    """
    import os
    import tempfile

    import pandas as pd
    from sklearn.datasets import load_breast_cancer

    cancer = load_breast_cancer()
    cancerX_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)

    with tempfile.TemporaryDirectory() as export_dir:
        export_path = os.path.join(export_dir, "data_export.csv")

        cache_manager = CacheManager()
        config = cache_manager["config"]
        config[ConfigKey.PROCESSED_DATA_EXPORT_PATH] = export_path

        exporter = DataExporterMapper(cache_manager=cache_manager)
        processed_df = exporter.fit_transform(X=cancerX_df)

        # Read the export back before the temporary directory is removed.
        exported_df = pd.read_csv(export_path)

    # The step must not modify the data ...
    pd.testing.assert_frame_equal(processed_df, cancerX_df)
    # ... and the file on disk must round-trip to the same frame.
    pd.testing.assert_frame_equal(processed_df, exported_df)
8 changes: 7 additions & 1 deletion foreshadow/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@
get_config_path,
get_transformer,
)
from foreshadow.utils.constants import ConfigKey, EstimatorFamily, ProblemType
from foreshadow.utils.constants import (
ConfigKey,
DefaultConfig,
EstimatorFamily,
ProblemType,
)
from foreshadow.utils.data_summary import (
get_outliers,
mode_freq,
Expand Down Expand Up @@ -50,4 +55,5 @@
"EstimatorFamily",
"Override",
"ConfigKey",
"DefaultConfig",
]
7 changes: 7 additions & 0 deletions foreshadow/utils/constants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
"""Classes that hold constants in foreshadow."""


class DefaultConfig:
    """Default values for foreshadow configuration entries."""

    # File name used for the processed-data export when no export path
    # has been configured.
    PROCESSED_DATA_EXPORT_PATH = "processed_data.csv"


class ProblemType:
"""Constants for problem types."""

Expand All @@ -21,3 +27,4 @@ class ConfigKey:
"""Constants of configuration key in foreshadow."""

N_JOBS = "n_jobs"
PROCESSED_DATA_EXPORT_PATH = "processed_data_export_path"

0 comments on commit 9f6b737

Please sign in to comment.