Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
fixing intent override from Categorical to Numeric issue in DataExpor…
Browse files Browse the repository at this point in the history
…tor and add more unit tests (#200)
  • Loading branch information
jzhang-gp committed Jan 16, 2020
1 parent 329137f commit 6515085
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 34 deletions.
22 changes: 14 additions & 8 deletions foreshadow/foreshadow.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,7 @@ def get_intent(self, column_name: str) -> Union[str, None]:
# been processed will be visible.
cache_manager = self.X_preparer.cache_manager
if self._has_column_in_cache_manager(column_name):
return cache_manager["intent"][column_name]
return cache_manager[AcceptedKey.INTENT][column_name]
else:
logging.info(
"No intent exists for column {}. Either the column "
Expand Down Expand Up @@ -519,7 +519,7 @@ def _has_column_in_cache_manager(self, column: str) -> Union[bool, None]:
)
return False
cache_manager = self.X_preparer.cache_manager
return True if column in cache_manager["intent"] else False
return True if column in cache_manager[AcceptedKey.INTENT] else False

def override_intent(self, column_name: str, intent: str) -> NoReturn:
"""Override the intent of a particular column.
Expand All @@ -546,10 +546,10 @@ def override_intent(self, column_name: str, intent: str) -> NoReturn:
):
raise ValueError("Invalid Column {}".format(column_name))
# Update the intent
self.X_preparer.cache_manager["override"][
self.X_preparer.cache_manager[AcceptedKey.OVERRIDE][
"_".join([Override.INTENT, column_name])
] = intent
self.X_preparer.cache_manager["intent"][column_name] = intent
self.X_preparer.cache_manager[AcceptedKey.INTENT][column_name] = intent

def configure_multiprocessing(self, n_job: int = 1) -> NoReturn:
"""Configure the multiprocessing option.
Expand All @@ -562,16 +562,22 @@ def configure_multiprocessing(self, n_job: int = 1) -> NoReturn:
ConfigKey.N_JOBS
] = n_job

def set_processed_data_export_path(self, data_path: str) -> NoReturn:
def set_processed_data_export_path(
self, data_path: str, is_train: bool
) -> NoReturn:
"""Set path to export data before feeding the data to the estimator.
Args:
data_path: the data path string
is_train: whether this is for training data
"""
self.X_preparer.cache_manager["config"][
ConfigKey.PROCESSED_DATA_EXPORT_PATH
] = data_path
key = (
ConfigKey.PROCESSED_TRAINING_DATA_EXPORT_PATH
if is_train
else ConfigKey.PROCESSED_TEST_DATA_EXPORT_PATH
)
self.X_preparer.cache_manager[AcceptedKey.CONFIG][key] = data_path

def pickle_fitted_pipeline(self, path: str) -> NoReturn:
"""Pickle the foreshadow object with the best pipeline estimator.
Expand Down
66 changes: 54 additions & 12 deletions foreshadow/steps/data_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def get_mapping(self, X): # noqa
)

def fit_transform(self, X, y=None, **fit_params):
"""Fit then transform this PreparerStep.
"""Fit then transform a dataframe.
Side effect: export the dataframe to disk as a csv file.
Expand All @@ -47,15 +47,57 @@ def fit_transform(self, X, y=None, **fit_params):
"""
Xt = super().fit_transform(X, y, **fit_params)
if (
ConfigKey.PROCESSED_DATA_EXPORT_PATH
not in self.cache_manager["config"]
):
data_path = DefaultConfig.PROCESSED_DATA_EXPORT_PATH
else:
data_path = self.cache_manager["config"][
ConfigKey.PROCESSED_DATA_EXPORT_PATH
]
Xt.to_csv(data_path, index=False)
logging.info("Exported processed data to {}".format(data_path))
self._export_data(Xt, is_train=True)
return Xt

def transform(self, X, *args, **kwargs):
    """Run the parent transform and export the result as a csv file.

    Side effect: the transformed dataframe is written to disk, using the
    export path that applies to non-training data.

    Args:
        X: input DataFrame
        *args: positional arguments forwarded to the parent .transform()
        **kwargs: keyword arguments forwarded to the parent .transform()

    Returns:
        The result of the parent .transform(), passed through.

    """
    transformed = super().transform(X, *args, **kwargs)
    # is_train=False routes the export to the test-data path.
    self._export_data(transformed, is_train=False)
    return transformed

def _handle_intent_override(self, default_parallel_process):
    """Adopt the from-scratch parallel process on an intent override.

    The data exporter involves no computation, so there is nothing to
    preserve: it simply starts over with the default parallel process.
    See the overrides of this method in the other child classes.

    Args:
        default_parallel_process: the default parallel process from scratch

    """
    self._parallel_process = default_parallel_process

def _export_data(self, X, is_train=True):
    """Write a dataframe to the resolved export path as a csv file.

    Args:
        X: the dataframe to export
        is_train: whether X is training data; selects which export path
            is used

    """
    destination = self._determine_export_path(is_train)
    X.to_csv(destination, index=False)
    logging.info("Exported processed data to {}".format(destination))

def _determine_export_path(self, is_train=True):
    """Resolve the csv export path for training or test data.

    A path stored under the matching key in the cache manager's config is
    returned when present; otherwise the matching DefaultConfig path is
    returned.

    Args:
        is_train: whether the path is for training data

    Returns:
        The export path for the processed data.

    """
    if is_train:
        config_key = ConfigKey.PROCESSED_TRAINING_DATA_EXPORT_PATH
    else:
        config_key = ConfigKey.PROCESSED_TEST_DATA_EXPORT_PATH

    config = self.cache_manager["config"]
    if config_key in config:
        return config[config_key]

    # Nothing configured for this data split: fall back to the default.
    if is_train:
        return DefaultConfig.PROCESSED_TRAINING_DATA_EXPORT_PATH
    return DefaultConfig.PROCESSED_TEST_DATA_EXPORT_PATH
36 changes: 36 additions & 0 deletions foreshadow/tests/test_foreshadow.py
Original file line number Diff line number Diff line change
Expand Up @@ -701,6 +701,7 @@ def test_core_foreshadow_example_classification():
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from foreshadow.foreshadow import Foreshadow
from foreshadow.intents import IntentType

np.random.seed(0)
iris = load_iris()
Expand All @@ -714,6 +715,10 @@ def test_core_foreshadow_example_classification():
estimator=LogisticRegression(), problem_type=ProblemType.CLASSIFICATION
)
model.fit(X_train, y_train)
assert not model.get_intent("petal width (cm)") == IntentType.NUMERIC
model.override_intent("petal width (cm)", IntentType.NUMERIC)
model.fit(X_train, y_train)

score = f1_score(y_test, model.predict(X_test), average="weighted")
print("Iris score: %f" % score)

Expand Down Expand Up @@ -1389,3 +1394,34 @@ def metric_score(self, X: pd.DataFrame) -> float:
workclass_values_transformed = list(X_train_cleaned["workclass"].unique())
for value in workclass_values_transformed:
assert not any([c.isupper() for c in value])


def test_set_processed_data_export_path():
    """Check that export paths are stored under the train/test keys."""
    from foreshadow.foreshadow import Foreshadow
    from sklearn.linear_model import LogisticRegression
    from foreshadow.utils import ConfigKey

    shadow = Foreshadow(
        estimator=LogisticRegression(), problem_type=ProblemType.CLASSIFICATION
    )

    # (path to set, is_train flag, config key expected to hold the path)
    cases = [
        (
            "datapath1.csv",
            True,
            ConfigKey.PROCESSED_TRAINING_DATA_EXPORT_PATH,
        ),
        (
            "datapath2.csv",
            False,
            ConfigKey.PROCESSED_TEST_DATA_EXPORT_PATH,
        ),
    ]
    for data_path, is_train, config_key in cases:
        shadow.set_processed_data_export_path(
            data_path=data_path, is_train=is_train
        )
        config = shadow.X_preparer.cache_manager[AcceptedKey.CONFIG]
        assert config[config_key] == data_path
88 changes: 76 additions & 12 deletions foreshadow/tests/test_steps/test_data_exporter.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,93 @@
"""Test Data Exporter"""

import pandas as pd
import pytest

from foreshadow.cachemanager import CacheManager
from foreshadow.steps import DataExporterMapper
from foreshadow.utils import AcceptedKey, ConfigKey
from foreshadow.utils import AcceptedKey, ConfigKey, DefaultConfig


def _assert_common(export_path, processed_df, cancerX_df):
    """Assert the exporter passed the data through and wrote it to disk.

    Args:
        export_path: path of the csv file the exporter wrote
        processed_df: dataframe returned by the exporter
        cancerX_df: dataframe that was given to the exporter

    """
    frames_match = pd.testing.assert_frame_equal
    frames_match(processed_df, cancerX_df)

    with open(export_path, "r") as exported_file:
        reloaded_df = pd.read_csv(exported_file)
        frames_match(processed_df, reloaded_df)


def _prepare_data_common():
    """Load the breast cancer dataset as a DataFrame.

    Returns:
        A DataFrame built from the dataset's ``data`` attribute, with the
        dataset's ``feature_names`` as columns.

    """
    from sklearn.datasets import load_breast_cancer

    dataset = load_breast_cancer()
    return pd.DataFrame(data=dataset.data, columns=dataset.feature_names)


def test_data_exporter_fit_transform(tmpdir):
export_path = tmpdir.join("data_export.csv")
export_path = tmpdir.join("data_export_training.csv")
cache_manager = CacheManager()
cache_manager[AcceptedKey.CONFIG][
ConfigKey.PROCESSED_DATA_EXPORT_PATH
ConfigKey.PROCESSED_TRAINING_DATA_EXPORT_PATH
] = export_path

exporter = DataExporterMapper(cache_manager=cache_manager)

from sklearn.datasets import load_breast_cancer
import pandas as pd
df = _prepare_data_common()
processed_df = exporter.fit_transform(X=df)
_assert_common(export_path, processed_df, df)

cancer = load_breast_cancer()
cancerX_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)

processed_df = exporter.fit_transform(X=cancerX_df)
def test_data_exporter_transform(tmpdir):
export_path = tmpdir.join("data_export_test.csv")
cache_manager = CacheManager()
cache_manager[AcceptedKey.CONFIG][
ConfigKey.PROCESSED_TEST_DATA_EXPORT_PATH
] = export_path

exporter = DataExporterMapper(cache_manager=cache_manager)

pd.testing.assert_frame_equal(processed_df, cancerX_df)
df = _prepare_data_common()
# Need to fit before transform, even though this step doesn't fit
# anything. This is to stay consistent with all other transformers.
_ = exporter.fit(X=df)
processed_df = exporter.transform(X=df)
_assert_common(export_path, processed_df, df)

with open(export_path, "r") as fopen:
exported_df = pd.read_csv(fopen)
pd.testing.assert_frame_equal(processed_df, exported_df)

@pytest.mark.parametrize("is_train", [True, False])
def test_determine_export_path_default(is_train):
    """With nothing configured, the DefaultConfig path is returned."""
    exporter = DataExporterMapper(cache_manager=CacheManager())

    if is_train:
        expected = DefaultConfig.PROCESSED_TRAINING_DATA_EXPORT_PATH
    else:
        expected = DefaultConfig.PROCESSED_TEST_DATA_EXPORT_PATH

    resolved = exporter._determine_export_path(is_train=is_train)
    assert resolved == expected


@pytest.mark.parametrize(
    "is_train, user_specified_path",
    [
        (True, "processed_training_data.csv"),
        (False, "processed_test_data.csv"),
    ],
)
def test_determine_export_path_user_specified(is_train, user_specified_path):
    """A path stored in the config is the one the exporter returns."""
    if is_train:
        key = ConfigKey.PROCESSED_TRAINING_DATA_EXPORT_PATH
    else:
        key = ConfigKey.PROCESSED_TEST_DATA_EXPORT_PATH

    cache_manager = CacheManager()
    cache_manager[AcceptedKey.CONFIG][key] = user_specified_path
    exporter = DataExporterMapper(cache_manager=cache_manager)

    resolved = exporter._determine_export_path(is_train=is_train)
    assert resolved == user_specified_path
6 changes: 4 additions & 2 deletions foreshadow/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
class DefaultConfig:
"""Constants for default configurations."""

PROCESSED_DATA_EXPORT_PATH = "processed_data.csv"
PROCESSED_TRAINING_DATA_EXPORT_PATH = "processed_training_data.csv"
PROCESSED_TEST_DATA_EXPORT_PATH = "processed_test_data.csv"
ENABLE_SAMPLING = True
SAMPLING_DATASET_SIZE_THRESHOLD = 10000
SAMPLING_WITH_REPLACEMENT = False
Expand Down Expand Up @@ -36,7 +37,8 @@ class ConfigKey:
SAMPLING_WITH_REPLACEMENT = "with_replacement"
SAMPLING_FRACTION = "sampling_fraction"
N_JOBS = "n_jobs"
PROCESSED_DATA_EXPORT_PATH = "processed_data_export_path"
PROCESSED_TRAINING_DATA_EXPORT_PATH = "processed_training_data_export_path"
PROCESSED_TEST_DATA_EXPORT_PATH = "processed_test_data_export_path"
CUSTOMIZED_CLEANERS = "customized_cleaners"


Expand Down

0 comments on commit 6515085

Please sign in to comment.