fixing intent override from Categorical to Numeric issue in DataExporter and add more unit tests (#200)
georgian-io-archive · Jan 16, 2020 · 6515085 · 6515085
1 parent 329137f
commit 6515085
Show file tree

Hide file tree

Showing 5 changed files with 184 additions and 34 deletions.
diff --git a/foreshadow/foreshadow.py b/foreshadow/foreshadow.py
@@ -478,7 +478,7 @@ def get_intent(self, column_name: str) -> Union[str, None]:
         # been processed will be visible.
         cache_manager = self.X_preparer.cache_manager
         if self._has_column_in_cache_manager(column_name):
-            return cache_manager["intent"][column_name]
+            return cache_manager[AcceptedKey.INTENT][column_name]
         else:
             logging.info(
                 "No intent exists for column {}. Either the column "
@@ -519,7 +519,7 @@ def _has_column_in_cache_manager(self, column: str) -> Union[bool, None]:
             )
             return False
         cache_manager = self.X_preparer.cache_manager
-        return True if column in cache_manager["intent"] else False
+        return True if column in cache_manager[AcceptedKey.INTENT] else False
 
     def override_intent(self, column_name: str, intent: str) -> NoReturn:
         """Override the intent of a particular column.
@@ -546,10 +546,10 @@ def override_intent(self, column_name: str, intent: str) -> NoReturn:
         ):
             raise ValueError("Invalid Column {}".format(column_name))
         # Update the intent
-        self.X_preparer.cache_manager["override"][
+        self.X_preparer.cache_manager[AcceptedKey.OVERRIDE][
             "_".join([Override.INTENT, column_name])
         ] = intent
-        self.X_preparer.cache_manager["intent"][column_name] = intent
+        self.X_preparer.cache_manager[AcceptedKey.INTENT][column_name] = intent
 
     def configure_multiprocessing(self, n_job: int = 1) -> NoReturn:
         """Configure the multiprocessing option.
@@ -562,16 +562,22 @@ def configure_multiprocessing(self, n_job: int = 1) -> NoReturn:
             ConfigKey.N_JOBS
         ] = n_job
 
-    def set_processed_data_export_path(self, data_path: str) -> NoReturn:
+    def set_processed_data_export_path(
+        self, data_path: str, is_train: bool
+    ) -> NoReturn:
         """Set path to export data before feeding the data to the estimator.
 
         Args:
             data_path: the data path string
+            is_train: whether this is for training data
 
         """
-        self.X_preparer.cache_manager["config"][
-            ConfigKey.PROCESSED_DATA_EXPORT_PATH
-        ] = data_path
+        key = (
+            ConfigKey.PROCESSED_TRAINING_DATA_EXPORT_PATH
+            if is_train
+            else ConfigKey.PROCESSED_TEST_DATA_EXPORT_PATH
+        )
+        self.X_preparer.cache_manager[AcceptedKey.CONFIG][key] = data_path
 
     def pickle_fitted_pipeline(self, path: str) -> NoReturn:
         """Pickle the foreshadow object with the best pipeline estimator.

diff --git a/foreshadow/steps/data_exporter.py b/foreshadow/steps/data_exporter.py
@@ -33,7 +33,7 @@ def get_mapping(self, X):  # noqa
         )
 
     def fit_transform(self, X, y=None, **fit_params):
-        """Fit then transform this PreparerStep.
+        """Fit then transform a dataframe.
 
         Side-affect: export the dataframe to disk as a csv file.
 
@@ -47,15 +47,57 @@ def fit_transform(self, X, y=None, **fit_params):
 
         """
         Xt = super().fit_transform(X, y, **fit_params)
-        if (
-            ConfigKey.PROCESSED_DATA_EXPORT_PATH
-            not in self.cache_manager["config"]
-        ):
-            data_path = DefaultConfig.PROCESSED_DATA_EXPORT_PATH
-        else:
-            data_path = self.cache_manager["config"][
-                ConfigKey.PROCESSED_DATA_EXPORT_PATH
-            ]
-        Xt.to_csv(data_path, index=False)
-        logging.info("Exported processed data to {}".format(data_path))
+        self._export_data(Xt, is_train=True)
+        return Xt
+
+    def transform(self, X, *args, **kwargs):
+        """Transform a dataframe.
+
+        Side-affect: export the dataframe to disk as a csv file.
+
+        Args:
+            X: input DataFrame
+            *args: args to .transform()
+            **kwargs: kwargs to .transform()
+
+        Returns:
+            Result from .transform(), pass through.
+
+        """
+        Xt = super().transform(X, *args, **kwargs)
+        self._export_data(Xt, is_train=False)
         return Xt
+
+    def _handle_intent_override(self, default_parallel_process):
+        """Handle intent override and see override in the child classes.
+
+        For the data exporter, it should just start from scratch as there is no
+        computation involved anyway.
+
+        Args:
+            default_parallel_process: the default parallel process from scratch
+
+        """
+        self._parallel_process = default_parallel_process
+
+    def _export_data(self, X, is_train=True):
+        data_path = self._determine_export_path(is_train)
+        X.to_csv(data_path, index=False)
+        logging.info("Exported processed data to {}".format(data_path))
+
+    def _determine_export_path(self, is_train=True):
+        key_to_check = (
+            ConfigKey.PROCESSED_TRAINING_DATA_EXPORT_PATH
+            if is_train
+            else ConfigKey.PROCESSED_TEST_DATA_EXPORT_PATH
+        )
+
+        if key_to_check not in self.cache_manager["config"]:
+            data_path = (
+                DefaultConfig.PROCESSED_TRAINING_DATA_EXPORT_PATH
+                if is_train
+                else DefaultConfig.PROCESSED_TEST_DATA_EXPORT_PATH
+            )
+        else:
+            data_path = self.cache_manager["config"][key_to_check]
+        return data_path
diff --git a/foreshadow/tests/test_foreshadow.py b/foreshadow/tests/test_foreshadow.py
@@ -701,6 +701,7 @@ def test_core_foreshadow_example_classification():
     from sklearn.metrics import f1_score
     from sklearn.model_selection import train_test_split
     from foreshadow.foreshadow import Foreshadow
+    from foreshadow.intents import IntentType
 
     np.random.seed(0)
     iris = load_iris()
@@ -714,6 +715,10 @@ def test_core_foreshadow_example_classification():
         estimator=LogisticRegression(), problem_type=ProblemType.CLASSIFICATION
     )
     model.fit(X_train, y_train)
+    assert not model.get_intent("petal width (cm)") == IntentType.NUMERIC
+    model.override_intent("petal width (cm)", IntentType.NUMERIC)
+    model.fit(X_train, y_train)
+
     score = f1_score(y_test, model.predict(X_test), average="weighted")
     print("Iris score: %f" % score)
 
@@ -1389,3 +1394,34 @@ def metric_score(self, X: pd.DataFrame) -> float:
     workclass_values_transformed = list(X_train_cleaned["workclass"].unique())
     for value in workclass_values_transformed:
         assert not any([c.isupper() for c in value])
+
+
+def test_set_processed_data_export_path():
+    from foreshadow.foreshadow import Foreshadow
+    from sklearn.linear_model import LogisticRegression
+    from foreshadow.utils import ConfigKey
+
+    shadow = Foreshadow(
+        estimator=LogisticRegression(), problem_type=ProblemType.CLASSIFICATION
+    )
+    processed_training_data_path = "datapath1.csv"
+    shadow.set_processed_data_export_path(
+        data_path=processed_training_data_path, is_train=True
+    )
+    assert (
+        shadow.X_preparer.cache_manager[AcceptedKey.CONFIG][
+            ConfigKey.PROCESSED_TRAINING_DATA_EXPORT_PATH
+        ]
+        == processed_training_data_path
+    )
+
+    processed_test_data_path = "datapath2.csv"
+    shadow.set_processed_data_export_path(
+        data_path=processed_test_data_path, is_train=False
+    )
+    assert (
+        shadow.X_preparer.cache_manager[AcceptedKey.CONFIG][
+            ConfigKey.PROCESSED_TEST_DATA_EXPORT_PATH
+        ]
+        == processed_test_data_path
+    )
diff --git a/foreshadow/tests/test_steps/test_data_exporter.py b/foreshadow/tests/test_steps/test_data_exporter.py
@@ -1,29 +1,93 @@
 """Test Data Exporter"""
 
+import pandas as pd
+import pytest
+
 from foreshadow.cachemanager import CacheManager
 from foreshadow.steps import DataExporterMapper
-from foreshadow.utils import AcceptedKey, ConfigKey
+from foreshadow.utils import AcceptedKey, ConfigKey, DefaultConfig
+
+
+def _assert_common(export_path, processed_df, cancerX_df):
+    pd.testing.assert_frame_equal(processed_df, cancerX_df)
+
+    with open(export_path, "r") as fopen:
+        exported_df = pd.read_csv(fopen)
+        pd.testing.assert_frame_equal(processed_df, exported_df)
+
+
+def _prepare_data_common():
+    from sklearn.datasets import load_breast_cancer
+
+    cancer = load_breast_cancer()
+    return pd.DataFrame(cancer.data, columns=cancer.feature_names)
 
 
 def test_data_exporter_fit_transform(tmpdir):
-    export_path = tmpdir.join("data_export.csv")
+    export_path = tmpdir.join("data_export_training.csv")
     cache_manager = CacheManager()
     cache_manager[AcceptedKey.CONFIG][
-        ConfigKey.PROCESSED_DATA_EXPORT_PATH
+        ConfigKey.PROCESSED_TRAINING_DATA_EXPORT_PATH
     ] = export_path
 
     exporter = DataExporterMapper(cache_manager=cache_manager)
 
-    from sklearn.datasets import load_breast_cancer
-    import pandas as pd
+    df = _prepare_data_common()
+    processed_df = exporter.fit_transform(X=df)
+    _assert_common(export_path, processed_df, df)
 
-    cancer = load_breast_cancer()
-    cancerX_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
 
-    processed_df = exporter.fit_transform(X=cancerX_df)
+def test_data_exporter_transform(tmpdir):
+    export_path = tmpdir.join("data_export_test.csv")
+    cache_manager = CacheManager()
+    cache_manager[AcceptedKey.CONFIG][
+        ConfigKey.PROCESSED_TEST_DATA_EXPORT_PATH
+    ] = export_path
+
+    exporter = DataExporterMapper(cache_manager=cache_manager)
 
-    pd.testing.assert_frame_equal(processed_df, cancerX_df)
+    df = _prepare_data_common()
+    # Need to fit before transform, even though this step doesn't fit
+    # anything. This is to stay consistent with all other transformers.
+    _ = exporter.fit(X=df)
+    processed_df = exporter.transform(X=df)
+    _assert_common(export_path, processed_df, df)
 
-    with open(export_path, "r") as fopen:
-        exported_df = pd.read_csv(fopen)
-        pd.testing.assert_frame_equal(processed_df, exported_df)
+
+@pytest.mark.parametrize("is_train", [True, False])
+def test_determine_export_path_default(is_train):
+    cache_manager = CacheManager()
+    exporter = DataExporterMapper(cache_manager=cache_manager)
+
+    data_path = exporter._determine_export_path(is_train=is_train)
+    expected_data_path = (
+        DefaultConfig.PROCESSED_TRAINING_DATA_EXPORT_PATH
+        if is_train
+        else DefaultConfig.PROCESSED_TEST_DATA_EXPORT_PATH
+    )
+    assert data_path == expected_data_path
+
+
+@pytest.mark.parametrize(
+    "is_train, user_specified_path",
+    [
+        (True, "processed_training_data.csv"),
+        (False, "processed_test_data.csv"),
+    ],
+)
+def test_determine_export_path_user_specified(is_train, user_specified_path):
+    cache_manager = CacheManager()
+    key = (
+        ConfigKey.PROCESSED_TRAINING_DATA_EXPORT_PATH
+        if is_train
+        else ConfigKey.PROCESSED_TEST_DATA_EXPORT_PATH
+    )
+
+    cache_manager[AcceptedKey.CONFIG][key] = user_specified_path
+
+    exporter = DataExporterMapper(cache_manager=cache_manager)
+
+    data_path = exporter._determine_export_path(is_train=is_train)
+    expected_data_path = user_specified_path
+
+    assert data_path == expected_data_path
diff --git a/foreshadow/utils/constants.py b/foreshadow/utils/constants.py
@@ -4,7 +4,8 @@
 class DefaultConfig:
     """Constants for default configurations."""
 
-    PROCESSED_DATA_EXPORT_PATH = "processed_data.csv"
+    PROCESSED_TRAINING_DATA_EXPORT_PATH = "processed_training_data.csv"
+    PROCESSED_TEST_DATA_EXPORT_PATH = "processed_test_data.csv"
     ENABLE_SAMPLING = True
     SAMPLING_DATASET_SIZE_THRESHOLD = 10000
     SAMPLING_WITH_REPLACEMENT = False
@@ -36,7 +37,8 @@ class ConfigKey:
     SAMPLING_WITH_REPLACEMENT = "with_replacement"
     SAMPLING_FRACTION = "sampling_fraction"
     N_JOBS = "n_jobs"
-    PROCESSED_DATA_EXPORT_PATH = "processed_data_export_path"
+    PROCESSED_TRAINING_DATA_EXPORT_PATH = "processed_training_data_export_path"
+    PROCESSED_TEST_DATA_EXPORT_PATH = "processed_test_data_export_path"
     CUSTOMIZED_CLEANERS = "customized_cleaners"