This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

treat NaN value as a category for categorical value and temporarily use CategoricalEncoder for Neither type (#183)

* treat NaN value as a category for categorical value and temporarily use CategoricalEncoder for Neither type
jzhang-gp committed Dec 19, 2019
1 parent c013774 commit 79a96b3
Showing 9 changed files with 196 additions and 49 deletions.
2 changes: 2 additions & 0 deletions foreshadow/concrete/internals/__init__.py
@@ -17,6 +17,7 @@
from foreshadow.concrete.internals.labelencoder import ( # noqa: F403, F401
FixedLabelEncoder,
)
from foreshadow.concrete.internals.nan_filler import NaNFiller # noqa: F401
from foreshadow.concrete.internals.notransform import NoTransform # noqa: F401
from foreshadow.concrete.internals.tfidf import ( # noqa: F403, F401
FixedTfidfVectorizer,
@@ -48,4 +49,5 @@
"DropCleaner",
"StandardJsonFlattener",
"NoTransform",
"NaNFiller",
] + c_all
53 changes: 53 additions & 0 deletions foreshadow/concrete/internals/nan_filler.py
@@ -0,0 +1,53 @@
"""Fill NaNs."""

import numpy as np

from foreshadow.base import BaseEstimator, TransformerMixin
from foreshadow.utils import Constant
from foreshadow.wrapper import pandas_wrap


@pandas_wrap
class NaNFiller(BaseEstimator, TransformerMixin):
"""Fill NaN values in data."""

def __init__(self, fill_value=Constant.NAN_FILL_VALUE):
self.fill_value = fill_value

def fit(self, X, y=None):
"""Empty fit.
Args:
X: input observations
y: input labels
Returns:
self
"""
return self

def transform(self, X, y=None):
"""Fill nans in a column with defined fill_value.
Args:
X (:obj:`pandas.DataFrame`): X data
y: input labels
Returns:
:obj:`pandas.DataFrame`: Transformed data
"""
return X.fillna(self.fill_value)

def inverse_transform(self, X):
"""Reverse nan filling transform.
Args:
X (:obj:`numpy.ndarray`): Transformed X data
Returns:
:obj:`numpy.ndarray`: Original data
"""
return X.replace(to_replace=Constant.NAN_FILL_VALUE, value=np.nan)
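
A minimal usage sketch of the new filler, mirroring the test added further down; it is not part of this commit and assumes NaNFiller is importable as exported in the __init__ changes above.

    # Hypothetical usage sketch (not part of this commit).
    import numpy as np
    import pandas as pd

    from foreshadow.concrete import NaNFiller

    df = pd.DataFrame({"col": ["a", np.nan, "b"]})

    filler = NaNFiller()  # fill_value defaults to Constant.NAN_FILL_VALUE, i.e. "NaN"
    filled = filler.transform(df)  # NaN becomes the string "NaN"
    restored = filler.inverse_transform(filled)  # "NaN" becomes NaN again
    assert df.equals(restored)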
9 changes: 8 additions & 1 deletion foreshadow/config.py
@@ -25,7 +25,14 @@
# "Numeric": {"Preprocessor": ["Imputer", "Scaler"]},
"Categorical": {"Preprocessor": ["CategoricalEncoder"]},
"Text": {"Preprocessor": ["TextEncoder"]},
"Neither": {"Preprocessor": ["NeitherProcessor"]},
# "Neither": {"Preprocessor": ["NeitherProcessor"]},
    # TODO we have to use CategoricalEncoder for the Neither type temporarily,
    # as some columns of the Neither type have missing data. By default,
    # number-like columns are treated by the estimator as numerical while
    # string-like columns are treated as categorical. The Numeric
    # preprocessing fails on the second case, while the Categorical
    # preprocessing works for both cases.
    "Neither": {"Preprocessor": ["CategoricalEncoder"]},
}
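
A hedged illustration of the failure mode the TODO above describes; it is not part of this commit, and scikit-learn's StandardScaler stands in for the Numeric preprocessing.

    # Hypothetical illustration: a string-like "Neither" column with missing
    # data breaks a numeric preprocessing step, while a categorical treatment
    # (fill the NaN, then encode) handles it.
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    col = pd.DataFrame({"neither": ["x1", np.nan, "x2"]})

    try:
        StandardScaler().fit_transform(col)  # numeric path: cannot convert strings
    except ValueError as exc:
        print("Numeric preprocessing fails:", exc)

    print(pd.get_dummies(col["neither"].fillna("NaN")))  # categorical path works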


102 changes: 64 additions & 38 deletions foreshadow/smart/all.py
@@ -12,7 +12,7 @@
import pandas as pd
import scipy.stats as ss

from foreshadow.concrete import Imputer, NoTransform
from foreshadow.concrete import Imputer, NaNFiller, NoTransform
from foreshadow.concrete.externals import (
HashingEncoder,
MinMaxScaler,
@@ -97,11 +97,33 @@ def pick_transformer(self, X, y=None, **fit_params):
return distributions[best_dist]


def will_remove_uncommon(X, temp_uncommon_remover):
    """Check if the transformer will modify the data.

    Uses current settings.

    Args:
        X: input observations column
        temp_uncommon_remover: transformer

    Returns:
        (tuple) bool and category counts

    """
    X = check_df(X, single_column=True).iloc[:, 0].values
    out = temp_uncommon_remover.fit_transform(X).values.ravel()

    return (
        not (np.array_equal(X, out) | (pd.isnull(X) & pd.isnull(out))).all(),
        pd.unique(out).size,
    )
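
A small call sketch for the helper above, not part of this commit; the import paths are assumptions based on this diff.

    # Hypothetical sketch: the helper reports whether the remover would change
    # the column and how many categories remain afterwards.
    import pandas as pd

    from foreshadow.concrete import UncommonRemover  # assumed import path
    from foreshadow.smart.all import will_remove_uncommon

    column = pd.DataFrame({"cat": ["a"] * 120 + ["b"] * 79 + ["c"]})
    remover = UncommonRemover(threshold=0.01)
    changed, category_count = will_remove_uncommon(column, remover)
    print(changed, category_count)  # True once the rare "c" would be merged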


class CategoricalEncoder(SmartTransformer):
"""Automatically encode categorical features.
If there are less than 30 categories, then OneHotEncoder is used, if there
are more then HashingEncoder is used. If the columns containing a
If there are no more than 30 categories, then OneHotEncoder is used;
if there are more, then HashingEncoder is used. If the columns containing a
delimiter exceed delim_cuttoff, then a DummyEncoder is used (set the cutoff to
-1 to force). If used in a y_var context, LabelEncoder is used.
@@ -117,29 +139,6 @@ def __init__(self, unique_num_cutoff=30, merge_thresh=0.01, **kwargs):
self.merge_thresh = merge_thresh
super().__init__(**kwargs)

def will_transform(self, X, temp_ur):
"""Check if the transformer will modify the data.
Uses current settings.
Args:
X: input observations column
temp_ur: transformer
Returns:
(tuple) bool and category counts
"""
X = check_df(X, single_column=True).iloc[:, 0].values
out = temp_ur.fit_transform(X).values.ravel()

return (
not (
np.array_equal(X, out) | (pd.isnull(X) & pd.isnull(out))
).all(),
pd.unique(out).size,
)

def pick_transformer(self, X, y=None, **fit_params):
"""Determine the appropriate encoding method for an input dataset.
@@ -153,37 +152,64 @@ def pick_transformer(self, X, y=None, **fit_params):
An initialized encoding transformer
"""
# NaN is treated as a separate category. In order to take it into
# account during the encoder selection, we fill the NaN values with
# the string "NaN". The final pipeline has a pre-defined filler as
# its first step, which will take effect during the real
# transformation.
X = X.fillna("NaN")
data = X.iloc[:, 0]
unique_count = len(data.value_counts())

# TODO performance drag. We may want to apply sampling to this part
# and to the uncommon remover.
# Calculate stats for DummyEncoder
delimeters = [",", ";", "\t"]
delim_count = [
len(list(data.astype("str").str.get_dummies(sep=d)))
for d in delimeters
]
delim_diff = min(delim_count) - len(list(pd.get_dummies(data)))
temp_ur = UncommonRemover(threshold=self.merge_thresh)
will_reduce, reduce_count = self.will_transform(X, temp_ur)

# Calculate stats for UncommonRemover
temp_uncommon_remover = UncommonRemover(threshold=self.merge_thresh)
will_reduce, potential_reduced_count = will_remove_uncommon(
X, temp_uncommon_remover
)

ohe = OneHotEncoder(
return_df=True, use_cat_names=True, handle_unknown="ignore"
)

final_pipeline = SerializablePipeline(
[("fill_na", NaNFiller(fill_value="NaN"))]
)

if self.y_var:
return LabelEncoder()
elif delim_diff < 0:
delim = delimeters[delim_count.index(min(delim_count))]
return DummyEncoder(delimeter=delim)
final_pipeline.steps.append(
("dummy_encodeer", DummyEncoder(delimeter=delim))
)
elif unique_count <= self.unique_num_cutoff:
return ohe
elif (reduce_count <= self.unique_num_cutoff) and will_reduce:
return SerializablePipeline(
[
("ur", UncommonRemover(threshold=self.merge_thresh)),
("ohe", ohe),
]
final_pipeline.steps.append(("one_hot_encoder", ohe))
elif (
potential_reduced_count <= self.unique_num_cutoff
) and will_reduce:
final_pipeline.steps.append(
(
"uncommon_remover",
UncommonRemover(threshold=self.merge_thresh),
)
)
final_pipeline.steps.append(("one_hot_encoder", ohe))
else:
return HashingEncoder(n_components=30)
final_pipeline.steps.append(
("hash_encoder", HashingEncoder(n_components=30))
)

return final_pipeline
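
For context, a hedged sketch (not part of this commit) of what the reworked pick_transformer now returns, matching the updated tests below: a SerializablePipeline whose first step fills NaNs.

    # Hypothetical sketch, following the tests updated below.
    import numpy as np
    import pandas as pd

    from foreshadow.smart import CategoricalEncoder

    data = pd.DataFrame({"cat": ["a", "b", np.nan, "a", "c"]})
    pipeline = CategoricalEncoder().fit(data).transformer
    print([name for name, _ in pipeline.steps])  # e.g. ['fill_na', 'one_hot_encoder']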


class SimpleImputer(SmartTransformer):
@@ -481,6 +507,6 @@ def _can_fit(self, transformer, X, y=None, sampling_threshold=0.1):
# TODO change to ValueError once TFIDF is fixed.
# logging.warning("Error during fit: ".format(str(e)))
logging.warning(
"Revert to NoTransform for Neither " "Type temporarily."
"Revert to NoTransform for Neither type temporarily."
)
return NoTransform()
Empty file.
@@ -3,6 +3,37 @@
from foreshadow.utils.testing import get_file_path


def test_nan_filler():
    import pandas as pd
    import numpy as np

    from foreshadow.concrete import NaNFiller
    from foreshadow.utils import Constant

    data = pd.DataFrame(
        {
            "a": ["123", "a", "b", np.nan],
            "b": [np.nan, "q", "w", "v"],
            "c": [np.nan, "1", "0", "1"],
        }
    )

    check = pd.DataFrame(
        {
            "a": ["123", "a", "b", Constant.NAN_FILL_VALUE],
            "b": [Constant.NAN_FILL_VALUE, "q", "w", "v"],
            "c": [Constant.NAN_FILL_VALUE, "1", "0", "1"],
        }
    )

    filler = NaNFiller()
    df_transformed = filler.transform(data)
    assert check.equals(df_transformed)

    df_original = filler.inverse_transform(df_transformed)
    assert data.equals(df_original)


def test_dummy_encoder():
import pandas as pd

40 changes: 30 additions & 10 deletions foreshadow/tests/test_transformers/test_smart/test_smart.py
@@ -170,27 +170,35 @@ def test_smart_encoder_less_than_30_levels():

from foreshadow.smart import CategoricalEncoder
from foreshadow.concrete import OneHotEncoder
from foreshadow.concrete import NaNFiller
from foreshadow.pipeline import SerializablePipeline

np.random.seed(0)
leq_30_random_data = np.random.choice(30, size=500)
smart_coder = CategoricalEncoder()
assert isinstance(
smart_coder.fit(leq_30_random_data).transformer, OneHotEncoder
)
transformer = smart_coder.fit(leq_30_random_data).transformer
assert isinstance(transformer, SerializablePipeline)
assert isinstance(transformer.steps[0][1], NaNFiller)
assert isinstance(transformer.steps[1][1], OneHotEncoder)
assert len(transformer.steps) == 2


def test_smart_encoder_more_than_30_levels():
import numpy as np

from foreshadow.smart import CategoricalEncoder
from foreshadow.concrete import HashingEncoder
from foreshadow.concrete import NaNFiller
from foreshadow.pipeline import SerializablePipeline

np.random.seed(0)
gt_30_random_data = np.random.choice(31, size=500)
smart_coder = CategoricalEncoder()
assert isinstance(
smart_coder.fit(gt_30_random_data).transformer, HashingEncoder
)
transformer = smart_coder.fit(gt_30_random_data).transformer
assert isinstance(transformer, SerializablePipeline)
assert isinstance(transformer.steps[0][1], NaNFiller)
assert isinstance(transformer.steps[1][1], HashingEncoder)
assert len(transformer.steps) == 2


def test_smart_encoder_more_than_30_levels_that_reduces():
@@ -357,23 +365,35 @@ def test_smart_encoder_delimmited():
import pandas as pd
from foreshadow.smart import CategoricalEncoder
from foreshadow.concrete import DummyEncoder
from foreshadow.concrete import NaNFiller
from foreshadow.pipeline import SerializablePipeline

data = pd.DataFrame({"test": ["a", "a,b,c", "a,b", "a,c"]})
smart_coder = CategoricalEncoder()
assert isinstance(smart_coder.fit(data).transformer, DummyEncoder)
transformer = smart_coder.fit(data).transformer

assert isinstance(transformer, SerializablePipeline)
assert isinstance(transformer.steps[0][1], NaNFiller)
assert isinstance(transformer.steps[1][1], DummyEncoder)
assert len(transformer.steps) == 2


def test_smart_encoder_more_than_30_levels_with_overwritten_cutoff():
import numpy as np
from foreshadow.smart import CategoricalEncoder
from foreshadow.concrete import OneHotEncoder
from foreshadow.concrete import NaNFiller
from foreshadow.pipeline import SerializablePipeline

np.random.seed(0)
gt_30_random_data = np.random.choice(31, size=500)
smart_coder = CategoricalEncoder(unique_num_cutoff=35)
assert isinstance(
smart_coder.fit(gt_30_random_data).transformer, OneHotEncoder
)
transformer = smart_coder.fit(gt_30_random_data).transformer

assert isinstance(transformer, SerializablePipeline)
assert isinstance(transformer.steps[0][1], NaNFiller)
assert isinstance(transformer.steps[1][1], OneHotEncoder)
assert len(transformer.steps) == 2


def test_smart_financial_cleaner_us():
2 changes: 2 additions & 0 deletions foreshadow/utils/__init__.py
@@ -10,6 +10,7 @@
)
from foreshadow.utils.constants import (
ConfigKey,
Constant,
DefaultConfig,
EstimatorFamily,
ProblemType,
@@ -56,4 +57,5 @@
"Override",
"ConfigKey",
"DefaultConfig",
"Constant",
]
6 changes: 6 additions & 0 deletions foreshadow/utils/constants.py
@@ -28,3 +28,9 @@ class ConfigKey:

N_JOBS = "n_jobs"
PROCESSED_DATA_EXPORT_PATH = "processed_data_export_path"


class Constant:
"""General constants in Foreshadow."""

NAN_FILL_VALUE = "NaN"
