Adding Feature Reducer with NoOps placeholder.

* Adding feature reducer placeholder. * Setup intent directory and add failing data cleaner test * intermediary * Working intents * Import madness and the basic intent mapper * import madness * Add additional functionality to metric (default value and invert) * Rename _param_mapping to param_mapping * Add newintents which will be mapped to intents later * Temporarily remove Foreshadow and Preprocessor (the classes) from the global foreshadow package namespace. * Remove improper use of patch in and use pytest-mock in the code * Rename Metric, the class, to MetricWrapper * Patch bug in the way validate_wrapped worked and add test to verify functionality * Fix linting * Address CR * Add newsfragment * Updating DataPreparer Base Classes and Project Restructure * DataCleaner changes * Final Project restructure: Included: tests skipped or changed. Some left failing to change as we integrate DataPreparer. V1 components removed V2 file structure in place with proper import system (some small changes still to be made). * foreshadow.concrete import rollup complete. * Refactor FeatureReducer according to new changes in the development branch * remove pdb statements in the code * Flaked. * Flaked. * fixing setup.cfg * Adding raises for flake8. * Adding raises for flake8. * ignoring flake.
georgian-io-archive · Aug 7, 2019 · acd6542 · acd6542
1 parent ce6a439
commit acd6542
Show file tree

Hide file tree

Showing 7 changed files with 214 additions and 3 deletions.
diff --git a/foreshadow/smart/__init__.py b/foreshadow/smart/__init__.py
@@ -9,6 +9,7 @@
     TextEncoder,
 )
 from foreshadow.smart.cleaner import Cleaner  # noqa: F401
+from foreshadow.smart.feature_reducer import FeatureReducer
 from foreshadow.smart.flatten import Flatten  # noqa: F401
 from foreshadow.smart.intentresolver import IntentResolver
 from foreshadow.smart.smart import SmartTransformer  # noqa: F401
@@ -24,4 +25,5 @@
     "Flatten",
     "Cleaner",
     "IntentResolver",
+    "FeatureReducer",
 ]
diff --git a/foreshadow/smart/feature_reducer.py b/foreshadow/smart/feature_reducer.py
@@ -0,0 +1,29 @@
+"""Smart Feature Reducer for FeatureReducerMapper step."""
+from foreshadow.concrete.internals import NoTransform
+
+from .smart import SmartTransformer
+
+
+class FeatureReducer(SmartTransformer):
+    """Decide which feature reduction function should be applied."""
+
+    def __init__(
+        self,  # manually adding as otherwise get_params won't see it.
+        check_wrapped=False,
+        **kwargs
+    ):
+        super().__init__(check_wrapped=check_wrapped, **kwargs)
+
+    def pick_transformer(self, X, y=None, **fit_params):
+        """Get best transformer for a given set of columns.
+
+        Args:
+            X: input DataFrame
+            y: input labels
+            **fit_params: fit_params
+
+        Returns:
+            Best feature reduction transformer (currently always NoTransform).
+
+        """
+        return NoTransform()
diff --git a/foreshadow/steps/__init__.py b/foreshadow/steps/__init__.py
@@ -2,6 +2,7 @@
 
 from .cleaner import CleanerMapper
 from .feature_engineerer import FeatureEngineererMapper
+from .feature_reducer import FeatureReducerMapper
 from .mapper import IntentMapper
 from .preprocessor import Preprocessor
 
@@ -11,4 +12,5 @@
     "IntentMapper",
     "Preprocessor",
     "FeatureEngineererMapper",
+    "FeatureReducerMapper",
 ]
diff --git a/foreshadow/steps/feature_reducer.py b/foreshadow/steps/feature_reducer.py
@@ -0,0 +1,111 @@
+"""Feature Reducer module in Foreshadow workflow."""
+from collections import defaultdict
+
+from foreshadow.smart import FeatureReducer as _FeatureReducer
+
+from .autointentmap import AutoIntentMixin
+from .preparerstep import PreparerStep
+
+
+class FeatureReducerMapper(PreparerStep, AutoIntentMixin):
+    """FeatureReducer step in DataPreparer."""
+
+    def __init__(self, **kwargs):
+        """Define the single step for FeatureReducer, using SmartReducer.
+
+        Args:
+            **kwargs: kwargs to PreparerStep initializer.
+
+        """
+        super().__init__(**kwargs)
+
+    def get_mapping(self, X):
+        """Return the mapping of transformations for the FeatureReducer step.
+
+        Current code only supports intent-based reduction.
+
+        Args:
+            X: input DataFrame.
+
+        Returns:
+            Mapping in accordance with super.
+
+        """
+        """
+        A longer discussion. Please correct me if I'm wrong.
+        Feature reduction could look at columns in several ways:
+        1. By intent
+        2. All columns as a whole
+        3. One after the other? Probably option 1, then option 2.
+        4. Other ways? This requires more research...
+
+        Based on current implementation,
+        it is only possible to choose one mapping from option 1 or 2.
+        Option 3 may not be possible.
+
+        The reason is that we must provide a predefined column_mapping,
+        fixing the column names.
+
+        Assuming that we are using Option 3 with a column_mapping like this:
+        {
+            0: {
+                # columns with categorical intents
+                "inputs": ([col1, col2, col3,..., col9], ),
+                "steps": [SmartFeatureReducer,],
+            },
+            1: {
+                # columns with numeric intents
+                "inputs": ([col10, col11,..., col16], ),
+                "steps": [SmartFeatureReducer,],
+            },
+            2: {
+                # all columns
+                "inputs": ([col1, col2, col3,..., col16], ),
+                "steps": [SmartFeatureReducer,],
+            },
+        }
+
+        If we choose a reduction method that does not modify column names,
+        this may be fine:
+
+        Say this reduction method is applied to mapping[0] and/or
+        mapping[1], and some columns are removed.
+
+        When we process mapping[2], some of its column names will be
+        missing from the dataframe. In this case, we may just add a
+        pre-processing step that removes the missing columns from
+        mapping[2]["inputs"] and proceed as usual.
+
+        However, what if the SmartFeatureReducer decides to use a method
+        that not only reduces dimensionality but also modifies the column
+        names, like PCA? In that case, the columns in mapping[2]["inputs"]
+        may no longer be valid. We have to somehow get the latest columns
+        from the dataframe first before applying reduction on the whole df.
+
+        To achieve this, it seems that we need to modify the method
+        parallelize_smart_steps and/or the class ParallelProcessor
+        to inject this column-list refreshing operation.
+        """
+        self.check_resolve(X)
+
+        def group_by(iterable, column_sharer_key):
+            result = defaultdict(list)
+            for col in iterable:
+                result[self.column_sharer[column_sharer_key][col]].append(col)
+            return result
+
+        columns = X.columns.values.tolist()
+        columns_by_intent = list(group_by(columns, "intent").values())
+
+        """Not sure where the drop_feature functionality would apply.
+        Would a reducer produce empty columns? If yes, the concrete reducer
+        should check and apply the drop-column functionality before returning.
+        """
+
+        return self.separate_cols(
+            transformers=[
+                [_FeatureReducer(column_sharer=self.column_sharer)]
+                for col_group in columns_by_intent
+            ],
+            cols=columns_by_intent,
+        )
diff --git a/foreshadow/tests/test_transformers/test_concrete/test_feature_reducer/__init__.py b/foreshadow/tests/test_transformers/test_concrete/test_feature_reducer/__init__.py
diff --git a/...shadow/tests/test_transformers/test_concrete/test_feature_reducer/test_feature_reducer.py b/...shadow/tests/test_transformers/test_concrete/test_feature_reducer/test_feature_reducer.py
@@ -0,0 +1,65 @@
+"""Test feature reducer.py"""
+
+
+def test_feature_reducer_fit_no_ops():
+    import numpy as np
+    import pandas as pd
+
+    from foreshadow.columnsharer import ColumnSharer
+    from foreshadow.steps import FeatureReducerMapper
+
+    data = pd.DataFrame(
+        {
+            "age": [10, 20, 33, 44],
+            "weights": [20, 30, 50, 60],
+            "occupation": ["engineer", "artist", "doctor", "inspector"],
+        },
+        columns=["age", "weights", "occupation"],
+    )
+    cs = ColumnSharer()
+    cs["intent", "age"] = "Numeric"
+    cs["intent", "weights"] = "Numeric"
+    cs["intent", "occupation"] = "Categorical"
+
+    fr = FeatureReducerMapper(column_sharer=cs)
+    fr.fit(data)
+    transformed_data = fr.transform(data)
+    assert np.all(
+        np.equal(
+            data.values[data.notna()],
+            transformed_data.values[transformed_data.notna()],
+        )
+    )
+
+
+def test_feature_reducer_get_mapping_by_intent():
+    import pandas as pd
+
+    from foreshadow.columnsharer import ColumnSharer
+    from foreshadow.steps import FeatureReducerMapper
+    from foreshadow.steps.preparerstep import PreparerMapping
+    from foreshadow.smart import FeatureReducer
+
+    data = pd.DataFrame(
+        {
+            "age": [10, 20, 33, 44],
+            "weights": [20, 30, 50, 60],
+            "occupation": ["engineer", "artist", "doctor", "inspector"],
+        },
+        columns=["age", "weights", "occupation"],
+    )
+    cs = ColumnSharer()
+    cs["intent", "age"] = "Numeric"
+    cs["intent", "weights"] = "Numeric"
+    cs["intent", "occupation"] = "Categorical"
+
+    fr = FeatureReducerMapper(column_sharer=cs)
+    column_mapping = fr.get_mapping(data)
+
+    check = PreparerMapping()
+    check.add(["age", "weights"], [FeatureReducer()])
+    check.add(["occupation"], [FeatureReducer()])
+
+    for key in column_mapping.store:
+        assert key in check.store
+        assert str(column_mapping.store[key]) == str(check.store[key])
diff --git a/foreshadow/wrapper.py b/foreshadow/wrapper.py
@@ -77,11 +77,13 @@ def __init__(self, *args, **kwargs):
 
             Args:
                 *args: args to the parent constructor (shadowed transformer)
-                keep_columns: True to keep the original columns, False to not
-                name: name for new/created columns
                 **kwargs: kwargs to the parent constructor
 
-            ..#noqa: I102
+            Raises:
+                TypeError: if the init for Transformer cannot be called.
+
+            ..# noqa: I401
+            ..# noqa: I402
 
             """
             if "name" in kwargs: