Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
Adding Feature Reducer with NoOps placeholder.
Browse files Browse the repository at this point in the history
* Adding feature reducer placeholder.

* Setup intent directory and add failing data cleaner test

* intermediary

* Working intents

* Import madness and the basic intent mapper
* import madness
* Add additional functionality to metric (default value and invert)
* Rename _param_mapping to param_mapping
* Add newintents which will be mapped to intents later
* Temporarily remove Foreshadow and Preprocessor (the classes) from the
global foreshadow package namespace.
* Remove improper use of patch and use pytest-mock in the code
* Rename Metric, the class, to MetricWrapper
* Patch bug in the way validate_wrapped worked and add test to verify
functionality

* Fix linting

* Address CR

* Add newsfragment

* Updating DataPreparer Base Classes and Project Restructure

* DataCleaner changes

* Final Project restructure:
Included: tests skipped or changed. Some are left failing, to be changed as we integrate DataPreparer.
V1 components removed
V2 file structure in place with proper import system (some small changes still to be made).

* foreshadow.concrete import rollup complete.

* Refactor FeatureReducer according to new changes in the development branch

* remove pdb statements in the code

* Flaked.

* Flaked.

* fixing setup.cfg

* Adding raises for flake8.

* Adding raises for flake8.

* ignoring flake.
  • Loading branch information
jzhang-gp authored and cchoquette committed Aug 7, 2019
1 parent ce6a439 commit acd6542
Show file tree
Hide file tree
Showing 7 changed files with 214 additions and 3 deletions.
2 changes: 2 additions & 0 deletions foreshadow/smart/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
TextEncoder,
)
from foreshadow.smart.cleaner import Cleaner # noqa: F401
from foreshadow.smart.feature_reducer import FeatureReducer
from foreshadow.smart.flatten import Flatten # noqa: F401
from foreshadow.smart.intentresolver import IntentResolver
from foreshadow.smart.smart import SmartTransformer # noqa: F401
Expand All @@ -24,4 +25,5 @@
"Flatten",
"Cleaner",
"IntentResolver",
"FeatureReducer",
]
29 changes: 29 additions & 0 deletions foreshadow/smart/feature_reducer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Smart Feature Reducer for FeatureReducerMapper step."""
from foreshadow.concrete.internals import NoTransform

from .smart import SmartTransformer


class FeatureReducer(SmartTransformer):
    """Smart transformer that chooses the feature reduction to apply.

    This is currently a placeholder: ``pick_transformer`` always selects
    ``NoTransform``.
    """

    def __init__(self, check_wrapped=False, **kwargs):
        """Initialize the reducer.

        ``check_wrapped`` is spelled out in the signature instead of being
        left inside ``**kwargs`` because otherwise get_params won't see it.

        Args:
            check_wrapped: forwarded to the SmartTransformer initializer.
            **kwargs: remaining kwargs for the SmartTransformer initializer.

        """
        super().__init__(check_wrapped=check_wrapped, **kwargs)

    def pick_transformer(self, X, y=None, **fit_params):
        """Select the transformer for the given set of columns.

        None of the arguments are inspected yet; the choice is fixed.

        Args:
            X: input DataFrame
            y: input labels
            **fit_params: fit_params

        Returns:
            The chosen feature reduction transformer, currently always a
            new ``NoTransform`` instance.

        """
        return NoTransform()
2 changes: 2 additions & 0 deletions foreshadow/steps/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from .cleaner import CleanerMapper
from .feature_engineerer import FeatureEngineererMapper
from .feature_reducer import FeatureReducerMapper
from .mapper import IntentMapper
from .preprocessor import Preprocessor

Expand All @@ -11,4 +12,5 @@
"IntentMapper",
"Preprocessor",
"FeatureEngineererMapper",
"FeatureReducerMapper",
]
111 changes: 111 additions & 0 deletions foreshadow/steps/feature_reducer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
"""Feature Reducer module in Foreshadow workflow."""
from collections import defaultdict

from foreshadow.smart import FeatureReducer as _FeatureReducer

from .autointentmap import AutoIntentMixin
from .preparerstep import PreparerStep


class FeatureReducerMapper(PreparerStep, AutoIntentMixin):
    """FeatureReducer step in DataPreparer."""

    def __init__(self, **kwargs):
        """Define the single step for FeatureReducer.

        The step uses the smart ``FeatureReducer`` transformer.

        Args:
            **kwargs: kwargs to PreparerStep initializer.

        """
        super().__init__(**kwargs)

    def get_mapping(self, X):
        """Return the mapping of transformations for the FeatureReducer step.

        Current code only supports intent-based reduction: the columns of
        ``X`` are grouped by the intent recorded in the column sharer and
        each group gets its own smart FeatureReducer.

        Args:
            X: input DataFrame.

        Returns:
            Mapping in accordance with super.

        """
        # Design discussion (open questions; corrections welcome).
        #
        # Feature reduction could look at columns in several ways:
        #   1. By intent.
        #   2. All columns as a whole.
        #   3. One after the other? Probably option 1, then option 2.
        #   4. Other ways? This requires more research...
        #
        # Based on the current implementation, it is only possible to
        # choose one mapping, from option 1 or option 2. Option 3 may not
        # be possible, because we must provide a predefined column_mapping
        # that fixes the column names up front.
        #
        # Assume we used option 3 with a column_mapping like this:
        #   {
        #       0: {
        #           # columns with categorical intents
        #           "inputs": ([col1, col2, col3, ..., col9], ),
        #           "steps": [SmartFeatureReducer, ],
        #       },
        #       1: {
        #           # columns with numeric intents
        #           "inputs": ([col10, col11, ..., col16], ),
        #           "steps": [SmartFeatureReducer, ],
        #       },
        #       2: {
        #           # all columns
        #           "inputs": ([col1, col2, col3, ..., col16], ),
        #           "steps": [SmartFeatureReducer, ],
        #       },
        #   }
        #
        # If we choose a reduction method that does not modify column
        # names, this may be fine: say the method is applied to mapping[0]
        # and/or mapping[1] and some columns are removed. When we process
        # mapping[2], some of its column names are missing from the
        # dataframe. In this case, we may just do a pre-processing step to
        # remove the missing columns from mapping[2]["inputs"] and proceed
        # as usual.
        #
        # However, what if the SmartFeatureReducer decides to use a method
        # that not only reduces dimensionality but also modifies the
        # names, like PCA? In that case, the columns in
        # mapping[2]["inputs"] may not be valid. We have to somehow get the
        # latest columns from the dataframe first, before applying
        # reduction on the whole df. To achieve this, it seems that we need
        # to modify the method parallelize_smart_steps and/or the class
        # ParallelProcessor to inject this column-list refreshing
        # operation.
        self.check_resolve(X)

        def group_by(iterable, column_sharer_key):
            # Bucket each column under the value stored for it in the
            # column sharer; buckets keep first-seen order.
            result = defaultdict(list)
            for col in iterable:
                result[self.column_sharer[column_sharer_key][col]].append(col)
            return result

        columns = X.columns.values.tolist()
        columns_by_intent = list(group_by(columns, "intent").values())

        # TODO: Not sure where the drop_feature functionality would apply.
        # Would a reducer produce empty columns? If yes, the concrete
        # reducer should check and apply drop-column functionality before
        # returning.

        # One independent smart reducer (as a single-step list) per group.
        return self.separate_cols(
            transformers=[
                [_FeatureReducer(column_sharer=self.column_sharer)]
                for _ in columns_by_intent
            ],
            cols=columns_by_intent,
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""Test feature reducer.py"""


def test_feature_reducer_fit_no_ops():
    """Check that fit followed by transform leaves the values unchanged."""
    import numpy as np
    import pandas as pd

    from foreshadow.columnsharer import ColumnSharer
    from foreshadow.steps import FeatureReducerMapper

    column_names = ["age", "weights", "occupation"]
    raw = {
        "age": [10, 20, 33, 44],
        "weights": [20, 30, 50, 60],
        "occupation": ["engineer", "artist", "doctor", "inspector"],
    }
    data = pd.DataFrame(raw, columns=column_names)

    # Register an intent for every column so the step can group by intent.
    cs = ColumnSharer()
    cs["intent", "age"] = "Numeric"
    cs["intent", "weights"] = "Numeric"
    cs["intent", "occupation"] = "Categorical"

    fr = FeatureReducerMapper(column_sharer=cs)
    fr.fit(data)
    transformed_data = fr.transform(data)

    # Compare only the non-missing cells of the input and the output.
    expected = data.values[data.notna()]
    actual = transformed_data.values[transformed_data.notna()]
    assert np.all(np.equal(expected, actual))


def test_feature_reducer_get_mapping_by_intent():
    """Check that get_mapping groups the columns by their intent."""
    import pandas as pd

    from foreshadow.columnsharer import ColumnSharer
    from foreshadow.steps import FeatureReducerMapper
    from foreshadow.steps.preparerstep import PreparerMapping
    from foreshadow.smart import FeatureReducer

    data = pd.DataFrame(
        {
            "age": [10, 20, 33, 44],
            "weights": [20, 30, 50, 60],
            "occupation": ["engineer", "artist", "doctor", "inspector"],
        },
        columns=["age", "weights", "occupation"],
    )
    cs = ColumnSharer()
    cs["intent", "age"] = "Numeric"
    cs["intent", "weights"] = "Numeric"
    cs["intent", "occupation"] = "Categorical"

    fr = FeatureReducerMapper(column_sharer=cs)
    column_mapping = fr.get_mapping(data)

    # Expected mapping: one group per intent, in first-seen column order.
    check = PreparerMapping()
    check.add(["age", "weights"], [FeatureReducer()])
    check.add(["occupation"], [FeatureReducer()])

    # Compare the key sets in both directions; looping over only the actual
    # mapping would pass vacuously if it were empty or missing a key.
    assert set(column_mapping.store) == set(check.store)

    for key in column_mapping.store:
        assert key in check.store
        assert str(column_mapping.store[key]) == str(check.store[key])
8 changes: 5 additions & 3 deletions foreshadow/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,13 @@ def __init__(self, *args, **kwargs):
Args:
*args: args to the parent constructor (shadowed transformer)
keep_columns: True to keep the original columns, False to not
name: name for new/created columns
**kwargs: kwargs to the parent constructor
..#noqa: I102
Raises:
TypeError: if the init for Transformer cannot be called.
..# noqa: I401
..# noqa: I402
"""
if "name" in kwargs:
Expand Down

0 comments on commit acd6542

Please sign in to comment.