This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Handling y-variables in preprocessor and other relevant updates (#21)
* Add y-variable support to intents and preprocessors and change intent pipelines to templates
* Simplify Intent validity testing and remove the unused dtype field
* Update foreshadow to work with y_vars, add smart-transformer y_var processing, update the smart Encoder, and patch a few other bugs
* Make nearly all transformers invertible
* Add named tuple support for intent pipelines
* Address other CR requests and reformat code
adithyabsk committed Dec 6, 2018
1 parent bacdf5a commit 33eee1c
Showing 20 changed files with 556 additions and 151 deletions.
2 changes: 1 addition & 1 deletion foreshadow/foreshadow.py
@@ -99,7 +99,7 @@ def y_preprocessor(self, yp):
else:
raise ValueError("Invalid value passed as y_preprocessor")
else:
self._y_preprocessor = Preprocessor()
self._y_preprocessor = Preprocessor(y_var=True)

@property
def estimator(self):
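In short, when no y_preprocessor is supplied, Foreshadow now falls back to a response-variable-aware Preprocessor. A minimal sketch of what that default means (import path taken from this diff; the downstream effect is shown in preprocessor.py further below):

from foreshadow.preprocessor import Preprocessor

# What the fallback branch above now constructs:
yp = Preprocessor(y_var=True)

# The flag is stored on the instance and later selects the y-side pipelines,
# e.g. via intent.single_pipeline(self.y_var) in Preprocessor._map_pipelines.
assert yp.y_var is True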
81 changes: 42 additions & 39 deletions foreshadow/intents/base.py
@@ -3,8 +3,17 @@
"""

from abc import abstractmethod
from collections import namedtuple
from functools import wraps

# Must be defined before the registry import below; registry.py imports these names from this module
PipelineTemplateEntry = namedtuple(
"PipelineTemplateEntry", ["transformer_name", "transformer_entry", "y_var"]
)


TransformerEntry = namedtuple("TransformerEntry", ["transformer", "args_dict"])

from .registry import _IntentRegistry, registry_eval


@@ -52,20 +61,31 @@ class BaseIntent(metaclass=_IntentRegistry):
"""

dtype = None
"""Data type of column required for this intent to match (not implemented)"""

children = None
"""More-specific intents that require this intent to match to be
considered."""

single_pipeline = None
"""Single pipeline of smart transformers that affect a single column in
in an intent"""
single_pipeline_template = None
"""A template for single pipelines of smart transformers that affect a
single column in an intent
The template needs an additional boolean at the end of the tuple that
determines whether the transformation can be applied to response
variables.
Example: single_pipeline_template = [
('t1', Transformer1, False),
('t2', (Transformer2, {'arg1': True}), True),
('t3', Transformer1, True),
]
"""

multi_pipeline = None
"""Multi pipeline of smart transformers that affect multiple columns in
an intent"""
multi_pipeline_template = None
"""A template for multi pipelines of smart transformers that affect multiple
columns in an intent
See single_pipeline_template for an example definition
"""

@classmethod
@check_base
@@ -124,36 +144,19 @@ def is_intent(cls, df):
pass # pragma: no cover

@classmethod
def _check_required_class_attributes(cls):
def _check_intent(cls):
"""Validate class variables are setup properly"""

not_implemented = lambda x, y: "Subclass must define {} class attribute.\n{}".format(
x, y
not_implemented = lambda v, m: "Subclass must define {} class attribute.\n{}".format(
v, m
)
if cls.dtype is None:
raise NotImplementedError(
not_implemented(
"cls.dtype", "This attribute should define the dtype of the intent."
define_attrs = [
"children",
"single_pipeline_template",
"multi_pipeline_template",
]
# Check that intent attrs are defined
for a in define_attrs:
if getattr(cls, a) is None:
raise NotImplementedError(
not_implemented(a, "Developers please see the documentation.")
)
)
elif cls.children is None:
raise NotImplementedError(
not_implemented(
"cls.children",
"This attribute should define the children of the intent.",
)
)
elif cls.single_pipeline is None:
raise NotImplementedError(
not_implemented(
"cls.single_pipeline",
"This attribute should define the transformers for a single pipeline",
)
)
elif cls.multi_pipeline is None:
raise NotImplementedError(
not_implemented(
"cls.multi_pipeline",
"This attribute should define the transformers for a multi pipeline",
)
)
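To make the new template fields concrete, here is a minimal sketch of the shapes a subclass supplies; it mirrors the NumericIntent definition in general.py below, and the empty kwargs dict passed to TransformerEntry is purely illustrative:

from foreshadow.intents.base import PipelineTemplateEntry, TransformerEntry
from foreshadow.transformers.internals import DropFeature
from foreshadow.transformers.smart import Scaler

# Each entry is (step name, transformer class or TransformerEntry, y_var flag);
# the trailing boolean marks steps that may also run on a response variable.
single_pipeline_template = [
    PipelineTemplateEntry("dropper", DropFeature, False),
    PipelineTemplateEntry("scaler", TransformerEntry(Scaler, {}), True),
]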
34 changes: 15 additions & 19 deletions foreshadow/intents/general.py
@@ -5,7 +5,7 @@
import pandas as pd
import numpy as np

from .base import BaseIntent
from .base import BaseIntent, PipelineTemplateEntry, TransformerEntry

from ..transformers.internals import DropFeature
from ..transformers.smart import SimpleImputer, MultiImputer, Scaler, Encoder
@@ -19,16 +19,15 @@ class GenericIntent(BaseIntent):
"""

dtype = "str"
"""Matches to string dtypes (not implemented)"""

children = ["NumericIntent", "CategoricalIntent"]
"""Matches to CategoricalIntent over NumericIntent"""

single_pipeline = []
single_pipeline_template = []
"""No transformers"""

multi_pipeline = [("multi_impute", MultiImputer())]
multi_pipeline_template = [
PipelineTemplateEntry("multi_impute", MultiImputer, False)
]
"""Performs multi imputation over the entire DataFrame"""

@classmethod
@@ -44,20 +43,17 @@ class NumericIntent(GenericIntent):
"""

dtype = "float"
"""Matches to float dtypes (not implemented)"""

children = []
"""No children"""

single_pipeline = [
("dropper", DropFeature()),
("simple_imputer", SimpleImputer()),
("scaler", Scaler()),
single_pipeline_template = [
PipelineTemplateEntry("dropper", DropFeature, False),
PipelineTemplateEntry("simple_imputer", SimpleImputer, False),
PipelineTemplateEntry("scaler", Scaler, True),
]
"""Performs imputation and scaling using Smart Transformers"""

multi_pipeline = []
multi_pipeline_template = []
"""No multi pipeline"""

@classmethod
@@ -78,16 +74,16 @@ class CategoricalIntent(GenericIntent):
"""

dtype = "int"
"""Matches to integer dtypes (not implemented)"""

children = []
"""No children"""

single_pipeline = [("dropper", DropFeature()), ("impute_encode", Encoder())]
single_pipeline_template = [
PipelineTemplateEntry("dropper", DropFeature, False),
PipelineTemplateEntry("impute_encode", Encoder, True),
]
"""Encodes the column automatically"""

multi_pipeline = []
multi_pipeline_template = []
"""No multi pipeline"""

@classmethod
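Once the registry processes these templates (registry.py below), each intent exposes single_pipeline and multi_pipeline as callables keyed on y_var. A rough usage sketch; the top-level import path is an assumption and the commented results are approximate:

from foreshadow.intents import NumericIntent  # assumed re-export; otherwise foreshadow.intents.general

# X side: every template entry is instantiated
x_steps = NumericIntent.single_pipeline()
# -> [("dropper", DropFeature()), ("simple_imputer", SimpleImputer()), ("scaler", Scaler())]

# y side: only entries flagged y_var=True survive, and smart transformers
# are constructed with y_var=True
y_steps = NumericIntent.single_pipeline(y_var=True)
# -> [("scaler", Scaler(y_var=True))]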
91 changes: 90 additions & 1 deletion foreshadow/intents/registry.py
@@ -4,6 +4,11 @@

from abc import ABCMeta

from sklearn.base import BaseEstimator, TransformerMixin

from ..transformers.base import SmartTransformer
from .base import PipelineTemplateEntry, TransformerEntry

_registry = {}


@@ -34,6 +39,89 @@ def validate_input(clsname):
raise ValueError("Input must be either a string or a list of strings")


def _process_templates(cls_target):
def _resolve_template(template):
if not all(
isinstance(s, PipelineTemplateEntry)
and (
(
isinstance(s.transformer_entry, type)
and issubclass(
s.transformer_entry, (BaseEstimator, TransformerMixin)
)
)
or (
isinstance(s.transformer_entry, TransformerEntry)
and isinstance(s.transformer_entry.transformer, type)
and issubclass(
s.transformer_entry.transformer,
(BaseEstimator, TransformerMixin),
)
and isinstance(s.transformer_entry.args_dict, dict)
)
)
for s in template
):
raise ValueError("Malformed transformer entry in template")

x_pipeline = [
(
s.transformer_name,
s.transformer_entry()
if callable(s.transformer_entry)
else s.transformer_entry.transformer(**s.transformer_entry.args_dict),
)
for s in template
]
y_pipeline = [
(
s.transformer_name,
s.transformer_entry(
**{
"y_var": True
for _ in range(1)
if issubclass(s.transformer_entry, SmartTransformer)
}
)
if callable(s.transformer_entry)
else s.transformer_entry.transformer(
**s.transformer_entry.args_dict,
**{
"y_var": True
for _ in range(1)
if issubclass(s.transformer_entry.transformer, SmartTransformer)
}
),
)
for s in template
if s.y_var
]

return x_pipeline, y_pipeline

def _process_template(cls_target, template_name):
t = getattr(cls_target, template_name)
attr_base = template_name.replace("_template", "")
if len(t) == 0:
setattr(cls_target, attr_base + "_x", t)
setattr(cls_target, attr_base + "_y", t)
else:
x_pipe, y_pipe = _resolve_template(t)
setattr(cls_target, attr_base + "_x", x_pipe)
setattr(cls_target, attr_base + "_y", y_pipe)

return lambda y_var=False: (
getattr(cls_target, attr_base + "_x")
if not y_var
else getattr(cls_target, attr_base + "_y")
)

cls_target.single_pipeline = _process_template(
cls_target, "single_pipeline_template"
)
cls_target.multi_pipeline = _process_template(cls_target, "multi_pipeline_template")


def registry_eval(cls_target):
"""Retrieve intent class from registry dictionary
@@ -60,6 +148,7 @@ def __new__(cls, *args, **kwargs):
)
)
elif class_.__name__ is not "BaseIntent":
class_._check_required_class_attributes()
class_._check_intent()
_process_templates(class_)
_register_intent(class_)
return class_
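The conditional-kwargs comprehension in _resolve_template, **{"y_var": True for _ in range(1) if ...}, evaluates to {"y_var": True} for SmartTransformer subclasses and to an empty dict otherwise. A hypothetical helper (not part of this diff) spelling out the same logic for a single PipelineTemplateEntry.transformer_entry:

from foreshadow.transformers.base import SmartTransformer

def instantiate_for_y(entry):
    """Build the y-side instance for one template entry (illustrative only)."""
    if callable(entry):  # a bare transformer class
        kwargs = {"y_var": True} if issubclass(entry, SmartTransformer) else {}
        return entry(**kwargs)
    # a TransformerEntry: merge its declared kwargs with the conditional flag
    kwargs = dict(entry.args_dict)
    if issubclass(entry.transformer, SmartTransformer):
        kwargs["y_var"] = True
    return entry.transformer(**kwargs)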
24 changes: 15 additions & 9 deletions foreshadow/preprocessor.py
@@ -25,6 +25,7 @@ class Preprocessor(BaseEstimator, TransformerMixin):
Parameters:
from_json: Dictionary representing JSON config file (See docs for more)
y_var: Boolean indicating that a response variable is being processed
Attributes:
pipeline: Internal representation of sklearn pipeline. Can be exported and
@@ -33,7 +34,7 @@
"""

def __init__(self, from_json=None, **fit_params):
def __init__(self, from_json=None, y_var=False, **fit_params):
self._intent_map = {}
self._pipeline_map = {}
self._choice_map = {}
@@ -44,6 +45,7 @@ def __init__(self, from_json=None, **fit_params):
self.fit_params = fit_params
self.is_fit = False
self.from_json = from_json
self.y_var = y_var
self.is_linear = False
self._init_json()

@@ -81,6 +83,7 @@ def _map_intents(self, X_df):
]
self._choice_map[c] = valid_cols
temp_map[c] = valid_cols[-1][1]

# Set intent map with override
self._intent_map = {**temp_map, **self._intent_map}

@@ -131,18 +134,18 @@ def _map_pipelines(self):
self._pipeline_map = {
# Creates pipeline object from intent single_pipeline attribute
**{
k: Pipeline(deepcopy(v.single_pipeline))
k: Pipeline(deepcopy(v.single_pipeline(self.y_var)))
for k, v in self._intent_map.items()
if v.__name__ not in self._intent_pipelines.keys()
and len(v.single_pipeline) > 0
and len(v.single_pipeline(self.y_var)) > 0
},
# Extracts already resolved single pipelines from JSON intent overrides
**{
k: self._intent_pipelines[v.__name__].get(
"single",
Pipeline(
deepcopy(v.single_pipeline)
if len(v.single_pipeline) > 0
deepcopy(v.single_pipeline(self.y_var))
if len(v.single_pipeline(self.y_var)) > 0
else [("null", None)]
),
)
@@ -162,13 +165,13 @@
v.__name__: {
# Fetch multi pipeline from Intent class
"multi": Pipeline(
deepcopy(v.multi_pipeline)
if len(v.multi_pipeline) > 0
deepcopy(v.multi_pipeline(self.y_var))
if len(v.multi_pipeline(self.y_var)) > 0
else [("null", None)]
),
"single": Pipeline(
deepcopy(v.single_pipeline)
if len(v.single_pipeline) > 0
deepcopy(v.single_pipeline(self.y_var))
if len(v.single_pipeline(self.y_var)) > 0
else [("null", None)]
),
# Extract multi pipeline from JSON config (highest priority)
@@ -280,6 +283,8 @@ def _init_json(self):
return

try:
if "y_var" in config.keys():
self.y_var = config["y_var"]
# Parse columns section
if "columns" in config.keys():
# Iterate columns
@@ -363,6 +368,7 @@ def serialize(self):
"columns": json_cols,
"postprocess": json_multi,
"intents": json_intents,
"y_var": self.y_var,
}

def fit(self, X, y=None, **fit_params):
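A short sketch of the flag end to end, assuming _init_json accepts a minimal config dict as the branch above suggests; serialize() likewise now writes the y_var key:

from foreshadow.preprocessor import Preprocessor

# Set directly at construction time ...
yp = Preprocessor(y_var=True)
assert yp.y_var

# ... or picked up from a serialized config, since _init_json now reads the key
yp2 = Preprocessor(from_json={"y_var": True})
assert yp2.y_var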
