This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Handling y-variables in preprocessor and other relevant updates (#21)
* Add y-variable support to intents and preprocessors and change intent pipelines to templates
* Simplify Intent validity testing and remove the unused dtype field
* Update foreshadow to work with y_vars, add smart-transformer y_var processing, update the smart Encoder, and patch a few other bugs
* Make nearly all transformers invertible
* Add named tuple support for intent pipelines
* Address other CR requests and reformat code
adithyabsk committed Dec 6, 2018
1 parent bacdf5a commit 33eee1c
Showing 20 changed files with 556 additions and 151 deletions.
2 changes: 1 addition & 1 deletion foreshadow/foreshadow.py
@@ -99,7 +99,7 @@ def y_preprocessor(self, yp):
else:
raise ValueError("Invalid value passed as y_preprocessor")
else:
self._y_preprocessor = Preprocessor()
self._y_preprocessor = Preprocessor(y_var=True)

@property
def estimator(self):
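In short, when no y_preprocessor is supplied, Foreshadow now falls back to a response-variable-aware Preprocessor. A minimal sketch of what that default means (import path taken from this diff; the downstream effect is shown in preprocessor.py further below):

from foreshadow.preprocessor import Preprocessor

# What the fallback branch above now constructs:
yp = Preprocessor(y_var=True)

# The flag is stored on the instance and later selects the y-side pipelines,
# e.g. via intent.single_pipeline(self.y_var) in Preprocessor._map_pipelines.
assert yp.y_var is True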
81 changes: 42 additions & 39 deletions foreshadow/intents/base.py
@@ -3,8 +3,17 @@
"""

from abc import abstractmethod
from collections import namedtuple
from functools import wraps

# Must be defined before the registry import below; registry.py imports these names from this module
PipelineTemplateEntry = namedtuple(
"PipelineTemplateEntry", ["transformer_name", "transformer_entry", "y_var"]
)


TransformerEntry = namedtuple("TransformerEntry", ["transformer", "args_dict"])

from .registry import _IntentRegistry, registry_eval


@@ -52,20 +61,31 @@ class BaseIntent(metaclass=_IntentRegistry):
"""

dtype = None
"""Data type of column required for this intent to match (not implemented)"""

children = None
"""More-specific intents that require this intent to match to be
considered."""

single_pipeline = None
"""Single pipeline of smart transformers that affect a single column in
in an intent"""
single_pipeline_template = None
"""A template for single pipelines of smart transformers that affect a
single column in an intent
The template needs an additional boolean at the end of the tuple that
determines whether the transformation can be applied to response
variables.
Example: single_pipeline_template = [
('t1', Transformer1, False),
('t2', (Transformer2, {'arg1': True}), True),
('t3', Transformer1, True),
]
"""

multi_pipeline = None
"""Multi pipeline of smart transformers that affect multiple columns in
an intent"""
multi_pipeline_template = None
"""A template for multi pipelines of smart transformers that affect multiple
columns in an intent
See single_pipeline_template for an example definition
"""

@classmethod
@check_base
@@ -124,36 +144,19 @@ def is_intent(cls, df):
pass # pragma: no cover

@classmethod
def _check_required_class_attributes(cls):
def _check_intent(cls):
"""Validate class variables are setup properly"""

not_implemented = lambda x, y: "Subclass must define {} class attribute.\n{}".format(
x, y
not_implemented = lambda v, m: "Subclass must define {} class attribute.\n{}".format(
v, m
)
if cls.dtype is None:
raise NotImplementedError(
not_implemented(
"cls.dtype", "This attribute should define the dtype of the intent."
define_attrs = [
"children",
"single_pipeline_template",
"multi_pipeline_template",
]
# Check that intent attrs are defined
for a in define_attrs:
if getattr(cls, a) is None:
raise NotImplementedError(
not_implemented(a, "Developers please see the documentation.")
)
)
elif cls.children is None:
raise NotImplementedError(
not_implemented(
"cls.children",
"This attribute should define the children of the intent.",
)
)
elif cls.single_pipeline is None:
raise NotImplementedError(
not_implemented(
"cls.single_pipeline",
"This attribute should define the transformers for a single pipeline",
)
)
elif cls.multi_pipeline is None:
raise NotImplementedError(
not_implemented(
"cls.multi_pipeline",
"This attribute should define the transformers for a multi pipeline",
)
)
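To make the new template fields concrete, here is a minimal sketch of the shapes a subclass supplies; it mirrors the NumericIntent definition in general.py below, and the empty kwargs dict passed to TransformerEntry is purely illustrative:

from foreshadow.intents.base import PipelineTemplateEntry, TransformerEntry
from foreshadow.transformers.internals import DropFeature
from foreshadow.transformers.smart import Scaler

# Each entry is (step name, transformer class or TransformerEntry, y_var flag);
# the trailing boolean marks steps that may also run on a response variable.
single_pipeline_template = [
    PipelineTemplateEntry("dropper", DropFeature, False),
    PipelineTemplateEntry("scaler", TransformerEntry(Scaler, {}), True),
]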
34 changes: 15 additions & 19 deletions foreshadow/intents/general.py
@@ -5,7 +5,7 @@
import pandas as pd
import numpy as np

from .base import BaseIntent
from .base import BaseIntent, PipelineTemplateEntry, TransformerEntry

from ..transformers.internals import DropFeature
from ..transformers.smart import SimpleImputer, MultiImputer, Scaler, Encoder
@@ -19,16 +19,15 @@ class GenericIntent(BaseIntent):
"""

dtype = "str"
"""Matches to string dtypes (not implemented)"""

children = ["NumericIntent", "CategoricalIntent"]
"""Matches to CategoricalIntent over NumericIntent"""

single_pipeline = []
single_pipeline_template = []
"""No transformers"""

multi_pipeline = [("multi_impute", MultiImputer())]
multi_pipeline_template = [
PipelineTemplateEntry("multi_impute", MultiImputer, False)
]
"""Performs multi imputation over the entire DataFrame"""

@classmethod
@@ -44,20 +43,17 @@ class NumericIntent(GenericIntent):
"""

dtype = "float"
"""Matches to float dtypes (not implemented)"""

children = []
"""No children"""

single_pipeline = [
("dropper", DropFeature()),
("simple_imputer", SimpleImputer()),
("scaler", Scaler()),
single_pipeline_template = [
PipelineTemplateEntry("dropper", DropFeature, False),
PipelineTemplateEntry("simple_imputer", SimpleImputer, False),
PipelineTemplateEntry("scaler", Scaler, True),
]
"""Performs imputation and scaling using Smart Transformers"""

multi_pipeline = []
multi_pipeline_template = []
"""No multi pipeline"""

@classmethod
@@ -78,16 +74,16 @@ class CategoricalIntent(GenericIntent):
"""

dtype = "int"
"""Matches to integer dtypes (not implemented)"""

children = []
"""No children"""

single_pipeline = [("dropper", DropFeature()), ("impute_encode", Encoder())]
single_pipeline_template = [
PipelineTemplateEntry("dropper", DropFeature, False),
PipelineTemplateEntry("impute_encode", Encoder, True),
]
"""Encodes the column automatically"""

multi_pipeline = []
multi_pipeline_template = []
"""No multi pipeline"""

@classmethod
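Once the registry processes these templates (registry.py below), each intent exposes single_pipeline and multi_pipeline as callables keyed on y_var. A rough usage sketch; the top-level import path is an assumption and the commented results are approximate:

from foreshadow.intents import NumericIntent  # assumed re-export; otherwise foreshadow.intents.general

# X side: every template entry is instantiated
x_steps = NumericIntent.single_pipeline()
# -> [("dropper", DropFeature()), ("simple_imputer", SimpleImputer()), ("scaler", Scaler())]

# y side: only entries flagged y_var=True survive, and smart transformers
# are constructed with y_var=True
y_steps = NumericIntent.single_pipeline(y_var=True)
# -> [("scaler", Scaler(y_var=True))]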
91 changes: 90 additions & 1 deletion foreshadow/intents/registry.py
@@ -4,6 +4,11 @@

from abc import ABCMeta

from sklearn.base import BaseEstimator, TransformerMixin

from ..transformers.base import SmartTransformer
from .base import PipelineTemplateEntry, TransformerEntry

_registry = {}


@@ -34,6 +39,89 @@ def validate_input(clsname):
raise ValueError("Input must be either a string or a list of strings")


def _process_templates(cls_target):
def _resolve_template(template):
if not all(
isinstance(s, PipelineTemplateEntry)
and (
(
isinstance(s.transformer_entry, type)
and issubclass(
s.transformer_entry, (BaseEstimator, TransformerMixin)
)
)
or (
isinstance(s.transformer_entry, TransformerEntry)
and isinstance(s.transformer_entry.transformer, type)
and issubclass(
s.transformer_entry.transformer,
(BaseEstimator, TransformerMixin),
)
and isinstance(s.transformer_entry.args_dict, dict)
)
)
for s in template
):
raise ValueError("Malformed transformer entry in template")

x_pipeline = [
(
s.transformer_name,
s.transformer_entry()
if callable(s.transformer_entry)
else s.transformer_entry.transformer(**s.transformer_entry.args_dict),
)
for s in template
]
y_pipeline = [
(
s.transformer_name,
s.transformer_entry(
**{
"y_var": True
for _ in range(1)
if issubclass(s.transformer_entry, SmartTransformer)
}
)
if callable(s.transformer_entry)
else s.transformer_entry.transformer(
**s.transformer_entry.args_dict,
**{
"y_var": True
for _ in range(1)
if issubclass(s.transformer_entry.transformer, SmartTransformer)
}
),
)
for s in template
if s.y_var
]

return x_pipeline, y_pipeline

def _process_template(cls_target, template_name):
t = getattr(cls_target, template_name)
attr_base = template_name.replace("_template", "")
if len(t) == 0:
setattr(cls_target, attr_base + "_x", t)
setattr(cls_target, attr_base + "_y", t)
else:
x_pipe, y_pipe = _resolve_template(t)
setattr(cls_target, attr_base + "_x", x_pipe)
setattr(cls_target, attr_base + "_y", y_pipe)

return lambda y_var=False: (
getattr(cls_target, attr_base + "_x")
if not y_var
else getattr(cls_target, attr_base + "_y")
)

cls_target.single_pipeline = _process_template(
cls_target, "single_pipeline_template"
)
cls_target.multi_pipeline = _process_template(cls_target, "multi_pipeline_template")


def registry_eval(cls_target):
"""Retrieve intent class from registry dictionary
@@ -60,6 +148,7 @@ def __new__(cls, *args, **kwargs):
)
)
elif class_.__name__ is not "BaseIntent":
class_._check_required_class_attributes()
class_._check_intent()
_process_templates(class_)
_register_intent(class_)
return class_
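The conditional-kwargs comprehension in _resolve_template, **{"y_var": True for _ in range(1) if ...}, evaluates to {"y_var": True} for SmartTransformer subclasses and to an empty dict otherwise. A hypothetical helper (not part of this diff) spelling out the same logic for a single PipelineTemplateEntry.transformer_entry:

from foreshadow.transformers.base import SmartTransformer

def instantiate_for_y(entry):
    """Build the y-side instance for one template entry (illustrative only)."""
    if callable(entry):  # a bare transformer class
        kwargs = {"y_var": True} if issubclass(entry, SmartTransformer) else {}
        return entry(**kwargs)
    # a TransformerEntry: merge its declared kwargs with the conditional flag
    kwargs = dict(entry.args_dict)
    if issubclass(entry.transformer, SmartTransformer):
        kwargs["y_var"] = True
    return entry.transformer(**kwargs)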
24 changes: 15 additions & 9 deletions foreshadow/preprocessor.py
@@ -25,6 +25,7 @@ class Preprocessor(BaseEstimator, TransformerMixin):
Parameters:
from_json: Dictionary representing JSON config file (See docs for more)
y_var: Boolean indicating that a response variable is being processed
Attributes:
pipeline: Internal representation of sklearn pipeline. Can be exported and
@@ -33,7 +34,7 @@
"""

def __init__(self, from_json=None, **fit_params):
def __init__(self, from_json=None, y_var=False, **fit_params):
self._intent_map = {}
self._pipeline_map = {}
self._choice_map = {}
@@ -44,6 +45,7 @@ def __init__(self, from_json=None, **fit_params):
self.fit_params = fit_params
self.is_fit = False
self.from_json = from_json
self.y_var = y_var
self.is_linear = False
self._init_json()

@@ -81,6 +83,7 @@ def _map_intents(self, X_df):
]
self._choice_map[c] = valid_cols
temp_map[c] = valid_cols[-1][1]

# Set intent map with override
self._intent_map = {**temp_map, **self._intent_map}

@@ -131,18 +134,18 @@ def _map_pipelines(self):
self._pipeline_map = {
# Creates pipeline object from intent single_pipeline attribute
**{
k: Pipeline(deepcopy(v.single_pipeline))
k: Pipeline(deepcopy(v.single_pipeline(self.y_var)))
for k, v in self._intent_map.items()
if v.__name__ not in self._intent_pipelines.keys()
and len(v.single_pipeline) > 0
and len(v.single_pipeline(self.y_var)) > 0
},
# Extracts already resolved single pipelines from JSON intent overrides
**{
k: self._intent_pipelines[v.__name__].get(
"single",
Pipeline(
deepcopy(v.single_pipeline)
if len(v.single_pipeline) > 0
deepcopy(v.single_pipeline(self.y_var))
if len(v.single_pipeline(self.y_var)) > 0
else [("null", None)]
),
)
@@ -162,13 +165,13 @@
v.__name__: {
# Fetch multi pipeline from Intent class
"multi": Pipeline(
deepcopy(v.multi_pipeline)
if len(v.multi_pipeline) > 0
deepcopy(v.multi_pipeline(self.y_var))
if len(v.multi_pipeline(self.y_var)) > 0
else [("null", None)]
),
"single": Pipeline(
deepcopy(v.single_pipeline)
if len(v.single_pipeline) > 0
deepcopy(v.single_pipeline(self.y_var))
if len(v.single_pipeline(self.y_var)) > 0
else [("null", None)]
),
# Extract multi pipeline from JSON config (highest priority)
@@ -280,6 +283,8 @@ def _init_json(self):
return

try:
if "y_var" in config.keys():
self.y_var = config["y_var"]
# Parse columns section
if "columns" in config.keys():
# Iterate columns
@@ -363,6 +368,7 @@ def serialize(self):
"columns": json_cols,
"postprocess": json_multi,
"intents": json_intents,
"y_var": self.y_var,
}

def fit(self, X, y=None, **fit_params):
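A short sketch of the flag end to end, assuming _init_json accepts a minimal config dict as the branch above suggests; serialize() likewise now writes the y_var key:

from foreshadow.preprocessor import Preprocessor

# Set directly at construction time ...
yp = Preprocessor(y_var=True)
assert yp.y_var

# ... or picked up from a serialized config, since _init_json now reads the key
yp2 = Preprocessor(from_json={"y_var": True})
assert yp2.y_var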
