Handle y variable transformation based on problem type. (#171)

* Handle y variable transformation based on problem type.
georgian-io-archive · Nov 19, 2019 · 5d88d7b
1 parent b702b05
commit 5d88d7b
Show file tree

Hide file tree

Showing 7 changed files with 198 additions and 50 deletions.
diff --git a/foreshadow/console.py b/foreshadow/console.py
@@ -6,14 +6,13 @@
 import warnings
 
 import pandas as pd
-from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.model_selection import train_test_split
 
 from foreshadow.config import config
 from foreshadow.estimators import AutoEstimator
 from foreshadow.foreshadow import Foreshadow
-from foreshadow.utils import EstimatorFactory, EstimatorFamily, ProblemType
 from foreshadow.logging import logging
+from foreshadow.utils import EstimatorFactory, EstimatorFamily, ProblemType
 
 
 def process_argument(args):  # noqa: C901
@@ -153,9 +152,10 @@ def generate_model(args):  # noqa: C901
     if cargs.level == 1:
         # Default everything with basic estimator
         fs = Foreshadow(
+            problem_type=cargs.problem_type,
             estimator=get_method(
                 cargs.method, y_train, cargs.family, cargs.problem_type
-            )
+            ),
         )
 
     # elif cargs.level == 2:
@@ -218,12 +218,14 @@ def generate_model(args):  # noqa: C901
             **estimator.estimator_kwargs,
         }
 
-        fs = Foreshadow(estimator=estimator)
+        fs = Foreshadow(problem_type=cargs.problem_type, estimator=estimator)
 
     else:
         raise ValueError("Invalid Level. Only levels 1 and 3 supported.")
 
     if cargs.multiprocess:
+        # TODO reconsider this implementation as it will not work if
+        #  foreshadow is used as a library/API.
         config.set_multiprocess(True)
         logging.info("multiprocessing enabled.")
 

diff --git a/foreshadow/foreshadow.py b/foreshadow/foreshadow.py
@@ -16,14 +16,14 @@
     ConcreteSerializerMixin,
     _make_deserializable,
 )
-from foreshadow.utils import check_df, get_transformer
+from foreshadow.utils import ProblemType, check_df, get_transformer
 
 
 class Foreshadow(BaseEstimator, ConcreteSerializerMixin):
     """An end-to-end pipeline to preprocess and tune a machine learning model.
 
     Example:
-        >>> shadow = Foreshadow()
+        >>> shadow = Foreshadow(problem_type=ProblemType.CLASSIFICATION)
 
     Args:
         X_preparer \
@@ -47,9 +47,23 @@ def __init__(
         X_preparer=None,
         y_preparer=None,
         estimator=None,
+        problem_type=None,
         optimizer=None,
         optimizer_kwargs=None,
     ):
+        if problem_type not in [
+            ProblemType.CLASSIFICATION,
+            ProblemType.REGRESSION,
+        ]:
+            raise ValueError(
+                "Unknown Problem Type {}. Please choose from {} "
+                "or {}".format(
+                    problem_type,
+                    ProblemType.CLASSIFICATION,
+                    ProblemType.REGRESSION,
+                )
+            )
+        self.problem_type = problem_type
         self.X_preparer = X_preparer
         self.y_preparer = y_preparer
         self.estimator = estimator
@@ -131,7 +145,9 @@ def y_preparer(self, yp):
                 raise ValueError("Invalid value passed as y_preparer")
         else:
             self._y_preprocessor = DataPreparer(
-                column_sharer=ColumnSharer(), y_var=True
+                column_sharer=ColumnSharer(),
+                y_var=True,
+                problem_type=self.problem_type,
             )
 
     @property

diff --git a/foreshadow/preparer.py b/foreshadow/preparer.py
@@ -7,6 +7,7 @@
     _make_deserializable,
     _make_serializable,
 )
+from foreshadow.smart import CategoricalEncoder
 from foreshadow.steps import (
     CleanerMapper,
     FeatureEngineererMapper,
@@ -15,7 +16,7 @@
     IntentMapper,
     Preprocessor,
 )
-from foreshadow.utils import ConfigureColumnSharerMixin
+from foreshadow.utils import ConfigureColumnSharerMixin, ProblemType
 
 from .concrete import NoTransform
 
@@ -80,6 +81,7 @@ def __init__(
         engineerer_kwargs=None,
         preprocessor_kwargs=None,
         reducer_kwargs=None,
+        problem_type=None,
         y_var=None,
         **kwargs
     ):
@@ -117,13 +119,21 @@ def __init__(
                 ("feature_reducer", FeatureReducerMapper(**reducer_kwargs_)),
             ]
         else:
-            steps = [("output", NoTransform())]
+            if problem_type == ProblemType.REGRESSION:
+                steps = [("output", NoTransform())]
+            elif problem_type == ProblemType.CLASSIFICATION:
+                steps = [("output", CategoricalEncoder(y_var=True))]
+            else:
+                raise ValueError(
+                    "Invalid Problem " "Type {}".format(problem_type)
+                )
         if "steps" in kwargs:  # needed for sklearn estimator clone,
             # which will try to init the object using get_params.
             steps = kwargs.pop("steps")
 
         self.column_sharer = column_sharer
         self.y_var = y_var
+        self.problem_type = problem_type
         super().__init__(steps, **kwargs)
 
     def _get_params(self, attr, deep=True):

diff --git a/foreshadow/tests/test_console.py b/foreshadow/tests/test_console.py
@@ -208,7 +208,9 @@ def test_console_execute():
     X_train, X_test, y_train, y_test = train_test_split(
         X_df, y_df, test_size=0.2
     )
-    fs = Foreshadow(estimator=LinearRegression())
+    fs = Foreshadow(
+        problem_type=ProblemType.REGRESSION, estimator=LinearRegression()
+    )
 
     results = execute_model(fs, X_train, y_train, X_test, y_test)
 
@@ -370,6 +372,7 @@ def test_console_generate_and_execute_model(
 
     execute_model(*model)
 
+
 def test_console_parse_args_multiprocess():
     from foreshadow.console import process_argument