Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
Handle y variable transformation based on problem type. (#171)
Browse files (browse the repository at this point in the history)
* Handle y variable transformation based on problem type.
  • Loading branch information
jzhang-gp committed Nov 19, 2019
1 parent b702b05 commit 5d88d7b
Show file tree
Hide file tree
Showing 7 changed files with 198 additions and 50 deletions.
10 changes: 6 additions & 4 deletions foreshadow/console.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,13 @@
import warnings

import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split

from foreshadow.config import config
from foreshadow.estimators import AutoEstimator
from foreshadow.foreshadow import Foreshadow
from foreshadow.utils import EstimatorFactory, EstimatorFamily, ProblemType
from foreshadow.logging import logging
from foreshadow.utils import EstimatorFactory, EstimatorFamily, ProblemType


def process_argument(args): # noqa: C901
Expand Down Expand Up @@ -153,9 +152,10 @@ def generate_model(args): # noqa: C901
if cargs.level == 1:
# Default everything with basic estimator
fs = Foreshadow(
problem_type=cargs.problem_type,
estimator=get_method(
cargs.method, y_train, cargs.family, cargs.problem_type
)
),
)

# elif cargs.level == 2:
Expand Down Expand Up @@ -218,12 +218,14 @@ def generate_model(args): # noqa: C901
**estimator.estimator_kwargs,
}

fs = Foreshadow(estimator=estimator)
fs = Foreshadow(problem_type=cargs.problem_type, estimator=estimator)

else:
raise ValueError("Invalid Level. Only levels 1 and 3 supported.")

if cargs.multiprocess:
# TODO reconsider this implementation as it will not work if
# foreshadow is used as a library/API.
config.set_multiprocess(True)
logging.info("multiprocessing enabled.")

Expand Down
22 changes: 19 additions & 3 deletions foreshadow/foreshadow.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@
ConcreteSerializerMixin,
_make_deserializable,
)
from foreshadow.utils import check_df, get_transformer
from foreshadow.utils import ProblemType, check_df, get_transformer


class Foreshadow(BaseEstimator, ConcreteSerializerMixin):
"""An end-to-end pipeline to preprocess and tune a machine learning model.
Example:
>>> shadow = Foreshadow()
>>> shadow = Foreshadow(problem_type=ProblemType.CLASSIFICATION)
Args:
X_preparer \
Expand All @@ -47,9 +47,23 @@ def __init__(
X_preparer=None,
y_preparer=None,
estimator=None,
problem_type=None,
optimizer=None,
optimizer_kwargs=None,
):
if problem_type not in [
ProblemType.CLASSIFICATION,
ProblemType.REGRESSION,
]:
raise ValueError(
"Unknown Problem Type {}. Please choose from {} "
"or {}".format(
problem_type,
ProblemType.CLASSIFICATION,
ProblemType.REGRESSION,
)
)
self.problem_type = problem_type
self.X_preparer = X_preparer
self.y_preparer = y_preparer
self.estimator = estimator
Expand Down Expand Up @@ -131,7 +145,9 @@ def y_preparer(self, yp):
raise ValueError("Invalid value passed as y_preparer")
else:
self._y_preprocessor = DataPreparer(
column_sharer=ColumnSharer(), y_var=True
column_sharer=ColumnSharer(),
y_var=True,
problem_type=self.problem_type,
)

@property
Expand Down
14 changes: 12 additions & 2 deletions foreshadow/preparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
_make_deserializable,
_make_serializable,
)
from foreshadow.smart import CategoricalEncoder
from foreshadow.steps import (
CleanerMapper,
FeatureEngineererMapper,
Expand All @@ -15,7 +16,7 @@
IntentMapper,
Preprocessor,
)
from foreshadow.utils import ConfigureColumnSharerMixin
from foreshadow.utils import ConfigureColumnSharerMixin, ProblemType

from .concrete import NoTransform

Expand Down Expand Up @@ -80,6 +81,7 @@ def __init__(
engineerer_kwargs=None,
preprocessor_kwargs=None,
reducer_kwargs=None,
problem_type=None,
y_var=None,
**kwargs
):
Expand Down Expand Up @@ -117,13 +119,21 @@ def __init__(
("feature_reducer", FeatureReducerMapper(**reducer_kwargs_)),
]
else:
steps = [("output", NoTransform())]
if problem_type == ProblemType.REGRESSION:
steps = [("output", NoTransform())]
elif problem_type == ProblemType.CLASSIFICATION:
steps = [("output", CategoricalEncoder(y_var=True))]
else:
raise ValueError(
"Invalid Problem " "Type {}".format(problem_type)
)
if "steps" in kwargs: # needed for sklearn estimator clone,
# which will try to init the object using get_params.
steps = kwargs.pop("steps")

self.column_sharer = column_sharer
self.y_var = y_var
self.problem_type = problem_type
super().__init__(steps, **kwargs)

def _get_params(self, attr, deep=True):
Expand Down
5 changes: 4 additions & 1 deletion foreshadow/tests/test_console.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,9 @@ def test_console_execute():
X_train, X_test, y_train, y_test = train_test_split(
X_df, y_df, test_size=0.2
)
fs = Foreshadow(estimator=LinearRegression())
fs = Foreshadow(
problem_type=ProblemType.REGRESSION, estimator=LinearRegression()
)

results = execute_model(fs, X_train, y_train, X_test, y_test)

Expand Down Expand Up @@ -370,6 +372,7 @@ def test_console_generate_and_execute_model(

execute_model(*model)


def test_console_parse_args_multiprocess():
from foreshadow.console import process_argument

Expand Down

0 comments on commit 5d88d7b

Please sign in to comment.