Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
Support more models in level 1 default mode (#168)
Browse files Browse the repository at this point in the history
* Adding console functionality to run different modeling algorithms
  • Loading branch information
jzhang-gp committed Nov 5, 2019
1 parent 7c12266 commit b702b05
Show file tree
Hide file tree
Showing 8 changed files with 960 additions and 20 deletions.
57 changes: 43 additions & 14 deletions foreshadow/console.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from foreshadow.config import config
from foreshadow.estimators import AutoEstimator
from foreshadow.foreshadow import Foreshadow
from foreshadow.utils import EstimatorFactory, EstimatorFamily, ProblemType
from foreshadow.logging import logging


Expand All @@ -36,9 +37,9 @@ def process_argument(args): # noqa: C901
)
parser.add_argument(
"problem_type",
default="classification",
default=ProblemType.CLASSIFICATION,
type=str,
choices=["classification", "regression"],
choices=[ProblemType.CLASSIFICATION, ProblemType.REGRESSION],
help="Problem type, choosing from classification or regression, "
"default to classification.",
)
Expand All @@ -65,6 +66,22 @@ def process_argument(args): # noqa: C901
"Defaults to LogisticRegression for classification"
"and LinearRegression for regression",
)
# Select which sklearn algorithm family the default (no --method) estimator
# is drawn from; the concrete class then depends on the problem type.
parser.add_argument(
    "--family",
    default=EstimatorFamily.LINEAR,
    type=str,
    choices=[
        EstimatorFamily.LINEAR,
        EstimatorFamily.SVM,
        EstimatorFamily.RF,
        EstimatorFamily.NN,
    ],
    # The class names below must match the real sklearn class names
    # (e.g. RandomForestRegressor, not "RandomForestRegression").
    help="The algorithm family in Sklearn to train the model. Linear "
    "includes LinearRegression and LogisticRegression. SVM includes "
    "LinearSVC and LinearSVR. RF includes RandomForestClassifier "
    "and RandomForestRegressor. NN includes MLPClassifier and "
    "MLPRegressor.",
)
parser.add_argument(
"--time",
default=10,
Expand Down Expand Up @@ -136,7 +153,9 @@ def generate_model(args): # noqa: C901
if cargs.level == 1:
# Default everything with basic estimator
fs = Foreshadow(
estimator=get_method(cargs.method, cargs.problem_type, y_train)
estimator=get_method(
cargs.method, y_train, cargs.family, cargs.problem_type
)
)

# elif cargs.level == 2:
Expand Down Expand Up @@ -184,13 +203,13 @@ def generate_model(args): # noqa: C901
estimator = AutoEstimator(problem_type=cargs.problem_type, auto="tpot")
estimator.configure_estimator(y_train)

# TODO move this into the configure_estimator method
# TODO "max_time_mins" is an argument for the TPOT library. We cannot
# TODO assign it based on the problem type here. For testing purpose,
# TODO I'm going to hardcode it for TPOT.
# TODO move this into the configure_estimator method. "max_time_mins"
# is an argument for the TPOT library, so we cannot assign it
# based on the problem type here. For testing purposes, it is
# hardcoded for TPOT.
# kwargs = (
# "max_time_mins"
# if estimator.problem_type == "regression"
# if estimator.problem_type == ProblemType.REGRESSION
# else "time_left_for_this_task"
# )
kwargs = "max_time_mins"
Expand Down Expand Up @@ -265,17 +284,23 @@ def cmd(): # pragma: no cover
execute_model(*model)


def get_method(method, problem_type, y_train):
def get_method(
method,
y_train,
family=EstimatorFamily.LINEAR,
problem_type=ProblemType.CLASSIFICATION,
):
"""Determine what estimator to use.
Uses set of X data and a passed argument referencing an
`BaseException <sklearn.base.BaseEstimator>` class.
Args:
method (str): model name
problem_type (str): problem type, classification or regression
y_train (:obj:`DataFrame <pandas.DataFrame>`): The response variable
data.
family: the algorithm family type
problem_type (str): problem type, classification or regression
Returns:
Estimator
Expand All @@ -297,10 +322,14 @@ def get_method(method, problem_type, y_train):
"estimator from sklearn.linear_model".format(method)
)
else:
return (
LinearRegression()
if problem_type == "regression"
else LogisticRegression()
# return (
# LinearRegression()
# if problem_type == ProblemType.REGRESSION
# else LogisticRegression()
# )
estimator_factory = EstimatorFactory()
return estimator_factory.get_estimator(
family=family, problem_type=problem_type
)


Expand Down
570 changes: 570 additions & 0 deletions foreshadow/tests/data/breast_cancer.csv

Large diffs are not rendered by default.

124 changes: 119 additions & 5 deletions foreshadow/tests/test_console.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import pytest
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.svm import LinearSVC, LinearSVR

from foreshadow.utils import EstimatorFamily, ProblemType
from foreshadow.utils.testing import get_file_path


Expand Down Expand Up @@ -212,8 +217,7 @@ def test_console_execute():
)


@pytest.mark.skip("console broken until parametrization is implemented")
def test_console_get_method_default():
def test_console_get_method_default_regression():
import pandas as pd

from foreshadow.console import get_method
Expand All @@ -230,13 +234,47 @@ def test_console_get_method_default():
X_df, y_df, test_size=0.2
)

result = get_method(None, X_train)
result = get_method(
None,
y_train,
family=EstimatorFamily.LINEAR,
problem_type=ProblemType.REGRESSION,
)

assert isinstance(result, LinearRegression)


@pytest.mark.skip("console broken until parametrization is implemented")
def test_console_get_method_default_classification():
    """Check that get_method falls back to LogisticRegression.

    With no explicit method name, the linear family combined with the
    classification problem type is expected to yield a LogisticRegression
    instance.
    """
    import pandas as pd

    from foreshadow.console import get_method

    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression

    cancer = load_breast_cancer()
    X_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
    y_df = pd.DataFrame(cancer.target, columns=["target"])

    # Only y_train is passed to get_method below; the remaining splits are
    # produced by the same call and are unused here.
    X_train, X_test, y_train, y_test = train_test_split(
        X_df, y_df, test_size=0.2
    )

    # method=None means "no user override", so the family/problem_type pair
    # decides which estimator is built.
    result = get_method(
        None,
        y_train,
        family=EstimatorFamily.LINEAR,
        problem_type=ProblemType.CLASSIFICATION,
    )

    assert isinstance(result, LogisticRegression)


def test_console_get_method_override():
# TODO may not need this method in the future if we decide to not
# allow the user to provide method override as it opens a lot of
# potential issues.
from foreshadow.console import get_method

from sklearn.linear_model import LogisticRegression
Expand All @@ -246,8 +284,8 @@ def test_console_get_method_override():
assert isinstance(result, LogisticRegression)


@pytest.mark.skip("console broken until parametrization is implemented")
def test_console_get_method_error():
# TODO may not need this test. Same reason above.
from foreshadow.console import get_method

with pytest.raises(ValueError) as e:
Expand All @@ -256,6 +294,82 @@ def test_console_get_method_error():
assert "Invalid method." in str(e.value)


@pytest.mark.parametrize(
    "filename, family, y_var, problem_type, estimator",
    [
        (
            "breast_cancer.csv",
            EstimatorFamily.LINEAR,
            "target",
            ProblemType.CLASSIFICATION,
            LogisticRegression,
        ),
        (
            "boston_housing.csv",
            EstimatorFamily.LINEAR,
            "medv",
            ProblemType.REGRESSION,
            LinearRegression,
        ),
        (
            "breast_cancer.csv",
            EstimatorFamily.SVM,
            "target",
            ProblemType.CLASSIFICATION,
            LinearSVC,
        ),
        (
            "boston_housing.csv",
            EstimatorFamily.SVM,
            "medv",
            ProblemType.REGRESSION,
            LinearSVR,
        ),
        (
            "breast_cancer.csv",
            EstimatorFamily.RF,
            "target",
            ProblemType.CLASSIFICATION,
            RandomForestClassifier,
        ),
        (
            "boston_housing.csv",
            EstimatorFamily.RF,
            "medv",
            ProblemType.REGRESSION,
            RandomForestRegressor,
        ),
        (
            "breast_cancer.csv",
            EstimatorFamily.NN,
            "target",
            ProblemType.CLASSIFICATION,
            MLPClassifier,
        ),
        (
            "boston_housing.csv",
            EstimatorFamily.NN,
            "medv",
            ProblemType.REGRESSION,
            MLPRegressor,
        ),
    ],
)
def test_console_generate_and_execute_model(
    filename, family, y_var, problem_type, estimator
):
    """Build and run a console model for each family/problem-type pair.

    Args:
        filename: CSV file name under the tests' ``data`` directory.
        family: estimator family passed through ``--family``.
        y_var: name of the target column in the CSV.
        problem_type: classification or regression.
        estimator: sklearn class the generated model is expected to use.
    """
    from foreshadow.console import generate_model, execute_model

    data_path = get_file_path("data", filename)

    # Arguments passed: the --family option, then data file, target column
    # and problem type as positionals.
    args = ["--family", family, data_path, y_var, problem_type]

    model = generate_model(args)

    # NOTE(review): model[0] is presumably the Foreshadow object, whose
    # estimator wraps the actual sklearn estimator - confirm against
    # generate_model's return value.
    assert isinstance(model[0].estimator.estimator, estimator)

    # Running the model end to end must not raise.
    execute_model(*model)

def test_console_parse_args_multiprocess():
from foreshadow.console import process_argument

Expand Down
1 change: 0 additions & 1 deletion foreshadow/tests/test_foreshadow.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,7 +729,6 @@ def test_foreshadow_serialization_adults_small_classification():
)

shadow = Foreshadow(estimator=LogisticRegression())

shadow.fit(X_train, y_train)
shadow.to_json("foreshadow_adults_small_logistic_regression.json")

Expand Down
51 changes: 51 additions & 0 deletions foreshadow/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
import pytest
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.svm import LinearSVC, LinearSVR

from foreshadow.utils import EstimatorFamily, ProblemType


def test_check_df_passthrough():
Expand Down Expand Up @@ -115,3 +121,48 @@ def test_is_wrapped(transformer_name):

assert not is_wrapped(sk_tf)
assert is_wrapped(fs_tf)


@pytest.mark.parametrize(
    "family, problem_type, estimator",
    [
        (
            EstimatorFamily.LINEAR,
            ProblemType.CLASSIFICATION,
            LogisticRegression,
        ),
        (EstimatorFamily.LINEAR, ProblemType.REGRESSION, LinearRegression),
        (EstimatorFamily.SVM, ProblemType.CLASSIFICATION, LinearSVC),
        (EstimatorFamily.SVM, ProblemType.REGRESSION, LinearSVR),
        (
            EstimatorFamily.RF,
            ProblemType.CLASSIFICATION,
            RandomForestClassifier,
        ),
        (EstimatorFamily.RF, ProblemType.REGRESSION, RandomForestRegressor),
        (EstimatorFamily.NN, ProblemType.CLASSIFICATION, MLPClassifier),
        (EstimatorFamily.NN, ProblemType.REGRESSION, MLPRegressor),
    ],
)
def test_get_estimator(family, problem_type, estimator):
    """Check the estimator class returned for each family/problem type.

    Args:
        family: estimator family constant handed to the factory.
        problem_type: classification or regression constant.
        estimator: sklearn class the factory is expected to instantiate.
    """
    from foreshadow.utils import EstimatorFactory

    estimator_factory = EstimatorFactory()
    assert isinstance(
        estimator_factory.get_estimator(family, problem_type), estimator
    )


@pytest.mark.parametrize(
    "family, problem_type, exception",
    [
        ("Unknown", ProblemType.CLASSIFICATION, pytest.raises(KeyError)),
        (EstimatorFamily.LINEAR, "cluster", pytest.raises(KeyError)),
    ],
)
def test_get_estimator_exception(family, problem_type, exception):
    """Check that an unknown family or problem type raises KeyError.

    Args:
        family: estimator family value, valid or deliberately unknown.
        problem_type: problem type value, valid or deliberately unknown.
        exception: ``pytest.raises`` context manager describing the
            expected failure.
    """
    from foreshadow.utils import EstimatorFactory

    estimator_factory = EstimatorFactory()
    # Each case supplies exactly one invalid argument; the lookup is
    # expected to fail inside the context manager.
    with exception:
        estimator_factory.get_estimator(family, problem_type)
5 changes: 5 additions & 0 deletions foreshadow/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
get_config_path,
get_transformer,
)
from foreshadow.utils.constants import EstimatorFamily, ProblemType
from foreshadow.utils.data_summary import (
get_outliers,
mode_freq,
standard_col_summary,
)
from foreshadow.utils.default_estimator_factory import EstimatorFactory
from foreshadow.utils.testing import dynamic_import
from foreshadow.utils.validation import (
PipelineStep,
Expand Down Expand Up @@ -40,4 +42,7 @@
"get_outliers",
"standard_col_summary",
"ConfigureColumnSharerMixin",
"EstimatorFactory",
"ProblemType",
"EstimatorFamily",
]
17 changes: 17 additions & 0 deletions foreshadow/utils/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Classes that hold constants in foreshadow."""


class ProblemType:
    """Constants for problem types.

    The values are plain strings, so they compare equal to the literal
    names and can be used anywhere a string problem type is accepted.
    """

    # Supervised learning with a categorical target.
    CLASSIFICATION = "classification"
    # Supervised learning with a continuous target.
    REGRESSION = "regression"


class EstimatorFamily:
    """Constants for estimator families.

    The values are plain strings identifying a group of related
    estimators; the concrete estimator is chosen elsewhere from the
    family together with the problem type.
    """

    # Linear models.
    LINEAR = "linear"
    # Support vector machines.
    SVM = "svm"
    # Random forests.
    RF = "random_forest"
    # Neural networks.
    NN = "neural_network"

0 comments on commit b702b05

Please sign in to comment.