Support more models in level 1 default mode (#168)

* Adding console functionality to run different modeling algorithms
georgian-io-archive · Nov 5, 2019 · b702b05 · b702b05
1 parent 7c12266
commit b702b05
Show file tree

Hide file tree

Showing 8 changed files with 960 additions and 20 deletions.
diff --git a/foreshadow/console.py b/foreshadow/console.py
@@ -12,6 +12,7 @@
 from foreshadow.config import config
 from foreshadow.estimators import AutoEstimator
 from foreshadow.foreshadow import Foreshadow
+from foreshadow.utils import EstimatorFactory, EstimatorFamily, ProblemType
 from foreshadow.logging import logging
 
 
@@ -36,9 +37,9 @@ def process_argument(args):  # noqa: C901
     )
     parser.add_argument(
         "problem_type",
-        default="classification",
+        default=ProblemType.CLASSIFICATION,
         type=str,
-        choices=["classification", "regression"],
+        choices=[ProblemType.CLASSIFICATION, ProblemType.REGRESSION],
         help="Problem type, choosing from classification or regression, "
         "default to classification.",
     )
@@ -65,6 +66,22 @@ def process_argument(args):  # noqa: C901
         "Defaults to LogisticRegression for classification "
         "and LinearRegression for regression",
     )
+    parser.add_argument(
+        "--family",
+        default=EstimatorFamily.LINEAR,
+        type=str,
+        choices=[
+            EstimatorFamily.LINEAR,
+            EstimatorFamily.SVM,
+            EstimatorFamily.RF,
+            EstimatorFamily.NN,
+        ],
+        help="The algorithm family in Sklearn to train the model. Linear "
+        "includes LinearRegression and LogisticRegression. SVM includes "
+        "LinearSVC and LinearSVR. RF includes RandomForestClassifier "
+        "and RandomForestRegressor. NN includes MLPClassifier and "
+        "MLPRegressor.",
+    )
     parser.add_argument(
         "--time",
         default=10,
@@ -136,7 +153,9 @@ def generate_model(args):  # noqa: C901
     if cargs.level == 1:
         # Default everything with basic estimator
         fs = Foreshadow(
-            estimator=get_method(cargs.method, cargs.problem_type, y_train)
+            estimator=get_method(
+                cargs.method, y_train, cargs.family, cargs.problem_type
+            )
         )
 
     # elif cargs.level == 2:
@@ -184,13 +203,13 @@ def generate_model(args):  # noqa: C901
         estimator = AutoEstimator(problem_type=cargs.problem_type, auto="tpot")
         estimator.configure_estimator(y_train)
 
-        # TODO move this into the configure_estimator method
-        # TODO "max_time_mins" is an argument for the TPOT library. We cannot
-        # TODO assign it based on the problem type here. For testing purpose,
-        # TODO I'm going to hardcode it for TPOT.
+        # TODO move this into the configure_estimator method.
+        #  "max_time_mins" is an argument for the TPOT library. We cannot
+        #  assign it based on the problem type here. For testing purposes,
+        #  I'm going to hardcode it for TPOT.
         # kwargs = (
         #     "max_time_mins"
-        #     if estimator.problem_type == "regression"
+        #     if estimator.problem_type == ProblemType.REGRESSION
         #     else "time_left_for_this_task"
         # )
         kwargs = "max_time_mins"
@@ -265,17 +284,23 @@ def cmd():  # pragma: no cover
     execute_model(*model)
 
 
-def get_method(method, problem_type, y_train):
+def get_method(
+    method,
+    y_train,
+    family=EstimatorFamily.LINEAR,
+    problem_type=ProblemType.CLASSIFICATION,
+):
     """Determine what estimator to use.
 
     Uses set of X data and a passed argument referencing a
     `BaseEstimator <sklearn.base.BaseEstimator>` class.
 
     Args:
         method (str): model name
-        problem_type (str): problem type, classification or regression
         y_train (:obj:`DataFrame <pandas.DataFrame>`): The response variable
             data.
+        family (str): the algorithm family, an EstimatorFamily constant
+        problem_type (str): problem type, classification or regression
 
     Returns:
         Estimator
@@ -297,10 +322,14 @@ def get_method(method, problem_type, y_train):
                 "estimator from sklearn.linear_model".format(method)
             )
     else:
-        return (
-            LinearRegression()
-            if problem_type == "regression"
-            else LogisticRegression()
+        # return (
+        #     LinearRegression()
+        #     if problem_type == ProblemType.REGRESSION
+        #     else LogisticRegression()
+        # )
+        estimator_factory = EstimatorFactory()
+        return estimator_factory.get_estimator(
+            family=family, problem_type=problem_type
         )
 
 

diff --git a/foreshadow/tests/data/breast_cancer.csv b/foreshadow/tests/data/breast_cancer.csv
diff --git a/foreshadow/tests/test_console.py b/foreshadow/tests/test_console.py
@@ -1,5 +1,10 @@
 import pytest
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+from sklearn.linear_model import LinearRegression, LogisticRegression
+from sklearn.neural_network import MLPClassifier, MLPRegressor
+from sklearn.svm import LinearSVC, LinearSVR
 
+from foreshadow.utils import EstimatorFamily, ProblemType
 from foreshadow.utils.testing import get_file_path
 
 
@@ -212,8 +217,7 @@ def test_console_execute():
     )
 
 
-@pytest.mark.skip("console broken until parametrization is implemented")
-def test_console_get_method_default():
+def test_console_get_method_default_regression():
     import pandas as pd
 
     from foreshadow.console import get_method
@@ -230,13 +234,47 @@ def test_console_get_method_default():
         X_df, y_df, test_size=0.2
     )
 
-    result = get_method(None, X_train)
+    result = get_method(
+        None,
+        y_train,
+        family=EstimatorFamily.LINEAR,
+        problem_type=ProblemType.REGRESSION,
+    )
 
     assert isinstance(result, LinearRegression)
 
 
-@pytest.mark.skip("console broken until parametrization is implemented")
+def test_console_get_method_default_classification():
+    import pandas as pd
+
+    from foreshadow.console import get_method
+
+    from sklearn.datasets import load_breast_cancer
+    from sklearn.model_selection import train_test_split
+    from sklearn.linear_model import LogisticRegression
+
+    cancer = load_breast_cancer()
+    X_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
+    y_df = pd.DataFrame(cancer.target, columns=["target"])
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X_df, y_df, test_size=0.2
+    )
+
+    result = get_method(
+        None,
+        y_train,
+        family=EstimatorFamily.LINEAR,
+        problem_type=ProblemType.CLASSIFICATION,
+    )
+
+    assert isinstance(result, LogisticRegression)
+
+
 def test_console_get_method_override():
+    # TODO may not need this method in the future if we decide to not
+    #  allow the user to provide method override as it opens a lot of
+    #  potential issues.
     from foreshadow.console import get_method
 
     from sklearn.linear_model import LogisticRegression
@@ -246,8 +284,8 @@ def test_console_get_method_override():
     assert isinstance(result, LogisticRegression)
 
 
-@pytest.mark.skip("console broken until parametrization is implemented")
 def test_console_get_method_error():
+    # TODO may not need this test. Same reason as above.
     from foreshadow.console import get_method
 
     with pytest.raises(ValueError) as e:
@@ -256,6 +294,82 @@ def test_console_get_method_error():
     assert "Invalid method." in str(e.value)
 
 
+@pytest.mark.parametrize(
+    "filename, family, y_var, problem_type, estimator",
+    [
+        (
+            "breast_cancer.csv",
+            EstimatorFamily.LINEAR,
+            "target",
+            ProblemType.CLASSIFICATION,
+            LogisticRegression,
+        ),
+        (
+            "boston_housing.csv",
+            EstimatorFamily.LINEAR,
+            "medv",
+            ProblemType.REGRESSION,
+            LinearRegression,
+        ),
+        (
+            "breast_cancer.csv",
+            EstimatorFamily.SVM,
+            "target",
+            ProblemType.CLASSIFICATION,
+            LinearSVC,
+        ),
+        (
+            "boston_housing.csv",
+            EstimatorFamily.SVM,
+            "medv",
+            ProblemType.REGRESSION,
+            LinearSVR,
+        ),
+        (
+            "breast_cancer.csv",
+            EstimatorFamily.RF,
+            "target",
+            ProblemType.CLASSIFICATION,
+            RandomForestClassifier,
+        ),
+        (
+            "boston_housing.csv",
+            EstimatorFamily.RF,
+            "medv",
+            ProblemType.REGRESSION,
+            RandomForestRegressor,
+        ),
+        (
+            "breast_cancer.csv",
+            EstimatorFamily.NN,
+            "target",
+            ProblemType.CLASSIFICATION,
+            MLPClassifier,
+        ),
+        (
+            "boston_housing.csv",
+            EstimatorFamily.NN,
+            "medv",
+            ProblemType.REGRESSION,
+            MLPRegressor,
+        ),
+    ],
+)
+def test_console_generate_and_execute_model(
+    filename, family, y_var, problem_type, estimator
+):
+    from foreshadow.console import generate_model, execute_model
+
+    data_path = get_file_path("data", filename)
+
+    args = ["--family", family, data_path, y_var, problem_type]
+
+    model = generate_model(args)
+
+    assert isinstance(model[0].estimator.estimator, estimator)
+
+    execute_model(*model)
+
 def test_console_parse_args_multiprocess():
     from foreshadow.console import process_argument
 

diff --git a/foreshadow/tests/test_foreshadow.py b/foreshadow/tests/test_foreshadow.py
@@ -729,7 +729,6 @@ def test_foreshadow_serialization_adults_small_classification():
     )
 
     shadow = Foreshadow(estimator=LogisticRegression())
-
     shadow.fit(X_train, y_train)
     shadow.to_json("foreshadow_adults_small_logistic_regression.json")
 

diff --git a/foreshadow/tests/test_utils.py b/foreshadow/tests/test_utils.py
@@ -1,4 +1,10 @@
 import pytest
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+from sklearn.linear_model import LinearRegression, LogisticRegression
+from sklearn.neural_network import MLPClassifier, MLPRegressor
+from sklearn.svm import LinearSVC, LinearSVR
+
+from foreshadow.utils import EstimatorFamily, ProblemType
 
 
 def test_check_df_passthrough():
@@ -115,3 +121,48 @@ def test_is_wrapped(transformer_name):
 
     assert not is_wrapped(sk_tf)
     assert is_wrapped(fs_tf)
+
+
+@pytest.mark.parametrize(
+    "family, problem_type, estimator",
+    [
+        (
+            EstimatorFamily.LINEAR,
+            ProblemType.CLASSIFICATION,
+            LogisticRegression,
+        ),
+        (EstimatorFamily.LINEAR, ProblemType.REGRESSION, LinearRegression),
+        (EstimatorFamily.SVM, ProblemType.CLASSIFICATION, LinearSVC),
+        (EstimatorFamily.SVM, ProblemType.REGRESSION, LinearSVR),
+        (
+            EstimatorFamily.RF,
+            ProblemType.CLASSIFICATION,
+            RandomForestClassifier,
+        ),
+        (EstimatorFamily.RF, ProblemType.REGRESSION, RandomForestRegressor),
+        (EstimatorFamily.NN, ProblemType.CLASSIFICATION, MLPClassifier),
+        (EstimatorFamily.NN, ProblemType.REGRESSION, MLPRegressor),
+    ],
+)
+def test_get_estimator(family, problem_type, estimator):
+    from foreshadow.utils import EstimatorFactory
+
+    estimator_factory = EstimatorFactory()
+    assert isinstance(
+        estimator_factory.get_estimator(family, problem_type), estimator
+    )
+
+
+@pytest.mark.parametrize(
+    "family, problem_type, exception",
+    [
+        ("Unknown", ProblemType.CLASSIFICATION, pytest.raises(KeyError)),
+        (EstimatorFamily.LINEAR, "cluster", pytest.raises(KeyError)),
+    ],
+)
+def test_get_estimator_exception(family, problem_type, exception):
+    from foreshadow.utils import EstimatorFactory
+
+    estimator_factory = EstimatorFactory()
+    with exception:
+        estimator_factory.get_estimator(family, problem_type)
diff --git a/foreshadow/utils/__init__.py b/foreshadow/utils/__init__.py
@@ -7,11 +7,13 @@
     get_config_path,
     get_transformer,
 )
+from foreshadow.utils.constants import EstimatorFamily, ProblemType
 from foreshadow.utils.data_summary import (
     get_outliers,
     mode_freq,
     standard_col_summary,
 )
+from foreshadow.utils.default_estimator_factory import EstimatorFactory
 from foreshadow.utils.testing import dynamic_import
 from foreshadow.utils.validation import (
     PipelineStep,
@@ -40,4 +42,7 @@
     "get_outliers",
     "standard_col_summary",
     "ConfigureColumnSharerMixin",
+    "EstimatorFactory",
+    "ProblemType",
+    "EstimatorFamily",
 ]
diff --git a/foreshadow/utils/constants.py b/foreshadow/utils/constants.py
@@ -0,0 +1,17 @@
+"""Classes that hold constants in foreshadow."""
+
+
+class ProblemType:
+    """Constants for problem types."""
+
+    CLASSIFICATION = "classification"
+    REGRESSION = "regression"
+
+
+class EstimatorFamily:
+    """Constants for estimator families."""
+
+    LINEAR = "linear"
+    SVM = "svm"
+    RF = "random_forest"
+    NN = "neural_network"