Skip to content

Commit

Permalink
play nice with sklearn by using "train" as model build verb, and res…
Browse files Browse the repository at this point in the history
…erving "fit" for sklearn

* issue warning if user attempts to use "fit" method
  • Loading branch information
spennihana committed Oct 6, 2015
1 parent 39676dd commit 74969ec
Show file tree
Hide file tree
Showing 12 changed files with 126 additions and 70 deletions.
89 changes: 69 additions & 20 deletions h2o-py/h2o/estimators/estimator_base.py
@@ -1,5 +1,6 @@
from ..model.model_base import ModelBase
from ..model import build_model
import inspect, warnings

class EstimatorAttributeError(AttributeError):
def __init__(self,obj,method):
Expand All @@ -9,8 +10,9 @@ def __init__(self,obj,method):
class H2OEstimator(ModelBase):
"""H2O Estimators
H2O Estimators implement the following methods
* fit
H2O Estimators implement the following methods for model construction:
* train - Top-level user-facing API for model building.
* fit - Used by scikit-learn.
Because H2OEstimator instances are instances of ModelBase, these objects can use the
H2O model API.
Expand All @@ -21,11 +23,10 @@ def __init__(self):
self.estimator=None
self.parms=None

def fit(self,X,y=None,training_frame=None,offset_column=None,fold_column=None,weights_column=None,validation_frame=None,**params):
"""Fit the H2O model by specifying the predictor columns, response column, and any
def train(self,X,y=None,training_frame=None,offset_column=None,fold_column=None,weights_column=None,validation_frame=None,**params):
"""Train the H2O model by specifying the predictor columns, response column, and any
additional frame-specific values.
Parameters
----------
X : list
Expand All @@ -47,14 +48,60 @@ def fit(self,X,y=None,training_frame=None,offset_column=None,fold_column=None,we
-------
Returns self.
"""
raise EstimatorAttributeError(self,"fit")
algo_params = locals()
self.parms.update({k:v for k, v in algo_params.iteritems() if k not in ["self","params", "algo_params"] })
y = algo_params["y"]
tframe = algo_params["training_frame"]
if tframe is None: raise ValueError("Missing training_frame")
if y is not None:
self._estimator_type = "classifier" if tframe[y].isfactor() else "regressor"
self.__dict__=build_model(self.parms).__dict__.copy()

def get_params(self, deep=True):

##### Scikit-learn Interface Methods #####
def fit(self, X, y=None, **params):
  """Fit an H2O model as part of a scikit-learn pipeline or grid search.

  A warning will be issued if a caller other than sklearn attempts to use this
  method; `train` is the preferred top-level entry point for model building.

  Parameters
  ----------
  X : H2OFrame
    An H2OFrame consisting of the predictor variables.
  y : H2OFrame, optional
    An H2OFrame consisting of the response variable.
  params : optional
    Extra arguments passed through to `train`.

  Returns
  -------
  None
  """
  # Walk the caller's stack: only suppress the warning if some frame above us
  # belongs to an sklearn module.
  stk = inspect.stack()[1:]
  warn = True
  for s in stk:
    mod = inspect.getmodule(s[0])
    # inspect.getmodule may return None (e.g. frames from exec'd or
    # interactive code); treat such frames as non-sklearn callers instead of
    # crashing on mod.__name__.
    warn = mod is None or "sklearn" not in mod.__name__
    if not warn: break
  if warn:
    warnings.warn("\n\n\t`fit` is not recommended outside of the sklearn framework. Use `train` instead.", UserWarning, stacklevel=2)
  # sklearn passes predictors and response as separate frames; recombine them
  # into a single training frame and reduce X/y to column names for `train`.
  training_frame = X.cbind(y) if y is not None else X
  X = X.names
  y = y.names[0] if y is not None else None
  self.train(X, y, training_frame, **params)

def get_params(self, deep=True):
"""Useful method for obtaining parameters for this estimator. Used primarily for
sklearn Pipelines and sklearn grid search.
:param deep: (Optional) boolean; if True, return parameters of all subobjects that are estimators.
:return: A dict of parameters.
Parameters
----------
deep : bool, optional
If True, return parameters of all sub-objects that are estimators.
Returns
-------
A dict of parameters
"""
out = dict()
for key,value in self.parms.iteritems():
Expand All @@ -65,14 +112,16 @@ def get_params(self, deep=True):
return out

def set_params(self, **parms):
self.parms.update(parms)
return self
"""Used by sklearn for updating parameters during grid search.
def model_build(self, algo_params):
self.parms.update({k:v for k, v in algo_params.iteritems() if k not in ["self","params"] })
y = algo_params["y"]
tframe = algo_params["training_frame"]
if tframe is None: raise ValueError("Missing training_frame")
if y is not None:
self._estimator_type = "classifier" if tframe[y].isfactor() else "regressor"
self.__dict__=build_model(self.parms).__dict__.copy()
Parameters
----------
parms : dict
A dictionary of parameters that will be set on this model.
Returns
-------
Returns self, the current estimator object with the parameters all set as desired.
"""
self.parms.update(parms)
return self
15 changes: 6 additions & 9 deletions h2o-py/h2o/estimators/gbm.py
Expand Up @@ -9,12 +9,12 @@ class H2OGradientBoostingEstimator(H2OEstimator):
Parameters
----------
model_id : str
(Optional) The unique id assigned to the resulting model. If none is given, an id will
model_id : str, optional
The unique id assigned to the resulting model. If none is given, an id will
automatically be generated.
distribution : str
A character string. The distribution function of the response. Must be "AUTO",
"bernoulli", "multinomial", "poisson", "gamma", "tweedie" or "gaussian"
The distribution function of the response. Must be "AUTO", "bernoulli",
"multinomial", "poisson", "gamma", "tweedie" or "gaussian"
tweedie_power : float
Tweedie power (only for Tweedie distribution, must be between 1 and 2)
ntrees : int
Expand All @@ -24,7 +24,7 @@ class H2OGradientBoostingEstimator(H2OEstimator):
min_rows : int
Minimum number of rows to assign to terminal nodes.
learn_rate : float
An integer from 0.0 to 1.0
A value from 0.0 to 1.0
nbins : int
For numerical columns (real/int), build a histogram of (at least) this many bins, then
split at the best point.
Expand Down Expand Up @@ -69,7 +69,4 @@ def __init__(self, model_id=None, distribution=None, tweedie_power=None, ntrees=
self.parms = locals()
self.parms = {k:v for k,v in self.parms.iteritems() if k!="self"}
self.parms["algo"] = "gbm"
self._estimator_type = ""

def fit(self,X,y=None,training_frame=None,offset_column=None,fold_column=None,weights_column=None,validation_frame=None,**params):
self.model_build(locals())
self._estimator_type = ""
48 changes: 29 additions & 19 deletions h2o-py/h2o/estimators/random_forest.py
Expand Up @@ -12,46 +12,56 @@ def __init__(self, model_id=None, mtries=None, sample_rate=None, build_tree_one_
Parameters
----------
model_id : str, optional
The unique id assigned to the resulting model. If none is given, an id will automatically be generated.
The unique id assigned to the resulting model. If none is given, an id will
automatically be generated.
mtries : int
Number of variables randomly sampled as candidates at each split. If set to -1, defaults to sqrt{p} for classification, and p/3 for regression,
where p is the number of predictors.
Number of variables randomly sampled as candidates at each split. If set to -1,
defaults to sqrt{p} for classification, and p/3 for regression, where p is the
number of predictors.
sample_rate : float
Sample rate, from 0 to 1.0.
build_tree_one_node : bool
Run on one node only; no network overhead but fewer cpus used. Suitable for small datasets.
Run on one node only; no network overhead but fewer CPUs used.
Suitable for small datasets.
ntrees : int
A nonnegative integer that determines the number of trees to grow.
A non-negative integer that determines the number of trees to grow.
max_depth : int
Maximum depth to grow the tree.
min_rows : int
Minimum number of rows to assign to terminal nodes.
nbins : int
For numerical columns (real/int), build a histogram of (at least) this many bins, then split at the best point.
For numerical columns (real/int), build a histogram of (at least) this many bins,
then split at the best point.
nbins_top_level : int
For numerical columns (real/int), build a histogram of (at most) this many bins at the root level, then decrease by factor of two per level.
For numerical columns (real/int), build a histogram of (at most) this many bins at
the root level, then decrease by factor of two per level.
nbins_cats : int
For categorical columns (factors), build a histogram of this many bins, then split at the best point. Higher values can lead to more overfitting.
For categorical columns (factors), build a histogram of this many bins, then split
at the best point. Higher values can lead to more overfitting.
binomial_double_trees : bool
or binary classification: Build 2x as many trees (one per class) - can lead to higher accuracy.
For binary classification: Build 2x as many trees (one per class) - can lead to
higher accuracy.
balance_classes : bool
logical, indicates whether or not to balance training data class counts via over/under-sampling (for imbalanced data)
logical, indicates whether or not to balance training data class counts via
over/under-sampling (for imbalanced data)
max_after_balance_size : float
Maximum relative size of the training data after balancing class counts (can be less than 1.0). Ignored if balance_classes is False, which is the default behavior.
Maximum relative size of the training data after balancing class counts
(can be less than 1.0). Ignored if balance_classes is False,
which is the default behavior.
seed : int
Seed for random numbers (affects sampling) - Note: only reproducible when running single threaded
nfolds : int
(Optional) Number of folds for cross-validation. If nfolds >= 2, then validation must remain empty.
Seed for random numbers (affects sampling) - Note: only reproducible when
running single threaded
nfolds : int, optional
Number of folds for cross-validation. If nfolds >= 2, then validation must
remain empty.
fold_assignment : str
Cross-validation fold assignment scheme, if fold_column is not specified Must be "AUTO", "Random" or "Modulo"
Cross-validation fold assignment scheme, if fold_column is not specified.
Must be "AUTO", "Random" or "Modulo"
keep_cross_validation_predictions : bool
Whether to keep the predictions of the cross-validation models
"""
super(H2ORandomForestEstimator, self).__init__()
self.parms = locals()
self.parms = {k:v for k,v in self.parms.iteritems() if k!="self"}
self.parms["algo"] = "drf"
self._estimator_type=""

def fit(self,X,y=None,training_frame=None,offset_column=None,fold_column=None,weights_column=None,validation_frame=None,**params):
self.model_build(locals())
self._estimator_type=""
2 changes: 1 addition & 1 deletion h2o-py/h2o/frame.py
Expand Up @@ -1040,7 +1040,7 @@ def sd(self):
:param na_rm: True or False to remove NAs from computation.
:return: Standard deviation of the H2OVec elements.
"""
return H2OFrame(expr=ExprNode("sd", self))._get()
return H2OFrame(expr=ExprNode("sd", self))._scalar()

def asfactor(self):
"""
Expand Down
22 changes: 11 additions & 11 deletions h2o-py/h2o/h2o.py
Expand Up @@ -45,16 +45,16 @@ def upload_file(path, destination_frame="", header=(-1, 0, 1), sep="", col_names
A path specifying the location of the data to upload.
destination_frame : H2OFrame
The name of the H2O Frame in the H2O Cluster.
header :
(Optional) -1 means the first line is data, 0 means guess, 1 means first line is header.
sep :
(Optional) The field separator character. Values on each line of the file are separated by this character. If sep = "", the parser will automatically detect the separator.
col_names :
(Optional) A list of column names for the file.
col_types :
(Optional) A list of types or a dictionary of column names to types to specify whether columns should be forced to a certain type upon import parsing.
na_strings :
(Optional) A list of strings which are to be interpreted as missing values.
header : int, optional
-1 means the first line is data, 0 means guess, 1 means first line is header.
sep : string, optional
The field separator character. Values on each line of the file are separated by this character. If sep = "", the parser will automatically detect the separator.
col_names : optional
A list of column names for the file.
col_types : optional
A list of types or a dictionary of column names to types to specify whether columns should be forced to a certain type upon import parsing.
na_strings : optional
A list of strings which are to be interpreted as missing values.
:return: A new H2OFrame
"""
Expand Down Expand Up @@ -172,7 +172,7 @@ def parse(setup, h2o_name, first_line_is_header=(-1, 0, 1)):
}

if setup["destination_frame"]:
setup["destination_frame"] = _quoted(setup["destination_frame"]).replace("%",".").replace("&",".")
setup["destination_frame"] = _quoted(setup["destination_frame"]).replace("%",".").replace("&",".") # TODO: really should be url encoding...

if isinstance(first_line_is_header, tuple):
first_line_is_header = setup["check_header"]
Expand Down
2 changes: 1 addition & 1 deletion h2o-py/h2o/transforms/transform_base.py
Expand Up @@ -24,7 +24,7 @@ def transform(self,X,y=None,**params): raise TransformAttributeError(sel
def inverse_transform(self,X,y=None,**params): raise TransformAttributeError(self,"inverse_transform")
def export(self,X,y,**params): raise TransformAttributeError(self,"export")
def fit_transform(self, X, y=None, **params):
return self.fit(X, y, **params).transform(X)
return self.fit(X, y, **params).transform(X, **params)

def get_params(self, deep=True):
"""
Expand Down
Expand Up @@ -10,12 +10,12 @@ def checkpoint_new_category_in_predictor():

m1 = h2o.deeplearning(x=sv1[[0,1,2,4]], y=sv1[3], epochs=100)

m2 = h2o.deeplearning(x=sv2[[0,1,2,4]], y=sv2[3], epochs=200, checkpoint=m1.id)
m2 = h2o.deeplearning(x=sv2[[0,1,2,4]], y=sv2[3], epochs=200, checkpoint=m1.model_id)

# attempt to continue building model, but with an expanded categorical predictor domain.
# this should fail
try:
m3 = h2o.deeplearning(x=vir[[0,1,2,4]], y=vir[3], epochs=200, checkpoint=m1.id)
m3 = h2o.deeplearning(x=vir[[0,1,2,4]], y=vir[3], epochs=200, checkpoint=m1.model_id)
assert False, "Expected continued model-building to fail with new categories introduced in predictor"
except EnvironmentError:
pass
Expand Down
Expand Up @@ -12,7 +12,7 @@ def checkpoint_new_category_in_response():
# attempt to continue building model, but with an expanded categorical response domain.
# this should fail
try:
m2 = h2o.deeplearning(x=iris[[0,1,2,3]], y=iris[4], epochs=200, checkpoint=m1.id)
m2 = h2o.deeplearning(x=iris[[0,1,2,3]], y=iris[4], epochs=200, checkpoint=m1.model_id)
assert False, "Expected continued model-building to fail with new categories introduced in response"
except EnvironmentError:
pass
Expand Down
Expand Up @@ -10,12 +10,12 @@ def checkpoint_new_category_in_predictor():

m1 = h2o.gbm(x=sv1[[0,1,2,4]], y=sv1[3], ntrees=100)

m2 = h2o.gbm(x=sv2[[0,1,2,4]], y=sv2[3], ntrees=200, checkpoint=m1.id)
m2 = h2o.gbm(x=sv2[[0,1,2,4]], y=sv2[3], ntrees=200, checkpoint=m1.model_id)

# attempt to continue building model, but with an expanded categorical predictor domain.
# this should fail until we figure out proper behavior
try:
m3 = h2o.gbm(x=vir[[0,1,2,4]], y=vir[3], ntrees=200, checkpoint=m1.id)
m3 = h2o.gbm(x=vir[[0,1,2,4]], y=vir[3], ntrees=200, checkpoint=m1.model_id)
assert False, "Expected continued model-building to fail with new categories introduced in predictor"
except EnvironmentError:
pass
Expand Down
Expand Up @@ -12,7 +12,7 @@ def checkpoint_new_category_in_response():
# attempt to continue building model, but with an expanded categorical response domain.
# this should fail
try:
m2 = h2o.gbm(x=iris[[0,1,2,3]], y=iris[4], ntrees=200, checkpoint=m1.id)
m2 = h2o.gbm(x=iris[[0,1,2,3]], y=iris[4], ntrees=200, checkpoint=m1.model_id)
assert False, "Expected continued model-building to fail with new categories introduced in response"
except EnvironmentError:
pass
Expand Down
Expand Up @@ -10,12 +10,12 @@ def checkpoint_new_category_in_predictor():

m1 = h2o.random_forest(x=sv1[[0,1,2,4]], y=sv1[3], ntrees=100)

m2 = h2o.random_forest(x=sv2[[0,1,2,4]], y=sv2[3], ntrees=200, checkpoint=m1.id)
m2 = h2o.random_forest(x=sv2[[0,1,2,4]], y=sv2[3], ntrees=200, checkpoint=m1.model_id)

# attempt to continue building model, but with an expanded categorical predictor domain.
# this should fail until we figure out proper behavior
try:
m3 = h2o.random_forest(x=vir[[0,1,2,4]], y=vir[3], ntrees=200, checkpoint=m1.id)
m3 = h2o.random_forest(x=vir[[0,1,2,4]], y=vir[3], ntrees=200, checkpoint=m1.model_id)
assert False, "Expected continued model-building to fail with new categories introduced in predictor"
except EnvironmentError:
pass
Expand Down
2 changes: 1 addition & 1 deletion h2o-py/tests/testdir_jira/pyunit_pubdev_2041.py
Expand Up @@ -13,7 +13,7 @@ def pubdev_2041():
m1 = h2o.deeplearning(x=train1[0:4], y=train1[4], epochs=100)

# update m1 with new training data
m2 = h2o.deeplearning(x=train2[0:4], y=train2[4], epochs=200, checkpoint=m1.id)
m2 = h2o.deeplearning(x=train2[0:4], y=train2[4], epochs=200, checkpoint=m1.model_id)

if __name__ == "__main__":
tests.run_test(sys.argv, pubdev_2041)

0 comments on commit 74969ec

Please sign in to comment.