Skip to content

Commit

Permalink
play nice with sklearn by using "train" as model build verb, and res…
Browse files Browse the repository at this point in the history
…erving "fit" for sklearn

* issue warning if user attempts to use "fit" method
  • Loading branch information
spennihana committed Oct 6, 2015
1 parent 39676dd commit 74969ec
Show file tree
Hide file tree
Showing 12 changed files with 126 additions and 70 deletions.
89 changes: 69 additions & 20 deletions h2o-py/h2o/estimators/estimator_base.py
@@ -1,5 +1,6 @@
from ..model.model_base import ModelBase
from ..model import build_model
import inspect, warnings

class EstimatorAttributeError(AttributeError):
def __init__(self,obj,method):
Expand All @@ -9,8 +10,9 @@ def __init__(self,obj,method):
class H2OEstimator(ModelBase):
"""H2O Estimators
H2O Estimators implement the following methods
* fit
H2O Estimators implement the following methods for model construction:
* train - Top-level user-facing API for model building.
* fit - Used by scikit-learn.
Because H2OEstimator instances are instances of ModelBase, these objects can use the
H2O model API.
Expand All @@ -21,11 +23,10 @@ def __init__(self):
self.estimator=None
self.parms=None

def fit(self,X,y=None,training_frame=None,offset_column=None,fold_column=None,weights_column=None,validation_frame=None,**params):
"""Fit the H2O model by specifying the predictor columns, response column, and any
def train(self,X,y=None,training_frame=None,offset_column=None,fold_column=None,weights_column=None,validation_frame=None,**params):
"""Train the H2O model by specifying the predictor columns, response column, and any
additional frame-specific values.
Parameters
----------
X : list
Expand All @@ -47,14 +48,60 @@ def fit(self,X,y=None,training_frame=None,offset_column=None,fold_column=None,we
-------
Returns self.
"""
raise EstimatorAttributeError(self,"fit")
algo_params = locals()
self.parms.update({k:v for k, v in algo_params.iteritems() if k not in ["self","params", "algo_params"] })
y = algo_params["y"]
tframe = algo_params["training_frame"]
if tframe is None: raise ValueError("Missing training_frame")
if y is not None:
self._estimator_type = "classifier" if tframe[y].isfactor() else "regressor"
self.__dict__=build_model(self.parms).__dict__.copy()

def get_params(self, deep=True):

##### Scikit-learn Interface Methods #####
def fit(self, X, y=None, **params):
  """Fit an H2O model as part of a scikit-learn pipeline or grid search.

  A warning will be issued if a caller other than sklearn attempts to use this
  method; `train` is the preferred top-level entry point for model building.

  Parameters
  ----------
  X : H2OFrame
    An H2OFrame consisting of the predictor variables.
  y : H2OFrame, optional
    An H2OFrame consisting of the response variable.
  params : optional
    Extra arguments passed through to `train`.

  Returns
  -------
  None
  """
  # Walk the caller's stack: only suppress the warning if some frame above us
  # belongs to an sklearn module.
  stk = inspect.stack()[1:]
  warn = True
  for s in stk:
    mod = inspect.getmodule(s[0])
    # inspect.getmodule may return None (e.g. frames from exec'd or
    # interactive code); treat such frames as non-sklearn callers instead of
    # crashing on mod.__name__.
    warn = mod is None or "sklearn" not in mod.__name__
    if not warn: break
  if warn:
    warnings.warn("\n\n\t`fit` is not recommended outside of the sklearn framework. Use `train` instead.", UserWarning, stacklevel=2)
  # sklearn passes predictors and response as separate frames; recombine them
  # into a single training frame and reduce X/y to column names for `train`.
  training_frame = X.cbind(y) if y is not None else X
  X = X.names
  y = y.names[0] if y is not None else None
  self.train(X, y, training_frame, **params)

def get_params(self, deep=True):
"""Useful method for obtaining parameters for this estimator. Used primarily for
sklearn Pipelines and sklearn grid search.
:param deep: (Optional) boolean; if True, return parameters of all subobjects that are estimators.
:return: A dict of parameters.
Parameters
----------
deep : bool, optional
If True, return parameters of all sub-objects that are estimators.
Returns
-------
A dict of parameters
"""
out = dict()
for key,value in self.parms.iteritems():
Expand All @@ -65,14 +112,16 @@ def get_params(self, deep=True):
return out

def set_params(self, **parms):
self.parms.update(parms)
return self
"""Used by sklearn for updating parameters during grid search.
def model_build(self, algo_params):
self.parms.update({k:v for k, v in algo_params.iteritems() if k not in ["self","params"] })
y = algo_params["y"]
tframe = algo_params["training_frame"]
if tframe is None: raise ValueError("Missing training_frame")
if y is not None:
self._estimator_type = "classifier" if tframe[y].isfactor() else "regressor"
self.__dict__=build_model(self.parms).__dict__.copy()
Parameters
----------
parms : dict
A dictionary of parameters that will be set on this model.
Returns
-------
Returns self, the current estimator object with the parameters all set as desired.
"""
self.parms.update(parms)
return self
15 changes: 6 additions & 9 deletions h2o-py/h2o/estimators/gbm.py
Expand Up @@ -9,12 +9,12 @@ class H2OGradientBoostingEstimator(H2OEstimator):
Parameters
----------
model_id : str
(Optional) The unique id assigned to the resulting model. If none is given, an id will
model_id : str, optional
The unique id assigned to the resulting model. If none is given, an id will
automatically be generated.
distribution : str
A character string. The distribution function of the response. Must be "AUTO",
"bernoulli", "multinomial", "poisson", "gamma", "tweedie" or "gaussian"
The distribution function of the response. Must be "AUTO", "bernoulli",
"multinomial", "poisson", "gamma", "tweedie" or "gaussian"
tweedie_power : float
Tweedie power (only for Tweedie distribution, must be between 1 and 2)
ntrees : int
Expand All @@ -24,7 +24,7 @@ class H2OGradientBoostingEstimator(H2OEstimator):
min_rows : int
Minimum number of rows to assign to terminal nodes.
learn_rate : float
An integer from 0.0 to 1.0
A value from 0.0 to 1.0
nbins : int
For numerical columns (real/int), build a histogram of (at least) this many bins, then
split at the best point.
Expand Down Expand Up @@ -69,7 +69,4 @@ def __init__(self, model_id=None, distribution=None, tweedie_power=None, ntrees=
self.parms = locals()
self.parms = {k:v for k,v in self.parms.iteritems() if k!="self"}
self.parms["algo"] = "gbm"
self._estimator_type = ""

def fit(self,X,y=None,training_frame=None,offset_column=None,fold_column=None,weights_column=None,validation_frame=None,**params):
self.model_build(locals())
self._estimator_type = ""
48 changes: 29 additions & 19 deletions h2o-py/h2o/estimators/random_forest.py
Expand Up @@ -12,46 +12,56 @@ def __init__(self, model_id=None, mtries=None, sample_rate=None, build_tree_one_
Parameters
----------
model_id : str, optional
The unique id assigned to the resulting model. If none is given, an id will automatically be generated.
The unique id assigned to the resulting model. If none is given, an id will
automatically be generated.
mtries : int
Number of variables randomly sampled as candidates at each split. If set to -1, defaults to sqrt{p} for classification, and p/3 for regression,
where p is the number of predictors.
Number of variables randomly sampled as candidates at each split. If set to -1,
defaults to sqrt{p} for classification, and p/3 for regression, where p is the
number of predictors.
sample_rate : float
Sample rate, from 0 to 1.0.
build_tree_one_node : bool
Run on one node only; no network overhead but fewer cpus used. Suitable for small datasets.
Run on one node only; no network overhead but fewer CPUs used.
Suitable for small datasets.
ntrees : int
A nonnegative integer that determines the number of trees to grow.
A non-negative integer that determines the number of trees to grow.
max_depth : int
Maximum depth to grow the tree.
min_rows : int
Minimum number of rows to assign to terminal nodes.
nbins : int
For numerical columns (real/int), build a histogram of (at least) this many bins, then split at the best point.
For numerical columns (real/int), build a histogram of (at least) this many bins,
then split at the best point.
nbins_top_level : int
For numerical columns (real/int), build a histogram of (at most) this many bins at the root level, then decrease by factor of two per level.
For numerical columns (real/int), build a histogram of (at most) this many bins at
the root level, then decrease by factor of two per level.
nbins_cats : int
For categorical columns (factors), build a histogram of this many bins, then split at the best point. Higher values can lead to more overfitting.
For categorical columns (factors), build a histogram of this many bins, then split
at the best point. Higher values can lead to more overfitting.
binomial_double_trees : bool
or binary classification: Build 2x as many trees (one per class) - can lead to higher accuracy.
For binary classification: Build 2x as many trees (one per class) - can lead to
higher accuracy.
balance_classes : bool
logical, indicates whether or not to balance training data class counts via over/under-sampling (for imbalanced data)
logical, indicates whether or not to balance training data class counts via
over/under-sampling (for imbalanced data)
max_after_balance_size : float
Maximum relative size of the training data after balancing class counts (can be less than 1.0). Ignored if balance_classes is False, which is the default behavior.
Maximum relative size of the training data after balancing class counts
(can be less than 1.0). Ignored if balance_classes is False,
which is the default behavior.
seed : int
Seed for random numbers (affects sampling) - Note: only reproducible when running single threaded
nfolds : int
(Optional) Number of folds for cross-validation. If nfolds >= 2, then validation must remain empty.
Seed for random numbers (affects sampling) - Note: only reproducible when
running single threaded
nfolds : int, optional
Number of folds for cross-validation. If nfolds >= 2, then validation must
remain empty.
fold_assignment : str
Cross-validation fold assignment scheme, if fold_column is not specified Must be "AUTO", "Random" or "Modulo"
Cross-validation fold assignment scheme, if fold_column is not specified.
Must be "AUTO", "Random" or "Modulo"
keep_cross_validation_predictions : bool
Whether to keep the predictions of the cross-validation models
"""
super(H2ORandomForestEstimator, self).__init__()
self.parms = locals()
self.parms = {k:v for k,v in self.parms.iteritems() if k!="self"}
self.parms["algo"] = "drf"
self._estimator_type=""

def fit(self,X,y=None,training_frame=None,offset_column=None,fold_column=None,weights_column=None,validation_frame=None,**params):
self.model_build(locals())
self._estimator_type=""
2 changes: 1 addition & 1 deletion h2o-py/h2o/frame.py
Expand Up @@ -1040,7 +1040,7 @@ def sd(self):
:param na_rm: True or False to remove NAs from computation.
:return: Standard deviation of the H2OVec elements.
"""
return H2OFrame(expr=ExprNode("sd", self))._get()
return H2OFrame(expr=ExprNode("sd", self))._scalar()

def asfactor(self):
"""
Expand Down
22 changes: 11 additions & 11 deletions h2o-py/h2o/h2o.py
Expand Up @@ -45,16 +45,16 @@ def upload_file(path, destination_frame="", header=(-1, 0, 1), sep="", col_names
A path specifying the location of the data to upload.
destination_frame : H2OFrame
The name of the H2O Frame in the H2O Cluster.
header :
(Optional) -1 means the first line is data, 0 means guess, 1 means first line is header.
sep :
(Optional) The field separator character. Values on each line of the file are separated by this character. If sep = "", the parser will automatically detect the separator.
col_names :
(Optional) A list of column names for the file.
col_types :
(Optional) A list of types or a dictionary of column names to types to specify whether columns should be forced to a certain type upon import parsing.
na_strings :
(Optional) A list of strings which are to be interpreted as missing values.
header : int, optional
-1 means the first line is data, 0 means guess, 1 means first line is header.
sep : string, optional
The field separator character. Values on each line of the file are separated by this character. If sep = "", the parser will automatically detect the separator.
col_names : optional
A list of column names for the file.
col_types : optional
A list of types or a dictionary of column names to types to specify whether columns should be forced to a certain type upon import parsing.
na_strings : optional
A list of strings which are to be interpreted as missing values.
:return: A new H2OFrame
"""
Expand Down Expand Up @@ -172,7 +172,7 @@ def parse(setup, h2o_name, first_line_is_header=(-1, 0, 1)):
}

if setup["destination_frame"]:
setup["destination_frame"] = _quoted(setup["destination_frame"]).replace("%",".").replace("&",".")
setup["destination_frame"] = _quoted(setup["destination_frame"]).replace("%",".").replace("&",".") # TODO: really should be url encoding...

if isinstance(first_line_is_header, tuple):
first_line_is_header = setup["check_header"]
Expand Down
2 changes: 1 addition & 1 deletion h2o-py/h2o/transforms/transform_base.py
Expand Up @@ -24,7 +24,7 @@ def transform(self,X,y=None,**params): raise TransformAttributeError(sel
def inverse_transform(self,X,y=None,**params): raise TransformAttributeError(self,"inverse_transform")
def export(self,X,y,**params): raise TransformAttributeError(self,"export")
def fit_transform(self, X, y=None, **params):
return self.fit(X, y, **params).transform(X)
return self.fit(X, y, **params).transform(X, **params)

def get_params(self, deep=True):
"""
Expand Down
Expand Up @@ -10,12 +10,12 @@ def checkpoint_new_category_in_predictor():

m1 = h2o.deeplearning(x=sv1[[0,1,2,4]], y=sv1[3], epochs=100)

m2 = h2o.deeplearning(x=sv2[[0,1,2,4]], y=sv2[3], epochs=200, checkpoint=m1.id)
m2 = h2o.deeplearning(x=sv2[[0,1,2,4]], y=sv2[3], epochs=200, checkpoint=m1.model_id)

# attempt to continue building model, but with an expanded categorical predictor domain.
# this should fail
try:
m3 = h2o.deeplearning(x=vir[[0,1,2,4]], y=vir[3], epochs=200, checkpoint=m1.id)
m3 = h2o.deeplearning(x=vir[[0,1,2,4]], y=vir[3], epochs=200, checkpoint=m1.model_id)
assert False, "Expected continued model-building to fail with new categories introduced in predictor"
except EnvironmentError:
pass
Expand Down
Expand Up @@ -12,7 +12,7 @@ def checkpoint_new_category_in_response():
# attempt to continue building model, but with an expanded categorical response domain.
# this should fail
try:
m2 = h2o.deeplearning(x=iris[[0,1,2,3]], y=iris[4], epochs=200, checkpoint=m1.id)
m2 = h2o.deeplearning(x=iris[[0,1,2,3]], y=iris[4], epochs=200, checkpoint=m1.model_id)
assert False, "Expected continued model-building to fail with new categories introduced in response"
except EnvironmentError:
pass
Expand Down
Expand Up @@ -10,12 +10,12 @@ def checkpoint_new_category_in_predictor():

m1 = h2o.gbm(x=sv1[[0,1,2,4]], y=sv1[3], ntrees=100)

m2 = h2o.gbm(x=sv2[[0,1,2,4]], y=sv2[3], ntrees=200, checkpoint=m1.id)
m2 = h2o.gbm(x=sv2[[0,1,2,4]], y=sv2[3], ntrees=200, checkpoint=m1.model_id)

# attempt to continue building model, but with an expanded categorical predictor domain.
# this should fail until we figure out proper behavior
try:
m3 = h2o.gbm(x=vir[[0,1,2,4]], y=vir[3], ntrees=200, checkpoint=m1.id)
m3 = h2o.gbm(x=vir[[0,1,2,4]], y=vir[3], ntrees=200, checkpoint=m1.model_id)
assert False, "Expected continued model-building to fail with new categories introduced in predictor"
except EnvironmentError:
pass
Expand Down
Expand Up @@ -12,7 +12,7 @@ def checkpoint_new_category_in_response():
# attempt to continue building model, but with an expanded categorical response domain.
# this should fail
try:
m2 = h2o.gbm(x=iris[[0,1,2,3]], y=iris[4], ntrees=200, checkpoint=m1.id)
m2 = h2o.gbm(x=iris[[0,1,2,3]], y=iris[4], ntrees=200, checkpoint=m1.model_id)
assert False, "Expected continued model-building to fail with new categories introduced in response"
except EnvironmentError:
pass
Expand Down
Expand Up @@ -10,12 +10,12 @@ def checkpoint_new_category_in_predictor():

m1 = h2o.random_forest(x=sv1[[0,1,2,4]], y=sv1[3], ntrees=100)

m2 = h2o.random_forest(x=sv2[[0,1,2,4]], y=sv2[3], ntrees=200, checkpoint=m1.id)
m2 = h2o.random_forest(x=sv2[[0,1,2,4]], y=sv2[3], ntrees=200, checkpoint=m1.model_id)

# attempt to continue building model, but with an expanded categorical predictor domain.
# this should fail until we figure out proper behavior
try:
m3 = h2o.random_forest(x=vir[[0,1,2,4]], y=vir[3], ntrees=200, checkpoint=m1.id)
m3 = h2o.random_forest(x=vir[[0,1,2,4]], y=vir[3], ntrees=200, checkpoint=m1.model_id)
assert False, "Expected continued model-building to fail with new categories introduced in predictor"
except EnvironmentError:
pass
Expand Down
2 changes: 1 addition & 1 deletion h2o-py/tests/testdir_jira/pyunit_pubdev_2041.py
Expand Up @@ -13,7 +13,7 @@ def pubdev_2041():
m1 = h2o.deeplearning(x=train1[0:4], y=train1[4], epochs=100)

# update m1 with new training data
m2 = h2o.deeplearning(x=train2[0:4], y=train2[4], epochs=200, checkpoint=m1.id)
m2 = h2o.deeplearning(x=train2[0:4], y=train2[4], epochs=200, checkpoint=m1.model_id)

if __name__ == "__main__":
tests.run_test(sys.argv, pubdev_2041)

0 comments on commit 74969ec

Please sign in to comment.