Skip to content

Commit

Permalink
Merge pull request #4078 from h2oai/zuzanao_PUBDEV-6754-Add_method_to…
Browse files Browse the repository at this point in the history
…_provide_actual_ntree_value

PUBDEV-6754: provide actual ntree value
  • Loading branch information
koniecsveta committed Nov 22, 2019
2 parents b5232f1 + 183e075 commit 4a3f679
Show file tree
Hide file tree
Showing 4 changed files with 168 additions and 0 deletions.
14 changes: 14 additions & 0 deletions h2o-py/h2o/model/model_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,20 @@ def scoring_history(self):
print("No score history for this model")


def ntrees_actual(self):
    """
    Return the actual number of trees built for a tree-based model.

    With early stopping enabled, training can finish before the requested ``ntrees`` is reached, so the
    value reported here may be lower than the ``ntrees`` value the user set before building the model.

    Supported algorithms: ``gbm``, ``drf``, ``isolationforest`` and ``xgboost``.

    :returns: the first entry of the ``number_of_trees`` column of the model summary. ``None`` is returned
        (after printing a notice) when the model is not one of the supported tree algorithms or when
        ``summary()`` yields no table.
    """
    tree_algos = ('gbm', 'drf', 'isolationforest', 'xgboost')
    if self._model_json["algo"] in tree_algos:
        model_summary = self.summary()
        # Guard against a missing summary so the caller gets the documented ``None`` instead of a TypeError.
        if model_summary is not None:
            return model_summary['number_of_trees'][0]
    print("No actual number of trees for this model")
    return None


def cross_validation_metrics_summary(self):
"""
Retrieve Cross-Validation Metrics Summary.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from builtins import range
import sys
sys.path.insert(1,"../../../")
import h2o
from tests import pyunit_utils
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.isolation_forest import H2OIsolationForestEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.xgboost import H2OXGBoostEstimator

def tree_algos_ntree_actual():
    """
    Train GBM, Isolation Forest, Random Forest and XGBoost models with early stopping on the prostate
    dataset and check that ``ntrees_actual()`` is below the requested ``ntrees`` and agrees with the
    ``number_of_trees`` entry of the model summary.
    """
    requested_ntrees = 1000

    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    frame[1] = frame[1].asfactor()
    frame.summary()

    def train_and_verify(label, estimator):
        # Train on columns 2..8 with column 1 as the response, then report and check the tree count.
        estimator.train(x=list(range(2, 9)), y=1, training_frame=frame)

        print("\n")
        print(label + ": number of trees set by user before building the model is:")
        print(requested_ntrees)
        print(label + ": number of trees built with early-stopping is:")
        print(estimator.ntrees_actual())

        # Early stopping must have kicked in before the requested number of trees was built.
        assert estimator.ntrees_actual() < requested_ntrees
        # The accessor must agree with both the raw model JSON and the summary table.
        assert (estimator.ntrees_actual()
                == estimator._model_json['output']['model_summary']['number_of_trees'][0]
                == estimator.summary()['number_of_trees'][0])

    train_and_verify(
        "GradientBoosting",
        H2OGradientBoostingEstimator(nfolds=5, ntrees=requested_ntrees, distribution="bernoulli",
                                     stopping_metric="MSE", stopping_tolerance=0.01, stopping_rounds=5))

    train_and_verify(
        "IsolationForest",
        H2OIsolationForestEstimator(sample_rate=0.1, max_depth=20, ntrees=requested_ntrees,
                                    stopping_metric="anomalyscore", stopping_tolerance=0.01,
                                    stopping_rounds=5))

    train_and_verify(
        "RandomForest",
        H2ORandomForestEstimator(ntrees=requested_ntrees, max_depth=20, min_rows=10,
                                 stopping_metric="auc", stopping_tolerance=0.01, stopping_rounds=5))

    train_and_verify(
        "XGBoost",
        H2OXGBoostEstimator(distribution="auto", ntrees=requested_ntrees, seed=1,
                            stopping_metric="auc", stopping_tolerance=0.01, stopping_rounds=5))

# Entry point: when this file is executed directly, the test function is handed to
# pyunit_utils.standalone_test; when the module is imported instead, the test is called immediately.
if __name__ == "__main__":
    pyunit_utils.standalone_test(tree_algos_ntree_actual)
else:
    tree_algos_ntree_actual()
22 changes: 22 additions & 0 deletions h2o-r/h2o-package/R/models.R
Original file line number Diff line number Diff line change
Expand Up @@ -1842,6 +1842,28 @@ h2o.scoreHistory <- function(object) {
}
}

#'
#' Retrieve actual number of trees for tree algorithms
#'
#' Reads the \code{number_of_trees} column of the model summary for models whose
#' algorithm is one of \code{gbm}, \code{drf}, \code{isolationforest} or \code{xgboost}.
#' With early stopping the value can be lower than the \code{ntrees} requested
#' when the model was built.
#'
#' @param object An \linkS4class{H2OModel} object.
#' @return The content of the \code{number_of_trees} column of the model summary,
#'   or \code{NULL} (with a warning) when \code{object} is not an
#'   \linkS4class{H2OModel} or its algorithm is not one of the supported tree
#'   algorithms.
#' @export
h2o.get_ntrees_actual <- function(object) {
  if (!is(object, "H2OModel")) {
    # class() may return several names; collapse them so the warning is a single readable message.
    warning(paste0("No actual number of trees for ", paste(class(object), collapse = "/")))
    return(NULL)
  }

  tree_algos <- c("gbm", "drf", "isolationforest", "xgboost")
  if (!(object@algorithm %in% tree_algos)) {
    warning("No actual number of trees for this model")
    return(NULL)
  }

  # Subsetting a missing (NULL) summary yields NULL, which is returned as-is.
  object@model$model_summary['number_of_trees'][,1]
}

#'
#' Retrieve the respective weight matrix
#'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source("../../../scripts/h2o-r-test-setup.R")



# Trains GBM, Isolation Forest, Random Forest and XGBoost models with early stopping on the
# prostate data and checks that h2o.get_ntrees_actual() reports fewer trees than requested
# and matches the number_of_trees column of the model summary.
test.tree_algos.bernoulli <- function() {
  Log.info("Importing prostate.csv data...\n")
  prostate.hex <- h2o.uploadFile(locate("smalldata/logreg/prostate.csv"), destination_frame="prostate.hex")
  Log.info("Converting CAPSULE and RACE columns to factors...\n")
  prostate.hex$CAPSULE <- as.factor(prostate.hex$CAPSULE)
  prostate.hex$RACE <- as.factor(prostate.hex$RACE)
  Log.info("Summary of prostate.csv from H2O:\n")
  print(summary(prostate.hex))

  # Import csv data for R to use in comparison
  prostate.data <- read.csv(locate("smalldata/logreg/prostate.csv"), header = TRUE)
  prostate.data$RACE <- as.factor(prostate.data$RACE)
  Log.info("Summary of prostate.csv from R:\n")
  print(summary(prostate.data))

  ntrees <- 1000

  # Logs requested vs. actual tree counts for one model and runs the two expectations on it.
  check_ntrees_actual <- function(label, model) {
    actual <- h2o.get_ntrees_actual(model)
    Log.info(paste0(label, ": number of trees set by user before building the model is:"))
    print(ntrees)
    Log.info(paste0(label, ": number of trees built with early-stopping is:"))
    print(actual)

    expect_true(actual < ntrees)
    expect_equal(actual, model@model$model_summary['number_of_trees'][,1])
  }

  # Train H2O GBM Model:
  Log.info(paste0("H2O GBM with parameters:\nnfolds = 5, distribution = 'bernoulli', ntrees = ", ntrees, ", stopping_metric=\"MSE\", stopping_tolerance=0.01, stopping_rounds=5\n"))
  prostate_gbm.h2o <- h2o.gbm(x = 3:9, y = "CAPSULE", training_frame = prostate.hex, nfolds = 5, distribution = "bernoulli", ntrees = ntrees, stopping_metric="MSE", stopping_tolerance=0.01, stopping_rounds=5)
  check_ntrees_actual("GBM Model", prostate_gbm.h2o)

  # Train H2O Isolation Forest Model:
  Log.info(paste0("H2O Isolation Forest with parameters:\nsample_rate = 0.1, max_depth = 20, ntrees = ", ntrees, ", stopping_metric=\"AUTO\", stopping_tolerance=0.01, stopping_rounds=5\n"))
  prostate_if.h2o <- h2o.isolationForest(sample_rate = 0.1, max_depth = 20, training_frame = prostate.hex, ntrees=ntrees, stopping_metric="AUTO", stopping_tolerance=0.01, stopping_rounds=5)
  check_ntrees_actual("Isolation Forest Model", prostate_if.h2o)

  # Train H2O Random Forest Model:
  Log.info(paste0("H2O Random Forest with parameters:\nx = 1:4, y = 5,max_depth=20, min_rows=10, ntrees = ", ntrees, ", stopping_metric=\"AUTO\", stopping_tolerance=0.01, stopping_rounds=5\n"))
  prostate_rf.h2o <- h2o.randomForest(x = 1:4, y = 5, ntrees=ntrees, max_depth=20, min_rows=10, training_frame = prostate.hex, stopping_metric="AUTO", stopping_tolerance=0.01, stopping_rounds=5)
  check_ntrees_actual("Random Forest Model", prostate_rf.h2o)

  # Train H2O XGBoost Model:
  # The logged parameters mirror the actual call below (deviance metric, a single stopping round).
  Log.info(paste0("H2O XGBoost with parameters:\nx = 1:4, y = 5,distribution=\"AUTO\", seed=1, ntrees = ", ntrees, ", stopping_metric=\"deviance\", stopping_tolerance=0.01, stopping_rounds=1\n"))
  prostate_xgb.h2o <- h2o.xgboost(x = 1:4, y = 5, distribution="AUTO",training_frame = prostate.hex, ntrees=ntrees, seed=1, stopping_metric="deviance", stopping_tolerance=0.01, stopping_rounds=1)
  check_ntrees_actual("XGBoost Model", prostate_xgb.h2o)

}

# Run the test function under the given description via doTest
# (presumably provided by the sourced h2o-r-test-setup.R -- verify).
doTest("GBM Test: provide actual ntree value", test.tree_algos.bernoulli)

0 comments on commit 4a3f679

Please sign in to comment.