-
Notifications
You must be signed in to change notification settings - Fork 2k
/
gbm.R
executable file
·125 lines (115 loc) · 5.68 KB
/
gbm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#' Gradient Boosted Machines
#'
#' Builds gradient boosted classification trees, and gradient boosted regression trees on a parsed data set.
#'
#' @param x A vector containing the names or indices of the predictor variables to use in building the GBM model.
#' @param y The name or index of the response variable. If the data does not contain a header, this is the column index
#' number starting at 0, and increasing from left to right. (The response must be either an integer or a
#' categorical variable).
#' @param data An \code{\linkS4class{H2OFrame}} object containing the variables in the model.
#' @param key (Optional) The unique hex key assigned to the resulting model. If none is given, a key will automatically
#' be generated.
#' @param loss \code{Defaults to "AUTO"} A \code{character} string. The loss function to be implemented. Must be "AUTO"
#' or "Bernoulli"
#' @param ntrees \code{Defaults to 50} A nonnegative integer that determines the number of trees to grow.
#' @param max_depth \code{Defaults to 5} Maximum depth to grow the tree.
#' @param min_rows \code{Defaults to 10} Minimum number of rows to assign to terminal nodes.
#' @param learn_rate \code{Defaults to 0.1} A \code{numeric} value from \code{0.0} to \code{1.0}
#' @param nbins \code{Defaults to 20} Number of bins to use in building histogram.
#' @param group_split #TODO NEED TO FINISH
#' @param variable_importance #TODO: NEED TO FINISH
#' @param validation_frame An \code{\link{H2OFrame}} object indicating the validation dataset used to construct the
#' confusion matrix. If left blank, this defaults to the training data when \code{nfolds = 0}
#' @param balance_classes \code{Defaults to FALSE} logical, indicates whether or not to balance training data class
#' counts via over/under-sampling (for imbalanced data)
#' @param max_after_balance_size \code{Defaults to 1} Maximum relative size of the training data after balancing class counts (can be less
#' than 1.0)
#' @param seed Seed for random numbers (affects sampling) - Note: only reproducible when running single threaded
#' @param nfolds (Optional) Number of folds for cross-validation. If \code{nfolds >= 2}, then \code{validation} must remain empty.
#' @seealso \code{\link{predict.H2OGBMModel}} for prediction.
#' @examples
#' #TODO GBM wasn't working example needs to be redone, maybe
#' library(h2o)
#' localH2O = h2o.init()
#'
#' # Run regression GBM on australia.hex data
#' ausPath <- system.file("extdata", "australia.csv", package="h2o")
#' australia.hex <- h2o.uploadFile(localH2O, path = ausPath)
#' independent <- c("premax", "salmax","minairtemp", "maxairtemp", "maxsst",
#' "maxsoilmoist", "Max_czcs")
#' dependent <- "runoffnew"
#' h2o.gbm(y = dependent, x = independent, data = australia.hex, ntrees = 3,
#' max_depth = 3, min_rows = 2)
h2o.gbm <- function(x, y, training_frame, do_classification, ...,
                    #AUTOGENERATED params
                    destination_key,
                    loss = c("AUTO", "bernoulli", "multinomial", "gaussian"),
                    ntrees = 50,
                    max_depth = 5,
                    min_rows = 10,
                    learn_rate = 0.1,
                    nbins = 20,
                    group_split = TRUE,
                    variable_importance = FALSE,
                    validation_frame = FALSE,
                    balance_classes = FALSE,
                    max_after_balance_size = 1,
                    seed)
{
  # Scan `...` for an (optional) evaluation environment; any other extra
  # argument is rejected as unused.
  dots <- list(...)
  for (type in names(dots))
    if (is.environment(dots[[type]]))
    {
      # BUG FIX: the original stored the element's *name* (`type`, a character
      # string) in dots$envir; downstream .h2o.createModel expects an
      # environment (the fallback below is parent.frame()). Store the value.
      dots$envir <- dots[[type]]
    } else {
      stop(paste0("\n  unused argument (", type, " = ", dots[[type]], ")"))
    }
  if (is.null(dots$envir))
    dots$envir <- parent.frame()
  # Required args: x, y, training_frame
  if( missing(x) ) stop("`x` is missing, with no default")
  if( missing(y) ) stop("`y` is missing, with no default")
  if( missing(training_frame) ) stop("`training_frame` is missing, with no default")
  # Training_frame may be a key or an H2OFrame object
  if (!inherits(training_frame, "H2OFrame"))
    tryCatch(training_frame <- h2o.getFrame(training_frame),
             error = function(err) {
               stop("argument \"training_frame\" must be a valid H2OFrame or key")
             })
  # Map R-side argument names onto the REST API parameter names; assumed to
  # change when the REST API end is updated.
  .gbm.map <- c("x" = "ignored_columns",
                "y" = "response_column",
                "key" = "destination_key")
  # Capture only the arguments the caller actually supplied (defaults are
  # filled in server-side), then drop the already-processed `...`.
  parms <- as.list(match.call(expand.dots = FALSE)[-1L])
  parms$... <- NULL
  args <- .verify_dataxy(training_frame, x, y)
  parms$x <- args$x_ignore
  parms$y <- args$y
  if(!missing(max_after_balance_size) ) parms$max_after_balance_size <- max_after_balance_size #hard-code due to Inf bug
  # Rename any parameters that appear in the REST API map.
  names(parms) <- lapply(names(parms), function(i) { if( i %in% names(.gbm.map) ) i <- .gbm.map[[i]]; i })
  .h2o.createModel(training_frame@conn, 'gbm', parms, dots$envir )
}
# R-side cross-validation wrapper for h2o GBM models: captures the caller's
# arguments, strips `nfolds`, and hands everything to h2o.crossValidate.
h2o.gbm.cv <- function(x, y, training_frame, do_classification, nfolds = 2,
                       #AUTOGENERATED params
                       key,
                       loss = c("AUTO", "Bernoulli"),
                       ntrees = 50,
                       max_depth = 5,
                       min_rows = 10,
                       learn_rate = 0.1,
                       nbins = 20,
                       group_split,
                       variable_importance = FALSE,
                       balance_classes = FALSE,
                       max_after_balance_size = 1,
                       seed
                       # group_split
                       )
{
  # Evaluate each supplied argument in the caller's frame; match.call() must
  # stay directly inside this function so the original call is captured.
  calling_env <- parent.frame()
  supplied <- as.list(match.call()[-1L])
  model_parms <- lapply(supplied, eval, calling_env)
  # nfolds is consumed here, not forwarded as a model parameter.
  model_parms$nfolds <- NULL
  do.call("h2o.crossValidate",
          list(model.type = 'gbm', nfolds = nfolds,
               params = model_parms, envir = calling_env))
}