-
Notifications
You must be signed in to change notification settings - Fork 2k
/
gbm.R
executable file
·125 lines (115 loc) · 5.68 KB
/
gbm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#' Gradient Boosted Machines
#'
#' Builds gradient boosted classification trees, and gradient boosted regression trees on a parsed data set.
#'
#' @param x A vector containing the names or indices of the predictor variables to use in building the GBM model.
#' @param y The name or index of the response variable. If the data does not contain a header, this is the column index
#' number starting at 0, and increasing from left to right. (The response must be either an integer or a
#' categorical variable).
#' @param data An \code{\linkS4class{H2OFrame}} object containing the variables in the model.
#' @param key (Optional) The unique hex key assigned to the resulting model. If none is given, a key will automatically
#' be generated.
#' @param loss \code{Defaults to "AUTO"} A \code{character} string. The loss function to be implemented. Must be "AUTO"
#' or "Bernoulli"
#' @param ntrees \code{Defaults to 50} A nonnegative integer that determines the number of trees to grow.
#' @param max_depth \code{Defaults to 5} Maximum depth to grow the tree.
#' @param min_rows \code{Defaults to 10} Minimum number of rows to assign to terminal nodes.
#' @param learn_rate \code{Defaults to 0.1} A \code{numeric} value from \code{0.0} to \code{1.0}
#' @param nbins \code{Defaults to 20} Number of bins to use in building histogram.
#' @param group_split #TODO NEED TO FINISH
#' @param variable_importance #TODO: NEED TO FINISH
#' @param validation_frame An \code{\link{H2OFrame}} object indicating the validation dataset used to construct the
#' confusion matrix. If left blank, this defaults to the training data when \code{nfolds = 0}
#' @param balance_classes \code{Defaults to FALSE} logical, indicates whether or not to balance training data class
#' counts via over/under-sampling (for imbalanced data)
#' @param max_after_balance_size \code{Defaults to 1} Maximum relative size of the training data after balancing class counts (can be less
#' than 1.0)
#' @param seed Seed for random numbers (affects sampling) - Note: only reproducible when running single threaded
#' @param nfolds (Optional) Number of folds for cross-validation. If \code{nfolds >= 2}, then \code{validation} must remain empty.
#' @seealso \code{\link{predict.H2OGBMModel}} for prediction.
#' @examples
#' #TODO GBM wasn't working example needs to be redone, maybe
#' library(h2o)
#' localH2O = h2o.init()
#'
#' # Run regression GBM on australia.hex data
#' ausPath <- system.file("extdata", "australia.csv", package="h2o")
#' australia.hex <- h2o.uploadFile(localH2O, path = ausPath)
#' independent <- c("premax", "salmax","minairtemp", "maxairtemp", "maxsst",
#' "maxsoilmoist", "Max_czcs")
#' dependent <- "runoffnew"
#' h2o.gbm(y = dependent, x = independent, data = australia.hex, ntrees = 3,
#' max_depth = 3, min_rows = 2)
h2o.gbm <- function(x, y, training_frame, do_classification, ...,
                    #AUTOGENERATED params
                    destination_key,
                    loss = c("AUTO", "bernoulli", "multinomial", "gaussian"),
                    ntrees = 50,
                    max_depth = 5,
                    min_rows = 10,
                    learn_rate = 0.1,
                    nbins = 20,
                    group_split = TRUE,
                    variable_importance = FALSE,
                    validation_frame = FALSE,
                    balance_classes = FALSE,
                    max_after_balance_size = 1,
                    seed)
{
  # Scan `...` for an (optional) evaluation environment; any other extra
  # argument is rejected as unused.
  dots <- list(...)
  for (type in names(dots))
    if (is.environment(dots[[type]]))
    {
      # BUG FIX: the original stored the element's *name* (`type`, a character
      # string) in dots$envir; downstream .h2o.createModel expects an
      # environment (the fallback below is parent.frame()). Store the value.
      dots$envir <- dots[[type]]
    } else {
      stop(paste0("\n  unused argument (", type, " = ", dots[[type]], ")"))
    }
  if (is.null(dots$envir))
    dots$envir <- parent.frame()
  # Required args: x, y, training_frame
  if( missing(x) ) stop("`x` is missing, with no default")
  if( missing(y) ) stop("`y` is missing, with no default")
  if( missing(training_frame) ) stop("`training_frame` is missing, with no default")
  # Training_frame may be a key or an H2OFrame object
  if (!inherits(training_frame, "H2OFrame"))
    tryCatch(training_frame <- h2o.getFrame(training_frame),
             error = function(err) {
               stop("argument \"training_frame\" must be a valid H2OFrame or key")
             })
  # Map R-side argument names onto the REST API parameter names; assumed to
  # change when the REST API end is updated.
  .gbm.map <- c("x" = "ignored_columns",
                "y" = "response_column",
                "key" = "destination_key")
  # Capture only the arguments the caller actually supplied (defaults are
  # filled in server-side), then drop the already-processed `...`.
  parms <- as.list(match.call(expand.dots = FALSE)[-1L])
  parms$... <- NULL
  args <- .verify_dataxy(training_frame, x, y)
  parms$x <- args$x_ignore
  parms$y <- args$y
  if(!missing(max_after_balance_size) ) parms$max_after_balance_size <- max_after_balance_size #hard-code due to Inf bug
  # Rename any parameters that appear in the REST API map.
  names(parms) <- lapply(names(parms), function(i) { if( i %in% names(.gbm.map) ) i <- .gbm.map[[i]]; i })
  .h2o.createModel(training_frame@conn, 'gbm', parms, dots$envir )
}
# R-side cross-validation wrapper for h2o GBM models: captures the caller's
# arguments, strips `nfolds`, and hands everything to h2o.crossValidate.
h2o.gbm.cv <- function(x, y, training_frame, do_classification, nfolds = 2,
                       #AUTOGENERATED params
                       key,
                       loss = c("AUTO", "Bernoulli"),
                       ntrees = 50,
                       max_depth = 5,
                       min_rows = 10,
                       learn_rate = 0.1,
                       nbins = 20,
                       group_split,
                       variable_importance = FALSE,
                       balance_classes = FALSE,
                       max_after_balance_size = 1,
                       seed
                       # group_split
                       )
{
  # Evaluate each supplied argument in the caller's frame; match.call() must
  # stay directly inside this function so the original call is captured.
  calling_env <- parent.frame()
  supplied <- as.list(match.call()[-1L])
  model_parms <- lapply(supplied, eval, calling_env)
  # nfolds is consumed here, not forwarded as a model parameter.
  model_parms$nfolds <- NULL
  do.call("h2o.crossValidate",
          list(model.type = 'gbm', nfolds = nfolds,
               params = model_parms, envir = calling_env))
}