From 100405ff2f2ad0098bda56c0253e3a446bef597b Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sun, 4 Feb 2024 17:51:09 +0100 Subject: [PATCH 1/7] make xgb.cv work for survival and ranking --- R-package/R/utils.R | 54 ++++++++++---- R-package/R/xgb.DMatrix.R | 31 ++++++--- R-package/R/xgb.cv.R | 85 ++++++++++++++--------- R-package/man/xgb.cv.Rd | 53 +++++++++----- R-package/man/xgb.slice.DMatrix.Rd | 10 ++- R-package/src/init.c | 4 +- R-package/src/xgboost_R.cc | 4 +- R-package/src/xgboost_R.h | 3 +- R-package/tests/testthat/test_basic.R | 59 +++++++++++++++- R-package/tests/testthat/test_callbacks.R | 4 +- R-package/tests/testthat/test_dmatrix.R | 40 ++++++++++- 11 files changed, 261 insertions(+), 86 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index e8ae787fc722..4e5163744b39 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -235,33 +235,34 @@ convert.labels <- function(labels, objective_name) { } # Generates random (stratified if needed) CV folds -generate.cv.folds <- function(nfold, nrows, stratified, label, params) { +generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) { + if (NROW(group)) { + return(generate.group.folds(nfold, group)) + } + objective <- params$objective + if (!is.character(objective)) { + stratified <- FALSE + } + # cannot stratify if label is NULL + if (stratified && is.null(label)) { + warning("Will use unstratified splitting (no 'labels' available)") + stratified <- FALSE + } # cannot do it for rank - objective <- params$objective if (is.character(objective) && strtrim(objective, 5) == 'rank:') { - stop("\n\tAutomatic generation of CV-folds is not implemented for ranking!\n", + stop("\n\tAutomatic generation of CV-folds is not implemented for ranking without 'group' field!\n", "\tConsider providing pre-computed CV-folds through the 'folds=' parameter.\n") } # shuffle rnd_idx <- sample.int(nrows) - if (stratified && - length(label) == length(rnd_idx)) { + if (stratified && length(label) == length(rnd_idx)) { y <- label[rnd_idx] - # WARNING: some heuristic logic is employed to identify classification setting! # - For classification, need to convert y labels to factor before making the folds, # and then do stratification by factor levels. # - For regression, leave y numeric and do stratification by quantiles. if (is.character(objective)) { - y <- convert.labels(y, params$objective) - } else { - # If no 'objective' given in params, it means that user either wants to - # use the default 'reg:squarederror' objective or has provided a custom - # obj function. 
Here, assume classification setting when y has 5 or less - # unique values: - if (length(unique(y)) <= 5) { - y <- factor(y) - } + y <- convert.labels(y, objective) } folds <- xgb.createFolds(y = y, k = nfold) } else { @@ -277,6 +278,29 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, params) { return(folds) } +generate.group.folds <- function(nfold, group) { + ngroups <- length(group) - 1 + if (ngroups < nfold) { + stop("DMatrix has fewer groups than folds.") + } + seq_groups <- seq_len(ngroups) + indices <- lapply(seq_groups, function(gr) seq(group[gr] + 1, group[gr + 1])) + assignments <- base::split(seq_groups, as.integer(seq_groups %% nfold)) + assignments <- unname(assignments) + + out <- vector("list", nfold) + randomized_groups <- sample(ngroups) + for (idx in seq_len(nfold)) { + groups_idx_test <- randomized_groups[assignments[[idx]]] + groups_test <- indices[groups_idx_test] + idx_test <- unlist(groups_test) + attributes(idx_test)$group_test <- sapply(groups_test, length) + attributes(idx_test)$group_train <- sapply(indices[-groups_idx_test], length) + out[[idx]] <- idx_test + } + return(out) +} + # Creates CV folds stratified by the values of y. # It was borrowed from caret::createFolds and simplified # by always returning an unnamed list of fold indices. diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index ba0686cf9b04..5f6bc67ab74e 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -1227,8 +1227,11 @@ xgb.get.DMatrix.data <- function(dmat) { #' Get a new DMatrix containing the specified rows of #' original xgb.DMatrix object #' -#' @param object Object of class "xgb.DMatrix" -#' @param idxset a integer vector of indices of rows needed +#' @param object Object of class "xgb.DMatrix". +#' @param idxset An integer vector of indices of rows needed (base-1 indexing). +#' @param allow_groups Whether to allow slicing an `xgb.DMatrix` with `group` (or +#' equivalently `qid`) field. Note that in such case, the result will not have +#' the groups anymore - they need to be set manually through `setinfo`. #' @param colset currently not used (columns subsetting is not available) #' #' @examples @@ -1243,11 +1246,11 @@ xgb.get.DMatrix.data <- function(dmat) { #' #' @rdname xgb.slice.DMatrix #' @export -xgb.slice.DMatrix <- function(object, idxset) { +xgb.slice.DMatrix <- function(object, idxset, allow_groups = FALSE) { if (!inherits(object, "xgb.DMatrix")) { stop("object must be xgb.DMatrix") } - ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset) + ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset, allow_groups) attr_list <- attributes(object) nr <- nrow(object) @@ -1264,7 +1267,15 @@ xgb.slice.DMatrix <- function(object, idxset) { } } } - return(structure(ret, class = "xgb.DMatrix")) + + out <- structure(ret, class = "xgb.DMatrix") + parent_fields <- as.list(attributes(object)$fields) + if (NROW(parent_fields)) { + child_fields <- parent_fields[!(names(parent_fields) %in% c("group", "qid"))] + child_fields <- as.environment(child_fields) + attributes(out)$fields <- child_fields + } + return(out) } #' @rdname xgb.slice.DMatrix @@ -1308,11 +1319,11 @@ print.xgb.DMatrix <- function(x, verbose = FALSE, ...) 
{ } cat(class_print, ' dim:', nrow(x), 'x', ncol(x), ' info: ') - infos <- character(0) - if (xgb.DMatrix.hasinfo(x, 'label')) infos <- 'label' - if (xgb.DMatrix.hasinfo(x, 'weight')) infos <- c(infos, 'weight') - if (xgb.DMatrix.hasinfo(x, 'base_margin')) infos <- c(infos, 'base_margin') - if (length(infos) == 0) infos <- 'NA' + infos <- names(attributes(x)$fields) + infos <- infos[infos != "feature_name"] + if (!NROW(infos)) infos <- "NA" + infos <- infos[order(infos)] + infos <- paste(infos, collapse = ", ") cat(infos) cnames <- colnames(x) cat(' colnames:') diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 29bddb57f3e2..2e23e45e2ae7 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -1,6 +1,6 @@ #' Cross Validation #' -#' The cross validation function of xgboost +#' The cross validation function of xgboost. #' #' @param params the list of parameters. The complete list of parameters is #' available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below @@ -19,13 +19,17 @@ #' #' See \code{\link{xgb.train}} for further details. #' See also demo/ for walkthrough example in R. -#' @param data takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input. +#' +#' Note that, while `params` accepts a `seed` entry and will use such parameter for model training if +#' supplied, this seed is not used for creation of train-test splits, which instead rely on R's own RNG +#' system - thus, for reproducible results, one needs to call the `set.seed` function beforehand. +#' @param data An `xgb.DMatrix` object, with corresponding fields like `label` or bounds as required +#' for model training by the objective. +#' +#' Note that only the basic `xgb.DMatrix` class is supported - variants such as `xgb.QuantileDMatrix` +#' or `xgb.ExternalDMatrix` are not supported here. #' @param nrounds the max number of iterations #' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples. -#' @param label vector of response values. Should be provided only when data is an R-matrix. -#' @param missing is only used when input is a dense matrix. By default is set to NA, which means -#' that NA values should be considered as 'missing' by the algorithm. -#' Sometimes, 0 or other extreme value might be used to represent missing values. #' @param prediction A logical value indicating whether to return the test fold predictions #' from each CV model. This parameter engages the \code{\link{cb.cv.predict}} callback. #' @param showsd \code{boolean}, whether to show standard deviation of cross validation @@ -47,13 +51,30 @@ #' @param feval customized evaluation function. Returns #' \code{list(metric='metric-name', value='metric-value')} with given #' prediction and dtrain. -#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified -#' by the values of outcome labels. +#' @param stratified A \code{boolean} indicating whether sampling of folds should be stratified +#' by the values of outcome labels. For real-valued labels in regression objectives, +#' stratification will be done by discretizing the labels into up to 5 buckets beforehand. +#' +#' If passing "auto", will be set to `TRUE` if the objective in `params` is a classification +#' objective (from XGBoost's built-in objectives, doesn't apply to custom ones), and to +#' `FALSE` otherwise. 
+#' +#' This parameter is ignored when `data` has a `group` field - in such case, the splitting +#' will be based on whole groups (note that this might make the folds have different sizes). +#' +#' Value `TRUE` here is \bold{not} supported for custom objectives. #' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds #' (each element must be a vector of test fold's indices). When folds are supplied, #' the \code{nfold} and \code{stratified} parameters are ignored. +#' +#' If `data` has a `group` field and the objective requires this field, each fold (list element) +#' must additionally have two attributes (retrievable through \link{attributes}) named `group_test` +#' and `group_train`, which should hold the `group` to assign through \link{setinfo.xgb.DMatrix} to +#' the resulting DMatrices. #' @param train_folds \code{list} list specifying which indicies to use for training. If \code{NULL} #' (the default) all indices not specified in \code{folds} will be used for training. +#' +#' This is not supported when `data` has `group` field. #' @param verbose \code{boolean}, print the statistics during the process #' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}. #' Default is 1 which means all messages are printed. This parameter is passed to the @@ -118,13 +139,14 @@ #' print(cv, verbose=TRUE) #' #' @export -xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing = NA, +xgb.cv <- function(params = list(), data, nrounds, nfold, prediction = FALSE, showsd = TRUE, metrics = list(), - obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, train_folds = NULL, + obj = NULL, feval = NULL, stratified = "auto", folds = NULL, train_folds = NULL, verbose = TRUE, print_every_n = 1L, early_stopping_rounds = NULL, maximize = NULL, callbacks = list(), ...) { check.deprecation(...) + stopifnot(inherits(data, "xgb.DMatrix")) if (inherits(data, "xgb.DMatrix") && .Call(XGCheckNullPtr_R, data)) { stop("'data' is an invalid 'xgb.DMatrix' object. 
Must be constructed again.") } @@ -137,16 +159,19 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing check.custom.obj() check.custom.eval() - # Check the labels - if ((inherits(data, 'xgb.DMatrix') && !xgb.DMatrix.hasinfo(data, 'label')) || - (!inherits(data, 'xgb.DMatrix') && is.null(label))) { - stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix") - } else if (inherits(data, 'xgb.DMatrix')) { - if (!is.null(label)) - warning("xgb.cv: label will be ignored, since data is of type xgb.DMatrix") - cv_label <- getinfo(data, 'label') - } else { - cv_label <- label + if (stratified == "auto") { + if (is.character(params$objective)) { + stratified <- params$objective %in% .CLASSIFICATION_OBJECTIVES() + } else { + stratified <- FALSE + } + } + + # Check the labels and groups + cv_label <- getinfo(data, "label") + cv_group <- getinfo(data, "group") + if (!is.null(train_folds) && NROW(cv_group)) { + stop("'train_folds' is not supported for DMatrix object with 'group' field.") } # CV folds @@ -157,7 +182,7 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing } else { if (nfold <= 1) stop("'nfold' must be > 1") - folds <- generate.cv.folds(nfold, nrow(data), stratified, cv_label, params) + folds <- generate.cv.folds(nfold, nrow(data), stratified, cv_label, cv_group, params) } # verbosity & evaluation printing callback: @@ -189,20 +214,18 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing # create the booster-folds # train_folds - dall <- xgb.get.DMatrix( - data = data, - label = label, - missing = missing, - weight = NULL, - nthread = params$nthread - ) + dall <- data bst_folds <- lapply(seq_along(folds), function(k) { - dtest <- xgb.slice.DMatrix(dall, folds[[k]]) + dtest <- xgb.slice.DMatrix(dall, folds[[k]], allow_groups = TRUE) # code originally contributed by @RolandASc on stackoverflow if (is.null(train_folds)) - dtrain <- xgb.slice.DMatrix(dall, unlist(folds[-k])) + dtrain <- xgb.slice.DMatrix(dall, unlist(folds[-k]), allow_groups = TRUE) else - dtrain <- xgb.slice.DMatrix(dall, train_folds[[k]]) + dtrain <- xgb.slice.DMatrix(dall, train_folds[[k]], allow_groups = TRUE) + if (!is.null(attributes(folds[[k]])$group_test)) { + setinfo(dtest, "group", attributes(folds[[k]])$group_test) + setinfo(dtrain, "group", attributes(folds[[k]])$group_train) + } bst <- xgb.Booster( params = params, cachelist = list(dtrain, dtest), diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 9f6103a52762..7fdd197a4498 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -9,14 +9,12 @@ xgb.cv( data, nrounds, nfold, - label = NULL, - missing = NA, prediction = FALSE, showsd = TRUE, metrics = list(), obj = NULL, feval = NULL, - stratified = TRUE, + stratified = "auto", folds = NULL, train_folds = NULL, verbose = TRUE, @@ -44,19 +42,22 @@ is a shorter summary: } See \code{\link{xgb.train}} for further details. -See also demo/ for walkthrough example in R.} +See also demo/ for walkthrough example in R. 
-\item{data}{takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.} +Note that, while \code{params} accepts a \code{seed} entry and will use such parameter for model training if +supplied, this seed is not used for creation of train-test splits, which instead rely on R's own RNG +system - thus, for reproducible results, one needs to call the \code{set.seed} function beforehand.} -\item{nrounds}{the max number of iterations} +\item{data}{An \code{xgb.DMatrix} object, with corresponding fields like \code{label} or bounds as required +for model training by the objective. -\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.} +\if{html}{\out{
}}\preformatted{ Note that only the basic `xgb.DMatrix` class is supported - variants such as `xgb.QuantileDMatrix` + or `xgb.ExternalDMatrix` are not supported here. +}\if{html}{\out{
}}} -\item{label}{vector of response values. Should be provided only when data is an R-matrix.} +\item{nrounds}{the max number of iterations} -\item{missing}{is only used when input is a dense matrix. By default is set to NA, which means -that NA values should be considered as 'missing' by the algorithm. -Sometimes, 0 or other extreme value might be used to represent missing values.} +\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.} \item{prediction}{A logical value indicating whether to return the test fold predictions from each CV model. This parameter engages the \code{\link{cb.cv.predict}} callback.} @@ -84,15 +85,35 @@ gradient with given prediction and dtrain.} \code{list(metric='metric-name', value='metric-value')} with given prediction and dtrain.} -\item{stratified}{a \code{boolean} indicating whether sampling of folds should be stratified -by the values of outcome labels.} +\item{stratified}{A \code{boolean} indicating whether sampling of folds should be stratified +by the values of outcome labels. For real-valued labels in regression objectives, +stratification will be done by discretizing the labels into up to 5 buckets beforehand. + +\if{html}{\out{
}}\preformatted{ If passing "auto", will be set to `TRUE` if the objective in `params` is a classification + objective (from XGBoost's built-in objectives, doesn't apply to custom ones), and to + `FALSE` otherwise. + + This parameter is ignored when `data` has a `group` field - in such case, the splitting + will be based on whole groups (note that this might make the folds have different sizes). + + Value `TRUE` here is \\bold\{not\} supported for custom objectives. +}\if{html}{\out{
}}} \item{folds}{\code{list} provides a possibility to use a list of pre-defined CV folds (each element must be a vector of test fold's indices). When folds are supplied, -the \code{nfold} and \code{stratified} parameters are ignored.} +the \code{nfold} and \code{stratified} parameters are ignored. + +\if{html}{\out{
}}\preformatted{ If `data` has a `group` field and the objective requires this field, each fold (list element) + must additionally have two attributes (retrievable through \link{attributes}) named `group_test` + and `group_train`, which should hold the `group` to assign through \link{setinfo.xgb.DMatrix} to + the resulting DMatrices. +}\if{html}{\out{
}}} \item{train_folds}{\code{list} list specifying which indicies to use for training. If \code{NULL} -(the default) all indices not specified in \code{folds} will be used for training.} +(the default) all indices not specified in \code{folds} will be used for training. + +\if{html}{\out{
}}\preformatted{ This is not supported when `data` has `group` field. +}\if{html}{\out{
}}} \item{verbose}{\code{boolean}, print the statistics during the process} @@ -142,7 +163,7 @@ setting of the \code{cb.cv.predict(save_models = TRUE)} callback. } } \description{ -The cross validation function of xgboost +The cross validation function of xgboost. } \details{ The original sample is randomly partitioned into \code{nfold} equal size subsamples. diff --git a/R-package/man/xgb.slice.DMatrix.Rd b/R-package/man/xgb.slice.DMatrix.Rd index c9695996b66f..c4f7765943bb 100644 --- a/R-package/man/xgb.slice.DMatrix.Rd +++ b/R-package/man/xgb.slice.DMatrix.Rd @@ -6,14 +6,18 @@ \title{Get a new DMatrix containing the specified rows of original xgb.DMatrix object} \usage{ -xgb.slice.DMatrix(object, idxset) +xgb.slice.DMatrix(object, idxset, allow_groups = FALSE) \method{[}{xgb.DMatrix}(object, idxset, colset = NULL) } \arguments{ -\item{object}{Object of class "xgb.DMatrix"} +\item{object}{Object of class "xgb.DMatrix".} -\item{idxset}{a integer vector of indices of rows needed} +\item{idxset}{An integer vector of indices of rows needed (base-1 indexing).} + +\item{allow_groups}{Whether to allow slicing an \code{xgb.DMatrix} with \code{group} (or +equivalently \code{qid}) field. Note that in such case, the result will not have +the groups anymore - they need to be set manually through \code{setinfo}.} \item{colset}{currently not used (columns subsetting is not available)} } diff --git a/R-package/src/init.c b/R-package/src/init.c index a9f3f3e380c2..4453cecc5c90 100644 --- a/R-package/src/init.c +++ b/R-package/src/init.c @@ -68,7 +68,7 @@ extern SEXP XGDMatrixGetDataAsCSR_R(SEXP); extern SEXP XGDMatrixSaveBinary_R(SEXP, SEXP, SEXP); extern SEXP XGDMatrixSetInfo_R(SEXP, SEXP, SEXP); extern SEXP XGDMatrixSetStrFeatureInfo_R(SEXP, SEXP, SEXP); -extern SEXP XGDMatrixSliceDMatrix_R(SEXP, SEXP); +extern SEXP XGDMatrixSliceDMatrix_R(SEXP, SEXP, SEXP); extern SEXP XGBSetGlobalConfig_R(SEXP); extern SEXP XGBGetGlobalConfig_R(void); extern SEXP XGBoosterFeatureScore_R(SEXP, SEXP); @@ -127,7 +127,7 @@ static const R_CallMethodDef CallEntries[] = { {"XGDMatrixSaveBinary_R", (DL_FUNC) &XGDMatrixSaveBinary_R, 3}, {"XGDMatrixSetInfo_R", (DL_FUNC) &XGDMatrixSetInfo_R, 3}, {"XGDMatrixSetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixSetStrFeatureInfo_R, 3}, - {"XGDMatrixSliceDMatrix_R", (DL_FUNC) &XGDMatrixSliceDMatrix_R, 2}, + {"XGDMatrixSliceDMatrix_R", (DL_FUNC) &XGDMatrixSliceDMatrix_R, 3}, {"XGBSetGlobalConfig_R", (DL_FUNC) &XGBSetGlobalConfig_R, 1}, {"XGBGetGlobalConfig_R", (DL_FUNC) &XGBGetGlobalConfig_R, 0}, {"XGBoosterFeatureScore_R", (DL_FUNC) &XGBoosterFeatureScore_R, 2}, diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index c91fb94c447c..353556e49838 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -505,7 +505,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP return ret; } -XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) { +XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset, SEXP allow_groups) { SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); R_API_BEGIN(); R_xlen_t len = Rf_xlength(idxset); @@ -524,7 +524,7 @@ XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) { res_code = XGDMatrixSliceDMatrixEx(R_ExternalPtrAddr(handle), BeginPtr(idxvec), len, &res, - 0); + Rf_asLogical(allow_groups)); } CHECK_CALL(res_code); R_SetExternalPtrAddr(ret, res); diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index d2e0ae82855d..582a87afd43a 100644 --- 
a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -111,9 +111,10 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP * \brief create a new dmatrix from sliced content of existing matrix * \param handle instance of data matrix to be sliced * \param idxset index set + * \param allow_groups Whether to allow slicing the DMatrix if it has a 'group' field * \return a sliced new matrix */ -XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset); +XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset, SEXP allow_groups); /*! * \brief load a data matrix into binary file diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index fb3162e423ce..9236e0245262 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -334,7 +334,7 @@ test_that("xgb.cv works", { set.seed(11) expect_output( cv <- xgb.cv( - data = train$data, label = train$label, max_depth = 2, nfold = 5, + data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, nfold = 5, eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic", eval_metric = "error", verbose = TRUE ), @@ -358,13 +358,13 @@ test_that("xgb.cv works with stratified folds", { cv <- xgb.cv( data = dtrain, max_depth = 2, nfold = 5, eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic", - verbose = TRUE, stratified = FALSE + verbose = FALSE, stratified = FALSE ) set.seed(314159) cv2 <- xgb.cv( data = dtrain, max_depth = 2, nfold = 5, eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic", - verbose = TRUE, stratified = TRUE + verbose = FALSE, stratified = TRUE ) # Stratified folds should result in a different evaluation logs expect_true(all(cv$evaluation_log[, test_logloss_mean] != cv2$evaluation_log[, test_logloss_mean])) @@ -841,3 +841,56 @@ test_that("Seed in params override PRNG from R", { ) ) }) + +test_that("xgb.cv works for AFT", { + X <- matrix(c(1, -1, -1, 1, 0, 1, 1, 0), nrow = 4, byrow = TRUE) # 4x2 matrix + dtrain <- xgb.DMatrix(X, nthread = n_threads) + + params <- list(objective = 'survival:aft', learning_rate = 0.2, max_depth = 2L) + + # data must have bounds + expect_error( + xgb.cv( + params = params, + data = dtrain, + nround = 5L, + nfold = 4L, + nthread = n_threads + ) + ) + + setinfo(dtrain, 'label_lower_bound', c(2, 3, 0, 4)) + setinfo(dtrain, 'label_upper_bound', c(2, Inf, 4, 5)) + + # automatic stratified splitting is turned off + expect_warning( + xgb.cv( + params = params, data = dtrain, nround = 5L, nfold = 4L, + nthread = n_threads, stratified = TRUE, verbose = FALSE + ) + ) + + # this works without any issue + expect_no_warning( + xgb.cv(params = params, data = dtrain, nround = 5L, nfold = 4L, verbose = FALSE) + ) +}) + +test_that("xgb.cv works for ranking", { + data(iris) + x <- iris[, -(4:5)] + y <- as.integer(iris$Petal.Width) + group <- rep(50, 3) + dm <- xgb.DMatrix(x, label = y, group = group) + res <- xgb.cv( + data = dm, + params = list( + objective = "rank:pairwise", + max_depth = 3 + ), + nrounds = 3, + nfold = 2, + verbose = FALSE + ) + expect_equal(length(res$folds), 2L) +}) diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R index c60d0c246f81..76ff66046515 100644 --- a/R-package/tests/testthat/test_callbacks.R +++ b/R-package/tests/testthat/test_callbacks.R @@ -321,7 +321,7 @@ test_that("prediction in early-stopping xgb.cv works", { expect_output( cv <- xgb.cv(param, dtrain, nfold = 5, 
eta = 0.1, nrounds = 20, early_stopping_rounds = 5, maximize = FALSE, stratified = FALSE, - prediction = TRUE, base_score = 0.5) + prediction = TRUE, base_score = 0.5, verbose = TRUE) , "Stopping. Best iteration") expect_false(is.null(cv$best_iteration)) @@ -341,7 +341,7 @@ test_that("prediction in xgb.cv for softprob works", { lb <- as.numeric(iris$Species) - 1 set.seed(11) expect_warning( - cv <- xgb.cv(data = as.matrix(iris[, -5]), label = lb, nfold = 4, + cv <- xgb.cv(data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb), nfold = 4, eta = 0.5, nrounds = 5, max_depth = 3, nthread = n_threads, subsample = 0.8, gamma = 2, verbose = 0, prediction = TRUE, objective = "multi:softprob", num_class = 3) diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 50621f241c3e..f2df9a5778c1 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -243,7 +243,7 @@ test_that("xgb.DMatrix: print", { txt <- capture.output({ print(dtrain) }) - expect_equal(txt, "xgb.DMatrix dim: 6513 x 126 info: label weight base_margin colnames: yes") + expect_equal(txt, "xgb.DMatrix dim: 6513 x 126 info: base_margin, label, weight colnames: yes") # DMatrix with just features dtrain <- xgb.DMatrix( @@ -692,3 +692,41 @@ test_that("xgb.DMatrix: quantile cuts look correct", { } ) }) + +test_that("xgb.DMatrix: slicing keeps field indicators", { + data(mtcars) + x <- as.matrix(mtcars[, -1]) + y <- mtcars[, 1] + dm <- xgb.DMatrix( + data = x, + label_lower_bound = -y, + label_upper_bound = y, + nthread = 1 + ) + idx_take <- seq(1, 5) + dm_slice <- xgb.slice.DMatrix(dm, idx_take) + + expect_true(xgb.DMatrix.hasinfo(dm_slice, "label_lower_bound")) + expect_true(xgb.DMatrix.hasinfo(dm_slice, "label_upper_bound")) + expect_false(xgb.DMatrix.hasinfo(dm_slice, "label")) + + expect_equal(getinfo(dm_slice, "label_lower_bound"), -y[idx_take], tolerance = 1e-6) + expect_equal(getinfo(dm_slice, "label_upper_bound"), y[idx_take], tolerance = 1e-6) +}) + +test_that("xgb.DMatrix: can slice with groups", { + data(iris) + x <- as.matrix(iris[, -5]) + set.seed(123) + y <- sample(3, size = nrow(x), replace = TRUE) + group <- c(50, 50, 50) + dm <- xgb.DMatrix(x, label = y, group = group, nthread = 1) + idx_take <- seq(1, 50) + dm_slice <- xgb.slice.DMatrix(dm, idx_take, allow_groups = TRUE) + + expect_true(xgb.DMatrix.hasinfo(dm_slice, "label")) + expect_false(xgb.DMatrix.hasinfo(dm_slice, "group")) + expect_false(xgb.DMatrix.hasinfo(dm_slice, "qid")) + expect_null(getinfo(dm_slice, "group")) + expect_equal(getinfo(dm_slice, "label"), y[idx_take], tolerance = 1e-6) +}) From c94d018149eefb6dc0d3788fa4ea9ab42247829e Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sun, 4 Feb 2024 21:21:29 +0100 Subject: [PATCH 2/7] linter --- R-package/R/utils.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 4e5163744b39..96a4e5949e3b 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -294,8 +294,8 @@ generate.group.folds <- function(nfold, group) { groups_idx_test <- randomized_groups[assignments[[idx]]] groups_test <- indices[groups_idx_test] idx_test <- unlist(groups_test) - attributes(idx_test)$group_test <- sapply(groups_test, length) - attributes(idx_test)$group_train <- sapply(indices[-groups_idx_test], length) + attributes(idx_test)$group_test <- lengths(groups_test) + attributes(idx_test)$group_train <- lengths(indices[-groups_idx_test]) out[[idx]] <- idx_test } return(out) From 
82eb309b2666103bb107294d7f887b78176b38bf Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sun, 4 Feb 2024 21:26:17 +0100 Subject: [PATCH 3/7] fix example --- R-package/R/xgb.cv.R | 4 ++-- R-package/man/print.xgb.cv.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 2e23e45e2ae7..3e59c3a75298 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -308,8 +308,8 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, #' @examples #' data(agaricus.train, package='xgboost') #' train <- agaricus.train -#' cv <- xgb.cv(data = train$data, label = train$label, nfold = 5, max_depth = 2, -#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") +#' cv <- xgb.cv(data = xgb.DMatrix(train$data, label = train$label), nfold = 5, max_depth = 2, +#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") #' print(cv) #' print(cv, verbose=TRUE) #' diff --git a/R-package/man/print.xgb.cv.Rd b/R-package/man/print.xgb.cv.Rd index 05ad61eed8ac..74fc15d01fb9 100644 --- a/R-package/man/print.xgb.cv.Rd +++ b/R-package/man/print.xgb.cv.Rd @@ -23,8 +23,8 @@ including the best iteration (when available). \examples{ data(agaricus.train, package='xgboost') train <- agaricus.train -cv <- xgb.cv(data = train$data, label = train$label, nfold = 5, max_depth = 2, - eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") +cv <- xgb.cv(data = xgb.DMatrix(train$data, label = train$label), nfold = 5, max_depth = 2, + eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") print(cv) print(cv, verbose=TRUE) From 3639ef133be5adbe954ffef3728ce9d3bbf3e050 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 26 Mar 2024 07:17:14 +0100 Subject: [PATCH 4/7] Update R-package/R/utils.R Co-authored-by: Philip Hyunsu Cho --- R-package/R/utils.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 96a4e5949e3b..9867d15054d5 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -241,6 +241,7 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) { } objective <- params$objective if (!is.character(objective)) { + warning("Will use unstratified splitting (custom objective used)") stratified <- FALSE } # cannot stratify if label is NULL From f083ff732e2c2ff310b2f74107e92476d6b44987 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 26 Mar 2024 07:20:25 +0100 Subject: [PATCH 5/7] error out on stratified+group CV sampling --- R-package/R/utils.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 9867d15054d5..5af7683e98c5 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -237,6 +237,9 @@ convert.labels <- function(labels, objective_name) { # Generates random (stratified if needed) CV folds generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) { if (NROW(group)) { + if (stratified) { + stop("stratified sampling is not supported when using 'group' attribute.") + } return(generate.group.folds(nfold, group)) } objective <- params$objective From ba1452545d8651c72f0cd934b2b70575cd434d47 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 26 Mar 2024 17:49:09 +0100 Subject: [PATCH 6/7] change error to warning --- R-package/R/utils.R | 7 ++++++- R-package/tests/testthat/test_basic.R | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index e80a86a6ad44..0fcb3286d178 100644 --- a/R-package/R/utils.R 
+++ b/R-package/R/utils.R
@@ -238,7 +238,12 @@ convert.labels <- function(labels, objective_name) {
 generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) {
   if (NROW(group)) {
     if (stratified) {
-      stop("stratified sampling is not supported when using 'group' attribute.")
+      warning(
+        paste0(
+          "Stratified splitting is not supported when using 'group' attribute.",
+          " Will use unstratified splitting."
+        )
+      )
     }
     return(generate.group.folds(nfold, group))
   }
diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index f1a999ff389c..bbb8fb323478 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -934,7 +934,8 @@ test_that("xgb.cv works for ranking", {
     ),
     nrounds = 3,
     nfold = 2,
-    verbose = FALSE
+    verbose = FALSE,
+    stratified = FALSE
   )
   expect_equal(length(res$folds), 2L)
 })

From c06b969175db20ffdc419bc2e7556828be6cc583 Mon Sep 17 00:00:00 2001
From: david-cortes
Date: Tue, 26 Mar 2024 17:51:34 +0100
Subject: [PATCH 7/7] don't default to stratified sampling for ranking objectives

---
 R-package/R/utils.R  | 5 +++++
 R-package/R/xgb.cv.R | 5 ++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index 0fcb3286d178..7b6a20f704dd 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -26,6 +26,11 @@ NVL <- function(x, val) {
           'multi:softprob', 'rank:pairwise', 'rank:ndcg', 'rank:map'))
 }
 
+.RANKING_OBJECTIVES <- function() {
+  return(c('rank:pairwise', 'rank:ndcg',
+           'rank:map'))
+}
+
 #
 # Low-level functions for boosting --------------------------------------------
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index 927aa5ebbaa6..880fd56974bc 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -161,7 +161,10 @@ xgb.cv <- function(params = list(), data, nrounds, nfold,
 
   if (stratified == "auto") {
     if (is.character(params$objective)) {
-      stratified <- params$objective %in% .CLASSIFICATION_OBJECTIVES()
+      stratified <- (
+        (params$objective %in% .CLASSIFICATION_OBJECTIVES())
+        && !(params$objective %in% .RANKING_OBJECTIVES())
+      )
     } else {
       stratified <- FALSE
     }
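
Usage note for reviewers (not part of the patches above): the short R sketch below shows how the group-aware CV added by this series might be exercised end to end. It mirrors the new "xgb.cv works for ranking" test and the updated documentation; the dataset, parameter values, and the set.seed() call are illustrative choices rather than anything prescribed by the patches.

library(xgboost)

# Toy ranking data: 150 rows split into 3 query groups of 50 rows each,
# as in the new test in test_basic.R.
data(iris)
x <- as.matrix(iris[, -5])
y <- as.integer(iris$Petal.Width)
dm <- xgb.DMatrix(x, label = y, group = rep(50, 3))

# Folds are drawn with R's own RNG (not params$seed), so seed R for reproducible splits.
set.seed(123)
res <- xgb.cv(
  data = dm,
  params = list(objective = "rank:pairwise", max_depth = 3),
  nrounds = 3,
  nfold = 2,
  stratified = FALSE,  # stratification is not supported together with a 'group' field
  verbose = FALSE
)
length(res$folds)  # one entry per fold; whole query groups are kept together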
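
Similarly, the new `folds` documentation requires user-supplied folds to carry `group_test` and `group_train` attributes whenever the data has a `group` field, since the sliced DMatrices lose that field and xgb.cv re-assigns it via setinfo. A minimal sketch of what such a list could look like for the same toy DMatrix; the split into one held-out group versus two is an arbitrary choice for illustration.

# Fold 1 holds out query group 1; fold 2 holds out query groups 2 and 3.
fold1 <- 1:50
attributes(fold1)$group_test <- 50          # group sizes of the held-out rows
attributes(fold1)$group_train <- c(50, 50)  # group sizes of the remaining rows
fold2 <- 51:150
attributes(fold2)$group_test <- c(50, 50)
attributes(fold2)$group_train <- 50

res_manual <- xgb.cv(
  data = dm,
  params = list(objective = "rank:ndcg"),
  nrounds = 3,
  nfold = 2,  # ignored in favour of 'folds', supplied here only for clarity
  folds = list(fold1, fold2),
  verbose = FALSE
)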
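
Finally, for the survival case, a sketch along the lines of the new AFT test: the censoring bounds live on the DMatrix as `label_lower_bound` / `label_upper_bound` fields, and with the default stratified = "auto" no stratification (and no warning) is attempted. The matrix and bound values below are the toy values from that test.

X <- matrix(c(1, -1, -1, 1, 0, 1, 1, 0), nrow = 4, byrow = TRUE)
dtrain <- xgb.DMatrix(X, nthread = 1)
setinfo(dtrain, "label_lower_bound", c(2, 3, 0, 4))
setinfo(dtrain, "label_upper_bound", c(2, Inf, 4, 5))

res_aft <- xgb.cv(
  params = list(objective = "survival:aft", learning_rate = 0.2, max_depth = 2L),
  data = dtrain,
  nrounds = 5L,
  nfold = 4L,
  verbose = FALSE
)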