From 100405ff2f2ad0098bda56c0253e3a446bef597b Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sun, 4 Feb 2024 17:51:09 +0100 Subject: [PATCH 1/7] make xgb.cv work for survival and ranking --- R-package/R/utils.R | 54 ++++++++++---- R-package/R/xgb.DMatrix.R | 31 ++++++--- R-package/R/xgb.cv.R | 85 ++++++++++++++--------- R-package/man/xgb.cv.Rd | 53 +++++++++----- R-package/man/xgb.slice.DMatrix.Rd | 10 ++- R-package/src/init.c | 4 +- R-package/src/xgboost_R.cc | 4 +- R-package/src/xgboost_R.h | 3 +- R-package/tests/testthat/test_basic.R | 59 +++++++++++++++- R-package/tests/testthat/test_callbacks.R | 4 +- R-package/tests/testthat/test_dmatrix.R | 40 ++++++++++- 11 files changed, 261 insertions(+), 86 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index e8ae787fc722..4e5163744b39 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -235,33 +235,34 @@ convert.labels <- function(labels, objective_name) { } # Generates random (stratified if needed) CV folds -generate.cv.folds <- function(nfold, nrows, stratified, label, params) { +generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) { + if (NROW(group)) { + return(generate.group.folds(nfold, group)) + } + objective <- params$objective + if (!is.character(objective)) { + stratified <- FALSE + } + # cannot stratify if label is NULL + if (stratified && is.null(label)) { + warning("Will use unstratified splitting (no 'labels' available)") + stratified <- FALSE + } # cannot do it for rank - objective <- params$objective if (is.character(objective) && strtrim(objective, 5) == 'rank:') { - stop("\n\tAutomatic generation of CV-folds is not implemented for ranking!\n", + stop("\n\tAutomatic generation of CV-folds is not implemented for ranking without 'group' field!\n", "\tConsider providing pre-computed CV-folds through the 'folds=' parameter.\n") } # shuffle rnd_idx <- sample.int(nrows) - if (stratified && - length(label) == length(rnd_idx)) { + if (stratified && length(label) == length(rnd_idx)) { y <- label[rnd_idx] - # WARNING: some heuristic logic is employed to identify classification setting! # - For classification, need to convert y labels to factor before making the folds, # and then do stratification by factor levels. # - For regression, leave y numeric and do stratification by quantiles. if (is.character(objective)) { - y <- convert.labels(y, params$objective) - } else { - # If no 'objective' given in params, it means that user either wants to - # use the default 'reg:squarederror' objective or has provided a custom - # obj function. 
Here, assume classification setting when y has 5 or less - # unique values: - if (length(unique(y)) <= 5) { - y <- factor(y) - } + y <- convert.labels(y, objective) } folds <- xgb.createFolds(y = y, k = nfold) } else { @@ -277,6 +278,29 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, params) { return(folds) } +generate.group.folds <- function(nfold, group) { + ngroups <- length(group) - 1 + if (ngroups < nfold) { + stop("DMatrix has fewer groups than folds.") + } + seq_groups <- seq_len(ngroups) + indices <- lapply(seq_groups, function(gr) seq(group[gr] + 1, group[gr + 1])) + assignments <- base::split(seq_groups, as.integer(seq_groups %% nfold)) + assignments <- unname(assignments) + + out <- vector("list", nfold) + randomized_groups <- sample(ngroups) + for (idx in seq_len(nfold)) { + groups_idx_test <- randomized_groups[assignments[[idx]]] + groups_test <- indices[groups_idx_test] + idx_test <- unlist(groups_test) + attributes(idx_test)$group_test <- sapply(groups_test, length) + attributes(idx_test)$group_train <- sapply(indices[-groups_idx_test], length) + out[[idx]] <- idx_test + } + return(out) +} + # Creates CV folds stratified by the values of y. # It was borrowed from caret::createFolds and simplified # by always returning an unnamed list of fold indices. diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index ba0686cf9b04..5f6bc67ab74e 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -1227,8 +1227,11 @@ xgb.get.DMatrix.data <- function(dmat) { #' Get a new DMatrix containing the specified rows of #' original xgb.DMatrix object #' -#' @param object Object of class "xgb.DMatrix" -#' @param idxset a integer vector of indices of rows needed +#' @param object Object of class "xgb.DMatrix". +#' @param idxset An integer vector of indices of rows needed (base-1 indexing). +#' @param allow_groups Whether to allow slicing an `xgb.DMatrix` with `group` (or +#' equivalently `qid`) field. Note that in such case, the result will not have +#' the groups anymore - they need to be set manually through `setinfo`. #' @param colset currently not used (columns subsetting is not available) #' #' @examples @@ -1243,11 +1246,11 @@ xgb.get.DMatrix.data <- function(dmat) { #' #' @rdname xgb.slice.DMatrix #' @export -xgb.slice.DMatrix <- function(object, idxset) { +xgb.slice.DMatrix <- function(object, idxset, allow_groups = FALSE) { if (!inherits(object, "xgb.DMatrix")) { stop("object must be xgb.DMatrix") } - ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset) + ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset, allow_groups) attr_list <- attributes(object) nr <- nrow(object) @@ -1264,7 +1267,15 @@ xgb.slice.DMatrix <- function(object, idxset) { } } } - return(structure(ret, class = "xgb.DMatrix")) + + out <- structure(ret, class = "xgb.DMatrix") + parent_fields <- as.list(attributes(object)$fields) + if (NROW(parent_fields)) { + child_fields <- parent_fields[!(names(parent_fields) %in% c("group", "qid"))] + child_fields <- as.environment(child_fields) + attributes(out)$fields <- child_fields + } + return(out) } #' @rdname xgb.slice.DMatrix @@ -1308,11 +1319,11 @@ print.xgb.DMatrix <- function(x, verbose = FALSE, ...) 
{ } cat(class_print, ' dim:', nrow(x), 'x', ncol(x), ' info: ') - infos <- character(0) - if (xgb.DMatrix.hasinfo(x, 'label')) infos <- 'label' - if (xgb.DMatrix.hasinfo(x, 'weight')) infos <- c(infos, 'weight') - if (xgb.DMatrix.hasinfo(x, 'base_margin')) infos <- c(infos, 'base_margin') - if (length(infos) == 0) infos <- 'NA' + infos <- names(attributes(x)$fields) + infos <- infos[infos != "feature_name"] + if (!NROW(infos)) infos <- "NA" + infos <- infos[order(infos)] + infos <- paste(infos, collapse = ", ") cat(infos) cnames <- colnames(x) cat(' colnames:') diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 29bddb57f3e2..2e23e45e2ae7 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -1,6 +1,6 @@ #' Cross Validation #' -#' The cross validation function of xgboost +#' The cross validation function of xgboost. #' #' @param params the list of parameters. The complete list of parameters is #' available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below @@ -19,13 +19,17 @@ #' #' See \code{\link{xgb.train}} for further details. #' See also demo/ for walkthrough example in R. -#' @param data takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input. +#' +#' Note that, while `params` accepts a `seed` entry and will use such parameter for model training if +#' supplied, this seed is not used for creation of train-test splits, which instead rely on R's own RNG +#' system - thus, for reproducible results, one needs to call the `set.seed` function beforehand. +#' @param data An `xgb.DMatrix` object, with corresponding fields like `label` or bounds as required +#' for model training by the objective. +#' +#' Note that only the basic `xgb.DMatrix` class is supported - variants such as `xgb.QuantileDMatrix` +#' or `xgb.ExternalDMatrix` are not supported here. #' @param nrounds the max number of iterations #' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples. -#' @param label vector of response values. Should be provided only when data is an R-matrix. -#' @param missing is only used when input is a dense matrix. By default is set to NA, which means -#' that NA values should be considered as 'missing' by the algorithm. -#' Sometimes, 0 or other extreme value might be used to represent missing values. #' @param prediction A logical value indicating whether to return the test fold predictions #' from each CV model. This parameter engages the \code{\link{cb.cv.predict}} callback. #' @param showsd \code{boolean}, whether to show standard deviation of cross validation @@ -47,13 +51,30 @@ #' @param feval customized evaluation function. Returns #' \code{list(metric='metric-name', value='metric-value')} with given #' prediction and dtrain. -#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified -#' by the values of outcome labels. +#' @param stratified A \code{boolean} indicating whether sampling of folds should be stratified +#' by the values of outcome labels. For real-valued labels in regression objectives, +#' stratification will be done by discretizing the labels into up to 5 buckets beforehand. +#' +#' If passing "auto", will be set to `TRUE` if the objective in `params` is a classification +#' objective (from XGBoost's built-in objectives, doesn't apply to custom ones), and to +#' `FALSE` otherwise. 
+#' +#' This parameter is ignored when `data` has a `group` field - in such case, the splitting +#' will be based on whole groups (note that this might make the folds have different sizes). +#' +#' Value `TRUE` here is \bold{not} supported for custom objectives. #' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds #' (each element must be a vector of test fold's indices). When folds are supplied, #' the \code{nfold} and \code{stratified} parameters are ignored. +#' +#' If `data` has a `group` field and the objective requires this field, each fold (list element) +#' must additionally have two attributes (retrievable through \link{attributes}) named `group_test` +#' and `group_train`, which should hold the `group` to assign through \link{setinfo.xgb.DMatrix} to +#' the resulting DMatrices. #' @param train_folds \code{list} list specifying which indicies to use for training. If \code{NULL} #' (the default) all indices not specified in \code{folds} will be used for training. +#' +#' This is not supported when `data` has `group` field. #' @param verbose \code{boolean}, print the statistics during the process #' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}. #' Default is 1 which means all messages are printed. This parameter is passed to the @@ -118,13 +139,14 @@ #' print(cv, verbose=TRUE) #' #' @export -xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing = NA, +xgb.cv <- function(params = list(), data, nrounds, nfold, prediction = FALSE, showsd = TRUE, metrics = list(), - obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, train_folds = NULL, + obj = NULL, feval = NULL, stratified = "auto", folds = NULL, train_folds = NULL, verbose = TRUE, print_every_n = 1L, early_stopping_rounds = NULL, maximize = NULL, callbacks = list(), ...) { check.deprecation(...) + stopifnot(inherits(data, "xgb.DMatrix")) if (inherits(data, "xgb.DMatrix") && .Call(XGCheckNullPtr_R, data)) { stop("'data' is an invalid 'xgb.DMatrix' object. 
Must be constructed again.") } @@ -137,16 +159,19 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing check.custom.obj() check.custom.eval() - # Check the labels - if ((inherits(data, 'xgb.DMatrix') && !xgb.DMatrix.hasinfo(data, 'label')) || - (!inherits(data, 'xgb.DMatrix') && is.null(label))) { - stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix") - } else if (inherits(data, 'xgb.DMatrix')) { - if (!is.null(label)) - warning("xgb.cv: label will be ignored, since data is of type xgb.DMatrix") - cv_label <- getinfo(data, 'label') - } else { - cv_label <- label + if (stratified == "auto") { + if (is.character(params$objective)) { + stratified <- params$objective %in% .CLASSIFICATION_OBJECTIVES() + } else { + stratified <- FALSE + } + } + + # Check the labels and groups + cv_label <- getinfo(data, "label") + cv_group <- getinfo(data, "group") + if (!is.null(train_folds) && NROW(cv_group)) { + stop("'train_folds' is not supported for DMatrix object with 'group' field.") } # CV folds @@ -157,7 +182,7 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing } else { if (nfold <= 1) stop("'nfold' must be > 1") - folds <- generate.cv.folds(nfold, nrow(data), stratified, cv_label, params) + folds <- generate.cv.folds(nfold, nrow(data), stratified, cv_label, cv_group, params) } # verbosity & evaluation printing callback: @@ -189,20 +214,18 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing # create the booster-folds # train_folds - dall <- xgb.get.DMatrix( - data = data, - label = label, - missing = missing, - weight = NULL, - nthread = params$nthread - ) + dall <- data bst_folds <- lapply(seq_along(folds), function(k) { - dtest <- xgb.slice.DMatrix(dall, folds[[k]]) + dtest <- xgb.slice.DMatrix(dall, folds[[k]], allow_groups = TRUE) # code originally contributed by @RolandASc on stackoverflow if (is.null(train_folds)) - dtrain <- xgb.slice.DMatrix(dall, unlist(folds[-k])) + dtrain <- xgb.slice.DMatrix(dall, unlist(folds[-k]), allow_groups = TRUE) else - dtrain <- xgb.slice.DMatrix(dall, train_folds[[k]]) + dtrain <- xgb.slice.DMatrix(dall, train_folds[[k]], allow_groups = TRUE) + if (!is.null(attributes(folds[[k]])$group_test)) { + setinfo(dtest, "group", attributes(folds[[k]])$group_test) + setinfo(dtrain, "group", attributes(folds[[k]])$group_train) + } bst <- xgb.Booster( params = params, cachelist = list(dtrain, dtest), diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 9f6103a52762..7fdd197a4498 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -9,14 +9,12 @@ xgb.cv( data, nrounds, nfold, - label = NULL, - missing = NA, prediction = FALSE, showsd = TRUE, metrics = list(), obj = NULL, feval = NULL, - stratified = TRUE, + stratified = "auto", folds = NULL, train_folds = NULL, verbose = TRUE, @@ -44,19 +42,22 @@ is a shorter summary: } See \code{\link{xgb.train}} for further details. -See also demo/ for walkthrough example in R.} +See also demo/ for walkthrough example in R. 
-\item{data}{takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.} +Note that, while \code{params} accepts a \code{seed} entry and will use such parameter for model training if +supplied, this seed is not used for creation of train-test splits, which instead rely on R's own RNG +system - thus, for reproducible results, one needs to call the \code{set.seed} function beforehand.} -\item{nrounds}{the max number of iterations} +\item{data}{An \code{xgb.DMatrix} object, with corresponding fields like \code{label} or bounds as required +for model training by the objective. -\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.} +\if{html}{\out{
}}\preformatted{ Note that only the basic `xgb.DMatrix` class is supported - variants such as `xgb.QuantileDMatrix` + or `xgb.ExternalDMatrix` are not supported here. +}\if{html}{\out{
}}} -\item{label}{vector of response values. Should be provided only when data is an R-matrix.} +\item{nrounds}{the max number of iterations} -\item{missing}{is only used when input is a dense matrix. By default is set to NA, which means -that NA values should be considered as 'missing' by the algorithm. -Sometimes, 0 or other extreme value might be used to represent missing values.} +\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.} \item{prediction}{A logical value indicating whether to return the test fold predictions from each CV model. This parameter engages the \code{\link{cb.cv.predict}} callback.} @@ -84,15 +85,35 @@ gradient with given prediction and dtrain.} \code{list(metric='metric-name', value='metric-value')} with given prediction and dtrain.} -\item{stratified}{a \code{boolean} indicating whether sampling of folds should be stratified -by the values of outcome labels.} +\item{stratified}{A \code{boolean} indicating whether sampling of folds should be stratified +by the values of outcome labels. For real-valued labels in regression objectives, +stratification will be done by discretizing the labels into up to 5 buckets beforehand. + +\if{html}{\out{
}}\preformatted{ If passing "auto", will be set to `TRUE` if the objective in `params` is a classification + objective (from XGBoost's built-in objectives, doesn't apply to custom ones), and to + `FALSE` otherwise. + + This parameter is ignored when `data` has a `group` field - in such case, the splitting + will be based on whole groups (note that this might make the folds have different sizes). + + Value `TRUE` here is \\bold\{not\} supported for custom objectives. +}\if{html}{\out{
}}} \item{folds}{\code{list} provides a possibility to use a list of pre-defined CV folds (each element must be a vector of test fold's indices). When folds are supplied, -the \code{nfold} and \code{stratified} parameters are ignored.} +the \code{nfold} and \code{stratified} parameters are ignored. + +\if{html}{\out{
}}\preformatted{ If `data` has a `group` field and the objective requires this field, each fold (list element) + must additionally have two attributes (retrievable through \link{attributes}) named `group_test` + and `group_train`, which should hold the `group` to assign through \link{setinfo.xgb.DMatrix} to + the resulting DMatrices. +}\if{html}{\out{
}}} \item{train_folds}{\code{list} list specifying which indicies to use for training. If \code{NULL} -(the default) all indices not specified in \code{folds} will be used for training.} +(the default) all indices not specified in \code{folds} will be used for training. + +\if{html}{\out{
}}\preformatted{ This is not supported when `data` has `group` field. +}\if{html}{\out{
}}} \item{verbose}{\code{boolean}, print the statistics during the process} @@ -142,7 +163,7 @@ setting of the \code{cb.cv.predict(save_models = TRUE)} callback. } } \description{ -The cross validation function of xgboost +The cross validation function of xgboost. } \details{ The original sample is randomly partitioned into \code{nfold} equal size subsamples. diff --git a/R-package/man/xgb.slice.DMatrix.Rd b/R-package/man/xgb.slice.DMatrix.Rd index c9695996b66f..c4f7765943bb 100644 --- a/R-package/man/xgb.slice.DMatrix.Rd +++ b/R-package/man/xgb.slice.DMatrix.Rd @@ -6,14 +6,18 @@ \title{Get a new DMatrix containing the specified rows of original xgb.DMatrix object} \usage{ -xgb.slice.DMatrix(object, idxset) +xgb.slice.DMatrix(object, idxset, allow_groups = FALSE) \method{[}{xgb.DMatrix}(object, idxset, colset = NULL) } \arguments{ -\item{object}{Object of class "xgb.DMatrix"} +\item{object}{Object of class "xgb.DMatrix".} -\item{idxset}{a integer vector of indices of rows needed} +\item{idxset}{An integer vector of indices of rows needed (base-1 indexing).} + +\item{allow_groups}{Whether to allow slicing an \code{xgb.DMatrix} with \code{group} (or +equivalently \code{qid}) field. Note that in such case, the result will not have +the groups anymore - they need to be set manually through \code{setinfo}.} \item{colset}{currently not used (columns subsetting is not available)} } diff --git a/R-package/src/init.c b/R-package/src/init.c index a9f3f3e380c2..4453cecc5c90 100644 --- a/R-package/src/init.c +++ b/R-package/src/init.c @@ -68,7 +68,7 @@ extern SEXP XGDMatrixGetDataAsCSR_R(SEXP); extern SEXP XGDMatrixSaveBinary_R(SEXP, SEXP, SEXP); extern SEXP XGDMatrixSetInfo_R(SEXP, SEXP, SEXP); extern SEXP XGDMatrixSetStrFeatureInfo_R(SEXP, SEXP, SEXP); -extern SEXP XGDMatrixSliceDMatrix_R(SEXP, SEXP); +extern SEXP XGDMatrixSliceDMatrix_R(SEXP, SEXP, SEXP); extern SEXP XGBSetGlobalConfig_R(SEXP); extern SEXP XGBGetGlobalConfig_R(void); extern SEXP XGBoosterFeatureScore_R(SEXP, SEXP); @@ -127,7 +127,7 @@ static const R_CallMethodDef CallEntries[] = { {"XGDMatrixSaveBinary_R", (DL_FUNC) &XGDMatrixSaveBinary_R, 3}, {"XGDMatrixSetInfo_R", (DL_FUNC) &XGDMatrixSetInfo_R, 3}, {"XGDMatrixSetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixSetStrFeatureInfo_R, 3}, - {"XGDMatrixSliceDMatrix_R", (DL_FUNC) &XGDMatrixSliceDMatrix_R, 2}, + {"XGDMatrixSliceDMatrix_R", (DL_FUNC) &XGDMatrixSliceDMatrix_R, 3}, {"XGBSetGlobalConfig_R", (DL_FUNC) &XGBSetGlobalConfig_R, 1}, {"XGBGetGlobalConfig_R", (DL_FUNC) &XGBGetGlobalConfig_R, 0}, {"XGBoosterFeatureScore_R", (DL_FUNC) &XGBoosterFeatureScore_R, 2}, diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index c91fb94c447c..353556e49838 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -505,7 +505,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP return ret; } -XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) { +XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset, SEXP allow_groups) { SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); R_API_BEGIN(); R_xlen_t len = Rf_xlength(idxset); @@ -524,7 +524,7 @@ XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) { res_code = XGDMatrixSliceDMatrixEx(R_ExternalPtrAddr(handle), BeginPtr(idxvec), len, &res, - 0); + Rf_asLogical(allow_groups)); } CHECK_CALL(res_code); R_SetExternalPtrAddr(ret, res); diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index d2e0ae82855d..582a87afd43a 100644 --- 
a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -111,9 +111,10 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP * \brief create a new dmatrix from sliced content of existing matrix * \param handle instance of data matrix to be sliced * \param idxset index set + * \param allow_groups Whether to allow slicing the DMatrix if it has a 'group' field * \return a sliced new matrix */ -XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset); +XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset, SEXP allow_groups); /*! * \brief load a data matrix into binary file diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index fb3162e423ce..9236e0245262 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -334,7 +334,7 @@ test_that("xgb.cv works", { set.seed(11) expect_output( cv <- xgb.cv( - data = train$data, label = train$label, max_depth = 2, nfold = 5, + data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, nfold = 5, eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic", eval_metric = "error", verbose = TRUE ), @@ -358,13 +358,13 @@ test_that("xgb.cv works with stratified folds", { cv <- xgb.cv( data = dtrain, max_depth = 2, nfold = 5, eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic", - verbose = TRUE, stratified = FALSE + verbose = FALSE, stratified = FALSE ) set.seed(314159) cv2 <- xgb.cv( data = dtrain, max_depth = 2, nfold = 5, eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic", - verbose = TRUE, stratified = TRUE + verbose = FALSE, stratified = TRUE ) # Stratified folds should result in a different evaluation logs expect_true(all(cv$evaluation_log[, test_logloss_mean] != cv2$evaluation_log[, test_logloss_mean])) @@ -841,3 +841,56 @@ test_that("Seed in params override PRNG from R", { ) ) }) + +test_that("xgb.cv works for AFT", { + X <- matrix(c(1, -1, -1, 1, 0, 1, 1, 0), nrow = 4, byrow = TRUE) # 4x2 matrix + dtrain <- xgb.DMatrix(X, nthread = n_threads) + + params <- list(objective = 'survival:aft', learning_rate = 0.2, max_depth = 2L) + + # data must have bounds + expect_error( + xgb.cv( + params = params, + data = dtrain, + nround = 5L, + nfold = 4L, + nthread = n_threads + ) + ) + + setinfo(dtrain, 'label_lower_bound', c(2, 3, 0, 4)) + setinfo(dtrain, 'label_upper_bound', c(2, Inf, 4, 5)) + + # automatic stratified splitting is turned off + expect_warning( + xgb.cv( + params = params, data = dtrain, nround = 5L, nfold = 4L, + nthread = n_threads, stratified = TRUE, verbose = FALSE + ) + ) + + # this works without any issue + expect_no_warning( + xgb.cv(params = params, data = dtrain, nround = 5L, nfold = 4L, verbose = FALSE) + ) +}) + +test_that("xgb.cv works for ranking", { + data(iris) + x <- iris[, -(4:5)] + y <- as.integer(iris$Petal.Width) + group <- rep(50, 3) + dm <- xgb.DMatrix(x, label = y, group = group) + res <- xgb.cv( + data = dm, + params = list( + objective = "rank:pairwise", + max_depth = 3 + ), + nrounds = 3, + nfold = 2, + verbose = FALSE + ) + expect_equal(length(res$folds), 2L) +}) diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R index c60d0c246f81..76ff66046515 100644 --- a/R-package/tests/testthat/test_callbacks.R +++ b/R-package/tests/testthat/test_callbacks.R @@ -321,7 +321,7 @@ test_that("prediction in early-stopping xgb.cv works", { expect_output( cv <- xgb.cv(param, dtrain, nfold = 5, 
eta = 0.1, nrounds = 20, early_stopping_rounds = 5, maximize = FALSE, stratified = FALSE, - prediction = TRUE, base_score = 0.5) + prediction = TRUE, base_score = 0.5, verbose = TRUE) , "Stopping. Best iteration") expect_false(is.null(cv$best_iteration)) @@ -341,7 +341,7 @@ test_that("prediction in xgb.cv for softprob works", { lb <- as.numeric(iris$Species) - 1 set.seed(11) expect_warning( - cv <- xgb.cv(data = as.matrix(iris[, -5]), label = lb, nfold = 4, + cv <- xgb.cv(data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb), nfold = 4, eta = 0.5, nrounds = 5, max_depth = 3, nthread = n_threads, subsample = 0.8, gamma = 2, verbose = 0, prediction = TRUE, objective = "multi:softprob", num_class = 3) diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 50621f241c3e..f2df9a5778c1 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -243,7 +243,7 @@ test_that("xgb.DMatrix: print", { txt <- capture.output({ print(dtrain) }) - expect_equal(txt, "xgb.DMatrix dim: 6513 x 126 info: label weight base_margin colnames: yes") + expect_equal(txt, "xgb.DMatrix dim: 6513 x 126 info: base_margin, label, weight colnames: yes") # DMatrix with just features dtrain <- xgb.DMatrix( @@ -692,3 +692,41 @@ test_that("xgb.DMatrix: quantile cuts look correct", { } ) }) + +test_that("xgb.DMatrix: slicing keeps field indicators", { + data(mtcars) + x <- as.matrix(mtcars[, -1]) + y <- mtcars[, 1] + dm <- xgb.DMatrix( + data = x, + label_lower_bound = -y, + label_upper_bound = y, + nthread = 1 + ) + idx_take <- seq(1, 5) + dm_slice <- xgb.slice.DMatrix(dm, idx_take) + + expect_true(xgb.DMatrix.hasinfo(dm_slice, "label_lower_bound")) + expect_true(xgb.DMatrix.hasinfo(dm_slice, "label_upper_bound")) + expect_false(xgb.DMatrix.hasinfo(dm_slice, "label")) + + expect_equal(getinfo(dm_slice, "label_lower_bound"), -y[idx_take], tolerance = 1e-6) + expect_equal(getinfo(dm_slice, "label_upper_bound"), y[idx_take], tolerance = 1e-6) +}) + +test_that("xgb.DMatrix: can slice with groups", { + data(iris) + x <- as.matrix(iris[, -5]) + set.seed(123) + y <- sample(3, size = nrow(x), replace = TRUE) + group <- c(50, 50, 50) + dm <- xgb.DMatrix(x, label = y, group = group, nthread = 1) + idx_take <- seq(1, 50) + dm_slice <- xgb.slice.DMatrix(dm, idx_take, allow_groups = TRUE) + + expect_true(xgb.DMatrix.hasinfo(dm_slice, "label")) + expect_false(xgb.DMatrix.hasinfo(dm_slice, "group")) + expect_false(xgb.DMatrix.hasinfo(dm_slice, "qid")) + expect_null(getinfo(dm_slice, "group")) + expect_equal(getinfo(dm_slice, "label"), y[idx_take], tolerance = 1e-6) +}) From c94d018149eefb6dc0d3788fa4ea9ab42247829e Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sun, 4 Feb 2024 21:21:29 +0100 Subject: [PATCH 2/7] linter --- R-package/R/utils.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 4e5163744b39..96a4e5949e3b 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -294,8 +294,8 @@ generate.group.folds <- function(nfold, group) { groups_idx_test <- randomized_groups[assignments[[idx]]] groups_test <- indices[groups_idx_test] idx_test <- unlist(groups_test) - attributes(idx_test)$group_test <- sapply(groups_test, length) - attributes(idx_test)$group_train <- sapply(indices[-groups_idx_test], length) + attributes(idx_test)$group_test <- lengths(groups_test) + attributes(idx_test)$group_train <- lengths(indices[-groups_idx_test]) out[[idx]] <- idx_test } return(out) From 
82eb309b2666103bb107294d7f887b78176b38bf Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sun, 4 Feb 2024 21:26:17 +0100 Subject: [PATCH 3/7] fix example --- R-package/R/xgb.cv.R | 4 ++-- R-package/man/print.xgb.cv.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 2e23e45e2ae7..3e59c3a75298 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -308,8 +308,8 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, #' @examples #' data(agaricus.train, package='xgboost') #' train <- agaricus.train -#' cv <- xgb.cv(data = train$data, label = train$label, nfold = 5, max_depth = 2, -#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") +#' cv <- xgb.cv(data = xgb.DMatrix(train$data, label = train$label), nfold = 5, max_depth = 2, +#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") #' print(cv) #' print(cv, verbose=TRUE) #' diff --git a/R-package/man/print.xgb.cv.Rd b/R-package/man/print.xgb.cv.Rd index 05ad61eed8ac..74fc15d01fb9 100644 --- a/R-package/man/print.xgb.cv.Rd +++ b/R-package/man/print.xgb.cv.Rd @@ -23,8 +23,8 @@ including the best iteration (when available). \examples{ data(agaricus.train, package='xgboost') train <- agaricus.train -cv <- xgb.cv(data = train$data, label = train$label, nfold = 5, max_depth = 2, - eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") +cv <- xgb.cv(data = xgb.DMatrix(train$data, label = train$label), nfold = 5, max_depth = 2, + eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") print(cv) print(cv, verbose=TRUE) From 3639ef133be5adbe954ffef3728ce9d3bbf3e050 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 26 Mar 2024 07:17:14 +0100 Subject: [PATCH 4/7] Update R-package/R/utils.R Co-authored-by: Philip Hyunsu Cho --- R-package/R/utils.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 96a4e5949e3b..9867d15054d5 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -241,6 +241,7 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) { } objective <- params$objective if (!is.character(objective)) { + warning("Will use unstratified splitting (custom objective used)") stratified <- FALSE } # cannot stratify if label is NULL From f083ff732e2c2ff310b2f74107e92476d6b44987 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 26 Mar 2024 07:20:25 +0100 Subject: [PATCH 5/7] error out on stratified+group CV sampling --- R-package/R/utils.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 9867d15054d5..5af7683e98c5 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -237,6 +237,9 @@ convert.labels <- function(labels, objective_name) { # Generates random (stratified if needed) CV folds generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) { if (NROW(group)) { + if (stratified) { + stop("stratified sampling is not supported when using 'group' attribute.") + } return(generate.group.folds(nfold, group)) } objective <- params$objective From ba1452545d8651c72f0cd934b2b70575cd434d47 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 26 Mar 2024 17:49:09 +0100 Subject: [PATCH 6/7] change error to warning --- R-package/R/utils.R | 7 ++++++- R-package/tests/testthat/test_basic.R | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index e80a86a6ad44..0fcb3286d178 100644 --- a/R-package/R/utils.R 
+++ b/R-package/R/utils.R
@@ -238,7 +238,12 @@ convert.labels <- function(labels, objective_name) {
 generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) {
   if (NROW(group)) {
     if (stratified) {
-      stop("stratified sampling is not supported when using 'group' attribute.")
+      warning(
+        paste0(
+          "Stratified splitting is not supported when using 'group' attribute.",
+          " Will use unstratified splitting."
+        )
+      )
     }
     return(generate.group.folds(nfold, group))
   }
diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index f1a999ff389c..bbb8fb323478 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -934,7 +934,8 @@ test_that("xgb.cv works for ranking", {
     ),
     nrounds = 3,
     nfold = 2,
-    verbose = FALSE
+    verbose = FALSE,
+    stratified = FALSE
   )
   expect_equal(length(res$folds), 2L)
 })

From c06b969175db20ffdc419bc2e7556828be6cc583 Mon Sep 17 00:00:00 2001
From: david-cortes
Date: Tue, 26 Mar 2024 17:51:34 +0100
Subject: [PATCH 7/7] don't default to stratified sampling for ranking objectives

---
 R-package/R/utils.R  | 5 +++++
 R-package/R/xgb.cv.R | 5 ++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index 0fcb3286d178..7b6a20f704dd 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -26,6 +26,11 @@ NVL <- function(x, val) {
           'multi:softprob', 'rank:pairwise', 'rank:ndcg', 'rank:map'))
 }
 
+.RANKING_OBJECTIVES <- function() {
+  return(c('rank:pairwise', 'rank:ndcg',
+           'rank:map'))
+}
+
 #
 # Low-level functions for boosting --------------------------------------------
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index 927aa5ebbaa6..880fd56974bc 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -161,7 +161,10 @@ xgb.cv <- function(params = list(), data, nrounds, nfold,
 
   if (stratified == "auto") {
     if (is.character(params$objective)) {
-      stratified <- params$objective %in% .CLASSIFICATION_OBJECTIVES()
+      stratified <- (
+        (params$objective %in% .CLASSIFICATION_OBJECTIVES())
+        && !(params$objective %in% .RANKING_OBJECTIVES())
+      )
     } else {
       stratified <- FALSE
     }
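
Usage note for reviewers (not part of the patches above): the short R sketch below shows how the group-aware CV added by this series might be exercised end to end. It mirrors the new "xgb.cv works for ranking" test and the updated documentation; the dataset, parameter values, and the set.seed() call are illustrative choices rather than anything prescribed by the patches.

library(xgboost)

# Toy ranking data: 150 rows split into 3 query groups of 50 rows each,
# as in the new test in test_basic.R.
data(iris)
x <- as.matrix(iris[, -5])
y <- as.integer(iris$Petal.Width)
dm <- xgb.DMatrix(x, label = y, group = rep(50, 3))

# Folds are drawn with R's own RNG (not params$seed), so seed R for reproducible splits.
set.seed(123)
res <- xgb.cv(
  data = dm,
  params = list(objective = "rank:pairwise", max_depth = 3),
  nrounds = 3,
  nfold = 2,
  stratified = FALSE,  # stratification is not supported together with a 'group' field
  verbose = FALSE
)
length(res$folds)  # one entry per fold; whole query groups are kept together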
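
Similarly, the new `folds` documentation requires user-supplied folds to carry `group_test` and `group_train` attributes whenever the data has a `group` field, since the sliced DMatrices lose that field and xgb.cv re-assigns it via setinfo. A minimal sketch of what such a list could look like for the same toy DMatrix; the split into one held-out group versus two is an arbitrary choice for illustration.

# Fold 1 holds out query group 1; fold 2 holds out query groups 2 and 3.
fold1 <- 1:50
attributes(fold1)$group_test <- 50          # group sizes of the held-out rows
attributes(fold1)$group_train <- c(50, 50)  # group sizes of the remaining rows
fold2 <- 51:150
attributes(fold2)$group_test <- c(50, 50)
attributes(fold2)$group_train <- 50

res_manual <- xgb.cv(
  data = dm,
  params = list(objective = "rank:ndcg"),
  nrounds = 3,
  nfold = 2,  # ignored in favour of 'folds', supplied here only for clarity
  folds = list(fold1, fold2),
  verbose = FALSE
)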
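
Finally, for the survival case, a sketch along the lines of the new AFT test: the censoring bounds live on the DMatrix as `label_lower_bound` / `label_upper_bound` fields, and with the default stratified = "auto" no stratification (and no warning) is attempted. The matrix and bound values below are the toy values from that test.

X <- matrix(c(1, -1, -1, 1, 0, 1, 1, 0), nrow = 4, byrow = TRUE)
dtrain <- xgb.DMatrix(X, nthread = 1)
setinfo(dtrain, "label_lower_bound", c(2, 3, 0, 4))
setinfo(dtrain, "label_upper_bound", c(2, Inf, 4, 5))

res_aft <- xgb.cv(
  params = list(objective = "survival:aft", learning_rate = 0.2, max_depth = 2L),
  data = dtrain,
  nrounds = 5L,
  nfold = 4L,
  verbose = FALSE
)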