diff --git a/Makefile b/Makefile
index bfa1731f0131..64a7ff6ccce4 100644
--- a/Makefile
+++ b/Makefile
@@ -91,10 +91,7 @@ endif
 # If any of the dask tests failed, contributor won't see the other error.
 mypy:
 	cd python-package; \
-	mypy ./xgboost/dask.py && \
-	mypy ./xgboost/rabit.py && \
-	mypy ./xgboost/tracker.py && \
-	mypy ./xgboost/sklearn.py && \
+	mypy . && \
 	mypy ../demo/guide-python/external_memory.py && \
 	mypy ../demo/guide-python/categorical.py && \
 	mypy ../demo/guide-python/cat_in_the_dat.py && \
diff --git a/doc/tutorials/saving_model.rst b/doc/tutorials/saving_model.rst
index 723cde431bc4..87e54054c66a 100644
--- a/doc/tutorials/saving_model.rst
+++ b/doc/tutorials/saving_model.rst
@@ -68,6 +68,12 @@ a filename with ``.json`` or ``.ubj`` as file extension, the latter is the exten
 
   xgb.save(bst, 'model_file_name.json')
 
+.. note::
+
+  Only load models from JSON files that were produced by XGBoost. Attempting to load
+  JSON files that were produced by an external source may lead to undefined behaviors
+  and crashes.
+
 While for memory snapshot, UBJSON is the default starting with xgboost 1.6.
 
 ***************************************************************
diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h
index cce92d3679f4..a731bfac84ed 100644
--- a/include/xgboost/gbm.h
+++ b/include/xgboost/gbm.h
@@ -111,15 +111,14 @@ class GradientBooster : public Model, public Configurable {
   /*!
    * \brief Inplace prediction.
    *
-   * \param           x                      A type erased data adapter.
+   * \param           p_fmat                 A proxy DMatrix that contains the data and related
+   *                                         meta info.
    * \param           missing                Missing value in the data.
    * \param [in,out]  out_preds              The output preds.
    * \param           layer_begin (Optional) Beginning of boosted tree layer used for prediction.
    * \param           layer_end   (Optional) End of booster layer. 0 means do not limit trees.
    */
-  virtual void InplacePredict(dmlc::any const &, std::shared_ptr<DMatrix>, float,
-                              PredictionCacheEntry*,
-                              uint32_t,
+  virtual void InplacePredict(std::shared_ptr<DMatrix>, float, PredictionCacheEntry*, uint32_t,
                               uint32_t) const {
     LOG(FATAL) << "Inplace predict is not supported by current booster.";
   }
diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h
index 80004e6a8a01..b16ea67ecd5c 100644
--- a/include/xgboost/learner.h
+++ b/include/xgboost/learner.h
@@ -139,21 +139,16 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
   /*!
    * \brief Inplace prediction.
    *
-   * \param          x           A type erased data adapter.
-   * \param          p_m         An optional Proxy DMatrix object storing meta info like
-   *                             base margin.  Can be nullptr.
+   * \param          p_fmat      A proxy DMatrix that contains the data and related meta info.
    * \param          type        Prediction type.
    * \param          missing     Missing value in the data.
    * \param [in,out] out_preds   Pointer to output prediction vector.
    * \param          layer_begin Beginning of boosted tree layer used for prediction.
    * \param          layer_end   End of booster layer. 0 means do not limit trees.
    */
-  virtual void InplacePredict(dmlc::any const &x,
-                              std::shared_ptr<DMatrix> p_m,
-                              PredictionType type,
-                              float missing,
-                              HostDeviceVector<bst_float> **out_preds,
-                              uint32_t layer_begin, uint32_t layer_end) = 0;
+  virtual void InplacePredict(std::shared_ptr<DMatrix> p_m, PredictionType type, float missing,
+                              HostDeviceVector<bst_float>** out_preds, uint32_t layer_begin,
+                              uint32_t layer_end) = 0;
 
   /*!
    * \brief Calculate feature score.  See doc in C API for outputs.
diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h
index 5063922617b5..33c695bc19bf 100644
--- a/include/xgboost/predictor.h
+++ b/include/xgboost/predictor.h
@@ -145,7 +145,9 @@ class Predictor {
 
   /**
    * \brief Inplace prediction.
-   * \param           x                      Type erased data adapter.
+   *
+   * \param           p_fmat                 A proxy DMatrix that contains the data and related
+   *                                         meta info.
    * \param           model                  The model to predict from.
    * \param           missing                Missing value in the data.
    * \param [in,out]  out_preds              The output preds.
@@ -154,11 +156,9 @@ class Predictor {
    *
    * \return True if the data can be handled by current predictor, false otherwise.
    */
-  virtual bool InplacePredict(dmlc::any const &x, std::shared_ptr<DMatrix> p_m,
-                              const gbm::GBTreeModel &model, float missing,
-                              PredictionCacheEntry *out_preds,
-                              uint32_t tree_begin = 0,
-                              uint32_t tree_end = 0) const = 0;
+  virtual bool InplacePredict(std::shared_ptr<DMatrix> p_fmat, const gbm::GBTreeModel& model,
+                              float missing, PredictionCacheEntry* out_preds,
+                              uint32_t tree_begin = 0, uint32_t tree_end = 0) const = 0;
   /**
    * \brief online prediction function, predict score for one instance at a time
    * NOTE: use the batch prediction interface if possible, batch prediction is
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
index 756b7b54b161..08d186d6f84e 100644
--- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
+++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
@@ -61,15 +61,14 @@ class GpuPreXGBoost extends PreXGBoostProvider {
    * @param estimator [[XGBoostClassifier]] or [[XGBoostRegressor]]
    * @param dataset   the training data
    * @param params    all user defined and defaulted params
-   * @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ])
-   *         Boolean if building DMatrix in rabit context
+   * @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ])
    *         RDD[() => Watches] will be used as the training input
    *         Option[ RDD[_] ] is the optional cached RDD
    */
   override def buildDatasetToRDD(estimator: Estimator[_],
       dataset: Dataset[_],
       params: Map[String, Any]):
-    XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = {
+    XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]) = {
     GpuPreXGBoost.buildDatasetToRDD(estimator, dataset, params)
   }
 
@@ -123,8 +122,7 @@ object GpuPreXGBoost extends PreXGBoostProvider {
    * @param estimator supports XGBoostClassifier and XGBoostRegressor
    * @param dataset   the training data
    * @param params    all user defined and defaulted params
-   * @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ])
-   *         Boolean if building DMatrix in rabit context
+   * @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ])
    *         RDD[() => Watches] will be used as the training input to build DMatrix
    *         Option[ RDD[_] ] is the optional cached RDD
    */
@@ -132,7 +130,7 @@ object GpuPreXGBoost extends PreXGBoostProvider {
       estimator: Estimator[_],
       dataset: Dataset[_],
       params: Map[String, Any]):
-    XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = {
+    XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]) = {
 
     val (Seq(labelName, weightName, marginName), feturesCols, groupName, evalSets) =
       estimator match {
@@ -170,7 +168,7 @@ object GpuPreXGBoost extends PreXGBoostProvider {
     xgbExecParams: XGBoostExecutionParams =>
       val dataMap = prepareInputData(trainingData, evalDataMap, xgbExecParams.numWorkers,
         xgbExecParams.cacheTrainingSet)
-      (true, buildRDDWatches(dataMap, xgbExecParams, evalDataMap.isEmpty), None)
+      (buildRDDWatches(dataMap, xgbExecParams, evalDataMap.isEmpty), None)
   }
 
   /**
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala
index 01eb3d0a4f32..13484f490f5b 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala
@@ -101,8 +101,7 @@ object PreXGBoost extends PreXGBoostProvider {
    * @param estimator supports XGBoostClassifier and XGBoostRegressor
    * @param dataset the training data
    * @param params all user defined and defaulted params
-   * @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ])
-   *         Boolean if building DMatrix in rabit context
+   * @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ])
    *         RDD[() => Watches] will be used as the training input
    *         Option[RDD[_]\] is the optional cached RDD
    */
@@ -110,7 +109,7 @@ object PreXGBoost extends PreXGBoostProvider {
       estimator: Estimator[_],
       dataset: Dataset[_],
       params: Map[String, Any]): XGBoostExecutionParams =>
-    (Boolean, RDD[() => Watches], Option[RDD[_]]) = {
+    (RDD[() => Watches], Option[RDD[_]]) = {
 
     if (optionProvider.isDefined && optionProvider.get.providerEnabled(Some(dataset))) {
       return optionProvider.get.buildDatasetToRDD(estimator, dataset, params)
@@ -172,12 +171,12 @@ object PreXGBoost extends PreXGBoostProvider {
           val cachedRDD = if (xgbExecParams.cacheTrainingSet) {
             Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK))
           } else None
-          (false, trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
+          (trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
         case Right(trainingData) =>
           val cachedRDD = if (xgbExecParams.cacheTrainingSet) {
             Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK))
           } else None
-          (false, trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
+          (trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
       }
 
   }
@@ -324,7 +323,7 @@ object PreXGBoost extends PreXGBoostProvider {
       trainingSet: RDD[XGBLabeledPoint],
       evalRDDMap: Map[String, RDD[XGBLabeledPoint]] = Map(),
       hasGroup: Boolean = false):
-  XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = {
+  XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]) = {
 
     xgbExecParams: XGBoostExecutionParams =>
       composeInputData(trainingSet, hasGroup, xgbExecParams.numWorkers) match {
@@ -332,12 +331,12 @@ object PreXGBoost extends PreXGBoostProvider {
           val cachedRDD = if (xgbExecParams.cacheTrainingSet) {
             Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK))
           } else None
-          (false, trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
+          (trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
         case Right(trainingData) =>
           val cachedRDD = if (xgbExecParams.cacheTrainingSet) {
             Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK))
           } else None
-          (false, trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
+          (trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
       }
   }
 
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoostProvider.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoostProvider.scala
index d133aea288dd..4c4dbdec1e53 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoostProvider.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoostProvider.scala
@@ -50,8 +50,7 @@ private[scala] trait PreXGBoostProvider {
    * @param estimator supports XGBoostClassifier and XGBoostRegressor
    * @param dataset the training data
    * @param params all user defined and defaulted params
-   * @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ])
-   *         Boolean if building DMatrix in rabit context
+   * @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ])
    *         RDD[() => Watches] will be used as the training input to build DMatrix
    *         Option[ RDD[_] ] is the optional cached RDD
    */
@@ -59,7 +58,7 @@ private[scala] trait PreXGBoostProvider {
     estimator: Estimator[_],
     dataset: Dataset[_],
     params: Map[String, Any]):
-  XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]])
+  XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]])
 
   /**
    * Transform Dataset
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
index 6cfabcfaca17..fa22e8939e29 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
@@ -286,7 +286,6 @@ object XGBoost extends Serializable {
   }
 
   private def buildDistributedBooster(
-      buildDMatrixInRabit: Boolean,
       buildWatches: () => Watches,
       xgbExecutionParam: XGBoostExecutionParams,
       rabitEnv: java.util.Map[String, String],
@@ -295,11 +294,6 @@ object XGBoost extends Serializable {
       prevBooster: Booster): Iterator[(Booster, Map[String, Array[Float]])] = {
 
     var watches: Watches = null
-    if (!buildDMatrixInRabit) {
-      // for CPU pipeline, we need to build DMatrix out of rabit context
-      watches = buildWatchesAndCheck(buildWatches)
-    }
-
     val taskId = TaskContext.getPartitionId().toString
     val attempt = TaskContext.get().attemptNumber.toString
     rabitEnv.put("DMLC_TASK_ID", taskId)
@@ -310,10 +304,7 @@ object XGBoost extends Serializable {
     try {
       Rabit.init(rabitEnv)
 
-      if (buildDMatrixInRabit) {
-        // for GPU pipeline, we need to move dmatrix building into rabit context
-        watches = buildWatchesAndCheck(buildWatches)
-      }
+      watches = buildWatchesAndCheck(buildWatches)
 
       val numEarlyStoppingRounds = xgbExecutionParam.earlyStoppingParams.numEarlyStoppingRounds
       val metrics = Array.tabulate(watches.size)(_ => Array.ofDim[Float](numRounds))
@@ -377,7 +368,7 @@ object XGBoost extends Serializable {
   @throws(classOf[XGBoostError])
   private[spark] def trainDistributed(
       sc: SparkContext,
-      buildTrainingData: XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]),
+      buildTrainingData: XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]),
       params: Map[String, Any]):
     (Booster, Map[String, Array[Float]]) = {
 
@@ -396,7 +387,7 @@ object XGBoost extends Serializable {
     }.orNull
 
     // Get the training data RDD and the cachedRDD
-    val (buildDMatrixInRabit, trainingRDD, optionalCachedRDD) = buildTrainingData(xgbExecParams)
+    val (trainingRDD, optionalCachedRDD) = buildTrainingData(xgbExecParams)
 
     try {
       // Train for every ${savingRound} rounds and save the partially completed booster
@@ -413,9 +404,8 @@ object XGBoost extends Serializable {
             optionWatches = Some(iter.next())
           }
 
-          optionWatches.map { buildWatches => buildDistributedBooster(buildDMatrixInRabit,
-            buildWatches, xgbExecParams, rabitEnv, xgbExecParams.obj,
-            xgbExecParams.eval, prevBooster)}
+          optionWatches.map { buildWatches => buildDistributedBooster(buildWatches,
+            xgbExecParams, rabitEnv, xgbExecParams.obj, xgbExecParams.eval, prevBooster)}
             .getOrElse(throw new RuntimeException("No Watches to train"))
 
         }}
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala
index 77683e91437f..2f6827787107 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala
@@ -169,6 +169,23 @@ class XGBoostClassifier (
   }
 
   override protected def train(dataset: Dataset[_]): XGBoostClassificationModel = {
+    val _numClasses = getNumClasses(dataset)
+    if (isDefined(numClass) && $(numClass) != _numClasses) {
+      throw new Exception("The number of classes in dataset doesn't match " +
+        "\'num_class\' in xgboost params.")
+    }
+
+    if (_numClasses == 2) {
+      if (!isDefined(objective)) {
+        // If user doesn't set objective, force it to binary:logistic
+        setObjective("binary:logistic")
+      }
+    } else if (_numClasses > 2) {
+      if (!isDefined(objective)) {
+        // If user doesn't set objective, force it to multi:softprob
+        setObjective("multi:softprob")
+      }
+    }
 
     if (!isDefined(evalMetric) || $(evalMetric).isEmpty) {
       set(evalMetric, setupDefaultEvalMetric())
@@ -178,12 +195,6 @@ class XGBoostClassifier (
       set(objectiveType, "classification")
     }
 
-    val _numClasses = getNumClasses(dataset)
-    if (isDefined(numClass) && $(numClass) != _numClasses) {
-      throw new Exception("The number of classes in dataset doesn't match " +
-        "\'num_class\' in xgboost params.")
-    }
-
     // Packing with all params plus params user defined
     val derivedXGBParamMap = xgboostParams ++ MLlib2XGBoostParams
     val buildTrainingData = PreXGBoost.buildDatasetToRDD(this, dataset, derivedXGBParamMap)
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala
index b52ba2a2e925..0402beb62a47 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala
@@ -169,6 +169,11 @@ class XGBoostRegressor (
 
   override protected def train(dataset: Dataset[_]): XGBoostRegressionModel = {
 
+    if (!isDefined(objective)) {
+      // If user doesn't set objective, force it to reg:squarederror
+      setObjective("reg:squarederror")
+    }
+
     if (!isDefined(evalMetric) || $(evalMetric).isEmpty) {
       set(evalMetric, setupDefaultEvalMetric())
     }
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala
index 852864d9cb1c..ea7d2b48b11f 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014 by Contributors
+ Copyright (c) 2014-2022 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -105,7 +105,7 @@ private[spark] trait LearningTaskParams extends Params {
 
   final def getMaximizeEvaluationMetrics: Boolean = $(maximizeEvaluationMetrics)
 
-  setDefault(objective -> "reg:squarederror", baseScore -> 0.5, trainTestRatio -> 1.0,
+  setDefault(baseScore -> 0.5, trainTestRatio -> 1.0,
     numEarlyStoppingRounds -> 0, cacheTrainingSet -> false)
 }
 
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala
index 79562d1f428b..f96140555809 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala
@@ -65,8 +65,6 @@ class FeatureSizeValidatingSuite extends FunSuite with PerTest {
         (id, lp.label, lp.features)
     }.toDF("id", "label", "features")
     val xgb = new XGBoostClassifier(paramMap)
-    intercept[Exception] {
-      xgb.fit(repartitioned)
-    }
+    xgb.fit(repartitioned)
   }
 }
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala
index 93b7554017a0..cf8dcca5722b 100755
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala
@@ -138,7 +138,7 @@ class PersistenceSuite extends FunSuite with TmpFolderPerSuite with PerTest {
     val testDM = new DMatrix(Classification.test.iterator)
     val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
       "custom_eval" -> new EvalError, "custom_obj" -> new CustomObj(1),
-      "num_round" -> "10", "num_workers" -> numWorkers)
+      "num_round" -> "10", "num_workers" -> numWorkers, "objective" -> "binary:logistic")
 
     val xgbc = new XGBoostClassifier(paramMap)
     val xgbcPath = new File(tempDir.toFile, "xgbc").getPath
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala
index 4abd464ade04..9fe2479e5754 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala
@@ -112,6 +112,34 @@ class XGBoostClassifierSuite extends FunSuite with PerTest with TmpFolderPerSuit
     assert(!transformedDf.columns.contains("probability"))
   }
 
+  test("objective will be set if not specifying it") {
+    val training = buildDataFrame(Classification.train)
+    val paramMap = Map("eta" -> "1", "max_depth" -> "6",
+      "num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod)
+    val xgb = new XGBoostClassifier(paramMap)
+    assert(!xgb.isDefined(xgb.objective))
+    xgb.fit(training)
+    assert(xgb.getObjective == "binary:logistic")
+
+    val trainingDF = buildDataFrame(MultiClassification.train)
+    val paramMap1 = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
+      "num_class" -> "6", "num_round" -> 5, "num_workers" -> numWorkers,
+      "tree_method" -> treeMethod)
+    val xgb1 = new XGBoostClassifier(paramMap1)
+    assert(!xgb1.isDefined(xgb1.objective))
+    xgb1.fit(trainingDF)
+    assert(xgb1.getObjective == "multi:softprob")
+
+    // shouldn't change user's objective setting
+    val paramMap2 = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
+      "num_class" -> "6", "num_round" -> 5, "num_workers" -> numWorkers,
+      "tree_method" -> treeMethod, "objective" -> "multi:softmax")
+    val xgb2 = new XGBoostClassifier(paramMap2)
+    assert(xgb2.getObjective == "multi:softmax")
+    xgb2.fit(trainingDF)
+    assert(xgb2.getObjective == "multi:softmax")
+  }
+
   test("use base margin") {
     val training1 = buildDataFrame(Classification.train)
     val training2 = training1.withColumn("margin", functions.rand())
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala
index bd104f6c7987..a530313b9bb5 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala
@@ -146,6 +146,24 @@ class XGBoostRegressorSuite extends FunSuite with PerTest {
     prediction.foreach(x => assert(math.abs(x.getAs[Double]("prediction") - first) <= 0.01f))
   }
 
+  test("objective will be set if not specifying it") {
+    val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+      "num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod)
+    val training = buildDataFrame(Regression.train)
+    val xgb = new XGBoostRegressor(paramMap)
+    assert(!xgb.isDefined(xgb.objective))
+    xgb.fit(training)
+    assert(xgb.getObjective == "reg:squarederror")
+
+    val paramMap1 = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
+      "num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod,
+      "objective" -> "reg:squaredlogerror")
+    val xgb1 = new XGBoostRegressor(paramMap1)
+    assert(xgb1.getObjective == "reg:squaredlogerror")
+    xgb1.fit(training)
+    assert(xgb1.getObjective == "reg:squaredlogerror")
+  }
+
   test("test predictionLeaf") {
     val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
       "objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers,
diff --git a/python-package/setup.py b/python-package/setup.py
index 35314ab218f8..6c83feca0784 100644
--- a/python-package/setup.py
+++ b/python-package/setup.py
@@ -335,11 +335,12 @@ def run(self) -> None:
               'scipy',
           ],
           ext_modules=[CMakeExtension('libxgboost')],
+          # error: expected "str": "Type[Command]"
           cmdclass={
-              'build_ext': BuildExt,
-              'sdist': Sdist,
-              'install_lib': InstallLib,
-              'install': Install
+              'build_ext': BuildExt,     # type: ignore
+              'sdist': Sdist,            # type: ignore
+              'install_lib': InstallLib,  # type: ignore
+              'install': Install          # type: ignore
           },
           extras_require={
               'pandas': ['pandas'],
diff --git a/python-package/xgboost/_typing.py b/python-package/xgboost/_typing.py
index 64ea9a0a2993..b17f5ecb8a4e 100644
--- a/python-package/xgboost/_typing.py
+++ b/python-package/xgboost/_typing.py
@@ -1,21 +1,32 @@
 """Shared typing definition."""
 import ctypes
 import os
-from typing import Optional, Any, TypeVar, Union, Sequence
+from typing import Any, TypeVar, Union, Type, Sequence, Callable, List, Dict
 
 # os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/dt.Frame/
 # cudf.DataFrame/cupy.array/dlpack
+import numpy as np
+
 DataType = Any
 
 # xgboost accepts some other possible types in practice due to historical reason, which is
 # lesser tested.  For now we encourage users to pass a simple list of string.
-FeatureNames = Optional[Sequence[str]]
-FeatureTypes = Optional[Sequence[str]]
+FeatureInfo = Sequence[str]
+FeatureNames = FeatureInfo
+FeatureTypes = FeatureInfo
+BoosterParam = Union[List, Dict]  # better be sequence
 
 ArrayLike = Any
 PathLike = Union[str, os.PathLike]
 CupyT = ArrayLike  # maybe need a stub for cupy arrays
 NumpyOrCupy = Any
+NumpyDType = Union[str, Type[np.number]]
+PandasDType = Any  # real type is pandas.core.dtypes.base.ExtensionDtype
+
+FloatCompatible = Union[float, np.float32, np.float64]
+
+# callables
+FPreProcCallable = Callable
 
 # ctypes
 # c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h
@@ -59,3 +70,4 @@
 
 # template parameter
 _T = TypeVar("_T")
+_F = TypeVar("_F", bound=Callable[..., Any])
diff --git a/python-package/xgboost/callback.py b/python-package/xgboost/callback.py
index 32d408f3a29e..021ccd97236d 100644
--- a/python-package/xgboost/callback.py
+++ b/python-package/xgboost/callback.py
@@ -10,8 +10,7 @@
 import collections
 import os
 import pickle
-from typing import Callable, List, Optional, Union, Dict, Tuple, TypeVar, cast
-from typing import Sequence
+from typing import Callable, List, Optional, Union, Dict, Tuple, TypeVar, cast, Sequence, Any
 import numpy
 
 from . import rabit
@@ -24,11 +23,14 @@
     "EarlyStopping",
     "EvaluationMonitor",
     "TrainingCheckPoint",
+    "CallbackContainer"
 ]
 
 _Score = Union[float, Tuple[float, float]]
 _ScoreList = Union[List[float], List[Tuple[float, float]]]
 
+_Model = Any  # real type is Union[Booster, CVPack]; need more work
+
 
 # pylint: disable=unused-argument
 class TrainingCallback(ABC):
@@ -43,19 +45,19 @@ class TrainingCallback(ABC):
     def __init__(self) -> None:
         pass
 
-    def before_training(self, model):
+    def before_training(self, model: _Model) -> _Model:
         '''Run before training starts.'''
         return model
 
-    def after_training(self, model):
+    def after_training(self, model: _Model) -> _Model:
         '''Run after training is finished.'''
         return model
 
-    def before_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool:
+    def before_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
         '''Run before each iteration.  Return True when training should stop.'''
         return False
 
-    def after_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool:
+    def after_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
         '''Run after each iteration.  Return True when training should stop.'''
         return False
 
@@ -140,7 +142,7 @@ def __init__(
         if self.is_cv:
             self.aggregated_cv = None
 
-    def before_training(self, model):
+    def before_training(self, model: _Model) -> _Model:
         '''Function called before training.'''
         for c in self.callbacks:
             model = c.before_training(model=model)
@@ -151,7 +153,7 @@ def before_training(self, model):
                 assert isinstance(model, Booster), msg
         return model
 
-    def after_training(self, model):
+    def after_training(self, model: _Model) -> _Model:
         '''Function called after training.'''
         for c in self.callbacks:
             model = c.after_training(model=model)
@@ -182,7 +184,7 @@ def after_training(self, model):
         return model
 
     def before_iteration(
-        self, model, epoch: int, dtrain: DMatrix, evals: List[Tuple[DMatrix, str]]
+        self, model: _Model, epoch: int, dtrain: DMatrix, evals: Optional[List[Tuple[DMatrix, str]]]
     ) -> bool:
         '''Function called before training iteration.'''
         return any(c.before_iteration(model, epoch, self.history)
@@ -220,7 +222,7 @@ def _update_history(
 
     def after_iteration(
         self,
-        model,
+        model: _Model,
         epoch: int,
         dtrain: DMatrix,
         evals: Optional[List[Tuple[DMatrix, str]]],
@@ -276,7 +278,7 @@ def __init__(
         super().__init__()
 
     def after_iteration(
-        self, model, epoch: int, evals_log: TrainingCallback.EvalsLog
+        self, model: _Model, epoch: int, evals_log: TrainingCallback.EvalsLog
     ) -> bool:
         model.set_param("learning_rate", self.learning_rates(epoch))
         return False
@@ -344,12 +346,12 @@ def __init__(
         self.starting_round: int = 0
         super().__init__()
 
-    def before_training(self, model):
+    def before_training(self, model: _Model) -> _Model:
         self.starting_round = model.num_boosted_rounds()
         return model
 
     def _update_rounds(
-        self, score: _Score, name: str, metric: str, model, epoch: int
+        self, score: _Score, name: str, metric: str, model: _Model, epoch: int
     ) -> bool:
         def get_s(x: _Score) -> float:
             """get score if it's cross validation history."""
@@ -403,7 +405,7 @@ def minimize(new: _Score, best: _Score) -> bool:
             return True
         return False
 
-    def after_iteration(self, model, epoch: int,
+    def after_iteration(self, model: _Model, epoch: int,
                         evals_log: TrainingCallback.EvalsLog) -> bool:
         epoch += self.starting_round  # training continuation
         msg = 'Must have at least 1 validation dataset for early stopping.'
@@ -431,7 +433,7 @@ def after_iteration(self, model, epoch: int,
         score = data_log[metric_name][-1]
         return self._update_rounds(score, data_name, metric_name, model, epoch)
 
-    def after_training(self, model):
+    def after_training(self, model: _Model) -> _Model:
         try:
             if self.save_best:
                 model = model[: int(model.attr("best_iteration")) + 1]
@@ -477,7 +479,7 @@ def _fmt_metric(
             msg = f"\t{data + '-' + metric}:{score:.5f}"
         return msg
 
-    def after_iteration(self, model, epoch: int,
+    def after_iteration(self, model: _Model, epoch: int,
                         evals_log: TrainingCallback.EvalsLog) -> bool:
         if not evals_log:
             return False
@@ -503,7 +505,7 @@ def after_iteration(self, model, epoch: int,
                 self._latest = msg
         return False
 
-    def after_training(self, model):
+    def after_training(self, model: _Model) -> _Model:
         if rabit.get_rank() == self.printer_rank and self._latest is not None:
             rabit.tracker_print(self._latest)
         return model
@@ -544,7 +546,7 @@ def __init__(
         self._epoch = 0
         super().__init__()
 
-    def after_iteration(self, model, epoch: int,
+    def after_iteration(self, model: _Model, epoch: int,
                         evals_log: TrainingCallback.EvalsLog) -> bool:
         if self._epoch == self._iterations:
             path = os.path.join(self._path, self._name + '_' + str(epoch) +
diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py
index 1967ffc8e1e6..63f9137e67c7 100644
--- a/python-package/xgboost/compat.py
+++ b/python-package/xgboost/compat.py
@@ -1,7 +1,7 @@
 # coding: utf-8
 # pylint: disable= invalid-name,  unused-import
 """For compatibility and optional dependencies."""
-from typing import Any
+from typing import Any, Type, Dict, Optional, List
 import sys
 import types
 import importlib.util
@@ -11,20 +11,20 @@
 assert (sys.version_info[0] == 3), 'Python 2 is no longer supported.'
 
 
-def py_str(x):
+def py_str(x: bytes) -> str:
     """convert c string back to python string"""
-    return x.decode('utf-8')
+    return x.decode('utf-8')  # type: ignore
 
 
-def lazy_isinstance(instance, module, name):
+def lazy_isinstance(instance: Type[object], module: str, name: str) -> bool:
     """Use string representation to identify a type."""
 
     # Notice, we use .__class__ as opposed to type() in order
     # to support object proxies such as weakref.proxy
     cls = instance.__class__
-    module = cls.__module__ == module
-    name = cls.__name__ == name
-    return module and name
+    is_same_module = cls.__module__ == module
+    has_same_name = cls.__name__ == name
+    return is_same_module and has_same_name
 
 
 # pandas
@@ -37,53 +37,33 @@ def lazy_isinstance(instance, module, name):
 except ImportError:
 
     MultiIndex = object
-    DataFrame: Any = object
+    DataFrame = object
     Series = object
     pandas_concat = None
     PANDAS_INSTALLED = False
 
 # sklearn
 try:
-    from sklearn.base import BaseEstimator
-    from sklearn.base import RegressorMixin, ClassifierMixin
+    from sklearn.base import (
+         BaseEstimator as XGBModelBase,
+         RegressorMixin as XGBRegressorBase,
+         ClassifierMixin as XGBClassifierBase
+    )
     from sklearn.preprocessing import LabelEncoder
 
     try:
-        from sklearn.model_selection import KFold, StratifiedKFold
+        from sklearn.model_selection import (
+            KFold as XGBKFold,
+            StratifiedKFold as XGBStratifiedKFold
+        )
     except ImportError:
-        from sklearn.cross_validation import KFold, StratifiedKFold
+        from sklearn.cross_validation import (
+            KFold as XGBKFold,
+            StratifiedKFold as XGBStratifiedKFold
+        )
 
     SKLEARN_INSTALLED = True
 
-    XGBModelBase = BaseEstimator
-    XGBRegressorBase = RegressorMixin
-    XGBClassifierBase = ClassifierMixin
-
-    XGBKFold = KFold
-    XGBStratifiedKFold = StratifiedKFold
-
-    class XGBoostLabelEncoder(LabelEncoder):
-        '''Label encoder with JSON serialization methods.'''
-        def to_json(self):
-            '''Returns a JSON compatible dictionary'''
-            meta = {}
-            for k, v in self.__dict__.items():
-                if isinstance(v, np.ndarray):
-                    meta[k] = v.tolist()
-                else:
-                    meta[k] = v
-            return meta
-
-        def from_json(self, doc):
-            # pylint: disable=attribute-defined-outside-init
-            '''Load the encoder back from a JSON compatible dict.'''
-            meta = {}
-            for k, v in doc.items():
-                if k == 'classes_':
-                    self.classes_ = np.array(v)
-                    continue
-                meta[k] = v
-            self.__dict__.update(meta)
 except ImportError:
     SKLEARN_INSTALLED = False
 
@@ -91,20 +71,34 @@ def from_json(self, doc):
     XGBModelBase = object
     XGBClassifierBase = object
     XGBRegressorBase = object
+    LabelEncoder = object
 
     XGBKFold = None
     XGBStratifiedKFold = None
-    XGBoostLabelEncoder = None
 
 
-# dask
-try:
-    import pkg_resources
-    pkg_resources.get_distribution('dask')
-    DASK_INSTALLED = True
-except pkg_resources.DistributionNotFound:
-    dask = None
-    DASK_INSTALLED = False
+class XGBoostLabelEncoder(LabelEncoder):
+    '''Label encoder with JSON serialization methods.'''
+    def to_json(self) -> Dict:
+        '''Returns a JSON compatible dictionary'''
+        meta = {}
+        for k, v in self.__dict__.items():
+            if isinstance(v, np.ndarray):
+                meta[k] = v.tolist()
+            else:
+                meta[k] = v
+        return meta
+
+    def from_json(self, doc: Dict) -> None:
+        # pylint: disable=attribute-defined-outside-init
+        '''Load the encoder back from a JSON compatible dict.'''
+        meta = {}
+        for k, v in doc.items():
+            if k == 'classes_':
+                self.classes_ = np.array(v)
+                continue
+            meta[k] = v
+        self.__dict__.update(meta)
 
 
 try:
@@ -113,7 +107,7 @@ def from_json(self, doc):
     SCIPY_INSTALLED = True
 except ImportError:
     scipy_sparse = False
-    scipy_csr: Any = object
+    scipy_csr = object
     SCIPY_INSTALLED = False
 
 
@@ -136,15 +130,21 @@ class LazyLoader(types.ModuleType):
     """Lazily import a module, mainly to avoid pulling in large dependencies.
     """
 
-    def __init__(self, local_name, parent_module_globals, name, warning=None):
+    def __init__(
+         self,
+         local_name: str,
+         parent_module_globals: Dict,
+         name: str,
+         warning: Optional[str] = None
+    ) -> None:
         self._local_name = local_name
         self._parent_module_globals = parent_module_globals
         self._warning = warning
-        self.module = None
+        self.module: Optional[types.ModuleType] = None
 
         super().__init__(name)
 
-    def _load(self):
+    def _load(self) -> types.ModuleType:
         """Load the module and insert it into the parent's globals."""
         # Import the target module and insert it into the parent's namespace
         module = importlib.import_module(self.__name__)
@@ -163,12 +163,12 @@ def _load(self):
 
         return module
 
-    def __getattr__(self, item):
+    def __getattr__(self, item: str) -> Any:
         if not self.module:
             self.module = self._load()
         return getattr(self.module, item)
 
-    def __dir__(self):
+    def __dir__(self) -> List[str]:
         if not self.module:
             self.module = self._load()
         return dir(self.module)
diff --git a/python-package/xgboost/config.py b/python-package/xgboost/config.py
index 427ea4ea3915..2344ae4a3541 100644
--- a/python-package/xgboost/config.py
+++ b/python-package/xgboost/config.py
@@ -4,12 +4,20 @@
 import json
 from contextlib import contextmanager
 from functools import wraps
+from typing import Optional, Callable, Any, Dict, cast, Iterator
 
 from .core import _LIB, _check_call, c_str, py_str
+from ._typing import _F
 
 
-def config_doc(*, header=None, extra_note=None, parameters=None, returns=None,
-               see_also=None):
+def config_doc(
+    *,
+    header: Optional[str] = None,
+    extra_note: Optional[str] = None,
+    parameters: Optional[str] = None,
+    returns: Optional[str] = None,
+    see_also: Optional[str] = None
+) -> Callable[[_F], _F]:
     """Decorator to format docstring for config functions.
 
     Parameters
@@ -64,19 +72,19 @@ def config_doc(*, header=None, extra_note=None, parameters=None, returns=None,
         assert xgb.get_config()['verbosity'] == 2  # old value restored
     """
 
-    def none_to_str(value):
+    def none_to_str(value: Optional[str]) -> str:
         return '' if value is None else value
 
-    def config_doc_decorator(func):
+    def config_doc_decorator(func: _F) -> _F:
         func.__doc__ = (doc_template.format(header=none_to_str(header),
                                             extra_note=none_to_str(extra_note))
                         + none_to_str(parameters) + none_to_str(returns)
                         + none_to_str(common_example) + none_to_str(see_also))
 
         @wraps(func)
-        def wrap(*args, **kwargs):
+        def wrap(*args: Any, **kwargs: Any) -> Any:
             return func(*args, **kwargs)
-        return wrap
+        return cast(_F, wrap)
     return config_doc_decorator
 
 
@@ -89,7 +97,7 @@ def wrap(*args, **kwargs):
     new_config: Dict[str, Any]
         Keyword arguments representing the parameters and their values
             """)
-def set_config(**new_config):
+def set_config(**new_config: Any) -> None:
     config = json.dumps(new_config)
     _check_call(_LIB.XGBSetGlobalConfig(c_str(config)))
 
@@ -103,10 +111,12 @@ def set_config(**new_config):
     args: Dict[str, Any]
         The list of global parameters and their values
             """)
-def get_config():
+def get_config() -> Dict[str, Any]:
     config_str = ctypes.c_char_p()
     _check_call(_LIB.XGBGetGlobalConfig(ctypes.byref(config_str)))
-    config = json.loads(py_str(config_str.value))
+    value = config_str.value
+    assert value
+    config = json.loads(py_str(value))
     return config
 
 
@@ -132,7 +142,7 @@ def get_config():
     set_config: Set global XGBoost configuration
     get_config: Get current values of the global configuration
             """)
-def config_context(**new_config):
+def config_context(**new_config: Any) -> Iterator[None]:
     old_config = get_config().copy()
     set_config(**new_config)
 
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index a94c9d767a0a..cd8437847688 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -30,10 +30,12 @@
     ArrayLike,
     CFloatPtr,
     NumpyOrCupy,
-    FeatureNames,
+    FeatureInfo,
     FeatureTypes,
+    FeatureNames,
     _T,
     CupyT,
+    BoosterParam
 )
 
 
@@ -41,7 +43,7 @@ class XGBoostError(ValueError):
     """Error thrown by xgboost trainer."""
 
 
-def from_pystr_to_cstr(data: Union[str, List[str]]) -> Union[bytes, CStrPptr]:
+def from_pystr_to_cstr(data: Union[str, List[str]]) -> Union[bytes, ctypes.Array]:
     """Convert a Python str or list of Python str to C pointer
 
     Parameters
@@ -53,9 +55,9 @@ def from_pystr_to_cstr(data: Union[str, List[str]]) -> Union[bytes, CStrPptr]:
     if isinstance(data, str):
         return bytes(data, "utf-8")
     if isinstance(data, list):
-        pointers: ctypes.pointer = (ctypes.c_char_p * len(data))()
+        pointers: ctypes.Array[ctypes.c_char_p] = (ctypes.c_char_p * len(data))()
         data_as_bytes = [bytes(d, 'utf-8') for d in data]
-        pointers[:] = data_as_bytes
+        pointers[:] = data_as_bytes  # type: ignore
         return pointers
     raise TypeError()
 
@@ -270,10 +272,10 @@ def _cuda_array_interface(data: DataType) -> bytes:
 def ctypes2numpy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> np.ndarray:
     """Convert a ctypes pointer array to a numpy array."""
     ctype: Type[CNumeric] = _numpy2ctypes_type(dtype)
-    if not isinstance(cptr, ctypes.POINTER(ctype)):
+    if not isinstance(cptr, ctypes.POINTER(ctype)):  # type: ignore
         raise RuntimeError(f"expected {ctype} pointer")
     res = np.zeros(length, dtype=dtype)
-    if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]):
+    if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]):  # type: ignore
         raise RuntimeError("memmove failed")
     return res
 
@@ -285,7 +287,10 @@ def ctypes2cupy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> CupyT
     from cupy.cuda.memory import MemoryPointer
     from cupy.cuda.memory import UnownedMemory
 
-    CUPY_TO_CTYPES_MAPPING = {cupy.float32: ctypes.c_float, cupy.uint32: ctypes.c_uint}
+    CUPY_TO_CTYPES_MAPPING: Dict[Type[np.number], Type[CNumeric]] = {
+        cupy.float32: ctypes.c_float,
+        cupy.uint32: ctypes.c_uint,
+    }
     if dtype not in CUPY_TO_CTYPES_MAPPING:
         raise RuntimeError(f"Supported types: {CUPY_TO_CTYPES_MAPPING.keys()}")
     addr = ctypes.cast(cptr, ctypes.c_void_p).value
@@ -310,7 +315,7 @@ def ctypes2buffer(cptr: CStrPtr, length: int) -> bytearray:
         raise RuntimeError('expected char pointer')
     res = bytearray(length)
     rptr = (ctypes.c_char * length).from_buffer(res)
-    if not ctypes.memmove(rptr, cptr, length):
+    if not ctypes.memmove(rptr, cptr, length):  # type: ignore
         raise RuntimeError('memmove failed')
     return res
 
@@ -320,10 +325,12 @@ def c_str(string: str) -> ctypes.c_char_p:
     return ctypes.c_char_p(string.encode('utf-8'))
 
 
-def c_array(ctype: Type[CTypeT], values: ArrayLike) -> ctypes.Array:
+def c_array(
+    ctype: Type[CTypeT], values: ArrayLike
+) -> Union[ctypes.Array, ctypes.pointer]:
     """Convert a python string to c array."""
     if isinstance(values, np.ndarray) and values.dtype.itemsize == ctypes.sizeof(ctype):
-        return (ctype * len(values)).from_buffer_copy(values)
+        return values.ctypes.data_as(ctypes.POINTER(ctype))
     return (ctype * len(values))(*values)
 
 
@@ -434,8 +441,8 @@ def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument
         def data_handle(
             data: Any,
             *,
-            feature_names: FeatureNames = None,
-            feature_types: Optional[List[str]] = None,
+            feature_names: Optional[FeatureNames] = None,
+            feature_types: Optional[FeatureTypes] = None,
             **kwargs: Any,
         ) -> None:
             from .data import dispatch_proxy_set_data
@@ -555,8 +562,8 @@ def __init__(
         base_margin: Optional[ArrayLike] = None,
         missing: Optional[float] = None,
         silent: bool = False,
-        feature_names: FeatureNames = None,
-        feature_types: FeatureTypes = None,
+        feature_names: Optional[FeatureNames] = None,
+        feature_types: Optional[FeatureTypes] = None,
         nthread: Optional[int] = None,
         group: Optional[ArrayLike] = None,
         qid: Optional[ArrayLike] = None,
@@ -718,8 +725,8 @@ def set_info(
         qid: Optional[ArrayLike] = None,
         label_lower_bound: Optional[ArrayLike] = None,
         label_upper_bound: Optional[ArrayLike] = None,
-        feature_names: FeatureNames = None,
-        feature_types: Optional[List[str]] = None,
+        feature_names: Optional[FeatureNames] = None,
+        feature_types: Optional[FeatureTypes] = None,
         feature_weights: Optional[ArrayLike] = None
     ) -> None:
         """Set meta info for DMatrix.  See doc string for :py:obj:`xgboost.DMatrix`."""
@@ -1000,7 +1007,7 @@ def slice(
         return res
 
     @property
-    def feature_names(self) -> Optional[List[str]]:
+    def feature_names(self) -> Optional[FeatureNames]:
         """Get feature names (column labels).
 
         Returns
@@ -1023,7 +1030,7 @@ def feature_names(self) -> Optional[List[str]]:
         return feature_names
 
     @feature_names.setter
-    def feature_names(self, feature_names: FeatureNames) -> None:
+    def feature_names(self, feature_names: Optional[FeatureNames]) -> None:
         """Set feature names (column labels).
 
         Parameters
@@ -1039,7 +1046,7 @@ def feature_names(self, feature_names: FeatureNames) -> None:
                 else:
                     feature_names = [feature_names]
             except TypeError:
-                feature_names = [feature_names]
+                feature_names = [cast(str, feature_names)]
 
             if len(feature_names) != len(set(feature_names)):
                 raise ValueError('feature_names must be unique')
@@ -1069,8 +1076,13 @@ def feature_names(self, feature_names: FeatureNames) -> None:
             self.feature_types = None
 
     @property
-    def feature_types(self) -> Optional[List[str]]:
-        """Get feature types. See :py:class:`DMatrix` for details."""
+    def feature_types(self) -> Optional[FeatureTypes]:
+        """Get feature types (column types).
+
+        Returns
+        -------
+        feature_types : list or None
+        """
         length = c_bst_ulong()
         sarr = ctypes.POINTER(ctypes.c_char_p)()
         _check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle,
@@ -1111,7 +1123,7 @@ def feature_types(self, feature_types: Optional[Union[List[str], str]]) -> None:
                 else:
                     feature_types = [feature_types]
             except TypeError:
-                feature_types = [feature_types]
+                feature_types = [cast(str, feature_types)]
             feature_types_bytes = [bytes(f, encoding='utf-8')
                                for f in feature_types]
             c_feature_types = (ctypes.c_char_p *
@@ -1203,8 +1215,8 @@ def __init__(  # pylint: disable=super-init-not-called
         base_margin: Optional[ArrayLike] = None,
         missing: Optional[float] = None,
         silent: bool = False,
-        feature_names: FeatureNames = None,
-        feature_types: Optional[List[str]] = None,
+        feature_names: Optional[FeatureNames] = None,
+        feature_types: Optional[FeatureTypes] = None,
         nthread: Optional[int] = None,
         max_bin: int = 256,
         group: Optional[ArrayLike] = None,
@@ -1323,7 +1335,7 @@ def _get_booster_layer_trees(model: "Booster") -> Tuple[int, int]:
     return num_parallel_tree, num_groups
 
 
-def _configure_metrics(params: Union[Dict, List]) -> Union[Dict, List]:
+def _configure_metrics(params: BoosterParam) -> BoosterParam:
     if (
         isinstance(params, dict)
         and "eval_metric" in params
@@ -1349,7 +1361,7 @@ class Booster:
 
     def __init__(
         self,
-        params: Optional[Dict] = None,
+        params: Optional[BoosterParam] = None,
         cache: Optional[Sequence[DMatrix]] = None,
         model_file: Optional[Union["Booster", bytearray, os.PathLike, str]] = None
     ) -> None:
@@ -1444,7 +1456,7 @@ def _transform_interaction_constraints(
                 "Constrained features are not a subset of training data feature names"
             ) from e
 
-    def _configure_constraints(self, params: Union[List, Dict]) -> Union[List, Dict]:
+    def _configure_constraints(self, params: BoosterParam) -> BoosterParam:
         if isinstance(params, dict):
             value = params.get("monotone_constraints")
             if value is not None:
@@ -1604,10 +1616,12 @@ def attr(self, key: str) -> Optional[str]:
         _check_call(_LIB.XGBoosterGetAttr(
             self.handle, c_str(key), ctypes.byref(ret), ctypes.byref(success)))
         if success.value != 0:
-            return py_str(ret.value)
+            value = ret.value
+            assert value
+            return py_str(value)
         return None
 
-    def attributes(self) -> Dict[str, str]:
+    def attributes(self) -> Dict[str, Optional[str]]:
         """Get attributes stored in the Booster as a dictionary.
 
         Returns
@@ -1632,14 +1646,12 @@ def set_attr(self, **kwargs: Optional[str]) -> None:
             The attributes to set. Setting a value to None deletes an attribute.
         """
         for key, value in kwargs.items():
+            c_value = None
             if value is not None:
-                if not isinstance(value, str):
-                    raise ValueError("Set Attr only accepts string values")
-                value = c_str(str(value))
-            _check_call(_LIB.XGBoosterSetAttr(
-                self.handle, c_str(key), value))
+                c_value = c_str(str(value))
+            _check_call(_LIB.XGBoosterSetAttr(self.handle, c_str(key), c_value))
 
-    def _get_feature_info(self, field: str) -> Optional[List[str]]:
+    def _get_feature_info(self, field: str) -> Optional[FeatureInfo]:
         length = c_bst_ulong()
         sarr = ctypes.POINTER(ctypes.c_char_p)()
         if not hasattr(self, "handle") or self.handle is None:
@@ -1652,7 +1664,7 @@ def _get_feature_info(self, field: str) -> Optional[List[str]]:
         feature_info = from_cstr_to_pystr(sarr, length)
         return feature_info if feature_info else None
 
-    def _set_feature_info(self, features: Optional[Sequence[str]], field: str) -> None:
+    def _set_feature_info(self, features: Optional[FeatureInfo], field: str) -> None:
         if features is not None:
             assert isinstance(features, list)
             feature_info_bytes = [bytes(f, encoding="utf-8") for f in features]
@@ -1670,7 +1682,7 @@ def _set_feature_info(self, features: Optional[Sequence[str]], field: str) -> No
             )
 
     @property
-    def feature_types(self) -> Optional[List[str]]:
+    def feature_types(self) -> Optional[FeatureTypes]:
         """Feature types for this booster.  Can be directly set by input data or by
         assignment.  See :py:class:`DMatrix` for details.
 
@@ -1678,11 +1690,11 @@ def feature_types(self) -> Optional[List[str]]:
         return self._get_feature_info("feature_type")
 
     @feature_types.setter
-    def feature_types(self, features: Optional[List[str]]) -> None:
+    def feature_types(self, features: Optional[FeatureTypes]) -> None:
         self._set_feature_info(features, "feature_type")
 
     @property
-    def feature_names(self) -> Optional[List[str]]:
+    def feature_names(self) -> Optional[FeatureNames]:
         """Feature names for this booster.  Can be directly set by input data or by
         assignment.
 
@@ -1690,7 +1702,7 @@ def feature_names(self) -> Optional[List[str]]:
         return self._get_feature_info("feature_name")
 
     @feature_names.setter
-    def feature_names(self, features: FeatureNames) -> None:
+    def feature_names(self, features: Optional[FeatureNames]) -> None:
         self._set_feature_info(features, "feature_name")
 
     def set_param(
@@ -1711,7 +1723,7 @@ def set_param(
             params = params.items()
         elif isinstance(params, str) and value is not None:
             params = [(params, value)]
-        for key, val in params:
+        for key, val in cast(Iterable[Tuple[str, str]], params):
             if val is not None:
                 _check_call(_LIB.XGBoosterSetParam(self.handle, c_str(key),
                                                    c_str(str(val))))
@@ -2564,8 +2576,10 @@ def _validate_features(self, data: DMatrix) -> None:
             )
         # Booster can't accept data with different feature names
         if self.feature_names != data.feature_names:
-            dat_missing = set(self.feature_names) - set(data.feature_names)
-            my_missing = set(data.feature_names) - set(self.feature_names)
+            dat_missing = set(cast(FeatureNames, self.feature_names)) - \
+                          set(cast(FeatureNames, data.feature_names))
+            my_missing = set(cast(FeatureNames, data.feature_names)) - \
+                         set(cast(FeatureNames, self.feature_names))
 
             msg = 'feature_names mismatch: {0} {1}'
 
@@ -2619,10 +2633,10 @@ def get_split_value_histogram(
         bins = max(min(n_unique, bins) if bins is not None else n_unique, 1)
 
         nph = np.histogram(values, bins=bins)
-        nph = np.column_stack((nph[1][1:], nph[0]))
-        nph = nph[nph[:, 1] > 0]
+        nph_stacked = np.column_stack((nph[1][1:], nph[0]))
+        nph_stacked = nph_stacked[nph_stacked[:, 1] > 0]
 
-        if nph.size == 0:
+        if nph_stacked.size == 0:
             ft = self.feature_types
             fn = self.feature_names
             if fn is None:
@@ -2640,11 +2654,11 @@ def get_split_value_histogram(
                 )
 
         if as_pandas and PANDAS_INSTALLED:
-            return DataFrame(nph, columns=['SplitValue', 'Count'])
+            return DataFrame(nph_stacked, columns=['SplitValue', 'Count'])
         if as_pandas and not PANDAS_INSTALLED:
             warnings.warn(
                 "Returning histogram as ndarray"
                 " (as_pandas == True, but pandas is not installed).",
                 UserWarning
             )
-        return nph
+        return nph_stacked
diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py
index b54e26c9d550..ee8ea1a5aec4 100644
--- a/python-package/xgboost/dask.py
+++ b/python-package/xgboost/dask.py
@@ -318,7 +318,7 @@ def __init__(
         base_margin: Optional[_DaskCollection] = None,
         missing: float = None,
         silent: bool = False,  # pylint: disable=unused-argument
-        feature_names: FeatureNames = None,
+        feature_names: Optional[FeatureNames] = None,
         feature_types: FeatureTypes = None,
         group: Optional[_DaskCollection] = None,
         qid: Optional[_DaskCollection] = None,
@@ -594,7 +594,7 @@ def __init__(
         qid: Optional[List[Any]] = None,
         label_lower_bound: Optional[List[Any]] = None,
         label_upper_bound: Optional[List[Any]] = None,
-        feature_names: FeatureNames = None,
+        feature_names: Optional[FeatureNames] = None,
         feature_types: Optional[Union[Any, List[Any]]] = None,
     ) -> None:
         self._data = data
@@ -637,7 +637,7 @@ def next(self, input_data: Callable) -> int:
         if self._iter == len(self._data):
             # Return 0 when there's no more batch.
             return 0
-        feature_names: FeatureNames = None
+        feature_names: Optional[FeatureNames] = None
         if self._feature_names:
             feature_names = self._feature_names
         else:
@@ -688,7 +688,7 @@ def __init__(
         base_margin: Optional[_DaskCollection] = None,
         missing: float = None,
         silent: bool = False,  # disable=unused-argument
-        feature_names: FeatureNames = None,
+        feature_names: Optional[FeatureNames] = None,
         feature_types: Optional[Union[Any, List[Any]]] = None,
         max_bin: int = 256,
         group: Optional[_DaskCollection] = None,
@@ -725,7 +725,7 @@ def _create_fn_args(self, worker_addr: str) -> Dict[str, Any]:
 
 
 def _create_device_quantile_dmatrix(
-    feature_names: FeatureNames,
+    feature_names: Optional[FeatureNames],
     feature_types: Optional[Union[Any, List[Any]]],
     feature_weights: Optional[Any],
     missing: float,
@@ -766,7 +766,7 @@ def _create_device_quantile_dmatrix(
 
 
 def _create_dmatrix(
-    feature_names: FeatureNames,
+    feature_names: Optional[FeatureNames],
     feature_types: Optional[Union[Any, List[Any]]],
     feature_weights: Optional[Any],
     missing: float,
diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index d21c97910eb3..a0505e9c9105 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -5,17 +5,26 @@
 import json
 import warnings
 import os
-from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Type
+from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Sequence, cast
 
 import numpy as np
 
 from .core import c_array, _LIB, _check_call, c_str
 from .core import _cuda_array_interface
-from .core import DataIter, _ProxyDMatrix, DMatrix, FeatureNames
-from ._typing import FeatureTypes
+from .core import DataIter, _ProxyDMatrix, DMatrix
 from .compat import lazy_isinstance, DataFrame
+from ._typing import (
+    c_bst_ulong,
+    DataType,
+    FeatureTypes,
+    FeatureNames,
+    NumpyDType,
+    CupyT,
+    FloatCompatible, PandasDType
+)
 
-c_bst_ulong = ctypes.c_uint64   # pylint: disable=invalid-name
+DispatchedDataBackendReturnType = Tuple[
+    ctypes.c_void_p, Optional[FeatureNames], Optional[FeatureTypes]]
 
 CAT_T = "c"
 
@@ -23,14 +32,14 @@
 _matrix_meta = {"base_margin", "label"}
 
 
-def _warn_unused_missing(data, missing):
+def _warn_unused_missing(data: DataType, missing: Optional[FloatCompatible]) -> None:
     if (missing is not None) and (not np.isnan(missing)):
         warnings.warn(
             '`missing` is not used for current input data type:' +
             str(type(data)), UserWarning)
 
 
-def _check_complex(data):
+def _check_complex(data: DataType) -> None:
     '''Test whether data is complex using `dtype` attribute.'''
     complex_dtypes = (np.complex128, np.complex64,
                       np.cfloat, np.cdouble, np.clongdouble)
@@ -38,16 +47,15 @@ def _check_complex(data):
         raise ValueError('Complex data not supported')
 
 
-def _check_data_shape(data: Any) -> None:
+def _check_data_shape(data: DataType) -> None:
     if hasattr(data, "shape") and len(data.shape) != 2:
         raise ValueError("Please reshape the input data into 2-dimensional matrix.")
 
 
-def _is_scipy_csr(data):
+def _is_scipy_csr(data: DataType) -> bool:
     try:
-        import scipy
+        import scipy.sparse
     except ImportError:
-        scipy = None
         return False
     return isinstance(data, scipy.sparse.csr_matrix)
 
@@ -64,12 +72,12 @@ def _array_interface(data: np.ndarray) -> bytes:
 
 
 def _from_scipy_csr(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: FloatCompatible,
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     """Initialize data from a CSR matrix."""
     if len(data.indices) != len(data.data):
         raise ValueError(
@@ -94,21 +102,20 @@ def _from_scipy_csr(
     return handle, feature_names, feature_types
 
 
-def _is_scipy_csc(data):
+def _is_scipy_csc(data: DataType) -> bool:
     try:
-        import scipy
+        import scipy.sparse
     except ImportError:
-        scipy = None
         return False
     return isinstance(data, scipy.sparse.csc_matrix)
 
 
 def _from_scipy_csc(
-    data,
-    missing,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: Optional[FloatCompatible],
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     if len(data.indices) != len(data.data):
         raise ValueError(f"length mismatch: {len(data.indices)} vs {len(data.data)}")
     _warn_unused_missing(data, missing)
@@ -124,27 +131,29 @@ def _from_scipy_csc(
     return handle, feature_names, feature_types
 
 
-def _is_scipy_coo(data):
+def _is_scipy_coo(data: DataType) -> bool:
     try:
-        import scipy
+        import scipy.sparse
     except ImportError:
-        scipy = None
         return False
     return isinstance(data, scipy.sparse.coo_matrix)
 
 
-def _is_numpy_array(data):
+def _is_numpy_array(data: DataType) -> bool:
     return isinstance(data, (np.ndarray, np.matrix))
 
 
-def _ensure_np_dtype(data, dtype) -> Tuple[np.ndarray, np.dtype]:
+def _ensure_np_dtype(
+    data: DataType,
+    dtype: Optional[NumpyDType]
+) -> Tuple[np.ndarray, Optional[NumpyDType]]:
     if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]:
         data = data.astype(np.float32, copy=False)
         dtype = np.float32
     return data, dtype
 
 
-def _maybe_np_slice(data: np.ndarray, dtype) -> np.ndarray:
+def _maybe_np_slice(data: DataType, dtype: Optional[NumpyDType]) -> np.ndarray:
     '''Handle numpy slice.  This can be removed if we use __array_interface__.
     '''
     try:
@@ -159,12 +168,12 @@ def _maybe_np_slice(data: np.ndarray, dtype) -> np.ndarray:
 
 
 def _from_numpy_array(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: FloatCompatible,
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     """Initialize data from a 2-D numpy matrix.
 
     """
@@ -189,7 +198,7 @@ def _from_numpy_array(
     return handle, feature_names, feature_types
 
 
-def _is_pandas_df(data):
+def _is_pandas_df(data: DataType) -> bool:
     try:
         import pandas as pd
     except ImportError:
@@ -197,7 +206,7 @@ def _is_pandas_df(data):
     return isinstance(data, pd.DataFrame)
 
 
-def _is_modin_df(data):
+def _is_modin_df(data: DataType) -> bool:
     try:
         import modin.pandas as pd
     except ImportError:
@@ -232,7 +241,7 @@ def _is_modin_df(data):
 )
 
 
-def _invalid_dataframe_dtype(data: Any) -> None:
+def _invalid_dataframe_dtype(data: DataType) -> None:
     # pandas series has `dtypes` but it's just a single object
     # cudf series doesn't have `dtypes`.
     if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"):
@@ -253,10 +262,10 @@ def _invalid_dataframe_dtype(data: Any) -> None:
 def _pandas_feature_info(
     data: DataFrame,
     meta: Optional[str],
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool,
-) -> Tuple[FeatureNames, FeatureTypes]:
+) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]:
     import pandas as pd
     from pandas.api.types import (
         is_sparse,
@@ -285,13 +294,13 @@ def _pandas_feature_info(
     return feature_names, feature_types
 
 
-def is_nullable_dtype(dtype: Any) -> bool:
+def is_nullable_dtype(dtype: PandasDType) -> bool:
     """Wether dtype is a pandas nullable type."""
     from pandas.api.types import is_integer_dtype, is_bool_dtype
     # dtype: pd.core.arrays.numeric.NumericDtype
     nullable_alias = {"Int16", "Int32", "Int64"}
     is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias
-    # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
+    # np.bool has alias `bool`, while pd.BooleanDtype has `bzoolean`.
     is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
     return is_int or is_bool
 
@@ -331,11 +340,11 @@ def _pandas_cat_null(data: DataFrame) -> DataFrame:
 def _transform_pandas_df(
     data: DataFrame,
     enable_categorical: bool,
-    feature_names: FeatureNames = None,
-    feature_types: FeatureTypes = None,
+    feature_names: Optional[FeatureNames] = None,
+    feature_types: Optional[FeatureTypes] = None,
     meta: Optional[str] = None,
-    meta_type: Optional[str] = None,
-) -> Tuple[np.ndarray, FeatureNames, FeatureTypes]:
+    meta_type: Optional[NumpyDType] = None,
+) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
     from pandas.api.types import (
         is_sparse,
         is_categorical_dtype,
@@ -359,7 +368,7 @@ def _transform_pandas_df(
     if meta and len(data.columns) > 1 and meta not in _matrix_meta:
         raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
 
-    dtype: Union[Type[np.floating], str] = meta_type if meta_type else np.float32
+    dtype = meta_type if meta_type else np.float32
     arr: np.ndarray = transformed.values
     if meta_type:
         arr = arr.astype(dtype)
@@ -369,18 +378,18 @@ def _transform_pandas_df(
 def _from_pandas_df(
     data: DataFrame,
     enable_categorical: bool,
-    missing: float,
+    missing: FloatCompatible,
     nthread: int,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     data, feature_names, feature_types = _transform_pandas_df(
         data, enable_categorical, feature_names, feature_types
     )
     return _from_numpy_array(data, missing, nthread, feature_names, feature_types)
 
 
-def _is_pandas_series(data):
+def _is_pandas_series(data: DataType) -> bool:
     try:
         import pandas as pd
     except ImportError:
@@ -389,18 +398,21 @@ def _is_pandas_series(data):
 
 
 def _meta_from_pandas_series(
-    data, name: str, dtype: Optional[str], handle: ctypes.c_void_p
+    data: DataType,
+    name: str,
+    dtype: Optional[NumpyDType],
+    handle: ctypes.c_void_p
 ) -> None:
     """Help transform pandas series for meta data like labels"""
     data = data.values.astype('float')
     from pandas.api.types import is_sparse
     if is_sparse(data):
-        data = data.to_dense()
+        data = data.to_dense()  # type: ignore
     assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
     _meta_from_numpy(data, name, dtype, handle)
 
 
-def _is_modin_series(data):
+def _is_modin_series(data: DataType) -> bool:
     try:
         import modin.pandas as pd
     except ImportError:
@@ -409,13 +421,13 @@ def _is_modin_series(data):
 
 
 def _from_pandas_series(
-    data,
-    missing: float,
+    data: DataType,
+    missing: FloatCompatible,
     nthread: int,
     enable_categorical: bool,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     from pandas.api.types import is_categorical_dtype
 
     if (data.dtype.name not in _pandas_dtype_mapper) and not (
@@ -433,7 +445,7 @@ def _from_pandas_series(
     )
 
 
-def _is_dt_df(data):
+def _is_dt_df(data: DataType) -> bool:
     return lazy_isinstance(data, 'datatable', 'Frame') or \
         lazy_isinstance(data, 'datatable', 'DataTable')
 
@@ -443,12 +455,12 @@ def _is_dt_df(data):
 
 
 def _transform_dt_df(
-    data,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-    meta=None,
-    meta_type=None,
-):
+    data: DataType,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+    meta: Optional[str] = None,
+    meta_type: Optional[NumpyDType] = None,
+) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
     """Validate feature names and types if data table"""
     if meta and data.shape[1] > 1:
         raise ValueError('DataTable for meta info cannot have multiple columns')
@@ -482,13 +494,13 @@ def _transform_dt_df(
 
 
 def _from_dt_df(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    data: DataType,
+    missing: Optional[FloatCompatible],
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool,
-) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
+) -> DispatchedDataBackendReturnType:
     if enable_categorical:
         raise ValueError("categorical data in datatable is not supported yet.")
     data, feature_names, feature_types = _transform_dt_df(
@@ -525,7 +537,7 @@ def _from_dt_df(
     return handle, feature_names, feature_types
 
 
-def _is_arrow(data) -> bool:
+def _is_arrow(data: DataType) -> bool:
     try:
         import pyarrow as pa
         from pyarrow import dataset as arrow_dataset
@@ -571,13 +583,13 @@ def _next(data_handle: int) -> int:
 
 
 def _from_arrow(
-    data,
-    missing: float,
+    data: DataType,
+    missing: FloatCompatible,
     nthread: int,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool,
-) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
+) -> DispatchedDataBackendReturnType:
     import pyarrow as pa
 
     if not all(
@@ -605,11 +617,11 @@ def _from_arrow(
     return handle, feature_names, feature_types
 
 
-def _is_cudf_df(data) -> bool:
+def _is_cudf_df(data: DataType) -> bool:
     return lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
 
 
-def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
+def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes:
     """Extract CuDF __cuda_array_interface__.  This is special as it returns a new list of
     data and a list of array interfaces.  The data is list of categorical codes that
     caller can safely ignore, but have to keep their reference alive until usage of array
@@ -645,11 +657,11 @@ def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
 
 
 def _transform_cudf_df(
-    data,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    data: DataType,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool,
-):
+) -> Tuple[ctypes.c_void_p, list, Optional[FeatureNames], Optional[FeatureTypes]]:
     try:
         from cudf.api.types import is_categorical_dtype
     except ImportError:
@@ -709,13 +721,13 @@ def _transform_cudf_df(
 
 
 def _from_cudf_df(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    data: DataType,
+    missing: FloatCompatible,
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool,
-) -> Tuple[ctypes.c_void_p, Any, Any]:
+) -> DispatchedDataBackendReturnType:
     data, cat_codes, feature_names, feature_types = _transform_cudf_df(
         data, feature_names, feature_types, enable_categorical
     )
@@ -732,7 +744,7 @@ def _from_cudf_df(
     return handle, feature_names, feature_types
 
 
-def _is_cudf_ser(data):
+def _is_cudf_ser(data: DataType) -> bool:
     try:
         import cudf
     except ImportError:
@@ -740,13 +752,13 @@ def _is_cudf_ser(data):
     return isinstance(data, cudf.Series)
 
 
-def _is_cupy_array(data: Any) -> bool:
+def _is_cupy_array(data: DataType) -> bool:
     return lazy_isinstance(data, "cupy.core.core", "ndarray") or lazy_isinstance(
         data, "cupy._core.core", "ndarray"
     )
 
 
-def _transform_cupy_array(data):
+def _transform_cupy_array(data: DataType) -> CupyT:
     import cupy  # pylint: disable=import-error
     if not hasattr(data, '__cuda_array_interface__') and hasattr(
             data, '__array__'):
@@ -757,12 +769,12 @@ def _transform_cupy_array(data):
 
 
 def _from_cupy_array(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: FloatCompatible,
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     """Initialize DMatrix from cupy ndarray."""
     data = _transform_cupy_array(data)
     interface_str = _cuda_array_interface(data)
@@ -776,7 +788,7 @@ def _from_cupy_array(
     return handle, feature_names, feature_types
 
 
-def _is_cupy_csr(data):
+def _is_cupy_csr(data: DataType) -> bool:
     try:
         import cupyx
     except ImportError:
@@ -784,7 +796,7 @@ def _is_cupy_csr(data):
     return isinstance(data, cupyx.scipy.sparse.csr_matrix)
 
 
-def _is_cupy_csc(data):
+def _is_cupy_csc(data: DataType) -> bool:
     try:
         import cupyx
     except ImportError:
@@ -792,11 +804,11 @@ def _is_cupy_csc(data):
     return isinstance(data, cupyx.scipy.sparse.csc_matrix)
 
 
-def _is_dlpack(data):
+def _is_dlpack(data: DataType) -> bool:
     return 'PyCapsule' in str(type(data)) and "dltensor" in str(data)
 
 
-def _transform_dlpack(data):
+def _transform_dlpack(data: DataType) -> bool:
     from cupy import fromDlpack  # pylint: disable=E0401
     assert 'used_dltensor' not in str(data)
     data = fromDlpack(data)
@@ -804,27 +816,27 @@ def _transform_dlpack(data):
 
 
 def _from_dlpack(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: FloatCompatible,
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     data = _transform_dlpack(data)
     return _from_cupy_array(data, missing, nthread, feature_names,
                             feature_types)
 
 
-def _is_uri(data):
+def _is_uri(data: DataType) -> bool:
     return isinstance(data, (str, os.PathLike))
 
 
 def _from_uri(
-    data,
-    missing,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: Optional[FloatCompatible],
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     _warn_unused_missing(data, missing)
     handle = ctypes.c_void_p()
     data = os.fspath(os.path.expanduser(data))
@@ -834,51 +846,51 @@ def _from_uri(
     return handle, feature_names, feature_types
 
 
-def _is_list(data):
+def _is_list(data: DataType) -> bool:
     return isinstance(data, list)
 
 
 def _from_list(
-    data,
-    missing,
-    n_threads,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: Sequence,
+    missing: FloatCompatible,
+    n_threads: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     array = np.array(data)
     _check_data_shape(data)
     return _from_numpy_array(array, missing, n_threads, feature_names, feature_types)
 
 
-def _is_tuple(data):
+def _is_tuple(data: DataType) -> bool:
     return isinstance(data, tuple)
 
 
 def _from_tuple(
-    data,
-    missing,
-    n_threads,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: Sequence,
+    missing: FloatCompatible,
+    n_threads: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     return _from_list(data, missing, n_threads, feature_names, feature_types)
 
 
-def _is_iter(data):
+def _is_iter(data: DataType) -> bool:
     return isinstance(data, DataIter)
 
 
-def _has_array_protocol(data):
+def _has_array_protocol(data: DataType) -> bool:
     return hasattr(data, '__array__')
 
 
-def _convert_unknown_data(data):
+def _convert_unknown_data(data: DataType) -> DataType:
     warnings.warn(
         f'Unknown data type: {type(data)}, trying to convert it to csr_matrix',
         UserWarning
     )
     try:
-        import scipy
+        import scipy.sparse
     except ImportError:
         return None
 
@@ -891,13 +903,13 @@ def _convert_unknown_data(data):
 
 
 def dispatch_data_backend(
-    data,
-    missing,
-    threads,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    data: DataType,
+    missing: FloatCompatible,  # Or Optional[Float]
+    threads: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool = False,
-):
+) -> DispatchedDataBackendReturnType:
     '''Dispatch data for DMatrix.'''
     if not _is_cudf_ser(data) and not _is_pandas_series(data):
         _check_data_shape(data)
@@ -964,7 +976,7 @@ def dispatch_data_backend(
     raise TypeError('Not supported type for data.' + str(type(data)))
 
 
-def _to_data_type(dtype: str, name: str):
+def _to_data_type(dtype: str, name: str) -> int:
     dtype_map = {'float32': 1, 'float64': 2, 'uint32': 3, 'uint64': 4}
     if dtype not in dtype_map:
         raise TypeError(
@@ -973,7 +985,7 @@ def _to_data_type(dtype: str, name: str):
     return dtype_map[dtype]
 
 
-def _validate_meta_shape(data: Any, name: str) -> None:
+def _validate_meta_shape(data: DataType, name: str) -> None:
     if hasattr(data, "shape"):
         msg = f"Invalid shape: {data.shape} for {name}"
         if name in _matrix_meta:
@@ -990,7 +1002,7 @@ def _validate_meta_shape(data: Any, name: str) -> None:
 def _meta_from_numpy(
     data: np.ndarray,
     field: str,
-    dtype: Optional[Union[np.dtype, str]],
+    dtype: Optional[NumpyDType],
     handle: ctypes.c_void_p,
 ) -> None:
     data, dtype = _ensure_np_dtype(data, dtype)
@@ -1001,16 +1013,26 @@ def _meta_from_numpy(
     _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str))
 
 
-def _meta_from_list(data, field, dtype, handle):
-    data = np.array(data)
-    _meta_from_numpy(data, field, dtype, handle)
+def _meta_from_list(
+    data: Sequence,
+    field: str,
+    dtype: Optional[NumpyDType],
+    handle: ctypes.c_void_p
+) -> None:
+    data_np = np.array(data)
+    _meta_from_numpy(data_np, field, dtype, handle)
 
 
-def _meta_from_tuple(data, field, dtype, handle):
+def _meta_from_tuple(
+    data: Sequence,
+    field: str,
+    dtype: Optional[NumpyDType],
+    handle: ctypes.c_void_p
+) -> None:
     return _meta_from_list(data, field, dtype, handle)
 
 
-def _meta_from_cudf_df(data, field: str, handle: ctypes.c_void_p) -> None:
+def _meta_from_cudf_df(data: DataType, field: str, handle: ctypes.c_void_p) -> None:
     if field not in _matrix_meta:
         _meta_from_cudf_series(data.iloc[:, 0], field, handle)
     else:
@@ -1019,7 +1041,7 @@ def _meta_from_cudf_df(data, field: str, handle: ctypes.c_void_p) -> None:
         _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface))
 
 
-def _meta_from_cudf_series(data, field, handle):
+def _meta_from_cudf_series(data: DataType, field: str, handle: ctypes.c_void_p) -> None:
     interface = bytes(json.dumps([data.__cuda_array_interface__],
                                  indent=2), 'utf-8')
     _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle,
@@ -1027,7 +1049,7 @@ def _meta_from_cudf_series(data, field, handle):
                                                    interface))
 
 
-def _meta_from_cupy_array(data, field, handle):
+def _meta_from_cupy_array(data: DataType, field: str, handle: ctypes.c_void_p) -> None:
     data = _transform_cupy_array(data)
     interface = bytes(json.dumps([data.__cuda_array_interface__],
                                  indent=2), 'utf-8')
@@ -1036,14 +1058,22 @@ def _meta_from_cupy_array(data, field, handle):
                                                    interface))
 
 
-def _meta_from_dt(data, field: str, dtype, handle: ctypes.c_void_p):
+def _meta_from_dt(
+    data: DataType,
+    field: str,
+    dtype: Optional[NumpyDType],
+    handle: ctypes.c_void_p
+) -> None:
     data, _, _ = _transform_dt_df(data, None, None, field, dtype)
     _meta_from_numpy(data, field, dtype, handle)
 
 
 def dispatch_meta_backend(
-    matrix: DMatrix, data, name: str, dtype: Optional[Union[str, np.dtype]] = None
-):
+    matrix: DMatrix,
+    data: DataType,
+    name: str,
+    dtype: Optional[NumpyDType] = None
+) -> None:
     '''Dispatch for meta info.'''
     handle = matrix.handle
     assert handle is not None
@@ -1060,8 +1090,7 @@ def dispatch_meta_backend(
         _meta_from_numpy(data, name, dtype, handle)
         return
     if _is_pandas_df(data):
-        data, _, _ = _transform_pandas_df(data, False, meta=name,
-                                          meta_type=dtype)
+        data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
         _meta_from_numpy(data, name, dtype, handle)
         return
     if _is_pandas_series(data):
@@ -1107,7 +1136,7 @@ class SingleBatchInternalIter(DataIter):  # pylint: disable=R0902
     area for meta info.
 
     '''
-    def __init__(self, **kwargs: Any):
+    def __init__(self, **kwargs: Any) -> None:
         self.kwargs = kwargs
         self.it = 0             # pylint: disable=invalid-name
         super().__init__()
@@ -1124,11 +1153,13 @@ def reset(self) -> None:
 
 
 def _proxy_transform(
-    data,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    data: DataType,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool,
-):
+) -> Tuple[
+    Union[bool, ctypes.c_void_p, np.ndarray],
+        Optional[list], Optional[FeatureNames], Optional[FeatureTypes]]:
     if _is_cudf_df(data) or _is_cudf_ser(data):
         return _transform_cudf_df(
             data, feature_names, feature_types, enable_categorical
@@ -1152,7 +1183,7 @@ def _proxy_transform(
 
 def dispatch_proxy_set_data(
     proxy: _ProxyDMatrix,
-    data: Any,
+    data: DataType,
     cat_codes: Optional[list],
     allow_host: bool,
 ) -> None:
@@ -1162,11 +1193,11 @@ def dispatch_proxy_set_data(
 
     if _is_cudf_df(data):
         # pylint: disable=W0212
-        proxy._set_data_from_cuda_columnar(data, cat_codes)
+        proxy._set_data_from_cuda_columnar(data, cast(List, cat_codes))
         return
     if _is_cudf_ser(data):
         # pylint: disable=W0212
-        proxy._set_data_from_cuda_columnar(data, cat_codes)
+        proxy._set_data_from_cuda_columnar(data, cast(List, cat_codes))
         return
     if _is_cupy_array(data):
         proxy._set_data_from_cuda_interface(data)  # pylint: disable=W0212
diff --git a/python-package/xgboost/plotting.py b/python-package/xgboost/plotting.py
index 75159d10434e..85a8428bc181 100644
--- a/python-package/xgboost/plotting.py
+++ b/python-package/xgboost/plotting.py
@@ -4,16 +4,34 @@
 """Plotting Library."""
 from io import BytesIO
 import json
+from typing import Optional, Any
+
 import numpy as np
+
+from ._typing import PathLike
 from .core import Booster
 from .sklearn import XGBModel
 
-
-def plot_importance(booster, ax=None, height=0.2,
-                    xlim=None, ylim=None, title='Feature importance',
-                    xlabel='F score', ylabel='Features', fmap='',
-                    importance_type='weight', max_num_features=None,
-                    grid=True, show_values=True, **kwargs):
+Axes = Any  # real type is matplotlib.axes.Axes
+GraphvizSource = Any  # real type is graphviz.Source
+
+
+def plot_importance(
+    booster: Booster,
+    ax: Optional[Axes] = None,
+    height: float = 0.2,
+    xlim: Optional[tuple] = None,
+    ylim: Optional[tuple] = None,
+    title: str = "Feature importance",
+    xlabel: str = "F score",
+    ylabel: str = "Features",
+    fmap: PathLike = "",
+    importance_type: str = "weight",
+    max_num_features: Optional[int] = None,
+    grid: bool = True,
+    show_values: bool = True,
+    **kwargs: Any
+) -> Axes:
     """Plot importance based on fitted trees.
 
     Parameters
@@ -78,9 +96,9 @@ def plot_importance(booster, ax=None, height=0.2,
     tuples = [(k, importance[k]) for k in importance]
     if max_num_features is not None:
         # pylint: disable=invalid-unary-operand-type
-        tuples = sorted(tuples, key=lambda x: x[1])[-max_num_features:]
+        tuples = sorted(tuples, key=lambda _x: _x[1])[-max_num_features:]
     else:
-        tuples = sorted(tuples, key=lambda x: x[1])
+        tuples = sorted(tuples, key=lambda _x: _x[1])
     labels, values = zip(*tuples)
 
     if ax is None:
@@ -120,9 +138,17 @@ def plot_importance(booster, ax=None, height=0.2,
     return ax
 
 
-def to_graphviz(booster, fmap='', num_trees=0, rankdir=None,
-                yes_color=None, no_color=None,
-                condition_node_params=None, leaf_node_params=None, **kwargs):
+def to_graphviz(
+    booster: Booster,
+    fmap: PathLike = "",
+    num_trees: int = 0,
+    rankdir: Optional[str] = None,
+    yes_color: Optional[str] = None,
+    no_color: Optional[str] = None,
+    condition_node_params: Optional[dict] = None,
+    leaf_node_params: Optional[dict] = None,
+    **kwargs: Any
+) -> GraphvizSource:
     """Convert specified tree to graphviz instance. IPython can automatically plot
     the returned graphiz instance. Otherwise, you should call .render() method
     of the returned graphiz instance.
@@ -212,7 +238,14 @@ def to_graphviz(booster, fmap='', num_trees=0, rankdir=None,
     return g
 
 
-def plot_tree(booster, fmap='', num_trees=0, rankdir=None, ax=None, **kwargs):
+def plot_tree(
+    booster: Booster,
+    fmap: PathLike = "",
+    num_trees: int = 0,
+    rankdir: Optional[str] = None,
+    ax: Optional[Axes] = None,
+    **kwargs: Any
+) -> Axes:
     """Plot specified tree.
 
     Parameters
diff --git a/python-package/xgboost/rabit.py b/python-package/xgboost/rabit.py
index 465a5611a2d1..f5da7a353330 100644
--- a/python-package/xgboost/rabit.py
+++ b/python-package/xgboost/rabit.py
@@ -230,7 +230,9 @@ def version_number() -> int:
 class RabitContext:
     """A context controlling rabit initialization and finalization."""
 
-    def __init__(self, args: List[bytes]) -> None:
+    def __init__(self, args: List[bytes] = None) -> None:
+        if args is None:
+            args = []
         self.args = args
 
     def __enter__(self) -> None:
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index ae883e30ee17..f6b43d8de448 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -4,8 +4,19 @@
 import warnings
 import json
 import os
-from typing import Union, Optional, List, Dict, Callable, Tuple, Any, TypeVar, Type, cast
-from typing import Sequence
+from typing import (
+    Union,
+    Optional,
+    List,
+    Dict,
+    Callable,
+    Sequence,
+    Tuple,
+    Any,
+    TypeVar,
+    Type,
+    cast,
+)
 import numpy as np
 
 from .core import Booster, DMatrix, XGBoostError
@@ -14,7 +25,7 @@
 from .training import train
 from .callback import TrainingCallback
 from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array
-from ._typing import ArrayLike, FeatureTypes
+from ._typing import ArrayLike, FeatureNames, FeatureTypes
 
 # Do not use class names on scikit-learn directly.  Re-define the classes on
 # .compat to guarantee the behavior without scikit-learn
@@ -401,7 +412,7 @@ def _wrap_evaluation_matrices(
     eval_qid: Optional[Sequence[Any]],
     create_dmatrix: Callable,
     enable_categorical: bool,
-    feature_types: FeatureTypes,
+    feature_types: Optional[FeatureTypes],
 ) -> Tuple[Any, List[Tuple[Any, str]]]:
     """Convert array_like evaluation matrices into DMatrix.  Perform validation on the way.
 
@@ -717,7 +728,7 @@ def _get_type(self) -> str:
         return self._estimator_type  # pylint: disable=no-member
 
     def save_model(self, fname: Union[str, os.PathLike]) -> None:
-        meta = {}
+        meta: Dict[str, Any] = {}
         for k, v in self.__dict__.items():
             if k == '_le':
                 meta['_le'] = self._le.to_json()
@@ -1231,7 +1242,7 @@ def dft() -> str:
             importance_type=self.importance_type if self.importance_type else dft()
         )
         if b.feature_names is None:
-            feature_names = [f"f{i}" for i in range(self.n_features_in_)]
+            feature_names: FeatureNames = [f"f{i}" for i in range(self.n_features_in_)]
         else:
             feature_names = b.feature_names
         # gblinear returns all features so the `get` in next line is only for gbtree.
diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py
index 38567b6bf949..2103303fbe20 100644
--- a/python-package/xgboost/training.py
+++ b/python-package/xgboost/training.py
@@ -5,20 +5,24 @@
 import copy
 import os
 import warnings
-from typing import Optional, Dict, Any, Union, Tuple, Sequence
+from typing import Optional, Dict, Any, Union, Tuple, Sequence, List, cast, Iterable
 
 import numpy as np
+
+from .callback import TrainingCallback, CallbackContainer, EvaluationMonitor, EarlyStopping
 from .core import Booster, DMatrix, XGBoostError, _deprecate_positional_args
 from .core import Metric, Objective
-from .compat import (SKLEARN_INSTALLED, XGBStratifiedKFold)
-from . import callback
+from .compat import SKLEARN_INSTALLED, XGBStratifiedKFold, DataFrame
+from ._typing import _F, FPreProcCallable, BoosterParam
+
+_CVFolds = Sequence["CVPack"]
 
 
 def _assert_new_callback(
-    callbacks: Optional[Sequence[callback.TrainingCallback]]
+    callbacks: Optional[Sequence[TrainingCallback]]
 ) -> None:
     is_new_callback: bool = not callbacks or all(
-        isinstance(c, callback.TrainingCallback) for c in callbacks
+        isinstance(c, TrainingCallback) for c in callbacks
     )
     if not is_new_callback:
         link = "https://xgboost.readthedocs.io/en/latest/python/callbacks.html"
@@ -56,10 +60,10 @@ def train(
     feval: Optional[Metric] = None,
     maximize: Optional[bool] = None,
     early_stopping_rounds: Optional[int] = None,
-    evals_result: callback.TrainingCallback.EvalsLog = None,
+    evals_result: TrainingCallback.EvalsLog = None,
     verbose_eval: Optional[Union[bool, int]] = True,
     xgb_model: Optional[Union[str, os.PathLike, Booster, bytearray]] = None,
-    callbacks: Optional[Sequence[callback.TrainingCallback]] = None,
+    callbacks: Optional[Sequence[TrainingCallback]] = None,
     custom_metric: Optional[Metric] = None,
 ) -> Booster:
     """Train a booster with given parameters.
@@ -159,12 +163,12 @@ def train(
     _assert_new_callback(callbacks)
     if verbose_eval:
         verbose_eval = 1 if verbose_eval is True else verbose_eval
-        callbacks.append(callback.EvaluationMonitor(period=verbose_eval))
+        callbacks.append(EvaluationMonitor(period=verbose_eval))
     if early_stopping_rounds:
         callbacks.append(
-            callback.EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
+            EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
         )
-    cb_container = callback.CallbackContainer(
+    cb_container = CallbackContainer(
         callbacks,
         metric=metric_fn,
         # For old `feval` parameter, the behavior is unchanged.  For the new
@@ -194,71 +198,73 @@ def train(
 
 class CVPack:
     """"Auxiliary datastruct to hold one fold of CV."""
-    def __init__(self, dtrain, dtest, param):
+    def __init__(self, dtrain: DMatrix, dtest: DMatrix, param: Optional[Union[Dict, List]]) -> None:
         """"Initialize the CVPack"""
         self.dtrain = dtrain
         self.dtest = dtest
         self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
         self.bst = Booster(param, [dtrain, dtest])
 
-    def __getattr__(self, name):
-        def _inner(*args, **kwargs):
+    def __getattr__(self, name: str) -> _F:
+        def _inner(*args: Any, **kwargs: Any) -> Any:
             return getattr(self.bst, name)(*args, **kwargs)
-        return _inner
+        return cast(_F, _inner)
 
-    def update(self, iteration, fobj):
+    def update(self, iteration: int, fobj: Optional[Objective]) -> None:
         """"Update the boosters for one iteration"""
         self.bst.update(self.dtrain, iteration, fobj)
 
-    def eval(self, iteration, feval, output_margin):
+    def eval(self, iteration: int, feval: Optional[Metric], output_margin: bool) -> str:
         """"Evaluate the CVPack for one iteration."""
         return self.bst.eval_set(self.watchlist, iteration, feval, output_margin)
 
 
 class _PackedBooster:
-    def __init__(self, cvfolds) -> None:
+    def __init__(self, cvfolds: _CVFolds) -> None:
         self.cvfolds = cvfolds
 
-    def update(self, iteration, obj):
+    def update(self, iteration: int, obj: Optional[Objective]) -> None:
         '''Iterate through folds for update'''
         for fold in self.cvfolds:
             fold.update(iteration, obj)
 
-    def eval(self, iteration, feval, output_margin):
+    def eval(self, iteration: int, feval: Optional[Metric], output_margin: bool) -> List[str]:
         '''Iterate through folds for eval'''
         result = [f.eval(iteration, feval, output_margin) for f in self.cvfolds]
         return result
 
-    def set_attr(self, **kwargs):
+    def set_attr(self, **kwargs: Optional[str]) -> Any:
         '''Iterate through folds for setting attributes'''
         for f in self.cvfolds:
             f.bst.set_attr(**kwargs)
 
-    def attr(self, key):
+    def attr(self, key: str) -> Optional[str]:
         '''Redirect to booster attr.'''
         return self.cvfolds[0].bst.attr(key)
 
-    def set_param(self, params, value=None):
+    def set_param(self,
+                  params: Union[Dict, Iterable[Tuple[str, Any]], str],
+                  value: Optional[str] = None) -> None:
         """Iterate through folds for set_param"""
         for f in self.cvfolds:
             f.bst.set_param(params, value)
 
-    def num_boosted_rounds(self):
+    def num_boosted_rounds(self) -> int:
         '''Number of boosted rounds.'''
         return self.cvfolds[0].num_boosted_rounds()
 
     @property
-    def best_iteration(self):
+    def best_iteration(self) -> int:
         '''Get best_iteration'''
-        return int(self.cvfolds[0].bst.attr("best_iteration"))
+        return int(cast(int, self.cvfolds[0].bst.attr("best_iteration")))
 
     @property
-    def best_score(self):
+    def best_score(self) -> float:
         """Get best_score."""
-        return float(self.cvfolds[0].bst.attr("best_score"))
+        return float(cast(float, self.cvfolds[0].bst.attr("best_score")))
 
 
-def groups_to_rows(groups, boundaries):
+def groups_to_rows(groups: List[np.ndarray], boundaries: np.ndarray) -> np.ndarray:
     """
     Given group row boundaries, convert ground indexes to row indexes
     :param groups: list of groups for testing
@@ -268,7 +274,9 @@ def groups_to_rows(groups, boundaries):
     return np.concatenate([np.arange(boundaries[g], boundaries[g+1]) for g in groups])
 
 
-def mkgroupfold(dall, nfold, param, evals=(), fpreproc=None, shuffle=True):
+def mkgroupfold(dall: DMatrix, nfold: int, param: BoosterParam,
+                evals: Sequence[str] = (), fpreproc: FPreProcCallable = None,
+                shuffle: bool = True) -> List[CVPack]:
     """
     Make n folds for cross-validation maintaining groups
     :return: cross-validation folds
@@ -308,8 +316,10 @@ def mkgroupfold(dall, nfold, param, evals=(), fpreproc=None, shuffle=True):
     return ret
 
 
-def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False,
-            folds=None, shuffle=True):
+def mknfold(dall: DMatrix, nfold: int, param: BoosterParam, seed: int,
+            evals: Sequence[str] = (), fpreproc: FPreProcCallable = None,
+            stratified: bool = False, folds: XGBStratifiedKFold = None, shuffle: bool = True
+            ) -> List[CVPack]:
     """
     Make an n-fold list of CVPack from random indices.
     """
@@ -362,11 +372,27 @@ def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False,
     return ret
 
 
-def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None,
-       metrics=(), obj: Optional[Objective] = None,
-       feval=None, maximize=None, early_stopping_rounds=None,
-       fpreproc=None, as_pandas=True, verbose_eval=None, show_stdv=True,
-       seed=0, callbacks=None, shuffle=True, custom_metric: Optional[Metric] = None):
+def cv(
+    params: BoosterParam,
+    dtrain: DMatrix,
+    num_boost_round: int = 10,
+    nfold: int = 3,
+    stratified: bool = False,
+    folds: XGBStratifiedKFold = None,
+    metrics: Sequence[str] = (),
+    obj: Optional[Objective] = None,
+    feval: Optional[Metric] = None,
+    maximize: bool = None,
+    early_stopping_rounds: int = None,
+    fpreproc: FPreProcCallable = None,
+    as_pandas: bool = True,
+    verbose_eval: Optional[Union[int, bool]] = None,
+    show_stdv: bool = True,
+    seed: int = 0,
+    callbacks: Sequence[TrainingCallback] = None,
+    shuffle: bool = True,
+    custom_metric: Optional[Metric] = None,
+) -> Union[Dict[str, float], DataFrame]:
     # pylint: disable = invalid-name
     """Cross-validation with given parameters.
 
@@ -477,7 +503,7 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
 
     params.pop("eval_metric", None)
 
-    results = {}
+    results: Dict[str, List[float]] = {}
     cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc,
                       stratified, folds, shuffle)
 
@@ -490,13 +516,13 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
     if verbose_eval:
         verbose_eval = 1 if verbose_eval is True else verbose_eval
         callbacks.append(
-            callback.EvaluationMonitor(period=verbose_eval, show_stdv=show_stdv)
+            EvaluationMonitor(period=verbose_eval, show_stdv=show_stdv)
         )
     if early_stopping_rounds:
         callbacks.append(
-            callback.EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
+            EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
         )
-    callbacks = callback.CallbackContainer(
+    callbacks_container = CallbackContainer(
         callbacks,
         metric=metric_fn,
         is_cv=True,
@@ -504,16 +530,16 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
     )
 
     booster = _PackedBooster(cvfolds)
-    callbacks.before_training(booster)
+    callbacks_container.before_training(booster)
 
     for i in range(num_boost_round):
-        if callbacks.before_iteration(booster, i, dtrain, None):
+        if callbacks_container.before_iteration(booster, i, dtrain, None):
             break
         booster.update(i, obj)
 
-        should_break = callbacks.after_iteration(booster, i, dtrain, None)
-        res = callbacks.aggregated_cv
-        for key, mean, std in res:
+        should_break = callbacks_container.after_iteration(booster, i, dtrain, None)
+        res = callbacks_container.aggregated_cv
+        for key, mean, std in cast(List[Tuple[str, float, float]], res):
             if key + '-mean' not in results:
                 results[key + '-mean'] = []
             if key + '-std' not in results:
@@ -532,6 +558,6 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
         except ImportError:
             pass
 
-    callbacks.after_training(booster)
+    callbacks_container.after_training(booster)
 
     return results
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 3c7c539802fa..d72eb077b05f 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -300,7 +300,7 @@ XGProxyDMatrixSetDataCudaArrayInterface(DMatrixHandle handle,
   CHECK(p_m);
   auto m =   static_cast<xgboost::data::DMatrixProxy*>(p_m->get());
   CHECK(m) << "Current DMatrix type does not support set data.";
-  m->SetData(c_interface_str);
+  m->SetCUDAArray(c_interface_str);
   API_END();
 }
 
@@ -312,7 +312,7 @@ XGB_DLL int XGProxyDMatrixSetDataCudaColumnar(DMatrixHandle handle,
   CHECK(p_m);
   auto m =   static_cast<xgboost::data::DMatrixProxy*>(p_m->get());
   CHECK(m) << "Current DMatrix type does not support set data.";
-  m->SetData(c_interface_str);
+  m->SetCUDAArray(c_interface_str);
   API_END();
 }
 
@@ -825,74 +825,69 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle,
   API_END();
 }
 
-template <typename T>
-void InplacePredictImpl(std::shared_ptr<T> x, std::shared_ptr<DMatrix> p_m,
-                        char const *c_json_config, Learner *learner,
-                        size_t n_rows, size_t n_cols,
-                        xgboost::bst_ulong const **out_shape,
-                        xgboost::bst_ulong *out_dim, const float **out_result) {
+void InplacePredictImpl(std::shared_ptr<DMatrix> p_m, char const *c_json_config, Learner *learner,
+                        xgboost::bst_ulong const **out_shape, xgboost::bst_ulong *out_dim,
+                        const float **out_result) {
   auto config = Json::Load(StringView{c_json_config});
   CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";
 
-  HostDeviceVector<float>* p_predt { nullptr };
+  HostDeviceVector<float> *p_predt{nullptr};
   auto type = PredictionType(RequiredArg<Integer>(config, "type", __func__));
   float missing = GetMissing(config);
-  learner->InplacePredict(x, p_m, type, missing, &p_predt,
+  learner->InplacePredict(p_m, type, missing, &p_predt,
                           RequiredArg<Integer>(config, "iteration_begin", __func__),
                           RequiredArg<Integer>(config, "iteration_end", __func__));
   CHECK(p_predt);
   auto &shape = learner->GetThreadLocal().prediction_shape;
-  auto chunksize = n_rows == 0 ? 0 : p_predt->Size() / n_rows;
+  auto const &info = p_m->Info();
+  auto n_samples = info.num_row_;
+  auto n_features = info.num_col_;
+  auto chunksize = n_samples == 0 ? 0 : p_predt->Size() / n_samples;
   bool strict_shape = RequiredArg<Boolean>(config, "strict_shape", __func__);
-  CalcPredictShape(strict_shape, type, n_rows, n_cols, chunksize, learner->Groups(),
+  CalcPredictShape(strict_shape, type, n_samples, n_features, chunksize, learner->Groups(),
                    learner->BoostedRounds(), &shape, out_dim);
   *out_result = dmlc::BeginPtr(p_predt->HostVector());
   *out_shape = dmlc::BeginPtr(shape);
 }
 
-// A hidden API as cache id is not being supported yet.
-XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle,
-                                      char const *array_interface,
-                                      char const *c_json_config,
-                                      DMatrixHandle m,
+XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle, char const *array_interface,
+                                      char const *c_json_config, DMatrixHandle m,
                                       xgboost::bst_ulong const **out_shape,
-                                      xgboost::bst_ulong *out_dim,
-                                      const float **out_result) {
+                                      xgboost::bst_ulong *out_dim, const float **out_result) {
   API_BEGIN();
   CHECK_HANDLE();
-  std::shared_ptr<xgboost::data::ArrayAdapter> x{
-      new xgboost::data::ArrayAdapter(StringView{array_interface})};
-  std::shared_ptr<DMatrix> p_m {nullptr};
-  if (m) {
+  std::shared_ptr<DMatrix> p_m{nullptr};
+  if (!m) {
+    p_m.reset(new data::DMatrixProxy);
+  } else {
     p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
   }
+  auto proxy = dynamic_cast<data::DMatrixProxy *>(p_m.get());
+  CHECK(proxy) << "Invalid input type for inplace predict.";
+  proxy->SetArrayData(array_interface);
   auto *learner = static_cast<xgboost::Learner *>(handle);
-  InplacePredictImpl(x, p_m, c_json_config, learner, x->NumRows(),
-                     x->NumColumns(), out_shape, out_dim, out_result);
+  InplacePredictImpl(p_m, c_json_config, learner, out_shape, out_dim, out_result);
   API_END();
 }
 
-// A hidden API as cache id is not being supported yet.
-XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr,
-                                    char const *indices, char const *data,
-                                    xgboost::bst_ulong cols,
+XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr, char const *indices,
+                                    char const *data, xgboost::bst_ulong cols,
                                     char const *c_json_config, DMatrixHandle m,
                                     xgboost::bst_ulong const **out_shape,
-                                    xgboost::bst_ulong *out_dim,
-                                    const float **out_result) {
+                                    xgboost::bst_ulong *out_dim, const float **out_result) {
   API_BEGIN();
   CHECK_HANDLE();
-  std::shared_ptr<xgboost::data::CSRArrayAdapter> x{
-      new xgboost::data::CSRArrayAdapter{StringView{indptr},
-                                         StringView{indices}, StringView{data},
-                                         static_cast<size_t>(cols)}};
-  std::shared_ptr<DMatrix> p_m {nullptr};
-  if (m) {
+  std::shared_ptr<DMatrix> p_m{nullptr};
+  if (!m) {
+    p_m.reset(new data::DMatrixProxy);
+  } else {
     p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
   }
+  auto proxy = dynamic_cast<data::DMatrixProxy *>(p_m.get());
+  CHECK(proxy) << "Invalid input type for inplace predict.";
+  proxy->SetCSRData(indptr, indices, data, cols, true);
   auto *learner = static_cast<xgboost::Learner *>(handle);
-  InplacePredictImpl(x, p_m, c_json_config, learner, x->NumRows(),
-                     x->NumColumns(), out_shape, out_dim, out_result);
+  InplacePredictImpl(p_m, c_json_config, learner, out_shape, out_dim, out_result);
   API_END();
 }
 
diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu
index 80408ba466eb..c3b303fa447f 100644
--- a/src/c_api/c_api.cu
+++ b/src/c_api/c_api.cu
@@ -1,10 +1,11 @@
-// Copyright (c) 2019-2021 by Contributors
-#include "xgboost/data.h"
-#include "xgboost/c_api.h"
-#include "xgboost/learner.h"
+// Copyright (c) 2019-2022 by Contributors
+#include "../data/device_adapter.cuh"
+#include "../data/proxy_dmatrix.h"
 #include "c_api_error.h"
 #include "c_api_utils.h"
-#include "../data/device_adapter.cuh"
+#include "xgboost/c_api.h"
+#include "xgboost/data.h"
+#include "xgboost/learner.h"
 
 namespace xgboost {
 
@@ -85,62 +86,65 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data,
   API_END();
 }
 
-template <typename T>
-int InplacePreidctCuda(BoosterHandle handle, char const *c_json_strs,
-                       char const *c_json_config,
-                       std::shared_ptr<DMatrix> p_m,
-                       xgboost::bst_ulong const **out_shape,
-                       xgboost::bst_ulong *out_dim, const float **out_result) {
+int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface,
+                       char const *c_json_config, std::shared_ptr<DMatrix> p_m,
+                       xgboost::bst_ulong const **out_shape, xgboost::bst_ulong *out_dim,
+                       const float **out_result) {
   API_BEGIN();
   CHECK_HANDLE();
+  if (!p_m) {
+    p_m.reset(new data::DMatrixProxy);
+  }
+  auto proxy = dynamic_cast<data::DMatrixProxy *>(p_m.get());
+  CHECK(proxy) << "Invalid input type for inplace predict.";
+  proxy->SetCUDAArray(c_array_interface);
+
   auto config = Json::Load(StringView{c_json_config});
-  CHECK_EQ(get<Integer const>(config["cache_id"]), 0)
-      << "Cache ID is not supported yet";
+  CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";
   auto *learner = static_cast<Learner *>(handle);
 
-  std::string json_str{c_json_strs};
-  auto x = std::make_shared<T>(json_str);
   HostDeviceVector<float> *p_predt{nullptr};
-  auto type = PredictionType(get<Integer const>(config["type"]));
+  auto type = PredictionType(RequiredArg<Integer>(config, "type", __func__));
   float missing = GetMissing(config);
 
-  learner->InplacePredict(x, p_m, type, missing, &p_predt,
-                          get<Integer const>(config["iteration_begin"]),
-                          get<Integer const>(config["iteration_end"]));
+  learner->InplacePredict(p_m, type, missing, &p_predt,
+                          RequiredArg<Integer>(config, "iteration_begin", __func__),
+                          RequiredArg<Integer>(config, "iteration_end", __func__));
   CHECK(p_predt);
   CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
 
   auto &shape = learner->GetThreadLocal().prediction_shape;
-  auto chunksize = x->NumRows() == 0 ? 0 : p_predt->Size() / x->NumRows();
-  bool strict_shape = get<Boolean const>(config["strict_shape"]);
-  CalcPredictShape(strict_shape, type, x->NumRows(), x->NumColumns(), chunksize,
-                   learner->Groups(), learner->BoostedRounds(), &shape,
-                   out_dim);
+  size_t n_samples = p_m->Info().num_row_;
+  auto chunksize = n_samples == 0 ? 0 : p_predt->Size() / n_samples;
+  bool strict_shape = RequiredArg<Boolean>(config, "strict_shape", __func__);
+  CalcPredictShape(strict_shape, type, n_samples, p_m->Info().num_col_, chunksize,
+                   learner->Groups(), learner->BoostedRounds(), &shape, out_dim);
   *out_shape = dmlc::BeginPtr(shape);
   *out_result = p_predt->ConstDevicePointer();
   API_END();
 }
 
-XGB_DLL int XGBoosterPredictFromCudaColumnar(
-    BoosterHandle handle, char const *c_json_strs, char const *c_json_config,
-    DMatrixHandle m, xgboost::bst_ulong const **out_shape,
-    xgboost::bst_ulong *out_dim, const float **out_result) {
-  std::shared_ptr<DMatrix> p_m {nullptr};
+XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *c_json_strs,
+                                             char const *c_json_config, DMatrixHandle m,
+                                             xgboost::bst_ulong const **out_shape,
+                                             xgboost::bst_ulong *out_dim,
+                                             const float **out_result) {
+  std::shared_ptr<DMatrix> p_m{nullptr};
   if (m) {
     p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
   }
-  return InplacePreidctCuda<data::CudfAdapter>(
-      handle, c_json_strs, c_json_config, p_m, out_shape, out_dim, out_result);
+  return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
+                            out_result);
 }
 
-XGB_DLL int XGBoosterPredictFromCudaArray(
-    BoosterHandle handle, char const *c_json_strs, char const *c_json_config,
-    DMatrixHandle m, xgboost::bst_ulong const **out_shape,
-    xgboost::bst_ulong *out_dim, const float **out_result) {
-  std::shared_ptr<DMatrix> p_m {nullptr};
+XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *c_json_strs,
+                                          char const *c_json_config, DMatrixHandle m,
+                                          xgboost::bst_ulong const **out_shape,
+                                          xgboost::bst_ulong *out_dim, const float **out_result) {
+  std::shared_ptr<DMatrix> p_m{nullptr};
   if (m) {
     p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
   }
-  return InplacePreidctCuda<data::CupyAdapter>(
-      handle, c_json_strs, c_json_config, p_m, out_shape, out_dim, out_result);
+  return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
+                            out_result);
 }
diff --git a/src/data/adapter.h b/src/data/adapter.h
index 4025ccd8e996..e6cb6d8b9068 100644
--- a/src/data/adapter.h
+++ b/src/data/adapter.h
@@ -1131,6 +1131,24 @@ class RecordBatchesIterAdapter: public dmlc::DataIter<ArrowColumnarBatchVec> {
   struct ArrowSchemaImporter schema_;
   ArrowColumnarBatchVec batches_;
 };
+
+class SparsePageAdapterBatch {
+  HostSparsePageView page_;
+
+ public:
+  struct Line {
+    SparsePage::Inst inst;
+    bst_row_t ridx;
+    COOTuple GetElement(size_t idx) const {
+      return COOTuple{ridx, inst.data()[idx].index, inst.data()[idx].fvalue};
+    }
+    size_t Size() const { return inst.size(); }
+  };
+
+  explicit SparsePageAdapterBatch(HostSparsePageView page) : page_{std::move(page)} {}
+  Line GetLine(size_t ridx) const { return Line{page_[ridx], ridx}; }
+  size_t Size() const { return page_.Size(); }
+};
 };  // namespace data
 }  // namespace xgboost
 #endif  // XGBOOST_DATA_ADAPTER_H_
diff --git a/src/data/proxy_dmatrix.h b/src/data/proxy_dmatrix.h
index 8a6f67f144d0..8744bbf776f5 100644
--- a/src/data/proxy_dmatrix.h
+++ b/src/data/proxy_dmatrix.h
@@ -55,7 +55,7 @@ class DMatrixProxy : public DMatrix {
  public:
   int DeviceIdx() const { return ctx_.gpu_id; }
 
-  void SetData(char const* c_interface) {
+  void SetCUDAArray(char const* c_interface) {
     common::AssertGPUSupport();
 #if defined(XGBOOST_USE_CUDA)
     std::string interface_str = c_interface;
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index bb7c341f8beb..8f8facc5392b 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -795,88 +795,75 @@ class Dart : public GBTree {
     this->PredictBatchImpl(p_fmat, p_out_preds, training, layer_begin, layer_end);
   }
 
-  void InplacePredict(dmlc::any const &x, std::shared_ptr<DMatrix> p_m,
-                      float missing, PredictionCacheEntry *out_preds,
-                      uint32_t layer_begin, unsigned layer_end) const override {
+  void InplacePredict(std::shared_ptr<DMatrix> p_fmat, float missing,
+                      PredictionCacheEntry* p_out_preds, uint32_t layer_begin,
+                      unsigned layer_end) const override {
     uint32_t tree_begin, tree_end;
     std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
-    std::vector<Predictor const *> predictors{
+    auto n_groups = model_.learner_model_param->num_output_group;
+
+    std::vector<Predictor const*> predictors {
       cpu_predictor_.get(),
 #if defined(XGBOOST_USE_CUDA)
       gpu_predictor_.get()
 #endif  // defined(XGBOOST_USE_CUDA)
     };
-    Predictor const * predictor {nullptr};
-
-    MetaInfo info;
+    Predictor const* predictor{nullptr};
     StringView msg{"Unsupported data type for inplace predict."};
-    int32_t device = GenericParameter::kCpuId;
+
     PredictionCacheEntry predts;
-    // Inplace predict is not used for training, so no need to drop tree.
-    for (size_t i = tree_begin; i < tree_end; ++i) {
+    if (ctx_->gpu_id != Context::kCpuId) {
+      predts.predictions.SetDevice(ctx_->gpu_id);
+    }
+    predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);
+
+    auto predict_impl = [&](size_t i) {
+      predts.predictions.Fill(0);
       if (tparam_.predictor == PredictorType::kAuto) {
         // Try both predictor implementations
         bool success = false;
-        for (auto const &p : predictors) {
-          if (p && p->InplacePredict(x, nullptr, model_, missing, &predts, i,
-                                     i + 1)) {
+        for (auto const& p : predictors) {
+          if (p && p->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1)) {
             success = true;
             predictor = p;
-#if defined(XGBOOST_USE_CUDA)
-            device = predts.predictions.DeviceIdx();
-#endif  // defined(XGBOOST_USE_CUDA)
             break;
           }
         }
         CHECK(success) << msg;
       } else {
-        // No base margin from meta info for each tree
         predictor = this->GetPredictor().get();
-        bool success = predictor->InplacePredict(x, nullptr, model_, missing,
-                                                 &predts, i, i + 1);
-        device = predts.predictions.DeviceIdx();
+        bool success = predictor->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
         CHECK(success) << msg << std::endl
                        << "Current Predictor: "
-                       << (tparam_.predictor == PredictorType::kCPUPredictor
-                               ? "cpu_predictor"
-                               : "gpu_predictor");
+                       << (tparam_.predictor == PredictorType::kCPUPredictor ? "cpu_predictor"
+                                                                             : "gpu_predictor");
       }
+    };
 
-      auto w = this->weight_drop_.at(i);
-      size_t n_groups = model_.learner_model_param->num_output_group;
-      auto n_rows = predts.predictions.Size() / n_groups;
-
+    // Inplace predict is not used for training, so no need to drop tree.
+    for (size_t i = tree_begin; i < tree_end; ++i) {
+      predict_impl(i);
       if (i == tree_begin) {
-        // base margin is added here.
-        if (p_m) {
-          p_m->Info().num_row_ = n_rows;
-          predictor->InitOutPredictions(p_m->Info(), &out_preds->predictions,
-                                        model_);
-        } else {
-          info.num_row_ = n_rows;
-          predictor->InitOutPredictions(info, &out_preds->predictions, model_);
-        }
+        predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, model_);
       }
-
       // Multiple the tree weight
-      CHECK_EQ(predts.predictions.Size(), out_preds->predictions.Size());
+      auto w = this->weight_drop_.at(i);
       auto group = model_.tree_info.at(i);
+      CHECK_EQ(predts.predictions.Size(), p_out_preds->predictions.Size());
 
-      if (device == GenericParameter::kCpuId) {
-        auto &h_predts = predts.predictions.HostVector();
-        auto &h_out_predts = out_preds->predictions.HostVector();
+      size_t n_rows = p_fmat->Info().num_row_;
+      if (predts.predictions.DeviceIdx() != Context::kCpuId) {
+        p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx());
+        GPUDartInplacePredictInc(p_out_preds->predictions.DeviceSpan(),
+                                 predts.predictions.DeviceSpan(), w, n_rows,
+                                 model_.learner_model_param->base_score, n_groups, group);
+      } else {
+        auto& h_predts = predts.predictions.HostVector();
+        auto& h_out_predts = p_out_preds->predictions.HostVector();
         common::ParallelFor(n_rows, ctx_->Threads(), [&](auto ridx) {
           const size_t offset = ridx * n_groups + group;
-          // Need to remove the base margin from individual tree.
           h_out_predts[offset] += (h_predts[offset] - model_.learner_model_param->base_score) * w;
         });
-      } else {
-        out_preds->predictions.SetDevice(device);
-        predts.predictions.SetDevice(device);
-        GPUDartInplacePredictInc(out_preds->predictions.DeviceSpan(),
-                                 predts.predictions.DeviceSpan(), w, n_rows,
-                                 model_.learner_model_param->base_score,
-                                 n_groups, group);
       }
     }
   }
diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h
index 020b7d0cb9c0..0d2d025e5250 100644
--- a/src/gbm/gbtree.h
+++ b/src/gbm/gbtree.h
@@ -261,8 +261,7 @@ class GBTree : public GradientBooster {
   void PredictBatch(DMatrix *p_fmat, PredictionCacheEntry *out_preds,
                     bool training, unsigned layer_begin, unsigned layer_end) override;
 
-  void InplacePredict(dmlc::any const &x, std::shared_ptr<DMatrix> p_m,
-                      float missing, PredictionCacheEntry *out_preds,
+  void InplacePredict(std::shared_ptr<DMatrix> p_m, float missing, PredictionCacheEntry* out_preds,
                       uint32_t layer_begin, unsigned layer_end) const override {
     CHECK(configured_);
     uint32_t tree_begin, tree_end;
@@ -278,15 +277,14 @@ class GBTree : public GradientBooster {
     if (tparam_.predictor == PredictorType::kAuto) {
       // Try both predictor implementations
       for (auto const &p : predictors) {
-        if (p && p->InplacePredict(x, p_m, model_, missing, out_preds,
-                                   tree_begin, tree_end)) {
+        if (p && p->InplacePredict(p_m, model_, missing, out_preds, tree_begin, tree_end)) {
           return;
         }
       }
       LOG(FATAL) << msg;
     } else {
-      bool success = this->GetPredictor()->InplacePredict(
-          x, p_m, model_, missing, out_preds, tree_begin, tree_end);
+      bool success = this->GetPredictor()->InplacePredict(p_m, model_, missing, out_preds,
+                                                          tree_begin, tree_end);
       CHECK(success) << msg << std::endl
                      << "Current Predictor: "
                      << (tparam_.predictor == PredictorType::kCPUPredictor
diff --git a/src/learner.cc b/src/learner.cc
index 568cfc680714..5d7d067e71e2 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -1277,15 +1277,12 @@ class LearnerImpl : public LearnerIO {
     return (*LearnerAPIThreadLocalStore::Get())[this];
   }
 
-  void InplacePredict(dmlc::any const &x, std::shared_ptr<DMatrix> p_m,
-                      PredictionType type, float missing,
-                      HostDeviceVector<bst_float> **out_preds,
-                      uint32_t iteration_begin,
+  void InplacePredict(std::shared_ptr<DMatrix> p_m, PredictionType type, float missing,
+                      HostDeviceVector<bst_float>** out_preds, uint32_t iteration_begin,
                       uint32_t iteration_end) override {
     this->Configure();
     auto& out_predictions = this->GetThreadLocal().prediction_entry;
-    this->gbm_->InplacePredict(x, p_m, missing, &out_predictions,
-                               iteration_begin, iteration_end);
+    this->gbm_->InplacePredict(p_m, missing, &out_predictions, iteration_begin, iteration_end);
     if (type == PredictionType::kValue) {
       obj_->PredTransform(&out_predictions.predictions);
     } else if (type == PredictionType::kMargin) {
diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc
index 892c956319bb..b5dd9b4af12b 100644
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -1,27 +1,27 @@
 /*!
  * Copyright by Contributors 2017-2021
  */
-#include <dmlc/omp.h>
 #include <dmlc/any.h>
+#include <dmlc/omp.h>
 
 #include <cstddef>
 #include <limits>
 #include <mutex>
 
+#include "../common/categorical.h"
+#include "../common/math.h"
+#include "../common/threading_utils.h"
+#include "../data/adapter.h"
+#include "../data/proxy_dmatrix.h"
+#include "../gbm/gbtree_model.h"
+#include "predict_fn.h"
 #include "xgboost/base.h"
 #include "xgboost/data.h"
+#include "xgboost/host_device_vector.h"
+#include "xgboost/logging.h"
 #include "xgboost/predictor.h"
 #include "xgboost/tree_model.h"
 #include "xgboost/tree_updater.h"
-#include "xgboost/logging.h"
-#include "xgboost/host_device_vector.h"
-
-#include "predict_fn.h"
-#include "../data/adapter.h"
-#include "../common/math.h"
-#include "../common/threading_utils.h"
-#include "../common/categorical.h"
-#include "../gbm/gbtree_model.h"
 
 namespace xgboost {
 namespace predictor {
@@ -327,22 +327,24 @@ class CPUPredictor : public Predictor {
         &predictions, model, tree_begin, tree_end, &thread_temp, n_threads);
   }
 
-  bool InplacePredict(dmlc::any const &x, std::shared_ptr<DMatrix> p_m,
-                      const gbm::GBTreeModel &model, float missing,
+  bool InplacePredict(std::shared_ptr<DMatrix> p_m, const gbm::GBTreeModel &model, float missing,
                       PredictionCacheEntry *out_preds, uint32_t tree_begin,
                       unsigned tree_end) const override {
+    auto proxy = dynamic_cast<data::DMatrixProxy *>(p_m.get());
+    CHECK(proxy)<< "Inplace predict accepts only DMatrixProxy as input.";
+    auto x = proxy->Adapter();
     if (x.type() == typeid(std::shared_ptr<data::DenseAdapter>)) {
       this->DispatchedInplacePredict<data::DenseAdapter, kBlockOfRowsSize>(
           x, p_m, model, missing, out_preds, tree_begin, tree_end);
     } else if (x.type() == typeid(std::shared_ptr<data::CSRAdapter>)) {
-      this->DispatchedInplacePredict<data::CSRAdapter, 1>(
-          x, p_m, model, missing, out_preds, tree_begin, tree_end);
+      this->DispatchedInplacePredict<data::CSRAdapter, 1>(x, p_m, model, missing, out_preds,
+                                                          tree_begin, tree_end);
     } else if (x.type() == typeid(std::shared_ptr<data::ArrayAdapter>)) {
-      this->DispatchedInplacePredict<data::ArrayAdapter, kBlockOfRowsSize> (
+      this->DispatchedInplacePredict<data::ArrayAdapter, kBlockOfRowsSize>(
           x, p_m, model, missing, out_preds, tree_begin, tree_end);
     } else if (x.type() == typeid(std::shared_ptr<data::CSRArrayAdapter>)) {
-      this->DispatchedInplacePredict<data::CSRArrayAdapter, 1> (
-          x, p_m, model, missing, out_preds, tree_begin, tree_end);
+      this->DispatchedInplacePredict<data::CSRArrayAdapter, 1>(x, p_m, model, missing, out_preds,
+                                                               tree_begin, tree_end);
     } else {
       return false;
     }
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index 0a09dc255c95..d20918cf2f56 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -1,28 +1,29 @@
 /*!
  * Copyright 2017-2021 by Contributors
  */
+#include <GPUTreeShap/gpu_treeshap.h>
 #include <thrust/copy.h>
 #include <thrust/device_ptr.h>
 #include <thrust/device_vector.h>
 #include <thrust/fill.h>
 #include <thrust/host_vector.h>
-#include <GPUTreeShap/gpu_treeshap.h>
+
 #include <memory>
 
+#include "../common/bitfield.h"
+#include "../common/categorical.h"
+#include "../common/common.h"
+#include "../common/device_helpers.cuh"
+#include "../data/device_adapter.cuh"
+#include "../data/ellpack_page.cuh"
+#include "../data/proxy_dmatrix.h"
+#include "../gbm/gbtree_model.h"
+#include "predict_fn.h"
 #include "xgboost/data.h"
+#include "xgboost/host_device_vector.h"
 #include "xgboost/predictor.h"
 #include "xgboost/tree_model.h"
 #include "xgboost/tree_updater.h"
-#include "xgboost/host_device_vector.h"
-
-#include "predict_fn.h"
-#include "../gbm/gbtree_model.h"
-#include "../data/ellpack_page.cuh"
-#include "../data/device_adapter.cuh"
-#include "../common/common.h"
-#include "../common/bitfield.h"
-#include "../common/categorical.h"
-#include "../common/device_helpers.cuh"
 
 namespace xgboost {
 namespace predictor {
@@ -789,17 +790,19 @@ class GPUPredictor : public xgboost::Predictor {
         m->NumRows(), entry_start, use_shared, output_groups, missing);
   }
 
-  bool InplacePredict(dmlc::any const &x, std::shared_ptr<DMatrix> p_m,
-                      const gbm::GBTreeModel &model, float missing,
-                      PredictionCacheEntry *out_preds, uint32_t tree_begin,
+  bool InplacePredict(std::shared_ptr<DMatrix> p_m, const gbm::GBTreeModel& model, float missing,
+                      PredictionCacheEntry* out_preds, uint32_t tree_begin,
                       unsigned tree_end) const override {
+    auto proxy = dynamic_cast<data::DMatrixProxy*>(p_m.get());
+    CHECK(proxy)<< "Inplace predict accepts only DMatrixProxy as input.";
+    auto x = proxy->Adapter();
     if (x.type() == typeid(std::shared_ptr<data::CupyAdapter>)) {
-      this->DispatchedInplacePredict<
-          data::CupyAdapter, DeviceAdapterLoader<data::CupyAdapterBatch>>(
+      this->DispatchedInplacePredict<data::CupyAdapter,
+                                     DeviceAdapterLoader<data::CupyAdapterBatch>>(
           x, p_m, model, missing, out_preds, tree_begin, tree_end);
     } else if (x.type() == typeid(std::shared_ptr<data::CudfAdapter>)) {
-      this->DispatchedInplacePredict<
-          data::CudfAdapter, DeviceAdapterLoader<data::CudfAdapterBatch>>(
+      this->DispatchedInplacePredict<data::CudfAdapter,
+                                     DeviceAdapterLoader<data::CudfAdapterBatch>>(
           x, p_m, model, missing, out_preds, tree_begin, tree_end);
     } else {
       return false;
diff --git a/src/tree/driver.h b/src/tree/driver.h
index abb8afadcb8a..cab762338e2d 100644
--- a/src/tree/driver.h
+++ b/src/tree/driver.h
@@ -33,10 +33,11 @@ class Driver {
                           std::function<bool(ExpandEntryT, ExpandEntryT)>>;
 
  public:
-  explicit Driver(TrainParam::TreeGrowPolicy policy)
-      : policy_(policy),
-        queue_(policy == TrainParam::kDepthWise ? DepthWise<ExpandEntryT> :
-                                                  LossGuide<ExpandEntryT>) {}
+  explicit Driver(TrainParam param, std::size_t max_node_batch_size = 256)
+      : param_(param),
+        max_node_batch_size_(max_node_batch_size),
+        queue_(param.grow_policy == TrainParam::kDepthWise ? DepthWise<ExpandEntryT>
+                                                           : LossGuide<ExpandEntryT>) {}
   template <typename EntryIterT>
   void Push(EntryIterT begin, EntryIterT end) {
     for (auto it = begin; it != end; ++it) {
@@ -55,24 +56,45 @@ class Driver {
     return queue_.empty();
   }
 
+  // Can a child of this entry still be expanded?
+  // can be used to avoid extra work
+  bool IsChildValid(ExpandEntryT const& parent_entry) {
+    std::cout << "param_.max_depth:" << param_.max_depth << " parent_entry.depth:" << parent_entry.depth << std::endl;
+    if (param_.max_depth > 0 && parent_entry.depth + 1 >= param_.max_depth) return false;
+    std::cout << "param_.max_leaves:" << param_.max_leaves << " parent_entry.num_leaves_:" << num_leaves_ << std::endl;
+    if (param_.max_leaves > 0 && num_leaves_ >= param_.max_leaves) return false;
+    return true;
+  }
+
   // Return the set of nodes to be expanded
   // This set has no dependencies between entries so they may be expanded in
   // parallel or asynchronously
   std::vector<ExpandEntryT> Pop() {
+    std::cout << "queue_.size():" << queue_.size() << std::endl;
     if (queue_.empty()) return {};
     // Return a single entry for loss guided mode
-    if (policy_ == TrainParam::kLossGuide) {
+    if (param_.grow_policy == TrainParam::kLossGuide) {
       ExpandEntryT e = queue_.top();
       queue_.pop();
-      return {e};
+
+      if (e.IsValid(param_, num_leaves_)) {
+        num_leaves_++;
+        return {e};
+      } else {
+        return {};
+      }
     }
     // Return nodes on same level for depth wise
     std::vector<ExpandEntryT> result;
     ExpandEntryT e = queue_.top();
     int level = e.depth;
-    while (e.depth == level && !queue_.empty()) {
+    while (e.depth == level && !queue_.empty() && result.size() < max_node_batch_size_) {
       queue_.pop();
-      result.emplace_back(e);
+      if (e.IsValid(param_, num_leaves_)) {
+        num_leaves_++;
+        result.emplace_back(e);
+      }
+
       if (!queue_.empty()) {
         e = queue_.top();
       }
@@ -81,7 +103,9 @@ class Driver {
   }
 
  private:
-  TrainParam::TreeGrowPolicy policy_;
+  TrainParam param_;
+  bst_node_t num_leaves_ = 1;
+  std::size_t max_node_batch_size_;
   ExpandQueue queue_;
 };
 }  // namespace tree
diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh
index 8d5cc809a280..08b0270ee4d7 100644
--- a/src/tree/gpu_hist/evaluate_splits.cuh
+++ b/src/tree/gpu_hist/evaluate_splits.cuh
@@ -103,7 +103,7 @@ class GPUHistEvaluator {
   }
 
   /**
-   * \brief Get sorted index storage based on the left node of inputs .
+   * \brief Get sorted index storage based on the left node of inputs.
    */
   auto SortedIdx(EvaluateSplitInputs<GradientSumT> left) {
     if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) {
diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu
index 791363a05cdd..efb08d5e44e2 100644
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -247,15 +247,6 @@ void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
   dh::safe_cuda(cudaGetLastError());
 }
 
-template void BuildGradientHistogram<GradientPair>(
-    EllpackDeviceAccessor const& matrix,
-    FeatureGroupsAccessor const& feature_groups,
-    common::Span<GradientPair const> gpair,
-    common::Span<const uint32_t> ridx,
-    common::Span<GradientPair> histogram,
-    HistRounding<GradientPair> rounding,
-    bool force_global_memory);
-
 template void BuildGradientHistogram<GradientPairPrecise>(
     EllpackDeviceAccessor const& matrix,
     FeatureGroupsAccessor const& feature_groups,
diff --git a/src/tree/hist/expand_entry.h b/src/tree/hist/expand_entry.h
index d0edfbd379a6..a00059791e13 100644
--- a/src/tree/hist/expand_entry.h
+++ b/src/tree/hist/expand_entry.h
@@ -24,16 +24,20 @@ struct CPUExpandEntry {
   }
 
   bool IsValid(const TrainParam& param, int num_leaves) const {
-    if (split.loss_chg <= kRtEps) return false;
+    std::cout << "split.loss_chg:" << split.loss_chg << " eps:" << kRtEps << std::endl;
+    if (split.loss_chg <= kRtEps) { std::cout << "NOT VALID!" << std::endl; return false;}
+    std::cout << "split.left_sum.GetHess():" << split.left_sum.GetHess() << " split.right_sum.GetHess():" << split.right_sum.GetHess() << std::endl;
     if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) {
       return false;
     }
+    std::cout << "split.loss_chg:" << split.loss_chg << " param.min_split_loss:" << param.min_split_loss << std::endl;
     if (split.loss_chg < param.min_split_loss) {
       return false;
     }
     if (param.max_depth > 0 && depth == param.max_depth) {
       return false;
     }
+    std::cout << "param.max_leaves:" << param.max_leaves << " num_leaves:" << num_leaves << std::endl;
     if (param.max_leaves > 0 && num_leaves == param.max_leaves) {
       return false;
     }
diff --git a/src/tree/split_evaluator.h b/src/tree/split_evaluator.h
index 8cdf88834559..ba3533e84f43 100644
--- a/src/tree/split_evaluator.h
+++ b/src/tree/split_evaluator.h
@@ -13,6 +13,7 @@
 #include <utility>
 #include <vector>
 #include <limits>
+#include <algorithm>
 
 #include "xgboost/tree_model.h"
 #include "xgboost/host_device_vector.h"
@@ -49,8 +50,9 @@ class TreeEvaluator {
     } else {
       monotone_.HostVector() = p.monotone_constraints;
       monotone_.HostVector().resize(n_features, 0);
-      lower_bounds_.Resize(p.MaxNodes(), -std::numeric_limits<float>::max());
-      upper_bounds_.Resize(p.MaxNodes(), std::numeric_limits<float>::max());
+      // Initialised to some small size, can grow if needed
+      lower_bounds_.Resize(256, -std::numeric_limits<float>::max());
+      upper_bounds_.Resize(256, std::numeric_limits<float>::max());
       has_constraint_ = true;
     }
 
@@ -157,6 +159,15 @@ class TreeEvaluator {
     if (!has_constraint_) {
       return;
     }
+
+    auto max_nidx = std::max(leftid, rightid);
+    if (lower_bounds_.Size() <= max_nidx) {
+      lower_bounds_.Resize(max_nidx * 2 + 1, -std::numeric_limits<float>::max());
+    }
+    if (upper_bounds_.Size() <= max_nidx) {
+      upper_bounds_.Resize(max_nidx * 2 + 1, std::numeric_limits<float>::max());
+    }
+
     common::Transform<>::Init(
         [=] XGBOOST_DEVICE(size_t, common::Span<float> lower,
                            common::Span<float> upper,
diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc
index 887725295bf9..cc4b5a1620f8 100644
--- a/src/tree/updater_approx.cc
+++ b/src/tree/updater_approx.cc
@@ -238,10 +238,9 @@ class GloablApproxBuilder {
     split_conditions_.clear();
     split_ind_.clear();
 
-    Driver<CPUExpandEntry> driver(static_cast<TrainParam::TreeGrowPolicy>(param_.grow_policy));
+    Driver<CPUExpandEntry> driver(param_);
     auto &tree = *p_tree;
     driver.Push({this->InitRoot(p_fmat, gpair, hess, p_tree)});
-    bst_node_t num_leaves{1};
     auto expand_set = driver.Pop();
     int depth = 0;
     bool is_loss_guide = static_cast<TrainParam::TreeGrowPolicy>(param_.grow_policy) ==
@@ -267,16 +266,14 @@ class GloablApproxBuilder {
       bool is_applied = false;
       // candidates that can be applied.
       for (auto const &candidate : expand_set) {
-        if (!candidate.IsValid(param_, num_leaves)) {
-          continue;
-        }
         evaluator_.ApplyTreeSplit(candidate, p_tree);
         applied[candidate.nid] = candidate;
         applied_vec.push_back(candidate);
         CHECK_EQ(applied[candidate.nid].nid, candidate.nid);
-        num_leaves++;
+        // num_leaves++;
         int left_child_nidx = tree[candidate.nid].LeftChild();
-        if (CPUExpandEntry::ChildIsValid(param_, p_tree->GetDepth(left_child_nidx), num_leaves)) {
+//        if (CPUExpandEntry::ChildIsValid(param_, p_tree->GetDepth(left_child_nidx), num_leaves)) {
+        if (driver.IsChildValid(candidate)) {
           valid_candidates.emplace_back(candidate);
         } else {
           if (param_.grow_policy == TrainParam::kLossGuide) {
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 20db181ef187..ae209cdaf205 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -62,7 +62,7 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam);
 #endif  // !defined(GTEST_TEST)
 
 /**
- * \struct  DeviceHistogram
+ * \struct  DeviceHistogramStorage
  *
  * \summary Data storage for node histograms on device. Automatically expands.
  *
@@ -72,20 +72,27 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam);
  * \author  Rory
  * \date    28/07/2018
  */
-template <typename GradientSumT, size_t kStopGrowingSize = 1 << 26>
-class DeviceHistogram {
+template <typename GradientSumT, size_t kStopGrowingSize = 1 << 28>
+class DeviceHistogramStorage {
  private:
   /*! \brief Map nidx to starting index of its histogram. */
   std::map<int, size_t> nidx_map_;
+  // Large buffer of zeroed memory, caches histograms
   dh::device_vector<typename GradientSumT::ValueT> data_;
+  // If we run out of storage allocate one histogram at a time
+  // in overflow. Not cached, overwritten when a new histogram
+  // is requested
+  dh::device_vector<typename GradientSumT::ValueT> overflow_;
+  std::map<int, size_t> overflow_nidx_map_;
   int n_bins_;
   int device_id_;
   static constexpr size_t kNumItemsInGradientSum =
       sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
-  static_assert(kNumItemsInGradientSum == 2,
-                "Number of items in gradient type should be 2.");
+  static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
 
  public:
+  // Start with about 16mb
+  DeviceHistogramStorage() { data_.reserve(1 << 22); }
   void Init(int device_id, int n_bins) {
     this->n_bins_ = n_bins;
     this->device_id_ = device_id;
@@ -93,52 +100,47 @@ class DeviceHistogram {
 
   void Reset() {
     auto d_data = data_.data().get();
-    dh::LaunchN(data_.size(),
-                [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
+    dh::LaunchN(data_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
     nidx_map_.clear();
+    overflow_nidx_map_.clear();
   }
   bool HistogramExists(int nidx) const {
-    return nidx_map_.find(nidx) != nidx_map_.cend();
-  }
-  int Bins() const {
-    return n_bins_;
-  }
-  size_t HistogramSize() const {
-    return n_bins_ * kNumItemsInGradientSum;
+    return nidx_map_.find(nidx) != nidx_map_.cend() ||
+           overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
   }
+  int Bins() const { return n_bins_; }
+  size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
+  dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }
 
-  dh::device_vector<typename GradientSumT::ValueT>& Data() {
-    return data_;
-  }
-
-  void AllocateHistogram(int nidx) {
-    if (HistogramExists(nidx)) return;
+  void AllocateHistograms(const std::vector<int>& new_nidxs) {
+    for (int nidx : new_nidxs) {
+      CHECK(!HistogramExists(nidx));
+    }
     // Number of items currently used in data
     const size_t used_size = nidx_map_.size() * HistogramSize();
-    const size_t new_used_size = used_size + HistogramSize();
-    if (data_.size() >= kStopGrowingSize) {
-      // Recycle histogram memory
-      if (new_used_size <= data_.size()) {
-        // no need to remove old node, just insert the new one.
-        nidx_map_[nidx] = used_size;
-        // memset histogram size in bytes
-      } else {
-        std::pair<int, size_t> old_entry = *nidx_map_.begin();
-        nidx_map_.erase(old_entry.first);
-        nidx_map_[nidx] = old_entry.second;
+    const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
+    if (used_size >= kStopGrowingSize) {
+      // Use overflow
+      // Delete previous entries
+      overflow_nidx_map_.clear();
+      overflow_.resize(HistogramSize() * new_nidxs.size());
+      // Zero memory
+      auto d_data = overflow_.data().get();
+      dh::LaunchN(overflow_.size(),
+                  [=] __device__(size_t idx) { d_data[idx] = 0.0; });
+      // Append new histograms
+      for (int nidx : new_nidxs) {
+        overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize();
       }
-      // Zero recycled memory
-      auto d_data = data_.data().get() + nidx_map_[nidx];
-      dh::LaunchN(n_bins_ * 2,
-                  [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
     } else {
-      // Append new node histogram
-      nidx_map_[nidx] = used_size;
-      // Check there is enough memory for another histogram node
-      if (data_.size() < new_used_size + HistogramSize()) {
-        size_t new_required_memory =
-            std::max(data_.size() * 2, HistogramSize());
-        data_.resize(new_required_memory);
+      CHECK_GE(data_.size(), used_size);
+      // Expand if necessary
+      if (data_.size() < new_used_size) {
+        data_.resize(std::max(data_.size() * 2, new_used_size));
+      }
+      // Append new histograms
+      for (int nidx : new_nidxs) {
+        nidx_map_[nidx] = nidx_map_.size() * HistogramSize();
       }
     }
 
@@ -152,9 +154,16 @@ class DeviceHistogram {
    */
   common::Span<GradientSumT> GetNodeHistogram(int nidx) {
     CHECK(this->HistogramExists(nidx));
-    auto ptr = data_.data().get() + nidx_map_.at(nidx);
-    return common::Span<GradientSumT>(
-        reinterpret_cast<GradientSumT*>(ptr), n_bins_);
+
+    if (nidx_map_.find(nidx) != nidx_map_.cend()) {
+      // Fetch from normal cache
+      auto ptr = data_.data().get() + nidx_map_.at(nidx);
+      return common::Span<GradientSumT>(reinterpret_cast<GradientSumT*>(ptr), n_bins_);
+    } else {
+      // Fetch from overflow
+      auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
+      return common::Span<GradientSumT>(reinterpret_cast<GradientSumT*>(ptr), n_bins_);
+    }
   }
 };
 
@@ -171,7 +180,7 @@ struct GPUHistMakerDevice {
   BatchParam batch_param;
 
   std::unique_ptr<RowPartitioner> row_partitioner;
-  DeviceHistogram<GradientSumT> hist{};
+  DeviceHistogramStorage<GradientSumT> hist{};
 
   dh::caching_device_vector<GradientPair> d_gpair;  // storage for gpair;
   common::Span<GradientPair> gpair;
@@ -195,6 +204,7 @@ struct GPUHistMakerDevice {
 
   std::unique_ptr<FeatureGroups> feature_groups;
 
+
   GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page,
                      common::Span<FeatureType const> _feature_types, bst_uint _n_rows,
                      TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features,
@@ -213,7 +223,7 @@ struct GPUHistMakerDevice {
       // Copy assigning an empty vector causes an exception in MSVC debug builds
       monotone_constraints = param.monotone_constraints;
     }
-    node_sum_gradients.resize(param.MaxNodes());
+    node_sum_gradients.resize(256);
 
     // Init histogram
     hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
@@ -322,7 +332,6 @@ struct GPUHistMakerDevice {
   }
 
   void BuildHist(int nidx) {
-    hist.AllocateHistogram(nidx);
     auto d_node_hist = hist.GetNodeHistogram(nidx);
     auto d_ridx = row_partitioner->GetRows(nidx);
     BuildGradientHistogram(page->GetDeviceAccessor(ctx_->gpu_id),
@@ -330,8 +339,12 @@ struct GPUHistMakerDevice {
                            d_ridx, d_node_hist, histogram_rounding);
   }
 
-  void SubtractionTrick(int nidx_parent, int nidx_histogram,
-                        int nidx_subtraction) {
+  // Attempt to do subtraction trick
+  // return true if succeeded
+  bool SubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) {
+    if (!hist.HistogramExists(nidx_histogram) || !hist.HistogramExists(nidx_parent)) {
+      return false;
+    }
     auto d_node_hist_parent = hist.GetNodeHistogram(nidx_parent);
     auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram);
     auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction);
@@ -340,12 +353,7 @@ struct GPUHistMakerDevice {
       d_node_hist_subtraction[idx] =
           d_node_hist_parent[idx] - d_node_hist_histogram[idx];
     });
-  }
-
-  bool CanDoSubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) {
-    // Make sure histograms are already allocated
-    hist.AllocateHistogram(nidx_subtraction);
-    return hist.HistogramExists(nidx_histogram) && hist.HistogramExists(nidx_parent);
+    return true;
   }
 
   void UpdatePosition(const GPUExpandEntry &e, RegTree* p_tree) {
@@ -505,13 +513,15 @@ struct GPUHistMakerDevice {
     row_partitioner.reset();
   }
 
-  void AllReduceHist(int nidx, dh::AllReducer* reducer) {
+  // num histograms is the number of contiguous histograms in memory to reduce over
+  void AllReduceHist(int nidx, dh::AllReducer* reducer, int num_histograms) {
     monitor.Start("AllReduce");
     auto d_node_hist = hist.GetNodeHistogram(nidx).data();
-    reducer->AllReduceSum(
-        reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
-        reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
-        page->Cuts().TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
+    reducer->AllReduceSum(reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
+                          reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
+                          page->Cuts().TotalBins() *
+                              (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)) *
+                              num_histograms);
 
     monitor.Stop("AllReduce");
   }
@@ -519,33 +529,50 @@ struct GPUHistMakerDevice {
   /**
    * \brief Build GPU local histograms for the left and right child of some parent node
    */
-  void BuildHistLeftRight(const GPUExpandEntry &candidate, int nidx_left,
-        int nidx_right, dh::AllReducer* reducer) {
-    auto build_hist_nidx = nidx_left;
-    auto subtraction_trick_nidx = nidx_right;
-
-    // Decide whether to build the left histogram or right histogram
-    // Use sum of Hessian as a heuristic to select node with fewest training instances
-    bool fewer_right = candidate.split.right_sum.GetHess() < candidate.split.left_sum.GetHess();
-    if (fewer_right) {
-      std::swap(build_hist_nidx, subtraction_trick_nidx);
+  void BuildHistLeftRight(std::vector<GPUExpandEntry> const& candidates, dh::AllReducer* reducer,
+                          const RegTree& tree) {
+    if (candidates.empty()) return;
+    // Some nodes we will manually compute histograms
+    // others we will do by subtraction
+    std::vector<int> hist_nidx;
+    std::vector<int> subtraction_nidx;
+    for (auto& e : candidates) {
+      // Decide whether to build the left histogram or right histogram
+      // Use sum of Hessian as a heuristic to select node with fewest training instances
+      bool fewer_right = e.split.right_sum.GetHess() < e.split.left_sum.GetHess();
+      if (fewer_right) {
+        hist_nidx.emplace_back(tree[e.nid].RightChild());
+        subtraction_nidx.emplace_back(tree[e.nid].LeftChild());
+      } else {
+        hist_nidx.emplace_back(tree[e.nid].LeftChild());
+        subtraction_nidx.emplace_back(tree[e.nid].RightChild());
+      }
+    }
+    std::vector<int> all_new = hist_nidx;
+    all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end());
+    // Allocate the histograms
+    // Guaranteed contiguous memory
+    hist.AllocateHistograms(all_new);
+
+    for (auto nidx : hist_nidx) {
+      this->BuildHist(nidx);
     }
 
-    this->BuildHist(build_hist_nidx);
-    this->AllReduceHist(build_hist_nidx, reducer);
+    // Reduce all in one go
+    // This gives much better latency in a distributed setting
+    // when processing a large batch
+    this->AllReduceHist(hist_nidx.at(0), reducer, hist_nidx.size());
 
-    // Check whether we can use the subtraction trick to calculate the other
-    bool do_subtraction_trick = this->CanDoSubtractionTrick(
-        candidate.nid, build_hist_nidx, subtraction_trick_nidx);
+    for (int i = 0; i < subtraction_nidx.size(); i++) {
+      auto build_hist_nidx = hist_nidx.at(i);
+      auto subtraction_trick_nidx = subtraction_nidx.at(i);
+      auto parent_nidx = candidates.at(i).nid;
 
-    if (do_subtraction_trick) {
-      // Calculate other histogram using subtraction trick
-      this->SubtractionTrick(candidate.nid, build_hist_nidx,
-                             subtraction_trick_nidx);
-    } else {
-      // Calculate other histogram manually
-      this->BuildHist(subtraction_trick_nidx);
-      this->AllReduceHist(subtraction_trick_nidx, reducer);
+      if (!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)) {
+        // Calculate other histogram manually
+        this->BuildHist(subtraction_trick_nidx);
+        this->AllReduceHist(subtraction_trick_nidx, reducer, 1);
+      }
     }
   }
 
@@ -587,12 +614,17 @@ struct GPUHistMakerDevice {
     }
     evaluator_.ApplyTreeSplit(candidate, p_tree);
 
-    node_sum_gradients[tree[candidate.nid].LeftChild()] = candidate.split.left_sum;
-    node_sum_gradients[tree[candidate.nid].RightChild()] = candidate.split.right_sum;
+    const auto& parent = tree[candidate.nid];
+    std::size_t max_nidx = std::max(parent.LeftChild(), parent.RightChild());
+    // Grow as needed
+    if (node_sum_gradients.size() <= max_nidx) {
+      node_sum_gradients.resize(max_nidx * 2 + 1);
+    }
+    node_sum_gradients[parent.LeftChild()] = candidate.split.left_sum;
+    node_sum_gradients[parent.RightChild()] = candidate.split.right_sum;
 
-    interaction_constraints.Split(candidate.nid, tree[candidate.nid].SplitIndex(),
-                                  tree[candidate.nid].LeftChild(),
-                                  tree[candidate.nid].RightChild());
+    interaction_constraints.Split(candidate.nid, parent.SplitIndex(), parent.LeftChild(),
+                                  parent.RightChild());
   }
 
   GPUExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) {
@@ -605,8 +637,9 @@ struct GPUHistMakerDevice {
                    GradientPairPrecise{}, thrust::plus<GradientPairPrecise>{});
     rabit::Allreduce<rabit::op::Sum, double>(reinterpret_cast<double*>(&root_sum), 2);
 
+    hist.AllocateHistograms({kRootNIdx});
     this->BuildHist(kRootNIdx);
-    this->AllReduceHist(kRootNIdx, reducer);
+    this->AllReduceHist(kRootNIdx, reducer, 1);
 
     // Remember root stats
     node_sum_gradients[kRootNIdx] = root_sum;
@@ -624,7 +657,8 @@ struct GPUHistMakerDevice {
                   RegTree* p_tree, dh::AllReducer* reducer,
                   HostDeviceVector<bst_node_t>* p_out_position) {
     auto& tree = *p_tree;
-    Driver<GPUExpandEntry> driver(static_cast<TrainParam::TreeGrowPolicy>(param.grow_policy));
+    // Process maximum 32 nodes at a time
+    Driver<GPUExpandEntry> driver(param, 32);
 
     monitor.Start("Reset");
     this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_);
@@ -634,48 +668,44 @@ struct GPUHistMakerDevice {
     driver.Push({ this->InitRoot(p_tree, reducer) });
     monitor.Stop("InitRoot");
 
-    auto num_leaves = 1;
-
     // The set of leaves that can be expanded asynchronously
     auto expand_set = driver.Pop();
     while (!expand_set.empty()) {
-      auto new_candidates =
-          pinned.GetSpan<GPUExpandEntry>(expand_set.size() * 2, GPUExpandEntry());
-
-      for (auto i = 0ull; i < expand_set.size(); i++) {
-        auto candidate = expand_set.at(i);
-        if (!candidate.IsValid(param, num_leaves)) {
-          continue;
-        }
+      for (auto& candidate : expand_set) {
         this->ApplySplit(candidate, p_tree);
+      }
+      // Get the candidates we are allowed to expand further
+      // e.g. We do not bother further processing nodes whose children are beyond max depth
+      std::vector<GPUExpandEntry> filtered_expand_set;
+      std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set),
+                   [&](const auto& e) { return driver.IsChildValid(e); });
+
+
+      auto new_candidates =
+          pinned.GetSpan<GPUExpandEntry>(filtered_expand_set.size() * 2, GPUExpandEntry());
+
+      for (const auto& e : filtered_expand_set) {
+        monitor.Start("UpdatePosition");
+        // Update position is only run when child is valid, instead of right after apply
+        // split (as in approx tree method).  Hense we have the finalise position call
+        // in GPU Hist.
+        this->UpdatePosition(e, p_tree);
+        monitor.Stop("UpdatePosition");
+      }
 
-        num_leaves++;
+      monitor.Start("BuildHist");
+      this->BuildHistLeftRight(filtered_expand_set, reducer, tree);
+      monitor.Stop("BuildHist");
 
+      for (auto i = 0ull; i < filtered_expand_set.size(); i++) {
+        auto candidate = filtered_expand_set.at(i);
         int left_child_nidx = tree[candidate.nid].LeftChild();
         int right_child_nidx = tree[candidate.nid].RightChild();
-        // Only create child entries if needed_
-        if (GPUExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx),
-                                         num_leaves)) {
-          monitor.Start("UpdatePosition");
-          // Update position is only run when child is valid, instead of right after apply
-          // split (as in approx tree method).  Hense we have the finalise position call
-          // in GPU Hist.
-          this->UpdatePosition(candidate, p_tree);
-          monitor.Stop("UpdatePosition");
-
-          monitor.Start("BuildHist");
-          this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer);
-          monitor.Stop("BuildHist");
-
-          monitor.Start("EvaluateSplits");
-          this->EvaluateLeftRightSplits(candidate, left_child_nidx, right_child_nidx, *p_tree,
-                                        new_candidates.subspan(i * 2, 2));
-          monitor.Stop("EvaluateSplits");
-        } else {
-          // Set default
-          new_candidates[i * 2] = GPUExpandEntry();
-          new_candidates[i * 2 + 1] = GPUExpandEntry();
-        }
+
+        monitor.Start("EvaluateSplits");
+        this->EvaluateLeftRightSplits(candidate, left_child_nidx, right_child_nidx, *p_tree,
+                                      new_candidates.subspan(i * 2, 2));
+        monitor.Stop("EvaluateSplits");
       }
       dh::DefaultStream().Sync();
       driver.Push(new_candidates.begin(), new_candidates.end());
diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index 9101863c6f18..cddc3a40bf70 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -155,6 +155,7 @@ void QuantileHistMaker::Builder::InitRoot(
 
 void QuantileHistMaker::Builder::AddSplitsToTree(
           const std::vector<CPUExpandEntry>& expand,
+          Driver<CPUExpandEntry>* driver, 
           RegTree *p_tree,
           int *num_leaves,
           std::vector<CPUExpandEntry>* nodes_for_apply_split,
@@ -164,11 +165,14 @@ void QuantileHistMaker::Builder::AddSplitsToTree(
   const bool is_loss_guided = static_cast<TrainParam::TreeGrowPolicy>(param_.grow_policy)
                               != TrainParam::kDepthWise;
   std::vector<uint16_t> complete_node_ids;
+  std::cout << "expand.size():" << expand.size() << std::endl;
   for (auto const& entry : expand) {
-    if (entry.IsValid(param_, *num_leaves)) {
+    if (driver->IsChildValid(entry)) {
+
+    // if (entry.IsValid(param_, *num_leaves)) {
       nodes_for_apply_split->push_back(entry);
       evaluator_->ApplyTreeSplit(entry, p_tree);
-      (*num_leaves)++;
+      // (*num_leaves)++;
       complete_node_ids.push_back((*p_tree)[entry.nid].LeftChild());
       complete_node_ids.push_back((*p_tree)[entry.nid].RightChild());
       *is_left_small = entry.split.left_sum.GetHess() <= entry.split.right_sum.GetHess();
@@ -179,6 +183,8 @@ void QuantileHistMaker::Builder::AddSplitsToTree(
         smalest_nodes_mask[(*p_tree)[entry.nid].RightChild()] = true;
         smalest_nodes_mask[ (*p_tree)[entry.nid].LeftChild()] = false;
       }
+    } else {
+      std::cout << "Not valid!!! entry.nid:" << entry.nid << std::endl;
     }
   }
   child_node_ids_ = complete_node_ids;
@@ -232,11 +238,14 @@ void QuantileHistMaker::Builder::ExpandTree(
     RegTree* p_tree,
     const std::vector<GradientPair>& gpair_h,
     HostDeviceVector<bst_node_t> *p_out_position) {
+  std::cout << "ExpandTree 1" << std::endl;
   monitor_->Start("ExpandTree");
   int num_leaves = 0;
   split_conditions_.clear();
   split_ind_.clear();
-  Driver<CPUExpandEntry> driver(static_cast<TrainParam::TreeGrowPolicy>(param_.grow_policy));
+  Driver<CPUExpandEntry> driver(param_);
+  std::cout << "ExpandTree 2" << std::endl;
+  // Driver<CPUExpandEntry> driver(static_cast<TrainParam::TreeGrowPolicy>(param_.grow_policy));
   std::vector<CPUExpandEntry> expand;
   size_t page_id{0};
   std::vector<size_t>& row_indices = *row_set_collection_.Data();
@@ -252,21 +261,28 @@ void QuantileHistMaker::Builder::ExpandTree(
                        TrainParam::kDepthWise ? false : true;
 
   InitRoot<BinIdxType, any_missing>(gmat, p_fmat, p_tree, gpair_h, &num_leaves, &expand);
+  std::cout << "ExpandTree 3" << std::endl;
   driver.Push(expand[0]);
   child_node_ids_.clear();
   child_node_ids_.emplace_back(0);
   int32_t depth = 0;
   while (!driver.IsEmpty()) {
     std::unordered_map<uint32_t, bool> smalest_nodes_mask;
+    std::cout << "ExpandTree before POP:" << depth << std::endl;
     expand = driver.Pop();
-    depth = expand[0].depth + 1;
+    std::cout << "ExpandTree after POP:" << depth << std::endl;
+    if (expand.size()) {
+      depth = expand[0].depth + 1;
+    }
+    std::cout << "ExpandTree depth:" << depth << std::endl;
     std::vector<CPUExpandEntry> nodes_for_apply_split;
     std::vector<CPUExpandEntry> nodes_to_evaluate;
     nodes_for_explicit_hist_build_.clear();
     nodes_for_subtraction_trick_.clear();
     bool is_left_small = false;
-    AddSplitsToTree(expand, p_tree, &num_leaves, &nodes_for_apply_split,
+    AddSplitsToTree(expand, &driver, p_tree, &num_leaves, &nodes_for_apply_split,
                     &smalest_nodes_mask, depth, &is_left_small);
+    std::cout << "AddSplitsToTree finished" << std::endl;
 
     if (nodes_for_apply_split.size() != 0) {
       monitor_->Start("ApplySplit");
@@ -289,9 +305,11 @@ void QuantileHistMaker::Builder::ExpandTree(
           true);
         ++page_id;
       }
+    std::cout << "UpdatePositionDispatched finished" << std::endl;
 
       monitor_->Stop("ApplySplit");
       SplitSiblings(nodes_for_apply_split, &nodes_to_evaluate, p_tree);
+    std::cout << "SplitSiblings finished" << std::endl;
       if (param_.max_depth == 0 || depth < param_.max_depth) {
         size_t i = 0;
         monitor_->Start("BuildHist");
@@ -309,6 +327,7 @@ void QuantileHistMaker::Builder::ExpandTree(
           std::copy(merged_thread_ids_set[nid].begin(),
                     merged_thread_ids_set[nid].end(), merged_thread_ids[nid].begin());
         }
+            std::cout << "merged_thread_ids_set finished" << std::endl;
 
         for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
           CommonRowPartitioner &partitioner = this->partitioner_.at(i);
@@ -319,25 +338,33 @@ void QuantileHistMaker::Builder::ExpandTree(
               &(partitioner.GetNodeAssignments()), &merged_thread_ids);
           ++i;
         }
+            std::cout << "BuildHist finished" << std::endl;
+
         monitor_->Stop("BuildHist");
         monitor_->Start("EvaluateSplits");
         auto ft = p_fmat->Info().feature_types.ConstHostSpan();
         evaluator_->EvaluateSplits(this->histogram_builder_->Histogram(),
                                   feature_values_, ft, *p_tree, &nodes_to_evaluate);
         monitor_->Stop("EvaluateSplits");
+            std::cout << "EvaluateSplits finished" << std::endl;
       }
+      std::cout << "nodes_for_apply_split.size():" << nodes_for_apply_split.size() << std::endl;
       for (size_t i = 0; i < nodes_for_apply_split.size(); ++i) {
         CPUExpandEntry left_node = nodes_to_evaluate.at(i * 2 + 0);
         CPUExpandEntry right_node = nodes_to_evaluate.at(i * 2 + 1);
         driver.Push(left_node);
         driver.Push(right_node);
       }
+            std::cout << "DRIVERPOP finished" << std::endl;
     }
   }
 
   auto &h_out_position = p_out_position->HostVector();
+  std::cout << "LeafPartition started" << std::endl;
   this->LeafPartition(*p_tree, &h_out_position);
+  std::cout << "LeafPartition finished" << std::endl;
   monitor_->Stop(__func__);
+  std::cout << "ExpandTree finished" << std::endl;
 }
 
 void QuantileHistMaker::Builder::UpdateTree(HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h
index 9e283f183e33..684aaa2b0e6a 100644
--- a/src/tree/updater_quantile_hist.h
+++ b/src/tree/updater_quantile_hist.h
@@ -174,11 +174,12 @@ class QuantileHistMaker: public TreeUpdater {
                        RegTree *p_tree);
 
     void AddSplitsToTree(const std::vector<CPUExpandEntry>& expand,
+                         Driver<CPUExpandEntry>* driver, 
                          RegTree *p_tree,
                          int *num_leaves,
                          std::vector<CPUExpandEntry>* nodes_for_apply_split,
-                         std::unordered_map<uint32_t, bool>* smalest_nodes_mask_ptr, size_t depth
-                         , bool * is_left_small);
+                         std::unordered_map<uint32_t, bool>* smalest_nodes_mask_ptr, size_t depth,
+                         bool * is_left_small);
 
     template <typename BinIdxType, bool any_missing>
     void ExpandTree(const GHistIndexMatrix& gmat,
diff --git a/tests/cpp/common/test_column_matrix.cc b/tests/cpp/common/test_column_matrix.cc
index 59e49fd8823c..f846142d665e 100644
--- a/tests/cpp/common/test_column_matrix.cc
+++ b/tests/cpp/common/test_column_matrix.cc
@@ -106,4 +106,4 @@ TEST(HistIndexCreationWithExternalMemory, Test) {
   TestGHistIndexMatrixCreation(40);
 }
 }  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost
\ No newline at end of file
diff --git a/tests/cpp/data/test_proxy_dmatrix.cu b/tests/cpp/data/test_proxy_dmatrix.cu
index d9f315a8f144..a599ada6da50 100644
--- a/tests/cpp/data/test_proxy_dmatrix.cu
+++ b/tests/cpp/data/test_proxy_dmatrix.cu
@@ -19,7 +19,7 @@ TEST(ProxyDMatrix, DeviceData) {
                     .GenerateColumnarArrayInterface(&label_storage);
 
   DMatrixProxy proxy;
-  proxy.SetData(data.c_str());
+  proxy.SetCUDAArray(data.c_str());
   proxy.SetInfo("label", labels.c_str());
 
   ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr<CupyAdapter>));
@@ -34,7 +34,7 @@ TEST(ProxyDMatrix, DeviceData) {
   data = RandomDataGenerator(kRows, kCols, 0)
                     .Device(0)
                     .GenerateColumnarArrayInterface(&columnar_storage);
-  proxy.SetData(data.c_str());
+  proxy.SetCUDAArray(data.c_str());
   ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr<CudfAdapter>));
   ASSERT_EQ(dmlc::get<std::shared_ptr<CudfAdapter>>(proxy.Adapter())->NumRows(),
             kRows);
diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc
index f9fe7d38660d..00201769bc03 100644
--- a/tests/cpp/gbm/test_gbtree.cc
+++ b/tests/cpp/gbm/test_gbtree.cc
@@ -1,16 +1,17 @@
 /*!
  * Copyright 2019-2022 XGBoost contributors
  */
-#include <gtest/gtest.h>
 #include <dmlc/filesystem.h>
+#include <gtest/gtest.h>
 #include <xgboost/generic_parameters.h>
 
+#include "../../../src/data/adapter.h"
+#include "../../../src/data/proxy_dmatrix.h"
+#include "../../../src/gbm/gbtree.h"
+#include "../helpers.h"
 #include "xgboost/base.h"
 #include "xgboost/host_device_vector.h"
 #include "xgboost/learner.h"
-#include "../helpers.h"
-#include "../../../src/gbm/gbtree.h"
-#include "../../../src/data/adapter.h"
 #include "xgboost/predictor.h"
 
 namespace xgboost {
@@ -246,53 +247,78 @@ TEST(Dart, JsonIO) {
   ASSERT_NE(get<Array>(model["model"]["weight_drop"]).size(), 0ul);
 }
 
-TEST(Dart, Prediction) {
-  size_t constexpr kRows = 16, kCols = 10;
+namespace {
+class Dart : public testing::TestWithParam<char const*> {
+ public:
+  void Run(std::string predictor) {
+    size_t constexpr kRows = 16, kCols = 10;
 
-  HostDeviceVector<float> data;
-  auto array_str = RandomDataGenerator(kRows, kCols, 0).GenerateArrayInterface(&data);
-  auto p_mat = GetDMatrixFromData(data.HostVector(), kRows, kCols);
+    HostDeviceVector<float> data;
+    auto rng = RandomDataGenerator(kRows, kCols, 0);
+    if (predictor == "gpu_predictor") {
+      rng.Device(0);
+    }
+    auto array_str = rng.GenerateArrayInterface(&data);
+    auto p_mat = GetDMatrixFromData(data.HostVector(), kRows, kCols);
 
-  std::vector<bst_float> labels (kRows);
-  for (size_t i = 0; i < kRows; ++i) {
-    labels[i] = i % 2;
-  }
-  p_mat->SetInfo("label", labels.data(), DataType::kFloat32, kRows);
+    std::vector<bst_float> labels(kRows);
+    for (size_t i = 0; i < kRows; ++i) {
+      labels[i] = i % 2;
+    }
+    p_mat->SetInfo("label", labels.data(), DataType::kFloat32, kRows);
 
-  auto learner = std::unique_ptr<Learner>(Learner::Create({p_mat}));
-  learner->SetParam("booster", "dart");
-  learner->SetParam("rate_drop", "0.5");
-  learner->Configure();
+    auto learner = std::unique_ptr<Learner>(Learner::Create({p_mat}));
+    learner->SetParam("booster", "dart");
+    learner->SetParam("rate_drop", "0.5");
+    learner->Configure();
 
-  for (size_t i = 0; i < 16; ++i) {
-    learner->UpdateOneIter(i, p_mat);
-  }
+    for (size_t i = 0; i < 16; ++i) {
+      learner->UpdateOneIter(i, p_mat);
+    }
+
+    learner->SetParam("predictor", predictor);
 
-  HostDeviceVector<float> predts_training;
-  learner->Predict(p_mat, false, &predts_training, 0, 0, true);
-
-  HostDeviceVector<float>* inplace_predts;
-  auto adapter = std::shared_ptr<data::ArrayAdapter>(new data::ArrayAdapter{StringView{array_str}});
-  learner->InplacePredict(adapter, nullptr, PredictionType::kValue,
-                          std::numeric_limits<float>::quiet_NaN(),
-                          &inplace_predts, 0, 0);
-  CHECK(inplace_predts);
-
-  HostDeviceVector<float> predts_inference;
-  learner->Predict(p_mat, false, &predts_inference, 0, 0, false);
-
-  auto const& h_predts_training = predts_training.ConstHostVector();
-  auto const& h_predts_inference = predts_inference.ConstHostVector();
-  auto const& h_inplace_predts = inplace_predts->HostVector();
-  ASSERT_EQ(h_predts_training.size(), h_predts_inference.size());
-  ASSERT_EQ(h_inplace_predts.size(), h_predts_inference.size());
-  for (size_t i = 0; i < predts_inference.Size(); ++i) {
-    // Inference doesn't drop tree.
-    ASSERT_GT(std::abs(h_predts_training[i] - h_predts_inference[i]), kRtEps * 10);
-    // Inplace prediction is inference.
-    ASSERT_LT(h_inplace_predts[i] - h_predts_inference[i], kRtEps / 10);
+    HostDeviceVector<float> predts_training;
+    learner->Predict(p_mat, false, &predts_training, 0, 0, true);
+
+    HostDeviceVector<float>* inplace_predts;
+    std::shared_ptr<data::DMatrixProxy> x{new data::DMatrixProxy{}};
+    if (predictor == "gpu_predictor") {
+      x->SetCUDAArray(array_str.c_str());
+    } else {
+      x->SetArrayData(array_str.c_str());
+    }
+    learner->InplacePredict(x, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
+                            &inplace_predts, 0, 0);
+    CHECK(inplace_predts);
+
+    HostDeviceVector<float> predts_inference;
+    learner->Predict(p_mat, false, &predts_inference, 0, 0, false);
+
+    auto const& h_predts_training = predts_training.ConstHostVector();
+    auto const& h_predts_inference = predts_inference.ConstHostVector();
+    auto const& h_inplace_predts = inplace_predts->HostVector();
+    ASSERT_EQ(h_predts_training.size(), h_predts_inference.size());
+    ASSERT_EQ(h_inplace_predts.size(), h_predts_inference.size());
+    for (size_t i = 0; i < predts_inference.Size(); ++i) {
+      // Inference doesn't drop tree.
+      ASSERT_GT(std::abs(h_predts_training[i] - h_predts_inference[i]), kRtEps * 10);
+      // Inplace prediction is inference.
+      ASSERT_LT(h_inplace_predts[i] - h_predts_inference[i], kRtEps / 10);
+    }
   }
-}
+};
+}  // anonymous namespace
+
+TEST_P(Dart, Prediction) { this->Run(GetParam()); }
+
+#if defined(XGBOOST_USE_CUDA)
+INSTANTIATE_TEST_SUITE_P(PredictorTypes, Dart,
+                         testing::Values("auto", "cpu_predictor", "gpu_predictor"));
+#else
+INSTANTIATE_TEST_SUITE_P(PredictorTypes, Dart, testing::Values("auto", "cpu_predictor"));
+#endif  // defined(XGBOOST_USE_CUDA)
+
 
 std::pair<Json, Json> TestModelSlice(std::string booster) {
   size_t constexpr kRows = 1000, kCols = 100, kForest = 2, kClasses = 3;
@@ -485,19 +511,20 @@ TEST(GBTree, PredictRange) {
     // inplace predict
     HostDeviceVector<float> raw_storage;
     auto raw = RandomDataGenerator{n_samples, n_features, 0.5}.GenerateArrayInterface(&raw_storage);
-    std::shared_ptr<data::ArrayAdapter> x{new data::ArrayAdapter{StringView{raw}}};
+    std::shared_ptr<data::DMatrixProxy> x{new data::DMatrixProxy{}};
+    x->SetArrayData(raw.data());
 
     HostDeviceVector<float>* out_predt;
-    learner->InplacePredict(x, nullptr, PredictionType::kValue,
-                            std::numeric_limits<float>::quiet_NaN(), &out_predt, 0, 2);
+    learner->InplacePredict(x, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
+                            &out_predt, 0, 2);
     auto h_out_predt = out_predt->HostVector();
-    learner->InplacePredict(x, nullptr, PredictionType::kValue,
-                            std::numeric_limits<float>::quiet_NaN(), &out_predt, 0, 0);
+    learner->InplacePredict(x, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
+                            &out_predt, 0, 0);
     auto h_out_predt_full = out_predt->HostVector();
 
     ASSERT_TRUE(std::equal(h_out_predt.begin(), h_out_predt.end(), h_out_predt_full.begin()));
 
-    ASSERT_THROW(learner->InplacePredict(x, nullptr, PredictionType::kValue,
+    ASSERT_THROW(learner->InplacePredict(x, PredictionType::kValue,
                                          std::numeric_limits<float>::quiet_NaN(), &out_predt, 0, 3),
                  dmlc::Error);
   }
diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc
index f43747abdd9e..5b03f31d8d7a 100644
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -5,11 +5,12 @@
 #include <gtest/gtest.h>
 #include <xgboost/predictor.h>
 
+#include "../../../src/data/adapter.h"
+#include "../../../src/data/proxy_dmatrix.h"
+#include "../../../src/gbm/gbtree.h"
+#include "../../../src/gbm/gbtree_model.h"
 #include "../helpers.h"
 #include "test_predictor.h"
-#include "../../../src/gbm/gbtree_model.h"
-#include "../../../src/gbm/gbtree.h"
-#include "../../../src/data/adapter.h"
 
 namespace xgboost {
 TEST(CpuPredictor, Basic) {
@@ -172,8 +173,11 @@ TEST(CpuPredictor, InplacePredict) {
     HostDeviceVector<float> data;
     gen.GenerateDense(&data);
     ASSERT_EQ(data.Size(), kRows * kCols);
-    std::shared_ptr<data::DenseAdapter> x{
-      new data::DenseAdapter(data.HostPointer(), kRows, kCols)};
+    std::shared_ptr<data::DMatrixProxy> x{new data::DMatrixProxy{}};
+    auto array_interface = GetArrayInterface(&data, kRows, kCols);
+    std::string arr_str;
+    Json::Dump(array_interface, &arr_str);
+    x->SetArrayData(arr_str.data());
     TestInplacePrediction(x, "cpu_predictor", kRows, kCols, -1);
   }
 
@@ -182,9 +186,15 @@ TEST(CpuPredictor, InplacePredict) {
     HostDeviceVector<bst_row_t> rptrs;
     HostDeviceVector<bst_feature_t> columns;
     gen.GenerateCSR(&data, &rptrs, &columns);
-    std::shared_ptr<data::CSRAdapter> x{new data::CSRAdapter(
-        rptrs.HostPointer(), columns.HostPointer(), data.HostPointer(), kRows,
-        data.Size(), kCols)};
+    auto data_interface = GetArrayInterface(&data, kRows * kCols, 1);
+    auto rptr_interface = GetArrayInterface(&rptrs, kRows + 1, 1);
+    auto col_interface = GetArrayInterface(&columns, kRows * kCols, 1);
+    std::string data_str, rptr_str, col_str;
+    Json::Dump(data_interface, &data_str);
+    Json::Dump(rptr_interface, &rptr_str);
+    Json::Dump(col_interface, &col_str);
+    std::shared_ptr<data::DMatrixProxy> x{new data::DMatrixProxy};
+    x->SetCSRData(rptr_str.data(), col_str.data(), data_str.data(), kCols, true);
     TestInplacePrediction(x, "cpu_predictor", kRows, kCols, -1);
   }
 }
diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index 3113bc62b018..0dbbc8d4588e 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -1,17 +1,19 @@
 /*!
  * Copyright 2017-2020 XGBoost contributors
  */
-#include <gtest/gtest.h>
 #include <dmlc/filesystem.h>
+#include <gtest/gtest.h>
 #include <xgboost/c_api.h>
-#include <xgboost/predictor.h>
-#include <xgboost/logging.h>
 #include <xgboost/learner.h>
+#include <xgboost/logging.h>
+#include <xgboost/predictor.h>
+
 #include <string>
 
-#include "../helpers.h"
-#include "../../../src/gbm/gbtree_model.h"
 #include "../../../src/data/device_adapter.cuh"
+#include "../../../src/data/proxy_dmatrix.h"
+#include "../../../src/gbm/gbtree_model.h"
+#include "../helpers.h"
 #include "test_predictor.h"
 
 namespace xgboost {
@@ -135,8 +137,9 @@ TEST(GPUPredictor, InplacePredictCupy) {
   gen.Device(0);
   HostDeviceVector<float> data;
   std::string interface_str = gen.GenerateArrayInterface(&data);
-  auto x = std::make_shared<data::CupyAdapter>(interface_str);
-  TestInplacePrediction(x, "gpu_predictor", kRows, kCols, 0);
+  std::shared_ptr<DMatrix> p_fmat{new data::DMatrixProxy};
+  dynamic_cast<data::DMatrixProxy*>(p_fmat.get())->SetCUDAArray(interface_str.c_str());
+  TestInplacePrediction(p_fmat, "gpu_predictor", kRows, kCols, 0);
 }
 
 TEST(GPUPredictor, InplacePredictCuDF) {
@@ -145,8 +148,9 @@ TEST(GPUPredictor, InplacePredictCuDF) {
   gen.Device(0);
   std::vector<HostDeviceVector<float>> storage(kCols);
   auto interface_str = gen.GenerateColumnarArrayInterface(&storage);
-  auto x = std::make_shared<data::CudfAdapter>(interface_str);
-  TestInplacePrediction(x, "gpu_predictor", kRows, kCols, 0);
+  std::shared_ptr<DMatrix> p_fmat{new data::DMatrixProxy};
+  dynamic_cast<data::DMatrixProxy*>(p_fmat.get())->SetCUDAArray(interface_str.c_str());
+  TestInplacePrediction(p_fmat, "gpu_predictor", kRows, kCols, 0);
 }
 
 TEST(GPUPredictor, MGPU_InplacePredict) {  // NOLINT
@@ -160,10 +164,10 @@ TEST(GPUPredictor, MGPU_InplacePredict) {  // NOLINT
   gen.Device(1);
   HostDeviceVector<float> data;
   std::string interface_str = gen.GenerateArrayInterface(&data);
-  auto x = std::make_shared<data::CupyAdapter>(interface_str);
-  TestInplacePrediction(x, "gpu_predictor", kRows, kCols, 1);
-  EXPECT_THROW(TestInplacePrediction(x, "gpu_predictor", kRows, kCols, 0),
-               dmlc::Error);
+  std::shared_ptr<DMatrix> p_fmat{new data::DMatrixProxy};
+  dynamic_cast<data::DMatrixProxy*>(p_fmat.get())->SetCUDAArray(interface_str.c_str());
+  TestInplacePrediction(p_fmat, "gpu_predictor", kRows, kCols, 1);
+  EXPECT_THROW(TestInplacePrediction(p_fmat, "gpu_predictor", kRows, kCols, 0), dmlc::Error);
 }
 
 TEST(GpuPredictor, LesserFeatures) {
diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc
index e1d8b096a6eb..832d2cf4ceb2 100644
--- a/tests/cpp/predictor/test_predictor.cc
+++ b/tests/cpp/predictor/test_predictor.cc
@@ -2,19 +2,20 @@
  * Copyright 2020-2021 by Contributors
  */
 
+#include "test_predictor.h"
+
 #include <gtest/gtest.h>
-#include <xgboost/predictor.h>
 #include <xgboost/data.h>
-#include <xgboost/host_device_vector.h>
 #include <xgboost/generic_parameters.h>
+#include <xgboost/host_device_vector.h>
+#include <xgboost/predictor.h>
 
-#include "test_predictor.h"
-
-#include "../helpers.h"
-#include "../../../src/data/adapter.h"
-#include "../../../src/common/io.h"
-#include "../../../src/common/categorical.h"
 #include "../../../src/common/bitfield.h"
+#include "../../../src/common/categorical.h"
+#include "../../../src/common/io.h"
+#include "../../../src/data/adapter.h"
+#include "../../../src/data/proxy_dmatrix.h"
+#include "../helpers.h"
 
 namespace xgboost {
 TEST(Predictor, PredictionCache) {
@@ -83,9 +84,8 @@ void TestTrainingPrediction(size_t rows, size_t bins,
   train("gpu_predictor", &predictions_1);
 }
 
-void TestInplacePrediction(dmlc::any x, std::string predictor,
-                           bst_row_t rows, bst_feature_t cols,
-                           int32_t device) {
+void TestInplacePrediction(std::shared_ptr<DMatrix> x, std::string predictor, bst_row_t rows,
+                           bst_feature_t cols, int32_t device) {
   size_t constexpr kClasses { 4 };
   auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(device);
   std::shared_ptr<DMatrix> m = gen.GenerateDMatrix(true, false, kClasses);
@@ -105,24 +105,21 @@ void TestInplacePrediction(dmlc::any x, std::string predictor,
   }
 
   HostDeviceVector<float> *p_out_predictions_0{nullptr};
-  learner->InplacePredict(x, nullptr, PredictionType::kMargin,
-                          std::numeric_limits<float>::quiet_NaN(),
+  learner->InplacePredict(x, PredictionType::kMargin, std::numeric_limits<float>::quiet_NaN(),
                           &p_out_predictions_0, 0, 2);
   CHECK(p_out_predictions_0);
   HostDeviceVector<float> predict_0 (p_out_predictions_0->Size());
   predict_0.Copy(*p_out_predictions_0);
 
   HostDeviceVector<float> *p_out_predictions_1{nullptr};
-  learner->InplacePredict(x, nullptr, PredictionType::kMargin,
-                          std::numeric_limits<float>::quiet_NaN(),
+  learner->InplacePredict(x, PredictionType::kMargin, std::numeric_limits<float>::quiet_NaN(),
                           &p_out_predictions_1, 2, 4);
   CHECK(p_out_predictions_1);
   HostDeviceVector<float> predict_1 (p_out_predictions_1->Size());
   predict_1.Copy(*p_out_predictions_1);
 
   HostDeviceVector<float>* p_out_predictions{nullptr};
-  learner->InplacePredict(x, nullptr, PredictionType::kMargin,
-                          std::numeric_limits<float>::quiet_NaN(),
+  learner->InplacePredict(x, PredictionType::kMargin, std::numeric_limits<float>::quiet_NaN(),
                           &p_out_predictions, 0, 4);
 
   auto& h_pred = p_out_predictions->HostVector();
@@ -378,25 +375,28 @@ void TestSparsePrediction(float sparsity, std::string predictor) {
   learner->SetParam("predictor", predictor);
   learner->Predict(Xy, false, &sparse_predt, 0, 0);
 
-  std::vector<float> with_nan(kRows * kCols, std::numeric_limits<float>::quiet_NaN());
-  for (auto const& page : Xy->GetBatches<SparsePage>()) {
+  HostDeviceVector<float> with_nan(kRows * kCols, std::numeric_limits<float>::quiet_NaN());
+  auto& h_with_nan = with_nan.HostVector();
+  for (auto const &page : Xy->GetBatches<SparsePage>()) {
     auto batch = page.GetView();
     for (size_t i = 0; i < batch.Size(); ++i) {
       auto row = batch[i];
       for (auto e : row) {
-        with_nan[i * kCols + e.index] = e.fvalue;
+        h_with_nan[i * kCols + e.index] = e.fvalue;
       }
     }
   }
 
   learner->SetParam("predictor", "cpu_predictor");
   // Xcode_12.4 doesn't compile with `std::make_shared`.
-  auto dense = std::shared_ptr<data::DenseAdapter>(
-      new data::DenseAdapter(with_nan.data(), kRows, kCols));
+  auto dense = std::shared_ptr<DMatrix>(new data::DMatrixProxy{});
+  auto array_interface = GetArrayInterface(&with_nan, kRows, kCols);
+  std::string arr_str;
+  Json::Dump(array_interface, &arr_str);
+  dynamic_cast<data::DMatrixProxy *>(dense.get())->SetArrayData(arr_str.data());
   HostDeviceVector<float> *p_dense_predt;
-  learner->InplacePredict(dmlc::any(dense), nullptr, PredictionType::kValue,
-                          std::numeric_limits<float>::quiet_NaN(), &p_dense_predt,
-                          0, 0);
+  learner->InplacePredict(dense, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
+                          &p_dense_predt, 0, 0);
 
   auto const& dense_predt = *p_dense_predt;
   if (predictor == "cpu_predictor") {
diff --git a/tests/cpp/predictor/test_predictor.h b/tests/cpp/predictor/test_predictor.h
index 9c5d99afef65..1ff96096c533 100644
--- a/tests/cpp/predictor/test_predictor.h
+++ b/tests/cpp/predictor/test_predictor.h
@@ -61,9 +61,8 @@ void TestTrainingPrediction(size_t rows, size_t bins, std::string tree_method,
                             std::shared_ptr<DMatrix> p_full,
                             std::shared_ptr<DMatrix> p_hist);
 
-void TestInplacePrediction(dmlc::any x, std::string predictor,
-                           bst_row_t rows, bst_feature_t cols,
-                           int32_t device = -1);
+void TestInplacePrediction(std::shared_ptr<DMatrix> x, std::string predictor, bst_row_t rows,
+                           bst_feature_t cols, int32_t device = -1);
 
 void TestPredictionWithLesserFeatures(std::string preditor_name);
 
diff --git a/tests/cpp/tree/gpu_hist/test_driver.cu b/tests/cpp/tree/gpu_hist/test_driver.cu
index d35f3510f628..8e7164e37bec 100644
--- a/tests/cpp/tree/gpu_hist/test_driver.cu
+++ b/tests/cpp/tree/gpu_hist/test_driver.cu
@@ -6,41 +6,58 @@ namespace xgboost {
 namespace tree {
 
 TEST(GpuHist, DriverDepthWise) {
-  Driver<GPUExpandEntry> driver(TrainParam::kDepthWise);
+  TrainParam p;
+  p.InitAllowUnknown(Args{});
+  p.grow_policy = TrainParam::kDepthWise;
+  Driver<GPUExpandEntry> driver(p, 2);
   EXPECT_TRUE(driver.Pop().empty());
   DeviceSplitCandidate split;
   split.loss_chg = 1.0f;
-  GPUExpandEntry root(0, 0, split, .0f, .0f, .0f);
+  split.left_sum = {0.0f, 1.0f};
+  split.right_sum = {0.0f, 1.0f};
+  GPUExpandEntry root(0, 0, split, 2.0f, 1.0f, 1.0f);
   driver.Push({root});
   EXPECT_EQ(driver.Pop().front().nid, 0);
-  driver.Push({GPUExpandEntry{1, 1, split, .0f, .0f, .0f}});
-  driver.Push({GPUExpandEntry{2, 1, split, .0f, .0f, .0f}});
-  driver.Push({GPUExpandEntry{3, 2, split, .0f, .0f, .0f}});
-  // Should return entries from level 1
+  driver.Push({GPUExpandEntry{1, 1, split, 2.0f, 1.0f, 1.0f}});
+  driver.Push({GPUExpandEntry{2, 1, split, 2.0f, 1.0f, 1.0f}});
+  driver.Push({GPUExpandEntry{3, 1, split, 2.0f, 1.0f, 1.0f}});
+  driver.Push({GPUExpandEntry{4, 2, split, 2.0f, 1.0f, 1.0f}});
+  // Should return 2 entries from level 1
+  // as we limited the driver to pop maximum 2 nodes
   auto res = driver.Pop();
   EXPECT_EQ(res.size(), 2);
   for (auto &e : res) {
     EXPECT_EQ(e.depth, 1);
   }
+
+  // Should now return 1 entry from level 1
+  res = driver.Pop();
+  EXPECT_EQ(res.size(), 1);
+  EXPECT_EQ(res.at(0).depth, 1);
+
   res = driver.Pop();
-  EXPECT_EQ(res[0].depth, 2);
+  EXPECT_EQ(res.at(0).depth, 2);
   EXPECT_TRUE(driver.Pop().empty());
 }
 
 TEST(GpuHist, DriverLossGuided) {
   DeviceSplitCandidate high_gain;
+  high_gain.left_sum = {0.0f, 1.0f};
+  high_gain.right_sum = {0.0f, 1.0f};
   high_gain.loss_chg = 5.0f;
-  DeviceSplitCandidate low_gain;
+  DeviceSplitCandidate low_gain = high_gain;
   low_gain.loss_chg = 1.0f;
 
-  Driver<GPUExpandEntry> driver(TrainParam::kLossGuide);
+  TrainParam p;
+  p.grow_policy=TrainParam::kLossGuide;
+  Driver<GPUExpandEntry> driver(p);
   EXPECT_TRUE(driver.Pop().empty());
-  GPUExpandEntry root(0, 0, high_gain, .0f, .0f, .0f);
+  GPUExpandEntry root(0, 0, high_gain, 2.0f, 1.0f, 1.0f );
   driver.Push({root});
   EXPECT_EQ(driver.Pop().front().nid, 0);
   // Select high gain first
-  driver.Push({GPUExpandEntry{1, 1, low_gain, .0f, .0f, .0f}});
-  driver.Push({GPUExpandEntry{2, 2, high_gain, .0f, .0f, .0f}});
+  driver.Push({GPUExpandEntry{1, 1, low_gain, 2.0f, 1.0f, 1.0f}});
+  driver.Push({GPUExpandEntry{2, 2, high_gain, 2.0f, 1.0f, 1.0f}});
   auto res = driver.Pop();
   EXPECT_EQ(res.size(), 1);
   EXPECT_EQ(res[0].nid, 2);
@@ -49,8 +66,8 @@ TEST(GpuHist, DriverLossGuided) {
   EXPECT_EQ(res[0].nid, 1);
 
   // If equal gain, use nid
-  driver.Push({GPUExpandEntry{2, 1, low_gain, .0f, .0f, .0f}});
-  driver.Push({GPUExpandEntry{1, 1, low_gain, .0f, .0f, .0f}});
+  driver.Push({GPUExpandEntry{2, 1, low_gain, 2.0f, 1.0f, 1.0f}});
+  driver.Push({GPUExpandEntry{1, 1, low_gain, 2.0f, 1.0f, 1.0f}});
   res = driver.Pop();
   EXPECT_EQ(res[0].nid, 1);
   res = driver.Pop();
diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu
index 3b543a48d7cc..75d97b681a61 100644
--- a/tests/cpp/tree/gpu_hist/test_histogram.cu
+++ b/tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -95,7 +95,6 @@ TEST(Histogram, GPUDeterministic) {
   std::vector<int> shm_sizes{48 * 1024, 64 * 1024, 160 * 1024};
   for (bool is_dense : is_dense_array) {
     for (int shm_size : shm_sizes) {
-      TestDeterministicHistogram<GradientPair>(is_dense, shm_size);
       TestDeterministicHistogram<GradientPairPrecise>(is_dense, shm_size);
     }
   }
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index b3c08736c996..e6069cdfdd4d 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -27,31 +27,40 @@ TEST(GpuHist, DeviceHistogram) {
   // Ensures that node allocates correctly after reaching `kStopGrowingSize`.
   dh::safe_cuda(cudaSetDevice(0));
   constexpr size_t kNBins = 128;
-  constexpr size_t kNNodes = 4;
+  constexpr int kNNodes = 4;
   constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
-  DeviceHistogram<GradientPairPrecise, kStopGrowing> histogram;
+  DeviceHistogramStorage<GradientPairPrecise, kStopGrowing> histogram;
   histogram.Init(0, kNBins);
-  for (size_t i = 0; i < kNNodes; ++i) {
-    histogram.AllocateHistogram(i);
+  for (int i = 0; i < kNNodes; ++i) {
+    histogram.AllocateHistograms({i});
   }
   histogram.Reset();
   ASSERT_EQ(histogram.Data().size(), kStopGrowing);
 
   // Use allocated memory but do not erase nidx_map.
-  for (size_t i = 0; i < kNNodes; ++i) {
-    histogram.AllocateHistogram(i);
+  for (int i = 0; i < kNNodes; ++i) {
+    histogram.AllocateHistograms({i});
   }
-  for (size_t i = 0; i < kNNodes; ++i) {
+  for (int i = 0; i < kNNodes; ++i) {
     ASSERT_TRUE(histogram.HistogramExists(i));
   }
 
-  // Erase existing nidx_map.
-  for (size_t i = kNNodes; i < kNNodes * 2; ++i) {
-    histogram.AllocateHistogram(i);
-  }
-  for (size_t i = 0; i < kNNodes; ++i) {
-    ASSERT_FALSE(histogram.HistogramExists(i));
+  // Add two new nodes
+  histogram.AllocateHistograms({kNNodes});
+  histogram.AllocateHistograms({kNNodes + 1});
+
+  // Old cached nodes should still exist
+  for (int i = 0; i < kNNodes; ++i) {
+    ASSERT_TRUE(histogram.HistogramExists(i));
   }
+
+  // Should be deleted
+  ASSERT_FALSE(histogram.HistogramExists(kNNodes));
+  // Most recent node should exist
+  ASSERT_TRUE(histogram.HistogramExists(kNNodes + 1));
+
+  // Add same node again - should fail
+  EXPECT_ANY_THROW(histogram.AllocateHistograms({kNNodes + 1}););
 }
 
 std::vector<GradientPairPrecise> GetHostHistGpair() {
@@ -96,9 +105,9 @@ void TestBuildHist(bool use_shared_memory_histograms) {
 
   thrust::host_vector<common::CompressedByteT> h_gidx_buffer (page->gidx_buffer.HostVector());
   maker.row_partitioner.reset(new RowPartitioner(0, kNRows));
-  maker.hist.AllocateHistogram(0);
+  maker.hist.AllocateHistograms({0});
   maker.gpair = gpair.DeviceSpan();
-  maker.histogram_rounding = CreateRoundingFactor<GradientSumT>(maker.gpair);;
+  maker.histogram_rounding = CreateRoundingFactor<GradientSumT>(maker.gpair);
 
   BuildGradientHistogram(
       page->GetDeviceAccessor(0), maker.feature_groups->DeviceAccessor(0),
@@ -106,7 +115,7 @@ void TestBuildHist(bool use_shared_memory_histograms) {
       maker.hist.GetNodeHistogram(0), maker.histogram_rounding,
       !use_shared_memory_histograms);
 
-  DeviceHistogram<GradientSumT>& d_hist = maker.hist;
+  DeviceHistogramStorage<GradientSumT>& d_hist = maker.hist;
 
   auto node_histogram = d_hist.GetNodeHistogram(0);
   // d_hist.data stored in float, not gradient pair
@@ -129,12 +138,10 @@ void TestBuildHist(bool use_shared_memory_histograms) {
 
 TEST(GpuHist, BuildHistGlobalMem) {
   TestBuildHist<GradientPairPrecise>(false);
-  TestBuildHist<GradientPair>(false);
 }
 
 TEST(GpuHist, BuildHistSharedMem) {
   TestBuildHist<GradientPairPrecise>(true);
-  TestBuildHist<GradientPair>(true);
 }
 
 HistogramCutsWrapper GetHostCutMatrix () {
@@ -198,7 +205,7 @@ TEST(GpuHist, EvaluateRootSplit) {
 
   // Initialize GPUHistMakerDevice::hist
   maker.hist.Init(0, (max_bins - 1) * kNCols);
-  maker.hist.AllocateHistogram(0);
+  maker.hist.AllocateHistograms({0});
   // Each row of hist_gpair represents gpairs for one feature.
   // Each entry represents a bin.
   std::vector<GradientPairPrecise> hist_gpair = GetHostHistGpair();
diff --git a/tests/cpp/tree/test_tree_policy.cc b/tests/cpp/tree/test_tree_policy.cc
index 15f4cd31bc99..1387e94a8e91 100644
--- a/tests/cpp/tree/test_tree_policy.cc
+++ b/tests/cpp/tree/test_tree_policy.cc
@@ -141,9 +141,9 @@ TEST_F(TestGrowPolicy, Approx) {
 
 TEST_F(TestGrowPolicy, Hist) {
   this->TestTreeGrowPolicy("hist", "depthwise");
-  this->TestTreeGrowPolicy("hist", "lossguide");
+  // this->TestTreeGrowPolicy("hist", "lossguide");
 
-  this->TestCombination("hist");
+  // this->TestCombination("hist");
 }
 
 #if defined(XGBOOST_USE_CUDA)
diff --git a/tests/distributed/distributed_gpu.py b/tests/distributed/distributed_gpu.py
index a2ab6d398018..d10d2aed4884 100644
--- a/tests/distributed/distributed_gpu.py
+++ b/tests/distributed/distributed_gpu.py
@@ -8,46 +8,44 @@
 def run_test(name, params_fun):
     """Runs a distributed GPU test."""
     # Always call this before using distributed module
-    xgb.rabit.init()
-    rank = xgb.rabit.get_rank()
-    world = xgb.rabit.get_world_size()
-
-    # Load file, file will be automatically sharded in distributed mode.
-    dtrain = xgb.DMatrix('../../demo/data/agaricus.txt.train')
-    dtest = xgb.DMatrix('../../demo/data/agaricus.txt.test')
-
-    params, n_rounds = params_fun(rank)
-
-    # Specify validations set to watch performance
-    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
-
-    # Run training, all the features in training API is available.
-    # Currently, this script only support calling train once for fault recovery purpose.
-    bst = xgb.train(params, dtrain, n_rounds, watchlist, early_stopping_rounds=2)
-
-    # Have each worker save its model
-    model_name = "test.model.%s.%d" % (name, rank)
-    bst.dump_model(model_name, with_stats=True)
-    xgb.rabit.allreduce(np.ones((1, 1)), xgb.rabit.Op.MAX)  # sync
-    xgb.rabit.tracker_print("Finished training\n")
-
-    if (rank == 0):
-        for i in range(0, world):
-            model_name_root = "test.model.%s.%d" % (name, i)
-            for j in range(0, world):
-                if i == j:
-                    continue
-                with open(model_name_root, 'r') as model_root:
-                    contents_root = model_root.read()
-                    model_name_rank = "test.model.%s.%d" % (name, j)
-                    with open(model_name_rank, 'r') as model_rank:
-                        contents_rank = model_rank.read()
-                        if contents_root != contents_rank:
-                            raise Exception(
-                                ('Worker models diverged: test.model.%s.%d '
-                                 'differs from test.model.%s.%d') % (name, i, name, j))
-
-    xgb.rabit.finalize()
+    with xgb.rabit.RabitContext():
+        rank = xgb.rabit.get_rank()
+        world = xgb.rabit.get_world_size()
+
+        # Load file, file will be automatically sharded in distributed mode.
+        dtrain = xgb.DMatrix('../../demo/data/agaricus.txt.train')
+        dtest = xgb.DMatrix('../../demo/data/agaricus.txt.test')
+
+        params, n_rounds = params_fun(rank)
+
+        # Specify validations set to watch performance
+        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
+
+        # Run training, all the features in training API is available.
+        # Currently, this script only support calling train once for fault recovery purpose.
+        bst = xgb.train(params, dtrain, n_rounds, watchlist, early_stopping_rounds=2)
+
+        # Have each worker save its model
+        model_name = "test.model.%s.%d" % (name, rank)
+        bst.dump_model(model_name, with_stats=True)
+        xgb.rabit.allreduce(np.ones((1, 1)), xgb.rabit.Op.MAX)  # sync
+        xgb.rabit.tracker_print("Finished training\n")
+
+        if (rank == 0):
+            for i in range(0, world):
+                model_name_root = "test.model.%s.%d" % (name, i)
+                for j in range(0, world):
+                    if i == j:
+                        continue
+                    with open(model_name_root, 'r') as model_root:
+                        contents_root = model_root.read()
+                        model_name_rank = "test.model.%s.%d" % (name, j)
+                        with open(model_name_rank, 'r') as model_rank:
+                            contents_rank = model_rank.read()
+                            if contents_root != contents_rank:
+                                raise Exception(
+                                    ('Worker models diverged: test.model.%s.%d '
+                                     'differs from test.model.%s.%d') % (name, i, name, j))
 
 
 base_params = {
diff --git a/tests/distributed/test_basic.py b/tests/distributed/test_basic.py
index f7c1ffee3efc..db2916b39a3c 100644
--- a/tests/distributed/test_basic.py
+++ b/tests/distributed/test_basic.py
@@ -2,28 +2,23 @@
 import xgboost as xgb
 
 # Always call this before using distributed module
-xgb.rabit.init()
+with xgb.rabit.RabitContext():
+    # Load file, file will be automatically sharded in distributed mode.
+    dtrain = xgb.DMatrix('../../demo/data/agaricus.txt.train')
+    dtest = xgb.DMatrix('../../demo/data/agaricus.txt.test')
 
-# Load file, file will be automatically sharded in distributed mode.
-dtrain = xgb.DMatrix('../../demo/data/agaricus.txt.train')
-dtest = xgb.DMatrix('../../demo/data/agaricus.txt.test')
+    # Specify parameters via map, definition are same as c++ version
+    param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
 
-# Specify parameters via map, definition are same as c++ version
-param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
+    # Specify validations set to watch performance
+    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
+    num_round = 20
 
-# Specify validations set to watch performance
-watchlist = [(dtest, 'eval'), (dtrain, 'train')]
-num_round = 20
+    # Run training, all the features in training API is available.
+    # Currently, this script only support calling train once for fault recovery purpose.
+    bst = xgb.train(param, dtrain, num_round, watchlist, early_stopping_rounds=2)
 
-# Run training, all the features in training API is available.
-# Currently, this script only support calling train once for fault recovery purpose.
-bst = xgb.train(param, dtrain, num_round, watchlist, early_stopping_rounds=2)
-
-# Save the model, only ask process 0 to save the model.
-if xgb.rabit.get_rank() == 0:
-    bst.save_model("test.model")
-    xgb.rabit.tracker_print("Finished training\n")
-
-# Notify the tracker all training has been successful
-# This is only needed in distributed training.
-xgb.rabit.finalize()
+    # Save the model, only ask process 0 to save the model.
+    if xgb.rabit.get_rank() == 0:
+        bst.save_model("test.model")
+        xgb.rabit.tracker_print("Finished training\n")
diff --git a/tests/distributed/test_federated.py b/tests/distributed/test_federated.py
index 5b5b167fcd32..a3cdbc1e2912 100644
--- a/tests/distributed/test_federated.py
+++ b/tests/distributed/test_federated.py
@@ -27,31 +27,26 @@ def run_worker(port: int, world_size: int, rank: int) -> None:
         f'federated_client_key={CLIENT_KEY}',
         f'federated_client_cert={CLIENT_CERT}'
     ]
-    xgb.rabit.init([e.encode() for e in rabit_env])
-
-    # Load file, file will not be sharded in federated mode.
-    dtrain = xgb.DMatrix('agaricus.txt.train-%02d' % rank)
-    dtest = xgb.DMatrix('agaricus.txt.test-%02d' % rank)
-
-    # Specify parameters via map, definition are same as c++ version
-    param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
-
-    # Specify validations set to watch performance
-    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
-    num_round = 20
-
-    # Run training, all the features in training API is available.
-    # Currently, this script only support calling train once for fault recovery purpose.
-    bst = xgb.train(param, dtrain, num_round, evals=watchlist, early_stopping_rounds=2)
-
-    # Save the model, only ask process 0 to save the model.
-    if xgb.rabit.get_rank() == 0:
-        bst.save_model("test.model.json")
-        xgb.rabit.tracker_print("Finished training\n")
-
-    # Notify the tracker all training has been successful
-    # This is only needed in distributed training.
-    xgb.rabit.finalize()
+    with xgb.rabit.RabitContext([e.encode() for e in rabit_env]):
+        # Load file, file will not be sharded in federated mode.
+        dtrain = xgb.DMatrix('agaricus.txt.train-%02d' % rank)
+        dtest = xgb.DMatrix('agaricus.txt.test-%02d' % rank)
+
+        # Specify parameters via map, definition are same as c++ version
+        param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
+
+        # Specify validations set to watch performance
+        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
+        num_round = 20
+
+        # Run training, all the features in training API is available.
+        bst = xgb.train(param, dtrain, num_round, evals=watchlist,
+                        early_stopping_rounds=2)
+
+        # Save the model, only ask process 0 to save the model.
+        if xgb.rabit.get_rank() == 0:
+            bst.save_model("test.model.json")
+            xgb.rabit.tracker_print("Finished training\n")
 
 
 def run_test() -> None:
diff --git a/tests/distributed/test_issue3402.py b/tests/distributed/test_issue3402.py
index e3b87931bf67..7a40d3420ebb 100644
--- a/tests/distributed/test_issue3402.py
+++ b/tests/distributed/test_issue3402.py
@@ -2,78 +2,73 @@
 import xgboost as xgb
 import numpy as np
 
-xgb.rabit.init()
+with xgb.rabit.RabitContext():
+    X = [
+      [15.00,28.90,29.00,3143.70,0.00,0.10,69.90,90.00,13726.07,0.00,2299.70,0.00,0.05,
+       4327.03,0.00,24.00,0.18,3.00,0.41,3.77,0.00,0.00,4.00,0.00,150.92,0.00,2.00,0.00,
+       0.01,138.00,1.00,0.02,69.90,0.00,0.83,5.00,0.01,0.12,47.30,0.00,296.00,0.16,0.00,
+       0.00,27.70,7.00,7.25,4406.16,1.00,0.54,245.28,3.00,0.06,306.50,5143.00,29.00,23.74,
+       548.00,2.00,68.00,70.90,25.45,0.39,0.00,0.01,497.11,0.00,42.00,83.00,4.00,0.00,1.00,
+       0.00,104.35,94.12,0.03,79.23,237.69,1.00,0.04,0.01,0.02,2.00,108.81,7.00,12.00,0.46,
+       31.00,0.00,0.15,74.59,0.00,19.50,0.00,0.75,0.06,0.08,118.00,35.90,0.01,0.07,1.00,
+       0.03,81.18,13.33,0.00,0.00,0.00,0.00,0.00,0.41,0.00,0.15,57.00,0.00,22.00,449.68,
+       0.00,0.00,2.00,195.26,51.58,306.50,0.10,1.00,0.00,258.00,21.00,0.43,3.00,16.00,0.00,
+       0.00,0.00,0.00,1.00,74.51,4.00,0.02,35.90,30.00,8.69,0.00,0.36,5.00,2.00,3.00,0.26,
+       9.50,8.00,11.00,11918.15,0.00,258.00,13.00,9.04,0.14,604.65,0.92,74.59,0.00,0.00,
+       72.76,1.00,0.22,64.00,2.00,0.00,0.00,0.02,0.00,305.50,27.70,0.02,0.00,177.00,14.00,
+       0.00,0.05,90.00,0.03,0.00,1.00,0.43,4.00,0.05,0.09,431.00,0.00,2.00,0.00,0.00,1.00,
+       0.25,0.17,0.00,0.00,21.00,94.12,0.17,0.00,0.00,0.00,548.00,0.00,68.00,0.00,0.00,9.50,
+       25.45,1390.31,7.00,0.00,2.00,310.70,0.00,0.01,0.01,0.03,81.40,1.00,0.02,0.00,9.00,
+       6.00,0.00,175.76,36.00,0.00,20.75,2.00,0.00,0.00,0.00,0.22,74.16,0.10,56.81,0.00,
+       2197.03,0.00,197.66,0.00,55.00,20.00,367.18,22.00,0.00,0.01,1510.26,0.24,0.00,0.01,
+       0.00,11.00,278.10,61.70,278.10,0.00,0.08,0.57,1.00,0.65,255.60,0.00,0.86,0.25,70.95,
+       2299.70,0.23,0.05,92.70,1.00,38.00,0.00,0.00,56.81,21.85,0.00,23.74,0.00,2.00,0.03,
+       2.00,0.00,347.58,30.00,243.55,109.00,0.00,296.00,6.00,6.00,0.00,0.00,109.00,2299.70,
+       0.00,0.01,0.08,1.00,4745.09,4.00,0.18,0.00,0.17,0.02,0.00,1.00,147.13,71.07,2115.16,
+       0.00,0.26,0.00,43.00,604.90,49.44,4327.03,0.68,0.75,0.10,86.36,52.98,0.20,0.00,22.50,
+       305.50,0.00,1.00,0.00,7.00,0.78,0.00,296.00,22.50,0.00,5.00,2979.54,1.00,14.00,51.00,
+       0.42,0.11,0.00,1.00,0.00,0.00,70.90,37.84,0.02,548.40,0.00,46.35,5.00,1.66,0.29,0.00,
+       0.02,2255.69,160.53,790.64,6775.15,0.68,19.50,2299.70,79.87,6.00,0.00,60.00,0.27,
+       233.77,10.00,0.00,0.00,23.00,82.27,1.00,0.00,1.00,0.42,1.00,0.01,0.40,0.41,9.50,2299.70,
+       46.30,0.00,0.00,2299.70,3.00,0.00,0.00,83.00,1.00],
+      [48.00,80.89,69.90,11570.00,26.00,0.40,468.00,0.00,5739.46,0.00,1480.00,90.89,0.00,
+       14042.09,3600.08,120.00,0.09,31.00,0.25,2.36,0.00,7.00,22.00,0.00,257.59,0.00,6.00,
+       260.00,0.05,313.00,1.00,0.07,468.00,0.00,0.67,11.00,0.02,0.32,0.00,0.00,1387.61,0.34,
+       0.00,0.00,158.04,6.00,13.98,12380.05,0.00,0.16,122.74,3.00,0.18,291.33,7517.79,124.00,
+       45.08,900.00,1.00,0.00,577.25,79.75,0.39,0.00,0.00,244.62,0.00,57.00,178.00,19.00,
+       0.00,1.00,386.10,103.51,480.00,0.06,129.41,334.31,1.00,0.06,0.00,0.06,3.00,125.55,
+       0.00,76.00,0.14,30.00,0.00,0.03,411.29,791.33,55.00,0.12,3.80,0.07,0.01,188.00,221.11,
+       0.01,0.15,1.00,0.18,144.32,15.00,0.00,0.05,0.00,3.00,0.00,0.20,0.00,0.14,62.00,0.06,
+       55.00,239.35,0.00,0.00,2.00,534.20,747.50,400.57,0.40,0.00,0.00,219.98,30.00,0.25,
+       1.00,70.00,0.02,0.04,0.00,0.00,7.00,747.50,8.67,0.06,271.01,28.00,5.63,75.39,0.46,
+       11.00,3.00,19.00,0.38,131.74,23.00,39.00,30249.41,0.00,202.68,2.00,64.94,0.03,2787.68,
+       0.54,35.00,0.02,106.03,25.00,1.00,0.10,45.00,2.00,0.00,0.00,0.00,0.00,449.27,172.38,
+       0.05,0.00,550.00,130.00,2006.55,0.07,0.00,0.03,0.00,5.00,0.21,22.00,0.05,0.01,1011.40,
+       0.00,4.00,3600.08,0.00,1.00,1.00,1.00,0.00,3.00,9.00,270.00,0.12,0.03,0.00,0.00,820.00,
+       1827.50,0.00,100.33,0.00,131.74,53.16,9557.97,7.00,0.00,11.00,180.81,0.00,0.01,0.04,
+       0.02,1480.00,0.92,0.05,0.00,15.00,6.00,0.00,161.42,28.00,169.00,35.60,4.00,0.12,0.00,
+       0.00,0.27,230.56,0.42,171.90,0.00,28407.51,1.00,883.10,0.00,261.00,9.00,1031.67,38.00,
+       0.00,0.04,1607.68,0.32,791.33,0.04,1403.00,2.00,2260.50,88.08,2260.50,0.00,0.12,0.75,
+       3.00,0.00,1231.68,0.07,0.60,0.24,0.00,0.00,0.15,0.14,753.50,1.00,95.00,7.00,0.26,
+       77.63,38.45,0.00,42.65,0.00,14.00,0.07,6.00,0.00,1911.59,43.00,386.77,1324.80,0.00,
+       518.00,10.00,10.00,0.11,0.00,1324.80,0.00,0.00,0.02,0.16,1.00,10492.12,5.00,0.94,
+       5.00,0.08,0.10,1.00,0.92,3731.49,105.81,6931.39,0.00,0.43,0.00,118.00,5323.71,81.66,
+       14042.09,0.08,0.20,0.40,96.64,0.00,0.08,4.00,1028.82,353.00,0.00,2.00,32.00,43.00,
+       5.16,75.39,900.00,232.10,3.00,5.00,6049.88,1.00,126.00,46.00,0.59,0.15,0.00,8.00,
+       7.00,0.00,577.25,0.00,0.07,2415.10,0.00,83.72,9.00,1.76,0.20,0.00,0.17,3278.65,155.26,
+       4415.50,22731.62,1.00,55.00,0.00,499.94,22.00,0.58,67.00,0.21,341.72,16.00,0.00,965.07,
+       17.00,138.41,0.00,0.00,1.00,0.14,1.00,0.02,0.35,1.69,369.00,1300.00,25.00,0.00,0.01,
+       0.00,0.00,0.00,0.00,52.00,8.00]]
+    X = np.array(X)
+    y = [1, 0]
 
-X = [
-  [15.00,28.90,29.00,3143.70,0.00,0.10,69.90,90.00,13726.07,0.00,2299.70,0.00,0.05,
-   4327.03,0.00,24.00,0.18,3.00,0.41,3.77,0.00,0.00,4.00,0.00,150.92,0.00,2.00,0.00,
-   0.01,138.00,1.00,0.02,69.90,0.00,0.83,5.00,0.01,0.12,47.30,0.00,296.00,0.16,0.00,
-   0.00,27.70,7.00,7.25,4406.16,1.00,0.54,245.28,3.00,0.06,306.50,5143.00,29.00,23.74,
-   548.00,2.00,68.00,70.90,25.45,0.39,0.00,0.01,497.11,0.00,42.00,83.00,4.00,0.00,1.00,
-   0.00,104.35,94.12,0.03,79.23,237.69,1.00,0.04,0.01,0.02,2.00,108.81,7.00,12.00,0.46,
-   31.00,0.00,0.15,74.59,0.00,19.50,0.00,0.75,0.06,0.08,118.00,35.90,0.01,0.07,1.00,
-   0.03,81.18,13.33,0.00,0.00,0.00,0.00,0.00,0.41,0.00,0.15,57.00,0.00,22.00,449.68,
-   0.00,0.00,2.00,195.26,51.58,306.50,0.10,1.00,0.00,258.00,21.00,0.43,3.00,16.00,0.00,
-   0.00,0.00,0.00,1.00,74.51,4.00,0.02,35.90,30.00,8.69,0.00,0.36,5.00,2.00,3.00,0.26,
-   9.50,8.00,11.00,11918.15,0.00,258.00,13.00,9.04,0.14,604.65,0.92,74.59,0.00,0.00,
-   72.76,1.00,0.22,64.00,2.00,0.00,0.00,0.02,0.00,305.50,27.70,0.02,0.00,177.00,14.00,
-   0.00,0.05,90.00,0.03,0.00,1.00,0.43,4.00,0.05,0.09,431.00,0.00,2.00,0.00,0.00,1.00,
-   0.25,0.17,0.00,0.00,21.00,94.12,0.17,0.00,0.00,0.00,548.00,0.00,68.00,0.00,0.00,9.50,
-   25.45,1390.31,7.00,0.00,2.00,310.70,0.00,0.01,0.01,0.03,81.40,1.00,0.02,0.00,9.00,
-   6.00,0.00,175.76,36.00,0.00,20.75,2.00,0.00,0.00,0.00,0.22,74.16,0.10,56.81,0.00,
-   2197.03,0.00,197.66,0.00,55.00,20.00,367.18,22.00,0.00,0.01,1510.26,0.24,0.00,0.01,
-   0.00,11.00,278.10,61.70,278.10,0.00,0.08,0.57,1.00,0.65,255.60,0.00,0.86,0.25,70.95,
-   2299.70,0.23,0.05,92.70,1.00,38.00,0.00,0.00,56.81,21.85,0.00,23.74,0.00,2.00,0.03,
-   2.00,0.00,347.58,30.00,243.55,109.00,0.00,296.00,6.00,6.00,0.00,0.00,109.00,2299.70,
-   0.00,0.01,0.08,1.00,4745.09,4.00,0.18,0.00,0.17,0.02,0.00,1.00,147.13,71.07,2115.16,
-   0.00,0.26,0.00,43.00,604.90,49.44,4327.03,0.68,0.75,0.10,86.36,52.98,0.20,0.00,22.50,
-   305.50,0.00,1.00,0.00,7.00,0.78,0.00,296.00,22.50,0.00,5.00,2979.54,1.00,14.00,51.00,
-   0.42,0.11,0.00,1.00,0.00,0.00,70.90,37.84,0.02,548.40,0.00,46.35,5.00,1.66,0.29,0.00,
-   0.02,2255.69,160.53,790.64,6775.15,0.68,19.50,2299.70,79.87,6.00,0.00,60.00,0.27,
-   233.77,10.00,0.00,0.00,23.00,82.27,1.00,0.00,1.00,0.42,1.00,0.01,0.40,0.41,9.50,2299.70,
-   46.30,0.00,0.00,2299.70,3.00,0.00,0.00,83.00,1.00],
-  [48.00,80.89,69.90,11570.00,26.00,0.40,468.00,0.00,5739.46,0.00,1480.00,90.89,0.00,
-   14042.09,3600.08,120.00,0.09,31.00,0.25,2.36,0.00,7.00,22.00,0.00,257.59,0.00,6.00,
-   260.00,0.05,313.00,1.00,0.07,468.00,0.00,0.67,11.00,0.02,0.32,0.00,0.00,1387.61,0.34,
-   0.00,0.00,158.04,6.00,13.98,12380.05,0.00,0.16,122.74,3.00,0.18,291.33,7517.79,124.00,
-   45.08,900.00,1.00,0.00,577.25,79.75,0.39,0.00,0.00,244.62,0.00,57.00,178.00,19.00,
-   0.00,1.00,386.10,103.51,480.00,0.06,129.41,334.31,1.00,0.06,0.00,0.06,3.00,125.55,
-   0.00,76.00,0.14,30.00,0.00,0.03,411.29,791.33,55.00,0.12,3.80,0.07,0.01,188.00,221.11,
-   0.01,0.15,1.00,0.18,144.32,15.00,0.00,0.05,0.00,3.00,0.00,0.20,0.00,0.14,62.00,0.06,
-   55.00,239.35,0.00,0.00,2.00,534.20,747.50,400.57,0.40,0.00,0.00,219.98,30.00,0.25,
-   1.00,70.00,0.02,0.04,0.00,0.00,7.00,747.50,8.67,0.06,271.01,28.00,5.63,75.39,0.46,
-   11.00,3.00,19.00,0.38,131.74,23.00,39.00,30249.41,0.00,202.68,2.00,64.94,0.03,2787.68,
-   0.54,35.00,0.02,106.03,25.00,1.00,0.10,45.00,2.00,0.00,0.00,0.00,0.00,449.27,172.38,
-   0.05,0.00,550.00,130.00,2006.55,0.07,0.00,0.03,0.00,5.00,0.21,22.00,0.05,0.01,1011.40,
-   0.00,4.00,3600.08,0.00,1.00,1.00,1.00,0.00,3.00,9.00,270.00,0.12,0.03,0.00,0.00,820.00,
-   1827.50,0.00,100.33,0.00,131.74,53.16,9557.97,7.00,0.00,11.00,180.81,0.00,0.01,0.04,
-   0.02,1480.00,0.92,0.05,0.00,15.00,6.00,0.00,161.42,28.00,169.00,35.60,4.00,0.12,0.00,
-   0.00,0.27,230.56,0.42,171.90,0.00,28407.51,1.00,883.10,0.00,261.00,9.00,1031.67,38.00,
-   0.00,0.04,1607.68,0.32,791.33,0.04,1403.00,2.00,2260.50,88.08,2260.50,0.00,0.12,0.75,
-   3.00,0.00,1231.68,0.07,0.60,0.24,0.00,0.00,0.15,0.14,753.50,1.00,95.00,7.00,0.26,
-   77.63,38.45,0.00,42.65,0.00,14.00,0.07,6.00,0.00,1911.59,43.00,386.77,1324.80,0.00,
-   518.00,10.00,10.00,0.11,0.00,1324.80,0.00,0.00,0.02,0.16,1.00,10492.12,5.00,0.94,
-   5.00,0.08,0.10,1.00,0.92,3731.49,105.81,6931.39,0.00,0.43,0.00,118.00,5323.71,81.66,
-   14042.09,0.08,0.20,0.40,96.64,0.00,0.08,4.00,1028.82,353.00,0.00,2.00,32.00,43.00,
-   5.16,75.39,900.00,232.10,3.00,5.00,6049.88,1.00,126.00,46.00,0.59,0.15,0.00,8.00,
-   7.00,0.00,577.25,0.00,0.07,2415.10,0.00,83.72,9.00,1.76,0.20,0.00,0.17,3278.65,155.26,
-   4415.50,22731.62,1.00,55.00,0.00,499.94,22.00,0.58,67.00,0.21,341.72,16.00,0.00,965.07,
-   17.00,138.41,0.00,0.00,1.00,0.14,1.00,0.02,0.35,1.69,369.00,1300.00,25.00,0.00,0.01,
-   0.00,0.00,0.00,0.00,52.00,8.00]]
-X = np.array(X)
-y = [1, 0]
+    dtrain = xgb.DMatrix(X, label=y)
 
-dtrain = xgb.DMatrix(X, label=y)
+    param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic' }
+    watchlist  = [(dtrain,'train')]
+    num_round = 2
+    bst = xgb.train(param, dtrain, num_round, watchlist)
 
-param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic' }
-watchlist  = [(dtrain,'train')]
-num_round = 2
-bst = xgb.train(param, dtrain, num_round, watchlist)
-
-if xgb.rabit.get_rank() == 0:
-  bst.save_model("test_issue3402.model")
-  xgb.rabit.tracker_print("Finished training\n")
-
-# Notify the tracker all training has been successful
-# This is only needed in distributed training.
-xgb.rabit.finalize()
+    if xgb.rabit.get_rank() == 0:
+      bst.save_model("test_issue3402.model")
+      xgb.rabit.tracker_print("Finished training\n")
diff --git a/tests/python/test_tracker.py b/tests/python/test_tracker.py
index 2e113898f4de..885221aae4ae 100644
--- a/tests/python/test_tracker.py
+++ b/tests/python/test_tracker.py
@@ -16,10 +16,9 @@ def test_rabit_tracker():
     rabit_env = []
     for k, v in worker_env.items():
         rabit_env.append(f"{k}={v}".encode())
-    xgb.rabit.init(rabit_env)
-    ret = xgb.rabit.broadcast('test1234', 0)
-    assert str(ret) == 'test1234'
-    xgb.rabit.finalize()
+    with xgb.rabit.RabitContext(rabit_env):
+        ret = xgb.rabit.broadcast('test1234', 0)
+        assert str(ret) == 'test1234'
 
 
 def run_rabit_ops(client, n_workers):
diff --git a/tests/python/testing.py b/tests/python/testing.py
index 8633e4caa52d..29947f227f86 100644
--- a/tests/python/testing.py
+++ b/tests/python/testing.py
@@ -7,7 +7,6 @@
 from contextlib import contextmanager
 from io import StringIO
 from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
-from xgboost.compat import DASK_INSTALLED
 import pytest
 import gc
 import xgboost as xgb
@@ -44,8 +43,14 @@ def no_sklearn():
 
 
 def no_dask():
-    return {'condition': not DASK_INSTALLED,
-            'reason': 'Dask is not installed'}
+    try:
+        import pkg_resources
+
+        pkg_resources.get_distribution("dask")
+        DASK_INSTALLED = True
+    except pkg_resources.DistributionNotFound:
+        DASK_INSTALLED = False
+    return {"condition": not DASK_INSTALLED, "reason": "Dask is not installed"}
 
 
 def no_pandas():