Merge branch 'master' of https://github.com/dmlc/xgboost into optimization_part_applysplit
ShvetsKS committed May 22, 2022
2 parents efb4f50 + f6babc8 commit 8cd8cd5
Showing 63 changed files with 1,406 additions and 1,082 deletions.
5 changes: 1 addition & 4 deletions Makefile
@@ -91,10 +91,7 @@ endif
# If any of the dask tests failed, contributor won't see the other error.
mypy:
cd python-package; \
mypy ./xgboost/dask.py && \
mypy ./xgboost/rabit.py && \
mypy ./xgboost/tracker.py && \
mypy ./xgboost/sklearn.py && \
mypy . && \
mypy ../demo/guide-python/external_memory.py && \
mypy ../demo/guide-python/categorical.py && \
mypy ../demo/guide-python/cat_in_the_dat.py && \
6 changes: 6 additions & 0 deletions doc/tutorials/saving_model.rst
@@ -68,6 +68,12 @@ a filename with ``.json`` or ``.ubj`` as file extension, the latter is the exten
xgb.save(bst, 'model_file_name.json')
.. note::

Only load models from JSON files that were produced by XGBoost. Attempting to load
JSON files that were produced by an external source may lead to undefined behaviors
and crashes.

While for memory snapshot, UBJSON is the default starting with xgboost 1.6.

***************************************************************
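For readers of this documentation change who use the JVM packages, here is a minimal sketch of the same save/load flow with XGBoost4J. It is illustrative only: it assumes the standard ml.dmlc.xgboost4j.scala API with tiny in-memory data, and that the on-disk format is selected from the file extension (.json or .ubj) as the tutorial describes.

import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, XGBoost}

// Train a tiny booster on illustrative data (2 rows, 2 features).
val train = new DMatrix(Array(1.0f, 2.0f, 3.0f, 4.0f), 2, 2)
train.setLabel(Array(0.0f, 1.0f))
val booster: Booster = XGBoost.train(train, Map("objective" -> "binary:logistic"), 2)

// The extension picks the format: JSON here, UBJSON with a ".ubj" suffix.
booster.saveModel("model_file_name.json")

// Only load JSON/UBJ files that XGBoost itself produced.
val reloaded: Booster = XGBoost.loadModel("model_file_name.json")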
7 changes: 3 additions & 4 deletions include/xgboost/gbm.h
@@ -111,15 +111,14 @@ class GradientBooster : public Model, public Configurable {
/*!
* \brief Inplace prediction.
*
* \param x A type erased data adapter.
* \param p_fmat A proxy DMatrix that contains the data and related
* meta info.
* \param missing Missing value in the data.
* \param [in,out] out_preds The output preds.
* \param layer_begin (Optional) Beginning of boosted tree layer used for prediction.
* \param layer_end (Optional) End of booster layer. 0 means do not limit trees.
*/
virtual void InplacePredict(dmlc::any const &, std::shared_ptr<DMatrix>, float,
PredictionCacheEntry*,
uint32_t,
virtual void InplacePredict(std::shared_ptr<DMatrix>, float, PredictionCacheEntry*, uint32_t,
uint32_t) const {
LOG(FATAL) << "Inplace predict is not supported by current booster.";
}
13 changes: 4 additions & 9 deletions include/xgboost/learner.h
@@ -139,21 +139,16 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
/*!
* \brief Inplace prediction.
*
* \param x A type erased data adapter.
* \param p_m An optional Proxy DMatrix object storing meta info like
* base margin. Can be nullptr.
* \param p_fmat A proxy DMatrix that contains the data and related meta info.
* \param type Prediction type.
* \param missing Missing value in the data.
* \param [in,out] out_preds Pointer to output prediction vector.
* \param layer_begin Beginning of boosted tree layer used for prediction.
* \param layer_end End of booster layer. 0 means do not limit trees.
*/
virtual void InplacePredict(dmlc::any const &x,
std::shared_ptr<DMatrix> p_m,
PredictionType type,
float missing,
HostDeviceVector<bst_float> **out_preds,
uint32_t layer_begin, uint32_t layer_end) = 0;
virtual void InplacePredict(std::shared_ptr<DMatrix> p_m, PredictionType type, float missing,
HostDeviceVector<bst_float>** out_preds, uint32_t layer_begin,
uint32_t layer_end) = 0;

/*!
* \brief Calculate feature score. See doc in C API for outputs.
12 changes: 6 additions & 6 deletions include/xgboost/predictor.h
@@ -145,7 +145,9 @@ class Predictor {

/**
* \brief Inplace prediction.
* \param x Type erased data adapter.
*
* \param p_fmat A proxy DMatrix that contains the data and related
* meta info.
* \param model The model to predict from.
* \param missing Missing value in the data.
* \param [in,out] out_preds The output preds.
@@ -154,11 +156,9 @@
*
* \return True if the data can be handled by current predictor, false otherwise.
*/
virtual bool InplacePredict(dmlc::any const &x, std::shared_ptr<DMatrix> p_m,
const gbm::GBTreeModel &model, float missing,
PredictionCacheEntry *out_preds,
uint32_t tree_begin = 0,
uint32_t tree_end = 0) const = 0;
virtual bool InplacePredict(std::shared_ptr<DMatrix> p_fmat, const gbm::GBTreeModel& model,
float missing, PredictionCacheEntry* out_preds,
uint32_t tree_begin = 0, uint32_t tree_end = 0) const = 0;
/**
* \brief online prediction function, predict score for one instance at a time
* NOTE: use the batch prediction interface if possible, batch prediction is
GpuPreXGBoost.scala
@@ -61,15 +61,14 @@ class GpuPreXGBoost extends PreXGBoostProvider {
* @param estimator [[XGBoostClassifier]] or [[XGBoostRegressor]]
* @param dataset the training data
* @param params all user defined and defaulted params
* @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ])
* Boolean if building DMatrix in rabit context
* @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ])
* RDD[() => Watches] will be used as the training input
* Option[ RDD[_] ] is the optional cached RDD
*/
override def buildDatasetToRDD(estimator: Estimator[_],
dataset: Dataset[_],
params: Map[String, Any]):
XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = {
XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]) = {
GpuPreXGBoost.buildDatasetToRDD(estimator, dataset, params)
}

@@ -123,16 +122,15 @@ object GpuPreXGBoost extends PreXGBoostProvider {
* @param estimator supports XGBoostClassifier and XGBoostRegressor
* @param dataset the training data
* @param params all user defined and defaulted params
* @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ])
* Boolean if building DMatrix in rabit context
* @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ])
* RDD[() => Watches] will be used as the training input to build DMatrix
* Option[ RDD[_] ] is the optional cached RDD
*/
override def buildDatasetToRDD(
estimator: Estimator[_],
dataset: Dataset[_],
params: Map[String, Any]):
XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = {
XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]) = {

val (Seq(labelName, weightName, marginName), feturesCols, groupName, evalSets) =
estimator match {
@@ -170,7 +168,7 @@ object GpuPreXGBoost extends PreXGBoostProvider {
xgbExecParams: XGBoostExecutionParams =>
val dataMap = prepareInputData(trainingData, evalDataMap, xgbExecParams.numWorkers,
xgbExecParams.cacheTrainingSet)
(true, buildRDDWatches(dataMap, xgbExecParams, evalDataMap.isEmpty), None)
(buildRDDWatches(dataMap, xgbExecParams, evalDataMap.isEmpty), None)
}

/**
PreXGBoost.scala
@@ -101,16 +101,15 @@ object PreXGBoost extends PreXGBoostProvider {
* @param estimator supports XGBoostClassifier and XGBoostRegressor
* @param dataset the training data
* @param params all user defined and defaulted params
* @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ])
* Boolean if building DMatrix in rabit context
* @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ])
* RDD[() => Watches] will be used as the training input
* Option[RDD[_]\] is the optional cached RDD
*/
override def buildDatasetToRDD(
estimator: Estimator[_],
dataset: Dataset[_],
params: Map[String, Any]): XGBoostExecutionParams =>
(Boolean, RDD[() => Watches], Option[RDD[_]]) = {
(RDD[() => Watches], Option[RDD[_]]) = {

if (optionProvider.isDefined && optionProvider.get.providerEnabled(Some(dataset))) {
return optionProvider.get.buildDatasetToRDD(estimator, dataset, params)
@@ -172,12 +171,12 @@ object PreXGBoost extends PreXGBoostProvider {
val cachedRDD = if (xgbExecParams.cacheTrainingSet) {
Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK))
} else None
(false, trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
(trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
case Right(trainingData) =>
val cachedRDD = if (xgbExecParams.cacheTrainingSet) {
Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK))
} else None
(false, trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
(trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
}

}
@@ -324,20 +323,20 @@ object PreXGBoost extends PreXGBoostProvider {
trainingSet: RDD[XGBLabeledPoint],
evalRDDMap: Map[String, RDD[XGBLabeledPoint]] = Map(),
hasGroup: Boolean = false):
XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = {
XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]) = {

xgbExecParams: XGBoostExecutionParams =>
composeInputData(trainingSet, hasGroup, xgbExecParams.numWorkers) match {
case Left(trainingData) =>
val cachedRDD = if (xgbExecParams.cacheTrainingSet) {
Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK))
} else None
(false, trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
(trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
case Right(trainingData) =>
val cachedRDD = if (xgbExecParams.cacheTrainingSet) {
Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK))
} else None
(false, trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
(trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
}
}

PreXGBoostProvider.scala
@@ -50,16 +50,15 @@ private[scala] trait PreXGBoostProvider {
* @param estimator supports XGBoostClassifier and XGBoostRegressor
* @param dataset the training data
* @param params all user defined and defaulted params
* @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ])
* Boolean if building DMatrix in rabit context
* @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ])
* RDD[() => Watches] will be used as the training input to build DMatrix
* Option[ RDD[_] ] is the optional cached RDD
*/
def buildDatasetToRDD(
estimator: Estimator[_],
dataset: Dataset[_],
params: Map[String, Any]):
XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]])
XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]])

/**
* Transform Dataset
XGBoost.scala
@@ -286,7 +286,6 @@ object XGBoost extends Serializable {
}

private def buildDistributedBooster(
buildDMatrixInRabit: Boolean,
buildWatches: () => Watches,
xgbExecutionParam: XGBoostExecutionParams,
rabitEnv: java.util.Map[String, String],
@@ -295,11 +294,6 @@
prevBooster: Booster): Iterator[(Booster, Map[String, Array[Float]])] = {

var watches: Watches = null
if (!buildDMatrixInRabit) {
// for CPU pipeline, we need to build DMatrix out of rabit context
watches = buildWatchesAndCheck(buildWatches)
}

val taskId = TaskContext.getPartitionId().toString
val attempt = TaskContext.get().attemptNumber.toString
rabitEnv.put("DMLC_TASK_ID", taskId)
@@ -310,10 +304,7 @@
try {
Rabit.init(rabitEnv)

if (buildDMatrixInRabit) {
// for GPU pipeline, we need to move dmatrix building into rabit context
watches = buildWatchesAndCheck(buildWatches)
}
watches = buildWatchesAndCheck(buildWatches)

val numEarlyStoppingRounds = xgbExecutionParam.earlyStoppingParams.numEarlyStoppingRounds
val metrics = Array.tabulate(watches.size)(_ => Array.ofDim[Float](numRounds))
@@ -377,7 +368,7 @@
@throws(classOf[XGBoostError])
private[spark] def trainDistributed(
sc: SparkContext,
buildTrainingData: XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]),
buildTrainingData: XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]),
params: Map[String, Any]):
(Booster, Map[String, Array[Float]]) = {

@@ -396,7 +387,7 @@
}.orNull

// Get the training data RDD and the cachedRDD
val (buildDMatrixInRabit, trainingRDD, optionalCachedRDD) = buildTrainingData(xgbExecParams)
val (trainingRDD, optionalCachedRDD) = buildTrainingData(xgbExecParams)

try {
// Train for every ${savingRound} rounds and save the partially completed booster
@@ -413,9 +404,8 @@
optionWatches = Some(iter.next())
}

optionWatches.map { buildWatches => buildDistributedBooster(buildDMatrixInRabit,
buildWatches, xgbExecParams, rabitEnv, xgbExecParams.obj,
xgbExecParams.eval, prevBooster)}
optionWatches.map { buildWatches => buildDistributedBooster(buildWatches,
xgbExecParams, rabitEnv, xgbExecParams.obj, xgbExecParams.eval, prevBooster)}
.getOrElse(throw new RuntimeException("No Watches to train"))

}}
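To make the Spark-side signature change concrete, here is a small sketch of how the reshaped builder is consumed after this commit. The names XGBoostExecutionParams and Watches mirror the diff and are internal to the xgboost4j-spark package; the body is illustrative, not the full trainDistributed logic.

import org.apache.spark.rdd.RDD

// After this change the builder no longer returns the "build DMatrix in rabit"
// Boolean: both the CPU and GPU pipelines now construct the DMatrix inside the
// Rabit context, so callers only destructure the training RDD and the optional cache.
def runTraining(
    buildTrainingData: XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]),
    xgbExecParams: XGBoostExecutionParams): Unit = {
  val (trainingRDD, optionalCachedRDD) = buildTrainingData(xgbExecParams)
  // ... run buildDistributedBooster on each partition of trainingRDD ...
  optionalCachedRDD.foreach(_.unpersist())
}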
XGBoostClassifier.scala
@@ -169,6 +169,23 @@ class XGBoostClassifier (
}

override protected def train(dataset: Dataset[_]): XGBoostClassificationModel = {
val _numClasses = getNumClasses(dataset)
if (isDefined(numClass) && $(numClass) != _numClasses) {
throw new Exception("The number of classes in dataset doesn't match " +
"\'num_class\' in xgboost params.")
}

if (_numClasses == 2) {
if (!isDefined(objective)) {
// If user doesn't set objective, force it to binary:logistic
setObjective("binary:logistic")
}
} else if (_numClasses > 2) {
if (!isDefined(objective)) {
// If user doesn't set objective, force it to multi:softprob
setObjective("multi:softprob")
}
}

if (!isDefined(evalMetric) || $(evalMetric).isEmpty) {
set(evalMetric, setupDefaultEvalMetric())
@@ -178,12 +195,6 @@
set(objectiveType, "classification")
}

val _numClasses = getNumClasses(dataset)
if (isDefined(numClass) && $(numClass) != _numClasses) {
throw new Exception("The number of classes in dataset doesn't match " +
"\'num_class\' in xgboost params.")
}

// Packing with all params plus params user defined
val derivedXGBParamMap = xgboostParams ++ MLlib2XGBoostParams
val buildTrainingData = PreXGBoost.buildDatasetToRDD(this, dataset, derivedXGBParamMap)
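A short usage-level sketch of the classifier behaviour after this reordering. The DataFrame multiClassDF with six label classes is assumed for illustration; the objective-fallback cases are exercised by the new test further down in this diff.

import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier

// num_class disagreeing with the data now fails before any other setup:
// "The number of classes in dataset doesn't match 'num_class' in xgboost params."
val mismatched = new XGBoostClassifier(
  Map("num_class" -> "3", "num_round" -> 5, "num_workers" -> 2))
// mismatched.fit(multiClassDF)  // throws Exception

// With no objective set, a multi-class label column selects multi:softprob.
val classifier = new XGBoostClassifier(
  Map("num_class" -> "6", "num_round" -> 5, "num_workers" -> 2))
classifier.fit(multiClassDF)
assert(classifier.getObjective == "multi:softprob")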
XGBoostRegressor.scala
@@ -169,6 +169,11 @@ class XGBoostRegressor (

override protected def train(dataset: Dataset[_]): XGBoostRegressionModel = {

if (!isDefined(objective)) {
// If user doesn't set objective, force it to reg:squarederror
setObjective("reg:squarederror")
}

if (!isDefined(evalMetric) || $(evalMetric).isEmpty) {
set(evalMetric, setupDefaultEvalMetric())
}
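And the regressor-side counterpart, as a minimal sketch. A DataFrame trainingDF with "features"/"label" columns is assumed; parameter values are illustrative.

import ml.dmlc.xgboost4j.scala.spark.XGBoostRegressor

val regressor = new XGBoostRegressor(Map("num_round" -> 5, "num_workers" -> 2))
// No objective supplied: train() now falls back to reg:squarederror,
// which previously came from the setDefault in LearningTaskParams.
regressor.fit(trainingDF)
assert(regressor.getObjective == "reg:squarederror")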
LearningTaskParams.scala
@@ -1,5 +1,5 @@
/*
Copyright (c) 2014 by Contributors
Copyright (c) 2014-2022 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -105,7 +105,7 @@ private[spark] trait LearningTaskParams extends Params {

final def getMaximizeEvaluationMetrics: Boolean = $(maximizeEvaluationMetrics)

setDefault(objective -> "reg:squarederror", baseScore -> 0.5, trainTestRatio -> 1.0,
setDefault(baseScore -> 0.5, trainTestRatio -> 1.0,
numEarlyStoppingRounds -> 0, cacheTrainingSet -> false)
}

FeatureSizeValidatingSuite.scala
@@ -65,8 +65,6 @@ class FeatureSizeValidatingSuite extends FunSuite with PerTest {
(id, lp.label, lp.features)
}.toDF("id", "label", "features")
val xgb = new XGBoostClassifier(paramMap)
intercept[Exception] {
xgb.fit(repartitioned)
}
xgb.fit(repartitioned)
}
}
PersistenceSuite.scala
@@ -138,7 +138,7 @@ class PersistenceSuite extends FunSuite with TmpFolderPerSuite with PerTest {
val testDM = new DMatrix(Classification.test.iterator)
val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
"custom_eval" -> new EvalError, "custom_obj" -> new CustomObj(1),
"num_round" -> "10", "num_workers" -> numWorkers)
"num_round" -> "10", "num_workers" -> numWorkers, "objective" -> "binary:logistic")

val xgbc = new XGBoostClassifier(paramMap)
val xgbcPath = new File(tempDir.toFile, "xgbc").getPath
XGBoostClassifierSuite.scala
@@ -112,6 +112,34 @@ class XGBoostClassifierSuite extends FunSuite with PerTest with TmpFolderPerSuit
assert(!transformedDf.columns.contains("probability"))
}

test("objective will be set if not specifying it") {
val training = buildDataFrame(Classification.train)
val paramMap = Map("eta" -> "1", "max_depth" -> "6",
"num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod)
val xgb = new XGBoostClassifier(paramMap)
assert(!xgb.isDefined(xgb.objective))
xgb.fit(training)
assert(xgb.getObjective == "binary:logistic")

val trainingDF = buildDataFrame(MultiClassification.train)
val paramMap1 = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
"num_class" -> "6", "num_round" -> 5, "num_workers" -> numWorkers,
"tree_method" -> treeMethod)
val xgb1 = new XGBoostClassifier(paramMap1)
assert(!xgb1.isDefined(xgb1.objective))
xgb1.fit(trainingDF)
assert(xgb1.getObjective == "multi:softprob")

// shouldn't change user's objective setting
val paramMap2 = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
"num_class" -> "6", "num_round" -> 5, "num_workers" -> numWorkers,
"tree_method" -> treeMethod, "objective" -> "multi:softmax")
val xgb2 = new XGBoostClassifier(paramMap2)
assert(xgb2.getObjective == "multi:softmax")
xgb2.fit(trainingDF)
assert(xgb2.getObjective == "multi:softmax")
}

test("use base margin") {
val training1 = buildDataFrame(Classification.train)
val training2 = training1.withColumn("margin", functions.rand())
