diff --git a/Makefile b/Makefile index bfa1731f0131..64a7ff6ccce4 100644 --- a/Makefile +++ b/Makefile @@ -91,10 +91,7 @@ endif # If any of the dask tests failed, contributor won't see the other error. mypy: cd python-package; \ - mypy ./xgboost/dask.py && \ - mypy ./xgboost/rabit.py && \ - mypy ./xgboost/tracker.py && \ - mypy ./xgboost/sklearn.py && \ + mypy . && \ mypy ../demo/guide-python/external_memory.py && \ mypy ../demo/guide-python/categorical.py && \ mypy ../demo/guide-python/cat_in_the_dat.py && \ diff --git a/doc/tutorials/saving_model.rst b/doc/tutorials/saving_model.rst index 723cde431bc4..87e54054c66a 100644 --- a/doc/tutorials/saving_model.rst +++ b/doc/tutorials/saving_model.rst @@ -68,6 +68,12 @@ a filename with ``.json`` or ``.ubj`` as file extension, the latter is the exten xgb.save(bst, 'model_file_name.json') +.. note:: + + Only load models from JSON files that were produced by XGBoost. Attempting to load + JSON files that were produced by an external source may lead to undefined behaviors + and crashes. + While for memory snapshot, UBJSON is the default starting with xgboost 1.6. *************************************************************** diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h index cce92d3679f4..a731bfac84ed 100644 --- a/include/xgboost/gbm.h +++ b/include/xgboost/gbm.h @@ -111,15 +111,14 @@ class GradientBooster : public Model, public Configurable { /*! * \brief Inplace prediction. * - * \param x A type erased data adapter. + * \param p_fmat A proxy DMatrix that contains the data and related + * meta info. * \param missing Missing value in the data. * \param [in,out] out_preds The output preds. * \param layer_begin (Optional) Beginning of boosted tree layer used for prediction. * \param layer_end (Optional) End of booster layer. 0 means do not limit trees. */ - virtual void InplacePredict(dmlc::any const &, std::shared_ptr, float, - PredictionCacheEntry*, - uint32_t, + virtual void InplacePredict(std::shared_ptr, float, PredictionCacheEntry*, uint32_t, uint32_t) const { LOG(FATAL) << "Inplace predict is not supported by current booster."; } diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h index 80004e6a8a01..b16ea67ecd5c 100644 --- a/include/xgboost/learner.h +++ b/include/xgboost/learner.h @@ -139,21 +139,16 @@ class Learner : public Model, public Configurable, public dmlc::Serializable { /*! * \brief Inplace prediction. * - * \param x A type erased data adapter. - * \param p_m An optional Proxy DMatrix object storing meta info like - * base margin. Can be nullptr. + * \param p_fmat A proxy DMatrix that contains the data and related meta info. * \param type Prediction type. * \param missing Missing value in the data. * \param [in,out] out_preds Pointer to output prediction vector. * \param layer_begin Beginning of boosted tree layer used for prediction. * \param layer_end End of booster layer. 0 means do not limit trees. */ - virtual void InplacePredict(dmlc::any const &x, - std::shared_ptr p_m, - PredictionType type, - float missing, - HostDeviceVector **out_preds, - uint32_t layer_begin, uint32_t layer_end) = 0; + virtual void InplacePredict(std::shared_ptr p_m, PredictionType type, float missing, + HostDeviceVector** out_preds, uint32_t layer_begin, + uint32_t layer_end) = 0; /*! * \brief Calculate feature score. See doc in C API for outputs. diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h index 5063922617b5..33c695bc19bf 100644 --- a/include/xgboost/predictor.h +++ b/include/xgboost/predictor.h @@ -145,7 +145,9 @@ class Predictor { /** * \brief Inplace prediction. - * \param x Type erased data adapter. + * + * \param p_fmat A proxy DMatrix that contains the data and related + * meta info. * \param model The model to predict from. * \param missing Missing value in the data. * \param [in,out] out_preds The output preds. @@ -154,11 +156,9 @@ class Predictor { * * \return True if the data can be handled by current predictor, false otherwise. */ - virtual bool InplacePredict(dmlc::any const &x, std::shared_ptr p_m, - const gbm::GBTreeModel &model, float missing, - PredictionCacheEntry *out_preds, - uint32_t tree_begin = 0, - uint32_t tree_end = 0) const = 0; + virtual bool InplacePredict(std::shared_ptr p_fmat, const gbm::GBTreeModel& model, + float missing, PredictionCacheEntry* out_preds, + uint32_t tree_begin = 0, uint32_t tree_end = 0) const = 0; /** * \brief online prediction function, predict score for one instance at a time * NOTE: use the batch prediction interface if possible, batch prediction is diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala index 756b7b54b161..08d186d6f84e 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala @@ -61,15 +61,14 @@ class GpuPreXGBoost extends PreXGBoostProvider { * @param estimator [[XGBoostClassifier]] or [[XGBoostRegressor]] * @param dataset the training data * @param params all user defined and defaulted params - * @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ]) - * Boolean if building DMatrix in rabit context + * @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ]) * RDD[() => Watches] will be used as the training input * Option[ RDD[_] ] is the optional cached RDD */ override def buildDatasetToRDD(estimator: Estimator[_], dataset: Dataset[_], params: Map[String, Any]): - XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = { + XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]) = { GpuPreXGBoost.buildDatasetToRDD(estimator, dataset, params) } @@ -123,8 +122,7 @@ object GpuPreXGBoost extends PreXGBoostProvider { * @param estimator supports XGBoostClassifier and XGBoostRegressor * @param dataset the training data * @param params all user defined and defaulted params - * @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ]) - * Boolean if building DMatrix in rabit context + * @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ]) * RDD[() => Watches] will be used as the training input to build DMatrix * Option[ RDD[_] ] is the optional cached RDD */ @@ -132,7 +130,7 @@ object GpuPreXGBoost extends PreXGBoostProvider { estimator: Estimator[_], dataset: Dataset[_], params: Map[String, Any]): - XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = { + XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]) = { val (Seq(labelName, weightName, marginName), feturesCols, groupName, evalSets) = estimator match { @@ -170,7 +168,7 @@ object GpuPreXGBoost extends PreXGBoostProvider { xgbExecParams: XGBoostExecutionParams => val dataMap = prepareInputData(trainingData, evalDataMap, xgbExecParams.numWorkers, xgbExecParams.cacheTrainingSet) - (true, buildRDDWatches(dataMap, xgbExecParams, evalDataMap.isEmpty), None) + (buildRDDWatches(dataMap, xgbExecParams, evalDataMap.isEmpty), None) } /** diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala index 01eb3d0a4f32..13484f490f5b 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala @@ -101,8 +101,7 @@ object PreXGBoost extends PreXGBoostProvider { * @param estimator supports XGBoostClassifier and XGBoostRegressor * @param dataset the training data * @param params all user defined and defaulted params - * @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ]) - * Boolean if building DMatrix in rabit context + * @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ]) * RDD[() => Watches] will be used as the training input * Option[RDD[_]\] is the optional cached RDD */ @@ -110,7 +109,7 @@ object PreXGBoost extends PreXGBoostProvider { estimator: Estimator[_], dataset: Dataset[_], params: Map[String, Any]): XGBoostExecutionParams => - (Boolean, RDD[() => Watches], Option[RDD[_]]) = { + (RDD[() => Watches], Option[RDD[_]]) = { if (optionProvider.isDefined && optionProvider.get.providerEnabled(Some(dataset))) { return optionProvider.get.buildDatasetToRDD(estimator, dataset, params) @@ -172,12 +171,12 @@ object PreXGBoost extends PreXGBoostProvider { val cachedRDD = if (xgbExecParams.cacheTrainingSet) { Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK)) } else None - (false, trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) + (trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) case Right(trainingData) => val cachedRDD = if (xgbExecParams.cacheTrainingSet) { Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK)) } else None - (false, trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) + (trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) } } @@ -324,7 +323,7 @@ object PreXGBoost extends PreXGBoostProvider { trainingSet: RDD[XGBLabeledPoint], evalRDDMap: Map[String, RDD[XGBLabeledPoint]] = Map(), hasGroup: Boolean = false): - XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = { + XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]) = { xgbExecParams: XGBoostExecutionParams => composeInputData(trainingSet, hasGroup, xgbExecParams.numWorkers) match { @@ -332,12 +331,12 @@ object PreXGBoost extends PreXGBoostProvider { val cachedRDD = if (xgbExecParams.cacheTrainingSet) { Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK)) } else None - (false, trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) + (trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) case Right(trainingData) => val cachedRDD = if (xgbExecParams.cacheTrainingSet) { Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK)) } else None - (false, trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) + (trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) } } diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoostProvider.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoostProvider.scala index d133aea288dd..4c4dbdec1e53 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoostProvider.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoostProvider.scala @@ -50,8 +50,7 @@ private[scala] trait PreXGBoostProvider { * @param estimator supports XGBoostClassifier and XGBoostRegressor * @param dataset the training data * @param params all user defined and defaulted params - * @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ]) - * Boolean if building DMatrix in rabit context + * @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ]) * RDD[() => Watches] will be used as the training input to build DMatrix * Option[ RDD[_] ] is the optional cached RDD */ @@ -59,7 +58,7 @@ private[scala] trait PreXGBoostProvider { estimator: Estimator[_], dataset: Dataset[_], params: Map[String, Any]): - XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) + XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]) /** * Transform Dataset diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index 6cfabcfaca17..fa22e8939e29 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -286,7 +286,6 @@ object XGBoost extends Serializable { } private def buildDistributedBooster( - buildDMatrixInRabit: Boolean, buildWatches: () => Watches, xgbExecutionParam: XGBoostExecutionParams, rabitEnv: java.util.Map[String, String], @@ -295,11 +294,6 @@ object XGBoost extends Serializable { prevBooster: Booster): Iterator[(Booster, Map[String, Array[Float]])] = { var watches: Watches = null - if (!buildDMatrixInRabit) { - // for CPU pipeline, we need to build DMatrix out of rabit context - watches = buildWatchesAndCheck(buildWatches) - } - val taskId = TaskContext.getPartitionId().toString val attempt = TaskContext.get().attemptNumber.toString rabitEnv.put("DMLC_TASK_ID", taskId) @@ -310,10 +304,7 @@ object XGBoost extends Serializable { try { Rabit.init(rabitEnv) - if (buildDMatrixInRabit) { - // for GPU pipeline, we need to move dmatrix building into rabit context - watches = buildWatchesAndCheck(buildWatches) - } + watches = buildWatchesAndCheck(buildWatches) val numEarlyStoppingRounds = xgbExecutionParam.earlyStoppingParams.numEarlyStoppingRounds val metrics = Array.tabulate(watches.size)(_ => Array.ofDim[Float](numRounds)) @@ -377,7 +368,7 @@ object XGBoost extends Serializable { @throws(classOf[XGBoostError]) private[spark] def trainDistributed( sc: SparkContext, - buildTrainingData: XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]), + buildTrainingData: XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]), params: Map[String, Any]): (Booster, Map[String, Array[Float]]) = { @@ -396,7 +387,7 @@ object XGBoost extends Serializable { }.orNull // Get the training data RDD and the cachedRDD - val (buildDMatrixInRabit, trainingRDD, optionalCachedRDD) = buildTrainingData(xgbExecParams) + val (trainingRDD, optionalCachedRDD) = buildTrainingData(xgbExecParams) try { // Train for every ${savingRound} rounds and save the partially completed booster @@ -413,9 +404,8 @@ object XGBoost extends Serializable { optionWatches = Some(iter.next()) } - optionWatches.map { buildWatches => buildDistributedBooster(buildDMatrixInRabit, - buildWatches, xgbExecParams, rabitEnv, xgbExecParams.obj, - xgbExecParams.eval, prevBooster)} + optionWatches.map { buildWatches => buildDistributedBooster(buildWatches, + xgbExecParams, rabitEnv, xgbExecParams.obj, xgbExecParams.eval, prevBooster)} .getOrElse(throw new RuntimeException("No Watches to train")) }} diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala index 77683e91437f..2f6827787107 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala @@ -169,6 +169,23 @@ class XGBoostClassifier ( } override protected def train(dataset: Dataset[_]): XGBoostClassificationModel = { + val _numClasses = getNumClasses(dataset) + if (isDefined(numClass) && $(numClass) != _numClasses) { + throw new Exception("The number of classes in dataset doesn't match " + + "\'num_class\' in xgboost params.") + } + + if (_numClasses == 2) { + if (!isDefined(objective)) { + // If user doesn't set objective, force it to binary:logistic + setObjective("binary:logistic") + } + } else if (_numClasses > 2) { + if (!isDefined(objective)) { + // If user doesn't set objective, force it to multi:softprob + setObjective("multi:softprob") + } + } if (!isDefined(evalMetric) || $(evalMetric).isEmpty) { set(evalMetric, setupDefaultEvalMetric()) @@ -178,12 +195,6 @@ class XGBoostClassifier ( set(objectiveType, "classification") } - val _numClasses = getNumClasses(dataset) - if (isDefined(numClass) && $(numClass) != _numClasses) { - throw new Exception("The number of classes in dataset doesn't match " + - "\'num_class\' in xgboost params.") - } - // Packing with all params plus params user defined val derivedXGBParamMap = xgboostParams ++ MLlib2XGBoostParams val buildTrainingData = PreXGBoost.buildDatasetToRDD(this, dataset, derivedXGBParamMap) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala index b52ba2a2e925..0402beb62a47 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala @@ -169,6 +169,11 @@ class XGBoostRegressor ( override protected def train(dataset: Dataset[_]): XGBoostRegressionModel = { + if (!isDefined(objective)) { + // If user doesn't set objective, force it to reg:squarederror + setObjective("reg:squarederror") + } + if (!isDefined(evalMetric) || $(evalMetric).isEmpty) { set(evalMetric, setupDefaultEvalMetric()) } diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala index 852864d9cb1c..ea7d2b48b11f 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014 by Contributors + Copyright (c) 2014-2022 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -105,7 +105,7 @@ private[spark] trait LearningTaskParams extends Params { final def getMaximizeEvaluationMetrics: Boolean = $(maximizeEvaluationMetrics) - setDefault(objective -> "reg:squarederror", baseScore -> 0.5, trainTestRatio -> 1.0, + setDefault(baseScore -> 0.5, trainTestRatio -> 1.0, numEarlyStoppingRounds -> 0, cacheTrainingSet -> false) } diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala index 79562d1f428b..f96140555809 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala @@ -65,8 +65,6 @@ class FeatureSizeValidatingSuite extends FunSuite with PerTest { (id, lp.label, lp.features) }.toDF("id", "label", "features") val xgb = new XGBoostClassifier(paramMap) - intercept[Exception] { - xgb.fit(repartitioned) - } + xgb.fit(repartitioned) } } diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala index 93b7554017a0..cf8dcca5722b 100755 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala @@ -138,7 +138,7 @@ class PersistenceSuite extends FunSuite with TmpFolderPerSuite with PerTest { val testDM = new DMatrix(Classification.test.iterator) val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", "custom_eval" -> new EvalError, "custom_obj" -> new CustomObj(1), - "num_round" -> "10", "num_workers" -> numWorkers) + "num_round" -> "10", "num_workers" -> numWorkers, "objective" -> "binary:logistic") val xgbc = new XGBoostClassifier(paramMap) val xgbcPath = new File(tempDir.toFile, "xgbc").getPath diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala index 4abd464ade04..9fe2479e5754 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala @@ -112,6 +112,34 @@ class XGBoostClassifierSuite extends FunSuite with PerTest with TmpFolderPerSuit assert(!transformedDf.columns.contains("probability")) } + test("objective will be set if not specifying it") { + val training = buildDataFrame(Classification.train) + val paramMap = Map("eta" -> "1", "max_depth" -> "6", + "num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod) + val xgb = new XGBoostClassifier(paramMap) + assert(!xgb.isDefined(xgb.objective)) + xgb.fit(training) + assert(xgb.getObjective == "binary:logistic") + + val trainingDF = buildDataFrame(MultiClassification.train) + val paramMap1 = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", + "num_class" -> "6", "num_round" -> 5, "num_workers" -> numWorkers, + "tree_method" -> treeMethod) + val xgb1 = new XGBoostClassifier(paramMap1) + assert(!xgb1.isDefined(xgb1.objective)) + xgb1.fit(trainingDF) + assert(xgb1.getObjective == "multi:softprob") + + // shouldn't change user's objective setting + val paramMap2 = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", + "num_class" -> "6", "num_round" -> 5, "num_workers" -> numWorkers, + "tree_method" -> treeMethod, "objective" -> "multi:softmax") + val xgb2 = new XGBoostClassifier(paramMap2) + assert(xgb2.getObjective == "multi:softmax") + xgb2.fit(trainingDF) + assert(xgb2.getObjective == "multi:softmax") + } + test("use base margin") { val training1 = buildDataFrame(Classification.train) val training2 = training1.withColumn("margin", functions.rand()) diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala index bd104f6c7987..a530313b9bb5 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala @@ -146,6 +146,24 @@ class XGBoostRegressorSuite extends FunSuite with PerTest { prediction.foreach(x => assert(math.abs(x.getAs[Double]("prediction") - first) <= 0.01f)) } + test("objective will be set if not specifying it") { + val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", + "num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod) + val training = buildDataFrame(Regression.train) + val xgb = new XGBoostRegressor(paramMap) + assert(!xgb.isDefined(xgb.objective)) + xgb.fit(training) + assert(xgb.getObjective == "reg:squarederror") + + val paramMap1 = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", + "num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod, + "objective" -> "reg:squaredlogerror") + val xgb1 = new XGBoostRegressor(paramMap1) + assert(xgb1.getObjective == "reg:squaredlogerror") + xgb1.fit(training) + assert(xgb1.getObjective == "reg:squaredlogerror") + } + test("test predictionLeaf") { val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1", "objective" -> "reg:squarederror", "num_round" -> 5, "num_workers" -> numWorkers, diff --git a/python-package/setup.py b/python-package/setup.py index 35314ab218f8..6c83feca0784 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -335,11 +335,12 @@ def run(self) -> None: 'scipy', ], ext_modules=[CMakeExtension('libxgboost')], + # error: expected "str": "Type[Command]" cmdclass={ - 'build_ext': BuildExt, - 'sdist': Sdist, - 'install_lib': InstallLib, - 'install': Install + 'build_ext': BuildExt, # type: ignore + 'sdist': Sdist, # type: ignore + 'install_lib': InstallLib, # type: ignore + 'install': Install # type: ignore }, extras_require={ 'pandas': ['pandas'], diff --git a/python-package/xgboost/_typing.py b/python-package/xgboost/_typing.py index 64ea9a0a2993..b17f5ecb8a4e 100644 --- a/python-package/xgboost/_typing.py +++ b/python-package/xgboost/_typing.py @@ -1,21 +1,32 @@ """Shared typing definition.""" import ctypes import os -from typing import Optional, Any, TypeVar, Union, Sequence +from typing import Any, TypeVar, Union, Type, Sequence, Callable, List, Dict # os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/dt.Frame/ # cudf.DataFrame/cupy.array/dlpack +import numpy as np + DataType = Any # xgboost accepts some other possible types in practice due to historical reason, which is # lesser tested. For now we encourage users to pass a simple list of string. -FeatureNames = Optional[Sequence[str]] -FeatureTypes = Optional[Sequence[str]] +FeatureInfo = Sequence[str] +FeatureNames = FeatureInfo +FeatureTypes = FeatureInfo +BoosterParam = Union[List, Dict] # better be sequence ArrayLike = Any PathLike = Union[str, os.PathLike] CupyT = ArrayLike # maybe need a stub for cupy arrays NumpyOrCupy = Any +NumpyDType = Union[str, Type[np.number]] +PandasDType = Any # real type is pandas.core.dtypes.base.ExtensionDtype + +FloatCompatible = Union[float, np.float32, np.float64] + +# callables +FPreProcCallable = Callable # ctypes # c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h @@ -59,3 +70,4 @@ # template parameter _T = TypeVar("_T") +_F = TypeVar("_F", bound=Callable[..., Any]) diff --git a/python-package/xgboost/callback.py b/python-package/xgboost/callback.py index 32d408f3a29e..021ccd97236d 100644 --- a/python-package/xgboost/callback.py +++ b/python-package/xgboost/callback.py @@ -10,8 +10,7 @@ import collections import os import pickle -from typing import Callable, List, Optional, Union, Dict, Tuple, TypeVar, cast -from typing import Sequence +from typing import Callable, List, Optional, Union, Dict, Tuple, TypeVar, cast, Sequence, Any import numpy from . import rabit @@ -24,11 +23,14 @@ "EarlyStopping", "EvaluationMonitor", "TrainingCheckPoint", + "CallbackContainer" ] _Score = Union[float, Tuple[float, float]] _ScoreList = Union[List[float], List[Tuple[float, float]]] +_Model = Any # real type is Union[Booster, CVPack]; need more work + # pylint: disable=unused-argument class TrainingCallback(ABC): @@ -43,19 +45,19 @@ class TrainingCallback(ABC): def __init__(self) -> None: pass - def before_training(self, model): + def before_training(self, model: _Model) -> _Model: '''Run before training starts.''' return model - def after_training(self, model): + def after_training(self, model: _Model) -> _Model: '''Run after training is finished.''' return model - def before_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool: + def before_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool: '''Run before each iteration. Return True when training should stop.''' return False - def after_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool: + def after_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool: '''Run after each iteration. Return True when training should stop.''' return False @@ -140,7 +142,7 @@ def __init__( if self.is_cv: self.aggregated_cv = None - def before_training(self, model): + def before_training(self, model: _Model) -> _Model: '''Function called before training.''' for c in self.callbacks: model = c.before_training(model=model) @@ -151,7 +153,7 @@ def before_training(self, model): assert isinstance(model, Booster), msg return model - def after_training(self, model): + def after_training(self, model: _Model) -> _Model: '''Function called after training.''' for c in self.callbacks: model = c.after_training(model=model) @@ -182,7 +184,7 @@ def after_training(self, model): return model def before_iteration( - self, model, epoch: int, dtrain: DMatrix, evals: List[Tuple[DMatrix, str]] + self, model: _Model, epoch: int, dtrain: DMatrix, evals: Optional[List[Tuple[DMatrix, str]]] ) -> bool: '''Function called before training iteration.''' return any(c.before_iteration(model, epoch, self.history) @@ -220,7 +222,7 @@ def _update_history( def after_iteration( self, - model, + model: _Model, epoch: int, dtrain: DMatrix, evals: Optional[List[Tuple[DMatrix, str]]], @@ -276,7 +278,7 @@ def __init__( super().__init__() def after_iteration( - self, model, epoch: int, evals_log: TrainingCallback.EvalsLog + self, model: _Model, epoch: int, evals_log: TrainingCallback.EvalsLog ) -> bool: model.set_param("learning_rate", self.learning_rates(epoch)) return False @@ -344,12 +346,12 @@ def __init__( self.starting_round: int = 0 super().__init__() - def before_training(self, model): + def before_training(self, model: _Model) -> _Model: self.starting_round = model.num_boosted_rounds() return model def _update_rounds( - self, score: _Score, name: str, metric: str, model, epoch: int + self, score: _Score, name: str, metric: str, model: _Model, epoch: int ) -> bool: def get_s(x: _Score) -> float: """get score if it's cross validation history.""" @@ -403,7 +405,7 @@ def minimize(new: _Score, best: _Score) -> bool: return True return False - def after_iteration(self, model, epoch: int, + def after_iteration(self, model: _Model, epoch: int, evals_log: TrainingCallback.EvalsLog) -> bool: epoch += self.starting_round # training continuation msg = 'Must have at least 1 validation dataset for early stopping.' @@ -431,7 +433,7 @@ def after_iteration(self, model, epoch: int, score = data_log[metric_name][-1] return self._update_rounds(score, data_name, metric_name, model, epoch) - def after_training(self, model): + def after_training(self, model: _Model) -> _Model: try: if self.save_best: model = model[: int(model.attr("best_iteration")) + 1] @@ -477,7 +479,7 @@ def _fmt_metric( msg = f"\t{data + '-' + metric}:{score:.5f}" return msg - def after_iteration(self, model, epoch: int, + def after_iteration(self, model: _Model, epoch: int, evals_log: TrainingCallback.EvalsLog) -> bool: if not evals_log: return False @@ -503,7 +505,7 @@ def after_iteration(self, model, epoch: int, self._latest = msg return False - def after_training(self, model): + def after_training(self, model: _Model) -> _Model: if rabit.get_rank() == self.printer_rank and self._latest is not None: rabit.tracker_print(self._latest) return model @@ -544,7 +546,7 @@ def __init__( self._epoch = 0 super().__init__() - def after_iteration(self, model, epoch: int, + def after_iteration(self, model: _Model, epoch: int, evals_log: TrainingCallback.EvalsLog) -> bool: if self._epoch == self._iterations: path = os.path.join(self._path, self._name + '_' + str(epoch) + diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py index 1967ffc8e1e6..63f9137e67c7 100644 --- a/python-package/xgboost/compat.py +++ b/python-package/xgboost/compat.py @@ -1,7 +1,7 @@ # coding: utf-8 # pylint: disable= invalid-name, unused-import """For compatibility and optional dependencies.""" -from typing import Any +from typing import Any, Type, Dict, Optional, List import sys import types import importlib.util @@ -11,20 +11,20 @@ assert (sys.version_info[0] == 3), 'Python 2 is no longer supported.' -def py_str(x): +def py_str(x: bytes) -> str: """convert c string back to python string""" - return x.decode('utf-8') + return x.decode('utf-8') # type: ignore -def lazy_isinstance(instance, module, name): +def lazy_isinstance(instance: Type[object], module: str, name: str) -> bool: """Use string representation to identify a type.""" # Notice, we use .__class__ as opposed to type() in order # to support object proxies such as weakref.proxy cls = instance.__class__ - module = cls.__module__ == module - name = cls.__name__ == name - return module and name + is_same_module = cls.__module__ == module + has_same_name = cls.__name__ == name + return is_same_module and has_same_name # pandas @@ -37,53 +37,33 @@ def lazy_isinstance(instance, module, name): except ImportError: MultiIndex = object - DataFrame: Any = object + DataFrame = object Series = object pandas_concat = None PANDAS_INSTALLED = False # sklearn try: - from sklearn.base import BaseEstimator - from sklearn.base import RegressorMixin, ClassifierMixin + from sklearn.base import ( + BaseEstimator as XGBModelBase, + RegressorMixin as XGBRegressorBase, + ClassifierMixin as XGBClassifierBase + ) from sklearn.preprocessing import LabelEncoder try: - from sklearn.model_selection import KFold, StratifiedKFold + from sklearn.model_selection import ( + KFold as XGBKFold, + StratifiedKFold as XGBStratifiedKFold + ) except ImportError: - from sklearn.cross_validation import KFold, StratifiedKFold + from sklearn.cross_validation import ( + KFold as XGBKFold, + StratifiedKFold as XGBStratifiedKFold + ) SKLEARN_INSTALLED = True - XGBModelBase = BaseEstimator - XGBRegressorBase = RegressorMixin - XGBClassifierBase = ClassifierMixin - - XGBKFold = KFold - XGBStratifiedKFold = StratifiedKFold - - class XGBoostLabelEncoder(LabelEncoder): - '''Label encoder with JSON serialization methods.''' - def to_json(self): - '''Returns a JSON compatible dictionary''' - meta = {} - for k, v in self.__dict__.items(): - if isinstance(v, np.ndarray): - meta[k] = v.tolist() - else: - meta[k] = v - return meta - - def from_json(self, doc): - # pylint: disable=attribute-defined-outside-init - '''Load the encoder back from a JSON compatible dict.''' - meta = {} - for k, v in doc.items(): - if k == 'classes_': - self.classes_ = np.array(v) - continue - meta[k] = v - self.__dict__.update(meta) except ImportError: SKLEARN_INSTALLED = False @@ -91,20 +71,34 @@ def from_json(self, doc): XGBModelBase = object XGBClassifierBase = object XGBRegressorBase = object + LabelEncoder = object XGBKFold = None XGBStratifiedKFold = None - XGBoostLabelEncoder = None -# dask -try: - import pkg_resources - pkg_resources.get_distribution('dask') - DASK_INSTALLED = True -except pkg_resources.DistributionNotFound: - dask = None - DASK_INSTALLED = False +class XGBoostLabelEncoder(LabelEncoder): + '''Label encoder with JSON serialization methods.''' + def to_json(self) -> Dict: + '''Returns a JSON compatible dictionary''' + meta = {} + for k, v in self.__dict__.items(): + if isinstance(v, np.ndarray): + meta[k] = v.tolist() + else: + meta[k] = v + return meta + + def from_json(self, doc: Dict) -> None: + # pylint: disable=attribute-defined-outside-init + '''Load the encoder back from a JSON compatible dict.''' + meta = {} + for k, v in doc.items(): + if k == 'classes_': + self.classes_ = np.array(v) + continue + meta[k] = v + self.__dict__.update(meta) try: @@ -113,7 +107,7 @@ def from_json(self, doc): SCIPY_INSTALLED = True except ImportError: scipy_sparse = False - scipy_csr: Any = object + scipy_csr = object SCIPY_INSTALLED = False @@ -136,15 +130,21 @@ class LazyLoader(types.ModuleType): """Lazily import a module, mainly to avoid pulling in large dependencies. """ - def __init__(self, local_name, parent_module_globals, name, warning=None): + def __init__( + self, + local_name: str, + parent_module_globals: Dict, + name: str, + warning: Optional[str] = None + ) -> None: self._local_name = local_name self._parent_module_globals = parent_module_globals self._warning = warning - self.module = None + self.module: Optional[types.ModuleType] = None super().__init__(name) - def _load(self): + def _load(self) -> types.ModuleType: """Load the module and insert it into the parent's globals.""" # Import the target module and insert it into the parent's namespace module = importlib.import_module(self.__name__) @@ -163,12 +163,12 @@ def _load(self): return module - def __getattr__(self, item): + def __getattr__(self, item: str) -> Any: if not self.module: self.module = self._load() return getattr(self.module, item) - def __dir__(self): + def __dir__(self) -> List[str]: if not self.module: self.module = self._load() return dir(self.module) diff --git a/python-package/xgboost/config.py b/python-package/xgboost/config.py index 427ea4ea3915..2344ae4a3541 100644 --- a/python-package/xgboost/config.py +++ b/python-package/xgboost/config.py @@ -4,12 +4,20 @@ import json from contextlib import contextmanager from functools import wraps +from typing import Optional, Callable, Any, Dict, cast, Iterator from .core import _LIB, _check_call, c_str, py_str +from ._typing import _F -def config_doc(*, header=None, extra_note=None, parameters=None, returns=None, - see_also=None): +def config_doc( + *, + header: Optional[str] = None, + extra_note: Optional[str] = None, + parameters: Optional[str] = None, + returns: Optional[str] = None, + see_also: Optional[str] = None +) -> Callable[[_F], _F]: """Decorator to format docstring for config functions. Parameters @@ -64,19 +72,19 @@ def config_doc(*, header=None, extra_note=None, parameters=None, returns=None, assert xgb.get_config()['verbosity'] == 2 # old value restored """ - def none_to_str(value): + def none_to_str(value: Optional[str]) -> str: return '' if value is None else value - def config_doc_decorator(func): + def config_doc_decorator(func: _F) -> _F: func.__doc__ = (doc_template.format(header=none_to_str(header), extra_note=none_to_str(extra_note)) + none_to_str(parameters) + none_to_str(returns) + none_to_str(common_example) + none_to_str(see_also)) @wraps(func) - def wrap(*args, **kwargs): + def wrap(*args: Any, **kwargs: Any) -> Any: return func(*args, **kwargs) - return wrap + return cast(_F, wrap) return config_doc_decorator @@ -89,7 +97,7 @@ def wrap(*args, **kwargs): new_config: Dict[str, Any] Keyword arguments representing the parameters and their values """) -def set_config(**new_config): +def set_config(**new_config: Any) -> None: config = json.dumps(new_config) _check_call(_LIB.XGBSetGlobalConfig(c_str(config))) @@ -103,10 +111,12 @@ def set_config(**new_config): args: Dict[str, Any] The list of global parameters and their values """) -def get_config(): +def get_config() -> Dict[str, Any]: config_str = ctypes.c_char_p() _check_call(_LIB.XGBGetGlobalConfig(ctypes.byref(config_str))) - config = json.loads(py_str(config_str.value)) + value = config_str.value + assert value + config = json.loads(py_str(value)) return config @@ -132,7 +142,7 @@ def get_config(): set_config: Set global XGBoost configuration get_config: Get current values of the global configuration """) -def config_context(**new_config): +def config_context(**new_config: Any) -> Iterator[None]: old_config = get_config().copy() set_config(**new_config) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index a94c9d767a0a..cd8437847688 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -30,10 +30,12 @@ ArrayLike, CFloatPtr, NumpyOrCupy, - FeatureNames, + FeatureInfo, FeatureTypes, + FeatureNames, _T, CupyT, + BoosterParam ) @@ -41,7 +43,7 @@ class XGBoostError(ValueError): """Error thrown by xgboost trainer.""" -def from_pystr_to_cstr(data: Union[str, List[str]]) -> Union[bytes, CStrPptr]: +def from_pystr_to_cstr(data: Union[str, List[str]]) -> Union[bytes, ctypes.Array]: """Convert a Python str or list of Python str to C pointer Parameters @@ -53,9 +55,9 @@ def from_pystr_to_cstr(data: Union[str, List[str]]) -> Union[bytes, CStrPptr]: if isinstance(data, str): return bytes(data, "utf-8") if isinstance(data, list): - pointers: ctypes.pointer = (ctypes.c_char_p * len(data))() + pointers: ctypes.Array[ctypes.c_char_p] = (ctypes.c_char_p * len(data))() data_as_bytes = [bytes(d, 'utf-8') for d in data] - pointers[:] = data_as_bytes + pointers[:] = data_as_bytes # type: ignore return pointers raise TypeError() @@ -270,10 +272,10 @@ def _cuda_array_interface(data: DataType) -> bytes: def ctypes2numpy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> np.ndarray: """Convert a ctypes pointer array to a numpy array.""" ctype: Type[CNumeric] = _numpy2ctypes_type(dtype) - if not isinstance(cptr, ctypes.POINTER(ctype)): + if not isinstance(cptr, ctypes.POINTER(ctype)): # type: ignore raise RuntimeError(f"expected {ctype} pointer") res = np.zeros(length, dtype=dtype) - if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]): + if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]): # type: ignore raise RuntimeError("memmove failed") return res @@ -285,7 +287,10 @@ def ctypes2cupy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> CupyT from cupy.cuda.memory import MemoryPointer from cupy.cuda.memory import UnownedMemory - CUPY_TO_CTYPES_MAPPING = {cupy.float32: ctypes.c_float, cupy.uint32: ctypes.c_uint} + CUPY_TO_CTYPES_MAPPING: Dict[Type[np.number], Type[CNumeric]] = { + cupy.float32: ctypes.c_float, + cupy.uint32: ctypes.c_uint, + } if dtype not in CUPY_TO_CTYPES_MAPPING: raise RuntimeError(f"Supported types: {CUPY_TO_CTYPES_MAPPING.keys()}") addr = ctypes.cast(cptr, ctypes.c_void_p).value @@ -310,7 +315,7 @@ def ctypes2buffer(cptr: CStrPtr, length: int) -> bytearray: raise RuntimeError('expected char pointer') res = bytearray(length) rptr = (ctypes.c_char * length).from_buffer(res) - if not ctypes.memmove(rptr, cptr, length): + if not ctypes.memmove(rptr, cptr, length): # type: ignore raise RuntimeError('memmove failed') return res @@ -320,10 +325,12 @@ def c_str(string: str) -> ctypes.c_char_p: return ctypes.c_char_p(string.encode('utf-8')) -def c_array(ctype: Type[CTypeT], values: ArrayLike) -> ctypes.Array: +def c_array( + ctype: Type[CTypeT], values: ArrayLike +) -> Union[ctypes.Array, ctypes.pointer]: """Convert a python string to c array.""" if isinstance(values, np.ndarray) and values.dtype.itemsize == ctypes.sizeof(ctype): - return (ctype * len(values)).from_buffer_copy(values) + return values.ctypes.data_as(ctypes.POINTER(ctype)) return (ctype * len(values))(*values) @@ -434,8 +441,8 @@ def _next_wrapper(self, this: None) -> int: # pylint: disable=unused-argument def data_handle( data: Any, *, - feature_names: FeatureNames = None, - feature_types: Optional[List[str]] = None, + feature_names: Optional[FeatureNames] = None, + feature_types: Optional[FeatureTypes] = None, **kwargs: Any, ) -> None: from .data import dispatch_proxy_set_data @@ -555,8 +562,8 @@ def __init__( base_margin: Optional[ArrayLike] = None, missing: Optional[float] = None, silent: bool = False, - feature_names: FeatureNames = None, - feature_types: FeatureTypes = None, + feature_names: Optional[FeatureNames] = None, + feature_types: Optional[FeatureTypes] = None, nthread: Optional[int] = None, group: Optional[ArrayLike] = None, qid: Optional[ArrayLike] = None, @@ -718,8 +725,8 @@ def set_info( qid: Optional[ArrayLike] = None, label_lower_bound: Optional[ArrayLike] = None, label_upper_bound: Optional[ArrayLike] = None, - feature_names: FeatureNames = None, - feature_types: Optional[List[str]] = None, + feature_names: Optional[FeatureNames] = None, + feature_types: Optional[FeatureTypes] = None, feature_weights: Optional[ArrayLike] = None ) -> None: """Set meta info for DMatrix. See doc string for :py:obj:`xgboost.DMatrix`.""" @@ -1000,7 +1007,7 @@ def slice( return res @property - def feature_names(self) -> Optional[List[str]]: + def feature_names(self) -> Optional[FeatureNames]: """Get feature names (column labels). Returns @@ -1023,7 +1030,7 @@ def feature_names(self) -> Optional[List[str]]: return feature_names @feature_names.setter - def feature_names(self, feature_names: FeatureNames) -> None: + def feature_names(self, feature_names: Optional[FeatureNames]) -> None: """Set feature names (column labels). Parameters @@ -1039,7 +1046,7 @@ def feature_names(self, feature_names: FeatureNames) -> None: else: feature_names = [feature_names] except TypeError: - feature_names = [feature_names] + feature_names = [cast(str, feature_names)] if len(feature_names) != len(set(feature_names)): raise ValueError('feature_names must be unique') @@ -1069,8 +1076,13 @@ def feature_names(self, feature_names: FeatureNames) -> None: self.feature_types = None @property - def feature_types(self) -> Optional[List[str]]: - """Get feature types. See :py:class:`DMatrix` for details.""" + def feature_types(self) -> Optional[FeatureTypes]: + """Get feature types (column types). + + Returns + ------- + feature_types : list or None + """ length = c_bst_ulong() sarr = ctypes.POINTER(ctypes.c_char_p)() _check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle, @@ -1111,7 +1123,7 @@ def feature_types(self, feature_types: Optional[Union[List[str], str]]) -> None: else: feature_types = [feature_types] except TypeError: - feature_types = [feature_types] + feature_types = [cast(str, feature_types)] feature_types_bytes = [bytes(f, encoding='utf-8') for f in feature_types] c_feature_types = (ctypes.c_char_p * @@ -1203,8 +1215,8 @@ def __init__( # pylint: disable=super-init-not-called base_margin: Optional[ArrayLike] = None, missing: Optional[float] = None, silent: bool = False, - feature_names: FeatureNames = None, - feature_types: Optional[List[str]] = None, + feature_names: Optional[FeatureNames] = None, + feature_types: Optional[FeatureTypes] = None, nthread: Optional[int] = None, max_bin: int = 256, group: Optional[ArrayLike] = None, @@ -1323,7 +1335,7 @@ def _get_booster_layer_trees(model: "Booster") -> Tuple[int, int]: return num_parallel_tree, num_groups -def _configure_metrics(params: Union[Dict, List]) -> Union[Dict, List]: +def _configure_metrics(params: BoosterParam) -> BoosterParam: if ( isinstance(params, dict) and "eval_metric" in params @@ -1349,7 +1361,7 @@ class Booster: def __init__( self, - params: Optional[Dict] = None, + params: Optional[BoosterParam] = None, cache: Optional[Sequence[DMatrix]] = None, model_file: Optional[Union["Booster", bytearray, os.PathLike, str]] = None ) -> None: @@ -1444,7 +1456,7 @@ def _transform_interaction_constraints( "Constrained features are not a subset of training data feature names" ) from e - def _configure_constraints(self, params: Union[List, Dict]) -> Union[List, Dict]: + def _configure_constraints(self, params: BoosterParam) -> BoosterParam: if isinstance(params, dict): value = params.get("monotone_constraints") if value is not None: @@ -1604,10 +1616,12 @@ def attr(self, key: str) -> Optional[str]: _check_call(_LIB.XGBoosterGetAttr( self.handle, c_str(key), ctypes.byref(ret), ctypes.byref(success))) if success.value != 0: - return py_str(ret.value) + value = ret.value + assert value + return py_str(value) return None - def attributes(self) -> Dict[str, str]: + def attributes(self) -> Dict[str, Optional[str]]: """Get attributes stored in the Booster as a dictionary. Returns @@ -1632,14 +1646,12 @@ def set_attr(self, **kwargs: Optional[str]) -> None: The attributes to set. Setting a value to None deletes an attribute. """ for key, value in kwargs.items(): + c_value = None if value is not None: - if not isinstance(value, str): - raise ValueError("Set Attr only accepts string values") - value = c_str(str(value)) - _check_call(_LIB.XGBoosterSetAttr( - self.handle, c_str(key), value)) + c_value = c_str(str(value)) + _check_call(_LIB.XGBoosterSetAttr(self.handle, c_str(key), c_value)) - def _get_feature_info(self, field: str) -> Optional[List[str]]: + def _get_feature_info(self, field: str) -> Optional[FeatureInfo]: length = c_bst_ulong() sarr = ctypes.POINTER(ctypes.c_char_p)() if not hasattr(self, "handle") or self.handle is None: @@ -1652,7 +1664,7 @@ def _get_feature_info(self, field: str) -> Optional[List[str]]: feature_info = from_cstr_to_pystr(sarr, length) return feature_info if feature_info else None - def _set_feature_info(self, features: Optional[Sequence[str]], field: str) -> None: + def _set_feature_info(self, features: Optional[FeatureInfo], field: str) -> None: if features is not None: assert isinstance(features, list) feature_info_bytes = [bytes(f, encoding="utf-8") for f in features] @@ -1670,7 +1682,7 @@ def _set_feature_info(self, features: Optional[Sequence[str]], field: str) -> No ) @property - def feature_types(self) -> Optional[List[str]]: + def feature_types(self) -> Optional[FeatureTypes]: """Feature types for this booster. Can be directly set by input data or by assignment. See :py:class:`DMatrix` for details. @@ -1678,11 +1690,11 @@ def feature_types(self) -> Optional[List[str]]: return self._get_feature_info("feature_type") @feature_types.setter - def feature_types(self, features: Optional[List[str]]) -> None: + def feature_types(self, features: Optional[FeatureTypes]) -> None: self._set_feature_info(features, "feature_type") @property - def feature_names(self) -> Optional[List[str]]: + def feature_names(self) -> Optional[FeatureNames]: """Feature names for this booster. Can be directly set by input data or by assignment. @@ -1690,7 +1702,7 @@ def feature_names(self) -> Optional[List[str]]: return self._get_feature_info("feature_name") @feature_names.setter - def feature_names(self, features: FeatureNames) -> None: + def feature_names(self, features: Optional[FeatureNames]) -> None: self._set_feature_info(features, "feature_name") def set_param( @@ -1711,7 +1723,7 @@ def set_param( params = params.items() elif isinstance(params, str) and value is not None: params = [(params, value)] - for key, val in params: + for key, val in cast(Iterable[Tuple[str, str]], params): if val is not None: _check_call(_LIB.XGBoosterSetParam(self.handle, c_str(key), c_str(str(val)))) @@ -2564,8 +2576,10 @@ def _validate_features(self, data: DMatrix) -> None: ) # Booster can't accept data with different feature names if self.feature_names != data.feature_names: - dat_missing = set(self.feature_names) - set(data.feature_names) - my_missing = set(data.feature_names) - set(self.feature_names) + dat_missing = set(cast(FeatureNames, self.feature_names)) - \ + set(cast(FeatureNames, data.feature_names)) + my_missing = set(cast(FeatureNames, data.feature_names)) - \ + set(cast(FeatureNames, self.feature_names)) msg = 'feature_names mismatch: {0} {1}' @@ -2619,10 +2633,10 @@ def get_split_value_histogram( bins = max(min(n_unique, bins) if bins is not None else n_unique, 1) nph = np.histogram(values, bins=bins) - nph = np.column_stack((nph[1][1:], nph[0])) - nph = nph[nph[:, 1] > 0] + nph_stacked = np.column_stack((nph[1][1:], nph[0])) + nph_stacked = nph_stacked[nph_stacked[:, 1] > 0] - if nph.size == 0: + if nph_stacked.size == 0: ft = self.feature_types fn = self.feature_names if fn is None: @@ -2640,11 +2654,11 @@ def get_split_value_histogram( ) if as_pandas and PANDAS_INSTALLED: - return DataFrame(nph, columns=['SplitValue', 'Count']) + return DataFrame(nph_stacked, columns=['SplitValue', 'Count']) if as_pandas and not PANDAS_INSTALLED: warnings.warn( "Returning histogram as ndarray" " (as_pandas == True, but pandas is not installed).", UserWarning ) - return nph + return nph_stacked diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index b54e26c9d550..ee8ea1a5aec4 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -318,7 +318,7 @@ def __init__( base_margin: Optional[_DaskCollection] = None, missing: float = None, silent: bool = False, # pylint: disable=unused-argument - feature_names: FeatureNames = None, + feature_names: Optional[FeatureNames] = None, feature_types: FeatureTypes = None, group: Optional[_DaskCollection] = None, qid: Optional[_DaskCollection] = None, @@ -594,7 +594,7 @@ def __init__( qid: Optional[List[Any]] = None, label_lower_bound: Optional[List[Any]] = None, label_upper_bound: Optional[List[Any]] = None, - feature_names: FeatureNames = None, + feature_names: Optional[FeatureNames] = None, feature_types: Optional[Union[Any, List[Any]]] = None, ) -> None: self._data = data @@ -637,7 +637,7 @@ def next(self, input_data: Callable) -> int: if self._iter == len(self._data): # Return 0 when there's no more batch. return 0 - feature_names: FeatureNames = None + feature_names: Optional[FeatureNames] = None if self._feature_names: feature_names = self._feature_names else: @@ -688,7 +688,7 @@ def __init__( base_margin: Optional[_DaskCollection] = None, missing: float = None, silent: bool = False, # disable=unused-argument - feature_names: FeatureNames = None, + feature_names: Optional[FeatureNames] = None, feature_types: Optional[Union[Any, List[Any]]] = None, max_bin: int = 256, group: Optional[_DaskCollection] = None, @@ -725,7 +725,7 @@ def _create_fn_args(self, worker_addr: str) -> Dict[str, Any]: def _create_device_quantile_dmatrix( - feature_names: FeatureNames, + feature_names: Optional[FeatureNames], feature_types: Optional[Union[Any, List[Any]]], feature_weights: Optional[Any], missing: float, @@ -766,7 +766,7 @@ def _create_device_quantile_dmatrix( def _create_dmatrix( - feature_names: FeatureNames, + feature_names: Optional[FeatureNames], feature_types: Optional[Union[Any, List[Any]]], feature_weights: Optional[Any], missing: float, diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index d21c97910eb3..a0505e9c9105 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -5,17 +5,26 @@ import json import warnings import os -from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Type +from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Sequence, cast import numpy as np from .core import c_array, _LIB, _check_call, c_str from .core import _cuda_array_interface -from .core import DataIter, _ProxyDMatrix, DMatrix, FeatureNames -from ._typing import FeatureTypes +from .core import DataIter, _ProxyDMatrix, DMatrix from .compat import lazy_isinstance, DataFrame +from ._typing import ( + c_bst_ulong, + DataType, + FeatureTypes, + FeatureNames, + NumpyDType, + CupyT, + FloatCompatible, PandasDType +) -c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name +DispatchedDataBackendReturnType = Tuple[ + ctypes.c_void_p, Optional[FeatureNames], Optional[FeatureTypes]] CAT_T = "c" @@ -23,14 +32,14 @@ _matrix_meta = {"base_margin", "label"} -def _warn_unused_missing(data, missing): +def _warn_unused_missing(data: DataType, missing: Optional[FloatCompatible]) -> None: if (missing is not None) and (not np.isnan(missing)): warnings.warn( '`missing` is not used for current input data type:' + str(type(data)), UserWarning) -def _check_complex(data): +def _check_complex(data: DataType) -> None: '''Test whether data is complex using `dtype` attribute.''' complex_dtypes = (np.complex128, np.complex64, np.cfloat, np.cdouble, np.clongdouble) @@ -38,16 +47,15 @@ def _check_complex(data): raise ValueError('Complex data not supported') -def _check_data_shape(data: Any) -> None: +def _check_data_shape(data: DataType) -> None: if hasattr(data, "shape") and len(data.shape) != 2: raise ValueError("Please reshape the input data into 2-dimensional matrix.") -def _is_scipy_csr(data): +def _is_scipy_csr(data: DataType) -> bool: try: - import scipy + import scipy.sparse except ImportError: - scipy = None return False return isinstance(data, scipy.sparse.csr_matrix) @@ -64,12 +72,12 @@ def _array_interface(data: np.ndarray) -> bytes: def _from_scipy_csr( - data, - missing, - nthread, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + data: DataType, + missing: FloatCompatible, + nthread: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: """Initialize data from a CSR matrix.""" if len(data.indices) != len(data.data): raise ValueError( @@ -94,21 +102,20 @@ def _from_scipy_csr( return handle, feature_names, feature_types -def _is_scipy_csc(data): +def _is_scipy_csc(data: DataType) -> bool: try: - import scipy + import scipy.sparse except ImportError: - scipy = None return False return isinstance(data, scipy.sparse.csc_matrix) def _from_scipy_csc( - data, - missing, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + data: DataType, + missing: Optional[FloatCompatible], + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: if len(data.indices) != len(data.data): raise ValueError(f"length mismatch: {len(data.indices)} vs {len(data.data)}") _warn_unused_missing(data, missing) @@ -124,27 +131,29 @@ def _from_scipy_csc( return handle, feature_names, feature_types -def _is_scipy_coo(data): +def _is_scipy_coo(data: DataType) -> bool: try: - import scipy + import scipy.sparse except ImportError: - scipy = None return False return isinstance(data, scipy.sparse.coo_matrix) -def _is_numpy_array(data): +def _is_numpy_array(data: DataType) -> bool: return isinstance(data, (np.ndarray, np.matrix)) -def _ensure_np_dtype(data, dtype) -> Tuple[np.ndarray, np.dtype]: +def _ensure_np_dtype( + data: DataType, + dtype: Optional[NumpyDType] +) -> Tuple[np.ndarray, Optional[NumpyDType]]: if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]: data = data.astype(np.float32, copy=False) dtype = np.float32 return data, dtype -def _maybe_np_slice(data: np.ndarray, dtype) -> np.ndarray: +def _maybe_np_slice(data: DataType, dtype: Optional[NumpyDType]) -> np.ndarray: '''Handle numpy slice. This can be removed if we use __array_interface__. ''' try: @@ -159,12 +168,12 @@ def _maybe_np_slice(data: np.ndarray, dtype) -> np.ndarray: def _from_numpy_array( - data, - missing, - nthread, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + data: DataType, + missing: FloatCompatible, + nthread: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: """Initialize data from a 2-D numpy matrix. """ @@ -189,7 +198,7 @@ def _from_numpy_array( return handle, feature_names, feature_types -def _is_pandas_df(data): +def _is_pandas_df(data: DataType) -> bool: try: import pandas as pd except ImportError: @@ -197,7 +206,7 @@ def _is_pandas_df(data): return isinstance(data, pd.DataFrame) -def _is_modin_df(data): +def _is_modin_df(data: DataType) -> bool: try: import modin.pandas as pd except ImportError: @@ -232,7 +241,7 @@ def _is_modin_df(data): ) -def _invalid_dataframe_dtype(data: Any) -> None: +def _invalid_dataframe_dtype(data: DataType) -> None: # pandas series has `dtypes` but it's just a single object # cudf series doesn't have `dtypes`. if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"): @@ -253,10 +262,10 @@ def _invalid_dataframe_dtype(data: Any) -> None: def _pandas_feature_info( data: DataFrame, meta: Optional[str], - feature_names: FeatureNames, - feature_types: FeatureTypes, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], enable_categorical: bool, -) -> Tuple[FeatureNames, FeatureTypes]: +) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]: import pandas as pd from pandas.api.types import ( is_sparse, @@ -285,13 +294,13 @@ def _pandas_feature_info( return feature_names, feature_types -def is_nullable_dtype(dtype: Any) -> bool: +def is_nullable_dtype(dtype: PandasDType) -> bool: """Wether dtype is a pandas nullable type.""" from pandas.api.types import is_integer_dtype, is_bool_dtype # dtype: pd.core.arrays.numeric.NumericDtype nullable_alias = {"Int16", "Int32", "Int64"} is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias - # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`. + # np.bool has alias `bool`, while pd.BooleanDtype has `bzoolean`. is_bool = is_bool_dtype(dtype) and dtype.name == "boolean" return is_int or is_bool @@ -331,11 +340,11 @@ def _pandas_cat_null(data: DataFrame) -> DataFrame: def _transform_pandas_df( data: DataFrame, enable_categorical: bool, - feature_names: FeatureNames = None, - feature_types: FeatureTypes = None, + feature_names: Optional[FeatureNames] = None, + feature_types: Optional[FeatureTypes] = None, meta: Optional[str] = None, - meta_type: Optional[str] = None, -) -> Tuple[np.ndarray, FeatureNames, FeatureTypes]: + meta_type: Optional[NumpyDType] = None, +) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]: from pandas.api.types import ( is_sparse, is_categorical_dtype, @@ -359,7 +368,7 @@ def _transform_pandas_df( if meta and len(data.columns) > 1 and meta not in _matrix_meta: raise ValueError(f"DataFrame for {meta} cannot have multiple columns") - dtype: Union[Type[np.floating], str] = meta_type if meta_type else np.float32 + dtype = meta_type if meta_type else np.float32 arr: np.ndarray = transformed.values if meta_type: arr = arr.astype(dtype) @@ -369,18 +378,18 @@ def _transform_pandas_df( def _from_pandas_df( data: DataFrame, enable_categorical: bool, - missing: float, + missing: FloatCompatible, nthread: int, - feature_names: FeatureNames, - feature_types: FeatureTypes, -) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]: + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: data, feature_names, feature_types = _transform_pandas_df( data, enable_categorical, feature_names, feature_types ) return _from_numpy_array(data, missing, nthread, feature_names, feature_types) -def _is_pandas_series(data): +def _is_pandas_series(data: DataType) -> bool: try: import pandas as pd except ImportError: @@ -389,18 +398,21 @@ def _is_pandas_series(data): def _meta_from_pandas_series( - data, name: str, dtype: Optional[str], handle: ctypes.c_void_p + data: DataType, + name: str, + dtype: Optional[NumpyDType], + handle: ctypes.c_void_p ) -> None: """Help transform pandas series for meta data like labels""" data = data.values.astype('float') from pandas.api.types import is_sparse if is_sparse(data): - data = data.to_dense() + data = data.to_dense() # type: ignore assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1 _meta_from_numpy(data, name, dtype, handle) -def _is_modin_series(data): +def _is_modin_series(data: DataType) -> bool: try: import modin.pandas as pd except ImportError: @@ -409,13 +421,13 @@ def _is_modin_series(data): def _from_pandas_series( - data, - missing: float, + data: DataType, + missing: FloatCompatible, nthread: int, enable_categorical: bool, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: from pandas.api.types import is_categorical_dtype if (data.dtype.name not in _pandas_dtype_mapper) and not ( @@ -433,7 +445,7 @@ def _from_pandas_series( ) -def _is_dt_df(data): +def _is_dt_df(data: DataType) -> bool: return lazy_isinstance(data, 'datatable', 'Frame') or \ lazy_isinstance(data, 'datatable', 'DataTable') @@ -443,12 +455,12 @@ def _is_dt_df(data): def _transform_dt_df( - data, - feature_names: FeatureNames, - feature_types: FeatureTypes, - meta=None, - meta_type=None, -): + data: DataType, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], + meta: Optional[str] = None, + meta_type: Optional[NumpyDType] = None, +) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]: """Validate feature names and types if data table""" if meta and data.shape[1] > 1: raise ValueError('DataTable for meta info cannot have multiple columns') @@ -482,13 +494,13 @@ def _transform_dt_df( def _from_dt_df( - data, - missing, - nthread, - feature_names: FeatureNames, - feature_types: FeatureTypes, + data: DataType, + missing: Optional[FloatCompatible], + nthread: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], enable_categorical: bool, -) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]: +) -> DispatchedDataBackendReturnType: if enable_categorical: raise ValueError("categorical data in datatable is not supported yet.") data, feature_names, feature_types = _transform_dt_df( @@ -525,7 +537,7 @@ def _from_dt_df( return handle, feature_names, feature_types -def _is_arrow(data) -> bool: +def _is_arrow(data: DataType) -> bool: try: import pyarrow as pa from pyarrow import dataset as arrow_dataset @@ -571,13 +583,13 @@ def _next(data_handle: int) -> int: def _from_arrow( - data, - missing: float, + data: DataType, + missing: FloatCompatible, nthread: int, - feature_names: FeatureNames, - feature_types: FeatureTypes, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], enable_categorical: bool, -) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]: +) -> DispatchedDataBackendReturnType: import pyarrow as pa if not all( @@ -605,11 +617,11 @@ def _from_arrow( return handle, feature_names, feature_types -def _is_cudf_df(data) -> bool: +def _is_cudf_df(data: DataType) -> bool: return lazy_isinstance(data, "cudf.core.dataframe", "DataFrame") -def _cudf_array_interfaces(data, cat_codes: list) -> bytes: +def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes: """Extract CuDF __cuda_array_interface__. This is special as it returns a new list of data and a list of array interfaces. The data is list of categorical codes that caller can safely ignore, but have to keep their reference alive until usage of array @@ -645,11 +657,11 @@ def _cudf_array_interfaces(data, cat_codes: list) -> bytes: def _transform_cudf_df( - data, - feature_names: FeatureNames, - feature_types: FeatureTypes, + data: DataType, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], enable_categorical: bool, -): +) -> Tuple[ctypes.c_void_p, list, Optional[FeatureNames], Optional[FeatureTypes]]: try: from cudf.api.types import is_categorical_dtype except ImportError: @@ -709,13 +721,13 @@ def _transform_cudf_df( def _from_cudf_df( - data, - missing, - nthread, - feature_names: FeatureNames, - feature_types: FeatureTypes, + data: DataType, + missing: FloatCompatible, + nthread: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], enable_categorical: bool, -) -> Tuple[ctypes.c_void_p, Any, Any]: +) -> DispatchedDataBackendReturnType: data, cat_codes, feature_names, feature_types = _transform_cudf_df( data, feature_names, feature_types, enable_categorical ) @@ -732,7 +744,7 @@ def _from_cudf_df( return handle, feature_names, feature_types -def _is_cudf_ser(data): +def _is_cudf_ser(data: DataType) -> bool: try: import cudf except ImportError: @@ -740,13 +752,13 @@ def _is_cudf_ser(data): return isinstance(data, cudf.Series) -def _is_cupy_array(data: Any) -> bool: +def _is_cupy_array(data: DataType) -> bool: return lazy_isinstance(data, "cupy.core.core", "ndarray") or lazy_isinstance( data, "cupy._core.core", "ndarray" ) -def _transform_cupy_array(data): +def _transform_cupy_array(data: DataType) -> CupyT: import cupy # pylint: disable=import-error if not hasattr(data, '__cuda_array_interface__') and hasattr( data, '__array__'): @@ -757,12 +769,12 @@ def _transform_cupy_array(data): def _from_cupy_array( - data, - missing, - nthread, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + data: DataType, + missing: FloatCompatible, + nthread: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: """Initialize DMatrix from cupy ndarray.""" data = _transform_cupy_array(data) interface_str = _cuda_array_interface(data) @@ -776,7 +788,7 @@ def _from_cupy_array( return handle, feature_names, feature_types -def _is_cupy_csr(data): +def _is_cupy_csr(data: DataType) -> bool: try: import cupyx except ImportError: @@ -784,7 +796,7 @@ def _is_cupy_csr(data): return isinstance(data, cupyx.scipy.sparse.csr_matrix) -def _is_cupy_csc(data): +def _is_cupy_csc(data: DataType) -> bool: try: import cupyx except ImportError: @@ -792,11 +804,11 @@ def _is_cupy_csc(data): return isinstance(data, cupyx.scipy.sparse.csc_matrix) -def _is_dlpack(data): +def _is_dlpack(data: DataType) -> bool: return 'PyCapsule' in str(type(data)) and "dltensor" in str(data) -def _transform_dlpack(data): +def _transform_dlpack(data: DataType) -> bool: from cupy import fromDlpack # pylint: disable=E0401 assert 'used_dltensor' not in str(data) data = fromDlpack(data) @@ -804,27 +816,27 @@ def _transform_dlpack(data): def _from_dlpack( - data, - missing, - nthread, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + data: DataType, + missing: FloatCompatible, + nthread: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: data = _transform_dlpack(data) return _from_cupy_array(data, missing, nthread, feature_names, feature_types) -def _is_uri(data): +def _is_uri(data: DataType) -> bool: return isinstance(data, (str, os.PathLike)) def _from_uri( - data, - missing, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + data: DataType, + missing: Optional[FloatCompatible], + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: _warn_unused_missing(data, missing) handle = ctypes.c_void_p() data = os.fspath(os.path.expanduser(data)) @@ -834,51 +846,51 @@ def _from_uri( return handle, feature_names, feature_types -def _is_list(data): +def _is_list(data: DataType) -> bool: return isinstance(data, list) def _from_list( - data, - missing, - n_threads, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + data: Sequence, + missing: FloatCompatible, + n_threads: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: array = np.array(data) _check_data_shape(data) return _from_numpy_array(array, missing, n_threads, feature_names, feature_types) -def _is_tuple(data): +def _is_tuple(data: DataType) -> bool: return isinstance(data, tuple) def _from_tuple( - data, - missing, - n_threads, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + data: Sequence, + missing: FloatCompatible, + n_threads: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: return _from_list(data, missing, n_threads, feature_names, feature_types) -def _is_iter(data): +def _is_iter(data: DataType) -> bool: return isinstance(data, DataIter) -def _has_array_protocol(data): +def _has_array_protocol(data: DataType) -> bool: return hasattr(data, '__array__') -def _convert_unknown_data(data): +def _convert_unknown_data(data: DataType) -> DataType: warnings.warn( f'Unknown data type: {type(data)}, trying to convert it to csr_matrix', UserWarning ) try: - import scipy + import scipy.sparse except ImportError: return None @@ -891,13 +903,13 @@ def _convert_unknown_data(data): def dispatch_data_backend( - data, - missing, - threads, - feature_names: FeatureNames, - feature_types: FeatureTypes, + data: DataType, + missing: FloatCompatible, # Or Optional[Float] + threads: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], enable_categorical: bool = False, -): +) -> DispatchedDataBackendReturnType: '''Dispatch data for DMatrix.''' if not _is_cudf_ser(data) and not _is_pandas_series(data): _check_data_shape(data) @@ -964,7 +976,7 @@ def dispatch_data_backend( raise TypeError('Not supported type for data.' + str(type(data))) -def _to_data_type(dtype: str, name: str): +def _to_data_type(dtype: str, name: str) -> int: dtype_map = {'float32': 1, 'float64': 2, 'uint32': 3, 'uint64': 4} if dtype not in dtype_map: raise TypeError( @@ -973,7 +985,7 @@ def _to_data_type(dtype: str, name: str): return dtype_map[dtype] -def _validate_meta_shape(data: Any, name: str) -> None: +def _validate_meta_shape(data: DataType, name: str) -> None: if hasattr(data, "shape"): msg = f"Invalid shape: {data.shape} for {name}" if name in _matrix_meta: @@ -990,7 +1002,7 @@ def _validate_meta_shape(data: Any, name: str) -> None: def _meta_from_numpy( data: np.ndarray, field: str, - dtype: Optional[Union[np.dtype, str]], + dtype: Optional[NumpyDType], handle: ctypes.c_void_p, ) -> None: data, dtype = _ensure_np_dtype(data, dtype) @@ -1001,16 +1013,26 @@ def _meta_from_numpy( _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str)) -def _meta_from_list(data, field, dtype, handle): - data = np.array(data) - _meta_from_numpy(data, field, dtype, handle) +def _meta_from_list( + data: Sequence, + field: str, + dtype: Optional[NumpyDType], + handle: ctypes.c_void_p +) -> None: + data_np = np.array(data) + _meta_from_numpy(data_np, field, dtype, handle) -def _meta_from_tuple(data, field, dtype, handle): +def _meta_from_tuple( + data: Sequence, + field: str, + dtype: Optional[NumpyDType], + handle: ctypes.c_void_p +) -> None: return _meta_from_list(data, field, dtype, handle) -def _meta_from_cudf_df(data, field: str, handle: ctypes.c_void_p) -> None: +def _meta_from_cudf_df(data: DataType, field: str, handle: ctypes.c_void_p) -> None: if field not in _matrix_meta: _meta_from_cudf_series(data.iloc[:, 0], field, handle) else: @@ -1019,7 +1041,7 @@ def _meta_from_cudf_df(data, field: str, handle: ctypes.c_void_p) -> None: _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface)) -def _meta_from_cudf_series(data, field, handle): +def _meta_from_cudf_series(data: DataType, field: str, handle: ctypes.c_void_p) -> None: interface = bytes(json.dumps([data.__cuda_array_interface__], indent=2), 'utf-8') _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, @@ -1027,7 +1049,7 @@ def _meta_from_cudf_series(data, field, handle): interface)) -def _meta_from_cupy_array(data, field, handle): +def _meta_from_cupy_array(data: DataType, field: str, handle: ctypes.c_void_p) -> None: data = _transform_cupy_array(data) interface = bytes(json.dumps([data.__cuda_array_interface__], indent=2), 'utf-8') @@ -1036,14 +1058,22 @@ def _meta_from_cupy_array(data, field, handle): interface)) -def _meta_from_dt(data, field: str, dtype, handle: ctypes.c_void_p): +def _meta_from_dt( + data: DataType, + field: str, + dtype: Optional[NumpyDType], + handle: ctypes.c_void_p +) -> None: data, _, _ = _transform_dt_df(data, None, None, field, dtype) _meta_from_numpy(data, field, dtype, handle) def dispatch_meta_backend( - matrix: DMatrix, data, name: str, dtype: Optional[Union[str, np.dtype]] = None -): + matrix: DMatrix, + data: DataType, + name: str, + dtype: Optional[NumpyDType] = None +) -> None: '''Dispatch for meta info.''' handle = matrix.handle assert handle is not None @@ -1060,8 +1090,7 @@ def dispatch_meta_backend( _meta_from_numpy(data, name, dtype, handle) return if _is_pandas_df(data): - data, _, _ = _transform_pandas_df(data, False, meta=name, - meta_type=dtype) + data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype) _meta_from_numpy(data, name, dtype, handle) return if _is_pandas_series(data): @@ -1107,7 +1136,7 @@ class SingleBatchInternalIter(DataIter): # pylint: disable=R0902 area for meta info. ''' - def __init__(self, **kwargs: Any): + def __init__(self, **kwargs: Any) -> None: self.kwargs = kwargs self.it = 0 # pylint: disable=invalid-name super().__init__() @@ -1124,11 +1153,13 @@ def reset(self) -> None: def _proxy_transform( - data, - feature_names: FeatureNames, - feature_types: FeatureTypes, + data: DataType, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], enable_categorical: bool, -): +) -> Tuple[ + Union[bool, ctypes.c_void_p, np.ndarray], + Optional[list], Optional[FeatureNames], Optional[FeatureTypes]]: if _is_cudf_df(data) or _is_cudf_ser(data): return _transform_cudf_df( data, feature_names, feature_types, enable_categorical @@ -1152,7 +1183,7 @@ def _proxy_transform( def dispatch_proxy_set_data( proxy: _ProxyDMatrix, - data: Any, + data: DataType, cat_codes: Optional[list], allow_host: bool, ) -> None: @@ -1162,11 +1193,11 @@ def dispatch_proxy_set_data( if _is_cudf_df(data): # pylint: disable=W0212 - proxy._set_data_from_cuda_columnar(data, cat_codes) + proxy._set_data_from_cuda_columnar(data, cast(List, cat_codes)) return if _is_cudf_ser(data): # pylint: disable=W0212 - proxy._set_data_from_cuda_columnar(data, cat_codes) + proxy._set_data_from_cuda_columnar(data, cast(List, cat_codes)) return if _is_cupy_array(data): proxy._set_data_from_cuda_interface(data) # pylint: disable=W0212 diff --git a/python-package/xgboost/plotting.py b/python-package/xgboost/plotting.py index 75159d10434e..85a8428bc181 100644 --- a/python-package/xgboost/plotting.py +++ b/python-package/xgboost/plotting.py @@ -4,16 +4,34 @@ """Plotting Library.""" from io import BytesIO import json +from typing import Optional, Any + import numpy as np + +from ._typing import PathLike from .core import Booster from .sklearn import XGBModel - -def plot_importance(booster, ax=None, height=0.2, - xlim=None, ylim=None, title='Feature importance', - xlabel='F score', ylabel='Features', fmap='', - importance_type='weight', max_num_features=None, - grid=True, show_values=True, **kwargs): +Axes = Any # real type is matplotlib.axes.Axes +GraphvizSource = Any # real type is graphviz.Source + + +def plot_importance( + booster: Booster, + ax: Optional[Axes] = None, + height: float = 0.2, + xlim: Optional[tuple] = None, + ylim: Optional[tuple] = None, + title: str = "Feature importance", + xlabel: str = "F score", + ylabel: str = "Features", + fmap: PathLike = "", + importance_type: str = "weight", + max_num_features: Optional[int] = None, + grid: bool = True, + show_values: bool = True, + **kwargs: Any +) -> Axes: """Plot importance based on fitted trees. Parameters @@ -78,9 +96,9 @@ def plot_importance(booster, ax=None, height=0.2, tuples = [(k, importance[k]) for k in importance] if max_num_features is not None: # pylint: disable=invalid-unary-operand-type - tuples = sorted(tuples, key=lambda x: x[1])[-max_num_features:] + tuples = sorted(tuples, key=lambda _x: _x[1])[-max_num_features:] else: - tuples = sorted(tuples, key=lambda x: x[1]) + tuples = sorted(tuples, key=lambda _x: _x[1]) labels, values = zip(*tuples) if ax is None: @@ -120,9 +138,17 @@ def plot_importance(booster, ax=None, height=0.2, return ax -def to_graphviz(booster, fmap='', num_trees=0, rankdir=None, - yes_color=None, no_color=None, - condition_node_params=None, leaf_node_params=None, **kwargs): +def to_graphviz( + booster: Booster, + fmap: PathLike = "", + num_trees: int = 0, + rankdir: Optional[str] = None, + yes_color: Optional[str] = None, + no_color: Optional[str] = None, + condition_node_params: Optional[dict] = None, + leaf_node_params: Optional[dict] = None, + **kwargs: Any +) -> GraphvizSource: """Convert specified tree to graphviz instance. IPython can automatically plot the returned graphiz instance. Otherwise, you should call .render() method of the returned graphiz instance. @@ -212,7 +238,14 @@ def to_graphviz(booster, fmap='', num_trees=0, rankdir=None, return g -def plot_tree(booster, fmap='', num_trees=0, rankdir=None, ax=None, **kwargs): +def plot_tree( + booster: Booster, + fmap: PathLike = "", + num_trees: int = 0, + rankdir: Optional[str] = None, + ax: Optional[Axes] = None, + **kwargs: Any +) -> Axes: """Plot specified tree. Parameters diff --git a/python-package/xgboost/rabit.py b/python-package/xgboost/rabit.py index 465a5611a2d1..f5da7a353330 100644 --- a/python-package/xgboost/rabit.py +++ b/python-package/xgboost/rabit.py @@ -230,7 +230,9 @@ def version_number() -> int: class RabitContext: """A context controlling rabit initialization and finalization.""" - def __init__(self, args: List[bytes]) -> None: + def __init__(self, args: List[bytes] = None) -> None: + if args is None: + args = [] self.args = args def __enter__(self) -> None: diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index ae883e30ee17..f6b43d8de448 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -4,8 +4,19 @@ import warnings import json import os -from typing import Union, Optional, List, Dict, Callable, Tuple, Any, TypeVar, Type, cast -from typing import Sequence +from typing import ( + Union, + Optional, + List, + Dict, + Callable, + Sequence, + Tuple, + Any, + TypeVar, + Type, + cast, +) import numpy as np from .core import Booster, DMatrix, XGBoostError @@ -14,7 +25,7 @@ from .training import train from .callback import TrainingCallback from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array -from ._typing import ArrayLike, FeatureTypes +from ._typing import ArrayLike, FeatureNames, FeatureTypes # Do not use class names on scikit-learn directly. Re-define the classes on # .compat to guarantee the behavior without scikit-learn @@ -401,7 +412,7 @@ def _wrap_evaluation_matrices( eval_qid: Optional[Sequence[Any]], create_dmatrix: Callable, enable_categorical: bool, - feature_types: FeatureTypes, + feature_types: Optional[FeatureTypes], ) -> Tuple[Any, List[Tuple[Any, str]]]: """Convert array_like evaluation matrices into DMatrix. Perform validation on the way. @@ -717,7 +728,7 @@ def _get_type(self) -> str: return self._estimator_type # pylint: disable=no-member def save_model(self, fname: Union[str, os.PathLike]) -> None: - meta = {} + meta: Dict[str, Any] = {} for k, v in self.__dict__.items(): if k == '_le': meta['_le'] = self._le.to_json() @@ -1231,7 +1242,7 @@ def dft() -> str: importance_type=self.importance_type if self.importance_type else dft() ) if b.feature_names is None: - feature_names = [f"f{i}" for i in range(self.n_features_in_)] + feature_names: FeatureNames = [f"f{i}" for i in range(self.n_features_in_)] else: feature_names = b.feature_names # gblinear returns all features so the `get` in next line is only for gbtree. diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py index 38567b6bf949..2103303fbe20 100644 --- a/python-package/xgboost/training.py +++ b/python-package/xgboost/training.py @@ -5,20 +5,24 @@ import copy import os import warnings -from typing import Optional, Dict, Any, Union, Tuple, Sequence +from typing import Optional, Dict, Any, Union, Tuple, Sequence, List, cast, Iterable import numpy as np + +from .callback import TrainingCallback, CallbackContainer, EvaluationMonitor, EarlyStopping from .core import Booster, DMatrix, XGBoostError, _deprecate_positional_args from .core import Metric, Objective -from .compat import (SKLEARN_INSTALLED, XGBStratifiedKFold) -from . import callback +from .compat import SKLEARN_INSTALLED, XGBStratifiedKFold, DataFrame +from ._typing import _F, FPreProcCallable, BoosterParam + +_CVFolds = Sequence["CVPack"] def _assert_new_callback( - callbacks: Optional[Sequence[callback.TrainingCallback]] + callbacks: Optional[Sequence[TrainingCallback]] ) -> None: is_new_callback: bool = not callbacks or all( - isinstance(c, callback.TrainingCallback) for c in callbacks + isinstance(c, TrainingCallback) for c in callbacks ) if not is_new_callback: link = "https://xgboost.readthedocs.io/en/latest/python/callbacks.html" @@ -56,10 +60,10 @@ def train( feval: Optional[Metric] = None, maximize: Optional[bool] = None, early_stopping_rounds: Optional[int] = None, - evals_result: callback.TrainingCallback.EvalsLog = None, + evals_result: TrainingCallback.EvalsLog = None, verbose_eval: Optional[Union[bool, int]] = True, xgb_model: Optional[Union[str, os.PathLike, Booster, bytearray]] = None, - callbacks: Optional[Sequence[callback.TrainingCallback]] = None, + callbacks: Optional[Sequence[TrainingCallback]] = None, custom_metric: Optional[Metric] = None, ) -> Booster: """Train a booster with given parameters. @@ -159,12 +163,12 @@ def train( _assert_new_callback(callbacks) if verbose_eval: verbose_eval = 1 if verbose_eval is True else verbose_eval - callbacks.append(callback.EvaluationMonitor(period=verbose_eval)) + callbacks.append(EvaluationMonitor(period=verbose_eval)) if early_stopping_rounds: callbacks.append( - callback.EarlyStopping(rounds=early_stopping_rounds, maximize=maximize) + EarlyStopping(rounds=early_stopping_rounds, maximize=maximize) ) - cb_container = callback.CallbackContainer( + cb_container = CallbackContainer( callbacks, metric=metric_fn, # For old `feval` parameter, the behavior is unchanged. For the new @@ -194,71 +198,73 @@ def train( class CVPack: """"Auxiliary datastruct to hold one fold of CV.""" - def __init__(self, dtrain, dtest, param): + def __init__(self, dtrain: DMatrix, dtest: DMatrix, param: Optional[Union[Dict, List]]) -> None: """"Initialize the CVPack""" self.dtrain = dtrain self.dtest = dtest self.watchlist = [(dtrain, 'train'), (dtest, 'test')] self.bst = Booster(param, [dtrain, dtest]) - def __getattr__(self, name): - def _inner(*args, **kwargs): + def __getattr__(self, name: str) -> _F: + def _inner(*args: Any, **kwargs: Any) -> Any: return getattr(self.bst, name)(*args, **kwargs) - return _inner + return cast(_F, _inner) - def update(self, iteration, fobj): + def update(self, iteration: int, fobj: Optional[Objective]) -> None: """"Update the boosters for one iteration""" self.bst.update(self.dtrain, iteration, fobj) - def eval(self, iteration, feval, output_margin): + def eval(self, iteration: int, feval: Optional[Metric], output_margin: bool) -> str: """"Evaluate the CVPack for one iteration.""" return self.bst.eval_set(self.watchlist, iteration, feval, output_margin) class _PackedBooster: - def __init__(self, cvfolds) -> None: + def __init__(self, cvfolds: _CVFolds) -> None: self.cvfolds = cvfolds - def update(self, iteration, obj): + def update(self, iteration: int, obj: Optional[Objective]) -> None: '''Iterate through folds for update''' for fold in self.cvfolds: fold.update(iteration, obj) - def eval(self, iteration, feval, output_margin): + def eval(self, iteration: int, feval: Optional[Metric], output_margin: bool) -> List[str]: '''Iterate through folds for eval''' result = [f.eval(iteration, feval, output_margin) for f in self.cvfolds] return result - def set_attr(self, **kwargs): + def set_attr(self, **kwargs: Optional[str]) -> Any: '''Iterate through folds for setting attributes''' for f in self.cvfolds: f.bst.set_attr(**kwargs) - def attr(self, key): + def attr(self, key: str) -> Optional[str]: '''Redirect to booster attr.''' return self.cvfolds[0].bst.attr(key) - def set_param(self, params, value=None): + def set_param(self, + params: Union[Dict, Iterable[Tuple[str, Any]], str], + value: Optional[str] = None) -> None: """Iterate through folds for set_param""" for f in self.cvfolds: f.bst.set_param(params, value) - def num_boosted_rounds(self): + def num_boosted_rounds(self) -> int: '''Number of boosted rounds.''' return self.cvfolds[0].num_boosted_rounds() @property - def best_iteration(self): + def best_iteration(self) -> int: '''Get best_iteration''' - return int(self.cvfolds[0].bst.attr("best_iteration")) + return int(cast(int, self.cvfolds[0].bst.attr("best_iteration"))) @property - def best_score(self): + def best_score(self) -> float: """Get best_score.""" - return float(self.cvfolds[0].bst.attr("best_score")) + return float(cast(float, self.cvfolds[0].bst.attr("best_score"))) -def groups_to_rows(groups, boundaries): +def groups_to_rows(groups: List[np.ndarray], boundaries: np.ndarray) -> np.ndarray: """ Given group row boundaries, convert ground indexes to row indexes :param groups: list of groups for testing @@ -268,7 +274,9 @@ def groups_to_rows(groups, boundaries): return np.concatenate([np.arange(boundaries[g], boundaries[g+1]) for g in groups]) -def mkgroupfold(dall, nfold, param, evals=(), fpreproc=None, shuffle=True): +def mkgroupfold(dall: DMatrix, nfold: int, param: BoosterParam, + evals: Sequence[str] = (), fpreproc: FPreProcCallable = None, + shuffle: bool = True) -> List[CVPack]: """ Make n folds for cross-validation maintaining groups :return: cross-validation folds @@ -308,8 +316,10 @@ def mkgroupfold(dall, nfold, param, evals=(), fpreproc=None, shuffle=True): return ret -def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False, - folds=None, shuffle=True): +def mknfold(dall: DMatrix, nfold: int, param: BoosterParam, seed: int, + evals: Sequence[str] = (), fpreproc: FPreProcCallable = None, + stratified: bool = False, folds: XGBStratifiedKFold = None, shuffle: bool = True + ) -> List[CVPack]: """ Make an n-fold list of CVPack from random indices. """ @@ -362,11 +372,27 @@ def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False, return ret -def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None, - metrics=(), obj: Optional[Objective] = None, - feval=None, maximize=None, early_stopping_rounds=None, - fpreproc=None, as_pandas=True, verbose_eval=None, show_stdv=True, - seed=0, callbacks=None, shuffle=True, custom_metric: Optional[Metric] = None): +def cv( + params: BoosterParam, + dtrain: DMatrix, + num_boost_round: int = 10, + nfold: int = 3, + stratified: bool = False, + folds: XGBStratifiedKFold = None, + metrics: Sequence[str] = (), + obj: Optional[Objective] = None, + feval: Optional[Metric] = None, + maximize: bool = None, + early_stopping_rounds: int = None, + fpreproc: FPreProcCallable = None, + as_pandas: bool = True, + verbose_eval: Optional[Union[int, bool]] = None, + show_stdv: bool = True, + seed: int = 0, + callbacks: Sequence[TrainingCallback] = None, + shuffle: bool = True, + custom_metric: Optional[Metric] = None, +) -> Union[Dict[str, float], DataFrame]: # pylint: disable = invalid-name """Cross-validation with given parameters. @@ -477,7 +503,7 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None params.pop("eval_metric", None) - results = {} + results: Dict[str, List[float]] = {} cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc, stratified, folds, shuffle) @@ -490,13 +516,13 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None if verbose_eval: verbose_eval = 1 if verbose_eval is True else verbose_eval callbacks.append( - callback.EvaluationMonitor(period=verbose_eval, show_stdv=show_stdv) + EvaluationMonitor(period=verbose_eval, show_stdv=show_stdv) ) if early_stopping_rounds: callbacks.append( - callback.EarlyStopping(rounds=early_stopping_rounds, maximize=maximize) + EarlyStopping(rounds=early_stopping_rounds, maximize=maximize) ) - callbacks = callback.CallbackContainer( + callbacks_container = CallbackContainer( callbacks, metric=metric_fn, is_cv=True, @@ -504,16 +530,16 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None ) booster = _PackedBooster(cvfolds) - callbacks.before_training(booster) + callbacks_container.before_training(booster) for i in range(num_boost_round): - if callbacks.before_iteration(booster, i, dtrain, None): + if callbacks_container.before_iteration(booster, i, dtrain, None): break booster.update(i, obj) - should_break = callbacks.after_iteration(booster, i, dtrain, None) - res = callbacks.aggregated_cv - for key, mean, std in res: + should_break = callbacks_container.after_iteration(booster, i, dtrain, None) + res = callbacks_container.aggregated_cv + for key, mean, std in cast(List[Tuple[str, float, float]], res): if key + '-mean' not in results: results[key + '-mean'] = [] if key + '-std' not in results: @@ -532,6 +558,6 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None except ImportError: pass - callbacks.after_training(booster) + callbacks_container.after_training(booster) return results diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 3c7c539802fa..d72eb077b05f 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -300,7 +300,7 @@ XGProxyDMatrixSetDataCudaArrayInterface(DMatrixHandle handle, CHECK(p_m); auto m = static_cast(p_m->get()); CHECK(m) << "Current DMatrix type does not support set data."; - m->SetData(c_interface_str); + m->SetCUDAArray(c_interface_str); API_END(); } @@ -312,7 +312,7 @@ XGB_DLL int XGProxyDMatrixSetDataCudaColumnar(DMatrixHandle handle, CHECK(p_m); auto m = static_cast(p_m->get()); CHECK(m) << "Current DMatrix type does not support set data."; - m->SetData(c_interface_str); + m->SetCUDAArray(c_interface_str); API_END(); } @@ -825,74 +825,69 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle, API_END(); } -template -void InplacePredictImpl(std::shared_ptr x, std::shared_ptr p_m, - char const *c_json_config, Learner *learner, - size_t n_rows, size_t n_cols, - xgboost::bst_ulong const **out_shape, - xgboost::bst_ulong *out_dim, const float **out_result) { +void InplacePredictImpl(std::shared_ptr p_m, char const *c_json_config, Learner *learner, + xgboost::bst_ulong const **out_shape, xgboost::bst_ulong *out_dim, + const float **out_result) { auto config = Json::Load(StringView{c_json_config}); CHECK_EQ(get(config["cache_id"]), 0) << "Cache ID is not supported yet"; - HostDeviceVector* p_predt { nullptr }; + HostDeviceVector *p_predt{nullptr}; auto type = PredictionType(RequiredArg(config, "type", __func__)); float missing = GetMissing(config); - learner->InplacePredict(x, p_m, type, missing, &p_predt, + learner->InplacePredict(p_m, type, missing, &p_predt, RequiredArg(config, "iteration_begin", __func__), RequiredArg(config, "iteration_end", __func__)); CHECK(p_predt); auto &shape = learner->GetThreadLocal().prediction_shape; - auto chunksize = n_rows == 0 ? 0 : p_predt->Size() / n_rows; + auto const &info = p_m->Info(); + auto n_samples = info.num_row_; + auto n_features = info.num_col_; + auto chunksize = n_samples == 0 ? 0 : p_predt->Size() / n_samples; bool strict_shape = RequiredArg(config, "strict_shape", __func__); - CalcPredictShape(strict_shape, type, n_rows, n_cols, chunksize, learner->Groups(), + CalcPredictShape(strict_shape, type, n_samples, n_features, chunksize, learner->Groups(), learner->BoostedRounds(), &shape, out_dim); *out_result = dmlc::BeginPtr(p_predt->HostVector()); *out_shape = dmlc::BeginPtr(shape); } -// A hidden API as cache id is not being supported yet. -XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle, - char const *array_interface, - char const *c_json_config, - DMatrixHandle m, +XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle, char const *array_interface, + char const *c_json_config, DMatrixHandle m, xgboost::bst_ulong const **out_shape, - xgboost::bst_ulong *out_dim, - const float **out_result) { + xgboost::bst_ulong *out_dim, const float **out_result) { API_BEGIN(); CHECK_HANDLE(); - std::shared_ptr x{ - new xgboost::data::ArrayAdapter(StringView{array_interface})}; - std::shared_ptr p_m {nullptr}; - if (m) { + std::shared_ptr p_m{nullptr}; + if (!m) { + p_m.reset(new data::DMatrixProxy); + } else { p_m = *static_cast *>(m); } + auto proxy = dynamic_cast(p_m.get()); + CHECK(proxy) << "Invalid input type for inplace predict."; + proxy->SetArrayData(array_interface); auto *learner = static_cast(handle); - InplacePredictImpl(x, p_m, c_json_config, learner, x->NumRows(), - x->NumColumns(), out_shape, out_dim, out_result); + InplacePredictImpl(p_m, c_json_config, learner, out_shape, out_dim, out_result); API_END(); } -// A hidden API as cache id is not being supported yet. -XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr, - char const *indices, char const *data, - xgboost::bst_ulong cols, +XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr, char const *indices, + char const *data, xgboost::bst_ulong cols, char const *c_json_config, DMatrixHandle m, xgboost::bst_ulong const **out_shape, - xgboost::bst_ulong *out_dim, - const float **out_result) { + xgboost::bst_ulong *out_dim, const float **out_result) { API_BEGIN(); CHECK_HANDLE(); - std::shared_ptr x{ - new xgboost::data::CSRArrayAdapter{StringView{indptr}, - StringView{indices}, StringView{data}, - static_cast(cols)}}; - std::shared_ptr p_m {nullptr}; - if (m) { + std::shared_ptr p_m{nullptr}; + if (!m) { + p_m.reset(new data::DMatrixProxy); + } else { p_m = *static_cast *>(m); } + auto proxy = dynamic_cast(p_m.get()); + CHECK(proxy) << "Invalid input type for inplace predict."; + proxy->SetCSRData(indptr, indices, data, cols, true); auto *learner = static_cast(handle); - InplacePredictImpl(x, p_m, c_json_config, learner, x->NumRows(), - x->NumColumns(), out_shape, out_dim, out_result); + InplacePredictImpl(p_m, c_json_config, learner, out_shape, out_dim, out_result); API_END(); } diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu index 80408ba466eb..c3b303fa447f 100644 --- a/src/c_api/c_api.cu +++ b/src/c_api/c_api.cu @@ -1,10 +1,11 @@ -// Copyright (c) 2019-2021 by Contributors -#include "xgboost/data.h" -#include "xgboost/c_api.h" -#include "xgboost/learner.h" +// Copyright (c) 2019-2022 by Contributors +#include "../data/device_adapter.cuh" +#include "../data/proxy_dmatrix.h" #include "c_api_error.h" #include "c_api_utils.h" -#include "../data/device_adapter.cuh" +#include "xgboost/c_api.h" +#include "xgboost/data.h" +#include "xgboost/learner.h" namespace xgboost { @@ -85,62 +86,65 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data, API_END(); } -template -int InplacePreidctCuda(BoosterHandle handle, char const *c_json_strs, - char const *c_json_config, - std::shared_ptr p_m, - xgboost::bst_ulong const **out_shape, - xgboost::bst_ulong *out_dim, const float **out_result) { +int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface, + char const *c_json_config, std::shared_ptr p_m, + xgboost::bst_ulong const **out_shape, xgboost::bst_ulong *out_dim, + const float **out_result) { API_BEGIN(); CHECK_HANDLE(); + if (!p_m) { + p_m.reset(new data::DMatrixProxy); + } + auto proxy = dynamic_cast(p_m.get()); + CHECK(proxy) << "Invalid input type for inplace predict."; + proxy->SetCUDAArray(c_array_interface); + auto config = Json::Load(StringView{c_json_config}); - CHECK_EQ(get(config["cache_id"]), 0) - << "Cache ID is not supported yet"; + CHECK_EQ(get(config["cache_id"]), 0) << "Cache ID is not supported yet"; auto *learner = static_cast(handle); - std::string json_str{c_json_strs}; - auto x = std::make_shared(json_str); HostDeviceVector *p_predt{nullptr}; - auto type = PredictionType(get(config["type"])); + auto type = PredictionType(RequiredArg(config, "type", __func__)); float missing = GetMissing(config); - learner->InplacePredict(x, p_m, type, missing, &p_predt, - get(config["iteration_begin"]), - get(config["iteration_end"])); + learner->InplacePredict(p_m, type, missing, &p_predt, + RequiredArg(config, "iteration_begin", __func__), + RequiredArg(config, "iteration_end", __func__)); CHECK(p_predt); CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead()); auto &shape = learner->GetThreadLocal().prediction_shape; - auto chunksize = x->NumRows() == 0 ? 0 : p_predt->Size() / x->NumRows(); - bool strict_shape = get(config["strict_shape"]); - CalcPredictShape(strict_shape, type, x->NumRows(), x->NumColumns(), chunksize, - learner->Groups(), learner->BoostedRounds(), &shape, - out_dim); + size_t n_samples = p_m->Info().num_row_; + auto chunksize = n_samples == 0 ? 0 : p_predt->Size() / n_samples; + bool strict_shape = RequiredArg(config, "strict_shape", __func__); + CalcPredictShape(strict_shape, type, n_samples, p_m->Info().num_col_, chunksize, + learner->Groups(), learner->BoostedRounds(), &shape, out_dim); *out_shape = dmlc::BeginPtr(shape); *out_result = p_predt->ConstDevicePointer(); API_END(); } -XGB_DLL int XGBoosterPredictFromCudaColumnar( - BoosterHandle handle, char const *c_json_strs, char const *c_json_config, - DMatrixHandle m, xgboost::bst_ulong const **out_shape, - xgboost::bst_ulong *out_dim, const float **out_result) { - std::shared_ptr p_m {nullptr}; +XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *c_json_strs, + char const *c_json_config, DMatrixHandle m, + xgboost::bst_ulong const **out_shape, + xgboost::bst_ulong *out_dim, + const float **out_result) { + std::shared_ptr p_m{nullptr}; if (m) { p_m = *static_cast *>(m); } - return InplacePreidctCuda( - handle, c_json_strs, c_json_config, p_m, out_shape, out_dim, out_result); + return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim, + out_result); } -XGB_DLL int XGBoosterPredictFromCudaArray( - BoosterHandle handle, char const *c_json_strs, char const *c_json_config, - DMatrixHandle m, xgboost::bst_ulong const **out_shape, - xgboost::bst_ulong *out_dim, const float **out_result) { - std::shared_ptr p_m {nullptr}; +XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *c_json_strs, + char const *c_json_config, DMatrixHandle m, + xgboost::bst_ulong const **out_shape, + xgboost::bst_ulong *out_dim, const float **out_result) { + std::shared_ptr p_m{nullptr}; if (m) { p_m = *static_cast *>(m); } - return InplacePreidctCuda( - handle, c_json_strs, c_json_config, p_m, out_shape, out_dim, out_result); + return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim, + out_result); } diff --git a/src/data/adapter.h b/src/data/adapter.h index 4025ccd8e996..e6cb6d8b9068 100644 --- a/src/data/adapter.h +++ b/src/data/adapter.h @@ -1131,6 +1131,24 @@ class RecordBatchesIterAdapter: public dmlc::DataIter { struct ArrowSchemaImporter schema_; ArrowColumnarBatchVec batches_; }; + +class SparsePageAdapterBatch { + HostSparsePageView page_; + + public: + struct Line { + SparsePage::Inst inst; + bst_row_t ridx; + COOTuple GetElement(size_t idx) const { + return COOTuple{ridx, inst.data()[idx].index, inst.data()[idx].fvalue}; + } + size_t Size() const { return inst.size(); } + }; + + explicit SparsePageAdapterBatch(HostSparsePageView page) : page_{std::move(page)} {} + Line GetLine(size_t ridx) const { return Line{page_[ridx], ridx}; } + size_t Size() const { return page_.Size(); } +}; }; // namespace data } // namespace xgboost #endif // XGBOOST_DATA_ADAPTER_H_ diff --git a/src/data/proxy_dmatrix.h b/src/data/proxy_dmatrix.h index 8a6f67f144d0..8744bbf776f5 100644 --- a/src/data/proxy_dmatrix.h +++ b/src/data/proxy_dmatrix.h @@ -55,7 +55,7 @@ class DMatrixProxy : public DMatrix { public: int DeviceIdx() const { return ctx_.gpu_id; } - void SetData(char const* c_interface) { + void SetCUDAArray(char const* c_interface) { common::AssertGPUSupport(); #if defined(XGBOOST_USE_CUDA) std::string interface_str = c_interface; diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index bb7c341f8beb..8f8facc5392b 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -795,88 +795,75 @@ class Dart : public GBTree { this->PredictBatchImpl(p_fmat, p_out_preds, training, layer_begin, layer_end); } - void InplacePredict(dmlc::any const &x, std::shared_ptr p_m, - float missing, PredictionCacheEntry *out_preds, - uint32_t layer_begin, unsigned layer_end) const override { + void InplacePredict(std::shared_ptr p_fmat, float missing, + PredictionCacheEntry* p_out_preds, uint32_t layer_begin, + unsigned layer_end) const override { uint32_t tree_begin, tree_end; std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end); - std::vector predictors{ + auto n_groups = model_.learner_model_param->num_output_group; + + std::vector predictors { cpu_predictor_.get(), #if defined(XGBOOST_USE_CUDA) gpu_predictor_.get() #endif // defined(XGBOOST_USE_CUDA) }; - Predictor const * predictor {nullptr}; - - MetaInfo info; + Predictor const* predictor{nullptr}; StringView msg{"Unsupported data type for inplace predict."}; - int32_t device = GenericParameter::kCpuId; + PredictionCacheEntry predts; - // Inplace predict is not used for training, so no need to drop tree. - for (size_t i = tree_begin; i < tree_end; ++i) { + if (ctx_->gpu_id != Context::kCpuId) { + predts.predictions.SetDevice(ctx_->gpu_id); + } + predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0); + + auto predict_impl = [&](size_t i) { + predts.predictions.Fill(0); if (tparam_.predictor == PredictorType::kAuto) { // Try both predictor implementations bool success = false; - for (auto const &p : predictors) { - if (p && p->InplacePredict(x, nullptr, model_, missing, &predts, i, - i + 1)) { + for (auto const& p : predictors) { + if (p && p->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1)) { success = true; predictor = p; -#if defined(XGBOOST_USE_CUDA) - device = predts.predictions.DeviceIdx(); -#endif // defined(XGBOOST_USE_CUDA) break; } } CHECK(success) << msg; } else { - // No base margin from meta info for each tree predictor = this->GetPredictor().get(); - bool success = predictor->InplacePredict(x, nullptr, model_, missing, - &predts, i, i + 1); - device = predts.predictions.DeviceIdx(); + bool success = predictor->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1); CHECK(success) << msg << std::endl << "Current Predictor: " - << (tparam_.predictor == PredictorType::kCPUPredictor - ? "cpu_predictor" - : "gpu_predictor"); + << (tparam_.predictor == PredictorType::kCPUPredictor ? "cpu_predictor" + : "gpu_predictor"); } + }; - auto w = this->weight_drop_.at(i); - size_t n_groups = model_.learner_model_param->num_output_group; - auto n_rows = predts.predictions.Size() / n_groups; - + // Inplace predict is not used for training, so no need to drop tree. + for (size_t i = tree_begin; i < tree_end; ++i) { + predict_impl(i); if (i == tree_begin) { - // base margin is added here. - if (p_m) { - p_m->Info().num_row_ = n_rows; - predictor->InitOutPredictions(p_m->Info(), &out_preds->predictions, - model_); - } else { - info.num_row_ = n_rows; - predictor->InitOutPredictions(info, &out_preds->predictions, model_); - } + predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, model_); } - // Multiple the tree weight - CHECK_EQ(predts.predictions.Size(), out_preds->predictions.Size()); + auto w = this->weight_drop_.at(i); auto group = model_.tree_info.at(i); + CHECK_EQ(predts.predictions.Size(), p_out_preds->predictions.Size()); - if (device == GenericParameter::kCpuId) { - auto &h_predts = predts.predictions.HostVector(); - auto &h_out_predts = out_preds->predictions.HostVector(); + size_t n_rows = p_fmat->Info().num_row_; + if (predts.predictions.DeviceIdx() != Context::kCpuId) { + p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx()); + GPUDartInplacePredictInc(p_out_preds->predictions.DeviceSpan(), + predts.predictions.DeviceSpan(), w, n_rows, + model_.learner_model_param->base_score, n_groups, group); + } else { + auto& h_predts = predts.predictions.HostVector(); + auto& h_out_predts = p_out_preds->predictions.HostVector(); common::ParallelFor(n_rows, ctx_->Threads(), [&](auto ridx) { const size_t offset = ridx * n_groups + group; - // Need to remove the base margin from individual tree. h_out_predts[offset] += (h_predts[offset] - model_.learner_model_param->base_score) * w; }); - } else { - out_preds->predictions.SetDevice(device); - predts.predictions.SetDevice(device); - GPUDartInplacePredictInc(out_preds->predictions.DeviceSpan(), - predts.predictions.DeviceSpan(), w, n_rows, - model_.learner_model_param->base_score, - n_groups, group); } } } diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h index 020b7d0cb9c0..0d2d025e5250 100644 --- a/src/gbm/gbtree.h +++ b/src/gbm/gbtree.h @@ -261,8 +261,7 @@ class GBTree : public GradientBooster { void PredictBatch(DMatrix *p_fmat, PredictionCacheEntry *out_preds, bool training, unsigned layer_begin, unsigned layer_end) override; - void InplacePredict(dmlc::any const &x, std::shared_ptr p_m, - float missing, PredictionCacheEntry *out_preds, + void InplacePredict(std::shared_ptr p_m, float missing, PredictionCacheEntry* out_preds, uint32_t layer_begin, unsigned layer_end) const override { CHECK(configured_); uint32_t tree_begin, tree_end; @@ -278,15 +277,14 @@ class GBTree : public GradientBooster { if (tparam_.predictor == PredictorType::kAuto) { // Try both predictor implementations for (auto const &p : predictors) { - if (p && p->InplacePredict(x, p_m, model_, missing, out_preds, - tree_begin, tree_end)) { + if (p && p->InplacePredict(p_m, model_, missing, out_preds, tree_begin, tree_end)) { return; } } LOG(FATAL) << msg; } else { - bool success = this->GetPredictor()->InplacePredict( - x, p_m, model_, missing, out_preds, tree_begin, tree_end); + bool success = this->GetPredictor()->InplacePredict(p_m, model_, missing, out_preds, + tree_begin, tree_end); CHECK(success) << msg << std::endl << "Current Predictor: " << (tparam_.predictor == PredictorType::kCPUPredictor diff --git a/src/learner.cc b/src/learner.cc index 568cfc680714..5d7d067e71e2 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -1277,15 +1277,12 @@ class LearnerImpl : public LearnerIO { return (*LearnerAPIThreadLocalStore::Get())[this]; } - void InplacePredict(dmlc::any const &x, std::shared_ptr p_m, - PredictionType type, float missing, - HostDeviceVector **out_preds, - uint32_t iteration_begin, + void InplacePredict(std::shared_ptr p_m, PredictionType type, float missing, + HostDeviceVector** out_preds, uint32_t iteration_begin, uint32_t iteration_end) override { this->Configure(); auto& out_predictions = this->GetThreadLocal().prediction_entry; - this->gbm_->InplacePredict(x, p_m, missing, &out_predictions, - iteration_begin, iteration_end); + this->gbm_->InplacePredict(p_m, missing, &out_predictions, iteration_begin, iteration_end); if (type == PredictionType::kValue) { obj_->PredTransform(&out_predictions.predictions); } else if (type == PredictionType::kMargin) { diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index 892c956319bb..b5dd9b4af12b 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -1,27 +1,27 @@ /*! * Copyright by Contributors 2017-2021 */ -#include #include +#include #include #include #include +#include "../common/categorical.h" +#include "../common/math.h" +#include "../common/threading_utils.h" +#include "../data/adapter.h" +#include "../data/proxy_dmatrix.h" +#include "../gbm/gbtree_model.h" +#include "predict_fn.h" #include "xgboost/base.h" #include "xgboost/data.h" +#include "xgboost/host_device_vector.h" +#include "xgboost/logging.h" #include "xgboost/predictor.h" #include "xgboost/tree_model.h" #include "xgboost/tree_updater.h" -#include "xgboost/logging.h" -#include "xgboost/host_device_vector.h" - -#include "predict_fn.h" -#include "../data/adapter.h" -#include "../common/math.h" -#include "../common/threading_utils.h" -#include "../common/categorical.h" -#include "../gbm/gbtree_model.h" namespace xgboost { namespace predictor { @@ -327,22 +327,24 @@ class CPUPredictor : public Predictor { &predictions, model, tree_begin, tree_end, &thread_temp, n_threads); } - bool InplacePredict(dmlc::any const &x, std::shared_ptr p_m, - const gbm::GBTreeModel &model, float missing, + bool InplacePredict(std::shared_ptr p_m, const gbm::GBTreeModel &model, float missing, PredictionCacheEntry *out_preds, uint32_t tree_begin, unsigned tree_end) const override { + auto proxy = dynamic_cast(p_m.get()); + CHECK(proxy)<< "Inplace predict accepts only DMatrixProxy as input."; + auto x = proxy->Adapter(); if (x.type() == typeid(std::shared_ptr)) { this->DispatchedInplacePredict( x, p_m, model, missing, out_preds, tree_begin, tree_end); } else if (x.type() == typeid(std::shared_ptr)) { - this->DispatchedInplacePredict( - x, p_m, model, missing, out_preds, tree_begin, tree_end); + this->DispatchedInplacePredict(x, p_m, model, missing, out_preds, + tree_begin, tree_end); } else if (x.type() == typeid(std::shared_ptr)) { - this->DispatchedInplacePredict ( + this->DispatchedInplacePredict( x, p_m, model, missing, out_preds, tree_begin, tree_end); } else if (x.type() == typeid(std::shared_ptr)) { - this->DispatchedInplacePredict ( - x, p_m, model, missing, out_preds, tree_begin, tree_end); + this->DispatchedInplacePredict(x, p_m, model, missing, out_preds, + tree_begin, tree_end); } else { return false; } diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 0a09dc255c95..d20918cf2f56 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -1,28 +1,29 @@ /*! * Copyright 2017-2021 by Contributors */ +#include #include #include #include #include #include -#include + #include +#include "../common/bitfield.h" +#include "../common/categorical.h" +#include "../common/common.h" +#include "../common/device_helpers.cuh" +#include "../data/device_adapter.cuh" +#include "../data/ellpack_page.cuh" +#include "../data/proxy_dmatrix.h" +#include "../gbm/gbtree_model.h" +#include "predict_fn.h" #include "xgboost/data.h" +#include "xgboost/host_device_vector.h" #include "xgboost/predictor.h" #include "xgboost/tree_model.h" #include "xgboost/tree_updater.h" -#include "xgboost/host_device_vector.h" - -#include "predict_fn.h" -#include "../gbm/gbtree_model.h" -#include "../data/ellpack_page.cuh" -#include "../data/device_adapter.cuh" -#include "../common/common.h" -#include "../common/bitfield.h" -#include "../common/categorical.h" -#include "../common/device_helpers.cuh" namespace xgboost { namespace predictor { @@ -789,17 +790,19 @@ class GPUPredictor : public xgboost::Predictor { m->NumRows(), entry_start, use_shared, output_groups, missing); } - bool InplacePredict(dmlc::any const &x, std::shared_ptr p_m, - const gbm::GBTreeModel &model, float missing, - PredictionCacheEntry *out_preds, uint32_t tree_begin, + bool InplacePredict(std::shared_ptr p_m, const gbm::GBTreeModel& model, float missing, + PredictionCacheEntry* out_preds, uint32_t tree_begin, unsigned tree_end) const override { + auto proxy = dynamic_cast(p_m.get()); + CHECK(proxy)<< "Inplace predict accepts only DMatrixProxy as input."; + auto x = proxy->Adapter(); if (x.type() == typeid(std::shared_ptr)) { - this->DispatchedInplacePredict< - data::CupyAdapter, DeviceAdapterLoader>( + this->DispatchedInplacePredict>( x, p_m, model, missing, out_preds, tree_begin, tree_end); } else if (x.type() == typeid(std::shared_ptr)) { - this->DispatchedInplacePredict< - data::CudfAdapter, DeviceAdapterLoader>( + this->DispatchedInplacePredict>( x, p_m, model, missing, out_preds, tree_begin, tree_end); } else { return false; diff --git a/src/tree/driver.h b/src/tree/driver.h index abb8afadcb8a..cab762338e2d 100644 --- a/src/tree/driver.h +++ b/src/tree/driver.h @@ -33,10 +33,11 @@ class Driver { std::function>; public: - explicit Driver(TrainParam::TreeGrowPolicy policy) - : policy_(policy), - queue_(policy == TrainParam::kDepthWise ? DepthWise : - LossGuide) {} + explicit Driver(TrainParam param, std::size_t max_node_batch_size = 256) + : param_(param), + max_node_batch_size_(max_node_batch_size), + queue_(param.grow_policy == TrainParam::kDepthWise ? DepthWise + : LossGuide) {} template void Push(EntryIterT begin, EntryIterT end) { for (auto it = begin; it != end; ++it) { @@ -55,24 +56,45 @@ class Driver { return queue_.empty(); } + // Can a child of this entry still be expanded? + // can be used to avoid extra work + bool IsChildValid(ExpandEntryT const& parent_entry) { + std::cout << "param_.max_depth:" << param_.max_depth << " parent_entry.depth:" << parent_entry.depth << std::endl; + if (param_.max_depth > 0 && parent_entry.depth + 1 >= param_.max_depth) return false; + std::cout << "param_.max_leaves:" << param_.max_leaves << " parent_entry.num_leaves_:" << num_leaves_ << std::endl; + if (param_.max_leaves > 0 && num_leaves_ >= param_.max_leaves) return false; + return true; + } + // Return the set of nodes to be expanded // This set has no dependencies between entries so they may be expanded in // parallel or asynchronously std::vector Pop() { + std::cout << "queue_.size():" << queue_.size() << std::endl; if (queue_.empty()) return {}; // Return a single entry for loss guided mode - if (policy_ == TrainParam::kLossGuide) { + if (param_.grow_policy == TrainParam::kLossGuide) { ExpandEntryT e = queue_.top(); queue_.pop(); - return {e}; + + if (e.IsValid(param_, num_leaves_)) { + num_leaves_++; + return {e}; + } else { + return {}; + } } // Return nodes on same level for depth wise std::vector result; ExpandEntryT e = queue_.top(); int level = e.depth; - while (e.depth == level && !queue_.empty()) { + while (e.depth == level && !queue_.empty() && result.size() < max_node_batch_size_) { queue_.pop(); - result.emplace_back(e); + if (e.IsValid(param_, num_leaves_)) { + num_leaves_++; + result.emplace_back(e); + } + if (!queue_.empty()) { e = queue_.top(); } @@ -81,7 +103,9 @@ class Driver { } private: - TrainParam::TreeGrowPolicy policy_; + TrainParam param_; + bst_node_t num_leaves_ = 1; + std::size_t max_node_batch_size_; ExpandQueue queue_; }; } // namespace tree diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 8d5cc809a280..08b0270ee4d7 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -103,7 +103,7 @@ class GPUHistEvaluator { } /** - * \brief Get sorted index storage based on the left node of inputs . + * \brief Get sorted index storage based on the left node of inputs. */ auto SortedIdx(EvaluateSplitInputs left) { if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) { diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 791363a05cdd..efb08d5e44e2 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -247,15 +247,6 @@ void BuildGradientHistogram(EllpackDeviceAccessor const& matrix, dh::safe_cuda(cudaGetLastError()); } -template void BuildGradientHistogram( - EllpackDeviceAccessor const& matrix, - FeatureGroupsAccessor const& feature_groups, - common::Span gpair, - common::Span ridx, - common::Span histogram, - HistRounding rounding, - bool force_global_memory); - template void BuildGradientHistogram( EllpackDeviceAccessor const& matrix, FeatureGroupsAccessor const& feature_groups, diff --git a/src/tree/hist/expand_entry.h b/src/tree/hist/expand_entry.h index d0edfbd379a6..a00059791e13 100644 --- a/src/tree/hist/expand_entry.h +++ b/src/tree/hist/expand_entry.h @@ -24,16 +24,20 @@ struct CPUExpandEntry { } bool IsValid(const TrainParam& param, int num_leaves) const { - if (split.loss_chg <= kRtEps) return false; + std::cout << "split.loss_chg:" << split.loss_chg << " eps:" << kRtEps << std::endl; + if (split.loss_chg <= kRtEps) { std::cout << "NOT VALID!" << std::endl; return false;} + std::cout << "split.left_sum.GetHess():" << split.left_sum.GetHess() << " split.right_sum.GetHess():" << split.right_sum.GetHess() << std::endl; if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) { return false; } + std::cout << "split.loss_chg:" << split.loss_chg << " param.min_split_loss:" << param.min_split_loss << std::endl; if (split.loss_chg < param.min_split_loss) { return false; } if (param.max_depth > 0 && depth == param.max_depth) { return false; } + std::cout << "param.max_leaves:" << param.max_leaves << " num_leaves:" << num_leaves << std::endl; if (param.max_leaves > 0 && num_leaves == param.max_leaves) { return false; } diff --git a/src/tree/split_evaluator.h b/src/tree/split_evaluator.h index 8cdf88834559..ba3533e84f43 100644 --- a/src/tree/split_evaluator.h +++ b/src/tree/split_evaluator.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "xgboost/tree_model.h" #include "xgboost/host_device_vector.h" @@ -49,8 +50,9 @@ class TreeEvaluator { } else { monotone_.HostVector() = p.monotone_constraints; monotone_.HostVector().resize(n_features, 0); - lower_bounds_.Resize(p.MaxNodes(), -std::numeric_limits::max()); - upper_bounds_.Resize(p.MaxNodes(), std::numeric_limits::max()); + // Initialised to some small size, can grow if needed + lower_bounds_.Resize(256, -std::numeric_limits::max()); + upper_bounds_.Resize(256, std::numeric_limits::max()); has_constraint_ = true; } @@ -157,6 +159,15 @@ class TreeEvaluator { if (!has_constraint_) { return; } + + auto max_nidx = std::max(leftid, rightid); + if (lower_bounds_.Size() <= max_nidx) { + lower_bounds_.Resize(max_nidx * 2 + 1, -std::numeric_limits::max()); + } + if (upper_bounds_.Size() <= max_nidx) { + upper_bounds_.Resize(max_nidx * 2 + 1, std::numeric_limits::max()); + } + common::Transform<>::Init( [=] XGBOOST_DEVICE(size_t, common::Span lower, common::Span upper, diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index 887725295bf9..cc4b5a1620f8 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -238,10 +238,9 @@ class GloablApproxBuilder { split_conditions_.clear(); split_ind_.clear(); - Driver driver(static_cast(param_.grow_policy)); + Driver driver(param_); auto &tree = *p_tree; driver.Push({this->InitRoot(p_fmat, gpair, hess, p_tree)}); - bst_node_t num_leaves{1}; auto expand_set = driver.Pop(); int depth = 0; bool is_loss_guide = static_cast(param_.grow_policy) == @@ -267,16 +266,14 @@ class GloablApproxBuilder { bool is_applied = false; // candidates that can be applied. for (auto const &candidate : expand_set) { - if (!candidate.IsValid(param_, num_leaves)) { - continue; - } evaluator_.ApplyTreeSplit(candidate, p_tree); applied[candidate.nid] = candidate; applied_vec.push_back(candidate); CHECK_EQ(applied[candidate.nid].nid, candidate.nid); - num_leaves++; + // num_leaves++; int left_child_nidx = tree[candidate.nid].LeftChild(); - if (CPUExpandEntry::ChildIsValid(param_, p_tree->GetDepth(left_child_nidx), num_leaves)) { +// if (CPUExpandEntry::ChildIsValid(param_, p_tree->GetDepth(left_child_nidx), num_leaves)) { + if (driver.IsChildValid(candidate)) { valid_candidates.emplace_back(candidate); } else { if (param_.grow_policy == TrainParam::kLossGuide) { diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 20db181ef187..ae209cdaf205 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -62,7 +62,7 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam); #endif // !defined(GTEST_TEST) /** - * \struct DeviceHistogram + * \struct DeviceHistogramStorage * * \summary Data storage for node histograms on device. Automatically expands. * @@ -72,20 +72,27 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam); * \author Rory * \date 28/07/2018 */ -template -class DeviceHistogram { +template +class DeviceHistogramStorage { private: /*! \brief Map nidx to starting index of its histogram. */ std::map nidx_map_; + // Large buffer of zeroed memory, caches histograms dh::device_vector data_; + // If we run out of storage allocate one histogram at a time + // in overflow. Not cached, overwritten when a new histogram + // is requested + dh::device_vector overflow_; + std::map overflow_nidx_map_; int n_bins_; int device_id_; static constexpr size_t kNumItemsInGradientSum = sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT); - static_assert(kNumItemsInGradientSum == 2, - "Number of items in gradient type should be 2."); + static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2."); public: + // Start with about 16mb + DeviceHistogramStorage() { data_.reserve(1 << 22); } void Init(int device_id, int n_bins) { this->n_bins_ = n_bins; this->device_id_ = device_id; @@ -93,52 +100,47 @@ class DeviceHistogram { void Reset() { auto d_data = data_.data().get(); - dh::LaunchN(data_.size(), - [=] __device__(size_t idx) { d_data[idx] = 0.0f; }); + dh::LaunchN(data_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; }); nidx_map_.clear(); + overflow_nidx_map_.clear(); } bool HistogramExists(int nidx) const { - return nidx_map_.find(nidx) != nidx_map_.cend(); - } - int Bins() const { - return n_bins_; - } - size_t HistogramSize() const { - return n_bins_ * kNumItemsInGradientSum; + return nidx_map_.find(nidx) != nidx_map_.cend() || + overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend(); } + int Bins() const { return n_bins_; } + size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; } + dh::device_vector& Data() { return data_; } - dh::device_vector& Data() { - return data_; - } - - void AllocateHistogram(int nidx) { - if (HistogramExists(nidx)) return; + void AllocateHistograms(const std::vector& new_nidxs) { + for (int nidx : new_nidxs) { + CHECK(!HistogramExists(nidx)); + } // Number of items currently used in data const size_t used_size = nidx_map_.size() * HistogramSize(); - const size_t new_used_size = used_size + HistogramSize(); - if (data_.size() >= kStopGrowingSize) { - // Recycle histogram memory - if (new_used_size <= data_.size()) { - // no need to remove old node, just insert the new one. - nidx_map_[nidx] = used_size; - // memset histogram size in bytes - } else { - std::pair old_entry = *nidx_map_.begin(); - nidx_map_.erase(old_entry.first); - nidx_map_[nidx] = old_entry.second; + const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size(); + if (used_size >= kStopGrowingSize) { + // Use overflow + // Delete previous entries + overflow_nidx_map_.clear(); + overflow_.resize(HistogramSize() * new_nidxs.size()); + // Zero memory + auto d_data = overflow_.data().get(); + dh::LaunchN(overflow_.size(), + [=] __device__(size_t idx) { d_data[idx] = 0.0; }); + // Append new histograms + for (int nidx : new_nidxs) { + overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize(); } - // Zero recycled memory - auto d_data = data_.data().get() + nidx_map_[nidx]; - dh::LaunchN(n_bins_ * 2, - [=] __device__(size_t idx) { d_data[idx] = 0.0f; }); } else { - // Append new node histogram - nidx_map_[nidx] = used_size; - // Check there is enough memory for another histogram node - if (data_.size() < new_used_size + HistogramSize()) { - size_t new_required_memory = - std::max(data_.size() * 2, HistogramSize()); - data_.resize(new_required_memory); + CHECK_GE(data_.size(), used_size); + // Expand if necessary + if (data_.size() < new_used_size) { + data_.resize(std::max(data_.size() * 2, new_used_size)); + } + // Append new histograms + for (int nidx : new_nidxs) { + nidx_map_[nidx] = nidx_map_.size() * HistogramSize(); } } @@ -152,9 +154,16 @@ class DeviceHistogram { */ common::Span GetNodeHistogram(int nidx) { CHECK(this->HistogramExists(nidx)); - auto ptr = data_.data().get() + nidx_map_.at(nidx); - return common::Span( - reinterpret_cast(ptr), n_bins_); + + if (nidx_map_.find(nidx) != nidx_map_.cend()) { + // Fetch from normal cache + auto ptr = data_.data().get() + nidx_map_.at(nidx); + return common::Span(reinterpret_cast(ptr), n_bins_); + } else { + // Fetch from overflow + auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx); + return common::Span(reinterpret_cast(ptr), n_bins_); + } } }; @@ -171,7 +180,7 @@ struct GPUHistMakerDevice { BatchParam batch_param; std::unique_ptr row_partitioner; - DeviceHistogram hist{}; + DeviceHistogramStorage hist{}; dh::caching_device_vector d_gpair; // storage for gpair; common::Span gpair; @@ -195,6 +204,7 @@ struct GPUHistMakerDevice { std::unique_ptr feature_groups; + GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page, common::Span _feature_types, bst_uint _n_rows, TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features, @@ -213,7 +223,7 @@ struct GPUHistMakerDevice { // Copy assigning an empty vector causes an exception in MSVC debug builds monotone_constraints = param.monotone_constraints; } - node_sum_gradients.resize(param.MaxNodes()); + node_sum_gradients.resize(256); // Init histogram hist.Init(ctx_->gpu_id, page->Cuts().TotalBins()); @@ -322,7 +332,6 @@ struct GPUHistMakerDevice { } void BuildHist(int nidx) { - hist.AllocateHistogram(nidx); auto d_node_hist = hist.GetNodeHistogram(nidx); auto d_ridx = row_partitioner->GetRows(nidx); BuildGradientHistogram(page->GetDeviceAccessor(ctx_->gpu_id), @@ -330,8 +339,12 @@ struct GPUHistMakerDevice { d_ridx, d_node_hist, histogram_rounding); } - void SubtractionTrick(int nidx_parent, int nidx_histogram, - int nidx_subtraction) { + // Attempt to do subtraction trick + // return true if succeeded + bool SubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) { + if (!hist.HistogramExists(nidx_histogram) || !hist.HistogramExists(nidx_parent)) { + return false; + } auto d_node_hist_parent = hist.GetNodeHistogram(nidx_parent); auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram); auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction); @@ -340,12 +353,7 @@ struct GPUHistMakerDevice { d_node_hist_subtraction[idx] = d_node_hist_parent[idx] - d_node_hist_histogram[idx]; }); - } - - bool CanDoSubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) { - // Make sure histograms are already allocated - hist.AllocateHistogram(nidx_subtraction); - return hist.HistogramExists(nidx_histogram) && hist.HistogramExists(nidx_parent); + return true; } void UpdatePosition(const GPUExpandEntry &e, RegTree* p_tree) { @@ -505,13 +513,15 @@ struct GPUHistMakerDevice { row_partitioner.reset(); } - void AllReduceHist(int nidx, dh::AllReducer* reducer) { + // num histograms is the number of contiguous histograms in memory to reduce over + void AllReduceHist(int nidx, dh::AllReducer* reducer, int num_histograms) { monitor.Start("AllReduce"); auto d_node_hist = hist.GetNodeHistogram(nidx).data(); - reducer->AllReduceSum( - reinterpret_cast(d_node_hist), - reinterpret_cast(d_node_hist), - page->Cuts().TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT))); + reducer->AllReduceSum(reinterpret_cast(d_node_hist), + reinterpret_cast(d_node_hist), + page->Cuts().TotalBins() * + (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)) * + num_histograms); monitor.Stop("AllReduce"); } @@ -519,33 +529,50 @@ struct GPUHistMakerDevice { /** * \brief Build GPU local histograms for the left and right child of some parent node */ - void BuildHistLeftRight(const GPUExpandEntry &candidate, int nidx_left, - int nidx_right, dh::AllReducer* reducer) { - auto build_hist_nidx = nidx_left; - auto subtraction_trick_nidx = nidx_right; - - // Decide whether to build the left histogram or right histogram - // Use sum of Hessian as a heuristic to select node with fewest training instances - bool fewer_right = candidate.split.right_sum.GetHess() < candidate.split.left_sum.GetHess(); - if (fewer_right) { - std::swap(build_hist_nidx, subtraction_trick_nidx); + void BuildHistLeftRight(std::vector const& candidates, dh::AllReducer* reducer, + const RegTree& tree) { + if (candidates.empty()) return; + // Some nodes we will manually compute histograms + // others we will do by subtraction + std::vector hist_nidx; + std::vector subtraction_nidx; + for (auto& e : candidates) { + // Decide whether to build the left histogram or right histogram + // Use sum of Hessian as a heuristic to select node with fewest training instances + bool fewer_right = e.split.right_sum.GetHess() < e.split.left_sum.GetHess(); + if (fewer_right) { + hist_nidx.emplace_back(tree[e.nid].RightChild()); + subtraction_nidx.emplace_back(tree[e.nid].LeftChild()); + } else { + hist_nidx.emplace_back(tree[e.nid].LeftChild()); + subtraction_nidx.emplace_back(tree[e.nid].RightChild()); + } + } + std::vector all_new = hist_nidx; + all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end()); + // Allocate the histograms + // Guaranteed contiguous memory + hist.AllocateHistograms(all_new); + + for (auto nidx : hist_nidx) { + this->BuildHist(nidx); } - this->BuildHist(build_hist_nidx); - this->AllReduceHist(build_hist_nidx, reducer); + // Reduce all in one go + // This gives much better latency in a distributed setting + // when processing a large batch + this->AllReduceHist(hist_nidx.at(0), reducer, hist_nidx.size()); - // Check whether we can use the subtraction trick to calculate the other - bool do_subtraction_trick = this->CanDoSubtractionTrick( - candidate.nid, build_hist_nidx, subtraction_trick_nidx); + for (int i = 0; i < subtraction_nidx.size(); i++) { + auto build_hist_nidx = hist_nidx.at(i); + auto subtraction_trick_nidx = subtraction_nidx.at(i); + auto parent_nidx = candidates.at(i).nid; - if (do_subtraction_trick) { - // Calculate other histogram using subtraction trick - this->SubtractionTrick(candidate.nid, build_hist_nidx, - subtraction_trick_nidx); - } else { - // Calculate other histogram manually - this->BuildHist(subtraction_trick_nidx); - this->AllReduceHist(subtraction_trick_nidx, reducer); + if (!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)) { + // Calculate other histogram manually + this->BuildHist(subtraction_trick_nidx); + this->AllReduceHist(subtraction_trick_nidx, reducer, 1); + } } } @@ -587,12 +614,17 @@ struct GPUHistMakerDevice { } evaluator_.ApplyTreeSplit(candidate, p_tree); - node_sum_gradients[tree[candidate.nid].LeftChild()] = candidate.split.left_sum; - node_sum_gradients[tree[candidate.nid].RightChild()] = candidate.split.right_sum; + const auto& parent = tree[candidate.nid]; + std::size_t max_nidx = std::max(parent.LeftChild(), parent.RightChild()); + // Grow as needed + if (node_sum_gradients.size() <= max_nidx) { + node_sum_gradients.resize(max_nidx * 2 + 1); + } + node_sum_gradients[parent.LeftChild()] = candidate.split.left_sum; + node_sum_gradients[parent.RightChild()] = candidate.split.right_sum; - interaction_constraints.Split(candidate.nid, tree[candidate.nid].SplitIndex(), - tree[candidate.nid].LeftChild(), - tree[candidate.nid].RightChild()); + interaction_constraints.Split(candidate.nid, parent.SplitIndex(), parent.LeftChild(), + parent.RightChild()); } GPUExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) { @@ -605,8 +637,9 @@ struct GPUHistMakerDevice { GradientPairPrecise{}, thrust::plus{}); rabit::Allreduce(reinterpret_cast(&root_sum), 2); + hist.AllocateHistograms({kRootNIdx}); this->BuildHist(kRootNIdx); - this->AllReduceHist(kRootNIdx, reducer); + this->AllReduceHist(kRootNIdx, reducer, 1); // Remember root stats node_sum_gradients[kRootNIdx] = root_sum; @@ -624,7 +657,8 @@ struct GPUHistMakerDevice { RegTree* p_tree, dh::AllReducer* reducer, HostDeviceVector* p_out_position) { auto& tree = *p_tree; - Driver driver(static_cast(param.grow_policy)); + // Process maximum 32 nodes at a time + Driver driver(param, 32); monitor.Start("Reset"); this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_); @@ -634,48 +668,44 @@ struct GPUHistMakerDevice { driver.Push({ this->InitRoot(p_tree, reducer) }); monitor.Stop("InitRoot"); - auto num_leaves = 1; - // The set of leaves that can be expanded asynchronously auto expand_set = driver.Pop(); while (!expand_set.empty()) { - auto new_candidates = - pinned.GetSpan(expand_set.size() * 2, GPUExpandEntry()); - - for (auto i = 0ull; i < expand_set.size(); i++) { - auto candidate = expand_set.at(i); - if (!candidate.IsValid(param, num_leaves)) { - continue; - } + for (auto& candidate : expand_set) { this->ApplySplit(candidate, p_tree); + } + // Get the candidates we are allowed to expand further + // e.g. We do not bother further processing nodes whose children are beyond max depth + std::vector filtered_expand_set; + std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set), + [&](const auto& e) { return driver.IsChildValid(e); }); + + + auto new_candidates = + pinned.GetSpan(filtered_expand_set.size() * 2, GPUExpandEntry()); + + for (const auto& e : filtered_expand_set) { + monitor.Start("UpdatePosition"); + // Update position is only run when child is valid, instead of right after apply + // split (as in approx tree method). Hense we have the finalise position call + // in GPU Hist. + this->UpdatePosition(e, p_tree); + monitor.Stop("UpdatePosition"); + } - num_leaves++; + monitor.Start("BuildHist"); + this->BuildHistLeftRight(filtered_expand_set, reducer, tree); + monitor.Stop("BuildHist"); + for (auto i = 0ull; i < filtered_expand_set.size(); i++) { + auto candidate = filtered_expand_set.at(i); int left_child_nidx = tree[candidate.nid].LeftChild(); int right_child_nidx = tree[candidate.nid].RightChild(); - // Only create child entries if needed_ - if (GPUExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx), - num_leaves)) { - monitor.Start("UpdatePosition"); - // Update position is only run when child is valid, instead of right after apply - // split (as in approx tree method). Hense we have the finalise position call - // in GPU Hist. - this->UpdatePosition(candidate, p_tree); - monitor.Stop("UpdatePosition"); - - monitor.Start("BuildHist"); - this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer); - monitor.Stop("BuildHist"); - - monitor.Start("EvaluateSplits"); - this->EvaluateLeftRightSplits(candidate, left_child_nidx, right_child_nidx, *p_tree, - new_candidates.subspan(i * 2, 2)); - monitor.Stop("EvaluateSplits"); - } else { - // Set default - new_candidates[i * 2] = GPUExpandEntry(); - new_candidates[i * 2 + 1] = GPUExpandEntry(); - } + + monitor.Start("EvaluateSplits"); + this->EvaluateLeftRightSplits(candidate, left_child_nidx, right_child_nidx, *p_tree, + new_candidates.subspan(i * 2, 2)); + monitor.Stop("EvaluateSplits"); } dh::DefaultStream().Sync(); driver.Push(new_candidates.begin(), new_candidates.end()); diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 9101863c6f18..cddc3a40bf70 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -155,6 +155,7 @@ void QuantileHistMaker::Builder::InitRoot( void QuantileHistMaker::Builder::AddSplitsToTree( const std::vector& expand, + Driver* driver, RegTree *p_tree, int *num_leaves, std::vector* nodes_for_apply_split, @@ -164,11 +165,14 @@ void QuantileHistMaker::Builder::AddSplitsToTree( const bool is_loss_guided = static_cast(param_.grow_policy) != TrainParam::kDepthWise; std::vector complete_node_ids; + std::cout << "expand.size():" << expand.size() << std::endl; for (auto const& entry : expand) { - if (entry.IsValid(param_, *num_leaves)) { + if (driver->IsChildValid(entry)) { + + // if (entry.IsValid(param_, *num_leaves)) { nodes_for_apply_split->push_back(entry); evaluator_->ApplyTreeSplit(entry, p_tree); - (*num_leaves)++; + // (*num_leaves)++; complete_node_ids.push_back((*p_tree)[entry.nid].LeftChild()); complete_node_ids.push_back((*p_tree)[entry.nid].RightChild()); *is_left_small = entry.split.left_sum.GetHess() <= entry.split.right_sum.GetHess(); @@ -179,6 +183,8 @@ void QuantileHistMaker::Builder::AddSplitsToTree( smalest_nodes_mask[(*p_tree)[entry.nid].RightChild()] = true; smalest_nodes_mask[ (*p_tree)[entry.nid].LeftChild()] = false; } + } else { + std::cout << "Not valid!!! entry.nid:" << entry.nid << std::endl; } } child_node_ids_ = complete_node_ids; @@ -232,11 +238,14 @@ void QuantileHistMaker::Builder::ExpandTree( RegTree* p_tree, const std::vector& gpair_h, HostDeviceVector *p_out_position) { + std::cout << "ExpandTree 1" << std::endl; monitor_->Start("ExpandTree"); int num_leaves = 0; split_conditions_.clear(); split_ind_.clear(); - Driver driver(static_cast(param_.grow_policy)); + Driver driver(param_); + std::cout << "ExpandTree 2" << std::endl; + // Driver driver(static_cast(param_.grow_policy)); std::vector expand; size_t page_id{0}; std::vector& row_indices = *row_set_collection_.Data(); @@ -252,21 +261,28 @@ void QuantileHistMaker::Builder::ExpandTree( TrainParam::kDepthWise ? false : true; InitRoot(gmat, p_fmat, p_tree, gpair_h, &num_leaves, &expand); + std::cout << "ExpandTree 3" << std::endl; driver.Push(expand[0]); child_node_ids_.clear(); child_node_ids_.emplace_back(0); int32_t depth = 0; while (!driver.IsEmpty()) { std::unordered_map smalest_nodes_mask; + std::cout << "ExpandTree before POP:" << depth << std::endl; expand = driver.Pop(); - depth = expand[0].depth + 1; + std::cout << "ExpandTree after POP:" << depth << std::endl; + if (expand.size()) { + depth = expand[0].depth + 1; + } + std::cout << "ExpandTree depth:" << depth << std::endl; std::vector nodes_for_apply_split; std::vector nodes_to_evaluate; nodes_for_explicit_hist_build_.clear(); nodes_for_subtraction_trick_.clear(); bool is_left_small = false; - AddSplitsToTree(expand, p_tree, &num_leaves, &nodes_for_apply_split, + AddSplitsToTree(expand, &driver, p_tree, &num_leaves, &nodes_for_apply_split, &smalest_nodes_mask, depth, &is_left_small); + std::cout << "AddSplitsToTree finished" << std::endl; if (nodes_for_apply_split.size() != 0) { monitor_->Start("ApplySplit"); @@ -289,9 +305,11 @@ void QuantileHistMaker::Builder::ExpandTree( true); ++page_id; } + std::cout << "UpdatePositionDispatched finished" << std::endl; monitor_->Stop("ApplySplit"); SplitSiblings(nodes_for_apply_split, &nodes_to_evaluate, p_tree); + std::cout << "SplitSiblings finished" << std::endl; if (param_.max_depth == 0 || depth < param_.max_depth) { size_t i = 0; monitor_->Start("BuildHist"); @@ -309,6 +327,7 @@ void QuantileHistMaker::Builder::ExpandTree( std::copy(merged_thread_ids_set[nid].begin(), merged_thread_ids_set[nid].end(), merged_thread_ids[nid].begin()); } + std::cout << "merged_thread_ids_set finished" << std::endl; for (auto const &gidx : p_fmat->GetBatches(HistBatch(param_))) { CommonRowPartitioner &partitioner = this->partitioner_.at(i); @@ -319,25 +338,33 @@ void QuantileHistMaker::Builder::ExpandTree( &(partitioner.GetNodeAssignments()), &merged_thread_ids); ++i; } + std::cout << "BuildHist finished" << std::endl; + monitor_->Stop("BuildHist"); monitor_->Start("EvaluateSplits"); auto ft = p_fmat->Info().feature_types.ConstHostSpan(); evaluator_->EvaluateSplits(this->histogram_builder_->Histogram(), feature_values_, ft, *p_tree, &nodes_to_evaluate); monitor_->Stop("EvaluateSplits"); + std::cout << "EvaluateSplits finished" << std::endl; } + std::cout << "nodes_for_apply_split.size():" << nodes_for_apply_split.size() << std::endl; for (size_t i = 0; i < nodes_for_apply_split.size(); ++i) { CPUExpandEntry left_node = nodes_to_evaluate.at(i * 2 + 0); CPUExpandEntry right_node = nodes_to_evaluate.at(i * 2 + 1); driver.Push(left_node); driver.Push(right_node); } + std::cout << "DRIVERPOP finished" << std::endl; } } auto &h_out_position = p_out_position->HostVector(); + std::cout << "LeafPartition started" << std::endl; this->LeafPartition(*p_tree, &h_out_position); + std::cout << "LeafPartition finished" << std::endl; monitor_->Stop(__func__); + std::cout << "ExpandTree finished" << std::endl; } void QuantileHistMaker::Builder::UpdateTree(HostDeviceVector *gpair, DMatrix *p_fmat, diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index 9e283f183e33..684aaa2b0e6a 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -174,11 +174,12 @@ class QuantileHistMaker: public TreeUpdater { RegTree *p_tree); void AddSplitsToTree(const std::vector& expand, + Driver* driver, RegTree *p_tree, int *num_leaves, std::vector* nodes_for_apply_split, - std::unordered_map* smalest_nodes_mask_ptr, size_t depth - , bool * is_left_small); + std::unordered_map* smalest_nodes_mask_ptr, size_t depth, + bool * is_left_small); template void ExpandTree(const GHistIndexMatrix& gmat, diff --git a/tests/cpp/common/test_column_matrix.cc b/tests/cpp/common/test_column_matrix.cc index 59e49fd8823c..f846142d665e 100644 --- a/tests/cpp/common/test_column_matrix.cc +++ b/tests/cpp/common/test_column_matrix.cc @@ -106,4 +106,4 @@ TEST(HistIndexCreationWithExternalMemory, Test) { TestGHistIndexMatrixCreation(40); } } // namespace common -} // namespace xgboost +} // namespace xgboost \ No newline at end of file diff --git a/tests/cpp/data/test_proxy_dmatrix.cu b/tests/cpp/data/test_proxy_dmatrix.cu index d9f315a8f144..a599ada6da50 100644 --- a/tests/cpp/data/test_proxy_dmatrix.cu +++ b/tests/cpp/data/test_proxy_dmatrix.cu @@ -19,7 +19,7 @@ TEST(ProxyDMatrix, DeviceData) { .GenerateColumnarArrayInterface(&label_storage); DMatrixProxy proxy; - proxy.SetData(data.c_str()); + proxy.SetCUDAArray(data.c_str()); proxy.SetInfo("label", labels.c_str()); ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr)); @@ -34,7 +34,7 @@ TEST(ProxyDMatrix, DeviceData) { data = RandomDataGenerator(kRows, kCols, 0) .Device(0) .GenerateColumnarArrayInterface(&columnar_storage); - proxy.SetData(data.c_str()); + proxy.SetCUDAArray(data.c_str()); ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr)); ASSERT_EQ(dmlc::get>(proxy.Adapter())->NumRows(), kRows); diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index f9fe7d38660d..00201769bc03 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -1,16 +1,17 @@ /*! * Copyright 2019-2022 XGBoost contributors */ -#include #include +#include #include +#include "../../../src/data/adapter.h" +#include "../../../src/data/proxy_dmatrix.h" +#include "../../../src/gbm/gbtree.h" +#include "../helpers.h" #include "xgboost/base.h" #include "xgboost/host_device_vector.h" #include "xgboost/learner.h" -#include "../helpers.h" -#include "../../../src/gbm/gbtree.h" -#include "../../../src/data/adapter.h" #include "xgboost/predictor.h" namespace xgboost { @@ -246,53 +247,78 @@ TEST(Dart, JsonIO) { ASSERT_NE(get(model["model"]["weight_drop"]).size(), 0ul); } -TEST(Dart, Prediction) { - size_t constexpr kRows = 16, kCols = 10; +namespace { +class Dart : public testing::TestWithParam { + public: + void Run(std::string predictor) { + size_t constexpr kRows = 16, kCols = 10; - HostDeviceVector data; - auto array_str = RandomDataGenerator(kRows, kCols, 0).GenerateArrayInterface(&data); - auto p_mat = GetDMatrixFromData(data.HostVector(), kRows, kCols); + HostDeviceVector data; + auto rng = RandomDataGenerator(kRows, kCols, 0); + if (predictor == "gpu_predictor") { + rng.Device(0); + } + auto array_str = rng.GenerateArrayInterface(&data); + auto p_mat = GetDMatrixFromData(data.HostVector(), kRows, kCols); - std::vector labels (kRows); - for (size_t i = 0; i < kRows; ++i) { - labels[i] = i % 2; - } - p_mat->SetInfo("label", labels.data(), DataType::kFloat32, kRows); + std::vector labels(kRows); + for (size_t i = 0; i < kRows; ++i) { + labels[i] = i % 2; + } + p_mat->SetInfo("label", labels.data(), DataType::kFloat32, kRows); - auto learner = std::unique_ptr(Learner::Create({p_mat})); - learner->SetParam("booster", "dart"); - learner->SetParam("rate_drop", "0.5"); - learner->Configure(); + auto learner = std::unique_ptr(Learner::Create({p_mat})); + learner->SetParam("booster", "dart"); + learner->SetParam("rate_drop", "0.5"); + learner->Configure(); - for (size_t i = 0; i < 16; ++i) { - learner->UpdateOneIter(i, p_mat); - } + for (size_t i = 0; i < 16; ++i) { + learner->UpdateOneIter(i, p_mat); + } + + learner->SetParam("predictor", predictor); - HostDeviceVector predts_training; - learner->Predict(p_mat, false, &predts_training, 0, 0, true); - - HostDeviceVector* inplace_predts; - auto adapter = std::shared_ptr(new data::ArrayAdapter{StringView{array_str}}); - learner->InplacePredict(adapter, nullptr, PredictionType::kValue, - std::numeric_limits::quiet_NaN(), - &inplace_predts, 0, 0); - CHECK(inplace_predts); - - HostDeviceVector predts_inference; - learner->Predict(p_mat, false, &predts_inference, 0, 0, false); - - auto const& h_predts_training = predts_training.ConstHostVector(); - auto const& h_predts_inference = predts_inference.ConstHostVector(); - auto const& h_inplace_predts = inplace_predts->HostVector(); - ASSERT_EQ(h_predts_training.size(), h_predts_inference.size()); - ASSERT_EQ(h_inplace_predts.size(), h_predts_inference.size()); - for (size_t i = 0; i < predts_inference.Size(); ++i) { - // Inference doesn't drop tree. - ASSERT_GT(std::abs(h_predts_training[i] - h_predts_inference[i]), kRtEps * 10); - // Inplace prediction is inference. - ASSERT_LT(h_inplace_predts[i] - h_predts_inference[i], kRtEps / 10); + HostDeviceVector predts_training; + learner->Predict(p_mat, false, &predts_training, 0, 0, true); + + HostDeviceVector* inplace_predts; + std::shared_ptr x{new data::DMatrixProxy{}}; + if (predictor == "gpu_predictor") { + x->SetCUDAArray(array_str.c_str()); + } else { + x->SetArrayData(array_str.c_str()); + } + learner->InplacePredict(x, PredictionType::kValue, std::numeric_limits::quiet_NaN(), + &inplace_predts, 0, 0); + CHECK(inplace_predts); + + HostDeviceVector predts_inference; + learner->Predict(p_mat, false, &predts_inference, 0, 0, false); + + auto const& h_predts_training = predts_training.ConstHostVector(); + auto const& h_predts_inference = predts_inference.ConstHostVector(); + auto const& h_inplace_predts = inplace_predts->HostVector(); + ASSERT_EQ(h_predts_training.size(), h_predts_inference.size()); + ASSERT_EQ(h_inplace_predts.size(), h_predts_inference.size()); + for (size_t i = 0; i < predts_inference.Size(); ++i) { + // Inference doesn't drop tree. + ASSERT_GT(std::abs(h_predts_training[i] - h_predts_inference[i]), kRtEps * 10); + // Inplace prediction is inference. + ASSERT_LT(h_inplace_predts[i] - h_predts_inference[i], kRtEps / 10); + } } -} +}; +} // anonymous namespace + +TEST_P(Dart, Prediction) { this->Run(GetParam()); } + +#if defined(XGBOOST_USE_CUDA) +INSTANTIATE_TEST_SUITE_P(PredictorTypes, Dart, + testing::Values("auto", "cpu_predictor", "gpu_predictor")); +#else +INSTANTIATE_TEST_SUITE_P(PredictorTypes, Dart, testing::Values("auto", "cpu_predictor")); +#endif // defined(XGBOOST_USE_CUDA) + std::pair TestModelSlice(std::string booster) { size_t constexpr kRows = 1000, kCols = 100, kForest = 2, kClasses = 3; @@ -485,19 +511,20 @@ TEST(GBTree, PredictRange) { // inplace predict HostDeviceVector raw_storage; auto raw = RandomDataGenerator{n_samples, n_features, 0.5}.GenerateArrayInterface(&raw_storage); - std::shared_ptr x{new data::ArrayAdapter{StringView{raw}}}; + std::shared_ptr x{new data::DMatrixProxy{}}; + x->SetArrayData(raw.data()); HostDeviceVector* out_predt; - learner->InplacePredict(x, nullptr, PredictionType::kValue, - std::numeric_limits::quiet_NaN(), &out_predt, 0, 2); + learner->InplacePredict(x, PredictionType::kValue, std::numeric_limits::quiet_NaN(), + &out_predt, 0, 2); auto h_out_predt = out_predt->HostVector(); - learner->InplacePredict(x, nullptr, PredictionType::kValue, - std::numeric_limits::quiet_NaN(), &out_predt, 0, 0); + learner->InplacePredict(x, PredictionType::kValue, std::numeric_limits::quiet_NaN(), + &out_predt, 0, 0); auto h_out_predt_full = out_predt->HostVector(); ASSERT_TRUE(std::equal(h_out_predt.begin(), h_out_predt.end(), h_out_predt_full.begin())); - ASSERT_THROW(learner->InplacePredict(x, nullptr, PredictionType::kValue, + ASSERT_THROW(learner->InplacePredict(x, PredictionType::kValue, std::numeric_limits::quiet_NaN(), &out_predt, 0, 3), dmlc::Error); } diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index f43747abdd9e..5b03f31d8d7a 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -5,11 +5,12 @@ #include #include +#include "../../../src/data/adapter.h" +#include "../../../src/data/proxy_dmatrix.h" +#include "../../../src/gbm/gbtree.h" +#include "../../../src/gbm/gbtree_model.h" #include "../helpers.h" #include "test_predictor.h" -#include "../../../src/gbm/gbtree_model.h" -#include "../../../src/gbm/gbtree.h" -#include "../../../src/data/adapter.h" namespace xgboost { TEST(CpuPredictor, Basic) { @@ -172,8 +173,11 @@ TEST(CpuPredictor, InplacePredict) { HostDeviceVector data; gen.GenerateDense(&data); ASSERT_EQ(data.Size(), kRows * kCols); - std::shared_ptr x{ - new data::DenseAdapter(data.HostPointer(), kRows, kCols)}; + std::shared_ptr x{new data::DMatrixProxy{}}; + auto array_interface = GetArrayInterface(&data, kRows, kCols); + std::string arr_str; + Json::Dump(array_interface, &arr_str); + x->SetArrayData(arr_str.data()); TestInplacePrediction(x, "cpu_predictor", kRows, kCols, -1); } @@ -182,9 +186,15 @@ TEST(CpuPredictor, InplacePredict) { HostDeviceVector rptrs; HostDeviceVector columns; gen.GenerateCSR(&data, &rptrs, &columns); - std::shared_ptr x{new data::CSRAdapter( - rptrs.HostPointer(), columns.HostPointer(), data.HostPointer(), kRows, - data.Size(), kCols)}; + auto data_interface = GetArrayInterface(&data, kRows * kCols, 1); + auto rptr_interface = GetArrayInterface(&rptrs, kRows + 1, 1); + auto col_interface = GetArrayInterface(&columns, kRows * kCols, 1); + std::string data_str, rptr_str, col_str; + Json::Dump(data_interface, &data_str); + Json::Dump(rptr_interface, &rptr_str); + Json::Dump(col_interface, &col_str); + std::shared_ptr x{new data::DMatrixProxy}; + x->SetCSRData(rptr_str.data(), col_str.data(), data_str.data(), kCols, true); TestInplacePrediction(x, "cpu_predictor", kRows, kCols, -1); } } diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index 3113bc62b018..0dbbc8d4588e 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -1,17 +1,19 @@ /*! * Copyright 2017-2020 XGBoost contributors */ -#include #include +#include #include -#include -#include #include +#include +#include + #include -#include "../helpers.h" -#include "../../../src/gbm/gbtree_model.h" #include "../../../src/data/device_adapter.cuh" +#include "../../../src/data/proxy_dmatrix.h" +#include "../../../src/gbm/gbtree_model.h" +#include "../helpers.h" #include "test_predictor.h" namespace xgboost { @@ -135,8 +137,9 @@ TEST(GPUPredictor, InplacePredictCupy) { gen.Device(0); HostDeviceVector data; std::string interface_str = gen.GenerateArrayInterface(&data); - auto x = std::make_shared(interface_str); - TestInplacePrediction(x, "gpu_predictor", kRows, kCols, 0); + std::shared_ptr p_fmat{new data::DMatrixProxy}; + dynamic_cast(p_fmat.get())->SetCUDAArray(interface_str.c_str()); + TestInplacePrediction(p_fmat, "gpu_predictor", kRows, kCols, 0); } TEST(GPUPredictor, InplacePredictCuDF) { @@ -145,8 +148,9 @@ TEST(GPUPredictor, InplacePredictCuDF) { gen.Device(0); std::vector> storage(kCols); auto interface_str = gen.GenerateColumnarArrayInterface(&storage); - auto x = std::make_shared(interface_str); - TestInplacePrediction(x, "gpu_predictor", kRows, kCols, 0); + std::shared_ptr p_fmat{new data::DMatrixProxy}; + dynamic_cast(p_fmat.get())->SetCUDAArray(interface_str.c_str()); + TestInplacePrediction(p_fmat, "gpu_predictor", kRows, kCols, 0); } TEST(GPUPredictor, MGPU_InplacePredict) { // NOLINT @@ -160,10 +164,10 @@ TEST(GPUPredictor, MGPU_InplacePredict) { // NOLINT gen.Device(1); HostDeviceVector data; std::string interface_str = gen.GenerateArrayInterface(&data); - auto x = std::make_shared(interface_str); - TestInplacePrediction(x, "gpu_predictor", kRows, kCols, 1); - EXPECT_THROW(TestInplacePrediction(x, "gpu_predictor", kRows, kCols, 0), - dmlc::Error); + std::shared_ptr p_fmat{new data::DMatrixProxy}; + dynamic_cast(p_fmat.get())->SetCUDAArray(interface_str.c_str()); + TestInplacePrediction(p_fmat, "gpu_predictor", kRows, kCols, 1); + EXPECT_THROW(TestInplacePrediction(p_fmat, "gpu_predictor", kRows, kCols, 0), dmlc::Error); } TEST(GpuPredictor, LesserFeatures) { diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc index e1d8b096a6eb..832d2cf4ceb2 100644 --- a/tests/cpp/predictor/test_predictor.cc +++ b/tests/cpp/predictor/test_predictor.cc @@ -2,19 +2,20 @@ * Copyright 2020-2021 by Contributors */ +#include "test_predictor.h" + #include -#include #include -#include #include +#include +#include -#include "test_predictor.h" - -#include "../helpers.h" -#include "../../../src/data/adapter.h" -#include "../../../src/common/io.h" -#include "../../../src/common/categorical.h" #include "../../../src/common/bitfield.h" +#include "../../../src/common/categorical.h" +#include "../../../src/common/io.h" +#include "../../../src/data/adapter.h" +#include "../../../src/data/proxy_dmatrix.h" +#include "../helpers.h" namespace xgboost { TEST(Predictor, PredictionCache) { @@ -83,9 +84,8 @@ void TestTrainingPrediction(size_t rows, size_t bins, train("gpu_predictor", &predictions_1); } -void TestInplacePrediction(dmlc::any x, std::string predictor, - bst_row_t rows, bst_feature_t cols, - int32_t device) { +void TestInplacePrediction(std::shared_ptr x, std::string predictor, bst_row_t rows, + bst_feature_t cols, int32_t device) { size_t constexpr kClasses { 4 }; auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(device); std::shared_ptr m = gen.GenerateDMatrix(true, false, kClasses); @@ -105,24 +105,21 @@ void TestInplacePrediction(dmlc::any x, std::string predictor, } HostDeviceVector *p_out_predictions_0{nullptr}; - learner->InplacePredict(x, nullptr, PredictionType::kMargin, - std::numeric_limits::quiet_NaN(), + learner->InplacePredict(x, PredictionType::kMargin, std::numeric_limits::quiet_NaN(), &p_out_predictions_0, 0, 2); CHECK(p_out_predictions_0); HostDeviceVector predict_0 (p_out_predictions_0->Size()); predict_0.Copy(*p_out_predictions_0); HostDeviceVector *p_out_predictions_1{nullptr}; - learner->InplacePredict(x, nullptr, PredictionType::kMargin, - std::numeric_limits::quiet_NaN(), + learner->InplacePredict(x, PredictionType::kMargin, std::numeric_limits::quiet_NaN(), &p_out_predictions_1, 2, 4); CHECK(p_out_predictions_1); HostDeviceVector predict_1 (p_out_predictions_1->Size()); predict_1.Copy(*p_out_predictions_1); HostDeviceVector* p_out_predictions{nullptr}; - learner->InplacePredict(x, nullptr, PredictionType::kMargin, - std::numeric_limits::quiet_NaN(), + learner->InplacePredict(x, PredictionType::kMargin, std::numeric_limits::quiet_NaN(), &p_out_predictions, 0, 4); auto& h_pred = p_out_predictions->HostVector(); @@ -378,25 +375,28 @@ void TestSparsePrediction(float sparsity, std::string predictor) { learner->SetParam("predictor", predictor); learner->Predict(Xy, false, &sparse_predt, 0, 0); - std::vector with_nan(kRows * kCols, std::numeric_limits::quiet_NaN()); - for (auto const& page : Xy->GetBatches()) { + HostDeviceVector with_nan(kRows * kCols, std::numeric_limits::quiet_NaN()); + auto& h_with_nan = with_nan.HostVector(); + for (auto const &page : Xy->GetBatches()) { auto batch = page.GetView(); for (size_t i = 0; i < batch.Size(); ++i) { auto row = batch[i]; for (auto e : row) { - with_nan[i * kCols + e.index] = e.fvalue; + h_with_nan[i * kCols + e.index] = e.fvalue; } } } learner->SetParam("predictor", "cpu_predictor"); // Xcode_12.4 doesn't compile with `std::make_shared`. - auto dense = std::shared_ptr( - new data::DenseAdapter(with_nan.data(), kRows, kCols)); + auto dense = std::shared_ptr(new data::DMatrixProxy{}); + auto array_interface = GetArrayInterface(&with_nan, kRows, kCols); + std::string arr_str; + Json::Dump(array_interface, &arr_str); + dynamic_cast(dense.get())->SetArrayData(arr_str.data()); HostDeviceVector *p_dense_predt; - learner->InplacePredict(dmlc::any(dense), nullptr, PredictionType::kValue, - std::numeric_limits::quiet_NaN(), &p_dense_predt, - 0, 0); + learner->InplacePredict(dense, PredictionType::kValue, std::numeric_limits::quiet_NaN(), + &p_dense_predt, 0, 0); auto const& dense_predt = *p_dense_predt; if (predictor == "cpu_predictor") { diff --git a/tests/cpp/predictor/test_predictor.h b/tests/cpp/predictor/test_predictor.h index 9c5d99afef65..1ff96096c533 100644 --- a/tests/cpp/predictor/test_predictor.h +++ b/tests/cpp/predictor/test_predictor.h @@ -61,9 +61,8 @@ void TestTrainingPrediction(size_t rows, size_t bins, std::string tree_method, std::shared_ptr p_full, std::shared_ptr p_hist); -void TestInplacePrediction(dmlc::any x, std::string predictor, - bst_row_t rows, bst_feature_t cols, - int32_t device = -1); +void TestInplacePrediction(std::shared_ptr x, std::string predictor, bst_row_t rows, + bst_feature_t cols, int32_t device = -1); void TestPredictionWithLesserFeatures(std::string preditor_name); diff --git a/tests/cpp/tree/gpu_hist/test_driver.cu b/tests/cpp/tree/gpu_hist/test_driver.cu index d35f3510f628..8e7164e37bec 100644 --- a/tests/cpp/tree/gpu_hist/test_driver.cu +++ b/tests/cpp/tree/gpu_hist/test_driver.cu @@ -6,41 +6,58 @@ namespace xgboost { namespace tree { TEST(GpuHist, DriverDepthWise) { - Driver driver(TrainParam::kDepthWise); + TrainParam p; + p.InitAllowUnknown(Args{}); + p.grow_policy = TrainParam::kDepthWise; + Driver driver(p, 2); EXPECT_TRUE(driver.Pop().empty()); DeviceSplitCandidate split; split.loss_chg = 1.0f; - GPUExpandEntry root(0, 0, split, .0f, .0f, .0f); + split.left_sum = {0.0f, 1.0f}; + split.right_sum = {0.0f, 1.0f}; + GPUExpandEntry root(0, 0, split, 2.0f, 1.0f, 1.0f); driver.Push({root}); EXPECT_EQ(driver.Pop().front().nid, 0); - driver.Push({GPUExpandEntry{1, 1, split, .0f, .0f, .0f}}); - driver.Push({GPUExpandEntry{2, 1, split, .0f, .0f, .0f}}); - driver.Push({GPUExpandEntry{3, 2, split, .0f, .0f, .0f}}); - // Should return entries from level 1 + driver.Push({GPUExpandEntry{1, 1, split, 2.0f, 1.0f, 1.0f}}); + driver.Push({GPUExpandEntry{2, 1, split, 2.0f, 1.0f, 1.0f}}); + driver.Push({GPUExpandEntry{3, 1, split, 2.0f, 1.0f, 1.0f}}); + driver.Push({GPUExpandEntry{4, 2, split, 2.0f, 1.0f, 1.0f}}); + // Should return 2 entries from level 1 + // as we limited the driver to pop maximum 2 nodes auto res = driver.Pop(); EXPECT_EQ(res.size(), 2); for (auto &e : res) { EXPECT_EQ(e.depth, 1); } + + // Should now return 1 entry from level 1 + res = driver.Pop(); + EXPECT_EQ(res.size(), 1); + EXPECT_EQ(res.at(0).depth, 1); + res = driver.Pop(); - EXPECT_EQ(res[0].depth, 2); + EXPECT_EQ(res.at(0).depth, 2); EXPECT_TRUE(driver.Pop().empty()); } TEST(GpuHist, DriverLossGuided) { DeviceSplitCandidate high_gain; + high_gain.left_sum = {0.0f, 1.0f}; + high_gain.right_sum = {0.0f, 1.0f}; high_gain.loss_chg = 5.0f; - DeviceSplitCandidate low_gain; + DeviceSplitCandidate low_gain = high_gain; low_gain.loss_chg = 1.0f; - Driver driver(TrainParam::kLossGuide); + TrainParam p; + p.grow_policy=TrainParam::kLossGuide; + Driver driver(p); EXPECT_TRUE(driver.Pop().empty()); - GPUExpandEntry root(0, 0, high_gain, .0f, .0f, .0f); + GPUExpandEntry root(0, 0, high_gain, 2.0f, 1.0f, 1.0f ); driver.Push({root}); EXPECT_EQ(driver.Pop().front().nid, 0); // Select high gain first - driver.Push({GPUExpandEntry{1, 1, low_gain, .0f, .0f, .0f}}); - driver.Push({GPUExpandEntry{2, 2, high_gain, .0f, .0f, .0f}}); + driver.Push({GPUExpandEntry{1, 1, low_gain, 2.0f, 1.0f, 1.0f}}); + driver.Push({GPUExpandEntry{2, 2, high_gain, 2.0f, 1.0f, 1.0f}}); auto res = driver.Pop(); EXPECT_EQ(res.size(), 1); EXPECT_EQ(res[0].nid, 2); @@ -49,8 +66,8 @@ TEST(GpuHist, DriverLossGuided) { EXPECT_EQ(res[0].nid, 1); // If equal gain, use nid - driver.Push({GPUExpandEntry{2, 1, low_gain, .0f, .0f, .0f}}); - driver.Push({GPUExpandEntry{1, 1, low_gain, .0f, .0f, .0f}}); + driver.Push({GPUExpandEntry{2, 1, low_gain, 2.0f, 1.0f, 1.0f}}); + driver.Push({GPUExpandEntry{1, 1, low_gain, 2.0f, 1.0f, 1.0f}}); res = driver.Pop(); EXPECT_EQ(res[0].nid, 1); res = driver.Pop(); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 3b543a48d7cc..75d97b681a61 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -95,7 +95,6 @@ TEST(Histogram, GPUDeterministic) { std::vector shm_sizes{48 * 1024, 64 * 1024, 160 * 1024}; for (bool is_dense : is_dense_array) { for (int shm_size : shm_sizes) { - TestDeterministicHistogram(is_dense, shm_size); TestDeterministicHistogram(is_dense, shm_size); } } diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index b3c08736c996..e6069cdfdd4d 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -27,31 +27,40 @@ TEST(GpuHist, DeviceHistogram) { // Ensures that node allocates correctly after reaching `kStopGrowingSize`. dh::safe_cuda(cudaSetDevice(0)); constexpr size_t kNBins = 128; - constexpr size_t kNNodes = 4; + constexpr int kNNodes = 4; constexpr size_t kStopGrowing = kNNodes * kNBins * 2u; - DeviceHistogram histogram; + DeviceHistogramStorage histogram; histogram.Init(0, kNBins); - for (size_t i = 0; i < kNNodes; ++i) { - histogram.AllocateHistogram(i); + for (int i = 0; i < kNNodes; ++i) { + histogram.AllocateHistograms({i}); } histogram.Reset(); ASSERT_EQ(histogram.Data().size(), kStopGrowing); // Use allocated memory but do not erase nidx_map. - for (size_t i = 0; i < kNNodes; ++i) { - histogram.AllocateHistogram(i); + for (int i = 0; i < kNNodes; ++i) { + histogram.AllocateHistograms({i}); } - for (size_t i = 0; i < kNNodes; ++i) { + for (int i = 0; i < kNNodes; ++i) { ASSERT_TRUE(histogram.HistogramExists(i)); } - // Erase existing nidx_map. - for (size_t i = kNNodes; i < kNNodes * 2; ++i) { - histogram.AllocateHistogram(i); - } - for (size_t i = 0; i < kNNodes; ++i) { - ASSERT_FALSE(histogram.HistogramExists(i)); + // Add two new nodes + histogram.AllocateHistograms({kNNodes}); + histogram.AllocateHistograms({kNNodes + 1}); + + // Old cached nodes should still exist + for (int i = 0; i < kNNodes; ++i) { + ASSERT_TRUE(histogram.HistogramExists(i)); } + + // Should be deleted + ASSERT_FALSE(histogram.HistogramExists(kNNodes)); + // Most recent node should exist + ASSERT_TRUE(histogram.HistogramExists(kNNodes + 1)); + + // Add same node again - should fail + EXPECT_ANY_THROW(histogram.AllocateHistograms({kNNodes + 1});); } std::vector GetHostHistGpair() { @@ -96,9 +105,9 @@ void TestBuildHist(bool use_shared_memory_histograms) { thrust::host_vector h_gidx_buffer (page->gidx_buffer.HostVector()); maker.row_partitioner.reset(new RowPartitioner(0, kNRows)); - maker.hist.AllocateHistogram(0); + maker.hist.AllocateHistograms({0}); maker.gpair = gpair.DeviceSpan(); - maker.histogram_rounding = CreateRoundingFactor(maker.gpair);; + maker.histogram_rounding = CreateRoundingFactor(maker.gpair); BuildGradientHistogram( page->GetDeviceAccessor(0), maker.feature_groups->DeviceAccessor(0), @@ -106,7 +115,7 @@ void TestBuildHist(bool use_shared_memory_histograms) { maker.hist.GetNodeHistogram(0), maker.histogram_rounding, !use_shared_memory_histograms); - DeviceHistogram& d_hist = maker.hist; + DeviceHistogramStorage& d_hist = maker.hist; auto node_histogram = d_hist.GetNodeHistogram(0); // d_hist.data stored in float, not gradient pair @@ -129,12 +138,10 @@ void TestBuildHist(bool use_shared_memory_histograms) { TEST(GpuHist, BuildHistGlobalMem) { TestBuildHist(false); - TestBuildHist(false); } TEST(GpuHist, BuildHistSharedMem) { TestBuildHist(true); - TestBuildHist(true); } HistogramCutsWrapper GetHostCutMatrix () { @@ -198,7 +205,7 @@ TEST(GpuHist, EvaluateRootSplit) { // Initialize GPUHistMakerDevice::hist maker.hist.Init(0, (max_bins - 1) * kNCols); - maker.hist.AllocateHistogram(0); + maker.hist.AllocateHistograms({0}); // Each row of hist_gpair represents gpairs for one feature. // Each entry represents a bin. std::vector hist_gpair = GetHostHistGpair(); diff --git a/tests/cpp/tree/test_tree_policy.cc b/tests/cpp/tree/test_tree_policy.cc index 15f4cd31bc99..1387e94a8e91 100644 --- a/tests/cpp/tree/test_tree_policy.cc +++ b/tests/cpp/tree/test_tree_policy.cc @@ -141,9 +141,9 @@ TEST_F(TestGrowPolicy, Approx) { TEST_F(TestGrowPolicy, Hist) { this->TestTreeGrowPolicy("hist", "depthwise"); - this->TestTreeGrowPolicy("hist", "lossguide"); + // this->TestTreeGrowPolicy("hist", "lossguide"); - this->TestCombination("hist"); + // this->TestCombination("hist"); } #if defined(XGBOOST_USE_CUDA) diff --git a/tests/distributed/distributed_gpu.py b/tests/distributed/distributed_gpu.py index a2ab6d398018..d10d2aed4884 100644 --- a/tests/distributed/distributed_gpu.py +++ b/tests/distributed/distributed_gpu.py @@ -8,46 +8,44 @@ def run_test(name, params_fun): """Runs a distributed GPU test.""" # Always call this before using distributed module - xgb.rabit.init() - rank = xgb.rabit.get_rank() - world = xgb.rabit.get_world_size() - - # Load file, file will be automatically sharded in distributed mode. - dtrain = xgb.DMatrix('../../demo/data/agaricus.txt.train') - dtest = xgb.DMatrix('../../demo/data/agaricus.txt.test') - - params, n_rounds = params_fun(rank) - - # Specify validations set to watch performance - watchlist = [(dtest, 'eval'), (dtrain, 'train')] - - # Run training, all the features in training API is available. - # Currently, this script only support calling train once for fault recovery purpose. - bst = xgb.train(params, dtrain, n_rounds, watchlist, early_stopping_rounds=2) - - # Have each worker save its model - model_name = "test.model.%s.%d" % (name, rank) - bst.dump_model(model_name, with_stats=True) - xgb.rabit.allreduce(np.ones((1, 1)), xgb.rabit.Op.MAX) # sync - xgb.rabit.tracker_print("Finished training\n") - - if (rank == 0): - for i in range(0, world): - model_name_root = "test.model.%s.%d" % (name, i) - for j in range(0, world): - if i == j: - continue - with open(model_name_root, 'r') as model_root: - contents_root = model_root.read() - model_name_rank = "test.model.%s.%d" % (name, j) - with open(model_name_rank, 'r') as model_rank: - contents_rank = model_rank.read() - if contents_root != contents_rank: - raise Exception( - ('Worker models diverged: test.model.%s.%d ' - 'differs from test.model.%s.%d') % (name, i, name, j)) - - xgb.rabit.finalize() + with xgb.rabit.RabitContext(): + rank = xgb.rabit.get_rank() + world = xgb.rabit.get_world_size() + + # Load file, file will be automatically sharded in distributed mode. + dtrain = xgb.DMatrix('../../demo/data/agaricus.txt.train') + dtest = xgb.DMatrix('../../demo/data/agaricus.txt.test') + + params, n_rounds = params_fun(rank) + + # Specify validations set to watch performance + watchlist = [(dtest, 'eval'), (dtrain, 'train')] + + # Run training, all the features in training API is available. + # Currently, this script only support calling train once for fault recovery purpose. + bst = xgb.train(params, dtrain, n_rounds, watchlist, early_stopping_rounds=2) + + # Have each worker save its model + model_name = "test.model.%s.%d" % (name, rank) + bst.dump_model(model_name, with_stats=True) + xgb.rabit.allreduce(np.ones((1, 1)), xgb.rabit.Op.MAX) # sync + xgb.rabit.tracker_print("Finished training\n") + + if (rank == 0): + for i in range(0, world): + model_name_root = "test.model.%s.%d" % (name, i) + for j in range(0, world): + if i == j: + continue + with open(model_name_root, 'r') as model_root: + contents_root = model_root.read() + model_name_rank = "test.model.%s.%d" % (name, j) + with open(model_name_rank, 'r') as model_rank: + contents_rank = model_rank.read() + if contents_root != contents_rank: + raise Exception( + ('Worker models diverged: test.model.%s.%d ' + 'differs from test.model.%s.%d') % (name, i, name, j)) base_params = { diff --git a/tests/distributed/test_basic.py b/tests/distributed/test_basic.py index f7c1ffee3efc..db2916b39a3c 100644 --- a/tests/distributed/test_basic.py +++ b/tests/distributed/test_basic.py @@ -2,28 +2,23 @@ import xgboost as xgb # Always call this before using distributed module -xgb.rabit.init() +with xgb.rabit.RabitContext(): + # Load file, file will be automatically sharded in distributed mode. + dtrain = xgb.DMatrix('../../demo/data/agaricus.txt.train') + dtest = xgb.DMatrix('../../demo/data/agaricus.txt.test') -# Load file, file will be automatically sharded in distributed mode. -dtrain = xgb.DMatrix('../../demo/data/agaricus.txt.train') -dtest = xgb.DMatrix('../../demo/data/agaricus.txt.test') + # Specify parameters via map, definition are same as c++ version + param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} -# Specify parameters via map, definition are same as c++ version -param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} + # Specify validations set to watch performance + watchlist = [(dtest, 'eval'), (dtrain, 'train')] + num_round = 20 -# Specify validations set to watch performance -watchlist = [(dtest, 'eval'), (dtrain, 'train')] -num_round = 20 + # Run training, all the features in training API is available. + # Currently, this script only support calling train once for fault recovery purpose. + bst = xgb.train(param, dtrain, num_round, watchlist, early_stopping_rounds=2) -# Run training, all the features in training API is available. -# Currently, this script only support calling train once for fault recovery purpose. -bst = xgb.train(param, dtrain, num_round, watchlist, early_stopping_rounds=2) - -# Save the model, only ask process 0 to save the model. -if xgb.rabit.get_rank() == 0: - bst.save_model("test.model") - xgb.rabit.tracker_print("Finished training\n") - -# Notify the tracker all training has been successful -# This is only needed in distributed training. -xgb.rabit.finalize() + # Save the model, only ask process 0 to save the model. + if xgb.rabit.get_rank() == 0: + bst.save_model("test.model") + xgb.rabit.tracker_print("Finished training\n") diff --git a/tests/distributed/test_federated.py b/tests/distributed/test_federated.py index 5b5b167fcd32..a3cdbc1e2912 100644 --- a/tests/distributed/test_federated.py +++ b/tests/distributed/test_federated.py @@ -27,31 +27,26 @@ def run_worker(port: int, world_size: int, rank: int) -> None: f'federated_client_key={CLIENT_KEY}', f'federated_client_cert={CLIENT_CERT}' ] - xgb.rabit.init([e.encode() for e in rabit_env]) - - # Load file, file will not be sharded in federated mode. - dtrain = xgb.DMatrix('agaricus.txt.train-%02d' % rank) - dtest = xgb.DMatrix('agaricus.txt.test-%02d' % rank) - - # Specify parameters via map, definition are same as c++ version - param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} - - # Specify validations set to watch performance - watchlist = [(dtest, 'eval'), (dtrain, 'train')] - num_round = 20 - - # Run training, all the features in training API is available. - # Currently, this script only support calling train once for fault recovery purpose. - bst = xgb.train(param, dtrain, num_round, evals=watchlist, early_stopping_rounds=2) - - # Save the model, only ask process 0 to save the model. - if xgb.rabit.get_rank() == 0: - bst.save_model("test.model.json") - xgb.rabit.tracker_print("Finished training\n") - - # Notify the tracker all training has been successful - # This is only needed in distributed training. - xgb.rabit.finalize() + with xgb.rabit.RabitContext([e.encode() for e in rabit_env]): + # Load file, file will not be sharded in federated mode. + dtrain = xgb.DMatrix('agaricus.txt.train-%02d' % rank) + dtest = xgb.DMatrix('agaricus.txt.test-%02d' % rank) + + # Specify parameters via map, definition are same as c++ version + param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} + + # Specify validations set to watch performance + watchlist = [(dtest, 'eval'), (dtrain, 'train')] + num_round = 20 + + # Run training, all the features in training API is available. + bst = xgb.train(param, dtrain, num_round, evals=watchlist, + early_stopping_rounds=2) + + # Save the model, only ask process 0 to save the model. + if xgb.rabit.get_rank() == 0: + bst.save_model("test.model.json") + xgb.rabit.tracker_print("Finished training\n") def run_test() -> None: diff --git a/tests/distributed/test_issue3402.py b/tests/distributed/test_issue3402.py index e3b87931bf67..7a40d3420ebb 100644 --- a/tests/distributed/test_issue3402.py +++ b/tests/distributed/test_issue3402.py @@ -2,78 +2,73 @@ import xgboost as xgb import numpy as np -xgb.rabit.init() +with xgb.rabit.RabitContext(): + X = [ + [15.00,28.90,29.00,3143.70,0.00,0.10,69.90,90.00,13726.07,0.00,2299.70,0.00,0.05, + 4327.03,0.00,24.00,0.18,3.00,0.41,3.77,0.00,0.00,4.00,0.00,150.92,0.00,2.00,0.00, + 0.01,138.00,1.00,0.02,69.90,0.00,0.83,5.00,0.01,0.12,47.30,0.00,296.00,0.16,0.00, + 0.00,27.70,7.00,7.25,4406.16,1.00,0.54,245.28,3.00,0.06,306.50,5143.00,29.00,23.74, + 548.00,2.00,68.00,70.90,25.45,0.39,0.00,0.01,497.11,0.00,42.00,83.00,4.00,0.00,1.00, + 0.00,104.35,94.12,0.03,79.23,237.69,1.00,0.04,0.01,0.02,2.00,108.81,7.00,12.00,0.46, + 31.00,0.00,0.15,74.59,0.00,19.50,0.00,0.75,0.06,0.08,118.00,35.90,0.01,0.07,1.00, + 0.03,81.18,13.33,0.00,0.00,0.00,0.00,0.00,0.41,0.00,0.15,57.00,0.00,22.00,449.68, + 0.00,0.00,2.00,195.26,51.58,306.50,0.10,1.00,0.00,258.00,21.00,0.43,3.00,16.00,0.00, + 0.00,0.00,0.00,1.00,74.51,4.00,0.02,35.90,30.00,8.69,0.00,0.36,5.00,2.00,3.00,0.26, + 9.50,8.00,11.00,11918.15,0.00,258.00,13.00,9.04,0.14,604.65,0.92,74.59,0.00,0.00, + 72.76,1.00,0.22,64.00,2.00,0.00,0.00,0.02,0.00,305.50,27.70,0.02,0.00,177.00,14.00, + 0.00,0.05,90.00,0.03,0.00,1.00,0.43,4.00,0.05,0.09,431.00,0.00,2.00,0.00,0.00,1.00, + 0.25,0.17,0.00,0.00,21.00,94.12,0.17,0.00,0.00,0.00,548.00,0.00,68.00,0.00,0.00,9.50, + 25.45,1390.31,7.00,0.00,2.00,310.70,0.00,0.01,0.01,0.03,81.40,1.00,0.02,0.00,9.00, + 6.00,0.00,175.76,36.00,0.00,20.75,2.00,0.00,0.00,0.00,0.22,74.16,0.10,56.81,0.00, + 2197.03,0.00,197.66,0.00,55.00,20.00,367.18,22.00,0.00,0.01,1510.26,0.24,0.00,0.01, + 0.00,11.00,278.10,61.70,278.10,0.00,0.08,0.57,1.00,0.65,255.60,0.00,0.86,0.25,70.95, + 2299.70,0.23,0.05,92.70,1.00,38.00,0.00,0.00,56.81,21.85,0.00,23.74,0.00,2.00,0.03, + 2.00,0.00,347.58,30.00,243.55,109.00,0.00,296.00,6.00,6.00,0.00,0.00,109.00,2299.70, + 0.00,0.01,0.08,1.00,4745.09,4.00,0.18,0.00,0.17,0.02,0.00,1.00,147.13,71.07,2115.16, + 0.00,0.26,0.00,43.00,604.90,49.44,4327.03,0.68,0.75,0.10,86.36,52.98,0.20,0.00,22.50, + 305.50,0.00,1.00,0.00,7.00,0.78,0.00,296.00,22.50,0.00,5.00,2979.54,1.00,14.00,51.00, + 0.42,0.11,0.00,1.00,0.00,0.00,70.90,37.84,0.02,548.40,0.00,46.35,5.00,1.66,0.29,0.00, + 0.02,2255.69,160.53,790.64,6775.15,0.68,19.50,2299.70,79.87,6.00,0.00,60.00,0.27, + 233.77,10.00,0.00,0.00,23.00,82.27,1.00,0.00,1.00,0.42,1.00,0.01,0.40,0.41,9.50,2299.70, + 46.30,0.00,0.00,2299.70,3.00,0.00,0.00,83.00,1.00], + [48.00,80.89,69.90,11570.00,26.00,0.40,468.00,0.00,5739.46,0.00,1480.00,90.89,0.00, + 14042.09,3600.08,120.00,0.09,31.00,0.25,2.36,0.00,7.00,22.00,0.00,257.59,0.00,6.00, + 260.00,0.05,313.00,1.00,0.07,468.00,0.00,0.67,11.00,0.02,0.32,0.00,0.00,1387.61,0.34, + 0.00,0.00,158.04,6.00,13.98,12380.05,0.00,0.16,122.74,3.00,0.18,291.33,7517.79,124.00, + 45.08,900.00,1.00,0.00,577.25,79.75,0.39,0.00,0.00,244.62,0.00,57.00,178.00,19.00, + 0.00,1.00,386.10,103.51,480.00,0.06,129.41,334.31,1.00,0.06,0.00,0.06,3.00,125.55, + 0.00,76.00,0.14,30.00,0.00,0.03,411.29,791.33,55.00,0.12,3.80,0.07,0.01,188.00,221.11, + 0.01,0.15,1.00,0.18,144.32,15.00,0.00,0.05,0.00,3.00,0.00,0.20,0.00,0.14,62.00,0.06, + 55.00,239.35,0.00,0.00,2.00,534.20,747.50,400.57,0.40,0.00,0.00,219.98,30.00,0.25, + 1.00,70.00,0.02,0.04,0.00,0.00,7.00,747.50,8.67,0.06,271.01,28.00,5.63,75.39,0.46, + 11.00,3.00,19.00,0.38,131.74,23.00,39.00,30249.41,0.00,202.68,2.00,64.94,0.03,2787.68, + 0.54,35.00,0.02,106.03,25.00,1.00,0.10,45.00,2.00,0.00,0.00,0.00,0.00,449.27,172.38, + 0.05,0.00,550.00,130.00,2006.55,0.07,0.00,0.03,0.00,5.00,0.21,22.00,0.05,0.01,1011.40, + 0.00,4.00,3600.08,0.00,1.00,1.00,1.00,0.00,3.00,9.00,270.00,0.12,0.03,0.00,0.00,820.00, + 1827.50,0.00,100.33,0.00,131.74,53.16,9557.97,7.00,0.00,11.00,180.81,0.00,0.01,0.04, + 0.02,1480.00,0.92,0.05,0.00,15.00,6.00,0.00,161.42,28.00,169.00,35.60,4.00,0.12,0.00, + 0.00,0.27,230.56,0.42,171.90,0.00,28407.51,1.00,883.10,0.00,261.00,9.00,1031.67,38.00, + 0.00,0.04,1607.68,0.32,791.33,0.04,1403.00,2.00,2260.50,88.08,2260.50,0.00,0.12,0.75, + 3.00,0.00,1231.68,0.07,0.60,0.24,0.00,0.00,0.15,0.14,753.50,1.00,95.00,7.00,0.26, + 77.63,38.45,0.00,42.65,0.00,14.00,0.07,6.00,0.00,1911.59,43.00,386.77,1324.80,0.00, + 518.00,10.00,10.00,0.11,0.00,1324.80,0.00,0.00,0.02,0.16,1.00,10492.12,5.00,0.94, + 5.00,0.08,0.10,1.00,0.92,3731.49,105.81,6931.39,0.00,0.43,0.00,118.00,5323.71,81.66, + 14042.09,0.08,0.20,0.40,96.64,0.00,0.08,4.00,1028.82,353.00,0.00,2.00,32.00,43.00, + 5.16,75.39,900.00,232.10,3.00,5.00,6049.88,1.00,126.00,46.00,0.59,0.15,0.00,8.00, + 7.00,0.00,577.25,0.00,0.07,2415.10,0.00,83.72,9.00,1.76,0.20,0.00,0.17,3278.65,155.26, + 4415.50,22731.62,1.00,55.00,0.00,499.94,22.00,0.58,67.00,0.21,341.72,16.00,0.00,965.07, + 17.00,138.41,0.00,0.00,1.00,0.14,1.00,0.02,0.35,1.69,369.00,1300.00,25.00,0.00,0.01, + 0.00,0.00,0.00,0.00,52.00,8.00]] + X = np.array(X) + y = [1, 0] -X = [ - [15.00,28.90,29.00,3143.70,0.00,0.10,69.90,90.00,13726.07,0.00,2299.70,0.00,0.05, - 4327.03,0.00,24.00,0.18,3.00,0.41,3.77,0.00,0.00,4.00,0.00,150.92,0.00,2.00,0.00, - 0.01,138.00,1.00,0.02,69.90,0.00,0.83,5.00,0.01,0.12,47.30,0.00,296.00,0.16,0.00, - 0.00,27.70,7.00,7.25,4406.16,1.00,0.54,245.28,3.00,0.06,306.50,5143.00,29.00,23.74, - 548.00,2.00,68.00,70.90,25.45,0.39,0.00,0.01,497.11,0.00,42.00,83.00,4.00,0.00,1.00, - 0.00,104.35,94.12,0.03,79.23,237.69,1.00,0.04,0.01,0.02,2.00,108.81,7.00,12.00,0.46, - 31.00,0.00,0.15,74.59,0.00,19.50,0.00,0.75,0.06,0.08,118.00,35.90,0.01,0.07,1.00, - 0.03,81.18,13.33,0.00,0.00,0.00,0.00,0.00,0.41,0.00,0.15,57.00,0.00,22.00,449.68, - 0.00,0.00,2.00,195.26,51.58,306.50,0.10,1.00,0.00,258.00,21.00,0.43,3.00,16.00,0.00, - 0.00,0.00,0.00,1.00,74.51,4.00,0.02,35.90,30.00,8.69,0.00,0.36,5.00,2.00,3.00,0.26, - 9.50,8.00,11.00,11918.15,0.00,258.00,13.00,9.04,0.14,604.65,0.92,74.59,0.00,0.00, - 72.76,1.00,0.22,64.00,2.00,0.00,0.00,0.02,0.00,305.50,27.70,0.02,0.00,177.00,14.00, - 0.00,0.05,90.00,0.03,0.00,1.00,0.43,4.00,0.05,0.09,431.00,0.00,2.00,0.00,0.00,1.00, - 0.25,0.17,0.00,0.00,21.00,94.12,0.17,0.00,0.00,0.00,548.00,0.00,68.00,0.00,0.00,9.50, - 25.45,1390.31,7.00,0.00,2.00,310.70,0.00,0.01,0.01,0.03,81.40,1.00,0.02,0.00,9.00, - 6.00,0.00,175.76,36.00,0.00,20.75,2.00,0.00,0.00,0.00,0.22,74.16,0.10,56.81,0.00, - 2197.03,0.00,197.66,0.00,55.00,20.00,367.18,22.00,0.00,0.01,1510.26,0.24,0.00,0.01, - 0.00,11.00,278.10,61.70,278.10,0.00,0.08,0.57,1.00,0.65,255.60,0.00,0.86,0.25,70.95, - 2299.70,0.23,0.05,92.70,1.00,38.00,0.00,0.00,56.81,21.85,0.00,23.74,0.00,2.00,0.03, - 2.00,0.00,347.58,30.00,243.55,109.00,0.00,296.00,6.00,6.00,0.00,0.00,109.00,2299.70, - 0.00,0.01,0.08,1.00,4745.09,4.00,0.18,0.00,0.17,0.02,0.00,1.00,147.13,71.07,2115.16, - 0.00,0.26,0.00,43.00,604.90,49.44,4327.03,0.68,0.75,0.10,86.36,52.98,0.20,0.00,22.50, - 305.50,0.00,1.00,0.00,7.00,0.78,0.00,296.00,22.50,0.00,5.00,2979.54,1.00,14.00,51.00, - 0.42,0.11,0.00,1.00,0.00,0.00,70.90,37.84,0.02,548.40,0.00,46.35,5.00,1.66,0.29,0.00, - 0.02,2255.69,160.53,790.64,6775.15,0.68,19.50,2299.70,79.87,6.00,0.00,60.00,0.27, - 233.77,10.00,0.00,0.00,23.00,82.27,1.00,0.00,1.00,0.42,1.00,0.01,0.40,0.41,9.50,2299.70, - 46.30,0.00,0.00,2299.70,3.00,0.00,0.00,83.00,1.00], - [48.00,80.89,69.90,11570.00,26.00,0.40,468.00,0.00,5739.46,0.00,1480.00,90.89,0.00, - 14042.09,3600.08,120.00,0.09,31.00,0.25,2.36,0.00,7.00,22.00,0.00,257.59,0.00,6.00, - 260.00,0.05,313.00,1.00,0.07,468.00,0.00,0.67,11.00,0.02,0.32,0.00,0.00,1387.61,0.34, - 0.00,0.00,158.04,6.00,13.98,12380.05,0.00,0.16,122.74,3.00,0.18,291.33,7517.79,124.00, - 45.08,900.00,1.00,0.00,577.25,79.75,0.39,0.00,0.00,244.62,0.00,57.00,178.00,19.00, - 0.00,1.00,386.10,103.51,480.00,0.06,129.41,334.31,1.00,0.06,0.00,0.06,3.00,125.55, - 0.00,76.00,0.14,30.00,0.00,0.03,411.29,791.33,55.00,0.12,3.80,0.07,0.01,188.00,221.11, - 0.01,0.15,1.00,0.18,144.32,15.00,0.00,0.05,0.00,3.00,0.00,0.20,0.00,0.14,62.00,0.06, - 55.00,239.35,0.00,0.00,2.00,534.20,747.50,400.57,0.40,0.00,0.00,219.98,30.00,0.25, - 1.00,70.00,0.02,0.04,0.00,0.00,7.00,747.50,8.67,0.06,271.01,28.00,5.63,75.39,0.46, - 11.00,3.00,19.00,0.38,131.74,23.00,39.00,30249.41,0.00,202.68,2.00,64.94,0.03,2787.68, - 0.54,35.00,0.02,106.03,25.00,1.00,0.10,45.00,2.00,0.00,0.00,0.00,0.00,449.27,172.38, - 0.05,0.00,550.00,130.00,2006.55,0.07,0.00,0.03,0.00,5.00,0.21,22.00,0.05,0.01,1011.40, - 0.00,4.00,3600.08,0.00,1.00,1.00,1.00,0.00,3.00,9.00,270.00,0.12,0.03,0.00,0.00,820.00, - 1827.50,0.00,100.33,0.00,131.74,53.16,9557.97,7.00,0.00,11.00,180.81,0.00,0.01,0.04, - 0.02,1480.00,0.92,0.05,0.00,15.00,6.00,0.00,161.42,28.00,169.00,35.60,4.00,0.12,0.00, - 0.00,0.27,230.56,0.42,171.90,0.00,28407.51,1.00,883.10,0.00,261.00,9.00,1031.67,38.00, - 0.00,0.04,1607.68,0.32,791.33,0.04,1403.00,2.00,2260.50,88.08,2260.50,0.00,0.12,0.75, - 3.00,0.00,1231.68,0.07,0.60,0.24,0.00,0.00,0.15,0.14,753.50,1.00,95.00,7.00,0.26, - 77.63,38.45,0.00,42.65,0.00,14.00,0.07,6.00,0.00,1911.59,43.00,386.77,1324.80,0.00, - 518.00,10.00,10.00,0.11,0.00,1324.80,0.00,0.00,0.02,0.16,1.00,10492.12,5.00,0.94, - 5.00,0.08,0.10,1.00,0.92,3731.49,105.81,6931.39,0.00,0.43,0.00,118.00,5323.71,81.66, - 14042.09,0.08,0.20,0.40,96.64,0.00,0.08,4.00,1028.82,353.00,0.00,2.00,32.00,43.00, - 5.16,75.39,900.00,232.10,3.00,5.00,6049.88,1.00,126.00,46.00,0.59,0.15,0.00,8.00, - 7.00,0.00,577.25,0.00,0.07,2415.10,0.00,83.72,9.00,1.76,0.20,0.00,0.17,3278.65,155.26, - 4415.50,22731.62,1.00,55.00,0.00,499.94,22.00,0.58,67.00,0.21,341.72,16.00,0.00,965.07, - 17.00,138.41,0.00,0.00,1.00,0.14,1.00,0.02,0.35,1.69,369.00,1300.00,25.00,0.00,0.01, - 0.00,0.00,0.00,0.00,52.00,8.00]] -X = np.array(X) -y = [1, 0] + dtrain = xgb.DMatrix(X, label=y) -dtrain = xgb.DMatrix(X, label=y) + param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic' } + watchlist = [(dtrain,'train')] + num_round = 2 + bst = xgb.train(param, dtrain, num_round, watchlist) -param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic' } -watchlist = [(dtrain,'train')] -num_round = 2 -bst = xgb.train(param, dtrain, num_round, watchlist) - -if xgb.rabit.get_rank() == 0: - bst.save_model("test_issue3402.model") - xgb.rabit.tracker_print("Finished training\n") - -# Notify the tracker all training has been successful -# This is only needed in distributed training. -xgb.rabit.finalize() + if xgb.rabit.get_rank() == 0: + bst.save_model("test_issue3402.model") + xgb.rabit.tracker_print("Finished training\n") diff --git a/tests/python/test_tracker.py b/tests/python/test_tracker.py index 2e113898f4de..885221aae4ae 100644 --- a/tests/python/test_tracker.py +++ b/tests/python/test_tracker.py @@ -16,10 +16,9 @@ def test_rabit_tracker(): rabit_env = [] for k, v in worker_env.items(): rabit_env.append(f"{k}={v}".encode()) - xgb.rabit.init(rabit_env) - ret = xgb.rabit.broadcast('test1234', 0) - assert str(ret) == 'test1234' - xgb.rabit.finalize() + with xgb.rabit.RabitContext(rabit_env): + ret = xgb.rabit.broadcast('test1234', 0) + assert str(ret) == 'test1234' def run_rabit_ops(client, n_workers): diff --git a/tests/python/testing.py b/tests/python/testing.py index 8633e4caa52d..29947f227f86 100644 --- a/tests/python/testing.py +++ b/tests/python/testing.py @@ -7,7 +7,6 @@ from contextlib import contextmanager from io import StringIO from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED -from xgboost.compat import DASK_INSTALLED import pytest import gc import xgboost as xgb @@ -44,8 +43,14 @@ def no_sklearn(): def no_dask(): - return {'condition': not DASK_INSTALLED, - 'reason': 'Dask is not installed'} + try: + import pkg_resources + + pkg_resources.get_distribution("dask") + DASK_INSTALLED = True + except pkg_resources.DistributionNotFound: + DASK_INSTALLED = False + return {"condition": not DASK_INSTALLED, "reason": "Dask is not installed"} def no_pandas():