Merge branch 'master' of https://github.com/dmlc/xgboost into optimization_part_applysplit
ShvetsKS committed May 22, 2022
2 parents efb4f50 + f6babc8 commit 8cd8cd5
Showing 63 changed files with 1,406 additions and 1,082 deletions.
5 changes: 1 addition & 4 deletions Makefile
@@ -91,10 +91,7 @@ endif
# If any of the dask tests failed, contributor won't see the other error.
mypy:
cd python-package; \
mypy ./xgboost/dask.py && \
mypy ./xgboost/rabit.py && \
mypy ./xgboost/tracker.py && \
mypy ./xgboost/sklearn.py && \
mypy . && \
mypy ../demo/guide-python/external_memory.py && \
mypy ../demo/guide-python/categorical.py && \
mypy ../demo/guide-python/cat_in_the_dat.py && \
6 changes: 6 additions & 0 deletions doc/tutorials/saving_model.rst
@@ -68,6 +68,12 @@ a filename with ``.json`` or ``.ubj`` as file extension, the latter is the exten
xgb.save(bst, 'model_file_name.json')
.. note::

Only load models from JSON files that were produced by XGBoost. Attempting to load
JSON files that were produced by an external source may lead to undefined behaviors
and crashes.

While for memory snapshot, UBJSON is the default starting with xgboost 1.6.

***************************************************************
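For readers of this documentation change who use the JVM packages, here is a minimal sketch of the same save/load flow with XGBoost4J. It is illustrative only: it assumes the standard ml.dmlc.xgboost4j.scala API with tiny in-memory data, and that the on-disk format is selected from the file extension (.json or .ubj) as the tutorial describes.

import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, XGBoost}

// Train a tiny booster on illustrative data (2 rows, 2 features).
val train = new DMatrix(Array(1.0f, 2.0f, 3.0f, 4.0f), 2, 2)
train.setLabel(Array(0.0f, 1.0f))
val booster: Booster = XGBoost.train(train, Map("objective" -> "binary:logistic"), 2)

// The extension picks the format: JSON here, UBJSON with a ".ubj" suffix.
booster.saveModel("model_file_name.json")

// Only load JSON/UBJ files that XGBoost itself produced.
val reloaded: Booster = XGBoost.loadModel("model_file_name.json")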
7 changes: 3 additions & 4 deletions include/xgboost/gbm.h
@@ -111,15 +111,14 @@ class GradientBooster : public Model, public Configurable {
/*!
* \brief Inplace prediction.
*
* \param x A type erased data adapter.
* \param p_fmat A proxy DMatrix that contains the data and related
* meta info.
* \param missing Missing value in the data.
* \param [in,out] out_preds The output preds.
* \param layer_begin (Optional) Beginning of boosted tree layer used for prediction.
* \param layer_end (Optional) End of booster layer. 0 means do not limit trees.
*/
virtual void InplacePredict(dmlc::any const &, std::shared_ptr<DMatrix>, float,
PredictionCacheEntry*,
uint32_t,
virtual void InplacePredict(std::shared_ptr<DMatrix>, float, PredictionCacheEntry*, uint32_t,
uint32_t) const {
LOG(FATAL) << "Inplace predict is not supported by current booster.";
}
13 changes: 4 additions & 9 deletions include/xgboost/learner.h
@@ -139,21 +139,16 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
/*!
* \brief Inplace prediction.
*
* \param x A type erased data adapter.
* \param p_m An optional Proxy DMatrix object storing meta info like
* base margin. Can be nullptr.
* \param p_fmat A proxy DMatrix that contains the data and related meta info.
* \param type Prediction type.
* \param missing Missing value in the data.
* \param [in,out] out_preds Pointer to output prediction vector.
* \param layer_begin Beginning of boosted tree layer used for prediction.
* \param layer_end End of booster layer. 0 means do not limit trees.
*/
virtual void InplacePredict(dmlc::any const &x,
std::shared_ptr<DMatrix> p_m,
PredictionType type,
float missing,
HostDeviceVector<bst_float> **out_preds,
uint32_t layer_begin, uint32_t layer_end) = 0;
virtual void InplacePredict(std::shared_ptr<DMatrix> p_m, PredictionType type, float missing,
HostDeviceVector<bst_float>** out_preds, uint32_t layer_begin,
uint32_t layer_end) = 0;

/*!
* \brief Calculate feature score. See doc in C API for outputs.
12 changes: 6 additions & 6 deletions include/xgboost/predictor.h
@@ -145,7 +145,9 @@ class Predictor {

/**
* \brief Inplace prediction.
* \param x Type erased data adapter.
*
* \param p_fmat A proxy DMatrix that contains the data and related
* meta info.
* \param model The model to predict from.
* \param missing Missing value in the data.
* \param [in,out] out_preds The output preds.
@@ -154,11 +156,9 @@
*
* \return True if the data can be handled by current predictor, false otherwise.
*/
virtual bool InplacePredict(dmlc::any const &x, std::shared_ptr<DMatrix> p_m,
const gbm::GBTreeModel &model, float missing,
PredictionCacheEntry *out_preds,
uint32_t tree_begin = 0,
uint32_t tree_end = 0) const = 0;
virtual bool InplacePredict(std::shared_ptr<DMatrix> p_fmat, const gbm::GBTreeModel& model,
float missing, PredictionCacheEntry* out_preds,
uint32_t tree_begin = 0, uint32_t tree_end = 0) const = 0;
/**
* \brief online prediction function, predict score for one instance at a time
* NOTE: use the batch prediction interface if possible, batch prediction is
GpuPreXGBoost.scala
@@ -61,15 +61,14 @@ class GpuPreXGBoost extends PreXGBoostProvider {
* @param estimator [[XGBoostClassifier]] or [[XGBoostRegressor]]
* @param dataset the training data
* @param params all user defined and defaulted params
* @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ])
* Boolean if building DMatrix in rabit context
* @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ])
* RDD[() => Watches] will be used as the training input
* Option[ RDD[_] ] is the optional cached RDD
*/
override def buildDatasetToRDD(estimator: Estimator[_],
dataset: Dataset[_],
params: Map[String, Any]):
XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = {
XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]) = {
GpuPreXGBoost.buildDatasetToRDD(estimator, dataset, params)
}

@@ -123,16 +122,15 @@ object GpuPreXGBoost extends PreXGBoostProvider {
* @param estimator supports XGBoostClassifier and XGBoostRegressor
* @param dataset the training data
* @param params all user defined and defaulted params
* @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ])
* Boolean if building DMatrix in rabit context
* @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ])
* RDD[() => Watches] will be used as the training input to build DMatrix
* Option[ RDD[_] ] is the optional cached RDD
*/
override def buildDatasetToRDD(
estimator: Estimator[_],
dataset: Dataset[_],
params: Map[String, Any]):
XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = {
XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]) = {

val (Seq(labelName, weightName, marginName), feturesCols, groupName, evalSets) =
estimator match {
@@ -170,7 +168,7 @@ object GpuPreXGBoost extends PreXGBoostProvider {
xgbExecParams: XGBoostExecutionParams =>
val dataMap = prepareInputData(trainingData, evalDataMap, xgbExecParams.numWorkers,
xgbExecParams.cacheTrainingSet)
(true, buildRDDWatches(dataMap, xgbExecParams, evalDataMap.isEmpty), None)
(buildRDDWatches(dataMap, xgbExecParams, evalDataMap.isEmpty), None)
}

/**
PreXGBoost.scala
@@ -101,16 +101,15 @@ object PreXGBoost extends PreXGBoostProvider {
* @param estimator supports XGBoostClassifier and XGBoostRegressor
* @param dataset the training data
* @param params all user defined and defaulted params
* @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ])
* Boolean if building DMatrix in rabit context
* @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ])
* RDD[() => Watches] will be used as the training input
* Option[RDD[_]\] is the optional cached RDD
*/
override def buildDatasetToRDD(
estimator: Estimator[_],
dataset: Dataset[_],
params: Map[String, Any]): XGBoostExecutionParams =>
(Boolean, RDD[() => Watches], Option[RDD[_]]) = {
(RDD[() => Watches], Option[RDD[_]]) = {

if (optionProvider.isDefined && optionProvider.get.providerEnabled(Some(dataset))) {
return optionProvider.get.buildDatasetToRDD(estimator, dataset, params)
@@ -172,12 +171,12 @@ object PreXGBoost extends PreXGBoostProvider {
val cachedRDD = if (xgbExecParams.cacheTrainingSet) {
Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK))
} else None
(false, trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
(trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
case Right(trainingData) =>
val cachedRDD = if (xgbExecParams.cacheTrainingSet) {
Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK))
} else None
(false, trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
(trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
}

}
@@ -324,20 +323,20 @@ object PreXGBoost extends PreXGBoostProvider {
trainingSet: RDD[XGBLabeledPoint],
evalRDDMap: Map[String, RDD[XGBLabeledPoint]] = Map(),
hasGroup: Boolean = false):
XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = {
XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]) = {

xgbExecParams: XGBoostExecutionParams =>
composeInputData(trainingSet, hasGroup, xgbExecParams.numWorkers) match {
case Left(trainingData) =>
val cachedRDD = if (xgbExecParams.cacheTrainingSet) {
Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK))
} else None
(false, trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
(trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
case Right(trainingData) =>
val cachedRDD = if (xgbExecParams.cacheTrainingSet) {
Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK))
} else None
(false, trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
(trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD)
}
}

PreXGBoostProvider.scala
@@ -50,16 +50,15 @@ private[scala] trait PreXGBoostProvider {
* @param estimator supports XGBoostClassifier and XGBoostRegressor
* @param dataset the training data
* @param params all user defined and defaulted params
* @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ])
* Boolean if building DMatrix in rabit context
* @return [[XGBoostExecutionParams]] => (RDD[[() => Watches]], Option[ RDD[_] ])
* RDD[() => Watches] will be used as the training input to build DMatrix
* Option[ RDD[_] ] is the optional cached RDD
*/
def buildDatasetToRDD(
estimator: Estimator[_],
dataset: Dataset[_],
params: Map[String, Any]):
XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]])
XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]])

/**
* Transform Dataset
XGBoost.scala
@@ -286,7 +286,6 @@ object XGBoost extends Serializable {
}

private def buildDistributedBooster(
buildDMatrixInRabit: Boolean,
buildWatches: () => Watches,
xgbExecutionParam: XGBoostExecutionParams,
rabitEnv: java.util.Map[String, String],
@@ -295,11 +294,6 @@
prevBooster: Booster): Iterator[(Booster, Map[String, Array[Float]])] = {

var watches: Watches = null
if (!buildDMatrixInRabit) {
// for CPU pipeline, we need to build DMatrix out of rabit context
watches = buildWatchesAndCheck(buildWatches)
}

val taskId = TaskContext.getPartitionId().toString
val attempt = TaskContext.get().attemptNumber.toString
rabitEnv.put("DMLC_TASK_ID", taskId)
@@ -310,10 +304,7 @@
try {
Rabit.init(rabitEnv)

if (buildDMatrixInRabit) {
// for GPU pipeline, we need to move dmatrix building into rabit context
watches = buildWatchesAndCheck(buildWatches)
}
watches = buildWatchesAndCheck(buildWatches)

val numEarlyStoppingRounds = xgbExecutionParam.earlyStoppingParams.numEarlyStoppingRounds
val metrics = Array.tabulate(watches.size)(_ => Array.ofDim[Float](numRounds))
@@ -377,7 +368,7 @@
@throws(classOf[XGBoostError])
private[spark] def trainDistributed(
sc: SparkContext,
buildTrainingData: XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]),
buildTrainingData: XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]),
params: Map[String, Any]):
(Booster, Map[String, Array[Float]]) = {

@@ -396,7 +387,7 @@
}.orNull

// Get the training data RDD and the cachedRDD
val (buildDMatrixInRabit, trainingRDD, optionalCachedRDD) = buildTrainingData(xgbExecParams)
val (trainingRDD, optionalCachedRDD) = buildTrainingData(xgbExecParams)

try {
// Train for every ${savingRound} rounds and save the partially completed booster
@@ -413,9 +404,8 @@
optionWatches = Some(iter.next())
}

optionWatches.map { buildWatches => buildDistributedBooster(buildDMatrixInRabit,
buildWatches, xgbExecParams, rabitEnv, xgbExecParams.obj,
xgbExecParams.eval, prevBooster)}
optionWatches.map { buildWatches => buildDistributedBooster(buildWatches,
xgbExecParams, rabitEnv, xgbExecParams.obj, xgbExecParams.eval, prevBooster)}
.getOrElse(throw new RuntimeException("No Watches to train"))

}}
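To make the Spark-side signature change concrete, here is a small sketch of how the reshaped builder is consumed after this commit. The names XGBoostExecutionParams and Watches mirror the diff and are internal to the xgboost4j-spark package; the body is illustrative, not the full trainDistributed logic.

import org.apache.spark.rdd.RDD

// After this change the builder no longer returns the "build DMatrix in rabit"
// Boolean: both the CPU and GPU pipelines now construct the DMatrix inside the
// Rabit context, so callers only destructure the training RDD and the optional cache.
def runTraining(
    buildTrainingData: XGBoostExecutionParams => (RDD[() => Watches], Option[RDD[_]]),
    xgbExecParams: XGBoostExecutionParams): Unit = {
  val (trainingRDD, optionalCachedRDD) = buildTrainingData(xgbExecParams)
  // ... run buildDistributedBooster on each partition of trainingRDD ...
  optionalCachedRDD.foreach(_.unpersist())
}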
XGBoostClassifier.scala
@@ -169,6 +169,23 @@ class XGBoostClassifier (
}

override protected def train(dataset: Dataset[_]): XGBoostClassificationModel = {
val _numClasses = getNumClasses(dataset)
if (isDefined(numClass) && $(numClass) != _numClasses) {
throw new Exception("The number of classes in dataset doesn't match " +
"\'num_class\' in xgboost params.")
}

if (_numClasses == 2) {
if (!isDefined(objective)) {
// If user doesn't set objective, force it to binary:logistic
setObjective("binary:logistic")
}
} else if (_numClasses > 2) {
if (!isDefined(objective)) {
// If user doesn't set objective, force it to multi:softprob
setObjective("multi:softprob")
}
}

if (!isDefined(evalMetric) || $(evalMetric).isEmpty) {
set(evalMetric, setupDefaultEvalMetric())
@@ -178,12 +195,6 @@
set(objectiveType, "classification")
}

val _numClasses = getNumClasses(dataset)
if (isDefined(numClass) && $(numClass) != _numClasses) {
throw new Exception("The number of classes in dataset doesn't match " +
"\'num_class\' in xgboost params.")
}

// Packing with all params plus params user defined
val derivedXGBParamMap = xgboostParams ++ MLlib2XGBoostParams
val buildTrainingData = PreXGBoost.buildDatasetToRDD(this, dataset, derivedXGBParamMap)
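A short usage-level sketch of the classifier behaviour after this reordering. The DataFrame multiClassDF with six label classes is assumed for illustration; the objective-fallback cases are exercised by the new test further down in this diff.

import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier

// num_class disagreeing with the data now fails before any other setup:
// "The number of classes in dataset doesn't match 'num_class' in xgboost params."
val mismatched = new XGBoostClassifier(
  Map("num_class" -> "3", "num_round" -> 5, "num_workers" -> 2))
// mismatched.fit(multiClassDF)  // throws Exception

// With no objective set, a multi-class label column selects multi:softprob.
val classifier = new XGBoostClassifier(
  Map("num_class" -> "6", "num_round" -> 5, "num_workers" -> 2))
classifier.fit(multiClassDF)
assert(classifier.getObjective == "multi:softprob")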
XGBoostRegressor.scala
@@ -169,6 +169,11 @@ class XGBoostRegressor (

override protected def train(dataset: Dataset[_]): XGBoostRegressionModel = {

if (!isDefined(objective)) {
// If user doesn't set objective, force it to reg:squarederror
setObjective("reg:squarederror")
}

if (!isDefined(evalMetric) || $(evalMetric).isEmpty) {
set(evalMetric, setupDefaultEvalMetric())
}
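And the regressor-side counterpart, as a minimal sketch. A DataFrame trainingDF with "features"/"label" columns is assumed; parameter values are illustrative.

import ml.dmlc.xgboost4j.scala.spark.XGBoostRegressor

val regressor = new XGBoostRegressor(Map("num_round" -> 5, "num_workers" -> 2))
// No objective supplied: train() now falls back to reg:squarederror,
// which previously came from the setDefault in LearningTaskParams.
regressor.fit(trainingDF)
assert(regressor.getObjective == "reg:squarederror")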
LearningTaskParams.scala
@@ -1,5 +1,5 @@
/*
Copyright (c) 2014 by Contributors
Copyright (c) 2014-2022 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -105,7 +105,7 @@ private[spark] trait LearningTaskParams extends Params {

final def getMaximizeEvaluationMetrics: Boolean = $(maximizeEvaluationMetrics)

setDefault(objective -> "reg:squarederror", baseScore -> 0.5, trainTestRatio -> 1.0,
setDefault(baseScore -> 0.5, trainTestRatio -> 1.0,
numEarlyStoppingRounds -> 0, cacheTrainingSet -> false)
}

FeatureSizeValidatingSuite.scala
@@ -65,8 +65,6 @@ class FeatureSizeValidatingSuite extends FunSuite with PerTest {
(id, lp.label, lp.features)
}.toDF("id", "label", "features")
val xgb = new XGBoostClassifier(paramMap)
intercept[Exception] {
xgb.fit(repartitioned)
}
xgb.fit(repartitioned)
}
}
PersistenceSuite.scala
@@ -138,7 +138,7 @@ class PersistenceSuite extends FunSuite with TmpFolderPerSuite with PerTest {
val testDM = new DMatrix(Classification.test.iterator)
val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
"custom_eval" -> new EvalError, "custom_obj" -> new CustomObj(1),
"num_round" -> "10", "num_workers" -> numWorkers)
"num_round" -> "10", "num_workers" -> numWorkers, "objective" -> "binary:logistic")

val xgbc = new XGBoostClassifier(paramMap)
val xgbcPath = new File(tempDir.toFile, "xgbc").getPath
XGBoostClassifierSuite.scala
@@ -112,6 +112,34 @@ class XGBoostClassifierSuite extends FunSuite with PerTest with TmpFolderPerSuit
assert(!transformedDf.columns.contains("probability"))
}

test("objective will be set if not specifying it") {
val training = buildDataFrame(Classification.train)
val paramMap = Map("eta" -> "1", "max_depth" -> "6",
"num_round" -> 5, "num_workers" -> numWorkers, "tree_method" -> treeMethod)
val xgb = new XGBoostClassifier(paramMap)
assert(!xgb.isDefined(xgb.objective))
xgb.fit(training)
assert(xgb.getObjective == "binary:logistic")

val trainingDF = buildDataFrame(MultiClassification.train)
val paramMap1 = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
"num_class" -> "6", "num_round" -> 5, "num_workers" -> numWorkers,
"tree_method" -> treeMethod)
val xgb1 = new XGBoostClassifier(paramMap1)
assert(!xgb1.isDefined(xgb1.objective))
xgb1.fit(trainingDF)
assert(xgb1.getObjective == "multi:softprob")

// shouldn't change user's objective setting
val paramMap2 = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1",
"num_class" -> "6", "num_round" -> 5, "num_workers" -> numWorkers,
"tree_method" -> treeMethod, "objective" -> "multi:softmax")
val xgb2 = new XGBoostClassifier(paramMap2)
assert(xgb2.getObjective == "multi:softmax")
xgb2.fit(trainingDF)
assert(xgb2.getObjective == "multi:softmax")
}

test("use base margin") {
val training1 = buildDataFrame(Classification.train)
val training2 = training1.withColumn("margin", functions.rand())
