Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize dart inplace predict perf. #6804

Merged
merged 2 commits into from
Mar 31, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
67 changes: 45 additions & 22 deletions src/gbm/gbtree.cc
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,20 @@ void GPUDartPredictInc(common::Span<float> out_predts,
}
#endif

void GPUDartInplacePredictInc(common::Span<float> out_predts,
common::Span<float> predts, float tree_w,
size_t n_rows, float base_score,
bst_group_t n_groups,
bst_group_t group)
#if defined(XGBOOST_USE_CUDA)
; // NOLINT
#else
{
common::AssertGPUSupport();
}
#endif


class Dart : public GBTree {
public:
explicit Dart(LearnerModelParam const* booster_config) :
Expand Down Expand Up @@ -728,20 +742,22 @@ class Dart : public GBTree {
gpu_predictor_.get()
#endif // defined(XGBOOST_USE_CUDA)
};
Predictor const * predictor {nullptr};

MetaInfo info;
StringView msg{"Unsupported data type for inplace predict."};
int32_t device = GenericParameter::kCpuId;
PredictionCacheEntry predts;
// Inplace predict is not used for training, so no need to drop tree.
for (size_t i = tree_begin; i < tree_end; ++i) {
PredictionCacheEntry predts;
if (tparam_.predictor == PredictorType::kAuto) {
// Try both predictor implementations
bool success = false;
for (auto const &p : predictors) {
if (p && p->InplacePredict(x, nullptr, model_, missing, &predts, i,
i + 1)) {
success = true;
predictor = p;
#if defined(XGBOOST_USE_CUDA)
device = predts.predictions.DeviceIdx();
#endif // defined(XGBOOST_USE_CUDA)
Expand All @@ -750,46 +766,53 @@ class Dart : public GBTree {
}
CHECK(success) << msg;
} else {
// No base margin for each tree
bool success = this->GetPredictor()->InplacePredict(
x, nullptr, model_, missing, &predts, i, i + 1);
// No base margin from meta info for each tree
predictor = this->GetPredictor().get();
bool success = predictor->InplacePredict(x, nullptr, model_, missing,
&predts, i, i + 1);
device = predts.predictions.DeviceIdx();
CHECK(success) << msg;
}

auto w = this->weight_drop_.at(i);
auto &h_predts = predts.predictions.HostVector();
auto &h_out_predts = out_preds->predictions.HostVector();
size_t n_groups = model_.learner_model_param->num_output_group;
auto n_rows = predts.predictions.Size() / n_groups;

if (i == tree_begin) {
auto n_rows =
h_predts.size() / model_.learner_model_param->num_output_group;
// base margin is added here.
if (p_m) {
p_m->Info().num_row_ = n_rows;
cpu_predictor_->InitOutPredictions(p_m->Info(),
&out_preds->predictions, model_);
predictor->InitOutPredictions(p_m->Info(), &out_preds->predictions,
model_);
} else {
info.num_row_ = n_rows;
cpu_predictor_->InitOutPredictions(info, &out_preds->predictions,
model_);
predictor->InitOutPredictions(info, &out_preds->predictions, model_);
}
}

// Multiple the tree weight
CHECK_EQ(h_predts.size(), h_out_predts.size());
CHECK_EQ(predts.predictions.Size(), out_preds->predictions.Size());
auto group = model_.tree_info.at(i);

if (device == GenericParameter::kCpuId) {
auto &h_predts = predts.predictions.HostVector();
auto &h_out_predts = out_preds->predictions.HostVector();
#pragma omp parallel for
for (omp_ulong i = 0; i < h_out_predts.size(); ++i) {
// Need to remove the base margin from indiviual tree.
h_out_predts[i] +=
(h_predts[i] - model_.learner_model_param->base_score) * w;
for (omp_ulong ridx = 0; ridx < n_rows; ++ridx) {
const size_t offset = ridx * n_groups + group;
// Need to remove the base margin from indiviual tree.
h_out_predts[offset] +=
(h_predts[offset] - model_.learner_model_param->base_score) * w;
}
} else {
out_preds->predictions.SetDevice(device);
predts.predictions.SetDevice(device);
GPUDartInplacePredictInc(out_preds->predictions.DeviceSpan(),
predts.predictions.DeviceSpan(), w, n_rows,
model_.learner_model_param->base_score,
n_groups, group);
}
}

if (device != GenericParameter::kCpuId) {
out_preds->predictions.SetDevice(device);
out_preds->predictions.DeviceSpan();
}
}

void PredictInstance(const SparsePage::Inst &inst,
Expand Down
10 changes: 10 additions & 0 deletions src/gbm/gbtree.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,15 @@ void GPUDartPredictInc(common::Span<float> out_predts,
out_predts[offset] += (predts[offset] * tree_w);
});
}

void GPUDartInplacePredictInc(common::Span<float> out_predts,
common::Span<float> predts, float tree_w,
size_t n_rows, float base_score,
bst_group_t n_groups, bst_group_t group) {
dh::LaunchN(dh::CurrentDevice(), n_rows, [=] XGBOOST_DEVICE(size_t ridx) {
const size_t offset = ridx * n_groups + group;
out_predts[offset] += (predts[offset] - base_score) * tree_w;
});
}
} // namespace gbm
} // namespace xgboost
37 changes: 27 additions & 10 deletions tests/python-gpu/test_gpu_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,27 +332,44 @@ def test_predict_categorical_split(self, df):
rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
np.testing.assert_almost_equal(rmse, eval_history['train']['rmse'][-1], decimal=5)

def test_predict_dart(self):
@pytest.mark.parametrize("n_classes", [2, 3])
def test_predict_dart(self, n_classes):
from sklearn.datasets import make_classification
import cupy as cp
rng = cp.random.RandomState(1994)
n_samples = 1000
X = rng.randn(n_samples, 10)
y = rng.randn(n_samples)
X_, y_ = make_classification(
n_samples=n_samples, n_informative=5, n_classes=n_classes
)
X, y = cp.array(X_), cp.array(y_)

Xy = xgb.DMatrix(X, y)
booster = xgb.train(
{
if n_classes == 2:
params = {
"tree_method": "gpu_hist",
"booster": "dart",
"rate_drop": 0.5,
},
Xy,
num_boost_round=32
)
"objective": "binary:logistic"
}
else:
params = {
"tree_method": "gpu_hist",
"booster": "dart",
"rate_drop": 0.5,
"objective": "multi:softprob",
"num_class": n_classes
}

booster = xgb.train(params, Xy, num_boost_round=32)
# predictor=auto
inplace = booster.inplace_predict(X)
copied = booster.predict(Xy)
cpu_inplace = booster.inplace_predict(X_)
booster.set_param({"predictor": "cpu_predictor"})
cpu_copied = booster.predict(Xy)

copied = cp.array(copied)
cp.testing.assert_allclose(cpu_inplace, copied, atol=1e-6)
cp.testing.assert_allclose(cpu_copied, copied, atol=1e-6)
cp.testing.assert_allclose(inplace, copied, atol=1e-6)

booster.set_param({"predictor": "gpu_predictor"})
Expand Down