diff --git a/plugin/sycl/common/hist_util.h b/plugin/sycl/common/hist_util.h index 7c7af71ae186..aa9b4f5817bb 100644 --- a/plugin/sycl/common/hist_util.h +++ b/plugin/sycl/common/hist_util.h @@ -46,6 +46,93 @@ ::sycl::event SubtractionHist(::sycl::queue qu, const GHistRow& src2, size_t size, ::sycl::event event_priv); +/*! + * \brief Histograms of gradient statistics for multiple nodes + */ +template +class HistCollection { + public: + using GHistRowT = GHistRow; + + // Access histogram for i-th node + GHistRowT& operator[](bst_uint nid) { + return *(data_.at(nid)); + } + + const GHistRowT& operator[](bst_uint nid) const { + return *(data_.at(nid)); + } + + // Initialize histogram collection + void Init(::sycl::queue qu, uint32_t nbins) { + qu_ = qu; + if (nbins_ != nbins) { + nbins_ = nbins; + data_.clear(); + } + } + + // Create an empty histogram for i-th node + ::sycl::event AddHistRow(bst_uint nid) { + ::sycl::event event; + if (data_.count(nid) == 0) { + data_[nid] = + std::make_shared(&qu_, nbins_, + xgboost::detail::GradientPairInternal(0, 0), + &event); + } else { + data_[nid]->Resize(&qu_, nbins_, + xgboost::detail::GradientPairInternal(0, 0), + &event); + } + return event; + } + + private: + /*! \brief Number of all bins over all features */ + uint32_t nbins_ = 0; + + std::unordered_map> data_; + + ::sycl::queue qu_; +}; + +/*! + * \brief Stores temporary histograms to compute them in parallel + */ +template +class ParallelGHistBuilder { + public: + using GHistRowT = GHistRow; + + void Init(::sycl::queue qu, size_t nbins) { + qu_ = qu; + if (nbins != nbins_) { + hist_buffer_.Init(qu_, nbins); + nbins_ = nbins; + } + } + + void Reset(size_t nblocks) { + hist_device_buffer_.Resize(&qu_, nblocks * nbins_ * 2); + } + + GHistRowT& GetDeviceBuffer() { + return hist_device_buffer_; + } + + protected: + /*! \brief Number of bins in each histogram */ + size_t nbins_ = 0; + /*! \brief Buffers for histograms for all nodes processed */ + HistCollection hist_buffer_; + + /*! \brief Buffer for additional histograms for Parallel processing */ + GHistRowT hist_device_buffer_; + + ::sycl::queue qu_; +}; + /*! * \brief Builder for histograms of gradient statistics */ diff --git a/plugin/sycl/data.h b/plugin/sycl/data.h index f420ef470b71..8f4bb2516f05 100644 --- a/plugin/sycl/data.h +++ b/plugin/sycl/data.h @@ -80,6 +80,12 @@ class USMVector { qu->fill(data_.get(), v, size_).wait(); } + USMVector(::sycl::queue* qu, size_t size, T v, + ::sycl::event* event) : size_(size), capacity_(size) { + data_ = allocate_memory_(qu, size_); + *event = qu->fill(data_.get(), v, size_, *event); + } + USMVector(::sycl::queue* qu, const std::vector &vec) { size_ = vec.size(); capacity_ = size_; diff --git a/plugin/sycl/tree/hist_row_adder.h b/plugin/sycl/tree/hist_row_adder.h new file mode 100644 index 000000000000..968bcca737dc --- /dev/null +++ b/plugin/sycl/tree/hist_row_adder.h @@ -0,0 +1,46 @@ +/*! + * Copyright 2017-2024 by Contributors + * \file hist_row_adder.h + */ +#ifndef PLUGIN_SYCL_TREE_HIST_ROW_ADDER_H_ +#define PLUGIN_SYCL_TREE_HIST_ROW_ADDER_H_ + +#include +#include + +namespace xgboost { +namespace sycl { +namespace tree { + +template +class HistRowsAdder { + public: + virtual void AddHistRows(HistUpdater* builder, + std::vector* sync_ids, RegTree *p_tree) = 0; + virtual ~HistRowsAdder() = default; +}; + +template +class BatchHistRowsAdder: public HistRowsAdder { + public: + void AddHistRows(HistUpdater* builder, + std::vector* sync_ids, RegTree *p_tree) override { + builder->builder_monitor_.Start("AddHistRows"); + + for (auto const& entry : builder->nodes_for_explicit_hist_build_) { + int nid = entry.nid; + auto event = builder->hist_.AddHistRow(nid); + } + for (auto const& node : builder->nodes_for_subtraction_trick_) { + auto event = builder->hist_.AddHistRow(node.nid); + } + + builder->builder_monitor_.Stop("AddHistRows"); + } +}; + +} // namespace tree +} // namespace sycl +} // namespace xgboost + +#endif // PLUGIN_SYCL_TREE_HIST_ROW_ADDER_H_ diff --git a/plugin/sycl/tree/hist_synchronizer.h b/plugin/sycl/tree/hist_synchronizer.h new file mode 100644 index 000000000000..2275a51dba37 --- /dev/null +++ b/plugin/sycl/tree/hist_synchronizer.h @@ -0,0 +1,68 @@ +/*! + * Copyright 2017-2024 by Contributors + * \file hist_synchronizer.h + */ +#ifndef PLUGIN_SYCL_TREE_HIST_SYNCHRONIZER_H_ +#define PLUGIN_SYCL_TREE_HIST_SYNCHRONIZER_H_ + +#include + +#include "../common/hist_util.h" +#include "expand_entry.h" + +namespace xgboost { +namespace sycl { +namespace tree { + +template +class HistUpdater; + +template +class HistSynchronizer { + public: + virtual void SyncHistograms(HistUpdater* builder, + const std::vector& sync_ids, + RegTree *p_tree) = 0; + virtual ~HistSynchronizer() = default; +}; + +template +class BatchHistSynchronizer: public HistSynchronizer { + public: + void SyncHistograms(HistUpdater* builder, + const std::vector& sync_ids, + RegTree *p_tree) override { + builder->builder_monitor_.Start("SyncHistograms"); + const size_t nbins = builder->hist_builder_.GetNumBins(); + + hist_sync_events_.resize(builder->nodes_for_explicit_hist_build_.size()); + for (int i = 0; i < builder->nodes_for_explicit_hist_build_.size(); i++) { + const auto entry = builder->nodes_for_explicit_hist_build_[i]; + auto& this_hist = builder->hist_[entry.nid]; + + if (!(*p_tree)[entry.nid].IsRoot()) { + const size_t parent_id = (*p_tree)[entry.nid].Parent(); + auto& parent_hist = builder->hist_[parent_id]; + auto& sibling_hist = builder->hist_[entry.GetSiblingId(p_tree, parent_id)]; + hist_sync_events_[i] = common::SubtractionHist(builder->qu_, &sibling_hist, parent_hist, + this_hist, nbins, ::sycl::event()); + } + } + builder->qu_.wait_and_throw(); + + builder->builder_monitor_.Stop("SyncHistograms"); + } + + std::vector<::sycl::event> GetEvents() const { + return hist_sync_events_; + } + + private: + std::vector<::sycl::event> hist_sync_events_; +}; + +} // namespace tree +} // namespace sycl +} // namespace xgboost + +#endif // PLUGIN_SYCL_TREE_HIST_SYNCHRONIZER_H_ diff --git a/plugin/sycl/tree/hist_updater.cc b/plugin/sycl/tree/hist_updater.cc index a96cf3f7603c..03be41994eb3 100644 --- a/plugin/sycl/tree/hist_updater.cc +++ b/plugin/sycl/tree/hist_updater.cc @@ -7,10 +7,69 @@ #include +#include "../common/hist_util.h" + namespace xgboost { namespace sycl { namespace tree { +template +void HistUpdater::SetHistSynchronizer( + HistSynchronizer *sync) { + hist_synchronizer_.reset(sync); +} + +template +void HistUpdater::SetHistRowsAdder( + HistRowsAdder *adder) { + hist_rows_adder_.reset(adder); +} + +template +void HistUpdater::BuildHistogramsLossGuide( + ExpandEntry entry, + const common::GHistIndexMatrix &gmat, + RegTree *p_tree, + const USMVector &gpair_device) { + nodes_for_explicit_hist_build_.clear(); + nodes_for_subtraction_trick_.clear(); + nodes_for_explicit_hist_build_.push_back(entry); + + if (!(*p_tree)[entry.nid].IsRoot()) { + auto sibling_id = entry.GetSiblingId(p_tree); + nodes_for_subtraction_trick_.emplace_back(sibling_id, p_tree->GetDepth(sibling_id)); + } + + std::vector sync_ids; + hist_rows_adder_->AddHistRows(this, &sync_ids, p_tree); + qu_.wait_and_throw(); + BuildLocalHistograms(gmat, p_tree, gpair_device); + hist_synchronizer_->SyncHistograms(this, sync_ids, p_tree); +} + +template +void HistUpdater::BuildLocalHistograms( + const common::GHistIndexMatrix &gmat, + RegTree *p_tree, + const USMVector &gpair_device) { + builder_monitor_.Start("BuildLocalHistograms"); + const size_t n_nodes = nodes_for_explicit_hist_build_.size(); + ::sycl::event event; + + for (size_t i = 0; i < n_nodes; i++) { + const int32_t nid = nodes_for_explicit_hist_build_[i].nid; + + if (row_set_collection_[nid].Size() > 0) { + event = BuildHist(gpair_device, row_set_collection_[nid], gmat, &(hist_[nid]), + &(hist_buffer_.GetDeviceBuffer()), event); + } else { + common::InitHist(qu_, &(hist_[nid]), hist_[nid].Size(), &event); + } + } + qu_.wait_and_throw(); + builder_monitor_.Stop("BuildLocalHistograms"); +} + template void HistUpdater::InitSampling( const USMVector &gpair, @@ -70,6 +129,21 @@ void HistUpdater::InitData( // initialize the row set { row_set_collection_.Clear(); + + // initialize histogram collection + uint32_t nbins = gmat.cut.Ptrs().back(); + hist_.Init(qu_, nbins); + + hist_buffer_.Init(qu_, nbins); + size_t buffer_size = kBufferSize; + if (buffer_size > info.num_row_ / kMinBlockSize + 1) { + buffer_size = info.num_row_ / kMinBlockSize + 1; + } + hist_buffer_.Reset(buffer_size); + + // initialize histogram builder + hist_builder_ = common::GHistBuilder(qu_, nbins); + USMVector* row_indices = &(row_set_collection_.Data()); row_indices->Resize(&qu_, info.num_row_); size_t* p_row_indices = row_indices->Data(); @@ -122,6 +196,25 @@ void HistUpdater::InitData( } } row_set_collection_.Init(); + + { + /* determine layout of data */ + const size_t nrow = info.num_row_; + const size_t ncol = info.num_col_; + const size_t nnz = info.num_nonzero_; + // number of discrete bins for feature 0 + const uint32_t nbins_f0 = gmat.cut.Ptrs()[1] - gmat.cut.Ptrs()[0]; + if (nrow * ncol == nnz) { + // dense data with zero-based indexing + data_layout_ = kDenseDataZeroBased; + } else if (nbins_f0 == 0 && nrow * (ncol - 1) == nnz) { + // dense data with one-based indexing + data_layout_ = kDenseDataOneBased; + } else { + // sparse data + data_layout_ = kSparseData; + } + } } template class HistUpdater; diff --git a/plugin/sycl/tree/hist_updater.h b/plugin/sycl/tree/hist_updater.h index fb81218b6edb..d60eb70655f6 100644 --- a/plugin/sycl/tree/hist_updater.h +++ b/plugin/sycl/tree/hist_updater.h @@ -12,10 +12,13 @@ #pragma GCC diagnostic pop #include +#include #include #include "../common/partition_builder.h" #include "split_evaluator.h" +#include "hist_synchronizer.h" +#include "hist_row_adder.h" #include "../data.h" @@ -26,6 +29,10 @@ namespace tree { template class HistUpdater { public: + template + using GHistRowT = common::GHistRow; + using GradientPairT = xgboost::detail::GradientPairInternal; + explicit HistUpdater(::sycl::queue qu, const xgboost::tree::TrainParam& param, std::unique_ptr pruner, @@ -43,7 +50,13 @@ class HistUpdater { sub_group_size_ = sub_group_sizes.back(); } + void SetHistSynchronizer(HistSynchronizer* sync); + void SetHistRowsAdder(HistRowsAdder* adder); + protected: + friend class BatchHistSynchronizer; + friend class BatchHistRowsAdder; + void InitSampling(const USMVector &gpair, USMVector* row_indices); @@ -54,6 +67,27 @@ class HistUpdater { const DMatrix& fmat, const RegTree& tree); + inline ::sycl::event BuildHist( + const USMVector& gpair_device, + const common::RowSetCollection::Elem row_indices, + const common::GHistIndexMatrix& gmat, + GHistRowT* hist, + GHistRowT* hist_buffer, + ::sycl::event event_priv) { + return hist_builder_.BuildHist(gpair_device, row_indices, gmat, hist, + data_layout_ != kSparseData, hist_buffer, event_priv); + } + + void BuildLocalHistograms(const common::GHistIndexMatrix &gmat, + RegTree *p_tree, + const USMVector &gpair); + + void BuildHistogramsLossGuide( + ExpandEntry entry, + const common::GHistIndexMatrix &gmat, + RegTree *p_tree, + const USMVector &gpair); + // --data fields-- size_t sub_group_size_; @@ -69,11 +103,30 @@ class HistUpdater { const RegTree* p_last_tree_; DMatrix const* const p_last_fmat_; + enum DataLayout { kDenseDataZeroBased, kDenseDataOneBased, kSparseData }; + DataLayout data_layout_; + + constexpr static size_t kBufferSize = 2048; + constexpr static size_t kMinBlockSize = 128; + common::GHistBuilder hist_builder_; + common::ParallelGHistBuilder hist_buffer_; + /*! \brief culmulative histogram of gradients. */ + common::HistCollection hist_; + xgboost::common::Monitor builder_monitor_; xgboost::common::Monitor kernel_monitor_; uint64_t seed_ = 0; + // key is the node id which should be calculated by Subtraction Trick, value is the node which + // provides the evidence for substracts + std::vector nodes_for_subtraction_trick_; + // list of nodes whose histograms would be built explicitly. + std::vector nodes_for_explicit_hist_build_; + + std::unique_ptr> hist_synchronizer_; + std::unique_ptr> hist_rows_adder_; + ::sycl::queue qu_; }; diff --git a/tests/cpp/plugin/test_sycl_hist_updater.cc b/tests/cpp/plugin/test_sycl_hist_updater.cc index 4f66c27b9f81..4bf1ab30eb53 100644 --- a/tests/cpp/plugin/test_sycl_hist_updater.cc +++ b/tests/cpp/plugin/test_sycl_hist_updater.cc @@ -28,16 +28,40 @@ class TestHistUpdater : public HistUpdater { HistUpdater::InitSampling(gpair, row_indices); } - const auto* TestInitData(Context const * ctx, + auto* TestInitData(Context const * ctx, const common::GHistIndexMatrix& gmat, const USMVector &gpair, const DMatrix& fmat, const RegTree& tree) { HistUpdater::InitData(ctx, gmat, gpair, fmat, tree); - return &(HistUpdater::row_set_collection_.Data()); + return &(HistUpdater::row_set_collection_); + } + + const auto* TestBuildHistogramsLossGuide(ExpandEntry entry, + const common::GHistIndexMatrix &gmat, + RegTree *p_tree, + const USMVector &gpair) { + HistUpdater::BuildHistogramsLossGuide(entry, gmat, p_tree, gpair); + return &(HistUpdater::hist_); } }; +void GenerateRandomGPairs(::sycl::queue* qu, GradientPair* gpair_ptr, size_t num_rows, bool has_neg_hess) { + qu->submit([&](::sycl::handler& cgh) { + cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(num_rows)), + [=](::sycl::item<1> pid) { + uint64_t i = pid.get_linear_id(); + + constexpr uint32_t seed = 777; + oneapi::dpl::minstd_rand engine(seed, i); + GradientPair::ValueT smallest_hess_val = has_neg_hess ? -1. : 0.; + oneapi::dpl::uniform_real_distribution distr(smallest_hess_val, 1.); + gpair_ptr[i] = {distr(engine), distr(engine)}; + }); + }); + qu->wait(); +} + template void TestHistUpdaterSampling(const xgboost::tree::TrainParam& param) { const size_t num_rows = 1u << 12; @@ -60,18 +84,7 @@ void TestHistUpdaterSampling(const xgboost::tree::TrainParam& param) { USMVector row_indices_0(&qu, num_rows); USMVector row_indices_1(&qu, num_rows); USMVector gpair(&qu, num_rows); - auto* gpair_ptr = gpair.Data(); - qu.submit([&](::sycl::handler& cgh) { - cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(num_rows)), - [=](::sycl::item<1> pid) { - uint64_t i = pid.get_linear_id(); - - constexpr uint32_t seed = 777; - oneapi::dpl::minstd_rand engine(seed, i); - oneapi::dpl::uniform_real_distribution distr(-1., 1.); - gpair_ptr[i] = {distr(engine), distr(engine)}; - }); - }).wait(); + GenerateRandomGPairs(&qu, gpair.Data(), num_rows, true); updater.TestInitSampling(gpair, &row_indices_0); @@ -125,19 +138,7 @@ void TestHistUpdaterInitData(const xgboost::tree::TrainParam& param, bool has_ne TestHistUpdater updater(qu, param, std::move(pruner), int_constraints, p_fmat.get()); USMVector gpair(&qu, num_rows); - auto* gpair_ptr = gpair.Data(); - qu.submit([&](::sycl::handler& cgh) { - cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(num_rows)), - [=](::sycl::item<1> pid) { - uint64_t i = pid.get_linear_id(); - - constexpr uint32_t seed = 777; - oneapi::dpl::minstd_rand engine(seed, i); - GradientPair::ValueT smallest_hess_val = has_neg_hess ? -1. : 0.; - oneapi::dpl::uniform_real_distribution distr(smallest_hess_val, 1.); - gpair_ptr[i] = {distr(engine), distr(engine)}; - }); - }).wait(); + GenerateRandomGPairs(&qu, gpair.Data(), num_rows, has_neg_hess); DeviceMatrix dmat; dmat.Init(qu, p_fmat.get()); @@ -145,10 +146,11 @@ void TestHistUpdaterInitData(const xgboost::tree::TrainParam& param, bool has_ne gmat.Init(qu, &ctx, dmat, n_bins); RegTree tree; - const auto* row_indices = updater.TestInitData(&ctx, gmat, gpair, *p_fmat, tree); + auto* row_set_collection = updater.TestInitData(&ctx, gmat, gpair, *p_fmat, tree); + auto& row_indices = row_set_collection->Data(); - std::vector row_indices_host(row_indices->Size()); - qu.memcpy(row_indices_host.data(), row_indices->DataConst(), row_indices->Size()*sizeof(size_t)).wait(); + std::vector row_indices_host(row_indices.Size()); + qu.memcpy(row_indices_host.data(), row_indices.DataConst(), row_indices.Size()*sizeof(size_t)).wait(); if (!has_neg_hess) { for (size_t i = 0; i < num_rows; ++i) { @@ -171,6 +173,70 @@ void TestHistUpdaterInitData(const xgboost::tree::TrainParam& param, bool has_ne } } +template +void TestHistUpdaterBuildHistogramsLossGuide(const xgboost::tree::TrainParam& param, float sparsity) { + const size_t num_rows = 1u << 8; + const size_t num_columns = 1; + const size_t n_bins = 32; + + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + + DeviceManager device_manager; + auto qu = device_manager.GetQueue(ctx.Device()); + ObjInfo task{ObjInfo::kRegression}; + + auto p_fmat = RandomDataGenerator{num_rows, num_columns, sparsity}.GenerateDMatrix(); + + FeatureInteractionConstraintHost int_constraints; + std::unique_ptr pruner{TreeUpdater::Create("prune", &ctx, &task)}; + + TestHistUpdater updater(qu, param, std::move(pruner), int_constraints, p_fmat.get()); + updater.SetHistSynchronizer(new BatchHistSynchronizer()); + updater.SetHistRowsAdder(new BatchHistRowsAdder()); + + USMVector gpair(&qu, num_rows); + auto* gpair_ptr = gpair.Data(); + GenerateRandomGPairs(&qu, gpair_ptr, num_rows, false); + + DeviceMatrix dmat; + dmat.Init(qu, p_fmat.get()); + common::GHistIndexMatrix gmat; + gmat.Init(qu, &ctx, dmat, n_bins); + + RegTree tree; + tree.ExpandNode(0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0); + tree.ExpandNode(tree[0].LeftChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0); + tree.ExpandNode(tree[0].RightChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0); + + ExpandEntry node0(0, tree.GetDepth(0)); + ExpandEntry node1(1, tree.GetDepth(1)); + ExpandEntry node2(2, tree.GetDepth(2)); + + auto* row_set_collection = updater.TestInitData(&ctx, gmat, gpair, *p_fmat, tree); + row_set_collection->AddSplit(0, 1, 2, 42, num_rows - 42); + + updater.TestBuildHistogramsLossGuide(node0, gmat, &tree, gpair); + const auto* hist = updater.TestBuildHistogramsLossGuide(node1, gmat, &tree, gpair); + + ASSERT_EQ((*hist)[0].Size(), n_bins); + ASSERT_EQ((*hist)[1].Size(), n_bins); + ASSERT_EQ((*hist)[2].Size(), n_bins); + + std::vector> hist0_host(n_bins); + std::vector> hist1_host(n_bins); + std::vector> hist2_host(n_bins); + qu.memcpy(hist0_host.data(), (*hist)[0].DataConst(), sizeof(xgboost::detail::GradientPairInternal) * n_bins); + qu.memcpy(hist1_host.data(), (*hist)[1].DataConst(), sizeof(xgboost::detail::GradientPairInternal) * n_bins); + qu.memcpy(hist2_host.data(), (*hist)[2].DataConst(), sizeof(xgboost::detail::GradientPairInternal) * n_bins); + qu.wait(); + + for (size_t idx_bin = 0; idx_bin < n_bins; ++idx_bin) { + EXPECT_NEAR(hist0_host[idx_bin].GetGrad(), hist1_host[idx_bin].GetGrad() + hist2_host[idx_bin].GetGrad(), 1e-6); + EXPECT_NEAR(hist0_host[idx_bin].GetHess(), hist1_host[idx_bin].GetHess() + hist2_host[idx_bin].GetHess(), 1e-6); + } +} + TEST(SyclHistUpdater, Sampling) { xgboost::tree::TrainParam param; param.UpdateAllowUnknown(Args{{"subsample", "0.7"}}); @@ -190,4 +256,14 @@ TEST(SyclHistUpdater, InitData) { TestHistUpdaterInitData(param, false); } +TEST(SyclHistUpdater, BuildHistogramsLossGuide) { + xgboost::tree::TrainParam param; + param.UpdateAllowUnknown(Args{{"max_depth", "3"}}); + + TestHistUpdaterBuildHistogramsLossGuide(param, 0.0); + TestHistUpdaterBuildHistogramsLossGuide(param, 0.5); + TestHistUpdaterBuildHistogramsLossGuide(param, 0.0); + TestHistUpdaterBuildHistogramsLossGuide(param, 0.5); +} + } // namespace xgboost::sycl::tree