From c9b2a435a44ded5d83d5ad4aabf5b30534e1c6e2 Mon Sep 17 00:00:00 2001
From: Dmitry Razdoburdin <>
Date: Wed, 22 May 2024 00:22:12 -0700
Subject: [PATCH] optimise hist building for sycl

---
 plugin/sycl/common/hist_util.cc  | 86 ++++++++++++++++++++------------
 plugin/sycl/tree/hist_updater.cc |  5 +-
 plugin/sycl/tree/hist_updater.h  |  1 -
 3 files changed, 54 insertions(+), 38 deletions(-)

diff --git a/plugin/sycl/common/hist_util.cc b/plugin/sycl/common/hist_util.cc
index fd813a92cec9..2f2417f3a29a 100644
--- a/plugin/sycl/common/hist_util.cc
+++ b/plugin/sycl/common/hist_util.cc
@@ -64,6 +64,30 @@ template ::sycl::event SubtractionHist(::sycl::queue qu,
                                        const GHistRow<double, MemoryType::on_device>& src2,
                                        size_t size, ::sycl::event event_priv);
 
+inline auto GetBlocksParameters(const ::sycl::queue& qu, size_t size, size_t max_nblocks) {
+  struct _ {
+    size_t block_size, nblocks;
+  };
+
+  const size_t min_block_size = 32;
+  const size_t max_compute_units =
+    qu.get_device().get_info<::sycl::info::device::max_compute_units>();
+
+  size_t nblocks = max_compute_units;
+
+  size_t block_size = size / nblocks + !!(size % nblocks);
+  if (block_size > (1u << 12)) {
+    nblocks = max_nblocks;
+    block_size = size / nblocks + !!(size % nblocks);
+  }
+  if (block_size < min_block_size) {
+    block_size = min_block_size;
+    nblocks = size / block_size + !!(size % block_size);
+  }
+
+  return _{block_size, nblocks};
+}
+
 // Kernel with buffer using
 template<typename FPType, bool isDense, typename BinIdxType>
 ::sycl::event BuildHistKernel(::sycl::queue qu,
@@ -73,27 +97,26 @@ ::sycl::event BuildHistKernel(::sycl::queue qu,
                               GHistRow<FPType, MemoryType::on_device>* hist,
                               GHistRow<FPType, MemoryType::on_device>* hist_buffer,
                               ::sycl::event event_priv) {
+  using GradientPairT = xgboost::detail::GradientPairInternal<FPType>;
   const size_t size = row_indices.Size();
   const size_t* rid = row_indices.begin;
   const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride;
-  const GradientPair::ValueT* pgh =
-    reinterpret_cast<const GradientPair::ValueT*>(gpair_device.DataConst());
+  const auto* pgh = gpair_device.DataConst();
   const BinIdxType* gradient_index = gmat.index.data();
   const uint32_t* offsets = gmat.index.Offset();
-  FPType* hist_data = reinterpret_cast<FPType*>(hist->Data());
   const size_t nbins = gmat.nbins;
 
   const size_t max_work_group_size =
     qu.get_device().get_info<::sycl::info::device::max_work_group_size>();
   const size_t work_group_size = n_columns < max_work_group_size ? n_columns : max_work_group_size;
 
-  const size_t max_nblocks = hist_buffer->Size() / (nbins * 2);
-  const size_t min_block_size = 128;
-  size_t nblocks = std::min(max_nblocks, size / min_block_size + !!(size % min_block_size));
-  const size_t block_size = size / nblocks + !!(size % nblocks);
-  FPType* hist_buffer_data = reinterpret_cast<FPType*>(hist_buffer->Data());
+  // Captured structured bindings are a C++20 extension
+  const auto block_params = GetBlocksParameters(qu, size, hist_buffer->Size() / (nbins * 2));
+  const size_t block_size = block_params.block_size;
+  const size_t nblocks = block_params.nblocks;
 
-  auto event_fill = qu.fill(hist_buffer_data, FPType(0), nblocks * nbins * 2, event_priv);
+  GradientPairT* hist_buffer_data = hist_buffer->Data();
+  auto event_fill = qu.fill(hist_buffer_data, GradientPairT(0, 0), nblocks * nbins * 2, event_priv);
   auto event_main = qu.submit([&](::sycl::handler& cgh) {
     cgh.depends_on(event_fill);
     cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(nblocks, work_group_size),
@@ -102,13 +125,14 @@ ::sycl::event BuildHistKernel(::sycl::queue qu,
       size_t block = pid.get_global_id(0);
       size_t feat = pid.get_global_id(1);
 
-      FPType* hist_local = hist_buffer_data + block * nbins * 2;
+      GradientPairT* hist_local = hist_buffer_data + block * nbins;
       for (size_t idx = 0; idx < block_size; ++idx) {
         size_t i = block * block_size + idx;
         if (i < size) {
           const size_t icol_start = n_columns * rid[i];
           const size_t idx_gh = rid[i];
+          const GradientPairT pgh_row = {pgh[idx_gh].GetGrad(), pgh[idx_gh].GetHess()};
 
           pid.barrier(::sycl::access::fence_space::local_space);
 
           const BinIdxType* gr_index_local = gradient_index + icol_start;
@@ -118,8 +142,7 @@ ::sycl::event BuildHistKernel(::sycl::queue qu,
             idx_bin += offsets[j];
           }
           if (idx_bin < nbins) {
-            hist_local[2 * idx_bin] += pgh[2 * idx_gh];
-            hist_local[2 * idx_bin+1] += pgh[2 * idx_gh+1];
+            hist_local[idx_bin] += pgh_row;
           }
         }
       }
@@ -127,21 +150,19 @@ ::sycl::event BuildHistKernel(::sycl::queue qu,
     });
   });
 
+  GradientPairT* hist_data = hist->Data();
   auto event_save = qu.submit([&](::sycl::handler& cgh) {
     cgh.depends_on(event_main);
     cgh.parallel_for<>(::sycl::range<1>(nbins), [=](::sycl::item<1> pid) {
       size_t idx_bin = pid.get_id(0);
 
-      FPType gsum = 0.0f;
-      FPType hsum = 0.0f;
+      GradientPairT gpair = {0, 0};
 
       for (size_t j = 0; j < nblocks; ++j) {
-        gsum += hist_buffer_data[j * nbins * 2 + 2 * idx_bin];
-        hsum += hist_buffer_data[j * nbins * 2 + 2 * idx_bin + 1];
+        gpair += hist_buffer_data[j * nbins + idx_bin];
       }
 
-      hist_data[2 * idx_bin] = gsum;
-      hist_data[2 * idx_bin + 1] = hsum;
+      hist_data[idx_bin] = gpair;
     });
   });
   return event_save;
@@ -165,24 +186,27 @@ ::sycl::event BuildHistKernel(::sycl::queue qu,
   FPType* hist_data = reinterpret_cast<FPType*>(hist->Data());
   const size_t nbins = gmat.nbins;
 
-  const size_t max_work_group_size =
-    qu.get_device().get_info<::sycl::info::device::max_work_group_size>();
-  const size_t feat_local = n_columns < max_work_group_size ? n_columns : max_work_group_size;
+  constexpr size_t work_group_size = 32;
+  const size_t n_work_groups = n_columns / work_group_size + (n_columns % work_group_size > 0);
 
   auto event_fill = qu.fill(hist_data, FPType(0), nbins * 2, event_priv);
   auto event_main = qu.submit([&](::sycl::handler& cgh) {
     cgh.depends_on(event_fill);
-    cgh.parallel_for<>(::sycl::range<2>(size, feat_local),
-                       [=](::sycl::item<2> pid) {
-      size_t i = pid.get_id(0);
-      size_t feat = pid.get_id(1);
+    cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(size, n_work_groups * work_group_size),
+                                           ::sycl::range<2>(1, work_group_size)),
+                       [=](::sycl::nd_item<2> pid) {
+      const int i = pid.get_global_id(0);
+      auto group = pid.get_group();
       const size_t icol_start = n_columns * rid[i];
       const size_t idx_gh = rid[i];
-
+      const FPType pgh_row[2] = {pgh[2 * idx_gh], pgh[2 * idx_gh + 1]};
       const BinIdxType* gr_index_local = gradient_index + icol_start;
 
-      for (size_t j = feat; j < n_columns; j += feat_local) {
+      const size_t group_id = group.get_group_id()[1];
+      const size_t local_id = group.get_local_id()[1];
+      const size_t j = group_id * work_group_size + local_id;
+      if (j < n_columns) {
         uint32_t idx_bin = static_cast<uint32_t>(gr_index_local[j]);
         if constexpr (isDense) {
           idx_bin += offsets[j];
@@ -190,8 +214,8 @@ ::sycl::event BuildHistKernel(::sycl::queue qu,
         if (idx_bin < nbins) {
           AtomicRef<FPType> gsum(hist_data[2 * idx_bin]);
           AtomicRef<FPType> hsum(hist_data[2 * idx_bin + 1]);
-          gsum.fetch_add(pgh[2 * idx_gh]);
-          hsum.fetch_add(pgh[2 * idx_gh + 1]);
+          gsum += pgh_row[0];
+          hsum += pgh_row[1];
         }
       }
     });
@@ -214,10 +238,6 @@ ::sycl::event BuildHistDispatchKernel(
   const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride;
   const size_t nbins = gmat.nbins;
 
-  // max cycle size, while atomics are still effective
-  const size_t max_cycle_size_atomics = nbins;
-  const size_t cycle_size = size;
-
   // TODO(razdoburdin): replace the ad-hoc dispatching criteria by a more suitable one
   bool use_atomic = (size < nbins) ||
                     (gmat.max_num_bins == gmat.nbins / n_columns);
diff --git a/plugin/sycl/tree/hist_updater.cc b/plugin/sycl/tree/hist_updater.cc
index 76ecdeab8ac3..daf043e4e055 100644
--- a/plugin/sycl/tree/hist_updater.cc
+++ b/plugin/sycl/tree/hist_updater.cc
@@ -136,10 +136,7 @@ void HistUpdater<GradientSumT>::InitData(
 
   hist_buffer_.Init(qu_, nbins);
   size_t buffer_size = kBufferSize;
-  if (buffer_size > info.num_row_ / kMinBlockSize + 1) {
-    buffer_size = info.num_row_ / kMinBlockSize + 1;
-  }
-  hist_buffer_.Reset(buffer_size);
+  hist_buffer_.Reset(kBufferSize);
 
   // initialize histogram builder
   hist_builder_ = common::GHistBuilder<GradientSumT>(qu_, nbins);
diff --git a/plugin/sycl/tree/hist_updater.h b/plugin/sycl/tree/hist_updater.h
index 544a7c26698a..538f2fe5f707 100644
--- a/plugin/sycl/tree/hist_updater.h
+++ b/plugin/sycl/tree/hist_updater.h
@@ -130,7 +130,6 @@ class HistUpdater {
   DataLayout data_layout_;
 
   constexpr static size_t kBufferSize = 2048;
-  constexpr static size_t kMinBlockSize = 128;
   common::GHistBuilder<GradientSumT> hist_builder_;
   common::ParallelGHistBuilder<GradientSumT> hist_buffer_;
   /*! \brief cumulative histogram of gradients. */
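
Note on the block-sizing heuristic: GetBlocksParameters starts from one block per compute unit, switches to the full buffer capacity (max_nblocks) once a block would exceed 2^12 rows, and enforces a 32-row floor on the block size. The self-contained C++ sketch below mirrors that logic so the regimes can be checked off-device; the compute-unit count, buffer capacity, and row counts in main() are hypothetical stand-ins for the values the SYCL queue and the histogram buffer would supply.

#include <cstddef>
#include <cstdio>
#include <initializer_list>

// Rounding-up division, written as in the patch: size / n + !!(size % n).
static std::size_t DivUp(std::size_t size, std::size_t n) {
  return size / n + !!(size % n);
}

struct BlockParams {
  std::size_t block_size;
  std::size_t nblocks;
};

// Mirrors GetBlocksParameters: one block per compute unit; if blocks grow
// past 2^12 rows, spread the work over max_nblocks instead; never let a
// block shrink below 32 rows.
static BlockParams GetBlockParams(std::size_t compute_units, std::size_t size,
                                  std::size_t max_nblocks) {
  const std::size_t kMinBlockSize = 32;
  std::size_t nblocks = compute_units;
  std::size_t block_size = DivUp(size, nblocks);
  if (block_size > (1u << 12)) {
    nblocks = max_nblocks;
    block_size = DivUp(size, nblocks);
  }
  if (block_size < kMinBlockSize) {
    block_size = kMinBlockSize;
    nblocks = DivUp(size, block_size);
  }
  return {block_size, nblocks};
}

int main() {
  const std::size_t kComputeUnits = 64;  // hypothetical device width
  const std::size_t kMaxNBlocks = 512;   // hypothetical buffer capacity
  for (std::size_t rows : {1000ul, 100000ul, 10000000ul}) {
    const BlockParams p = GetBlockParams(kComputeUnits, rows, kMaxNBlocks);
    std::printf("rows=%9zu -> block_size=%6zu nblocks=%4zu\n",
                rows, p.block_size, p.nblocks);
  }
  return 0;
}

With these numbers, 1000 rows fall through to the 32-row floor (32 blocks), 100000 rows stay at one block per compute unit (64 blocks of 1563 rows), and 10000000 rows spill into the buffer-limited regime (512 blocks). Each block then accumulates into its own nbins-wide slice of hist_buffer, and the final kernel sums bin b across all blocks, which is what lets the buffered path avoid atomics entirely.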