SYCL. Add functional for partition building #10080

Merged 1 commit on Mar 1, 2024
167 changes: 164 additions & 3 deletions plugin/sycl/common/partition_builder.h
@@ -21,13 +21,97 @@
#pragma GCC diagnostic pop

#include "../data.h"
#include "row_set.h"
#include "../data/gradient_index.h"
#include "../tree/expand_entry.h"

#include <CL/sycl.hpp>

namespace xgboost {
namespace sycl {
namespace common {

// Split row indices (rid_span) into 2 parts (both stored in rid_buf) depending
// on the comparison of bin index values and the split point (split_cond).
// Handles dense columns.
template <bool default_left, typename BinIdxType>
inline ::sycl::event PartitionDenseKernel(
::sycl::queue* qu,
const GHistIndexMatrix& gmat,
const RowSetCollection::Elem& rid_span,
const size_t fid,
const int32_t split_cond,
xgboost::common::Span<size_t>* rid_buf,
size_t* parts_size,
::sycl::event event) {
const size_t row_stride = gmat.row_stride;
const BinIdxType* gradient_index = gmat.index.data<BinIdxType>();
const size_t* rid = rid_span.begin;
const size_t range_size = rid_span.Size();
const size_t offset = gmat.cut.Ptrs()[fid];

size_t* p_rid_buf = rid_buf->data();

return qu->submit([&](::sycl::handler& cgh) {
cgh.depends_on(event);
cgh.parallel_for<>(::sycl::range<1>(range_size), [=](::sycl::item<1> nid) {
const size_t id = rid[nid.get_id(0)];
const int32_t value = static_cast<int32_t>(gradient_index[id * row_stride + fid] + offset);
const bool is_left = value <= split_cond;
if (is_left) {
AtomicRef<size_t> n_left(parts_size[0]);
p_rid_buf[n_left.fetch_add(1)] = id;
} else {
AtomicRef<size_t> n_right(parts_size[1]);
p_rid_buf[range_size - n_right.fetch_add(1) - 1] = id;
}
});
});
}
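
The kernel above packs rows that go left from the front of rid_buf and rows that go right from the back, using two atomic counters stored in parts_size[0] and parts_size[1]. A minimal host-side C++ sketch of the same two-ended packing scheme (illustrative only; the names and the std::atomic counters are stand-ins for the AtomicRef logic in the kernel):

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <vector>

// Partition `rows` into `buf`: rows whose value is <= split_cond are written
// from the front, the rest from the back. After the loop, counters[0] and
// counters[1] hold the left/right counts, mirroring parts_size[0]/parts_size[1].
inline void TwoEndedPartition(const std::vector<size_t>& rows,
                              const std::vector<int32_t>& values,
                              int32_t split_cond,
                              std::vector<size_t>* buf,
                              std::atomic<size_t>* counters) {
  const size_t n = rows.size();
  for (size_t i = 0; i < n; ++i) {                    // each iteration maps to one work-item
    const size_t id = rows[i];
    if (values[id] <= split_cond) {
      (*buf)[counters[0].fetch_add(1)] = id;          // pack from the front
    } else {
      (*buf)[n - counters[1].fetch_add(1) - 1] = id;  // pack from the back
    }
  }
}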

// Split row indices (rid_span) into 2 parts (both stored in rid_buf) depending
// on the comparison of bin index values and the split point (split_cond).
// Handles sparse columns.
template <bool default_left, typename BinIdxType>
inline ::sycl::event PartitionSparseKernel(::sycl::queue* qu,
const GHistIndexMatrix& gmat,
const RowSetCollection::Elem& rid_span,
const size_t fid,
const int32_t split_cond,
xgboost::common::Span<size_t>* rid_buf,
size_t* parts_size,
::sycl::event event) {
const size_t row_stride = gmat.row_stride;
const BinIdxType* gradient_index = gmat.index.data<BinIdxType>();
const size_t* rid = rid_span.begin;
const size_t range_size = rid_span.Size();
const uint32_t* cut_ptrs = gmat.cut_device.Ptrs().DataConst();

size_t* p_rid_buf = rid_buf->data();
return qu->submit([&](::sycl::handler& cgh) {
cgh.depends_on(event);
cgh.parallel_for<>(::sycl::range<1>(range_size), [=](::sycl::item<1> nid) {
const size_t id = rid[nid.get_id(0)];

const BinIdxType* gr_index_local = gradient_index + row_stride * id;
const int32_t fid_local = std::lower_bound(gr_index_local,
gr_index_local + row_stride,
cut_ptrs[fid]) - gr_index_local;
const bool is_left = (fid_local >= row_stride ||
gr_index_local[fid_local] >= cut_ptrs[fid + 1]) ?
default_left :
gr_index_local[fid_local] <= split_cond;
if (is_left) {
AtomicRef<size_t> n_left(parts_size[0]);
p_rid_buf[n_left.fetch_add(1)] = id;
} else {
AtomicRef<size_t> n_right(parts_size[1]);
p_rid_buf[range_size - n_right.fetch_add(1) - 1] = id;
}
});
});
}
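
In the sparse case each row of the gradient index stores only the bins of the features that are present, in ascending global bin order, so std::lower_bound over the row finds the first bin with index >= cut_ptrs[fid]; if that bin does not fall inside [cut_ptrs[fid], cut_ptrs[fid + 1]), the feature is missing for this row and the node's default direction is taken. A standalone sketch of that lookup (plain C++, hypothetical data layout matching the description above):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Decide the direction of one sparse row for feature `fid`.
// `row_bins` holds the row's global bin indices in ascending order;
// `cut_ptrs[fid]` is the first bin id belonging to feature `fid`.
inline bool GoesLeft(const std::vector<uint32_t>& row_bins,
                     const std::vector<uint32_t>& cut_ptrs,
                     size_t fid, uint32_t split_cond, bool default_left) {
  auto it = std::lower_bound(row_bins.begin(), row_bins.end(), cut_ptrs[fid]);
  if (it == row_bins.end() || *it >= cut_ptrs[fid + 1]) {
    return default_left;        // feature is missing in this row
  }
  return *it <= split_cond;     // compare the feature's bin with the split point
}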

// The builder is required to partition samples into left and right children for a set of nodes
class PartitionBuilder {
public:
@@ -53,7 +137,6 @@ class PartitionBuilder {
return result_rows_[2 * nid];
}


size_t GetNRightElems(int nid) const {
return result_rows_[2 * nid + 1];
}
@@ -72,19 +155,97 @@
return { data_.Data() + nodes_offsets_[nid], nodes_offsets_[nid + 1] - nodes_offsets_[nid] };
}

template <typename BinIdxType>
::sycl::event Partition(const int32_t split_cond,
const GHistIndexMatrix& gmat,
const RowSetCollection::Elem& rid_span,
const xgboost::RegTree::Node& node,
xgboost::common::Span<size_t>* rid_buf,
size_t* parts_size,
::sycl::event event) {
const bst_uint fid = node.SplitIndex();
const bool default_left = node.DefaultLeft();

if (gmat.IsDense()) {
if (default_left) {
return PartitionDenseKernel<true, BinIdxType>(qu_, gmat, rid_span, fid,
split_cond, rid_buf, parts_size, event);
} else {
return PartitionDenseKernel<false, BinIdxType>(qu_, gmat, rid_span, fid,
split_cond, rid_buf, parts_size, event);
}
} else {
if (default_left) {
return PartitionSparseKernel<true, BinIdxType>(qu_, gmat, rid_span, fid,
split_cond, rid_buf, parts_size, event);
} else {
return PartitionSparseKernel<false, BinIdxType>(qu_, gmat, rid_span, fid,
split_cond, rid_buf, parts_size, event);
}
}
}

// Entry point for Partition
void Partition(const GHistIndexMatrix& gmat,
const std::vector<tree::ExpandEntry> nodes,
const RowSetCollection& row_set_collection,
const std::vector<int32_t>& split_conditions,
RegTree* p_tree,
::sycl::event* general_event) {
nodes_events_.resize(n_nodes_);

parts_size_.ResizeAndFill(qu_, 2 * n_nodes_, 0, general_event);

for (size_t node_in_set = 0; node_in_set < n_nodes_; node_in_set++) {
const int32_t nid = nodes[node_in_set].nid;
::sycl::event& node_event = nodes_events_[node_in_set];
const auto& rid_span = row_set_collection[nid];
if (rid_span.Size() > 0) {
const RegTree::Node& node = (*p_tree)[nid];
xgboost::common::Span<size_t> rid_buf = GetData(node_in_set);
size_t* part_size = parts_size_.Data() + 2 * node_in_set;
int32_t split_condition = split_conditions[node_in_set];
switch (gmat.index.GetBinTypeSize()) {
case common::BinTypeSize::kUint8BinsTypeSize:
node_event = Partition<uint8_t>(split_condition, gmat, rid_span, node,
&rid_buf, part_size, *general_event);
break;
case common::BinTypeSize::kUint16BinsTypeSize:
node_event = Partition<uint16_t>(split_condition, gmat, rid_span, node,
&rid_buf, part_size, *general_event);
break;
case common::BinTypeSize::kUint32BinsTypeSize:
node_event = Partition<uint32_t>(split_condition, gmat, rid_span, node,
&rid_buf, part_size, *general_event);
break;
default:
CHECK(false); // no default behavior
}
} else {
node_event = ::sycl::event();
}
}

*general_event = qu_->memcpy(result_rows_.data(),
parts_size_.DataConst(),
sizeof(size_t) * 2 * n_nodes_,
nodes_events_);
}
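
A hedged sketch of how a caller might drive this entry point; the builder is assumed to be already set up for the current batch of nodes, and `builder`, `destination`, and `qu` are illustrative names rather than the actual updater code:

::sycl::event event;
builder.Partition(gmat, nodes, row_set_collection, split_conditions, p_tree, &event);
event.wait();  // result_rows_ is now filled by the final memcpy above

for (size_t node_in_set = 0; node_in_set < nodes.size(); ++node_in_set) {
  const size_t n_left = builder.GetNLeftElems(node_in_set);
  const size_t n_right = builder.GetNRightElems(node_in_set);
  // Copy the packed row indices of this node into its destination buffer;
  // the copy depends on `event`, so it starts only after partitioning finished.
  builder.MergeToArray(node_in_set, destination[node_in_set], &event);
  (void)n_left; (void)n_right;  // typically used to split the node's row set in two
}
qu.wait();  // ensure all MergeToArray copies have completed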

void MergeToArray(size_t nid,
size_t* data_result,
::sycl::event event) {
::sycl::event* event) {
size_t n_nodes_total = GetNLeftElems(nid) + GetNRightElems(nid);
if (n_nodes_total > 0) {
const size_t* data = data_.Data() + nodes_offsets_[nid];
qu_->memcpy(data_result, data, sizeof(size_t) * n_nodes_total, event);
qu_->memcpy(data_result, data, sizeof(size_t) * n_nodes_total, *event);
}
}

protected:
std::vector<size_t> nodes_offsets_;
std::vector<size_t> result_rows_;
std::vector<::sycl::event> nodes_events_;
size_t n_nodes_;

USMVector<size_t, MemoryType::on_device> parts_size_;
49 changes: 34 additions & 15 deletions plugin/sycl/data.h
@@ -171,20 +171,20 @@ class USMVector {
}
}

::sycl::event ResizeAndFill(::sycl::queue* qu, size_t size_new, int v) {
void ResizeAndFill(::sycl::queue* qu, size_t size_new, int v, ::sycl::event* event) {
if (size_new <= size_) {
size_ = size_new;
return qu->memset(data_.get(), v, size_new * sizeof(T));
*event = qu->memset(data_.get(), v, size_new * sizeof(T), *event);
} else if (size_new <= capacity_) {
size_ = size_new;
return qu->memset(data_.get(), v, size_new * sizeof(T));
*event = qu->memset(data_.get(), v, size_new * sizeof(T), *event);
} else {
size_t size_old = size_;
auto data_old = data_;
size_ = size_new;
capacity_ = size_new;
data_ = allocate_memory_(qu, size_);
return qu->memset(data_.get(), v, size_new * sizeof(T));
*event = qu->memset(data_.get(), v, size_new * sizeof(T), *event);
}
}
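
ResizeAndFill no longer returns the memset event; the caller now threads a single ::sycl::event through successive asynchronous operations, which is how PartitionBuilder::Partition chains the parts_size_ initialization with the node kernels. A minimal usage sketch (variable names assumed):

// Sketch: chain an asynchronous fill with later device work through one event.
void InitPartsSize(::sycl::queue* qu, size_t n_nodes,
                   USMVector<size_t, MemoryType::on_device>* parts_size) {
  ::sycl::event event;  // default-constructed event: nothing to wait on yet
  parts_size->ResizeAndFill(qu, 2 * n_nodes, 0, &event);
  // Later submissions take `event` as a dependency (e.g. cgh.depends_on(event)),
  // so they start only after the memset has completed; qu->wait() also synchronizes.
}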

@@ -211,11 +211,16 @@
struct DeviceMatrix {
DMatrix* p_mat; // Pointer to the original matrix on the host
::sycl::queue qu_;
USMVector<size_t> row_ptr;
USMVector<size_t, MemoryType::on_device> row_ptr;
USMVector<Entry, MemoryType::on_device> data;
size_t total_offset;

DeviceMatrix(::sycl::queue qu, DMatrix* dmat) : p_mat(dmat), qu_(qu) {
DeviceMatrix() = default;

void Init(::sycl::queue qu, DMatrix* dmat) {
qu_ = qu;
p_mat = dmat;

size_t num_row = 0;
size_t num_nonzero = 0;
for (auto &batch : dmat->GetBatches<SparsePage>()) {
@@ -226,27 +231,41 @@
}

row_ptr.Resize(&qu_, num_row + 1);
size_t* rows = row_ptr.Data();
data.Resize(&qu_, num_nonzero);

size_t data_offset = 0;
::sycl::event event;
for (auto &batch : dmat->GetBatches<SparsePage>()) {
const auto& data_vec = batch.data.HostVector();
const auto& offset_vec = batch.offset.HostVector();
size_t batch_size = batch.Size();
if (batch_size > 0) {
std::copy(offset_vec.data(), offset_vec.data() + batch_size,
row_ptr.Data() + batch.base_rowid);
if (batch.base_rowid > 0) {
for (size_t i = 0; i < batch_size; i++)
row_ptr[i + batch.base_rowid] += batch.base_rowid;
const auto base_rowid = batch.base_rowid;
event = qu.memcpy(row_ptr.Data() + base_rowid, offset_vec.data(),
sizeof(size_t) * batch_size, event);
if (base_rowid > 0) {
qu.submit([&](::sycl::handler& cgh) {
cgh.depends_on(event);
cgh.parallel_for<>(::sycl::range<1>(batch_size), [=](::sycl::id<1> pid) {
int row_id = pid[0];
rows[row_id] += base_rowid;
});
});
}
qu.memcpy(data.Data() + data_offset,
data_vec.data(),
offset_vec[batch_size] * sizeof(Entry)).wait();
event = qu.memcpy(data.Data() + data_offset, data_vec.data(),
sizeof(Entry) * offset_vec[batch_size], event);
data_offset += offset_vec[batch_size];
qu.wait();
}
}
row_ptr[num_row] = data_offset;
qu.submit([&](::sycl::handler& cgh) {
cgh.depends_on(event);
cgh.single_task<>([=] {
rows[num_row] = data_offset;
});
});
qu.wait();
total_offset = data_offset;
}
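
The rewritten Init builds row_ptr and data on the device asynchronously: each memcpy returns an event, dependent kernels declare it through depends_on, and a final queue wait synchronizes before total_offset is used. A standalone sketch of that dependency pattern in generic SYCL (not the XGBoost types):

#include <CL/sycl.hpp>
#include <cstddef>
#include <vector>

int main() {
  ::sycl::queue qu;
  constexpr size_t n = 8;
  std::vector<size_t> host(n, 1);
  size_t* dev = ::sycl::malloc_device<size_t>(n, qu);

  // 1) asynchronous copy to the device
  ::sycl::event copy = qu.memcpy(dev, host.data(), n * sizeof(size_t));
  // 2) a kernel that runs only after the copy has finished
  ::sycl::event shift = qu.submit([&](::sycl::handler& cgh) {
    cgh.depends_on(copy);
    cgh.parallel_for<>(::sycl::range<1>(n), [=](::sycl::id<1> i) { dev[i[0]] += 10; });
  });
  // 3) a single work-item fixes up the last element, then everything is synchronized
  qu.submit([&](::sycl::handler& cgh) {
    cgh.depends_on(shift);
    cgh.single_task<>([=] { dev[n - 1] = 0; });
  });
  qu.wait();
  ::sycl::free(dev, qu);
  return 0;
}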

3 changes: 2 additions & 1 deletion plugin/sycl/predictor/predictor.cc
@@ -280,7 +280,8 @@ class Predictor : public xgboost::Predictor {
uint32_t tree_end = 0) const override {
::sycl::queue qu = device_manager.GetQueue(ctx_->Device());
// TODO(razdoburdin): remove temporary workaround after cache fix
sycl::DeviceMatrix device_matrix(qu, dmat);
sycl::DeviceMatrix device_matrix;
device_matrix.Init(qu, dmat);

auto* out_preds = &predts->predictions;
if (tree_end == 0) {
50 changes: 50 additions & 0 deletions plugin/sycl/tree/expand_entry.h
@@ -0,0 +1,50 @@
/*!
* Copyright 2017-2024 by Contributors
* \file expand_entry.h
*/
#ifndef PLUGIN_SYCL_TREE_EXPAND_ENTRY_H_
#define PLUGIN_SYCL_TREE_EXPAND_ENTRY_H_

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
#include "../../src/tree/constraints.h"
#pragma GCC diagnostic pop
#include "../../src/tree/hist/expand_entry.h"

namespace xgboost {
namespace sycl {
namespace tree {
/* tree growing policies */
struct ExpandEntry : public xgboost::tree::ExpandEntryImpl<ExpandEntry> {
static constexpr bst_node_t kRootNid = 0;

xgboost::tree::SplitEntry split;

ExpandEntry(int nid, int depth) : ExpandEntryImpl{nid, depth} {}

inline bst_node_t GetSiblingId(const xgboost::RegTree* p_tree) const {
CHECK_EQ((*p_tree)[nid].IsRoot(), false);
const size_t parent_id = (*p_tree)[nid].Parent();
return GetSiblingId(p_tree, parent_id);
}

inline bst_node_t GetSiblingId(const xgboost::RegTree* p_tree, size_t parent_id) const {
return p_tree->IsLeftChild(nid) ? p_tree->RightChild(parent_id)
: p_tree->LeftChild(parent_id);
}

bool IsValidImpl(xgboost::tree::TrainParam const &param, int32_t num_leaves) const {
if (split.loss_chg <= kRtEps) return false;
if (split.loss_chg < param.min_split_loss) return false;
if (param.max_depth > 0 && depth == param.max_depth) return false;
if (param.max_leaves > 0 && num_leaves == param.max_leaves) return false;

return true;
}
};

} // namespace tree
} // namespace sycl
} // namespace xgboost

#endif // PLUGIN_SYCL_TREE_EXPAND_ENTRY_H_
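
IsValidImpl encodes the stopping rules: a candidate is expanded only if its loss reduction exceeds both kRtEps and min_split_loss, and neither max_depth nor max_leaves has been reached. A hedged sketch of checking a candidate (the includes and the UpdateAllowUnknown-based parameter setup are assumptions about the usual dmlc parameter initialization, not code from this PR):

#include "../../src/tree/param.h"  // xgboost::tree::TrainParam (path assumed)
#include "expand_entry.h"

inline bool WorthExpanding() {
  xgboost::tree::TrainParam param;
  param.UpdateAllowUnknown(xgboost::Args{{"max_depth", "6"}, {"min_split_loss", "0.1"}});

  xgboost::sycl::tree::ExpandEntry entry(xgboost::sycl::tree::ExpandEntry::kRootNid, /*depth=*/0);
  entry.split.loss_chg = 0.5f;  // hypothetical gain of the best candidate split
  return entry.IsValidImpl(param, /*num_leaves=*/1);  // limits not hit -> true
}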
3 changes: 2 additions & 1 deletion tests/cpp/plugin/test_sycl_gradient_index.cc
@@ -49,7 +49,8 @@ TEST(SyclGradientIndex, Init) {

auto p_fmat = RandomDataGenerator{n_rows, n_columns, 0.3}.GenerateDMatrix();

sycl::DeviceMatrix dmat(qu, p_fmat.get());
sycl::DeviceMatrix dmat;
dmat.Init(qu, p_fmat.get());

int max_bins = 256;
common::GHistIndexMatrix gmat_sycl;