Skip to content

Commit

Permalink
Optimized ApplySplit and UpdatePredictCache functions on CPU
Browse files Browse the repository at this point in the history
  • Loading branch information
SmirnovEgorRu committed Feb 1, 2020
1 parent fe8d72b commit d0f8c3f
Show file tree
Hide file tree
Showing 7 changed files with 532 additions and 236 deletions.
1 change: 1 addition & 0 deletions src/common/column_matrix.h
Expand Up @@ -37,6 +37,7 @@ class Column {
/*! \brief length of this column, as stored in len_ */
size_t Size() const {
  return len_;
}
/*! \brief bin index of entry idx shifted by index_base_ (feature-local index plus base) */
uint32_t GetGlobalBinIdx(size_t idx) const {
  return index_base_ + index_[idx];
}
/*! \brief bin index of entry idx exactly as stored in index_, without the base offset */
uint32_t GetFeatureBinIdx(size_t idx) const {
  return index_[idx];
}
/*! \brief read-only pointer to the stored feature-local bin indices (index_) */
const uint32_t* GetFeatureBinIdxPtr() const {
  return index_;
}
/*!
 * \brief base offset of this column's bins.
 *
 * Invariant tying the three accessors together:
 *   column.GetFeatureBinIdx(idx) + column.GetBaseIdx() == column.GetGlobalBinIdx(idx)
 */
uint32_t GetBaseIdx() const {
  return index_base_;
}
Expand Down
101 changes: 84 additions & 17 deletions src/common/hist_util.cc
Expand Up @@ -707,40 +707,107 @@ void SubtractionHist(GHistRow dst, const GHistRow src1, const GHistRow src2,
}
}

/*!
 * \brief Accumulate gradient pairs into a histogram when every row has the same
 *        number of entries (n_features) in the bin-index array.
 *
 * For each position in [ibegin, iend) the row id r = rid[pos] is looked up, its bin
 * indices are read from index[r * n_features .. r * n_features + n_features), and the
 * two floats pgh[2*r], pgh[2*r + 1] are added to hist_data[2*bin], hist_data[2*bin + 1].
 *
 * \tparam FPType      element type of the histogram buffer
 * \tparam do_prefetch when true, issue prefetches for the row that is prefetch_offset
 *                     positions ahead; the caller must guarantee that
 *                     rid[pos + prefetch_offset] is readable for every processed pos
 * \param prefetch_step stride, in elements of index, between prefetched addresses
 */
template<typename FPType, bool do_prefetch>
void BuildHistDenseKernel(const size_t* rid, const float* pgh, const uint32_t* index,
                          FPType* hist_data, size_t ibegin, size_t iend, size_t n_features,
                          size_t prefetch_offset, size_t prefetch_step) {
  for (size_t pos = ibegin; pos < iend; ++pos) {
    const size_t row = rid[pos];
    const size_t first_entry = row * n_features;
    const size_t last_entry = first_entry + n_features;
    const size_t gh_offset = 2 * row;

    if (do_prefetch) {
      // warm the cache for the row that will be processed prefetch_offset steps later
      const size_t ahead_row = rid[pos + prefetch_offset];
      const size_t ahead_first = ahead_row * n_features;
      const size_t ahead_last = ahead_first + n_features;

      PREFETCH_READ_T0(pgh + 2 * ahead_row);
      for (size_t k = ahead_first; k < ahead_last; k += prefetch_step) {
        PREFETCH_READ_T0(index + k);
      }
    }

    for (size_t k = first_entry; k < last_entry; ++k) {
      // each bin owns two consecutive histogram slots
      const uint32_t slot = 2 * index[k];

      hist_data[slot] += pgh[gh_offset];
      hist_data[slot + 1] += pgh[gh_offset + 1];
    }
  }
}

/*!
 * \brief Accumulate gradient pairs into a histogram when rows may have a different
 *        number of entries; row boundaries are taken from row_ptr.
 *
 * For each position in [ibegin, iend) the row id r = rid[pos] is looked up, its bin
 * indices are read from index[row_ptr[r] .. row_ptr[r + 1]), and the two floats
 * pgh[2*r], pgh[2*r + 1] are added to hist_data[2*bin], hist_data[2*bin + 1].
 *
 * \tparam FPType      element type of the histogram buffer
 * \tparam do_prefetch when true, issue prefetches for the row that is prefetch_offset
 *                     positions ahead; the caller must guarantee that
 *                     rid[pos + prefetch_offset] is readable for every processed pos
 * \param prefetch_step stride, in elements of index, between prefetched addresses
 */
template<typename FPType, bool do_prefetch>
void BuildHistSparseKernel(const size_t* rid, const float* pgh, const uint32_t* index,
                           FPType* hist_data, const size_t* row_ptr, size_t ibegin, size_t iend,
                           size_t prefetch_offset, size_t prefetch_step) {
  for (size_t pos = ibegin; pos < iend; ++pos) {
    const size_t row = rid[pos];
    const size_t first_entry = row_ptr[row];
    const size_t last_entry = row_ptr[row + 1];
    const size_t gh_offset = 2 * row;

    if (do_prefetch) {
      // warm the cache for the row that will be processed prefetch_offset steps later
      const size_t ahead_row = rid[pos + prefetch_offset];
      const size_t ahead_first = row_ptr[ahead_row];
      const size_t ahead_last = row_ptr[ahead_row + 1];

      PREFETCH_READ_T0(pgh + 2 * ahead_row);
      for (size_t k = ahead_first; k < ahead_last; k += prefetch_step) {
        PREFETCH_READ_T0(index + k);
      }
    }

    for (size_t k = first_entry; k < last_entry; ++k) {
      // each bin owns two consecutive histogram slots
      const uint32_t slot = 2 * index[k];
      hist_data[slot] += pgh[gh_offset];
      hist_data[slot + 1] += pgh[gh_offset + 1];
    }
  }
}

/*!
 * \brief Dispatch histogram accumulation for positions [ibegin, iend) of rid to the
 *        dense or the sparse kernel.
 *
 * \param isDense when true, the dense kernel is used and the per-row entry count is
 *                taken from the width of row rid[0] in row_ptr; otherwise the sparse
 *                kernel reads every row's bounds from row_ptr.
 */
template<typename FPType, bool do_prefetch>
void BuildHistKernel(const size_t* rid, const float* pgh, const uint32_t* index,
                     FPType* hist_data, const size_t* row_ptr, size_t ibegin, size_t iend,
                     size_t prefetch_offset, size_t prefetch_step, bool isDense) {
  if (!isDense) {
    BuildHistSparseKernel<FPType, do_prefetch>(rid, pgh, index, hist_data, row_ptr,
                                               ibegin, iend, prefetch_offset, prefetch_step);
    return;
  }

  // dense layout: the width of the first listed row is used for all rows
  const size_t first_row = rid[0];
  const size_t n_features = row_ptr[first_row + 1] - row_ptr[first_row];
  BuildHistDenseKernel<FPType, do_prefetch>(rid, pgh, index, hist_data,
                                            ibegin, iend, n_features,
                                            prefetch_offset, prefetch_step);
}

// Accumulates the gradient pairs of the rows listed in `row_indices` into `hist`,
// using the bin indices stored in `gmat`.
// NOTE(review): this span is a rendered diff without +/- markers, so lines of the
// previous single-loop implementation and of the new kernel-dispatching implementation
// appear interleaved (two signature tails, two hist_data declarations, the old loop
// followed by the new dispatch). The comments below annotate the lines as shown.
void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow hist) {
GHistRow hist,
bool isDense) {
// row ids to process and how many of them there are
const size_t* rid = row_indices.begin;
const size_t nrows = row_indices.Size();
const uint32_t* index = gmat.index.data();
const size_t* row_ptr = gmat.row_ptr.data();
// gradient pairs viewed as a flat float array; row r is read at offsets 2*r and 2*r+1
// (assumes GradientPair is laid out as two consecutive floats - TODO confirm)
const float* pgh = reinterpret_cast<const float*>(gpair.data());

double* hist_data = reinterpret_cast<double*>(hist.data());
// histogram buffer viewed as a flat array of the type of GradStats::sum_grad
using FPType = decltype(tree::GradStats::sum_grad);
FPType* hist_data = reinterpret_cast<FPType*>(hist.data());

const size_t cache_line_size = 64;
// how many positions ahead of the current row the prefetches are issued
const size_t prefetch_offset = 10;
// number of trailing rows handled without prefetching (clamped to nrows), so that
// rid[i + prefetch_offset] is only read for positions far enough from the end
size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid);
no_prefetch_size = no_prefetch_size > nrows ? nrows : no_prefetch_size;
// number of bin indices that fit in one cache line; stride between index prefetches
const size_t prefetch_step = cache_line_size / sizeof(*index);

for (size_t i = 0; i < nrows; ++i) {
const size_t icol_start = row_ptr[rid[i]];
const size_t icol_end = row_ptr[rid[i]+1];

if (i < nrows - no_prefetch_size) {
PREFETCH_READ_T0(row_ptr + rid[i + prefetch_offset]);
PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]);
}

for (size_t j = icol_start; j < icol_end; ++j) {
const uint32_t idx_bin = 2*index[j];
const size_t idx_gh = 2*rid[i];
// if need to work with all rows from bin-matrix (e.g. root node)
// NOTE(review): reads rid[Size()-1] and rid[0]; presumably callers never pass an
// empty row set - confirm
const bool contiguousBlock = (rid[row_indices.Size()-1] - rid[0]) == (row_indices.Size() - 1);

hist_data[idx_bin] += pgh[idx_gh];
hist_data[idx_bin+1] += pgh[idx_gh+1];
}
if (contiguousBlock) {
// contiguous memory access, built-in HW prefetching is enough
BuildHistKernel<FPType, false>(rid, pgh, index, hist_data, row_ptr,
0, nrows, prefetch_offset, prefetch_step, isDense);
} else {
// leading rows: processed with software prefetching of the row prefetch_offset ahead
BuildHistKernel<FPType, true>(rid, pgh, index, hist_data, row_ptr,
0, nrows - no_prefetch_size, prefetch_offset, prefetch_step, isDense);
// no prefetching to avoid loading extra memory
BuildHistKernel<FPType, false>(rid, pgh, index, hist_data, row_ptr,
nrows - no_prefetch_size, nrows, prefetch_offset, prefetch_step, isDense);
}
}

Expand Down
3 changes: 2 additions & 1 deletion src/common/hist_util.h
Expand Up @@ -622,7 +622,8 @@ class GHistBuilder {
void BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow hist);
GHistRow hist,
bool isDense);
// same, with feature grouping
void BuildBlockHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
Expand Down
143 changes: 128 additions & 15 deletions src/common/row_set.h
Expand Up @@ -10,6 +10,7 @@
#include <xgboost/data.h>
#include <algorithm>
#include <vector>
#include <utility>

namespace xgboost {
namespace common {
Expand Down Expand Up @@ -57,6 +58,13 @@ class RowSetCollection {
<< "access element that is not in the set";
return e;
}

/*!
 * \brief mutable access to the element set stored for node_id.
 *        No validity check is performed on the returned element here.
 */
inline Elem& operator[](unsigned node_id) {
  return elem_of_each_node_[node_id];
}

// clear up things
inline void Clear() {
elem_of_each_node_.clear();
Expand All @@ -83,25 +91,18 @@ class RowSetCollection {
}
// split rowset into two
inline void AddSplit(unsigned node_id,
const std::vector<Split>& row_split_tloc,
unsigned left_node_id,
unsigned right_node_id) {
unsigned right_node_id,
size_t n_left,
size_t n_right) {
const Elem e = elem_of_each_node_[node_id];
const auto nthread = static_cast<bst_omp_uint>(row_split_tloc.size());
CHECK(e.begin != nullptr);
size_t* all_begin = dmlc::BeginPtr(row_indices_);
size_t* begin = all_begin + (e.begin - all_begin);

size_t* it = begin;
for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
std::copy(row_split_tloc[tid].left.begin(), row_split_tloc[tid].left.end(), it);
it += row_split_tloc[tid].left.size();
}
size_t* split_pt = it;
for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
std::copy(row_split_tloc[tid].right.begin(), row_split_tloc[tid].right.end(), it);
it += row_split_tloc[tid].right.size();
}
CHECK_EQ(n_left + n_right, e.Size());
CHECK_LE(begin + n_left, e.end);
CHECK_EQ(begin + n_left + n_right, e.end);

if (left_node_id >= elem_of_each_node_.size()) {
elem_of_each_node_.resize(left_node_id + 1, Elem(nullptr, nullptr, -1));
Expand All @@ -110,8 +111,8 @@ class RowSetCollection {
elem_of_each_node_.resize(right_node_id + 1, Elem(nullptr, nullptr, -1));
}

elem_of_each_node_[left_node_id] = Elem(begin, split_pt, left_node_id);
elem_of_each_node_[right_node_id] = Elem(split_pt, e.end, right_node_id);
elem_of_each_node_[left_node_id] = Elem(begin, begin + n_left, left_node_id);
elem_of_each_node_[right_node_id] = Elem(begin + n_left, e.end, right_node_id);
elem_of_each_node_[node_id] = Elem(nullptr, nullptr, -1);
}

Expand All @@ -123,6 +124,118 @@ class RowSetCollection {
std::vector<Elem> elem_of_each_node_;
};


/*!
 * \brief Scratch storage for splitting the row indices of several nodes into a
 *        "left" and a "right" part, working in blocks of at most BlockSize rows.
 *
 * Every (node, block) pair is a task that owns one BlockInfo with a left and a right
 * buffer. Intended call sequence, as far as this class shows:
 *   1. Init() lays out which range of blocks belongs to which node;
 *   2. each task fills the buffers obtained from GetLeftBuffer()/GetRightBuffer() and
 *      records the counts with SetNLeftElems()/SetNRightElems();
 *   3. CalculateRowOffsets() turns the per-block counts into destination offsets;
 *   4. MergeToArray() copies each block's buffers to those offsets.
 *
 * \tparam BlockSize capacity, in row indices, of each left/right buffer
 */
template<size_t BlockSize>
class PartitionBuilder {
 public:
  /*!
   * \brief Lay out the blocks of n_nodes nodes.
   * \param n_tasks total number of blocks required; storage only ever grows
   * \param n_nodes number of nodes
   * \param func    callable; func(i) gives the number of blocks of the i-th node
   */
  template<typename Func>
  void Init(const size_t n_tasks, size_t n_nodes, Func func) {
    node_sizes_.resize(n_nodes);
    nodes_.resize(n_nodes+1);

    // nodes_[i] is the index of the first block of node i (prefix sum of func)
    nodes_[0] = 0;
    for (size_t i = 1; i < n_nodes+1; ++i) {
      nodes_[i] = nodes_[i-1] + func(i-1);
    }

    if (n_tasks > max_n_tasks_) {
      blocks_.resize(n_tasks);
      max_n_tasks_ = n_tasks;
    }
  }

  /*! \brief left buffer of the block of node nid that starts at row offset begin (end is unused) */
  size_t* GetLeftBuffer(int nid, size_t begin, size_t end) {
    return BlockAt(nid, begin).left();
  }

  /*! \brief right buffer of the block of node nid that starts at row offset begin (end is unused) */
  size_t* GetRightBuffer(int nid, size_t begin, size_t end) {
    return BlockAt(nid, begin).right();
  }

  /*! \brief record how many entries of the block's left buffer are in use (end is unused) */
  void SetNLeftElems(int nid, size_t begin, size_t end, size_t n_left) {
    BlockAt(nid, begin).n_left = n_left;
  }

  /*! \brief record how many entries of the block's right buffer are in use (end is unused) */
  void SetNRightElems(int nid, size_t begin, size_t end, size_t n_right) {
    BlockAt(nid, begin).n_right = n_right;
  }


  /*! \brief total left count of node nid; valid after CalculateRowOffsets() */
  size_t GetNLeftElems(int nid) const {
    return node_sizes_[nid].first;
  }

  /*! \brief total right count of node nid; valid after CalculateRowOffsets() */
  size_t GetNRightElems(int nid) const {
    return node_sizes_[nid].second;
  }

  /*!
   * \brief For every node, compute where each block's left and right entries go:
   *        all left parts first, in block order, followed by all right parts.
   *        Also stores the per-node totals returned by GetNLeftElems/GetNRightElems.
   */
  void CalculateRowOffsets() {
    for (size_t i = 0; i < nodes_.size()-1; ++i) {
      size_t n_left = 0;
      for (size_t j = nodes_[i]; j < nodes_[i+1]; ++j) {
        blocks_[j].n_offset_left = n_left;
        n_left += blocks_[j].n_left;
      }
      size_t n_right = 0;
      for (size_t j = nodes_[i]; j < nodes_[i+1]; ++j) {
        // right parts start after the node's complete left part
        blocks_[j].n_offset_right = n_left + n_right;
        n_right += blocks_[j].n_right;
      }
      node_sizes_[i] = {n_left, n_right};
    }
  }

  /*!
   * \brief Copy the left and right buffers of one block into rows_indexes at the
   *        offsets computed by CalculateRowOffsets().
   * \param rows_indexes destination; offsets are relative to this pointer
   */
  void MergeToArray(int nid, size_t begin, size_t* rows_indexes) {
    BlockInfo& block = BlockAt(nid, begin);

    size_t* left_result = rows_indexes + block.n_offset_left;
    size_t* right_result = rows_indexes + block.n_offset_right;

    const size_t* left = block.left();
    const size_t* right = block.right();

    std::copy_n(left, block.n_left, left_result);
    std::copy_n(right, block.n_right, right_result);
  }

 protected:
  /*! \brief index into blocks_ of the block of node nid that starts at row offset begin */
  size_t GetTaskIdx(int nid, size_t begin) const {
    return nodes_[nid] + begin/BlockSize;
  }

  /*! \brief buffers and bookkeeping of a single (node, block) task */
  struct BlockInfo{
    // number of used entries in the left / right buffer
    size_t n_left;
    size_t n_right;

    // destination offsets of the left / right part, set by CalculateRowOffsets()
    size_t n_offset_left;
    size_t n_offset_right;

    size_t* left() {
      return &left_data_[0];
    }

    size_t* right() {
      return &right_data_[0];
    }
   private:
    alignas(128) size_t left_data_[BlockSize];
    alignas(128) size_t right_data_[BlockSize];
  };

  /*!
   * \brief Block owned by the task (nid, begin), with a bounds check.
   *
   * A valid index is strictly smaller than blocks_.size(); the check is written as
   * task_idx + 1 <= size so that an index equal to size is rejected as well.
   */
  BlockInfo& BlockAt(int nid, size_t begin) {
    const size_t task_idx = GetTaskIdx(nid, begin);
    CHECK_LE(task_idx + 1, blocks_.size());
    return blocks_[task_idx];
  }

  // per node: {total left count, total right count}
  std::vector<std::pair<size_t, size_t>> node_sizes_;
  // nodes_[i] .. nodes_[i+1] is the range of block indices belonging to node i
  std::vector<size_t> nodes_;
  std::vector<BlockInfo> blocks_;
  // largest n_tasks seen by Init(); blocks_ is never shrunk
  size_t max_n_tasks_ = 0;
};


} // namespace common
} // namespace xgboost

Expand Down

0 comments on commit d0f8c3f

Please sign in to comment.