Optimized ApplySplit and UpdatePredictCache functions on CPU #5244

Merged
9 commits merged on Feb 29, 2020
5 changes: 3 additions & 2 deletions src/common/column_matrix.h
@@ -37,6 +37,7 @@ class Column {
size_t Size() const { return len_; }
uint32_t GetGlobalBinIdx(size_t idx) const { return index_base_ + index_[idx]; }
uint32_t GetFeatureBinIdx(size_t idx) const { return index_[idx]; }
common::Span<const uint32_t> GetFeatureBinIdxPtr() const { return { index_, len_ }; }
// column.GetFeatureBinIdx(idx) + column.GetBaseIdx(idx) ==
// column.GetGlobalBinIdx(idx)
uint32_t GetBaseIdx() const { return index_base_; }
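The new GetFeatureBinIdxPtr() accessor exposes a column's bin indices as a common::Span (a non-owning view) instead of requiring per-element GetFeatureBinIdx calls. Below is a minimal sketch of the idea; Span and DenseColumnView are simplified stand-ins, not xgboost's actual common::Span and Column types.

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Simplified stand-in for xgboost's common::Span: a non-owning view over contiguous data.
template <typename T>
struct Span {
  T* ptr;
  std::size_t len;
  T* begin() const { return ptr; }
  T* end() const { return ptr + len; }
  std::size_t size() const { return len; }
};

// Hypothetical column view mirroring the accessor above: it hands out the whole
// bin-index array as one span rather than one element at a time.
struct DenseColumnView {
  const uint32_t* index_;
  std::size_t len_;
  Span<const uint32_t> GetFeatureBinIdxPtr() const { return {index_, len_}; }
};

int main() {
  std::vector<uint32_t> bins{0, 2, 1, 2, 0};
  DenseColumnView col{bins.data(), bins.size()};
  std::size_t rows_in_bin2 = 0;
  for (uint32_t b : col.GetFeatureBinIdxPtr()) {  // range-for works directly on the span view
    if (b == 2) ++rows_in_bin2;
  }
  std::cout << "rows falling into bin 2: " << rows_in_bin2 << "\n";  // prints 2
  return 0;
}
```

Returning a span lets a caller iterate a column or hand it to a kernel without copying or exposing the raw pointer and length separately.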
@@ -186,8 +187,8 @@ class ColumnMatrix {

std::vector<size_t> feature_counts_;
std::vector<ColumnType> type_;
SimpleArray<uint32_t> index_; // index_: may store smaller integers; needs padding
SimpleArray<size_t> row_ind_;
std::vector<uint32_t> index_; // index_: may store smaller integers; needs padding
std::vector<size_t> row_ind_;
std::vector<ColumnBoundary> boundary_;

// index_base_[fid]: least bin id for feature fid
147 changes: 124 additions & 23 deletions src/common/hist_util.cc
@@ -672,7 +672,7 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
}

/*!
* \brief fill a histogram by zeroes
* \brief fill a histogram by zeros in range [begin, end)
*/
void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end) {
memset(hist.data() + begin, '\0', (end-begin)*sizeof(tree::GradStats));
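The zero-fill works because each histogram entry is a pair of contiguous floating-point accumulators, so clearing the half-open bin range [begin, end) is a single memset. A self-contained sketch under that assumption, using a local GradStatsLike stand-in rather than xgboost's tree::GradStats:

```cpp
#include <cassert>
#include <cstddef>
#include <cstring>
#include <vector>

// Stand-in for tree::GradStats: two contiguous double accumulators per histogram bin.
struct GradStatsLike {
  double sum_grad;
  double sum_hess;
};

// Zero bins in the half-open range [begin, end), mirroring the function above.
void FillByZeros(std::vector<GradStatsLike>* hist, std::size_t begin, std::size_t end) {
  std::memset(hist->data() + begin, '\0', (end - begin) * sizeof(GradStatsLike));
}

int main() {
  std::vector<GradStatsLike> hist(8, GradStatsLike{1.0, 2.0});
  FillByZeros(&hist, 2, 5);          // clears bins 2, 3, 4 only
  assert(hist[1].sum_grad == 1.0);   // untouched
  assert(hist[3].sum_grad == 0.0);   // cleared (all-zero bits is IEEE 754 zero)
  assert(hist[5].sum_hess == 2.0);   // untouched
  return 0;
}
```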
@@ -719,43 +719,144 @@ void SubtractionHist(GHistRow dst, const GHistRow src1, const GHistRow src2,
}
}


void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow hist) {
const size_t* rid = row_indices.begin;
const size_t nrows = row_indices.Size();
const uint32_t* index = gmat.index.data();
const size_t* row_ptr = gmat.row_ptr.data();
struct Prefetch {
public:
static constexpr size_t kCacheLineSize = 64;
static constexpr size_t kPrefetchOffset = 10;
static constexpr size_t kPrefetchStep =
kCacheLineSize / sizeof(decltype(GHistIndexMatrix::index)::value_type);

private:
static constexpr size_t kNoPrefetchSize =
kPrefetchOffset + kCacheLineSize /
sizeof(decltype(GHistIndexMatrix::row_ptr)::value_type);

public:
static size_t NoPrefetchSize(size_t rows) {
return std::min(rows, kNoPrefetchSize);
}
};

constexpr size_t Prefetch::kNoPrefetchSize;
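The Prefetch helper centralizes the software-prefetch tuning: kPrefetchStep advances by one cache line worth of uint32_t gradient-index entries, and NoPrefetchSize() caps the number of trailing rows that must run without look-ahead so the prefetch index never points past the row set. Below is a hedged sketch of the same pattern as a plain gather loop; the __builtin_prefetch intrinsic is the GCC/Clang spelling that a macro like PREFETCH_READ_T0 typically wraps, and SumGathered is illustrative, not part of the PR.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Tuning constants analogous to the Prefetch helper above.
constexpr std::size_t kCacheLineSize = 64;
constexpr std::size_t kPrefetchOffset = 10;  // how many loop iterations to look ahead
// One cache line holds this many uint32_t gradient-index entries, so issuing a prefetch
// every kPrefetchStep elements touches each cache line exactly once. This stride is what
// the kernels below use when prefetching a range of columns.
constexpr std::size_t kPrefetchStep = kCacheLineSize / sizeof(uint32_t);

// Illustrative gather loop: while processing element i, prefetch the data that iteration
// i + kPrefetchOffset will need, but only while that index stays in bounds.
double SumGathered(const std::vector<std::size_t>& index, const std::vector<double>& values) {
  double total = 0.0;
  for (std::size_t i = 0; i < index.size(); ++i) {
    if (i + kPrefetchOffset < index.size()) {
#if defined(__GNUC__) || defined(__clang__)
      __builtin_prefetch(&values[index[i + kPrefetchOffset]], /*rw=*/0, /*locality=*/3);
#endif
    }
    total += values[index[i]];
  }
  return total;
}
```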

template<typename FPType, bool do_prefetch>
void BuildHistDenseKernel(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
const size_t n_features,
GHistRow hist) {
const size_t size = row_indices.Size();
const size_t* rid = row_indices.begin;
const float* pgh = reinterpret_cast<const float*>(gpair.data());
const uint32_t* gradient_index = gmat.index.data();
FPType* hist_data = reinterpret_cast<FPType*>(hist.data());

const uint32_t two {2}; // Each element from 'gpair' and 'hist' contains
// 2 FP values: gradient and hessian.
// So we need to multiply each row-index/bin-index by 2
// to work with gradient pairs as a single row FP array

for (size_t i = 0; i < size; ++i) {
const size_t icol_start = rid[i] * n_features;
const size_t idx_gh = two * rid[i];

if (do_prefetch) {
const size_t icol_start_prefetch = rid[i + Prefetch::kPrefetchOffset] * n_features;

double* hist_data = reinterpret_cast<double*>(hist.data());
PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]);
for (size_t j = icol_start_prefetch; j < icol_start_prefetch + n_features;
j += Prefetch::kPrefetchStep) {
PREFETCH_READ_T0(gradient_index + j);
}
}

for (size_t j = icol_start; j < icol_start + n_features; ++j) {
const uint32_t idx_bin = two * gradient_index[j];

hist_data[idx_bin] += pgh[idx_gh];
hist_data[idx_bin+1] += pgh[idx_gh+1];
}
}
}
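Both gpair and hist store two floating-point values per element (gradient and hessian), which is why the kernels treat them as flat FP arrays and multiply row and bin indices by two. A minimal sketch of that dense accumulation without prefetching, using hypothetical GradPair and AccumulateDense stand-ins over all rows rather than xgboost's types and row sets:

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for xgboost's GradientPair: gradient and hessian stored back to back.
struct GradPair {
  float grad;
  float hess;
};

// Accumulate gradient pairs into a histogram laid out as [g0, h0, g1, h1, ...].
void AccumulateDense(const std::vector<GradPair>& gpair,
                     const std::vector<uint32_t>& gradient_index,  // bin id per (row, feature)
                     std::size_t n_features,
                     std::vector<double>* hist) {
  const float* pgh = reinterpret_cast<const float*>(gpair.data());
  double* hist_data = hist->data();
  const std::size_t n_rows = gradient_index.size() / n_features;
  for (std::size_t row = 0; row < n_rows; ++row) {
    const std::size_t idx_gh = 2 * row;  // 2 floats per gradient pair
    for (std::size_t j = row * n_features; j < (row + 1) * n_features; ++j) {
      const std::size_t idx_bin = 2 * gradient_index[j];  // 2 doubles per bin
      hist_data[idx_bin] += pgh[idx_gh];          // gradient
      hist_data[idx_bin + 1] += pgh[idx_gh + 1];  // hessian
    }
  }
}

int main() {
  std::vector<GradPair> gpair{{1.f, 0.5f}, {2.f, 1.f}};
  std::vector<uint32_t> gradient_index{0, 1, 1, 0};  // 2 rows x 2 features, dense layout
  std::vector<double> hist(4, 0.0);                  // 2 bins x (grad, hess)
  AccumulateDense(gpair, gradient_index, 2, &hist);
  std::cout << "bin0 grad=" << hist[0] << " hess=" << hist[1] << "\n";  // 3, 1.5
  std::cout << "bin1 grad=" << hist[2] << " hess=" << hist[3] << "\n";  // 3, 1.5
  return 0;
}
```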

template<typename FPType, bool do_prefetch>
void BuildHistSparseKernel(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow hist) {
const size_t size = row_indices.Size();
const size_t* rid = row_indices.begin;
const float* pgh = reinterpret_cast<const float*>(gpair.data());
const uint32_t* gradient_index = gmat.index.data();
const size_t* row_ptr = gmat.row_ptr.data();
FPType* hist_data = reinterpret_cast<FPType*>(hist.data());

const size_t cache_line_size = 64;
const size_t prefetch_offset = 10;
size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid);
no_prefetch_size = no_prefetch_size > nrows ? nrows : no_prefetch_size;
const uint32_t two {2}; // Each element from 'gpair' and 'hist' contains
// 2 FP values: gradient and hessian.
// So we need to multiply each row-index/bin-index by 2
// to work with gradient pairs as a single row FP array

for (size_t i = 0; i < nrows; ++i) {
for (size_t i = 0; i < size; ++i) {
const size_t icol_start = row_ptr[rid[i]];
const size_t icol_end = row_ptr[rid[i]+1];
const size_t idx_gh = two * rid[i];

if (do_prefetch) {
const size_t icol_start_prftch = row_ptr[rid[i+Prefetch::kPrefetchOffset]];
const size_t icol_end_prefect = row_ptr[rid[i+Prefetch::kPrefetchOffset]+1];

if (i < nrows - no_prefetch_size) {
PREFETCH_READ_T0(row_ptr + rid[i + prefetch_offset]);
PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]);
PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]);
for (size_t j = icol_start_prftch; j < icol_end_prefect; j+=Prefetch::kPrefetchStep) {
PREFETCH_READ_T0(gradient_index + j);
}
}

for (size_t j = icol_start; j < icol_end; ++j) {
const uint32_t idx_bin = 2*index[j];
const size_t idx_gh = 2*rid[i];

hist_data[idx_bin] += pgh[idx_gh];
const uint32_t idx_bin = two * gradient_index[j];
hist_data[idx_bin] += pgh[idx_gh];
hist_data[idx_bin+1] += pgh[idx_gh+1];
}
}
}
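The sparse kernel walks CSR rows: row_ptr[rid] and row_ptr[rid + 1] bound row rid's entries in gmat.index. Here is a compact sketch of that traversal under the same flat-FP-array convention, with prefetching omitted for brevity; AccumulateSparse and its argument types are illustrative stand-ins, not xgboost's.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Accumulate histogram contributions for a subset of rows of a CSR-encoded
// gradient-index matrix. The histogram is laid out as [g0, h0, g1, h1, ...].
void AccumulateSparse(const std::vector<float>& pgh,            // [g0, h0, g1, h1, ...] per row
                      const std::vector<std::size_t>& rid,      // rows to process
                      const std::vector<std::size_t>& row_ptr,  // CSR offsets, size n_rows + 1
                      const std::vector<uint32_t>& gradient_index,
                      std::vector<double>* hist) {
  double* hist_data = hist->data();
  for (std::size_t i = 0; i < rid.size(); ++i) {
    const std::size_t icol_start = row_ptr[rid[i]];      // first entry of row rid[i]
    const std::size_t icol_end = row_ptr[rid[i] + 1];    // one past its last entry
    const std::size_t idx_gh = 2 * rid[i];
    for (std::size_t j = icol_start; j < icol_end; ++j) {
      const std::size_t idx_bin = 2 * gradient_index[j];
      hist_data[idx_bin] += pgh[idx_gh];
      hist_data[idx_bin + 1] += pgh[idx_gh + 1];
    }
  }
}

int main() {
  // Two rows: row 0 has entries in bins {0, 2}, row 1 only in bin {1}.
  std::vector<std::size_t> row_ptr{0, 2, 3};
  std::vector<uint32_t> gradient_index{0, 2, 1};
  std::vector<float> pgh{1.f, 0.5f, 2.f, 1.f};  // (g, h) for rows 0 and 1
  std::vector<std::size_t> rid{0, 1};           // process both rows
  std::vector<double> hist(6, 0.0);             // 3 bins x (grad, hess)
  AccumulateSparse(pgh, rid, row_ptr, gradient_index, &hist);
  // hist == {1, 0.5, 2, 1, 1, 0.5}
  return hist[2] == 2.0 ? 0 : 1;
}
```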

template<typename FPType, bool do_prefetch>
void BuildHistKernel(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat, const bool isDense, GHistRow hist) {
if (row_indices.Size() && isDense) {
const size_t* row_ptr = gmat.row_ptr.data();
const size_t n_features = row_ptr[row_indices.begin[0]+1] - row_ptr[row_indices.begin[0]];
BuildHistDenseKernel<FPType, do_prefetch>(gpair, row_indices, gmat, n_features, hist);
} else {
BuildHistSparseKernel<FPType, do_prefetch>(gpair, row_indices, gmat, hist);
}
}
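For a dense gradient-index matrix every row holds exactly n_features entries, so the dispatcher can recover the feature count from a single row's CSR offsets and let the dense kernel index with rid[i] * n_features instead of reading row_ptr per row. A tiny sketch of that recovery; the helper name DenseRowWidth is hypothetical:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// For a dense gradient-index matrix every row stores exactly n_features entries,
// so the feature count can be recovered from any single row's CSR offsets.
std::size_t DenseRowWidth(const std::vector<std::size_t>& row_ptr, std::size_t some_row) {
  return row_ptr[some_row + 1] - row_ptr[some_row];
}

int main() {
  // 3 rows x 4 features, dense: offsets grow by 4 every row.
  std::vector<std::size_t> row_ptr{0, 4, 8, 12};
  assert(DenseRowWidth(row_ptr, 0) == 4);
  assert(DenseRowWidth(row_ptr, 2) == 4);
  return 0;
}
```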

void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow hist,
bool isDense) {
using FPType = decltype(tree::GradStats::sum_grad);
const size_t nrows = row_indices.Size();
const size_t no_prefetch_size = Prefetch::NoPrefetchSize(nrows);

// if we need to work with all rows from the bin-matrix (e.g. the root node)
const bool contiguousBlock = (row_indices.begin[nrows - 1] - row_indices.begin[0]) == (nrows - 1);

if (contiguousBlock) {
// contiguous memory access, built-in HW prefetching is enough
BuildHistKernel<FPType, false>(gpair, row_indices, gmat, isDense, hist);
} else {
const RowSetCollection::Elem span1(row_indices.begin, row_indices.end - no_prefetch_size);
const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size, row_indices.end);

BuildHistKernel<FPType, true>(gpair, span1, gmat, isDense, hist);
// no prefetching to avoid loading extra memory
BuildHistKernel<FPType, false>(gpair, span2, gmat, isDense, hist);
}
}
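BuildHist now first checks whether the row indices form one contiguous block (last - first == count - 1), in which case hardware prefetching already streams the accesses; otherwise it splits the rows into a head processed with software prefetching and a short tail processed without, so the look-ahead index never reads past the end of the row list. The sketch below mirrors only that splitting logic; ProcessRows and BuildWithTailSplit are hypothetical helpers, not xgboost functions.

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

constexpr std::size_t kPrefetchOffsetRows = 10;  // analogous to Prefetch::kPrefetchOffset above

// Hypothetical kernel: here it only reports which instantiation handled how many rows.
template <bool do_prefetch>
void ProcessRows(const std::size_t* first, const std::size_t* last) {
  std::cout << (do_prefetch ? "prefetching " : "plain ") << (last - first) << " rows\n";
}

void BuildWithTailSplit(const std::vector<std::size_t>& rows) {
  const std::size_t n = rows.size();
  if (n == 0) return;
  // Contiguous row ids (e.g. the root node) already stream well through hardware prefetch.
  const bool contiguous = (rows.back() - rows.front()) == (n - 1);
  if (contiguous) {
    ProcessRows<false>(rows.data(), rows.data() + n);
  } else {
    const std::size_t tail = std::min(n, kPrefetchOffsetRows);
    // Head may safely look kPrefetchOffsetRows ahead; the tail runs without look-ahead.
    ProcessRows<true>(rows.data(), rows.data() + (n - tail));
    ProcessRows<false>(rows.data() + (n - tail), rows.data() + n);
  }
}

int main() {
  BuildWithTailSplit({0, 1, 2, 3, 4});        // contiguous: plain pass only
  BuildWithTailSplit({3, 7, 8, 20, 41, 56});  // scattered: split into head + tail
  return 0;
}
```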

void GHistBuilder::BuildBlockHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexBlockMatrix& gmatb,
90 changes: 10 additions & 80 deletions src/common/hist_util.h
@@ -1,5 +1,5 @@
/*!
* Copyright 2017 by Contributors
* Copyright 2017-2020 by Contributors
* \file hist_util.h
* \brief Utility for fast histogram aggregation
* \author Philip Cho, Tianqi Chen
@@ -25,75 +25,6 @@

namespace xgboost {
namespace common {

/*
* \brief A thin wrapper around dynamically allocated C-style array.
* Make sure to call resize() before use.
*/
template<typename T>
struct SimpleArray {
~SimpleArray() {
std::free(ptr_);
ptr_ = nullptr;
}

void resize(size_t n) {
T* ptr = static_cast<T*>(std::malloc(n * sizeof(T)));
CHECK(ptr) << "Failed to allocate memory";
if (ptr_) {
std::memcpy(ptr, ptr_, n_ * sizeof(T));
std::free(ptr_);
}
ptr_ = ptr;
n_ = n;
}

T& operator[](size_t idx) {
return ptr_[idx];
}

T& operator[](size_t idx) const {
return ptr_[idx];
}

size_t size() const {
return n_;
}

T back() const {
return ptr_[n_-1];
}

T* data() {
return ptr_;
}

const T* data() const {
return ptr_;
}


T* begin() {
return ptr_;
}

const T* begin() const {
return ptr_;
}

T* end() {
return ptr_ + n_;
}

const T* end() const {
return ptr_ + n_;
}

private:
T* ptr_ = nullptr;
size_t n_ = 0;
};
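The hand-rolled SimpleArray wrapper removed above is replaced by std::vector, which already provides contiguous storage, resize(), operator[], back(), data(), and begin()/end() iteration, with automatic deallocation instead of manual malloc/free. A minimal illustration of the equivalent std::vector usage:

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// std::vector covers everything SimpleArray offered, plus RAII cleanup.
int main() {
  std::vector<uint32_t> index;
  index.resize(4);                             // was SimpleArray::resize(n)
  for (std::size_t i = 0; i < index.size(); ++i) {
    index[i] = static_cast<uint32_t>(i * i);   // was operator[]
  }
  std::cout << "size = " << index.size()       // was size()
            << ", back = " << index.back()     // was back()
            << "\n";
  const uint32_t* raw = index.data();          // was data(), usable with C-style APIs
  std::cout << "first = " << *raw << "\n";
  for (uint32_t v : index) {                   // begin()/end() enable range-for
    std::cout << v << " ";
  }
  std::cout << "\n";
  return 0;
}
```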

/*!
* \brief A single row in global histogram index.
* Directly represent the global index in the histogram entry.
@@ -161,7 +92,7 @@ class HistogramCuts {
return idx;
}

BinIdx SearchBin(Entry const& e) {
BinIdx SearchBin(Entry const& e) const {
return SearchBin(e.fvalue, e.index);
}
};
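Making SearchBin(Entry const&) const lets callers that hold the cuts only by const reference perform bin lookups. A stand-alone illustration of that const-correctness point, using a simplified CutsLike stand-in rather than xgboost's HistogramCuts:

```cpp
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Simplified stand-in: upper bin boundaries for a single feature.
class CutsLike {
 public:
  explicit CutsLike(std::vector<float> uppers) : uppers_(std::move(uppers)) {}
  // const-qualified: callable through const references and pointers.
  uint32_t SearchBin(float fvalue) const {
    auto it = std::upper_bound(uppers_.begin(), uppers_.end(), fvalue);
    return static_cast<uint32_t>(it == uppers_.end() ? uppers_.size() - 1
                                                     : it - uppers_.begin());
  }

 private:
  std::vector<float> uppers_;
};

uint32_t Lookup(const CutsLike& cuts, float v) {  // compiles only because SearchBin is const
  return cuts.SearchBin(v);
}

int main() {
  CutsLike cuts({0.5f, 1.5f, 3.0f});
  return Lookup(cuts, 1.0f) == 1 ? 0 : 1;  // 1.0 falls into the second bin
}
```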
@@ -261,8 +192,9 @@ size_t DeviceSketch(int device,

/*!
* \brief preprocessed global index matrix, in CSR format
* Transform floating values to integer index in histogram
* This is a global histogram index.
*
* Transforms floating-point values into integer bin indices. This is the global
* histogram index for the CPU histogram; on GPU, the ELLPACK page is used instead.
*/
struct GHistIndexMatrix {
/*! \brief row pointer to rows by element position */
@@ -606,17 +538,15 @@ class ParallelGHistBuilder {
*/
class GHistBuilder {
public:
// initialize builder
inline void Init(size_t nthread, uint32_t nbins) {
nthread_ = nthread;
nbins_ = nbins;
}
GHistBuilder() : nthread_{0}, nbins_{0} {}
GHistBuilder(size_t nthread, uint32_t nbins) : nthread_{nthread}, nbins_{nbins} {}

// construct a histogram via histogram aggregation
void BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow hist);
GHistRow hist,
bool isDense);
// same, with feature grouping
void BuildBlockHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
@@ -625,7 +555,7 @@ class GHistBuilder {
// construct a histogram via subtraction trick
void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent);

uint32_t GetNumBins() {
uint32_t GetNumBins() const {
return nbins_;
}
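GHistBuilder now takes its thread and bin counts in the constructor instead of a separate Init() call, so a builder is fully configured the moment it exists and GetNumBins() can be const. A small hypothetical stand-in (HistBuilderLike) showing the same construction pattern:

```cpp
#include <cstddef>
#include <cstdint>

// Stand-in showing the design change: all configuration is taken in the constructor,
// so there is no two-phase Init() step that callers could forget.
class HistBuilderLike {
 public:
  HistBuilderLike() : nthread_{0}, nbins_{0} {}
  HistBuilderLike(std::size_t nthread, uint32_t nbins) : nthread_{nthread}, nbins_{nbins} {}
  uint32_t GetNumBins() const { return nbins_; }

 private:
  std::size_t nthread_;
  uint32_t nbins_;
};

int main() {
  HistBuilderLike builder{4, 256};  // ready to use immediately, no Init() call required
  return builder.GetNumBins() == 256 ? 0 : 1;
}
```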
