Optimized ApplySplit and UpdatePredictCache functions on CPU #5244

Merged · 9 commits · Feb 29, 2020
5 changes: 3 additions & 2 deletions src/common/column_matrix.h
@@ -37,6 +37,7 @@ class Column {
   size_t Size() const { return len_; }
   uint32_t GetGlobalBinIdx(size_t idx) const { return index_base_ + index_[idx]; }
   uint32_t GetFeatureBinIdx(size_t idx) const { return index_[idx]; }
+  const uint32_t* GetFeatureBinIdxPtr() const { return index_; }
   // column.GetFeatureBinIdx(idx) + column.GetBaseIdx(idx) ==
   // column.GetGlobalBinIdx(idx)
   uint32_t GetBaseIdx() const { return index_base_; }
@@ -186,8 +187,8 @@ class ColumnMatrix {
 
   std::vector<size_t> feature_counts_;
   std::vector<ColumnType> type_;
-  SimpleArray<uint32_t> index_;  // index_: may store smaller integers; needs padding
-  SimpleArray<size_t> row_ind_;
+  std::vector<uint32_t> index_;  // index_: may store smaller integers; needs padding
+  std::vector<size_t> row_ind_;
   std::vector<ColumnBoundary> boundary_;
 
   // index_base_[fid]: least bin id for feature fid
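A note on the accessor added above: `GetFeatureBinIdxPtr()` exposes the raw index pointer so hot loops can scan a column without a per-element call. A minimal sketch of the invariant the in-source comment states, using a hypothetical `ToyColumn` stand-in rather than the real `Column` class:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for common::Column, only to illustrate the invariant:
// feature-local bin index + per-feature base offset == global bin index.
struct ToyColumn {
  uint32_t index_base_;          // least global bin id of this feature
  std::vector<uint32_t> index_;  // feature-local bin ids

  uint32_t GetGlobalBinIdx(size_t i) const { return index_base_ + index_[i]; }
  uint32_t GetFeatureBinIdx(size_t i) const { return index_[i]; }
  uint32_t GetBaseIdx() const { return index_base_; }
  const uint32_t* GetFeatureBinIdxPtr() const { return index_.data(); }
};

int main() {
  ToyColumn col{/*index_base_=*/16, /*index_=*/{0, 2, 1}};
  const uint32_t* raw = col.GetFeatureBinIdxPtr();
  for (size_t i = 0; i < col.index_.size(); ++i) {
    assert(col.GetFeatureBinIdx(i) + col.GetBaseIdx() == col.GetGlobalBinIdx(i));
    assert(raw[i] == col.GetFeatureBinIdx(i));  // raw pointer sees the same values
  }
  return 0;
}
```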
124 changes: 102 additions & 22 deletions src/common/hist_util.cc
@@ -707,43 +707,123 @@ void SubtractionHist(GHistRow dst, const GHistRow src1, const GHistRow src2,
   }
 }
 
-void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
-                             const RowSetCollection::Elem row_indices,
-                             const GHistIndexMatrix& gmat,
-                             GHistRow hist) {
-  const size_t* rid = row_indices.begin;
-  const size_t nrows = row_indices.Size();
-  const uint32_t* index = gmat.index.data();
-  const size_t* row_ptr = gmat.row_ptr.data();
-  const float* pgh = reinterpret_cast<const float*>(gpair.data());
-
-  double* hist_data = reinterpret_cast<double*>(hist.data());
-
-  const size_t cache_line_size = 64;
-  const size_t prefetch_offset = 10;
-  size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid);
-  no_prefetch_size = no_prefetch_size > nrows ? nrows : no_prefetch_size;
-
-  for (size_t i = 0; i < nrows; ++i) {
-    const size_t icol_start = row_ptr[rid[i]];
-    const size_t icol_end = row_ptr[rid[i]+1];
-
-    if (i < nrows - no_prefetch_size) {
-      PREFETCH_READ_T0(row_ptr + rid[i + prefetch_offset]);
-      PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]);
-    }
-
-    for (size_t j = icol_start; j < icol_end; ++j) {
-      const uint32_t idx_bin = 2*index[j];
-      const size_t idx_gh = 2*rid[i];
-
-      hist_data[idx_bin] += pgh[idx_gh];
-      hist_data[idx_bin+1] += pgh[idx_gh+1];
-    }
-  }
-}
+struct Prefetch {
+ public:
+  static constexpr size_t kCacheLineSize = 64;
+  static constexpr size_t kPrefetchOffset = 10;
+  static constexpr size_t kPrefetchStep =
+      kCacheLineSize / sizeof(decltype(GHistIndexMatrix::index)::value_type);
+
+ private:
+  static constexpr size_t kNoPrefetchSize =
+      kPrefetchOffset + kCacheLineSize /
+      sizeof(decltype(GHistIndexMatrix::row_ptr)::value_type);
+
+ public:
+  static size_t NoPrefetchSize(size_t rows) {
+    return std::min(rows, kNoPrefetchSize);
+  }
+};
+
+constexpr size_t Prefetch::kNoPrefetchSize;
+
+template<typename FPType, bool do_prefetch>
+void BuildHistDenseKernel(const size_t* rid, const float* pgh, const uint32_t* index,
+                          FPType* hist_data, size_t ibegin, size_t iend, size_t n_features) {
+  for (size_t i = ibegin; i < iend; ++i) {
+    const size_t icol_start = rid[i] * n_features;
+    const size_t idx_gh = 2*rid[i];
+
+    if (do_prefetch) {
+      const size_t icol_start_prefetch = rid[i + Prefetch::kPrefetchOffset] * n_features;
+
+      PREFETCH_READ_T0(pgh + 2*rid[i + Prefetch::kPrefetchOffset]);
+      for (size_t j = icol_start_prefetch; j < icol_start_prefetch + n_features;
+           j += Prefetch::kPrefetchStep) {
+        PREFETCH_READ_T0(index + j);
+      }
+    }
+
+    for (size_t j = icol_start; j < icol_start + n_features; ++j) {
+      const uint32_t idx_bin = 2 * index[j];
+
+      hist_data[idx_bin] += pgh[idx_gh];
+      hist_data[idx_bin+1] += pgh[idx_gh+1];
+    }
+  }
+}
+
+template<typename FPType, bool do_prefetch>
+void BuildHistSparseKernel(const size_t* rid, const float* pgh, const uint32_t* gradient_index,
+                           FPType* hist_data, const size_t* row_ptr, size_t ibegin, size_t iend) {
+  for (size_t i = ibegin; i < iend; ++i) {
+    const size_t icol_start = row_ptr[rid[i]];
+    const size_t icol_end = row_ptr[rid[i]+1];
+    const size_t idx_gh = 2*rid[i];
+
+    if (do_prefetch) {
+      const size_t icol_start_prftch = row_ptr[rid[i+Prefetch::kPrefetchOffset]];
+      const size_t icol_end_prefect = row_ptr[rid[i+Prefetch::kPrefetchOffset]+1];
+
+      PREFETCH_READ_T0(pgh + 2*rid[i + Prefetch::kPrefetchOffset]);
+      for (size_t j = icol_start_prftch; j < icol_end_prefect; j+=Prefetch::kPrefetchStep) {
+        PREFETCH_READ_T0(gradient_index + j);
+      }
+    }
+
+    for (size_t j = icol_start; j < icol_end; ++j) {
+      const uint32_t idx_bin = 2 * gradient_index[j];
+      hist_data[idx_bin] += pgh[idx_gh];
+      hist_data[idx_bin+1] += pgh[idx_gh+1];
+    }
+  }
+}
+
+template<typename FPType, bool do_prefetch>
+void BuildHistKernel(const size_t* rid, const float* pgh, const uint32_t* gradient_index,
+                     FPType* hist_data, const size_t* row_ptr, size_t ibegin, size_t iend,
+                     bool isDense) {
+  if (isDense) {
+    const size_t n_features = row_ptr[rid[0]+1] - row_ptr[rid[0]];
+    BuildHistDenseKernel<FPType, do_prefetch>(rid, pgh, gradient_index, hist_data,
+                                              ibegin, iend, n_features);
+  } else {
+    BuildHistSparseKernel<FPType, do_prefetch>(rid, pgh, gradient_index, hist_data, row_ptr,
+                                               ibegin, iend);
+  }
+}
+
+void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
+                             const RowSetCollection::Elem row_indices,
+                             const GHistIndexMatrix& gmat,
+                             GHistRow hist,
+                             bool isDense) {
+  const size_t* rid = row_indices.begin;
+  const size_t nrows = row_indices.Size();
+  const uint32_t* index = gmat.index.data();
+  const size_t* row_ptr = gmat.row_ptr.data();
+  const float* pgh = reinterpret_cast<const float*>(gpair.data());
+
+  using FPType = decltype(tree::GradStats::sum_grad);
+  FPType* hist_data = reinterpret_cast<FPType*>(hist.data());
+
+  size_t const no_prefetch_size = Prefetch::NoPrefetchSize(nrows);
+
+  // if need to work with all rows from bin-matrix (e.g. root node)
+  const bool contiguousBlock = (rid[row_indices.Size()-1] - rid[0]) == (row_indices.Size() - 1);
+
+  if (contiguousBlock) {
+    // contiguous memory access, built-in HW prefetching is enough
+    BuildHistKernel<FPType, false>(rid, pgh, index, hist_data, row_ptr,
+                                   0, nrows, isDense);
+  } else {
+    BuildHistKernel<FPType, true>(rid, pgh, index, hist_data, row_ptr,
+                                  0, nrows - no_prefetch_size, isDense);
+    // no prefetching to avoid loading extra memory
+    BuildHistKernel<FPType, false>(rid, pgh, index, hist_data, row_ptr,
+                                   nrows - no_prefetch_size, nrows, isDense);
+  }
+}
 
 void GHistBuilder::BuildBlockHist(const std::vector<GradientPair>& gpair,
                                   const RowSetCollection::Elem row_indices,
                                   const GHistIndexBlockMatrix& gmatb,
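The core trick in the new kernels is software prefetching: after a split, the row ids in `rid[]` arrive in arbitrary order, so the gradient pairs and index rows they touch are scattered and the hardware prefetcher cannot predict them. Issuing a prefetch `kPrefetchOffset` iterations ahead hides that latency, and the final `NoPrefetchSize(nrows)` iterations skip prefetching so `rid[i + offset]` never reads past the row set. The `contiguousBlock` check covers the other case: when the row set is a dense `[0, n)` range (e.g. the root node), access is already sequential and hardware prefetching suffices. A self-contained sketch of the same pattern — `__builtin_prefetch` (GCC/Clang) standing in for whatever the `PREFETCH_READ_T0` macro expands to, with invented data:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

constexpr size_t kPrefetchOffset = 10;  // same lookahead distance the PR uses

// Gather-accumulate with a prefetch window, mirroring BuildHist's split into
// a prefetching main loop plus a short tail with plain loads, so that
// rows[i + kPrefetchOffset] never indexes beyond the row set.
double GatherSum(const std::vector<size_t>& rows, const std::vector<double>& values) {
  const size_t n = rows.size();
  const size_t tail = std::min(n, kPrefetchOffset);  // iterations that must not prefetch
  double sum = 0.0;

  for (size_t i = 0; i + tail < n; ++i) {
    // Hint the cache about data needed ~10 iterations from now
    // (rw=0: read, locality=3: keep in all cache levels, like T0).
    __builtin_prefetch(&values[rows[i + kPrefetchOffset]], /*rw=*/0, /*locality=*/3);
    sum += values[rows[i]];
  }
  for (size_t i = (n > tail ? n - tail : 0); i < n; ++i) {
    sum += values[rows[i]];  // tail: plain loads only
  }
  return sum;
}
```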
88 changes: 9 additions & 79 deletions src/common/hist_util.h
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2017 by Contributors
+ * Copyright 2017-2020 by Contributors
  * \file hist_util.h
  * \brief Utility for fast histogram aggregation
  * \author Philip Cho, Tianqi Chen
@@ -25,75 +25,6 @@
 
 namespace xgboost {
 namespace common {
-
-/*
- * \brief A thin wrapper around dynamically allocated C-style array.
- * Make sure to call resize() before use.
- */
-template<typename T>
-struct SimpleArray {
-  ~SimpleArray() {
-    std::free(ptr_);
-    ptr_ = nullptr;
-  }
-
-  void resize(size_t n) {
-    T* ptr = static_cast<T*>(std::malloc(n * sizeof(T)));
-    CHECK(ptr) << "Failed to allocate memory";
-    if (ptr_) {
-      std::memcpy(ptr, ptr_, n_ * sizeof(T));
-      std::free(ptr_);
-    }
-    ptr_ = ptr;
-    n_ = n;
-  }
-
-  T& operator[](size_t idx) {
-    return ptr_[idx];
-  }
-
-  T& operator[](size_t idx) const {
-    return ptr_[idx];
-  }
-
-  size_t size() const {
-    return n_;
-  }
-
-  T back() const {
-    return ptr_[n_-1];
-  }
-
-  T* data() {
-    return ptr_;
-  }
-
-  const T* data() const {
-    return ptr_;
-  }
-
-
-  T* begin() {
-    return ptr_;
-  }
-
-  const T* begin() const {
-    return ptr_;
-  }
-
-  T* end() {
-    return ptr_ + n_;
-  }
-
-  const T* end() const {
-    return ptr_ + n_;
-  }
-
- private:
-  T* ptr_ = nullptr;
-  size_t n_ = 0;
-};
-
 /*!
  * \brief A single row in global histogram index.
  * Directly represent the global index in the histogram entry.
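With `ColumnMatrix` switched to `std::vector` (see column_matrix.h above), `SimpleArray` has no remaining users and can be deleted outright rather than fixed. The replacement is not just cosmetic: the deleted `resize()` memcpy'd `n_` old elements into the new `n`-element buffer, overrunning the allocation whenever the array shrinks. A small sketch of the `std::vector` surface the hot loops keep relying on:

```cpp
#include <cstdint>
#include <vector>

// std::vector offers the same interface the deleted wrapper provided
// (data(), size(), begin(), end(), back()) with correct growth and shrink.
int main() {
  std::vector<uint32_t> index;
  index.resize(8);               // safe to grow...
  index.resize(4);               // ...and, unlike SimpleArray, safe to shrink
  uint32_t* raw = index.data();  // raw-pointer access for hot loops, as before
  raw[0] = 42;
  (void)index.back();
  (void)*index.begin();
  return 0;
}
```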
@@ -267,8 +198,9 @@ size_t DeviceSketch(int device,
 
 /*!
  * \brief preprocessed global index matrix, in CSR format
- * Transform floating values to integer index in histogram
- * This is a global histogram index.
+ *
+ * Transform floating values to an integer index in the histogram. This is the global
+ * histogram index for the CPU hist method; on GPU the ELLPACK page is used instead.
  */
 struct GHistIndexMatrix {
   /*! \brief row pointer to rows by element position */
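Both new kernels in hist_util.cc walk this structure, so a worked micro-example of the CSR layout may help (values invented). In the dense case every row holds exactly `n_features` entries, making `row_ptr` implicit — row `i` starts at `i * n_features` — which is exactly the shortcut `BuildHistDenseKernel` takes:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// CSR micro-example with invented values. Three rows of binned entries:
//   row 0 -> bins {0, 5},  row 1 -> bins {1},  row 2 -> bins {0, 4, 6}
// row_ptr has n_rows + 1 entries; row i spans index[row_ptr[i] .. row_ptr[i+1]).
int main() {
  std::vector<std::size_t> row_ptr = {0, 2, 3, 6};
  std::vector<std::uint32_t> index = {0, 5, 1, 0, 4, 6};
  for (std::size_t r = 0; r + 1 < row_ptr.size(); ++r) {
    for (std::size_t j = row_ptr[r]; j < row_ptr[r + 1]; ++j) {
      std::uint32_t bin = index[j];  // the histogram bin hit by row r
      (void)bin;
    }
  }
  return 0;
}
```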
@@ -612,17 +544,15 @@ class ParallelGHistBuilder {
  */
 class GHistBuilder {
  public:
-  // initialize builder
-  inline void Init(size_t nthread, uint32_t nbins) {
-    nthread_ = nthread;
-    nbins_ = nbins;
-  }
+  GHistBuilder() : nthread_{0}, nbins_{0} {}
+  GHistBuilder(size_t nthread, uint32_t nbins) : nthread_{nthread}, nbins_{nbins} {}
 
   // construct a histogram via histogram aggregation
   void BuildHist(const std::vector<GradientPair>& gpair,
                  const RowSetCollection::Elem row_indices,
                  const GHistIndexMatrix& gmat,
-                 GHistRow hist);
+                 GHistRow hist,
+                 bool isDense);
   // same, with feature grouping
   void BuildBlockHist(const std::vector<GradientPair>& gpair,
                       const RowSetCollection::Elem row_indices,
@@ -631,7 +561,7 @@ class GHistBuilder {
   // construct a histogram via subtraction trick
   void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent);
 
-  uint32_t GetNumBins() {
+  uint32_t GetNumBins() const {
     return nbins_;
   }
 
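Taken together, the interface change means call sites construct the builder directly instead of default-constructing it and calling `Init()`, and must now say whether the index matrix is dense. A sketch of a migrated call site — the surrounding updater code is not part of this diff, so the function and variable names here are illustrative placeholders, and the usual xgboost headers are assumed:

```cpp
#include <vector>
#include "common/hist_util.h"  // GHistBuilder, GHistIndexMatrix, GHistRow
#include "common/row_set.h"    // RowSetCollection

using namespace xgboost;

// Hypothetical wrapper standing in for code inside the hist updater.
void UpdateNodeHist(const std::vector<GradientPair>& gpair,
                    const common::RowSetCollection::Elem row_set,
                    const common::GHistIndexMatrix& gmat,
                    common::GHistRow hist,
                    bool is_dense) {
  // Before: GHistBuilder builder; builder.Init(nthread, nbins);
  // After: construct directly with the same parameters.
  common::GHistBuilder builder(/*nthread=*/4, /*nbins=*/256);

  // BuildHist now takes isDense and dispatches internally to the dense
  // (fixed n_features stride) or sparse (row_ptr walk) kernel.
  builder.BuildHist(gpair, row_set, gmat, hist, is_dense);
}
```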