
Performance optimizations for CPUs [part 2] #4278

Closed
wants to merge 42 commits into from

42 commits
0ff8ada
Initial performance optimizations for xgboost
SmirnovEgorRu Dec 2, 2018
40c07c7
remove includes
SmirnovEgorRu Dec 2, 2018
c80f4bc
revert float->double
SmirnovEgorRu Dec 4, 2018
32e88bb
fix for CI
SmirnovEgorRu Dec 4, 2018
f6c44a6
fix for CI
SmirnovEgorRu Dec 4, 2018
c36127d
fix for CI
SmirnovEgorRu Dec 5, 2018
dd12944
fix for CI
SmirnovEgorRu Dec 5, 2018
416bf2f
fix for CI
SmirnovEgorRu Dec 5, 2018
c862fa8
fix for CI
SmirnovEgorRu Dec 5, 2018
1d59566
fix for CI
SmirnovEgorRu Dec 5, 2018
b7685df
fix for CI
SmirnovEgorRu Dec 5, 2018
e29229b
fix for CI
SmirnovEgorRu Dec 5, 2018
d59c386
fix for CI
SmirnovEgorRu Dec 5, 2018
4a0c9b3
Check existence of _mm_prefetch and __builtin_prefetch
hcho3 Jan 4, 2019
6c37c3f
Fix lint
hcho3 Jan 4, 2019
f49c7bc
Merge remote-tracking branch 'remotes/xgb_last/master'
SmirnovEgorRu Jan 31, 2019
7dac50c
Merge branch 'master' of https://github.com/dmlc/xgboost
SmirnovEgorRu Mar 9, 2019
c4c2d68
performance optimizations for hist method on CPU
SmirnovEgorRu Mar 19, 2019
5f95c5f
code refactoring
SmirnovEgorRu Mar 19, 2019
47edfa2
merge with master
SmirnovEgorRu Mar 19, 2019
0072003
tune
SmirnovEgorRu Mar 22, 2019
db107f9
optimizations for pre-processing
SmirnovEgorRu Mar 23, 2019
3706e05
quantiles building optimizations
SmirnovEgorRu Mar 24, 2019
87e7f5e
remove extra locks of mutexes
SmirnovEgorRu Mar 24, 2019
e519963
fix for GHistIndexMatrix::Init
SmirnovEgorRu Mar 26, 2019
d63bbec
correct sum for GHistIndexMatrix
SmirnovEgorRu Mar 26, 2019
e92d555
Merge commit '3706e05e4c0dad21bba7e4fc34b603fc9b8fd2b3' into test_fix
SmirnovEgorRu Mar 26, 2019
b54ff62
Merge commit '87e7f5eda86f60b90ae2fe93fd58b8f379d1ccae' into test_fix
SmirnovEgorRu Mar 26, 2019
2e8babb
fix in tests
SmirnovEgorRu Mar 28, 2019
46660c7
added forgotten files for split_evaluator
SmirnovEgorRu Mar 28, 2019
5b98ae0
clean code
SmirnovEgorRu Mar 28, 2019
7d04b64
merge with master
SmirnovEgorRu Mar 28, 2019
55a2ed1
optimizations for pre-processing
SmirnovEgorRu Mar 29, 2019
72709f1
code cleaning
SmirnovEgorRu Mar 29, 2019
3c88125
code cleaning
SmirnovEgorRu Mar 29, 2019
d96f41b
code clean up
SmirnovEgorRu Apr 2, 2019
dda11af
fix for gcc4.8
SmirnovEgorRu Apr 16, 2019
dd6a443
Merge branch 'master' of https://github.com/dmlc/xgboost into optimiz…
SmirnovEgorRu Apr 16, 2019
3f60fcd
Merge remote-tracking branch 'upstream/master' into optimizations
hcho3 May 1, 2019
08e83fb
Remove unneeded whitespace changes
hcho3 May 1, 2019
fa54eeb
removed redeclaration of MemStackAllocator
May 9, 2019
ffa9b29
remove extra init
May 9, 2019
4 changes: 2 additions & 2 deletions include/xgboost/base.h
@@ -124,8 +124,8 @@ class GradientPairInternal {
SetHess(g.GetHess());
}

-XGBOOST_DEVICE float GetGrad() const { return grad_; }
-XGBOOST_DEVICE float GetHess() const { return hess_; }
+XGBOOST_DEVICE inline float GetGrad() const { return grad_; }
+XGBOOST_DEVICE inline float GetHess() const { return hess_; }

XGBOOST_DEVICE GradientPairInternal<T> &operator+=(
const GradientPairInternal<T> &rhs) {
5 changes: 5 additions & 0 deletions src/common/column_matrix.h
@@ -8,6 +8,7 @@
#ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
#define XGBOOST_COMMON_COLUMN_MATRIX_H_

+#include <dmlc/timer.h>
#include <limits>
#include <vector>
#include "hist_util.h"
@@ -51,6 +52,10 @@ class Column {
}
const size_t* GetRowData() const { return row_ind_; }

+const uint32_t* GetIndex() const {
+  return index_;
+}
+
private:
ColumnType type_;
const uint32_t* index_;
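For orientation, a minimal sketch of the access pattern the new Column::GetIndex() accessor enables. The free function CountBins and its parameters are illustrative assumptions, not code from this PR; only the idea of reading a dense column's bin indices through a raw pointer mirrors the accessor above.

#include <cstddef>
#include <cstdint>

// Count how many rows fall into each bin of a dense column.
// `bin_index` stands in for the pointer returned by Column::GetIndex().
void CountBins(const std::uint32_t* bin_index, std::size_t n_rows,
               std::size_t* hit_count) {
  for (std::size_t row = 0; row < n_rows; ++row) {
    ++hit_count[bin_index[row]];  // direct lookup, no per-row search
  }
}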
188 changes: 3 additions & 185 deletions src/common/hist_util.cc
@@ -2,26 +2,18 @@
* Copyright 2017-2019 by Contributors
* \file hist_util.h
*/
#include "./hist_util.h"
#include <dmlc/timer.h>
#include <rabit/rabit.h>
#include <dmlc/omp.h>
#include <numeric>
#include <vector>

#include "./random.h"
#include "./column_matrix.h"
#include "./hist_util.h"
#include "./quantile.h"
#include "./../tree/updater_quantile_hist.h"

#if defined(XGBOOST_MM_PREFETCH_PRESENT)
#include <xmmintrin.h>
#define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0)
#elif defined(XGBOOST_BUILTIN_PREFETCH_PRESENT)
#define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast<const char*>(addr), 0, 3)
#else // no SW pre-fetching available; PREFETCH_READ_T0 is no-op
#define PREFETCH_READ_T0(addr) do {} while (0)
#endif // defined(XGBOOST_MM_PREFETCH_PRESENT)

namespace xgboost {
namespace common {
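The conditional block above selects a software-prefetch primitive for the compiler at hand. Below is a minimal, self-contained sketch of the pattern such a macro enables: issue a prefetch hint for an indirectly addressed element a fixed number of iterations before it is used. The function GatherSum and its containers are illustrative assumptions; only the lookahead distance of 10 mirrors the prefetch_offset used by BuildHist further down.

#include <cstddef>
#include <vector>

#if defined(__GNUC__)
#define PREFETCH_READ_T0(addr) __builtin_prefetch(addr, 0, 3)
#else
#define PREFETCH_READ_T0(addr) do {} while (0)
#endif

// Sum values gathered through an index array, prefetching the element that
// will be touched kOffset iterations ahead to hide cache-miss latency.
double GatherSum(const std::vector<std::size_t>& idx,
                 const std::vector<double>& values) {
  constexpr std::size_t kOffset = 10;  // lookahead distance, as in BuildHist
  double sum = 0.0;
  for (std::size_t i = 0; i < idx.size(); ++i) {
    if (i + kOffset < idx.size()) {
      PREFETCH_READ_T0(&values[idx[i + kOffset]]);
    }
    sum += values[idx[i]];
  }
  return sum;
}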

@@ -230,7 +222,6 @@ uint32_t HistCutMatrix::GetBinIdx(const Entry& e) {
void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) {
cut.Init(p_fmat, max_num_bins);
const int32_t nthread = omp_get_max_threads();
-// const int nthread = 1;
const uint32_t nbins = cut.row_ptr.back();
hit_count.resize(nbins, 0);
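// hit_count_tloc_ gives each thread a private copy of the bin counters, so
// rows can be counted in parallel without atomics and merged afterwards.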
hit_count_tloc_.resize(nthread * nbins, 0);
@@ -460,7 +451,7 @@ FastFeatureGrouping(const GHistIndexMatrix& gmat,
for (auto fid : group) {
nnz += feature_nnz[fid];
}
-double nnz_rate = static_cast<double>(nnz) / nrow;
+float nnz_rate = static_cast<float>(nnz) / nrow;
// break small sparse groups apart, since packing them will not gain speed
if (nnz_rate <= param.sparse_threshold) {
for (auto fid : group) {
@@ -545,178 +536,5 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
}
}

void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow hist) {
const size_t nthread = static_cast<size_t>(this->nthread_);
data_.resize(nbins_ * nthread_);
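// data_ holds one private histogram of nbins_ bins per thread for the
// parallel pass below.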

const size_t* rid = row_indices.begin;
const size_t nrows = row_indices.Size();
const uint32_t* index = gmat.index.data();
const size_t* row_ptr = gmat.row_ptr.data();
const float* pgh = reinterpret_cast<const float*>(gpair.data());

double* hist_data = reinterpret_cast<double*>(hist.data());
double* data = reinterpret_cast<double*>(data_.data());

const size_t block_size = 512;
size_t n_blocks = nrows/block_size;
n_blocks += !!(nrows - n_blocks*block_size);
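// rows are processed in 512-row blocks; the !! term rounds n_blocks up when
// a partial tail block remains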

const size_t nthread_to_process = std::min(nthread, n_blocks);
memset(thread_init_.data(), '\0', nthread_to_process*sizeof(size_t));

const size_t cache_line_size = 64;
const size_t prefetch_offset = 10;
size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid);
no_prefetch_size = no_prefetch_size > nrows ? nrows : no_prefetch_size;
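// The last no_prefetch_size rows have no valid lookahead target, so the loop
// below skips prefetch hints for them.
// Pass 1: each thread accumulates gradient/hessian pairs into its own local
// histogram (or directly into the output when a single thread suffices).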

#pragma omp parallel for num_threads(nthread_to_process) schedule(guided)
for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) {
dmlc::omp_uint tid = omp_get_thread_num();
double* data_local_hist = ((nthread_to_process == 1) ? hist_data :
reinterpret_cast<double*>(data_.data() + tid * nbins_));

if (!thread_init_[tid]) {
memset(data_local_hist, '\0', 2*nbins_*sizeof(double));
thread_init_[tid] = true;
}

const size_t istart = iblock*block_size;
const size_t iend = (((iblock+1)*block_size > nrows) ? nrows : istart + block_size);
for (size_t i = istart; i < iend; ++i) {
const size_t icol_start = row_ptr[rid[i]];
const size_t icol_end = row_ptr[rid[i]+1];

if (i < nrows - no_prefetch_size) {
PREFETCH_READ_T0(row_ptr + rid[i + prefetch_offset]);
PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]);
}

for (size_t j = icol_start; j < icol_end; ++j) {
const uint32_t idx_bin = 2*index[j];
const size_t idx_gh = 2*rid[i];

data_local_hist[idx_bin] += pgh[idx_gh];
data_local_hist[idx_bin+1] += pgh[idx_gh+1];
}
}
}
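// Pass 2: if several threads built partial histograms, reduce them into
// hist_data, parallelizing over blocks of bins.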

if (nthread_to_process > 1) {
const size_t size = (2*nbins_);
const size_t block_size = 1024;
size_t n_blocks = size/block_size;
n_blocks += !!(size - n_blocks*block_size);

size_t n_worked_bins = 0;
for (size_t i = 0; i < nthread_to_process; ++i) {
if (thread_init_[i]) {
thread_init_[n_worked_bins++] = i;
}
}

#pragma omp parallel for num_threads(std::min(nthread, n_blocks)) schedule(guided)
for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) {
const size_t istart = iblock * block_size;
const size_t iend = (((iblock + 1) * block_size > size) ? size : istart + block_size);

const size_t bin = 2 * thread_init_[0] * nbins_;
memcpy(hist_data + istart, (data + bin + istart), sizeof(double) * (iend - istart));

for (size_t i_bin_part = 1; i_bin_part < n_worked_bins; ++i_bin_part) {
const size_t bin = 2 * thread_init_[i_bin_part] * nbins_;
for (size_t i = istart; i < iend; i++) {
hist_data[i] += data[bin + i];
}
}
}
}
}

void GHistBuilder::BuildBlockHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexBlockMatrix& gmatb,
GHistRow hist) {
constexpr int kUnroll = 8; // loop unrolling factor
const size_t nblock = gmatb.GetNumBlock();
const size_t nrows = row_indices.end - row_indices.begin;
const size_t rest = nrows % kUnroll;

#if defined(_OPENMP)
const auto nthread = static_cast<bst_omp_uint>(this->nthread_); // NOLINT
#endif // defined(_OPENMP)
tree::GradStats* p_hist = hist.data();

#pragma omp parallel for num_threads(nthread) schedule(guided)
for (bst_omp_uint bid = 0; bid < nblock; ++bid) {
auto gmat = gmatb[bid];
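// Unrolled main loop: gather row ranges and gradient stats for kUnroll rows
// first, then apply them, so the CPU can overlap the independent memory
// accesses.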

for (size_t i = 0; i < nrows - rest; i += kUnroll) {
size_t rid[kUnroll];
size_t ibegin[kUnroll];
size_t iend[kUnroll];
GradientPair stat[kUnroll];

for (int k = 0; k < kUnroll; ++k) {
rid[k] = row_indices.begin[i + k];
ibegin[k] = gmat.row_ptr[rid[k]];
iend[k] = gmat.row_ptr[rid[k] + 1];
stat[k] = gpair[rid[k]];
}
for (int k = 0; k < kUnroll; ++k) {
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
const uint32_t bin = gmat.index[j];
p_hist[bin].Add(stat[k]);
}
}
}
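// Remainder loop: process the last nrows % kUnroll rows one at a time.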
for (size_t i = nrows - rest; i < nrows; ++i) {
const size_t rid = row_indices.begin[i];
const size_t ibegin = gmat.row_ptr[rid];
const size_t iend = gmat.row_ptr[rid + 1];
const GradientPair stat = gpair[rid];
for (size_t j = ibegin; j < iend; ++j) {
const uint32_t bin = gmat.index[j];
p_hist[bin].Add(stat);
}
}
}
}

void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
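// Subtraction trick: a node's histogram equals its parent's minus its
// sibling's, so `self` is filled as parent - sibling instead of re-scanning
// the rows.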
const uint32_t nbins = static_cast<bst_omp_uint>(nbins_);
constexpr int kUnroll = 8; // loop unrolling factor
const uint32_t rest = nbins % kUnroll;

#if defined(_OPENMP)
const auto nthread = static_cast<bst_omp_uint>(this->nthread_); // NOLINT
#endif // defined(_OPENMP)
tree::GradStats* p_self = self.data();
tree::GradStats* p_sibling = sibling.data();
tree::GradStats* p_parent = parent.data();

#pragma omp parallel for num_threads(nthread) schedule(static)
for (bst_omp_uint bin_id = 0;
bin_id < static_cast<bst_omp_uint>(nbins - rest); bin_id += kUnroll) {
tree::GradStats pb[kUnroll];
tree::GradStats sb[kUnroll];
for (int k = 0; k < kUnroll; ++k) {
pb[k] = p_parent[bin_id + k];
}
for (int k = 0; k < kUnroll; ++k) {
sb[k] = p_sibling[bin_id + k];
}
for (int k = 0; k < kUnroll; ++k) {
p_self[bin_id + k].SetSubstract(pb[k], sb[k]);
}
}
for (uint32_t bin_id = nbins - rest; bin_id < nbins; ++bin_id) {
p_self[bin_id].SetSubstract(p_parent[bin_id], p_sibling[bin_id]);
}
}

} // namespace common
} // namespace xgboost