
Optimizations of pre-processing for 'hist' tree method #4310

Merged (5 commits, Apr 17, 2019)
14 changes: 8 additions & 6 deletions src/common/column_matrix.h
@@ -71,7 +71,7 @@ class ColumnMatrix {
// construct column matrix from GHistIndexMatrix
inline void Init(const GHistIndexMatrix& gmat,
double sparse_threshold) {
-    const auto nfeature = static_cast<bst_uint>(gmat.cut.row_ptr.size() - 1);
+    const int32_t nfeature = static_cast<int32_t>(gmat.cut.row_ptr.size() - 1);
Collaborator:
@SmirnovEgorRu Why the change from bst_uint to int32_t?

Contributor Author:
Windows (MSVC) requires that the loop counter inside "#pragma omp parallel for" have a signed integer type.
Before, nfeature wasn't used in a parallel loop, but now it is, so it was changed from unsigned to signed.
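
(For context, a minimal sketch of the MSVC constraint; this snippet is illustrative and not part of the patch. MSVC implements OpenMP 2.0, which rejects unsigned loop counters:)

#include <cstdint>

void Example(int32_t nfeature) {
#pragma omp parallel for
  for (int32_t fid = 0; fid < nfeature; ++fid) {  // OK: signed counter
    // per-feature work
  }
  // With "for (uint32_t fid = 0; ...)" MSVC fails with error C3016:
  // "index variable in OpenMP 'for' statement must have signed integral type".
}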

const size_t nrow = gmat.row_ptr.size() - 1;

// identify type of each column
@@ -86,7 +86,7 @@ class ColumnMatrix {

gmat.GetFeatureCounts(&feature_counts_[0]);
// classify features
-    for (bst_uint fid = 0; fid < nfeature; ++fid) {
+    for (int32_t fid = 0; fid < nfeature; ++fid) {
if (static_cast<double>(feature_counts_[fid])
< sparse_threshold * nrow) {
type_[fid] = kSparseColumn;
@@ -100,7 +100,7 @@ class ColumnMatrix {
boundary_.resize(nfeature);
size_t accum_index_ = 0;
size_t accum_row_ind_ = 0;
-    for (bst_uint fid = 0; fid < nfeature; ++fid) {
+    for (int32_t fid = 0; fid < nfeature; ++fid) {
boundary_[fid].index_begin = accum_index_;
boundary_[fid].row_ind_begin = accum_row_ind_;
if (type_[fid] == kDenseColumn) {
@@ -124,7 +124,9 @@ class ColumnMatrix {
}

// pre-fill index_ for dense columns
-    for (bst_uint fid = 0; fid < nfeature; ++fid) {
+
+#pragma omp parallel for
+    for (int32_t fid = 0; fid < nfeature; ++fid) {
if (type_[fid] == kDenseColumn) {
const size_t ibegin = boundary_[fid].index_begin;
uint32_t* begin = &index_[ibegin];
@@ -184,8 +186,8 @@ class ColumnMatrix {

std::vector<size_t> feature_counts_;
std::vector<ColumnType> type_;
-  std::vector<uint32_t> index_;  // index_: may store smaller integers; needs padding
-  std::vector<size_t> row_ind_;
+  SimpleArray<uint32_t> index_;  // index_: may store smaller integers; needs padding
+  SimpleArray<size_t> row_ind_;
std::vector<ColumnBoundary> boundary_;

// index_base_[fid]: least bin id for feature fid
170 changes: 134 additions & 36 deletions src/common/hist_util.cc
@@ -11,6 +11,7 @@
#include "./column_matrix.h"
#include "./hist_util.h"
#include "./quantile.h"
#include "./../tree/updater_quantile_hist.h"

#if defined(XGBOOST_MM_PREFETCH_PRESENT)
#include <xmmintrin.h>
@@ -49,7 +50,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
constexpr int kFactor = 8;
std::vector<WXQSketch> sketchs;

-  const int nthread = omp_get_max_threads();
+  const size_t nthread = omp_get_max_threads();

unsigned const nstep =
static_cast<unsigned>((info.num_col_ + nthread - 1) / nthread);
@@ -67,34 +68,85 @@
// Use group index for weights?
bool const use_group_ind = num_groups != 0 && weights.size() != info.num_row_;

-  for (const auto &batch : p_fmat->GetRowBatches()) {
-    size_t group_ind = 0;
-    if (use_group_ind) {
-      group_ind = this->SearchGroupIndFromBaseRow(group_ptr, batch.base_rowid);
+  if (use_group_ind) {
Member:
@hcho3 We need to change the name of these Init functions.

+    for (const auto &batch : p_fmat->GetRowBatches()) {
+      size_t group_ind = this->SearchGroupIndFromBaseRow(group_ptr, batch.base_rowid);
+#pragma omp parallel num_threads(nthread) firstprivate(group_ind, use_group_ind)
+      {
+        CHECK_EQ(nthread, omp_get_num_threads());
+        auto tid = static_cast<unsigned>(omp_get_thread_num());
+        unsigned begin = std::min(nstep * tid, ncol);
+        unsigned end = std::min(nstep * (tid + 1), ncol);
+
+        // do not iterate if no columns are assigned to the thread
+        if (begin < end && end <= ncol) {
+          for (size_t i = 0; i < batch.Size(); ++i) { // NOLINT(*)
+            size_t const ridx = batch.base_rowid + i;
+            SparsePage::Inst const inst = batch[i];
+            if (group_ptr[group_ind] == ridx &&
+                // maximum equals to weights.size() - 1
+                group_ind < num_groups - 1) {
+              // move to next group
+              group_ind++;
+            }
+            for (auto const& entry : inst) {
+              if (entry.index >= begin && entry.index < end) {
+                size_t w_idx = group_ind;
+                sketchs[entry.index].Push(entry.fvalue, info.GetWeight(w_idx));
+              }
+            }
+          }
+        }
+      }
+    }
-#pragma omp parallel num_threads(nthread) firstprivate(group_ind, use_group_ind)
-    {
-      CHECK_EQ(nthread, omp_get_num_threads());
-      auto tid = static_cast<unsigned>(omp_get_thread_num());
-      unsigned begin = std::min(nstep * tid, ncol);
-      unsigned end = std::min(nstep * (tid + 1), ncol);
-
-      // do not iterate if no columns are assigned to the thread
-      if (begin < end && end <= ncol) {
-        for (size_t i = 0; i < batch.Size(); ++i) { // NOLINT(*)
-          size_t const ridx = batch.base_rowid + i;
-          SparsePage::Inst const inst = batch[i];
-          if (use_group_ind &&
-              group_ptr[group_ind] == ridx &&
-              // maximum equals to weights.size() - 1
-              group_ind < num_groups - 1) {
-            // move to next group
-            group_ind++;
-          }
+  } else {
+    for (const auto &batch : p_fmat->GetRowBatches()) {
+      const size_t size = batch.Size();
+      const size_t block_size = 512;
Collaborator:
What's the rule for choosing this number (512)? Is this number hardware-dependent?

Contributor Author:
No, but this value influences the size of the allocated buffer: it should not be too large on the one hand, and not too small on the other (the useful work of an OMP task should outweigh the OMP overhead).

By experiment, 512 was chosen as optimal.
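
(A back-of-envelope illustration of the trade-off; the thread and column counts below are assumptions, not figures from the PR:)

#include <cstddef>
#include <utility>

// Each thread buffers up to block_size entries per column as (fvalue, weight)
// pairs, so the footprint is nthread * ncol * block_size * sizeof(pair):
constexpr size_t kNThread = 16, kNCol = 128, kBlockSize = 512;
constexpr size_t kBuffBytes =
    kNThread * kNCol * kBlockSize * sizeof(std::pair<float, float>);
// kBuffBytes == 8 MiB total (512 KiB per thread). Growing block_size scales
// this linearly; shrinking it leaves too little work per OMP task relative
// to the scheduling overhead.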

+      const size_t block_size_iter = block_size * nthread;
+      const size_t n_blocks = size / block_size_iter + !!(size % block_size_iter);
+
+      std::vector<std::vector<std::pair<float, float>>> buff(nthread);
Member:
Maybe add a note like "buffer for sketches, to increase cache usage", or whatever the actual reason is? It's sometimes easy to forget the pain we had as newcomers. ;-)

Contributor Author:
The buffers are not for better cache locality.
They are needed so that each thread can collect its local feature values and then push them to the global queue without conflicts.
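
(A distilled, self-contained sketch of the collect-then-merge pattern the author describes; Sketch, PushAll, and the container layout are illustrative stand-ins, not the patch code:)

#include <omp.h>
#include <cstdint>
#include <utility>
#include <vector>

struct Sketch {  // stand-in for the real quantile sketch type
  std::vector<std::pair<float, float>> data;
  void Push(float value, float weight) { data.emplace_back(value, weight); }
};

// Phase 1: every thread buckets its rows into thread-local per-column buffers,
// so no synchronization is needed while collecting. Phase 2: threads divide
// the work by column, so each column's sketch is touched by exactly one thread.
void PushAll(const std::vector<std::vector<std::pair<int, float>>>& rows,
             std::vector<Sketch>* sketchs, int nthread, int ncol) {
  std::vector<std::vector<std::vector<float>>> local(
      nthread, std::vector<std::vector<float>>(ncol));
#pragma omp parallel num_threads(nthread)
  {
    const int tid = omp_get_thread_num();
#pragma omp for
    for (int64_t i = 0; i < static_cast<int64_t>(rows.size()); ++i) {
      for (const auto& e : rows[i]) {             // e = {column, value}
        local[tid][e.first].push_back(e.second);  // thread-local: no conflicts
      }
    }                                             // implicit barrier here
#pragma omp for
    for (int icol = 0; icol < ncol; ++icol) {     // each column: one thread
      for (int t = 0; t < nthread; ++t) {
        for (float v : local[t][icol]) (*sketchs)[icol].Push(v, 1.0f);
      }
    }
  }
}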

+      for (size_t tid = 0; tid < nthread; ++tid) {
+        buff[tid].resize(block_size * ncol);
+      }
+
+      std::vector<size_t> sizes(nthread * ncol, 0);
+
+      for (size_t iblock = 0; iblock < n_blocks; ++iblock) {
+#pragma omp parallel num_threads(nthread)
+        {
+          int tid = omp_get_thread_num();
+
+          const size_t ibegin = iblock * block_size_iter + tid * block_size;
+          const size_t iend = std::min(ibegin + block_size, size);
+
+          auto* p_sizes = sizes.data() + ncol * tid;
+          auto* p_buff = buff[tid].data();
+
+          for (size_t i = ibegin; i < iend; ++i) {
+            size_t const ridx = batch.base_rowid + i;
+            bst_float w = info.GetWeight(ridx);
+            SparsePage::Inst const inst = batch[i];
+
+            for (auto const& entry : inst) {
+              const size_t idx = entry.index;
+              p_buff[idx * block_size + p_sizes[idx]] = { entry.fvalue, w };
+              p_sizes[idx]++;
+            }
+          }
-          for (auto const& entry : inst) {
-            if (entry.index >= begin && entry.index < end) {
-              size_t w_idx = use_group_ind ? group_ind : ridx;
-              sketchs[entry.index].Push(entry.fvalue, info.GetWeight(w_idx));
+#pragma omp barrier
+#pragma omp for schedule(static)
+          for (int32_t icol = 0; icol < static_cast<int32_t>(ncol); ++icol) {
Member:
The number of columns is usually small, ~3000 is quite an extreme case, ~120 might be the more common case. Is it preferred to use OMP here?

Contributor Author:
OMP must be here :)
Each OMP task will push around 512*nthreads/ncols elements, which is enough to benefit from OMP. Also, the OMP threads have already been created on line 117.

+            for (size_t tid = 0; tid < nthread; ++tid) {
+              auto* p_sizes = sizes.data() + ncol * tid;
+              auto* p_buff = buff[tid].data() + icol * block_size;
+
+              for (size_t i = 0; i < p_sizes[icol]; ++i) {
+                sketchs[icol].Push(p_buff[i].first, p_buff[i].second);
+              }
+
+              p_sizes[icol] = 0;
+            }
+          }
+        }
+      }
@@ -177,22 +229,66 @@ uint32_t HistCutMatrix::GetBinIdx(const Entry& e) {

void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) {
cut.Init(p_fmat, max_num_bins);

-  const int nthread = omp_get_max_threads();
+  const int32_t nthread = omp_get_max_threads();
+  // const int nthread = 1;
const uint32_t nbins = cut.row_ptr.back();
hit_count.resize(nbins, 0);
hit_count_tloc_.resize(nthread * nbins, 0);

-  row_ptr.push_back(0);
+  size_t new_size = 1;

  for (const auto &batch : p_fmat->GetRowBatches()) {
-    const size_t rbegin = row_ptr.size() - 1;
-    for (size_t i = 0; i < batch.Size(); ++i) {
-      row_ptr.push_back(batch[i].size() + row_ptr.back());
+    new_size += batch.Size();
  }

+  row_ptr.resize(new_size);
+  row_ptr[0] = 0;
+
+  size_t rbegin = 0;
+  size_t prev_sum = 0;
+
+  for (const auto &batch : p_fmat->GetRowBatches()) {
Collaborator:
Note to @trivialfis, @CodingCat: we are not really sure if 'hist' method works when external memory is enabled (i.e. GetRowBatches() returns more than one batch). We will need to test 'hist' against external memory in a future pull request.

Member:
@hcho3 Absolutely agree.

Member:
AFAICT, it doesn't work in the distributed setting, even after @trivialfis's fix in 0.82.

+    MemStackAllocator<size_t, 128> partial_sums(nthread);
+    size_t* p_part = partial_sums.Get();
+
+    size_t block_size = batch.Size() / nthread;
+
+#pragma omp parallel num_threads(nthread)
+    {
+#pragma omp for
+      for (int32_t tid = 0; tid < nthread; ++tid) {
+        size_t ibegin = block_size * tid;
+        size_t iend = (tid == (nthread-1) ? batch.Size() : (block_size * (tid+1)));
+
+        size_t sum = 0;
+        for (size_t i = ibegin; i < iend; ++i) {
+          sum += batch[i].size();
+          row_ptr[rbegin + 1 + i] = sum;
+        }
+      }
+
+#pragma omp single
+      {
+        p_part[0] = prev_sum;
+        for (int32_t i = 1; i < nthread; ++i) {
+          p_part[i] = p_part[i - 1] + row_ptr[rbegin + i*block_size];
+        }
+      }
+
+#pragma omp for
+      for (int32_t tid = 0; tid < nthread; ++tid) {
+        size_t ibegin = block_size * tid;
+        size_t iend = (tid == (nthread-1) ? batch.Size() : (block_size * (tid+1)));
+
+        for (size_t i = ibegin; i < iend; ++i) {
+          row_ptr[rbegin + 1 + i] += p_part[tid];
+        }
+      }
+    }
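
(The block above is a classic three-phase parallel prefix sum over per-row counts. A self-contained sketch of the same scheme, with illustrative names:)

#include <omp.h>
#include <cstddef>
#include <vector>

// Three-phase parallel exclusive scan: (1) each thread scans its own block,
// (2) one thread turns the per-block totals into block offsets, (3) each
// thread adds its block's offset back in.
std::vector<size_t> ParallelScan(const std::vector<size_t>& counts, int nthread) {
  const size_t n = counts.size();
  std::vector<size_t> out(n + 1, 0);
  std::vector<size_t> part(nthread, 0);
  const size_t block = n / nthread;
#pragma omp parallel num_threads(nthread)
  {
    const int tid = omp_get_thread_num();
    const size_t ibegin = block * tid;
    const size_t iend = (tid == nthread - 1) ? n : block * (tid + 1);
    size_t sum = 0;
    for (size_t i = ibegin; i < iend; ++i) {   // (1) local inclusive scan
      sum += counts[i];
      out[i + 1] = sum;
    }
#pragma omp barrier
#pragma omp single
    {
      for (int t = 1; t < nthread; ++t) {      // (2) offsets of each block
        part[t] = part[t - 1] + out[t * block];
      }
    }                                          // implicit barrier after single
    for (size_t i = ibegin; i < iend; ++i) {   // (3) shift by block offset
      out[i + 1] += part[tid];
    }
  }
  return out;
}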

index.resize(row_ptr.back());

CHECK_GT(cut.cut.size(), 0U);
-  CHECK_EQ(cut.row_ptr.back(), cut.cut.size());
Collaborator:
@SmirnovEgorRu Why are we removing this line?

Contributor Author:
Now row_ptr is allocated to its final length from the start. So if we have many batches, back() for a non-final batch will return garbage.


auto bsize = static_cast<omp_ulong>(batch.Size());
#pragma omp parallel for num_threads(nthread) schedule(static)
@@ -203,7 +299,6 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) {
SparsePage::Inst inst = batch[i];

CHECK_EQ(ibegin + inst.size(), iend);

for (bst_uint j = 0; j < inst.size(); ++j) {
uint32_t idx = cut.GetBinIdx(inst[j]);

@@ -215,10 +310,13 @@

#pragma omp parallel for num_threads(nthread) schedule(static)
for (bst_omp_uint idx = 0; idx < bst_omp_uint(nbins); ++idx) {
-    for (int tid = 0; tid < nthread; ++tid) {
+    for (size_t tid = 0; tid < nthread; ++tid) {
hit_count[idx] += hit_count_tloc_[tid * nbins + idx];
}
}

+    prev_sum = row_ptr[rbegin + batch.Size()];
+    rbegin += batch.Size();
}
}

68 changes: 68 additions & 0 deletions src/common/hist_util.h
@@ -19,6 +19,74 @@
namespace xgboost {
namespace common {

+/*
+ * \brief A thin wrapper around dynamically allocated C-style array.
+ * Make sure to call resize() before use.
+ */
+template<typename T>
+struct SimpleArray {
+  ~SimpleArray() {
+    free(ptr_);
+    ptr_ = nullptr;
+  }
+
+  void resize(size_t n) {
+    T* ptr = static_cast<T*>(malloc(n*sizeof(T)));
+    memcpy(ptr, ptr_, n_ * sizeof(T));
+    free(ptr_);
+    ptr_ = ptr;
+    n_ = n;
+  }
+
+  T& operator[](size_t idx) {
Collaborator:
What is the advantage of using SimpleArray over std::vector? We still have one extra indirection each time we index the array.

Member:

> What is the advantage of using SimpleArray over std::vector? We still have one extra indirection each time we index the array.

@SmirnovEgorRu I'm wondering about this too. SimpleArray doesn't use a dynamic table to amortize allocation/deallocation time; can you add a note about under what size it's more efficient than a dynamic table, or whether it's for some other reason?

Contributor Author:
I replaced std::vector with SimpleArray because std::vector has a slow resize(): it fills the memory in a non-performant manner.
Amortization of allocation/deallocation is not needed here, because I call resize() only once.
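
(A sketch of the cost being avoided; ResizeCost is an illustrative name, not library code:)

#include <cstdint>
#include <cstdlib>
#include <vector>

// std::vector<T>::resize(n) on an empty vector value-initializes all n
// elements - a sequential, memset-like pass over the whole allocation -
// while a raw malloc hands back the block without touching it.
void ResizeCost(size_t n) {
  std::vector<uint32_t> v;
  v.resize(n);  // allocates and zero-fills all n elements

  auto* p = static_cast<uint32_t*>(malloc(n * sizeof(uint32_t)));
  // contents of p are indeterminate: no write has happened yet
  free(p);
}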

+    return ptr_[idx];
+  }
+
+  T& operator[](size_t idx) const {
+    return ptr_[idx];
+  }
+
+  size_t size() const {
+    return n_;
+  }
+
+  T back() const {
+    return ptr_[n_-1];
+  }
+
+  T* data() {
+    return ptr_;
+  }
+
+  const T* data() const {
+    return ptr_;
+  }
+
+  T* begin() {
+    return ptr_;
+  }
+
+  const T* begin() const {
+    return ptr_;
+  }
+
+  T* end() {
+    return ptr_ + n_;
+  }
+
+  const T* end() const {
+    return ptr_ + n_;
+  }
+
+ private:
+  T* ptr_ = nullptr;
+  size_t n_ = 0;
+};
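
(A minimal usage sketch under the intended size-once pattern; FillExample is illustrative:)

#include <cstddef>
#include <cstdint>

// Size once, then index. resize() is not meant to be called repeatedly,
// since every call reallocates and copies the old contents.
void FillExample(size_t n) {
  SimpleArray<uint32_t> index;
  index.resize(n);  // single allocation, no value-initialization
  for (size_t i = 0; i < index.size(); ++i) {
    index[i] = static_cast<uint32_t>(i);
  }
}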




/*! \brief Cut configuration for all the features. */
struct HistCutMatrix {
/*! \brief Unit pointer to rows by element position */
1 change: 0 additions & 1 deletion src/common/row_set.h
@@ -59,7 +59,6 @@ class RowSetCollection {
}
// clear up things
inline void Clear() {
-    row_indices_.clear();
Collaborator:
I think we still need this line. Clearing a RowSetCollection should also clear all row indices stored within.

Contributor Author:
I did some measurements with all my optimizations: the InitData() function spends ~15% of tree-building time.
This block of code accounts for essentially all of it:

for (size_t i = 0; i < info.num_row_; ++i) {
  if (gpair[i].GetHess() >= 0.0f) {
    row_indices.push_back(i);
  }
}

There are 2 reasons: std::vector is not a performance-friendly object - push_back contains at least one "if" to check the current size. Also, this code can't be parallelized; it is sequential.

I have changed this code to use a C-style partition:

j = 0;
for (size_t i = 0; i < info.num_row_; ++i) {
  if (gpair[i].GetHess() >= 0.0f) {
    p_row_indices[j++] = i;
  }
}

But in this case I need to call row_indices_->resize(num_row) each time after row_indices_->clear(). And resize(num_rows) is a very expensive operation when the current size is 0: it fills the data with the default value (in our case, 0) in a non-performant manner. So I deleted clear() and call resize(nrows) only once.

After my changes InitData() spends less than 1% now.

P.S. std::vector - really not performance friendly :)

elem_of_each_node_.clear();
}
// initialize node id 0->everything