Fix memory usage of device sketching (#5407)

RAMitchell committed Mar 14, 2020
1 parent bb8c8df commit b745b7a
Showing 13 changed files with 153 additions and 73 deletions.
19 changes: 5 additions & 14 deletions include/xgboost/data.h
@@ -168,24 +168,15 @@ struct BatchParam {
   /*! \brief The GPU device to use. */
   int gpu_id;
   /*! \brief Maximum number of bins per feature for histograms. */
-  int max_bin { 0 };
-  /*! \brief Number of rows in a GPU batch, used for finding quantiles on GPU. */
-  int gpu_batch_nrows;
+  int max_bin{0};
   /*! \brief Page size for external memory mode. */
   size_t gpu_page_size;
   BatchParam() = default;
-  BatchParam(int32_t device, int32_t max_bin, int32_t gpu_batch_nrows,
-             size_t gpu_page_size = 0) :
-      gpu_id{device},
-      max_bin{max_bin},
-      gpu_batch_nrows{gpu_batch_nrows},
-      gpu_page_size{gpu_page_size}
-  {}
+  BatchParam(int32_t device, int32_t max_bin, size_t gpu_page_size = 0)
+      : gpu_id{device}, max_bin{max_bin}, gpu_page_size{gpu_page_size} {}
   inline bool operator!=(const BatchParam& other) const {
-    return gpu_id != other.gpu_id ||
-           max_bin != other.max_bin ||
-           gpu_batch_nrows != other.gpu_batch_nrows ||
-           gpu_page_size != other.gpu_page_size;
+    return gpu_id != other.gpu_id || max_bin != other.max_bin ||
+           gpu_page_size != other.gpu_page_size;
   }
 };

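With gpu_batch_nrows gone from BatchParam, call sites pass one argument fewer. A minimal before/after sketch (MakeParam is a hypothetical helper, not part of the commit):

    #include "xgboost/data.h"

    xgboost::BatchParam MakeParam(int device, int max_bin, size_t gpu_page_size) {
      // old (removed): BatchParam{device, max_bin, gpu_batch_nrows, gpu_page_size}
      return xgboost::BatchParam{device, max_bin, gpu_page_size};
    }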
8 changes: 7 additions & 1 deletion src/common/device_helpers.cuh
@@ -378,6 +378,11 @@ public:
   {
     return stats_.peak_allocated_bytes;
   }
+  void Clear()
+  {
+    stats_ = DeviceStats();
+  }
+
   void Log() {
     if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
       return;
@@ -475,7 +480,8 @@ struct XGBCachingDeviceAllocatorImpl : thrust::device_malloc_allocator<T> {
 template <typename T>
 using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl<T>;
 /*! Be careful that the initialization constructor is a no-op, which means calling
- * `vec.resize(n, 1)` won't initialize the memory region to 1. */
+ * `vec.resize(n)` won't initialize the memory region to 0. Instead use
+ * `vec.resize(n, 0)`*/
 template <typename T>
 using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl<T>;
 /** \brief Specialisation of thrust device vector using custom allocator. */
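The sketching buffers in hist_util.cu below move onto this caching allocator, so the amended comment matters in practice. A minimal sketch of the pitfall, using the vector alias from this header (the function itself is hypothetical):

    void ResizeExample() {
      dh::caching_device_vector<float> stale;
      stale.resize(100);        // construct() is a no-op: recycled pool bytes remain
      dh::caching_device_vector<float> zeroed;
      zeroed.resize(100, 0.0f); // explicit fill value writes every new element
    }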
66 changes: 41 additions & 25 deletions src/common/hist_util.cu
@@ -97,6 +97,19 @@ struct EntryCompareOp {
   }
 };
 
+// Compute number of sample cuts needed on local node to maintain accuracy
+// We take more cuts than needed and then reduce them later
+size_t RequiredSampleCuts(int max_bins, size_t num_rows) {
+  constexpr int kFactor = 8;
+  double eps = 1.0 / (kFactor * max_bins);
+  size_t dummy_nlevel;
+  size_t num_cuts;
+  WQuantileSketch<bst_float, bst_float>::LimitSizeLevel(
+      num_rows, eps, &dummy_nlevel, &num_cuts);
+  return std::min(num_cuts, num_rows);
+}
+
+
 // Count the entries in each column and exclusive scan
 void GetColumnSizesScan(int device,
                         dh::caching_device_vector<size_t>* column_sizes_scan,
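To make the sizing concrete: with max_bins = 256 this gives eps = 1/(8*256) ≈ 4.9e-4, and LimitSizeLevel then requests on the order of 2/eps sample cuts per feature, capped at num_rows. A hedged sketch, assuming the declarations above are in scope (the exact count depends on LimitSizeLevel's internals):

    size_t per_feature = RequiredSampleCuts(/*max_bins=*/256, /*num_rows=*/10000000);
    // eps = 1/2048, so per_feature lands near 2/eps + 1 = 4097, far below num_rows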
@@ -210,7 +223,7 @@ void ProcessBatch(int device, const SparsePage& page, size_t begin, size_t end,
                   size_t num_columns) {
   dh::XGBCachingDeviceAllocator<char> alloc;
   const auto& host_data = page.data.ConstHostVector();
-  dh::device_vector<Entry> sorted_entries(host_data.begin() + begin,
+  dh::caching_device_vector<Entry> sorted_entries(host_data.begin() + begin,
                                           host_data.begin() + end);
   thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
                sorted_entries.end(), EntryCompareOp());
@@ -237,11 +250,11 @@ void ProcessWeightedBatch(int device, const SparsePage& page,
                           size_t num_columns) {
   dh::XGBCachingDeviceAllocator<char> alloc;
   const auto& host_data = page.data.ConstHostVector();
-  dh::device_vector<Entry> sorted_entries(host_data.begin() + begin,
+  dh::caching_device_vector<Entry> sorted_entries(host_data.begin() + begin,
                                           host_data.begin() + end);
 
   // Binary search to assign weights to each element
-  dh::device_vector<float> temp_weights(sorted_entries.size());
+  dh::caching_device_vector<float> temp_weights(sorted_entries.size());
   auto d_temp_weights = temp_weights.data().get();
   page.offset.SetDevice(device);
   auto row_ptrs = page.offset.ConstDeviceSpan();
@@ -288,28 +301,29 @@

 HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
                            size_t sketch_batch_num_elements) {
+  // Configure batch size based on available memory
+  bool has_weights = dmat->Info().weights_.Size() > 0;
+  size_t num_cuts = RequiredSampleCuts(max_bins, dmat->Info().num_row_);
+  if (sketch_batch_num_elements == 0) {
+    int bytes_per_element = has_weights ? 24 : 16;
+    size_t bytes_cuts = num_cuts * dmat->Info().num_col_ * sizeof(SketchEntry);
+    // use up to 80% of available space
+    sketch_batch_num_elements =
+        (dh::AvailableMemory(device) - bytes_cuts) * 0.8 / bytes_per_element;
+  }
+
   HistogramCuts cuts;
   DenseCuts dense_cuts(&cuts);
   SketchContainer sketch_container(max_bins, dmat->Info().num_col_,
                                    dmat->Info().num_row_);
 
-  constexpr int kFactor = 8;
-  double eps = 1.0 / (kFactor * max_bins);
-  size_t dummy_nlevel;
-  size_t num_cuts;
-  WQuantileSketch<bst_float, bst_float>::LimitSizeLevel(
-      dmat->Info().num_row_, eps, &dummy_nlevel, &num_cuts);
-  num_cuts = std::min(num_cuts, dmat->Info().num_row_);
-  if (sketch_batch_num_elements == 0) {
-    sketch_batch_num_elements = dmat->Info().num_nonzero_;
-  }
   dmat->Info().weights_.SetDevice(device);
   for (const auto& batch : dmat->GetBatches<SparsePage>()) {
     size_t batch_nnz = batch.data.Size();
     for (auto begin = 0ull; begin < batch_nnz;
          begin += sketch_batch_num_elements) {
       size_t end = std::min(batch_nnz, size_t(begin + sketch_batch_num_elements));
-      if (dmat->Info().weights_.Size() > 0) {
+      if (has_weights) {
         ProcessWeightedBatch(
             device, batch, dmat->Info().weights_.ConstDeviceSpan(), begin, end,
             &sketch_container, num_cuts, dmat->Info().num_col_);
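A hedged worked example of the new budget, assuming sizeof(SketchEntry) == 16 and sizeof(Entry) == 8 (a sorted copy plus sort scratch plausibly accounts for the 16 bytes per element, and the extra weight buffers for the 24):

    // Illustrative: ~15 GB free, 100 columns, 256 bins, unweighted rows.
    // bytes_cuts ≈ 4097 * 100 * 16 ≈ 6.6 MB   (negligible next to free memory)
    // batch size ≈ (15e9 - 6.6e6) * 0.8 / 16 ≈ 750M entries per batch,
    // rather than a fixed 10M-element default or all of num_nonzero_ at once.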
@@ -369,6 +383,7 @@ void ProcessBatch(AdapterT* adapter, size_t begin, size_t end, float missing,
   // Work out how many valid entries we have in each column
   dh::caching_device_vector<size_t> column_sizes_scan(adapter->NumColumns() + 1,
                                                       0);
+
   auto d_column_sizes_scan = column_sizes_scan.data().get();
   IsValidFunctor is_valid(missing);
   dh::LaunchN(adapter->DeviceIdx(), end - begin, [=] __device__(size_t idx) {
@@ -385,7 +400,7 @@
   size_t num_valid = host_column_sizes_scan.back();
 
   // Copy current subset of valid elements into temporary storage and sort
-  thrust::device_vector<Entry> sorted_entries(num_valid);
+  dh::caching_device_vector<Entry> sorted_entries(num_valid);
   thrust::copy_if(thrust::cuda::par(alloc), entry_iter + begin,
                   entry_iter + end, sorted_entries.begin(), is_valid);
   thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
@@ -406,6 +421,17 @@ template <typename AdapterT>
 HistogramCuts AdapterDeviceSketch(AdapterT* adapter, int num_bins,
                                   float missing,
                                   size_t sketch_batch_num_elements) {
+  size_t num_cuts = RequiredSampleCuts(num_bins, adapter->NumRows());
+  if (sketch_batch_num_elements == 0) {
+    int bytes_per_element = 16;
+    size_t bytes_cuts = num_cuts * adapter->NumColumns() * sizeof(SketchEntry);
+    size_t bytes_num_columns = (adapter->NumColumns() + 1) * sizeof(size_t);
+    // use up to 80% of available space
+    sketch_batch_num_elements = (dh::AvailableMemory(adapter->DeviceIdx()) -
+                                 bytes_cuts - bytes_num_columns) *
+                                0.8 / bytes_per_element;
+  }
+
   CHECK(adapter->NumRows() != data::kAdapterUnknownSize);
   CHECK(adapter->NumColumns() != data::kAdapterUnknownSize);
 
@@ -421,16 +447,6 @@ HistogramCuts AdapterDeviceSketch(AdapterT* adapter, int num_bins,
   SketchContainer sketch_container(num_bins, adapter->NumColumns(),
                                    adapter->NumRows());
 
-  constexpr int kFactor = 8;
-  double eps = 1.0 / (kFactor * num_bins);
-  size_t dummy_nlevel;
-  size_t num_cuts;
-  WQuantileSketch<bst_float, bst_float>::LimitSizeLevel(
-      adapter->NumRows(), eps, &dummy_nlevel, &num_cuts);
-  num_cuts = std::min(num_cuts, adapter->NumRows());
-  if (sketch_batch_num_elements == 0) {
-    sketch_batch_num_elements = batch.Size();
-  }
   for (auto begin = 0ull; begin < batch.Size();
        begin += sketch_batch_num_elements) {
     size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
7 changes: 4 additions & 3 deletions src/common/hist_util.h
@@ -199,14 +199,15 @@ class DenseCuts : public CutsBuilder {
   void Build(DMatrix* p_fmat, uint32_t max_num_bins) override;
 };
 
-
+// sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
 HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
-                           size_t sketch_batch_num_elements = 10000000);
+                           size_t sketch_batch_num_elements = 0);
 
+// sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
 template <typename AdapterT>
 HistogramCuts AdapterDeviceSketch(AdapterT* adapter, int num_bins,
                                   float missing,
-                                  size_t sketch_batch_num_elements = 10000000);
+                                  size_t sketch_batch_num_elements = 0);
 
 /*!
  * \brief preprocessed global index matrix, in CSR format
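With the defaults at 0, production callers get memory-based sizing automatically, while tests can still pin a small batch to exercise the multi-batch path. A hypothetical call, assuming a prepared DMatrix* dmat on device 0:

    auto cuts_auto  = common::DeviceSketch(0, dmat, 256);      // autodetect batch size
    auto cuts_small = common::DeviceSketch(0, dmat, 256, 64);  // force several batches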
3 changes: 1 addition & 2 deletions src/data/ellpack_page.cu
@@ -101,8 +101,7 @@ EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param)
   monitor_.StartCuda("Quantiles");
   // Create the quantile sketches for the dmatrix and initialize HistogramCuts.
   row_stride = GetRowStride(dmat);
-  cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin,
-                               param.gpu_batch_nrows);
+  cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin);
   monitor_.StopCuda("Quantiles");
 
   monitor_.StartCuda("InitCompressedData");
3 changes: 1 addition & 2 deletions src/data/ellpack_page_source.cu
@@ -45,8 +45,7 @@ EllpackPageSource::EllpackPageSource(DMatrix* dmat,

   monitor_.StartCuda("Quantiles");
   size_t row_stride = GetRowStride(dmat);
-  auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin,
-                                   param.gpu_batch_nrows);
+  auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin);
   monitor_.StopCuda("Quantiles");
 
   monitor_.StartCuda("WriteEllpackPages");
8 changes: 0 additions & 8 deletions src/tree/updater_gpu_hist.cu
@@ -44,20 +44,13 @@ struct GPUHistMakerTrainParam
     : public XGBoostParameter<GPUHistMakerTrainParam> {
   bool single_precision_histogram;
   bool deterministic_histogram;
-  // number of rows in a single GPU batch
-  int gpu_batch_nrows;
   bool debug_synchronize;
   // declare parameters
   DMLC_DECLARE_PARAMETER(GPUHistMakerTrainParam) {
     DMLC_DECLARE_FIELD(single_precision_histogram).set_default(false).describe(
         "Use single precision to build histograms.");
     DMLC_DECLARE_FIELD(deterministic_histogram).set_default(true).describe(
         "Pre-round the gradient for obtaining deterministic gradient histogram.");
-    DMLC_DECLARE_FIELD(gpu_batch_nrows)
-        .set_lower_bound(-1)
-        .set_default(0)
-        .describe("Number of rows in a GPU batch, used for finding quantiles on GPU; "
-                  "-1 to use all rows assignted to a GPU, and 0 to auto-deduce");
     DMLC_DECLARE_FIELD(debug_synchronize).set_default(false).describe(
         "Check if all distributed tree are identical after tree construction.");
   }
@@ -1018,7 +1011,6 @@ class GPUHistMakerSpecialised {
     BatchParam batch_param{
       device_,
       param_.max_bin,
-      hist_maker_param_.gpu_batch_nrows,
       generic_param_->gpu_page_size
     };
     auto page = (*dmat->GetBatches<EllpackPage>(batch_param).begin()).Impl();