Fix memory usage of device sketching #5407

Merged 3 commits on Mar 14, 2020
19 changes: 5 additions & 14 deletions include/xgboost/data.h
@@ -168,24 +168,15 @@ struct BatchParam {
/*! \brief The GPU device to use. */
int gpu_id;
/*! \brief Maximum number of bins per feature for histograms. */
int max_bin { 0 };
/*! \brief Number of rows in a GPU batch, used for finding quantiles on GPU. */
int gpu_batch_nrows;
int max_bin{0};
/*! \brief Page size for external memory mode. */
size_t gpu_page_size;
BatchParam() = default;
BatchParam(int32_t device, int32_t max_bin, int32_t gpu_batch_nrows,
size_t gpu_page_size = 0) :
gpu_id{device},
max_bin{max_bin},
gpu_batch_nrows{gpu_batch_nrows},
gpu_page_size{gpu_page_size}
{}
BatchParam(int32_t device, int32_t max_bin, size_t gpu_page_size = 0)
: gpu_id{device}, max_bin{max_bin}, gpu_page_size{gpu_page_size} {}
inline bool operator!=(const BatchParam& other) const {
return gpu_id != other.gpu_id ||
max_bin != other.max_bin ||
gpu_batch_nrows != other.gpu_batch_nrows ||
gpu_page_size != other.gpu_page_size;
return gpu_id != other.gpu_id || max_bin != other.max_bin ||
gpu_page_size != other.gpu_page_size;
}
};

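For reference, a hypothetical call site after this change would construct the parameter with only the remaining fields; the values below are illustrative, not taken from the PR.

```cpp
#include <xgboost/data.h>  // the header modified above

// The old four-argument constructor carried gpu_batch_nrows; the new
// three-argument form takes only device, max_bin and an optional page size.
xgboost::BatchParam MakeParam() {
  return xgboost::BatchParam{/*device=*/0, /*max_bin=*/256, /*gpu_page_size=*/0};
}
```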
8 changes: 7 additions & 1 deletion src/common/device_helpers.cuh
@@ -378,6 +378,11 @@ public:
{
return stats_.peak_allocated_bytes;
}
void Clear()
{
stats_ = DeviceStats();
}

void Log() {
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
return;
Expand Down Expand Up @@ -475,7 +480,8 @@ struct XGBCachingDeviceAllocatorImpl : thrust::device_malloc_allocator<T> {
template <typename T>
using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl<T>;
/*! Be careful that the initialization constructor is a no-op, which means calling
* `vec.resize(n, 1)` won't initialize the memory region to 1. */
* `vec.resize(n)` won't initialize the memory region to 0. Instead use
* `vec.resize(n, 0)`*/
template <typename T>
using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl<T>;
/** \brief Specialisation of thrust device vector using custom allocator. */
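To make the warning above concrete, here is a minimal sketch (not part of the diff) of the two resize forms, assuming `dh::caching_device_vector` is the device vector alias backed by this caching allocator, as used elsewhere in this PR, and assuming the include path shown.

```cpp
#include "common/device_helpers.cuh"  // assumed path within the xgboost source tree

void ZeroInitExample() {
  dh::caching_device_vector<int> a;
  a.resize(100);     // construct() is a no-op: elements keep whatever the cached block held
  dh::caching_device_vector<int> b;
  b.resize(100, 0);  // explicit fill value: every element is written to 0
}
```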
66 changes: 41 additions & 25 deletions src/common/hist_util.cu
@@ -97,6 +97,19 @@ struct EntryCompareOp {
}
};

// Compute number of sample cuts needed on local node to maintain accuracy
// We take more cuts than needed and then reduce them later
size_t RequiredSampleCuts(int max_bins, size_t num_rows) {
constexpr int kFactor = 8;
double eps = 1.0 / (kFactor * max_bins);
size_t dummy_nlevel;
size_t num_cuts;
WQuantileSketch<bst_float, bst_float>::LimitSizeLevel(
num_rows, eps, &dummy_nlevel, &num_cuts);
return std::min(num_cuts, num_rows);
}


// Count the entries in each column and exclusive scan
void GetColumnSizesScan(int device,
dh::caching_device_vector<size_t>* column_sizes_scan,
@@ -210,7 +223,7 @@ void ProcessBatch(int device, const SparsePage& page, size_t begin, size_t end,
size_t num_columns) {
dh::XGBCachingDeviceAllocator<char> alloc;
const auto& host_data = page.data.ConstHostVector();
dh::device_vector<Entry> sorted_entries(host_data.begin() + begin,
dh::caching_device_vector<Entry> sorted_entries(host_data.begin() + begin,
host_data.begin() + end);
thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
sorted_entries.end(), EntryCompareOp());
@@ -237,11 +250,11 @@ void ProcessWeightedBatch(int device, const SparsePage& page,
size_t num_columns) {
dh::XGBCachingDeviceAllocator<char> alloc;
const auto& host_data = page.data.ConstHostVector();
dh::device_vector<Entry> sorted_entries(host_data.begin() + begin,
dh::caching_device_vector<Entry> sorted_entries(host_data.begin() + begin,
host_data.begin() + end);

// Binary search to assign weights to each element
dh::device_vector<float> temp_weights(sorted_entries.size());
dh::caching_device_vector<float> temp_weights(sorted_entries.size());
auto d_temp_weights = temp_weights.data().get();
page.offset.SetDevice(device);
auto row_ptrs = page.offset.ConstDeviceSpan();
@@ -288,28 +301,29 @@

HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
size_t sketch_batch_num_elements) {
// Configure batch size based on available memory
bool has_weights = dmat->Info().weights_.Size() > 0;
size_t num_cuts = RequiredSampleCuts(max_bins, dmat->Info().num_row_);
if (sketch_batch_num_elements == 0) {
int bytes_per_element = has_weights ? 24 : 16;
size_t bytes_cuts = num_cuts * dmat->Info().num_col_ * sizeof(SketchEntry);
// use up to 80% of available space
sketch_batch_num_elements =
(dh::AvailableMemory(device) - bytes_cuts) * 0.8 / bytes_per_element;
}

HistogramCuts cuts;
DenseCuts dense_cuts(&cuts);
SketchContainer sketch_container(max_bins, dmat->Info().num_col_,
dmat->Info().num_row_);

constexpr int kFactor = 8;
double eps = 1.0 / (kFactor * max_bins);
size_t dummy_nlevel;
size_t num_cuts;
WQuantileSketch<bst_float, bst_float>::LimitSizeLevel(
dmat->Info().num_row_, eps, &dummy_nlevel, &num_cuts);
num_cuts = std::min(num_cuts, dmat->Info().num_row_);
if (sketch_batch_num_elements == 0) {
sketch_batch_num_elements = dmat->Info().num_nonzero_;
}
dmat->Info().weights_.SetDevice(device);
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
size_t batch_nnz = batch.data.Size();
for (auto begin = 0ull; begin < batch_nnz;
begin += sketch_batch_num_elements) {
size_t end = std::min(batch_nnz, size_t(begin + sketch_batch_num_elements));
if (dmat->Info().weights_.Size() > 0) {
if (has_weights) {
ProcessWeightedBatch(
device, batch, dmat->Info().weights_.ConstDeviceSpan(), begin, end,
&sketch_container, num_cuts, dmat->Info().num_col_);
@@ -369,6 +383,7 @@ void ProcessBatch(AdapterT* adapter, size_t begin, size_t end, float missing,
// Work out how many valid entries we have in each column
dh::caching_device_vector<size_t> column_sizes_scan(adapter->NumColumns() + 1,
0);

auto d_column_sizes_scan = column_sizes_scan.data().get();
IsValidFunctor is_valid(missing);
dh::LaunchN(adapter->DeviceIdx(), end - begin, [=] __device__(size_t idx) {
@@ -385,7 +400,7 @@
size_t num_valid = host_column_sizes_scan.back();

// Copy current subset of valid elements into temporary storage and sort
thrust::device_vector<Entry> sorted_entries(num_valid);
dh::caching_device_vector<Entry> sorted_entries(num_valid);
thrust::copy_if(thrust::cuda::par(alloc), entry_iter + begin,
entry_iter + end, sorted_entries.begin(), is_valid);
thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
@@ -406,6 +421,17 @@ template <typename AdapterT>
HistogramCuts AdapterDeviceSketch(AdapterT* adapter, int num_bins,
float missing,
size_t sketch_batch_num_elements) {
size_t num_cuts = RequiredSampleCuts(num_bins, adapter->NumRows());
if (sketch_batch_num_elements == 0) {
int bytes_per_element = 16;
size_t bytes_cuts = num_cuts * adapter->NumColumns() * sizeof(SketchEntry);
size_t bytes_num_columns = (adapter->NumColumns() + 1) * sizeof(size_t);
// use up to 80% of available space
sketch_batch_num_elements = (dh::AvailableMemory(adapter->DeviceIdx()) -
bytes_cuts - bytes_num_columns) *
0.8 / bytes_per_element;
}

CHECK(adapter->NumRows() != data::kAdapterUnknownSize);
CHECK(adapter->NumColumns() != data::kAdapterUnknownSize);

@@ -421,16 +447,6 @@ HistogramCuts AdapterDeviceSketch(AdapterT* adapter, int num_bins,
SketchContainer sketch_container(num_bins, adapter->NumColumns(),
adapter->NumRows());

constexpr int kFactor = 8;
double eps = 1.0 / (kFactor * num_bins);
size_t dummy_nlevel;
size_t num_cuts;
WQuantileSketch<bst_float, bst_float>::LimitSizeLevel(
adapter->NumRows(), eps, &dummy_nlevel, &num_cuts);
num_cuts = std::min(num_cuts, adapter->NumRows());
if (sketch_batch_num_elements == 0) {
sketch_batch_num_elements = batch.Size();
}
for (auto begin = 0ull; begin < batch.Size();
begin += sketch_batch_num_elements) {
size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
7 changes: 4 additions & 3 deletions src/common/hist_util.h
@@ -199,14 +199,15 @@ class DenseCuts : public CutsBuilder {
void Build(DMatrix* p_fmat, uint32_t max_num_bins) override;
};


// sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
size_t sketch_batch_num_elements = 10000000);
size_t sketch_batch_num_elements = 0);

// sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
template <typename AdapterT>
HistogramCuts AdapterDeviceSketch(AdapterT* adapter, int num_bins,
float missing,
size_t sketch_batch_num_elements = 10000000);
size_t sketch_batch_num_elements = 0);

/*!
* \brief preprocessed global index matrix, in CSR format
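As a usage note on the new default of 0 (a sketch under an assumed include path; per the comments above, the explicit batch size is only meant for tests):

```cpp
#include "common/hist_util.h"  // assumed path within the xgboost source tree

void SketchExamples(int device, xgboost::DMatrix* dmat) {
  // Production path: the new default of 0 lets DeviceSketch size batches
  // from the free GPU memory at run time.
  auto cuts_auto = xgboost::common::DeviceSketch(device, dmat, /*max_bins=*/256);

  // Test path: pin a small batch size to exercise the multi-batch code path.
  auto cuts_batched = xgboost::common::DeviceSketch(
      device, dmat, /*max_bins=*/256, /*sketch_batch_num_elements=*/1 << 20);
}
```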
3 changes: 1 addition & 2 deletions src/data/ellpack_page.cu
@@ -101,8 +101,7 @@ EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param)
monitor_.StartCuda("Quantiles");
// Create the quantile sketches for the dmatrix and initialize HistogramCuts.
row_stride = GetRowStride(dmat);
cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin,
param.gpu_batch_nrows);
cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin);
monitor_.StopCuda("Quantiles");

monitor_.StartCuda("InitCompressedData");
3 changes: 1 addition & 2 deletions src/data/ellpack_page_source.cu
@@ -45,8 +45,7 @@ EllpackPageSource::EllpackPageSource(DMatrix* dmat,

monitor_.StartCuda("Quantiles");
size_t row_stride = GetRowStride(dmat);
auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin,
param.gpu_batch_nrows);
auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin);
monitor_.StopCuda("Quantiles");

monitor_.StartCuda("WriteEllpackPages");
8 changes: 0 additions & 8 deletions src/tree/updater_gpu_hist.cu
@@ -44,20 +44,13 @@ struct GPUHistMakerTrainParam
: public XGBoostParameter<GPUHistMakerTrainParam> {
bool single_precision_histogram;
bool deterministic_histogram;
// number of rows in a single GPU batch
int gpu_batch_nrows;
bool debug_synchronize;
// declare parameters
DMLC_DECLARE_PARAMETER(GPUHistMakerTrainParam) {
DMLC_DECLARE_FIELD(single_precision_histogram).set_default(false).describe(
"Use single precision to build histograms.");
DMLC_DECLARE_FIELD(deterministic_histogram).set_default(true).describe(
"Pre-round the gradient for obtaining deterministic gradient histogram.");
DMLC_DECLARE_FIELD(gpu_batch_nrows)
Contributor:
Are there existing use cases that rely on this? It looks like the (breaking) new behavior is to auto-deduce, and I'm wondering if there are configs that use -1 to pull everything in one shot as opposed to looping (with perhaps better latencies).

Member (author):
The current implementation will use up to 80% of available memory, so the 'do everything in one batch' approach would only be slightly better in the case where more than 80% of memory is used. The current autodetect behaviour is able to use more of the available memory than the old implementation and should have lower latencies.

The gpu_batch_nrows parameter was never documented, so we have no commitment to support it; I don't think we use it anywhere apart from maybe testing.

.set_lower_bound(-1)
.set_default(0)
.describe("Number of rows in a GPU batch, used for finding quantiles on GPU; "
"-1 to use all rows assignted to a GPU, and 0 to auto-deduce");
DMLC_DECLARE_FIELD(debug_synchronize).set_default(false).describe(
"Check if all distributed tree are identical after tree construction.");
}
@@ -1018,7 +1011,6 @@ class GPUHistMakerSpecialised {
BatchParam batch_param{
device_,
param_.max_bin,
hist_maker_param_.gpu_batch_nrows,
generic_param_->gpu_page_size
};
auto page = (*dmat->GetBatches<EllpackPage>(batch_param).begin()).Impl();
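To make the autodetect heuristic from the author's reply above concrete, the following sketch plugs hypothetical numbers into the same formula the new DeviceSketch code uses; the 8 GiB of free device memory and 100 MiB of cut storage are assumptions, not values from the PR.

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  // Assumed inputs; at run time they come from dh::AvailableMemory(device)
  // and RequiredSampleCuts(max_bins, num_rows) * num_columns * sizeof(SketchEntry).
  std::size_t available_bytes = 8ULL << 30;    // 8 GiB free on the device (assumption)
  std::size_t bytes_cuts      = 100ULL << 20;  // 100 MiB reserved for cut storage (assumption)
  int bytes_per_element       = 16;            // 24 when sample weights are present

  // Same formula as the new DeviceSketch: use up to 80% of what remains.
  auto batch_elements = static_cast<std::size_t>(
      (available_bytes - bytes_cuts) * 0.8 / bytes_per_element);
  std::printf("sketch_batch_num_elements ~= %zu\n", batch_elements);  // ~424 million
  return 0;
}
```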