Skip to content

Commit

Permalink
Use std::uint64_t for row index. (#10120)
Browse files Browse the repository at this point in the history
- Use std::uint64_t instead of size_t to avoid implementation-defined type.
- Rename to bst_idx_t, to account for other types of indexing.
- Small cleanup to the base header.
  • Loading branch information
trivialfis committed Mar 15, 2024
1 parent 56b1868 commit 53fc175
Show file tree
Hide file tree
Showing 57 changed files with 228 additions and 238 deletions.
48 changes: 21 additions & 27 deletions include/xgboost/base.h
Original file line number Diff line number Diff line change
@@ -1,20 +1,18 @@
/**
* Copyright 2015-2023 by XGBoost Contributors
* Copyright 2015-2024, XGBoost Contributors
* \file base.h
* \brief Defines configuration macros and basic types for xgboost.
*/
#ifndef XGBOOST_BASE_H_
#define XGBOOST_BASE_H_

#include <dmlc/base.h>
#include <dmlc/omp.h>
#include <dmlc/omp.h> // for omp_uint, omp_ulong

#include <cmath>
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include <cstdint> // for int32_t, uint64_t, int16_t
#include <ostream> // for ostream
#include <string> // for string
#include <utility> // for pair
#include <vector> // for vector

/*!
* \brief string flag for R library, to leave hooks when needed.
Expand Down Expand Up @@ -86,34 +84,31 @@

#endif  // !defined(XGBOOST_MM_PREFETCH_PRESENT) && !defined(XGBOOST_BUILTIN_PREFETCH_PRESENT)

/*! \brief namespace of xgboost*/
namespace xgboost {

/*! \brief unsigned integer type used for feature index. */
using bst_uint = uint32_t; // NOLINT
using bst_uint = std::uint32_t; // NOLINT
/*! \brief unsigned long integers */
using bst_ulong = uint64_t; // NOLINT
using bst_ulong = std::uint64_t; // NOLINT
/*! \brief float type, used for storing statistics */
using bst_float = float; // NOLINT
/*! \brief Categorical value type. */
using bst_cat_t = int32_t; // NOLINT
using bst_cat_t = std::int32_t; // NOLINT
/*! \brief Type for data column (feature) index. */
using bst_feature_t = uint32_t; // NOLINT
/*! \brief Type for histogram bin index. */
using bst_bin_t = int32_t; // NOLINT
/*! \brief Type for data row index.
*
* Be careful `std::size_t' is implementation-defined. Meaning that the binary
* representation of DMatrix might not be portable across platform. Booster model should
* be portable as parameters are floating points.
using bst_feature_t = std::uint32_t; // NOLINT
/**
* @brief Type for histogram bin index. We sometimes use -1 to indicate invalid bin.
*/
using bst_row_t = std::size_t; // NOLINT
using bst_bin_t = std::int32_t; // NOLINT
/**
* @brief Type for data row index (sample).
*/
using bst_idx_t = std::uint64_t; // NOLINT
/*! \brief Type for tree node index. */
using bst_node_t = std::int32_t; // NOLINT
/*! \brief Type for ranking group index. */
using bst_group_t = std::uint32_t; // NOLINT
/**
* \brief Type for indexing into output targets.
* @brief Type for indexing into output targets.
*/
using bst_target_t = std::uint32_t; // NOLINT
/**
Expand Down Expand Up @@ -306,8 +301,7 @@ class GradientPairInt64 {
XGBOOST_DEVICE bool operator==(const GradientPairInt64 &rhs) const {
return grad_ == rhs.grad_ && hess_ == rhs.hess_;
}
friend std::ostream &operator<<(std::ostream &os,
const GradientPairInt64 &g) {
friend std::ostream &operator<<(std::ostream &os, const GradientPairInt64 &g) {
os << g.GetQuantisedGrad() << "/" << g.GetQuantisedHess();
return os;
}
Expand All @@ -323,7 +317,7 @@ using omp_ulong = dmlc::omp_ulong; // NOLINT
/*! \brief define unsigned int for openmp loop */
using bst_omp_uint = dmlc::omp_uint; // NOLINT
/*! \brief Type used for representing version number in binary form.*/
using XGBoostVersionT = int32_t;
using XGBoostVersionT = std::int32_t;
} // namespace xgboost

#endif // XGBOOST_BASE_H_
4 changes: 2 additions & 2 deletions include/xgboost/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ struct BatchParam {
struct HostSparsePageView {
using Inst = common::Span<Entry const>;

common::Span<bst_row_t const> offset;
common::Span<bst_idx_t const> offset;
common::Span<Entry const> data;

Inst operator[](size_t i) const {
Expand All @@ -333,7 +333,7 @@ struct HostSparsePageView {
class SparsePage {
public:
// Offset for each row.
HostDeviceVector<bst_row_t> offset;
HostDeviceVector<bst_idx_t> offset;
/*! \brief the data of the segments */
HostDeviceVector<Entry> data;

Expand Down
10 changes: 8 additions & 2 deletions include/xgboost/json.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,7 @@ class Value {
virtual Json& operator[](int ind);

virtual bool operator==(Value const& rhs) const = 0;
#if !defined(__APPLE__)
virtual Value& operator=(Value const& rhs) = delete;
#endif // !defined(__APPLE__)

std::string TypeStr() const;

Expand Down Expand Up @@ -105,6 +103,7 @@ class JsonString : public Value {
std::string& GetString() & { return str_; }

bool operator==(Value const& rhs) const override;
Value& operator=(Value const& rhs) override = delete;

static bool IsClassOf(Value const* value) {
return value->Type() == ValueKind::kString;
Expand Down Expand Up @@ -134,6 +133,7 @@ class JsonArray : public Value {
std::vector<Json>& GetArray() & { return vec_; }

bool operator==(Value const& rhs) const override;
Value& operator=(Value const& rhs) override = delete;

static bool IsClassOf(Value const* value) {
return value->Type() == ValueKind::kArray;
Expand All @@ -158,6 +158,7 @@ class JsonTypedArray : public Value {
JsonTypedArray(JsonTypedArray&& that) noexcept : Value{kind}, vec_{std::move(that.vec_)} {}

bool operator==(Value const& rhs) const override;
Value& operator=(Value const& rhs) override = delete;

void Set(size_t i, T v) { vec_[i] = v; }
size_t Size() const { return vec_.size(); }
Expand Down Expand Up @@ -216,6 +217,7 @@ class JsonObject : public Value {
Map& GetObject() & { return object_; }

bool operator==(Value const& rhs) const override;
Value& operator=(Value const& rhs) override = delete;

static bool IsClassOf(Value const* value) { return value->Type() == ValueKind::kObject; }
~JsonObject() override = default;
Expand Down Expand Up @@ -249,6 +251,7 @@ class JsonNumber : public Value {
Float& GetNumber() & { return number_; }

bool operator==(Value const& rhs) const override;
Value& operator=(Value const& rhs) override = delete;

static bool IsClassOf(Value const* value) {
return value->Type() == ValueKind::kNumber;
Expand Down Expand Up @@ -287,6 +290,7 @@ class JsonInteger : public Value {
: Value{ValueKind::kInteger}, integer_{that.integer_} {}

bool operator==(Value const& rhs) const override;
Value& operator=(Value const& rhs) override = delete;

Int const& GetInteger() && { return integer_; }
Int const& GetInteger() const & { return integer_; }
Expand All @@ -307,6 +311,7 @@ class JsonNull : public Value {
void Save(JsonWriter* writer) const override;

bool operator==(Value const& rhs) const override;
Value& operator=(Value const& rhs) override = delete;

static bool IsClassOf(Value const* value) {
return value->Type() == ValueKind::kNull;
Expand Down Expand Up @@ -336,6 +341,7 @@ class JsonBoolean : public Value {
bool& GetBoolean() & { return boolean_; }

bool operator==(Value const& rhs) const override;
Value& operator=(Value const& rhs) override = delete;

static bool IsClassOf(Value const* value) {
return value->Type() == ValueKind::kBoolean;
Expand Down
2 changes: 1 addition & 1 deletion plugin/sycl/data/gradient_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue qu,
uint32_t* offsets) {
if (nbins == 0) return;
const xgboost::Entry *data_ptr = dmat.data.DataConst();
const bst_row_t *offset_vec = dmat.row_ptr.DataConst();
const bst_idx_t *offset_vec = dmat.row_ptr.DataConst();
const size_t num_rows = dmat.row_ptr.Size() - 1;
const bst_float* cut_values = cut_device.Values().DataConst();
const uint32_t* cut_ptrs = cut_device.Ptrs().DataConst();
Expand Down
2 changes: 2 additions & 0 deletions src/collective/device_communicator_adapter.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
*/
#pragma once

#include <numeric> // for accumulate

#include "communicator.h"
#include "device_communicator.cuh"

Expand Down
6 changes: 3 additions & 3 deletions src/common/column_matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class SparseColumnIter : public Column<BinIdxT> {

public:
SparseColumnIter(common::Span<const BinIdxT> index, bst_bin_t least_bin_idx,
common::Span<const size_t> row_ind, bst_row_t first_row_idx)
common::Span<const size_t> row_ind, bst_idx_t first_row_idx)
: Base{index, least_bin_idx}, row_ind_(row_ind) {
// first_row_id is the first row in the leaf partition
const size_t* row_data = RowIndices();
Expand Down Expand Up @@ -301,7 +301,7 @@ class ColumnMatrix {
}

template <typename BinIdxType>
auto SparseColumn(bst_feature_t fidx, bst_row_t first_row_idx) const {
auto SparseColumn(bst_feature_t fidx, bst_idx_t first_row_idx) const {
const size_t feature_offset = feature_offsets_[fidx]; // to get right place for certain feature
const size_t column_size = feature_offsets_[fidx + 1] - feature_offset;
common::Span<const BinIdxType> bin_index = {
Expand All @@ -325,7 +325,7 @@ class ColumnMatrix {
// all columns are dense column and has no missing value
// FIXME(jiamingy): We don't need a column matrix if there's no missing value.
template <typename RowBinIdxT>
void SetIndexNoMissing(bst_row_t base_rowid, RowBinIdxT const* row_index, const size_t n_samples,
void SetIndexNoMissing(bst_idx_t base_rowid, RowBinIdxT const* row_index, const size_t n_samples,
const size_t n_features, int32_t n_threads) {
missing_.GrowTo(feature_offsets_[n_features], false);

Expand Down
3 changes: 0 additions & 3 deletions src/common/device_helpers.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,16 @@
#include <thrust/unique.h>

#include <algorithm>
#include <chrono>
#include <cstddef> // for size_t
#include <cub/cub.cuh>
#include <cub/util_allocator.cuh>
#include <numeric>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>

#include "../collective/communicator-inl.h"
#include "common.h"
#include "xgboost/global_config.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/logging.h"
#include "xgboost/span.h"
Expand Down
10 changes: 5 additions & 5 deletions src/common/hist_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins
HistogramCuts out;
auto const &info = m->Info();
auto n_threads = ctx->Threads();
std::vector<bst_row_t> reduced(info.num_col_, 0);
std::vector<bst_idx_t> reduced(info.num_col_, 0);
for (auto const &page : m->GetBatches<SparsePage>()) {
auto const &entries_per_column =
CalcColumnSize(data::SparsePageAdapterBatch{page.GetView()}, info.num_col_, n_threads,
Expand Down Expand Up @@ -209,10 +209,10 @@ void RowsWiseBuildHistKernel(Span<GradientPair const> gpair,
CHECK(offsets);
}

auto get_row_ptr = [&](bst_row_t ridx) {
auto get_row_ptr = [&](bst_idx_t ridx) {
return kFirstPage ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
};
auto get_rid = [&](bst_row_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); };
auto get_rid = [&](bst_idx_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); };

const size_t n_features =
get_row_ptr(row_indices.begin[0] + 1) - get_row_ptr(row_indices.begin[0]);
Expand Down Expand Up @@ -275,10 +275,10 @@ void ColsWiseBuildHistKernel(Span<GradientPair const> gpair,
auto const &row_ptr = gmat.row_ptr.data();
auto base_rowid = gmat.base_rowid;
const uint32_t *offsets = gmat.index.Offset();
auto get_row_ptr = [&](bst_row_t ridx) {
auto get_row_ptr = [&](bst_idx_t ridx) {
return kFirstPage ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
};
auto get_rid = [&](bst_row_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); };
auto get_rid = [&](bst_idx_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); };

const size_t n_features = gmat.cut.Ptrs().size() - 1;
const size_t n_columns = n_features;
Expand Down
12 changes: 5 additions & 7 deletions src/common/hist_util.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@
#include <xgboost/logging.h>

#include <cstddef> // for size_t
#include <memory>
#include <mutex>
#include <utility>
#include <vector>

Expand All @@ -39,15 +37,15 @@ size_t RequiredSampleCutsPerColumn(int max_bins, size_t num_rows) {
return std::min(num_cuts, num_rows);
}

size_t RequiredSampleCuts(bst_row_t num_rows, bst_feature_t num_columns,
size_t RequiredSampleCuts(bst_idx_t num_rows, bst_feature_t num_columns,
size_t max_bins, size_t nnz) {
auto per_column = RequiredSampleCutsPerColumn(max_bins, num_rows);
auto if_dense = num_columns * per_column;
auto result = std::min(nnz, if_dense);
return result;
}

size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
size_t RequiredMemory(bst_idx_t num_rows, bst_feature_t num_columns, size_t nnz,
size_t num_bins, bool with_weights) {
size_t peak = 0;
// 0. Allocate cut pointer in quantile container by increasing: n_columns + 1
Expand Down Expand Up @@ -85,7 +83,7 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
return peak;
}

size_t SketchBatchNumElements(size_t sketch_batch_num_elements, bst_row_t num_rows,
size_t SketchBatchNumElements(size_t sketch_batch_num_elements, bst_idx_t num_rows,
bst_feature_t columns, size_t nnz, int device, size_t num_cuts,
bool has_weight) {
auto constexpr kIntMax = static_cast<std::size_t>(std::numeric_limits<std::int32_t>::max());
Expand Down Expand Up @@ -123,7 +121,7 @@ void SortByWeight(dh::device_vector<float>* weights, dh::device_vector<Entry>* s
[=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
}

void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_idx_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan) {
Expand Down Expand Up @@ -210,7 +208,7 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
sorted_entries = dh::device_vector<Entry>(h_data.begin() + begin, h_data.begin() + end);
}

bst_row_t base_rowid = page.base_rowid;
bst_idx_t base_rowid = page.base_rowid;

dh::device_vector<float> entry_weight;
auto cuctx = ctx->CUDACtx();
Expand Down
6 changes: 3 additions & 3 deletions src/common/hist_util.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ inline size_t constexpr BytesPerElement(bool has_weight) {
* directly if it's not 0.
*/
size_t SketchBatchNumElements(size_t sketch_batch_num_elements,
bst_row_t num_rows, bst_feature_t columns,
bst_idx_t num_rows, bst_feature_t columns,
size_t nnz, int device,
size_t num_cuts, bool has_weight);

Expand All @@ -209,7 +209,7 @@ size_t RequiredSampleCutsPerColumn(int max_bins, size_t num_rows);
*
* \return The estimated bytes
*/
size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
size_t RequiredMemory(bst_idx_t num_rows, bst_feature_t num_columns, size_t nnz,
size_t num_bins, bool with_weights);

// Count the valid entries in each column and copy them out.
Expand Down Expand Up @@ -240,7 +240,7 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
void SortByWeight(dh::device_vector<float>* weights,
dh::device_vector<Entry>* sorted_entries);

void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_idx_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan);
Expand Down
2 changes: 1 addition & 1 deletion src/common/host_device_vector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ template class HostDeviceVector<uint8_t>;
template class HostDeviceVector<int8_t>;
template class HostDeviceVector<FeatureType>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<uint64_t>; // bst_row_t
template class HostDeviceVector<bst_idx_t>;
template class HostDeviceVector<uint32_t>; // bst_feature_t

#if defined(__APPLE__) || defined(__EMSCRIPTEN__)
Expand Down
2 changes: 1 addition & 1 deletion src/common/host_device_vector.cu
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@ template class HostDeviceVector<uint8_t>;
template class HostDeviceVector<int8_t>;
template class HostDeviceVector<FeatureType>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<uint64_t>; // bst_row_t
template class HostDeviceVector<bst_idx_t>;
template class HostDeviceVector<uint32_t>; // bst_feature_t
template class HostDeviceVector<RegTree::Node>;
template class HostDeviceVector<RegTree::CategoricalSplitMatrix::Segment>;
Expand Down
Loading

0 comments on commit 53fc175

Please sign in to comment.