Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use std::uint64_t for row index. #10120

Merged
merged 5 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 21 additions & 27 deletions include/xgboost/base.h
Original file line number Diff line number Diff line change
@@ -1,20 +1,18 @@
/**
* Copyright 2015-2023 by XGBoost Contributors
* Copyright 2015-2024, XGBoost Contributors
* \file base.h
* \brief Defines configuration macros and basic types for xgboost.
*/
#ifndef XGBOOST_BASE_H_
#define XGBOOST_BASE_H_

#include <dmlc/base.h>
#include <dmlc/omp.h>
#include <dmlc/omp.h> // for omp_uint, omp_ulong

#include <cmath>
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include <cstdint> // for int32_t, uint64_t, int16_t
#include <ostream> // for ostream
#include <string> // for string
#include <utility> // for pair
#include <vector> // for vector

/*!
* \brief string flag for R library, to leave hooks when needed.
Expand Down Expand Up @@ -86,34 +84,31 @@

#endif // !defined(XGBOOST_MM_PREFETCH_PRESENT) && !defined(XGBOOST_BUILTIN_PREFETCH_PRESENT)

/*! \brief namespace of xgboost*/
namespace xgboost {

/*! \brief unsigned integer type used for feature index. */
using bst_uint = uint32_t; // NOLINT
using bst_uint = std::uint32_t; // NOLINT
/*! \brief unsigned long integers */
using bst_ulong = uint64_t; // NOLINT
using bst_ulong = std::uint64_t; // NOLINT
/*! \brief float type, used for storing statistics */
using bst_float = float; // NOLINT
/*! \brief Categorical value type. */
using bst_cat_t = int32_t; // NOLINT
using bst_cat_t = std::int32_t; // NOLINT
/*! \brief Type for data column (feature) index. */
using bst_feature_t = uint32_t; // NOLINT
/*! \brief Type for histogram bin index. */
using bst_bin_t = int32_t; // NOLINT
/*! \brief Type for data row index.
*
* Be careful `std::size_t' is implementation-defined. Meaning that the binary
* representation of DMatrix might not be portable across platform. Booster model should
* be portable as parameters are floating points.
using bst_feature_t = std::uint32_t; // NOLINT
/**
* @brief Type for histogram bin index. We sometimes use -1 to indicate invalid bin.
*/
using bst_row_t = std::size_t; // NOLINT
using bst_bin_t = std::int32_t; // NOLINT
/**
* @brief Type for data row index (sample).
*/
using bst_idx_t = std::uint64_t; // NOLINT
/*! \brief Type for tree node index. */
using bst_node_t = std::int32_t; // NOLINT
/*! \brief Type for ranking group index. */
using bst_group_t = std::uint32_t; // NOLINT
/**
* \brief Type for indexing into output targets.
* @brief Type for indexing into output targets.
*/
using bst_target_t = std::uint32_t; // NOLINT
/**
Expand Down Expand Up @@ -306,8 +301,7 @@ class GradientPairInt64 {
/*! \brief Equality comparison: true only when both grad_ and hess_ match those of rhs. */
XGBOOST_DEVICE bool operator==(const GradientPairInt64 &rhs) const {
  // Bail out on the first mismatching component.
  if (!(grad_ == rhs.grad_)) {
    return false;
  }
  return hess_ == rhs.hess_;
}
friend std::ostream &operator<<(std::ostream &os,
const GradientPairInt64 &g) {
friend std::ostream &operator<<(std::ostream &os, const GradientPairInt64 &g) {
os << g.GetQuantisedGrad() << "/" << g.GetQuantisedHess();
return os;
}
Expand All @@ -323,7 +317,7 @@ using omp_ulong = dmlc::omp_ulong; // NOLINT
/*! \brief define unsigned int for openmp loop */
using bst_omp_uint = dmlc::omp_uint; // NOLINT
/*! \brief Type used for representing version number in binary form.*/
using XGBoostVersionT = int32_t;
using XGBoostVersionT = std::int32_t;
} // namespace xgboost

#endif // XGBOOST_BASE_H_
4 changes: 2 additions & 2 deletions include/xgboost/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ struct BatchParam {
struct HostSparsePageView {
using Inst = common::Span<Entry const>;

common::Span<bst_row_t const> offset;
common::Span<bst_idx_t const> offset;
common::Span<Entry const> data;

Inst operator[](size_t i) const {
Expand All @@ -333,7 +333,7 @@ struct HostSparsePageView {
class SparsePage {
public:
// Offset for each row.
HostDeviceVector<bst_row_t> offset;
HostDeviceVector<bst_idx_t> offset;
/*! \brief the data of the segments */
HostDeviceVector<Entry> data;

Expand Down
10 changes: 8 additions & 2 deletions include/xgboost/json.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,7 @@ class Value {
virtual Json& operator[](int ind);

virtual bool operator==(Value const& rhs) const = 0;
#if !defined(__APPLE__)
virtual Value& operator=(Value const& rhs) = delete;
#endif // !defined(__APPLE__)

std::string TypeStr() const;

Expand Down Expand Up @@ -105,6 +103,7 @@ class JsonString : public Value {
std::string& GetString() & { return str_; }

bool operator==(Value const& rhs) const override;
Value& operator=(Value const& rhs) override = delete;

static bool IsClassOf(Value const* value) {
return value->Type() == ValueKind::kString;
Expand Down Expand Up @@ -134,6 +133,7 @@ class JsonArray : public Value {
std::vector<Json>& GetArray() & { return vec_; }

bool operator==(Value const& rhs) const override;
Value& operator=(Value const& rhs) override = delete;

static bool IsClassOf(Value const* value) {
return value->Type() == ValueKind::kArray;
Expand All @@ -158,6 +158,7 @@ class JsonTypedArray : public Value {
JsonTypedArray(JsonTypedArray&& that) noexcept : Value{kind}, vec_{std::move(that.vec_)} {}

bool operator==(Value const& rhs) const override;
Value& operator=(Value const& rhs) override = delete;

void Set(size_t i, T v) { vec_[i] = v; }
size_t Size() const { return vec_.size(); }
Expand Down Expand Up @@ -216,6 +217,7 @@ class JsonObject : public Value {
Map& GetObject() & { return object_; }

bool operator==(Value const& rhs) const override;
Value& operator=(Value const& rhs) override = delete;

static bool IsClassOf(Value const* value) { return value->Type() == ValueKind::kObject; }
~JsonObject() override = default;
Expand Down Expand Up @@ -249,6 +251,7 @@ class JsonNumber : public Value {
Float& GetNumber() & { return number_; }

bool operator==(Value const& rhs) const override;
Value& operator=(Value const& rhs) override = delete;

static bool IsClassOf(Value const* value) {
return value->Type() == ValueKind::kNumber;
Expand Down Expand Up @@ -287,6 +290,7 @@ class JsonInteger : public Value {
: Value{ValueKind::kInteger}, integer_{that.integer_} {}

bool operator==(Value const& rhs) const override;
Value& operator=(Value const& rhs) override = delete;

Int const& GetInteger() && { return integer_; }
Int const& GetInteger() const & { return integer_; }
Expand All @@ -307,6 +311,7 @@ class JsonNull : public Value {
void Save(JsonWriter* writer) const override;

bool operator==(Value const& rhs) const override;
Value& operator=(Value const& rhs) override = delete;

static bool IsClassOf(Value const* value) {
return value->Type() == ValueKind::kNull;
Expand Down Expand Up @@ -336,6 +341,7 @@ class JsonBoolean : public Value {
bool& GetBoolean() & { return boolean_; }

bool operator==(Value const& rhs) const override;
Value& operator=(Value const& rhs) override = delete;

static bool IsClassOf(Value const* value) {
return value->Type() == ValueKind::kBoolean;
Expand Down
2 changes: 1 addition & 1 deletion plugin/sycl/data/gradient_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue qu,
uint32_t* offsets) {
if (nbins == 0) return;
const xgboost::Entry *data_ptr = dmat.data.DataConst();
const bst_row_t *offset_vec = dmat.row_ptr.DataConst();
const bst_idx_t *offset_vec = dmat.row_ptr.DataConst();
const size_t num_rows = dmat.row_ptr.Size() - 1;
const bst_float* cut_values = cut_device.Values().DataConst();
const uint32_t* cut_ptrs = cut_device.Ptrs().DataConst();
Expand Down
2 changes: 2 additions & 0 deletions src/collective/device_communicator_adapter.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
*/
#pragma once

#include <numeric> // for accumulate

#include "communicator.h"
#include "device_communicator.cuh"

Expand Down
6 changes: 3 additions & 3 deletions src/common/column_matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class SparseColumnIter : public Column<BinIdxT> {

public:
SparseColumnIter(common::Span<const BinIdxT> index, bst_bin_t least_bin_idx,
common::Span<const size_t> row_ind, bst_row_t first_row_idx)
common::Span<const size_t> row_ind, bst_idx_t first_row_idx)
: Base{index, least_bin_idx}, row_ind_(row_ind) {
// first_row_idx is the first row in the leaf partition
const size_t* row_data = RowIndices();
Expand Down Expand Up @@ -301,7 +301,7 @@ class ColumnMatrix {
}

template <typename BinIdxType>
auto SparseColumn(bst_feature_t fidx, bst_row_t first_row_idx) const {
auto SparseColumn(bst_feature_t fidx, bst_idx_t first_row_idx) const {
const size_t feature_offset = feature_offsets_[fidx]; // to get right place for certain feature
const size_t column_size = feature_offsets_[fidx + 1] - feature_offset;
common::Span<const BinIdxType> bin_index = {
Expand All @@ -325,7 +325,7 @@ class ColumnMatrix {
// All columns are dense columns and have no missing values.
// FIXME(jiamingy): We don't need a column matrix if there's no missing value.
template <typename RowBinIdxT>
void SetIndexNoMissing(bst_row_t base_rowid, RowBinIdxT const* row_index, const size_t n_samples,
void SetIndexNoMissing(bst_idx_t base_rowid, RowBinIdxT const* row_index, const size_t n_samples,
const size_t n_features, int32_t n_threads) {
missing_.GrowTo(feature_offsets_[n_features], false);

Expand Down
3 changes: 0 additions & 3 deletions src/common/device_helpers.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,16 @@
#include <thrust/unique.h>

#include <algorithm>
#include <chrono>
#include <cstddef> // for size_t
#include <cub/cub.cuh>
#include <cub/util_allocator.cuh>
#include <numeric>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>

#include "../collective/communicator-inl.h"
#include "common.h"
#include "xgboost/global_config.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/logging.h"
#include "xgboost/span.h"
Expand Down
10 changes: 5 additions & 5 deletions src/common/hist_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins
HistogramCuts out;
auto const &info = m->Info();
auto n_threads = ctx->Threads();
std::vector<bst_row_t> reduced(info.num_col_, 0);
std::vector<bst_idx_t> reduced(info.num_col_, 0);
for (auto const &page : m->GetBatches<SparsePage>()) {
auto const &entries_per_column =
CalcColumnSize(data::SparsePageAdapterBatch{page.GetView()}, info.num_col_, n_threads,
Expand Down Expand Up @@ -209,10 +209,10 @@ void RowsWiseBuildHistKernel(Span<GradientPair const> gpair,
CHECK(offsets);
}

auto get_row_ptr = [&](bst_row_t ridx) {
auto get_row_ptr = [&](bst_idx_t ridx) {
return kFirstPage ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
};
auto get_rid = [&](bst_row_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); };
auto get_rid = [&](bst_idx_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); };

const size_t n_features =
get_row_ptr(row_indices.begin[0] + 1) - get_row_ptr(row_indices.begin[0]);
Expand Down Expand Up @@ -275,10 +275,10 @@ void ColsWiseBuildHistKernel(Span<GradientPair const> gpair,
auto const &row_ptr = gmat.row_ptr.data();
auto base_rowid = gmat.base_rowid;
const uint32_t *offsets = gmat.index.Offset();
auto get_row_ptr = [&](bst_row_t ridx) {
auto get_row_ptr = [&](bst_idx_t ridx) {
return kFirstPage ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
};
auto get_rid = [&](bst_row_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); };
auto get_rid = [&](bst_idx_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); };

const size_t n_features = gmat.cut.Ptrs().size() - 1;
const size_t n_columns = n_features;
Expand Down
12 changes: 5 additions & 7 deletions src/common/hist_util.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@
#include <xgboost/logging.h>

#include <cstddef> // for size_t
#include <memory>
#include <mutex>
#include <utility>
#include <vector>

Expand All @@ -39,15 +37,15 @@ size_t RequiredSampleCutsPerColumn(int max_bins, size_t num_rows) {
return std::min(num_cuts, num_rows);
}

size_t RequiredSampleCuts(bst_row_t num_rows, bst_feature_t num_columns,
size_t RequiredSampleCuts(bst_idx_t num_rows, bst_feature_t num_columns,
size_t max_bins, size_t nnz) {
auto per_column = RequiredSampleCutsPerColumn(max_bins, num_rows);
auto if_dense = num_columns * per_column;
auto result = std::min(nnz, if_dense);
return result;
}

size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
size_t RequiredMemory(bst_idx_t num_rows, bst_feature_t num_columns, size_t nnz,
size_t num_bins, bool with_weights) {
size_t peak = 0;
// 0. Allocate cut pointer in quantile container by increasing: n_columns + 1
Expand Down Expand Up @@ -85,7 +83,7 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
return peak;
}

size_t SketchBatchNumElements(size_t sketch_batch_num_elements, bst_row_t num_rows,
size_t SketchBatchNumElements(size_t sketch_batch_num_elements, bst_idx_t num_rows,
bst_feature_t columns, size_t nnz, int device, size_t num_cuts,
bool has_weight) {
auto constexpr kIntMax = static_cast<std::size_t>(std::numeric_limits<std::int32_t>::max());
Expand Down Expand Up @@ -123,7 +121,7 @@ void SortByWeight(dh::device_vector<float>* weights, dh::device_vector<Entry>* s
[=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
}

void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_idx_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan) {
Expand Down Expand Up @@ -210,7 +208,7 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
sorted_entries = dh::device_vector<Entry>(h_data.begin() + begin, h_data.begin() + end);
}

bst_row_t base_rowid = page.base_rowid;
bst_idx_t base_rowid = page.base_rowid;

dh::device_vector<float> entry_weight;
auto cuctx = ctx->CUDACtx();
Expand Down
6 changes: 3 additions & 3 deletions src/common/hist_util.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ inline size_t constexpr BytesPerElement(bool has_weight) {
* directly if it's not 0.
*/
size_t SketchBatchNumElements(size_t sketch_batch_num_elements,
bst_row_t num_rows, bst_feature_t columns,
bst_idx_t num_rows, bst_feature_t columns,
size_t nnz, int device,
size_t num_cuts, bool has_weight);

Expand All @@ -209,7 +209,7 @@ size_t RequiredSampleCutsPerColumn(int max_bins, size_t num_rows);
*
* \return The estimated bytes
*/
size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
size_t RequiredMemory(bst_idx_t num_rows, bst_feature_t num_columns, size_t nnz,
size_t num_bins, bool with_weights);

// Count the valid entries in each column and copy them out.
Expand Down Expand Up @@ -240,7 +240,7 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
void SortByWeight(dh::device_vector<float>* weights,
dh::device_vector<Entry>* sorted_entries);

void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_idx_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan);
Expand Down
2 changes: 1 addition & 1 deletion src/common/host_device_vector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ template class HostDeviceVector<uint8_t>;
template class HostDeviceVector<int8_t>;
template class HostDeviceVector<FeatureType>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<uint64_t>; // bst_row_t
template class HostDeviceVector<bst_idx_t>;
template class HostDeviceVector<uint32_t>; // bst_feature_t

#if defined(__APPLE__) || defined(__EMSCRIPTEN__)
Expand Down
2 changes: 1 addition & 1 deletion src/common/host_device_vector.cu
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@ template class HostDeviceVector<uint8_t>;
template class HostDeviceVector<int8_t>;
template class HostDeviceVector<FeatureType>;
template class HostDeviceVector<Entry>;
template class HostDeviceVector<uint64_t>; // bst_row_t
template class HostDeviceVector<bst_idx_t>;
template class HostDeviceVector<uint32_t>; // bst_feature_t
template class HostDeviceVector<RegTree::Node>;
template class HostDeviceVector<RegTree::CategoricalSplitMatrix::Segment>;
Expand Down
Loading
Loading