Add CUDA iterator to tensor view. #10074

Merged
merged 1 commit on Mar 1, 2024
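This change adds thrust-compatible const iterators (`tcbegin`/`tcend`) for `linalg::TensorView`, so a tensor view (contiguous or strided) can be consumed by thrust algorithms as a flat range. Supporting changes: `element_type`/`value_type` aliases and an `Empty()` helper on `TensorView` and `Tensor`, `IterSpan::begin()`/`end()` returning the wrapped iterator directly, and a GPU test covering the new iterators.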
12 changes: 9 additions & 3 deletions include/xgboost/linalg.h
@@ -295,6 +295,9 @@ class TensorView {
   using ShapeT = std::size_t[kDim];
   using StrideT = ShapeT;
 
+  using element_type = T;                  // NOLINT
+  using value_type = std::remove_cv_t<T>;  // NOLINT
+
  private:
   StrideT stride_{1};
   ShapeT shape_{0};
@@ -314,7 +317,7 @@ class TensorView {
   }
 
   template <size_t old_dim, size_t new_dim, int32_t D, typename I>
-  LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D],
+  LINALG_HD size_t MakeSliceDim(std::size_t new_shape[D], std::size_t new_stride[D],
                                 detail::RangeTag<I> &&range) const {
     static_assert(new_dim < D);
     static_assert(old_dim < kDim);
@@ -528,9 +531,10 @@ class TensorView {
   LINALG_HD auto Stride(size_t i) const { return stride_[i]; }
 
   /**
-   * \brief Number of items in the tensor.
+   * @brief Number of items in the tensor.
    */
   [[nodiscard]] LINALG_HD std::size_t Size() const { return size_; }
+  [[nodiscard]] bool Empty() const { return Size() == 0; }
   /**
    * \brief Whether this is a contiguous array, both C and F contiguous returns true.
    */
@@ -865,7 +869,9 @@ class Tensor {
   auto HostView() { return this->View(DeviceOrd::CPU()); }
   auto HostView() const { return this->View(DeviceOrd::CPU()); }
 
-  [[nodiscard]] size_t Size() const { return data_.Size(); }
+  [[nodiscard]] std::size_t Size() const { return data_.Size(); }
+  [[nodiscard]] bool Empty() const { return Size() == 0; }
+
   auto Shape() const { return common::Span<size_t const, kDim>{shape_}; }
   auto Shape(size_t i) const { return shape_[i]; }
 
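Reviewer note: `element_type` and `value_type` follow the `std::span` convention (`element_type` keeps the cv-qualifier, `value_type` strips it), so generic code can derive a mutable scalar type even from a read-only view. A minimal sketch (hypothetical helper, not part of this diff):

```cpp
// Sketch only: a hypothetical generic caller of the new aliases and Empty().
#include <type_traits>

template <typename ViewT>
auto FirstOrZero(ViewT v) {
  // value_type is the plain scalar, usable for accumulators and defaults even
  // when the view is read-only (element_type = `T const`).
  using V = typename ViewT::value_type;
  static_assert(std::is_same_v<V, std::remove_cv_t<typename ViewT::element_type>>);
  return v.Empty() ? V{} : static_cast<V>(v.Values()[0]);
}
```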
4 changes: 2 additions & 2 deletions include/xgboost/span.h
@@ -701,10 +701,10 @@ class IterSpan {
     return {data() + _offset, _count == dynamic_extent ? size() - _offset : _count};
   }
   [[nodiscard]] XGBOOST_DEVICE constexpr iterator begin() const noexcept {  // NOLINT
-    return {this, 0};
+    return it_;
   }
   [[nodiscard]] XGBOOST_DEVICE constexpr iterator end() const noexcept {  // NOLINT
-    return {this, size()};
+    return it_ + size();
   }
 };
 }  // namespace common
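Reviewer note: since `IterSpan` already stores the underlying iterator, `begin()`/`end()` can forward it directly instead of constructing an index-based `{this, index}` wrapper, and thrust then sees the wrapped iterator's own category. A simplified sketch of the new contract (assumed shape, not the real class):

```cpp
// Sketch only (assumed simplification of IterSpan): begin()/end() now forward
// the stored iterator instead of building a {container, index} wrapper.
#include <cstddef>

template <typename It>
struct IterSpanSketch {
  It it_;
  std::size_t size_;
  constexpr It begin() const noexcept { return it_; }        // was: {this, 0}
  constexpr It end() const noexcept { return it_ + size_; }  // was: {this, size()}
};
```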
42 changes: 32 additions & 10 deletions src/common/linalg_op.cuh
@@ -13,15 +13,14 @@
 #include "xgboost/context.h"  // for Context
 #include "xgboost/linalg.h"   // for TensorView
 
-namespace xgboost {
-namespace linalg {
+namespace xgboost::linalg {
 namespace cuda_impl {
 // Use template specialization to dispatch, Windows + CUDA 11.8 doesn't support extended
 // lambda inside constexpr if
 template <typename T, std::int32_t D>
 struct ElementWiseImpl {
   template <typename Fn>
-  void operator()(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s) {
+  void operator()(TensorView<T, D> t, Fn&& fn, cudaStream_t s) {
     static_assert(D > 1);
     dh::LaunchN(t.Size(), s, [=] __device__(std::size_t i) mutable {
       std::apply(fn, linalg::UnravelIndex(i, t.Shape()));
@@ -32,36 +31,59 @@ struct ElementWiseImpl {
 template <typename T>
 struct ElementWiseImpl<T, 1> {
   template <typename Fn>
-  void operator()(linalg::TensorView<T, 1> t, Fn&& fn, cudaStream_t s) {
+  void operator()(TensorView<T, 1> t, Fn&& fn, cudaStream_t s) {
     dh::LaunchN(t.Size(), s, [=] __device__(std::size_t i) { fn(i); });
   }
 };
 
 template <typename T, std::int32_t D, typename Fn>
-void ElementWiseKernel(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
+void ElementWiseKernel(TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
   dh::safe_cuda(cudaSetDevice(t.Device().ordinal));
   cuda_impl::ElementWiseImpl<T, D>{}(t, fn, s);
 }
 }  // namespace cuda_impl
 
 template <typename T, int32_t D, typename Fn>
-void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
+void ElementWiseTransformDevice(TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
   if (t.Contiguous()) {
     auto ptr = t.Values().data();
     dh::LaunchN(t.Size(), s, [=] __device__(size_t i) { ptr[i] = fn(i, ptr[i]); });
   } else {
     dh::LaunchN(t.Size(), s, [=] __device__(size_t i) mutable {
-      T& v = detail::Apply(t, linalg::UnravelIndex(i, t.Shape()));
+      T& v = detail::Apply(t, UnravelIndex(i, t.Shape()));
       v = fn(i, v);
     });
   }
 }
 
 template <typename T, int32_t D, typename Fn>
-void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn) {
+void ElementWiseKernel(Context const* ctx, TensorView<T, D> t, Fn&& fn) {
   ctx->IsCUDA() ? cuda_impl::ElementWiseKernel(t, fn)
                 : ElementWiseKernelHost(t, ctx->Threads(), fn);
 }
-}  // namespace linalg
-}  // namespace xgboost
+
+namespace detail {
+template <typename T, std::int32_t kDim>
+struct IterOp {
+  TensorView<T, kDim> v;
+  XGBOOST_DEVICE T& operator()(std::size_t i) {
+    return detail::Apply(v, UnravelIndex(i, v.Shape()));
+  }
+};
+}  // namespace detail
+
+// naming: thrust begin
+// returns a thrust iterator for a tensor view.
+template <typename T, std::int32_t kDim>
+auto tcbegin(TensorView<T, kDim> v) {  // NOLINT
+  return dh::MakeTransformIterator<T>(
+      thrust::make_counting_iterator(0ul),
+      detail::IterOp<std::add_const_t<std::remove_const_t<T>>, kDim>{v});
+}
+
+template <typename T, std::int32_t kDim>
+auto tcend(TensorView<T, kDim> v) {  // NOLINT
+  return tcbegin(v) + v.Size();
+}
+}  // namespace xgboost::linalg
 #endif  // XGBOOST_COMMON_LINALG_OP_CUH_
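Reviewer note: `tcbegin` composes `thrust::make_counting_iterator` with a transform functor that unravels each flat index through the view's shape, so even strided views can be consumed by thrust without materializing a copy; `std::add_const_t<std::remove_const_t<T>>` pins the element type to const, hence the `c` in `tcbegin`/`tcend`. A minimal usage sketch (the helper name and include path are illustrative, not part of the PR):

```cpp
// Usage sketch, not from this PR: reduce a (possibly strided) device
// TensorView through the new const iterators; no temporary copy is made.
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>

#include <cstdint>

#include "../../src/common/linalg_op.cuh"  // assumed include path for tcbegin/tcend

template <typename T, std::int32_t kDim>
T SumView(xgboost::linalg::TensorView<T, kDim> v) {
  return thrust::reduce(thrust::device, xgboost::linalg::tcbegin(v),
                        xgboost::linalg::tcend(v), T{0});
}
```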
24 changes: 23 additions & 1 deletion tests/cpp/common/test_linalg.cu
@@ -1,8 +1,11 @@
 /**
- * Copyright 2021-2023 by XGBoost Contributors
+ * Copyright 2021-2024, XGBoost Contributors
  */
 #include <gtest/gtest.h>
+#include <thrust/equal.h>     // for equal
+#include <thrust/sequence.h>  // for sequence
 
+#include "../../../src/common/cuda_context.cuh"
 #include "../../../src/common/linalg_op.cuh"
 #include "../helpers.h"
 #include "xgboost/context.h"
@@ -85,4 +88,23 @@ void TestSlice() {
 TEST(Linalg, GPUElementWise) { TestElementWiseKernel(); }
 
 TEST(Linalg, GPUTensorView) { TestSlice(); }
+
+TEST(Linalg, GPUIter) {
+  auto ctx = MakeCUDACtx(1);
+  auto cuctx = ctx.CUDACtx();
+
+  dh::device_vector<double> data(2 * 3 * 4);
+  thrust::sequence(cuctx->CTP(), data.begin(), data.end(), 1.0);
+
+  auto t = MakeTensorView(&ctx, dh::ToSpan(data), 2, 3, 4);
+  static_assert(!std::is_const_v<decltype(t)::element_type>);
+  static_assert(!std::is_const_v<decltype(t)::value_type>);
+
+  auto n = std::distance(linalg::tcbegin(t), linalg::tcend(t));
+  ASSERT_EQ(n, t.Size());
+  ASSERT_FALSE(t.Empty());
+
+  bool eq = thrust::equal(cuctx->CTP(), data.cbegin(), data.cend(), linalg::tcbegin(t));
+  ASSERT_TRUE(eq);
+}
 }  // namespace xgboost::linalg
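Note on the test: `thrust::equal` can compare the raw buffer against the view here because the view is built directly over the contiguous sequence, so the unraveled row-major traversal visits elements in flat order; `MakeCUDACtx(1)` targets device ordinal 1, and `CTP()` supplies the context's thrust execution policy.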