From 4cc00eb12e10e37c2f9ae0c6d50e0ea3c4b9394a Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Thu, 29 Sep 2022 15:54:39 +0200
Subject: [PATCH 1/3] Add a function to compute the intersection of N-dimensional alpaka::Vec

---
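A minimal usage sketch of the new helpers, ahead of the patch itself: the intersection of two N-dimensional half-open ranges is the element-wise maximum of the lower bounds together with the element-wise minimum of the upper bounds. The intersect() function and the values in the comment are illustrative only; Vec3D is the alias from HeterogeneousCore/AlpakaInterface/interface/config.h.

// Illustrative sketch, not part of the patch.
#include <utility>

#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/vec.h"

// intersection of the ranges [lo1, hi1) and [lo2, hi2), taken independently in each dimension;
// the result is empty in any dimension where the returned lower bound is >= the upper bound
std::pair<alpaka_common::Vec3D, alpaka_common::Vec3D> intersect(alpaka_common::Vec3D const& lo1,
                                                                alpaka_common::Vec3D const& hi1,
                                                                alpaka_common::Vec3D const& lo2,
                                                                alpaka_common::Vec3D const& hi2) {
  return {alpaka::elementwise_max(lo1, lo2), alpaka::elementwise_min(hi1, hi2)};
}

// e.g. intersect({0, 0, 0}, {128, 256, 32}, {10, 80, 5}, {200, 300, 64})
// returns {10, 80, 5} and {128, 256, 32}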
 .../AlpakaInterface/interface/vec.h           | 42 +++++++++++++++++++
 .../AlpakaInterface/test/BuildFile.xml        |  6 +++
 .../AlpakaInterface/test/alpaka/testVec.cc    | 25 +++++++++++
 3 files changed, 73 insertions(+)
 create mode 100644 HeterogeneousCore/AlpakaInterface/interface/vec.h
 create mode 100644 HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
 create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testVec.cc

diff --git a/HeterogeneousCore/AlpakaInterface/interface/vec.h b/HeterogeneousCore/AlpakaInterface/interface/vec.h
new file mode 100644
index 0000000000000..4126eecf78cf2
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/interface/vec.h
@@ -0,0 +1,42 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_vec_h
+#define HeterogeneousCore_AlpakaInterface_interface_vec_h
+
+#include <algorithm>
+
+#include <alpaka/alpaka.hpp>
+
+namespace alpaka {
+
+  //! \return The element-wise minimum of one or more vectors.
+  ALPAKA_NO_HOST_ACC_WARNING
+  template <typename TDim,
+            typename TVal,
+            typename... Vecs,
+            typename = std::enable_if_t<(std::is_same_v<Vec<TDim, TVal>, Vecs> && ...)>>
+  ALPAKA_FN_HOST_ACC constexpr auto elementwise_min(Vec<TDim, TVal> const& p, Vecs const&... qs) -> Vec<TDim, TVal> {
+    Vec<TDim, TVal> r;
+    if constexpr (TDim::value > 0) {
+      for (typename TDim::value_type i = 0; i < TDim::value; ++i)
+        r[i] = std::min({p[i], qs[i]...});
+    }
+    return r;
+  }
+
+  //! \return The element-wise maximum of one or more vectors.
+  ALPAKA_NO_HOST_ACC_WARNING
+  template <typename TDim,
+            typename TVal,
+            typename... Vecs,
+            typename = std::enable_if_t<(std::is_same_v<Vec<TDim, TVal>, Vecs> && ...)>>
+  ALPAKA_FN_HOST_ACC constexpr auto elementwise_max(Vec<TDim, TVal> const& p, Vecs const&... qs) -> Vec<TDim, TVal> {
+    Vec<TDim, TVal> r;
+    if constexpr (TDim::value > 0) {
+      for (typename TDim::value_type i = 0; i < TDim::value; ++i)
+        r[i] = std::max({p[i], qs[i]...});
+    }
+    return r;
+  }
+
+}  // namespace alpaka
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_vec_h
diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
new file mode 100644
index 0000000000000..c8d214ff53436
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testVec.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testVec.cc
new file mode 100644
index 0000000000000..0cae50ba0e5e5
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testVec.cc
@@ -0,0 +1,25 @@
+#include <alpaka/alpaka.hpp>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/vec.h"
+
+static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestVec) "]";
+
+TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestVec), s_tag) {
+  SECTION("Vec1D") {
+    alpaka_common::Vec1D s1 = 100;
+    alpaka_common::Vec1D s2 = 42;
+    alpaka_common::Vec1D si = 42;
+    REQUIRE(alpaka::elementwise_min(s1, s2) == si);
+  }
+
+  SECTION("Vec3D") {
+    alpaka_common::Vec3D v1{100, 10, 10};
+    alpaka_common::Vec3D v2{42, 42, 1};
+    alpaka_common::Vec3D vi{42, 10, 1};
+    REQUIRE(alpaka::elementwise_min(v1, v2) == vi);
+  }
+}

From 305982010011f920c1352cb72970ccb6622acbff Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Mon, 26 Sep 2022 20:56:49 +0200
Subject: [PATCH 2/3] Add helper functions for alpaka kernels

Add helper functions for:
- performing an integer division with rounding up, or rounding a value to
  the next multiple of an integer;
- creating an efficient work division, assigning the number of work items
  either to the number of GPU threads or CPU elements;
- performing a scalar or N-dimensional strided loop within a kernel, based
  on the grid size and number of elements.
---
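To make the new helpers concrete, a small worked example; the numbers are illustrative only, and Acc1D is assumed to be the 1-dimensional accelerator alias of the active backend, as defined in HeterogeneousCore/AlpakaInterface/interface/config.h.

// Illustrative sketch, not part of the patch.
// With 1000 elements and 64 items per group:
//   divide_up_by(1000, 64) == 16    // groups needed to cover all 1000 elements
//   round_up_by(1000, 64)  == 1024  // 1000 rounded up to the next multiple of 64
//
// make_workdiv<Acc1D>(16, 64) then maps the same request to the active backend:
//   on a GPU backend: 16 blocks x 64 threads per block x 1 element per thread
//   on a CPU backend: 16 blocks x 1 thread per block x 64 elements per thread
// A strided loop inside the kernel (elements_with_stride below) visits the
// indices 0..999 exactly once with either mapping.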
 .../AlpakaInterface/interface/workdivision.h  | 266 ++++++++++++++++++
 1 file changed, 266 insertions(+)
 create mode 100644 HeterogeneousCore/AlpakaInterface/interface/workdivision.h

diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
new file mode 100644
index 0000000000000..dad09ed1774e6
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
@@ -0,0 +1,266 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_workdivision_h
+#define HeterogeneousCore_AlpakaInterface_interface_workdivision_h
+
+#include <algorithm>
+
+#include <alpaka/alpaka.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/vec.h"
+
+namespace cms::alpakatools {
+
+  using namespace alpaka_common;
+
+  // If the first argument is not a multiple of the second argument, round it up to the next multiple
+  inline constexpr Idx round_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor * divisor; }
+
+  // Return the integer division of the first argument by the second argument, rounded up to the next integer
+  inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; }
+
+  // Create an accelerator-dependent work division for 1-dimensional kernels
+  template <typename TAcc, typename = std::enable_if_t<is_accelerator_v<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+  inline WorkDiv<Dim1D> make_workdiv(Idx blocks, Idx elements) {
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim1D, Idx>>) {
+      // On GPU backends, each thread is looking at a single element:
+      //   - the number of threads per block is "elements";
+      //   - the number of elements per thread is always 1.
+      return WorkDiv<Dim1D>(blocks, elements, Idx{1});
+    } else
+#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim1D, Idx>>) {
+      // On GPU backends, each thread is looking at a single element:
+      //   - the number of threads per block is "elements";
+      //   - the number of elements per thread is always 1.
+      return WorkDiv<Dim1D>(blocks, elements, Idx{1});
+    } else
+#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
+    {
+      // On CPU backends, run serially with a single thread per block:
+      //   - the number of threads per block is always 1;
+      //   - the number of elements per thread is "elements".
+      return WorkDiv<Dim1D>(blocks, Idx{1}, elements);
+    }
+  }
+
+  // Create the accelerator-dependent workdiv for N-dimensional kernels
+  template <typename TAcc, typename = std::enable_if_t<is_accelerator_v<TAcc>>>
+  inline WorkDiv<alpaka::Dim<TAcc>> make_workdiv(const Vec<alpaka::Dim<TAcc>>& blocks,
+                                                 const Vec<alpaka::Dim<TAcc>>& elements) {
+    using Dim = alpaka::Dim<TAcc>;
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim, Idx>>) {
+      // On GPU backends, each thread is looking at a single element:
+      //   - the number of threads per block is "elements";
+      //   - the number of elements per thread is always 1.
+      return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
+    } else
+#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim, Idx>>) {
+      // On GPU backends, each thread is looking at a single element:
+      //   - the number of threads per block is "elements";
+      //   - the number of elements per thread is always 1.
+      return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
+    } else
+#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
+    {
+      // On CPU backends, run serially with a single thread per block:
+      //   - the number of threads per block is always 1;
+      //   - the number of elements per thread is "elements".
+      return WorkDiv<Dim>(blocks, Vec<Dim>::ones(), elements);
+    }
+  }
+
+  template <typename TAcc, typename = std::enable_if_t<is_accelerator_v<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+  class elements_with_stride {
+  public:
+    ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc)
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
+          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
+          extent_{stride_} {}
+
+    ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx extent)
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
+          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
+          extent_{extent} {}
+
+    class iterator {
+      friend class elements_with_stride;
+
+      ALPAKA_FN_ACC inline iterator(Idx elements, Idx stride, Idx extent, Idx first)
+          : elements_{elements},
+            stride_{stride},
+            extent_{extent},
+            first_{std::min(first, extent)},
+            index_{first_},
+            last_{std::min(first + elements, extent)} {}
+
+    public:
+      ALPAKA_FN_ACC inline Idx operator*() const { return index_; }
+
+      // pre-increment the iterator
+      ALPAKA_FN_ACC inline iterator& operator++() {
+        // increment the index along the elements processed by the current thread
+        ++index_;
+        if (index_ < last_)
+          return *this;
+
+        // increment the thread index with the grid stride
+        first_ += stride_;
+        index_ = first_;
+        last_ = std::min(first_ + elements_, extent_);
+        if (index_ < extent_)
+          return *this;
+
+        // the iterator has reached or passed the end of the extent, clamp it to the extent
+        first_ = extent_;
+        index_ = extent_;
+        last_ = extent_;
+        return *this;
+      }
+
+      // post-increment the iterator
+      ALPAKA_FN_ACC inline iterator operator++(int) {
+        iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
+        return (index_ == other.index_) and (first_ == other.first_);
+      }
+
+      ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
+
+    private:
+      // non-const to support iterator copy and assignment
+      Idx elements_;
+      Idx stride_;
+      Idx extent_;
+      // modified by the pre/post-increment operator
+      Idx first_;
+      Idx index_;
+      Idx last_;
+    };
+
+    ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }
+
+    ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }
+
+  private:
+    const Idx elements_;
+    const Idx first_;
+    const Idx stride_;
+    const Idx extent_;
+  };
+
+  template <typename TAcc, typename = std::enable_if_t<is_accelerator_v<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+  class elements_with_stride_nd {
+  public:
+    using Dim = alpaka::Dim<TAcc>;
+    using Vec = alpaka::Vec<Dim, Idx>;
+
+    ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc)
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
+          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
+          extent_{stride_} {}
+
+    ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc, Vec extent)
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
+          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
+          extent_{extent} {}
+
+    class iterator {
+      friend class elements_with_stride_nd;
+      constexpr static const auto last_dimension = Dim::value - 1;
+
+      ALPAKA_FN_ACC inline iterator(Vec elements, Vec stride, Vec extent, Vec first)
+          : elements_{elements},
+            stride_{stride},
+            extent_{extent},
+            first_{alpaka::elementwise_min(first, extent)},
+            index_{first_},
+            last_{std::min(first[last_dimension] + elements[last_dimension], extent[last_dimension])} {}
+
+    public:
+      ALPAKA_FN_ACC inline Vec operator*() const { return index_; }
+
+      // pre-increment the iterator
+      ALPAKA_FN_ACC inline iterator& operator++() {
+        // increment the index along the elements processed by the current thread
+        ++index_[last_dimension];
+        if (index_[last_dimension] < last_)
+          return *this;
+
+        // increment the thread index along the last dimension with the grid stride
+        first_[last_dimension] += stride_[last_dimension];
+        index_[last_dimension] = first_[last_dimension];
+        last_ = std::min(first_[last_dimension] + elements_[last_dimension], extent_[last_dimension]);
+        if (index_[last_dimension] < extent_[last_dimension])
+          return *this;
+
+        // increment the thread index along the outer dimensions with the grid stride
+        if constexpr (last_dimension > 0)
+          for (int dimension = last_dimension - 1; dimension >= 0; --dimension) {
+            first_[dimension] += stride_[dimension];
+            index_[dimension] = first_[dimension];
+            if (index_[dimension] < extent_[dimension])
+              return *this;
+          }
+
+        // the iterator has reached or passed the end of the extent, clamp it to the extent
+        first_ = extent_;
+        index_ = extent_;
+        last_ = extent_[last_dimension];
+        return *this;
+      }
+
+      // post-increment the iterator
+      ALPAKA_FN_ACC inline iterator operator++(int) {
+        iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
+        return (index_ == other.index_) and (first_ == other.first_);
+      }
+
+      ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
+
+    private:
+      // non-const to support iterator copy and assignment
+      Vec elements_;
+      Vec stride_;
+      Vec extent_;
+      // modified by the pre/post-increment operator
+      Vec first_;
+      Vec index_;
+      Idx last_;
+    };
+
+    ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }
+
+    ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }
+
+  private:
+    const Vec elements_;
+    const Vec first_;
+    const Vec stride_;
+    const Vec extent_;
+  };
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_workdivision_h

From 355d9d3fbfa7ff2139223a96463196e5de92f007 Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Tue, 27 Sep 2022 20:17:50 +0200
Subject: [PATCH 3/3] Update the alpaka tests to use the new helper functions

---
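To see what the updated TestAlgo::fill() below asks for, consider a collection of 400 elements; the size is an arbitrary example, not a value taken from the test.

// Illustrative sketch, not part of the patch.
// With collection->metadata().size() == 400 and items == 64:
//   groups = divide_up_by(400, 64) == 7
//   make_workdiv<Acc1D>(7, 64) yields
//     on a GPU backend: 7 blocks x 64 threads x 1 element   (448 threads in total)
//     on a CPU backend: 7 blocks x 1 thread  x 64 elements  (run serially)
// In both cases elements_with_stride(acc, 400) clamps the per-thread loop to the
// extent, so the 48 extra slots of the last group simply do no work.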
 .../AlpakaTest/plugins/alpaka/TestAlgo.dev.cc | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc
index e39a49c9e8dbc..cd054102fe71d 100644
--- a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc
+++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc
@@ -8,37 +8,45 @@
 #include "DataFormats/PortableTestObjects/interface/alpaka/TestDeviceCollection.h"
 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
 #include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
 #include "TestAlgo.h"
 
 namespace ALPAKA_ACCELERATOR_NAMESPACE {
 
+  using namespace cms::alpakatools;
+
   class TestAlgoKernel {
   public:
-    template <typename TAcc, typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc>>>
+    template <typename TAcc, typename = std::enable_if_t<is_accelerator_v<TAcc>>>
     ALPAKA_FN_ACC void operator()(TAcc const& acc, portabletest::TestDeviceCollection::View view, int32_t size) const {
-      // this example accepts an arbitrary number of blocks and threads, and always uses 1 element per thread
+      // global index of the thread within the grid
       const int32_t thread = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
-      const int32_t stride = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u];
       const portabletest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};
 
+      // set this only once in the whole kernel grid
       if (thread == 0) {
         view.r() = 1.;
       }
-      for (auto i = thread; i < size; i += stride) {
+
+      // make a strided loop over the kernel grid, covering up to "size" elements
+      for (int32_t i : elements_with_stride(acc, size)) {
         view[i] = {0., 0., 0., i, matrix * i};
       }
     }
   };
 
   void TestAlgo::fill(Queue& queue, portabletest::TestDeviceCollection& collection) const {
-    auto const& deviceProperties = alpaka::getAccDevProps<Acc1D>(alpaka::getDev(queue));
-    uint32_t maxThreadsPerBlock = deviceProperties.m_blockThreadExtentMax[0];
+    // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
+    uint32_t items = 64;
 
-    uint32_t threadsPerBlock = maxThreadsPerBlock;
-    uint32_t blocksPerGrid = (collection->metadata().size() + threadsPerBlock - 1) / threadsPerBlock;
-    uint32_t elementsPerThread = 1;
-    auto workDiv = WorkDiv1D{blocksPerGrid, threadsPerBlock, elementsPerThread};
+    // use as many groups as needed to cover the whole problem
+    uint32_t groups = divide_up_by(collection->metadata().size(), items);
+
+    // map items to
+    //   - threads with a single element per thread on a GPU backend
+    //   - elements within a single thread on a CPU backend
+    auto workDiv = make_workdiv<Acc1D>(groups, items);
 
     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernel{}, collection.view(), collection->metadata().size());
   }
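The patches only exercise the 1-dimensional helpers; as a closing illustration, here is a sketch of how the N-dimensional variants could be combined in a 2-dimensional kernel. The Transpose2DKernel, the transpose() wrapper, the buffer layout and the sizes are made up for illustration; Vec2D, Acc2D and Queue are assumed to be the usual aliases from HeterogeneousCore/AlpakaInterface/interface/config.h and the backend namespace.

// Illustrative sketch, not part of the patches.
#include <alpaka/alpaka.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"

using namespace ALPAKA_ACCELERATOR_NAMESPACE;  // assumed to provide Queue and Acc2D for the chosen backend
using alpaka_common::Vec2D;

struct Transpose2DKernel {
  template <typename TAcc, typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc>>>
  ALPAKA_FN_ACC void operator()(TAcc const& acc, float const* in, float* out, Vec2D extent) const {
    // visit every (row, column) index covered by the grid, clipped to "extent"
    for (auto index : cms::alpakatools::elements_with_stride_nd(acc, extent)) {
      auto row = index[0];
      auto col = index[1];
      out[col * extent[0] + row] = in[row * extent[1] + col];
    }
  }
};

void transpose(Queue& queue, float const* in, float* out) {
  // cover a 300 x 200 problem with 32 x 32 items per group
  Vec2D extent{300, 200};
  Vec2D items{32, 32};
  Vec2D groups{cms::alpakatools::divide_up_by(extent[0], items[0]),
               cms::alpakatools::divide_up_by(extent[1], items[1])};
  // 10 x 7 groups; on a GPU each thread handles one index, on a CPU each thread handles a 32 x 32 tile
  auto workDiv = cms::alpakatools::make_workdiv<Acc2D>(groups, items);
  alpaka::exec<Acc2D>(queue, workDiv, Transpose2DKernel{}, in, out, extent);
}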