From 4cc00eb12e10e37c2f9ae0c6d50e0ea3c4b9394a Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Thu, 29 Sep 2022 15:54:39 +0200
Subject: [PATCH 1/3] Add a function to compute the intersection of N-dimensional alpaka::Vec

---
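A minimal usage sketch of the new helpers, ahead of the patch itself: the intersection of two N-dimensional half-open ranges is the element-wise maximum of the lower bounds together with the element-wise minimum of the upper bounds. The intersect() function and the values in the comment are illustrative only; Vec3D is the alias from HeterogeneousCore/AlpakaInterface/interface/config.h.

// Illustrative sketch, not part of the patch.
#include <utility>

#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/vec.h"

// intersection of the ranges [lo1, hi1) and [lo2, hi2), taken independently in each dimension;
// the result is empty in any dimension where the returned lower bound is >= the upper bound
std::pair<alpaka_common::Vec3D, alpaka_common::Vec3D> intersect(alpaka_common::Vec3D const& lo1,
                                                                alpaka_common::Vec3D const& hi1,
                                                                alpaka_common::Vec3D const& lo2,
                                                                alpaka_common::Vec3D const& hi2) {
  return {alpaka::elementwise_max(lo1, lo2), alpaka::elementwise_min(hi1, hi2)};
}

// e.g. intersect({0, 0, 0}, {128, 256, 32}, {10, 80, 5}, {200, 300, 64})
// returns {10, 80, 5} and {128, 256, 32}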
 .../AlpakaInterface/interface/vec.h           | 42 +++++++++++++++++++
 .../AlpakaInterface/test/BuildFile.xml        |  6 +++
 .../AlpakaInterface/test/alpaka/testVec.cc    | 25 +++++++++++
 3 files changed, 73 insertions(+)
 create mode 100644 HeterogeneousCore/AlpakaInterface/interface/vec.h
 create mode 100644 HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
 create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testVec.cc

diff --git a/HeterogeneousCore/AlpakaInterface/interface/vec.h b/HeterogeneousCore/AlpakaInterface/interface/vec.h
new file mode 100644
index 0000000000000..4126eecf78cf2
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/interface/vec.h
@@ -0,0 +1,42 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_vec_h
+#define HeterogeneousCore_AlpakaInterface_interface_vec_h
+
+#include <algorithm>
+
+#include <alpaka/alpaka.hpp>
+
+namespace alpaka {
+
+  //! \return The element-wise minimum of one or more vectors.
+  ALPAKA_NO_HOST_ACC_WARNING
+  template <typename TDim,
+            typename TVal,
+            typename... Vecs,
+            typename = std::enable_if_t<(std::is_same_v<Vec<TDim, TVal>, Vecs> && ...)>>
+  ALPAKA_FN_HOST_ACC constexpr auto elementwise_min(Vec<TDim, TVal> const& p, Vecs const&... qs) -> Vec<TDim, TVal> {
+    Vec<TDim, TVal> r;
+    if constexpr (TDim::value > 0) {
+      for (typename TDim::value_type i = 0; i < TDim::value; ++i)
+        r[i] = std::min({p[i], qs[i]...});
+    }
+    return r;
+  }
+
+  //! \return The element-wise maximum of one or more vectors.
+  ALPAKA_NO_HOST_ACC_WARNING
+  template <typename TDim,
+            typename TVal,
+            typename... Vecs,
+            typename = std::enable_if_t<(std::is_same_v<Vec<TDim, TVal>, Vecs> && ...)>>
+  ALPAKA_FN_HOST_ACC constexpr auto elementwise_max(Vec<TDim, TVal> const& p, Vecs const&... qs) -> Vec<TDim, TVal> {
+    Vec<TDim, TVal> r;
+    if constexpr (TDim::value > 0) {
+      for (typename TDim::value_type i = 0; i < TDim::value; ++i)
+        r[i] = std::max({p[i], qs[i]...});
+    }
+    return r;
+  }
+
+}  // namespace alpaka
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_vec_h
diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
new file mode 100644
index 0000000000000..c8d214ff53436
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testVec.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testVec.cc
new file mode 100644
index 0000000000000..0cae50ba0e5e5
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testVec.cc
@@ -0,0 +1,25 @@
+#include <alpaka/alpaka.hpp>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/vec.h"
+
+static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestVec) "]";
+
+TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestVec), s_tag) {
+  SECTION("Vec1D") {
+    alpaka_common::Vec1D s1 = 100;
+    alpaka_common::Vec1D s2 = 42;
+    alpaka_common::Vec1D si = 42;
+    REQUIRE(alpaka::elementwise_min(s1, s2) == si);
+  }
+
+  SECTION("Vec3D") {
+    alpaka_common::Vec3D v1{100, 10, 10};
+    alpaka_common::Vec3D v2{42, 42, 1};
+    alpaka_common::Vec3D vi{42, 10, 1};
+    REQUIRE(alpaka::elementwise_min(v1, v2) == vi);
+  }
+}

From 305982010011f920c1352cb72970ccb6622acbff Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Mon, 26 Sep 2022 20:56:49 +0200
Subject: [PATCH 2/3] Add helper functions for alpaka kernels

Add helper functions for:
- performing an integer division with rounding up, or rounding a value to
  the next multiple of an integer;
- creating an efficient work division, assigning the number of work items
  either to the number of GPU threads or CPU elements;
- performing a scalar or N-dimensional strided loop within a kernel, based
  on the grid size and number of elements.
---
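To make the new helpers concrete, a small worked example; the numbers are illustrative only, and Acc1D is assumed to be the 1-dimensional accelerator alias of the active backend, as defined in HeterogeneousCore/AlpakaInterface/interface/config.h.

// Illustrative sketch, not part of the patch.
// With 1000 elements and 64 items per group:
//   divide_up_by(1000, 64) == 16    // groups needed to cover all 1000 elements
//   round_up_by(1000, 64)  == 1024  // 1000 rounded up to the next multiple of 64
//
// make_workdiv<Acc1D>(16, 64) then maps the same request to the active backend:
//   on a GPU backend: 16 blocks x 64 threads per block x 1 element per thread
//   on a CPU backend: 16 blocks x 1 thread per block x 64 elements per thread
// A strided loop inside the kernel (elements_with_stride below) visits the
// indices 0..999 exactly once with either mapping.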
 .../AlpakaInterface/interface/workdivision.h  | 266 ++++++++++++++++++
 1 file changed, 266 insertions(+)
 create mode 100644 HeterogeneousCore/AlpakaInterface/interface/workdivision.h

diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
new file mode 100644
index 0000000000000..dad09ed1774e6
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
@@ -0,0 +1,266 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_workdivision_h
+#define HeterogeneousCore_AlpakaInterface_interface_workdivision_h
+
+#include <algorithm>
+
+#include <alpaka/alpaka.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/vec.h"
+
+namespace cms::alpakatools {
+
+  using namespace alpaka_common;
+
+  // If the first argument is not a multiple of the second argument, round it up to the next multiple
+  inline constexpr Idx round_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor * divisor; }
+
+  // Return the integer division of the first argument by the second argument, rounded up to the next integer
+  inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; }
+
+  // Create an accelerator-dependent work division for 1-dimensional kernels
+  template <typename TAcc, typename = std::enable_if_t<is_accelerator_v<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+  inline WorkDiv<Dim1D> make_workdiv(Idx blocks, Idx elements) {
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim1D, Idx>>) {
+      // On GPU backends, each thread is looking at a single element:
+      //   - the number of threads per block is "elements";
+      //   - the number of elements per thread is always 1.
+      return WorkDiv<Dim1D>(blocks, elements, Idx{1});
+    } else
+#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim1D, Idx>>) {
+      // On GPU backends, each thread is looking at a single element:
+      //   - the number of threads per block is "elements";
+      //   - the number of elements per thread is always 1.
+      return WorkDiv<Dim1D>(blocks, elements, Idx{1});
+    } else
+#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
+    {
+      // On CPU backends, run serially with a single thread per block:
+      //   - the number of threads per block is always 1;
+      //   - the number of elements per thread is "elements".
+      return WorkDiv<Dim1D>(blocks, Idx{1}, elements);
+    }
+  }
+
+  // Create the accelerator-dependent workdiv for N-dimensional kernels
+  template <typename TAcc, typename = std::enable_if_t<is_accelerator_v<TAcc>>>
+  inline WorkDiv<alpaka::Dim<TAcc>> make_workdiv(const Vec<alpaka::Dim<TAcc>>& blocks,
+                                                 const Vec<alpaka::Dim<TAcc>>& elements) {
+    using Dim = alpaka::Dim<TAcc>;
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim, Idx>>) {
+      // On GPU backends, each thread is looking at a single element:
+      //   - the number of threads per block is "elements";
+      //   - the number of elements per thread is always 1.
+      return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
+    } else
+#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+    if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim, Idx>>) {
+      // On GPU backends, each thread is looking at a single element:
+      //   - the number of threads per block is "elements";
+      //   - the number of elements per thread is always 1.
+      return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
+    } else
+#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
+    {
+      // On CPU backends, run serially with a single thread per block:
+      //   - the number of threads per block is always 1;
+      //   - the number of elements per thread is "elements".
+      return WorkDiv<Dim>(blocks, Vec<Dim>::ones(), elements);
+    }
+  }
+
+  template <typename TAcc, typename = std::enable_if_t<is_accelerator_v<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+  class elements_with_stride {
+  public:
+    ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc)
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
+          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
+          extent_{stride_} {}
+
+    ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx extent)
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
+          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
+          extent_{extent} {}
+
+    class iterator {
+      friend class elements_with_stride;
+
+      ALPAKA_FN_ACC inline iterator(Idx elements, Idx stride, Idx extent, Idx first)
+          : elements_{elements},
+            stride_{stride},
+            extent_{extent},
+            first_{std::min(first, extent)},
+            index_{first_},
+            last_{std::min(first + elements, extent)} {}
+
+    public:
+      ALPAKA_FN_ACC inline Idx operator*() const { return index_; }
+
+      // pre-increment the iterator
+      ALPAKA_FN_ACC inline iterator& operator++() {
+        // increment the index along the elements processed by the current thread
+        ++index_;
+        if (index_ < last_)
+          return *this;
+
+        // increment the thread index with the grid stride
+        first_ += stride_;
+        index_ = first_;
+        last_ = std::min(first_ + elements_, extent_);
+        if (index_ < extent_)
+          return *this;
+
+        // the iterator has reached or passed the end of the extent, clamp it to the extent
+        first_ = extent_;
+        index_ = extent_;
+        last_ = extent_;
+        return *this;
+      }
+
+      // post-increment the iterator
+      ALPAKA_FN_ACC inline iterator operator++(int) {
+        iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
+        return (index_ == other.index_) and (first_ == other.first_);
+      }
+
+      ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
+
+    private:
+      // non-const to support iterator copy and assignment
+      Idx elements_;
+      Idx stride_;
+      Idx extent_;
+      // modified by the pre/post-increment operator
+      Idx first_;
+      Idx index_;
+      Idx last_;
+    };
+
+    ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }
+
+    ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }
+
+  private:
+    const Idx elements_;
+    const Idx first_;
+    const Idx stride_;
+    const Idx extent_;
+  };
+
+  template <typename TAcc, typename = std::enable_if_t<is_accelerator_v<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+  class elements_with_stride_nd {
+  public:
+    using Dim = alpaka::Dim<TAcc>;
+    using Vec = alpaka::Vec<Dim, Idx>;
+
+    ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc)
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
+          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
+          extent_{stride_} {}
+
+    ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc, Vec extent)
+        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
+          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
+          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
+          extent_{extent} {}
+
+    class iterator {
+      friend class elements_with_stride_nd;
+      constexpr static const auto last_dimension = Dim::value - 1;
+
+      ALPAKA_FN_ACC inline iterator(Vec elements, Vec stride, Vec extent, Vec first)
+          : elements_{elements},
+            stride_{stride},
+            extent_{extent},
+            first_{alpaka::elementwise_min(first, extent)},
+            index_{first_},
+            last_{std::min(first[last_dimension] + elements[last_dimension], extent[last_dimension])} {}
+
+    public:
+      ALPAKA_FN_ACC inline Vec operator*() const { return index_; }
+
+      // pre-increment the iterator
+      ALPAKA_FN_ACC inline iterator& operator++() {
+        // increment the index along the elements processed by the current thread
+        ++index_[last_dimension];
+        if (index_[last_dimension] < last_)
+          return *this;
+
+        // increment the thread index along the last dimension with the grid stride
+        first_[last_dimension] += stride_[last_dimension];
+        index_[last_dimension] = first_[last_dimension];
+        last_ = std::min(first_[last_dimension] + elements_[last_dimension], extent_[last_dimension]);
+        if (index_[last_dimension] < extent_[last_dimension])
+          return *this;
+
+        // increment the thread index along the outer dimensions with the grid stride
+        if constexpr (last_dimension > 0)
+          for (int dimension = last_dimension - 1; dimension >= 0; --dimension) {
+            first_[dimension] += stride_[dimension];
+            index_[dimension] = first_[dimension];
+            if (index_[dimension] < extent_[dimension])
+              return *this;
+          }
+
+        // the iterator has reached or passed the end of the extent, clamp it to the extent
+        first_ = extent_;
+        index_ = extent_;
+        last_ = extent_[last_dimension];
+        return *this;
+      }
+
+      // post-increment the iterator
+      ALPAKA_FN_ACC inline iterator operator++(int) {
+        iterator old = *this;
+        ++(*this);
+        return old;
+      }
+
+      ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
+        return (index_ == other.index_) and (first_ == other.first_);
+      }
+
+      ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
+
+    private:
+      // non-const to support iterator copy and assignment
+      Vec elements_;
+      Vec stride_;
+      Vec extent_;
+      // modified by the pre/post-increment operator
+      Vec first_;
+      Vec index_;
+      Idx last_;
+    };
+
+    ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }
+
+    ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }
+
+  private:
+    const Vec elements_;
+    const Vec first_;
+    const Vec stride_;
+    const Vec extent_;
+  };
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_workdivision_h

From 355d9d3fbfa7ff2139223a96463196e5de92f007 Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Tue, 27 Sep 2022 20:17:50 +0200
Subject: [PATCH 3/3] Update the alpaka tests to use the new helper functions

---
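To see what the updated TestAlgo::fill() below asks for, consider a collection of 400 elements; the size is an arbitrary example, not a value taken from the test.

// Illustrative sketch, not part of the patch.
// With collection->metadata().size() == 400 and items == 64:
//   groups = divide_up_by(400, 64) == 7
//   make_workdiv<Acc1D>(7, 64) yields
//     on a GPU backend: 7 blocks x 64 threads x 1 element   (448 threads in total)
//     on a CPU backend: 7 blocks x 1 thread  x 64 elements  (run serially)
// In both cases elements_with_stride(acc, 400) clamps the per-thread loop to the
// extent, so the 48 extra slots of the last group simply do no work.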
 .../AlpakaTest/plugins/alpaka/TestAlgo.dev.cc | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc
index e39a49c9e8dbc..cd054102fe71d 100644
--- a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc
+++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc
@@ -8,37 +8,45 @@
 #include "DataFormats/PortableTestObjects/interface/alpaka/TestDeviceCollection.h"
 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
 #include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
 #include "TestAlgo.h"
 
 namespace ALPAKA_ACCELERATOR_NAMESPACE {
 
+  using namespace cms::alpakatools;
+
   class TestAlgoKernel {
   public:
-    template <typename TAcc, typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc>>>
+    template <typename TAcc, typename = std::enable_if_t<is_accelerator_v<TAcc>>>
     ALPAKA_FN_ACC void operator()(TAcc const& acc, portabletest::TestDeviceCollection::View view, int32_t size) const {
-      // this example accepts an arbitrary number of blocks and threads, and always uses 1 element per thread
+      // global index of the thread within the grid
       const int32_t thread = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
-      const int32_t stride = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u];
       const portabletest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};
 
+      // set this only once in the whole kernel grid
       if (thread == 0) {
         view.r() = 1.;
       }
-      for (auto i = thread; i < size; i += stride) {
+
+      // make a strided loop over the kernel grid, covering up to "size" elements
+      for (int32_t i : elements_with_stride(acc, size)) {
         view[i] = {0., 0., 0., i, matrix * i};
       }
     }
   };
 
   void TestAlgo::fill(Queue& queue, portabletest::TestDeviceCollection& collection) const {
-    auto const& deviceProperties = alpaka::getAccDevProps<Acc1D>(alpaka::getDev(queue));
-    uint32_t maxThreadsPerBlock = deviceProperties.m_blockThreadExtentMax[0];
+    // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
+    uint32_t items = 64;
 
-    uint32_t threadsPerBlock = maxThreadsPerBlock;
-    uint32_t blocksPerGrid = (collection->metadata().size() + threadsPerBlock - 1) / threadsPerBlock;
-    uint32_t elementsPerThread = 1;
-    auto workDiv = WorkDiv1D{blocksPerGrid, threadsPerBlock, elementsPerThread};
+    // use as many groups as needed to cover the whole problem
+    uint32_t groups = divide_up_by(collection->metadata().size(), items);
+
+    // map items to
+    //   - threads with a single element per thread on a GPU backend
+    //   - elements within a single thread on a CPU backend
+    auto workDiv = make_workdiv<Acc1D>(groups, items);
 
     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernel{}, collection.view(), collection->metadata().size());
   }
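The patches only exercise the 1-dimensional helpers; as a closing illustration, here is a sketch of how the N-dimensional variants could be combined in a 2-dimensional kernel. The Transpose2DKernel, the transpose() wrapper, the buffer layout and the sizes are made up for illustration; Vec2D, Acc2D and Queue are assumed to be the usual aliases from HeterogeneousCore/AlpakaInterface/interface/config.h and the backend namespace.

// Illustrative sketch, not part of the patches.
#include <alpaka/alpaka.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"

using namespace ALPAKA_ACCELERATOR_NAMESPACE;  // assumed to provide Queue and Acc2D for the chosen backend
using alpaka_common::Vec2D;

struct Transpose2DKernel {
  template <typename TAcc, typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc>>>
  ALPAKA_FN_ACC void operator()(TAcc const& acc, float const* in, float* out, Vec2D extent) const {
    // visit every (row, column) index covered by the grid, clipped to "extent"
    for (auto index : cms::alpakatools::elements_with_stride_nd(acc, extent)) {
      auto row = index[0];
      auto col = index[1];
      out[col * extent[0] + row] = in[row * extent[1] + col];
    }
  }
};

void transpose(Queue& queue, float const* in, float* out) {
  // cover a 300 x 200 problem with 32 x 32 items per group
  Vec2D extent{300, 200};
  Vec2D items{32, 32};
  Vec2D groups{cms::alpakatools::divide_up_by(extent[0], items[0]),
               cms::alpakatools::divide_up_by(extent[1], items[1])};
  // 10 x 7 groups; on a GPU each thread handles one index, on a CPU each thread handles a 32 x 32 tile
  auto workDiv = cms::alpakatools::make_workdiv<Acc2D>(groups, items);
  alpaka::exec<Acc2D>(queue, workDiv, Transpose2DKernel{}, in, out, extent);
}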