Cuda: Allocate using the correct device (kokkos#6392)
* Cuda: Allocate using the correct device

* Avoid warning about uninitialized variable

* exec_space_provided -> stream_sync_only

* Fix up ASYNC support

* Only check for errors in synchronization if allocation was successful

* Rename arguments and make constructor private

---------

Co-authored-by: Daniel Arndt <arndtd@ornl.com>
masterleinad and Daniel Arndt committed on Jan 3, 2024
Parent: 02b46c0 · Commit: 4078a0d
Showing 4 changed files with 103 additions and 70 deletions.
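Taken together, the changes route every allocation through the device and stream stored in the memory space. A minimal standalone sketch of that flow, with the size-threshold logic of the real code omitted (allocate_on_device is an illustrative name, not Kokkos API):

// Illustrative sketch only: a trimmed-down version of the allocation path
// this commit introduces; allocate_on_device is not a Kokkos function.
#include <cuda_runtime.h>
#include <cstddef>
#include <stdexcept>

void *allocate_on_device(int device_id, [[maybe_unused]] cudaStream_t stream,
                         std::size_t bytes,
                         [[maybe_unused]] bool stream_sync_only) {
  void *ptr = nullptr;
  // The core fix: switch to the device this space was created for before
  // touching the allocator.
  cudaSetDevice(device_id);

  cudaError_t error_code = cudaSuccess;
#if defined(CUDART_VERSION) && CUDART_VERSION >= 11020
  // Async path: allocate on the given stream, and synchronize only if the
  // allocation actually succeeded.
  error_code = cudaMallocAsync(&ptr, bytes, stream);
  if (error_code == cudaSuccess) {
    if (stream_sync_only)
      cudaStreamSynchronize(stream);  // an execution space was provided
    else
      cudaDeviceSynchronize();        // no execution space: fence the device
  }
#else
  error_code = cudaMalloc(&ptr, bytes);
#endif

  if (error_code != cudaSuccess) {
    cudaGetLastError();  // clear the sticky error before raising an exception
    throw std::runtime_error("CUDA allocation failed");
  }
  return ptr;
}

The real impl_allocate_common additionally falls back to plain cudaMalloc for allocations below memory_threshold_g, and only takes the async path when KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC is enabled and CUDART_VERSION >= 11020.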
core/src/Cuda/Kokkos_CudaSpace.cpp (109 changes: 54 additions, 55 deletions)
@@ -135,11 +135,23 @@ void kokkos_impl_cuda_set_pin_uvm_to_host(bool val) {

namespace Kokkos {

CudaSpace::CudaSpace() : m_device(Kokkos::Cuda().cuda_device()) {}

CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {}

CudaHostPinnedSpace::CudaHostPinnedSpace() {}
CudaSpace::CudaSpace()
: m_device(Kokkos::Cuda().cuda_device()),
m_stream(Kokkos::Cuda().cuda_stream()) {}
CudaSpace::CudaSpace(int device_id, cudaStream_t stream)
: m_device(device_id), m_stream(stream) {}

CudaUVMSpace::CudaUVMSpace()
: m_device(Kokkos::Cuda().cuda_device()),
m_stream(Kokkos::Cuda().cuda_stream()) {}
CudaUVMSpace::CudaUVMSpace(int device_id, cudaStream_t stream)
: m_device(device_id), m_stream(stream) {}

CudaHostPinnedSpace::CudaHostPinnedSpace()
: m_device(Kokkos::Cuda().cuda_device()),
m_stream(Kokkos::Cuda().cuda_stream()) {}
CudaHostPinnedSpace::CudaHostPinnedSpace(int device_id, cudaStream_t stream)
: m_device(device_id), m_stream(stream) {}

size_t memory_threshold_g = 40000; // 40 kB

@@ -161,52 +173,38 @@ void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size,
}

namespace {
void *impl_allocate_common(const Cuda &exec_space, const char *arg_label,
const size_t arg_alloc_size,
void *impl_allocate_common(const int device_id,
[[maybe_unused]] const cudaStream_t stream,
const char *arg_label, const size_t arg_alloc_size,
const size_t arg_logical_size,
const Kokkos::Tools::SpaceHandle arg_handle,
bool exec_space_provided) {
[[maybe_unused]] bool stream_sync_only) {
void *ptr = nullptr;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(device_id));

cudaError_t error_code = cudaSuccess;
#ifndef CUDART_VERSION
#error CUDART_VERSION undefined!
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
cudaError_t error_code;
if (arg_alloc_size >= memory_threshold_g) {
if (exec_space_provided) {
error_code =
exec_space.impl_internal_space_instance()->cuda_malloc_async_wrapper(
&ptr, arg_alloc_size);
exec_space.fence("Kokkos::Cuda: backend fence after async malloc");
} else {
error_code = Impl::CudaInternal::singleton().cuda_malloc_async_wrapper(
&ptr, arg_alloc_size);
Impl::cuda_device_synchronize(
"Kokkos::Cuda: backend fence after async malloc");
error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream);

if (error_code == cudaSuccess) {
if (stream_sync_only) {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
} else {
Impl::cuda_device_synchronize(
"Kokkos::Cuda: backend fence after async malloc");
}
}
} else {
error_code =
(exec_space_provided
? exec_space.impl_internal_space_instance()->cuda_malloc_wrapper(
&ptr, arg_alloc_size)
: Impl::CudaInternal::singleton().cuda_malloc_wrapper(
&ptr, arg_alloc_size));
}
#else
cudaError_t error_code;
if (exec_space_provided) {
error_code = exec_space.impl_internal_space_instance()->cuda_malloc_wrapper(
&ptr, arg_alloc_size);
} else {
error_code = Impl::CudaInternal::singleton().cuda_malloc_wrapper(
&ptr, arg_alloc_size);
}
} else
#endif
{ error_code = cudaMalloc(&ptr, arg_alloc_size); }
if (error_code != cudaSuccess) { // TODO tag as unlikely branch
// This is the only way to clear the last error, which
// we should do here since we're turning it into an
// exception here
exec_space.impl_internal_space_instance()->cuda_get_last_error_wrapper();
cudaGetLastError();
throw Experimental::CudaRawMemoryAllocationFailure(
arg_alloc_size, error_code,
Experimental::RawMemoryAllocationFailure::AllocationMechanism::
@@ -226,16 +224,17 @@ void *CudaSpace::impl_allocate(
const char *arg_label, const size_t arg_alloc_size,
const size_t arg_logical_size,
const Kokkos::Tools::SpaceHandle arg_handle) const {
return impl_allocate_common(Kokkos::Cuda{}, arg_label, arg_alloc_size,
return impl_allocate_common(m_device, m_stream, arg_label, arg_alloc_size,
arg_logical_size, arg_handle, false);
}

void *CudaSpace::impl_allocate(
const Cuda &exec_space, const char *arg_label, const size_t arg_alloc_size,
const size_t arg_logical_size,
const Kokkos::Tools::SpaceHandle arg_handle) const {
return impl_allocate_common(exec_space, arg_label, arg_alloc_size,
arg_logical_size, arg_handle, true);
return impl_allocate_common(
exec_space.cuda_device(), exec_space.cuda_stream(), arg_label,
arg_alloc_size, arg_logical_size, arg_handle, true);
}

void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const {
@@ -256,28 +255,27 @@ void *CudaUVMSpace::impl_allocate(
if (arg_alloc_size > 0) {
Kokkos::Impl::num_uvm_allocations++;

auto error_code =
Impl::CudaInternal::singleton().cuda_malloc_managed_wrapper(
&ptr, arg_alloc_size, cudaMemAttachGlobal);

#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host())
KOKKOS_IMPL_CUDA_SAFE_CALL(
(Impl::CudaInternal::singleton().cuda_mem_advise_wrapper(
ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation,
cudaCpuDeviceId)));
#endif
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device));
cudaError_t error_code =
cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal);

if (error_code != cudaSuccess) { // TODO tag as unlikely branch
// This is the only way to clear the last error, which
// we should do here since we're turning it into an
// exception here
Impl::CudaInternal::singleton().cuda_get_last_error_wrapper();
cudaGetLastError();
throw Experimental::CudaRawMemoryAllocationFailure(
arg_alloc_size, error_code,
Experimental::RawMemoryAllocationFailure::AllocationMechanism::
CudaMallocManaged);
}

#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host())
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaMemAdvise(ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation,
cudaCpuDeviceId));
#endif
}
Cuda::impl_static_fence(
"Kokkos::CudaUVMSpace::impl_allocate: Post UVM Allocation");
@@ -302,13 +300,14 @@ void *CudaHostPinnedSpace::impl_allocate(
const Kokkos::Tools::SpaceHandle arg_handle) const {
void *ptr = nullptr;

auto error_code = Impl::CudaInternal::singleton().cuda_host_alloc_wrapper(
&ptr, arg_alloc_size, cudaHostAllocDefault);
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device));
cudaError_t error_code =
cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault);
if (error_code != cudaSuccess) { // TODO tag as unlikely branch
// This is the only way to clear the last error, which
// we should do here since we're turning it into an
// exception here
Impl::CudaInternal::singleton().cuda_get_last_error_wrapper();
cudaGetLastError();
throw Experimental::CudaRawMemoryAllocationFailure(
arg_alloc_size, error_code,
Experimental::RawMemoryAllocationFailure::AllocationMechanism::
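The CudaUVMSpace and CudaHostPinnedSpace paths above follow the same pattern: select the space's device, call the plain runtime allocator, and clear the sticky CUDA error before throwing. A self-contained sketch of the managed-memory variant (illustrative names; the pin-to-host advice is shown as a plain boolean instead of the KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST macro):

// Sketch of the UVM allocation path (illustrative, not Kokkos code).
#include <cuda_runtime.h>
#include <cstddef>
#include <stdexcept>

void *allocate_managed(int device_id, std::size_t bytes, bool pin_to_host) {
  void *ptr = nullptr;
  cudaSetDevice(device_id);  // allocate against the space's device

  cudaError_t error_code =
      cudaMallocManaged(&ptr, bytes, cudaMemAttachGlobal);
  if (error_code != cudaSuccess) {
    cudaGetLastError();  // clear the error state before throwing
    throw std::runtime_error("cudaMallocManaged failed");
  }

  // As in the diff, the memory advice is applied only after a successful
  // allocation.
  if (pin_to_host)
    cudaMemAdvise(ptr, bytes, cudaMemAdviseSetPreferredLocation,
                  cudaCpuDeviceId);
  return ptr;
}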
core/src/Cuda/Kokkos_CudaSpace.hpp (36 changes: 34 additions, 2 deletions)
@@ -68,6 +68,11 @@ class CudaSpace {
/*--------------------------------*/

CudaSpace();

private:
CudaSpace(int device_id, cudaStream_t stream);

public:
CudaSpace(CudaSpace&& rhs) = default;
CudaSpace(const CudaSpace& rhs) = default;
CudaSpace& operator=(CudaSpace&& rhs) = default;
@@ -89,6 +94,10 @@ class CudaSpace {
const size_t arg_alloc_size,
const size_t arg_logical_size = 0) const;

static CudaSpace impl_create(int device_id, cudaStream_t stream) {
return CudaSpace(device_id, stream);
}

private:
void* impl_allocate(const Cuda& exec_space, const char* arg_label,
const size_t arg_alloc_size,
@@ -110,7 +119,8 @@ class CudaSpace {
static constexpr const char* name() { return m_name; }

private:
int m_device; ///< Which Cuda device
int m_device;
cudaStream_t m_stream;

static constexpr const char* m_name = "Cuda";
friend class Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
@@ -147,6 +157,11 @@ class CudaUVMSpace {
/*--------------------------------*/

CudaUVMSpace();

private:
CudaUVMSpace(int device_id, cudaStream_t stream);

public:
CudaUVMSpace(CudaUVMSpace&& rhs) = default;
CudaUVMSpace(const CudaUVMSpace& rhs) = default;
CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default;
@@ -185,8 +200,13 @@ class CudaUVMSpace {
#endif
/*--------------------------------*/

static CudaUVMSpace impl_create(int device_id, cudaStream_t stream) {
return CudaUVMSpace(device_id, stream);
}

private:
int m_device; ///< Which Cuda device
int m_device;
cudaStream_t m_stream;

#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
static bool kokkos_impl_cuda_pin_uvm_to_host_v;
@@ -219,6 +239,11 @@ class CudaHostPinnedSpace {
/*--------------------------------*/

CudaHostPinnedSpace();

private:
CudaHostPinnedSpace(int device_id, cudaStream_t stream);

public:
CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default;
CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default;
CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default;
@@ -236,6 +261,10 @@ class CudaHostPinnedSpace {
const size_t arg_alloc_size,
const size_t arg_logical_size = 0) const;

static CudaHostPinnedSpace impl_create(int device_id, cudaStream_t stream) {
return CudaHostPinnedSpace(device_id, stream);
}

private:
void* impl_allocate(const char* arg_label, const size_t arg_alloc_size,
const size_t arg_logical_size = 0,
@@ -252,6 +281,9 @@ class CudaHostPinnedSpace {
static constexpr const char* name() { return m_name; }

private:
int m_device;
cudaStream_t m_stream;

static constexpr const char* m_name = "CudaHostPinned";

/*--------------------------------*/
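The header changes apply one pattern to each space: the default constructor stays public, the (device, stream) constructor becomes private, and internal callers reach it through a static impl_create factory. A toy standalone version of that pattern (ToySpace is illustrative, not Kokkos code):

// Toy version of the constructor/factory pattern used for the Cuda spaces.
#include <cuda_runtime.h>

class ToySpace {
 public:
  // Public default constructor: bind to whatever device is current.
  ToySpace() : m_stream(nullptr) { cudaGetDevice(&m_device); }

  // Internal factory: lets backend-internal code construct a space for an
  // explicit device/stream pair without making that constructor public.
  static ToySpace impl_create(int device_id, cudaStream_t stream) {
    return ToySpace(device_id, stream);
  }

  int device() const { return m_device; }
  cudaStream_t stream() const { return m_stream; }

 private:
  // Private: user code cannot construct a space from raw handles directly.
  ToySpace(int device_id, cudaStream_t stream)
      : m_device(device_id), m_stream(stream) {}

  int m_device;
  cudaStream_t m_stream;
};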
core/src/Cuda/Kokkos_Cuda_Instance.cpp (26 changes: 14 additions, 12 deletions)
@@ -26,10 +26,10 @@

#include <Kokkos_Core.hpp>

#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
#include <Cuda/Kokkos_Cuda_Instance.hpp>
#include <Cuda/Kokkos_Cuda_UniqueToken.hpp>
//#include <Cuda/Kokkos_Cuda_Error.hpp>
//#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
//#include <Cuda/Kokkos_Cuda_Instance.hpp>
//#include <Cuda/Kokkos_Cuda_UniqueToken.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Tools.hpp>
#include <impl/Kokkos_CheckedIntegerOps.hpp>
@@ -454,8 +454,9 @@ Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const {

std::size_t alloc_size =
multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain);
Record *const r = Record::allocate(
Kokkos::CudaSpace(), "Kokkos::InternalScratchFlags", alloc_size);
Record *const r =
Record::allocate(CudaSpace::impl_create(m_cudaDev, m_stream),
"Kokkos::InternalScratchFlags", alloc_size);

Record::increment(r);

@@ -480,8 +481,9 @@ Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const {

std::size_t alloc_size =
multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain);
Record *const r = Record::allocate(
Kokkos::CudaSpace(), "Kokkos::InternalScratchSpace", alloc_size);
Record *const r =
Record::allocate(CudaSpace::impl_create(m_cudaDev, m_stream),
"Kokkos::InternalScratchSpace", alloc_size);

Record::increment(r);

@@ -505,7 +507,7 @@ Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const {
std::size_t alloc_size =
multiply_overflow_abort(m_scratchUnifiedCount, sizeScratchGrain);
Record *const r =
Record::allocate(Kokkos::CudaHostPinnedSpace(),
Record::allocate(CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream),
"Kokkos::InternalScratchUnified", alloc_size);

Record::increment(r);
@@ -526,9 +528,9 @@ Cuda::size_type *CudaInternal::scratch_functor(const std::size_t size) const {
if (m_scratchFunctor)
Record::decrement(Record::get_record(m_scratchFunctor));

Record *const r =
Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFunctor",
m_scratchFunctorSize);
Record *const r = Record::allocate(
CudaSpace::impl_create(m_cudaDev, m_stream),
"Kokkos::InternalScratchFunctor", m_scratchFunctorSize);

Record::increment(r);

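Routing the internal scratch allocations through CudaSpace::impl_create(m_cudaDev, m_stream) means they land on the instance's device even when another device happens to be current. A small standalone demonstration of that effect with the plain CUDA runtime (treating device 1 as the instance's device is an assumption for illustration):

// Standalone illustration (not Kokkos code): the allocation ends up on the
// intended device regardless of which device was current beforehand.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  int count = 0;
  cudaGetDeviceCount(&count);
  const int target = (count > 1) ? 1 : 0;  // stand-in for the instance's device

  cudaSetDevice(0);  // some other device happens to be current

  // What the new code does, in spirit: switch to the instance's device first.
  cudaSetDevice(target);
  void *scratch = nullptr;
  cudaMalloc(&scratch, 1 << 20);

  cudaPointerAttributes attr{};
  cudaPointerGetAttributes(&attr, scratch);
  std::printf("scratch lives on device %d (expected %d)\n", attr.device,
              target);

  cudaFree(scratch);
  return 0;
}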
core/src/Cuda/Kokkos_Cuda_Instance.hpp (2 changes: 1 addition, 1 deletion)
@@ -22,7 +22,7 @@
#include <atomic>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <cuda_runtime_api.h>

#include "Kokkos_CudaSpace.hpp"
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// These functions fulfill the purpose of allowing to work around
