Add option to enable CUDA unified memory architectures

This is in support of Grace Hopper making, CudaSpace host accessible. I also added an emulation mode to run on other CUDA architectures, by making the cudaMalloc wrapper call cudaMallocManaged. Kokkos_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY is the option A new macro KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY will be defined if both Grace and Hopper are enabled.
crtrott · Mar 4, 2024 · f24357e · f24357e
1 parent 7899413
commit f24357e
Show file tree

Hide file tree

Showing 6 changed files with 73 additions and 5 deletions.
diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in
@@ -37,6 +37,8 @@
 #cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA  // deprecated
 #cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR
 #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC
+#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
+#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY
 #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
 #cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS
 #cmakedefine KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH
@@ -63,6 +65,7 @@
 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX
 #cmakedefine KOKKOS_ARCH_ARMV81
 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2
+#cmakedefine KOKKOS_ARCH_ARMV9_GRACE
 #cmakedefine KOKKOS_ARCH_A64FX
 #cmakedefine KOKKOS_ARCH_AVX
 #cmakedefine KOKKOS_ARCH_AVX2

diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake
@@ -48,6 +48,9 @@ KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda
 # resolved but we keep the option around a bit longer to be safe.
 KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON  "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)")
 KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler")
+KOKKOS_ENABLE_OPTION(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA")
+KOKKOS_ENABLE_OPTION(IMPL_CUDA_EMULATE_UNIFIED_MEMORY OFF "Whether to emulate unified memory architectures for CUDA")
+
 KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4    ON "Whether code deprecated in major release 4 is available" )
 KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" )
 KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE  OFF "Whether to enable relocatable device code (RDC) for HIP")

diff --git a/core/src/Cuda/Kokkos_CudaSpace.cpp b/core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -184,6 +184,24 @@ void *impl_allocate_common(const int device_id,
   cudaError_t error_code = cudaSuccess;
 #ifndef CUDART_VERSION
 #error CUDART_VERSION undefined!
+#elif defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY)
+  // This is inteded to simulate Grace-Hopper like behavior
+  error_code = cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal);
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
+  // This is intended for Grace-Hopper (and future unified memory architectures)
+  // The idea is to use host allocator and then adivce to keep it in HBM on
+  // device, but that requires CUDA 12.2
+  static_assert(CUDART_VERSION >= 12020);
+  ptr = malloc(arg_alloc_size);
+  // TODO: what error do we want to throw here if it fails, i.e. ptr == nullptr?
+  // One would thing cudaMemLocation{device_id, cudaMemLocationTypeDeivce} would
+  // work but it doesn't. I.e. the order of members doesn't seem to be defined.
+  cudaMemLocation loc;
+  loc.id   = device_id;
+  loc.type = cudaMemLocationTypeDevice;
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemAdvise_v2(
+      ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, loc));
 #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
   if (arg_alloc_size >= memory_threshold_g) {
     error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream);
@@ -197,8 +215,10 @@ void *impl_allocate_common(const int device_id,
       }
     }
   } else
-#endif
+#else
   { error_code = cudaMalloc(&ptr, arg_alloc_size); }
+#endif
+
   if (error_code != cudaSuccess) {  // TODO tag as unlikely branch
     // This is the only way to clear the last error, which
     // we should do here since we're turning it into an
@@ -344,6 +364,11 @@ void CudaSpace::impl_deallocate(
   try {
 #ifndef CUDART_VERSION
 #error CUDART_VERSION undefined!
+#elif defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY)
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
+#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
+    free(arg_alloc_ptr);
 #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
     if (arg_alloc_size >= memory_threshold_g) {
       Impl::cuda_device_synchronize(
@@ -356,9 +381,6 @@ void CudaSpace::impl_deallocate(
       KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device));
       KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
     }
-#else
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device));
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
 #endif
   } catch (...) {
   }
@@ -463,8 +485,12 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
 
 #include <impl/Kokkos_SharedAlloc_timpl.hpp>
 
+#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
 KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
     Kokkos::CudaSpace);
+#else
+KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::CudaSpace);
+#endif
 KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
     Kokkos::CudaUVMSpace);
 KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(

diff --git a/core/src/Cuda/Kokkos_CudaSpace.hpp b/core/src/Cuda/Kokkos_CudaSpace.hpp
@@ -88,6 +88,19 @@ class CudaSpace {
   void* allocate(const char* arg_label, const size_t arg_alloc_size,
                  const size_t arg_logical_size = 0) const;
 
+#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
+  template <typename ExecutionSpace>
+  void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const {
+    return allocate(arg_alloc_size);
+  }
+  template <typename ExecutionSpace>
+  void* allocate(const ExecutionSpace&, const char* arg_label,
+                 const size_t arg_alloc_size,
+                 const size_t arg_logical_size = 0) const {
+    return allocate(arg_label, arg_alloc_size, arg_logical_size);
+  }
+#endif
+
   /**\brief  Deallocate untracked memory in the cuda space */
   void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const;
   void deallocate(const char* arg_label, void* const arg_alloc_ptr,
@@ -337,7 +350,11 @@ static_assert(
 template <>
 struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaSpace> {
   enum : bool { assignable = false };
-  enum : bool { accessible = false };
+#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
+  enum : bool{accessible = false};
+#else
+  enum : bool { accessible = true };
+#endif
   enum : bool { deepcopy = true };
 };
 
@@ -558,8 +575,12 @@ struct DeepCopy<HostSpace, MemSpace, ExecutionSpace,
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
+#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
 KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION(
     Kokkos::CudaSpace);
+#else
+KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaSpace);
+#endif
 KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaUVMSpace);
 KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaHostPinnedSpace);
 

diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -325,7 +325,13 @@ class CudaInternal {
   template <bool setCudaDevice = true>
   cudaError_t cuda_malloc_wrapper(void** devPtr, size_t size) const {
     if constexpr (setCudaDevice) set_cuda_device();
+#if !defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY)
     return cudaMalloc(devPtr, size);
+#else
+    auto ptr = cudaMallocManaged(devPtr, size, cudaMemAttachGlobal);
+    cudaDeviceSynchronize();
+    return ptr;
+#endif
   }
 
   template <bool setCudaDevice = true>

diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp
@@ -533,6 +533,15 @@ static constexpr bool kokkos_omp_on_host() { return false; }
 #define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
 #endif
 
+#if defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY)
+#define KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
+#endif
+
+// TODO: enable the following when we are sure it is the right thing to do
+//#if defined(KOKKOS_ARCH_ARMV9_GRACE) && defined(KOKKOS_ARCH_HOPPER90)
+//#define KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
+//#endif
+
 #define KOKKOS_INVALID_INDEX (~std::size_t(0))
 
 #define KOKKOS_IMPL_CTOR_DEFAULT_ARG KOKKOS_INVALID_INDEX