Skip to content

Commit

Permalink
Add option to enable CUDA unified memory architectures
Browse files Browse the repository at this point in the history
This is in support of Grace Hopper making, CudaSpace host accessible.
I also added an emulation mode to run on other CUDA architectures,
by making the cudaMalloc wrapper call cudaMallocManaged.

Kokkos_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY is the option

A new macro KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY will be defined
if both Grace and Hopper are enabled.
  • Loading branch information
crtrott committed Mar 4, 2024
1 parent 7899413 commit f24357e
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 5 deletions.
3 changes: 3 additions & 0 deletions cmake/KokkosCore_config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA // deprecated
#cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR
#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC
#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY
#cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
#cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS
#cmakedefine KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH
Expand All @@ -63,6 +65,7 @@
#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX
#cmakedefine KOKKOS_ARCH_ARMV81
#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2
#cmakedefine KOKKOS_ARCH_ARMV9_GRACE
#cmakedefine KOKKOS_ARCH_A64FX
#cmakedefine KOKKOS_ARCH_AVX
#cmakedefine KOKKOS_ARCH_AVX2
Expand Down
3 changes: 3 additions & 0 deletions cmake/kokkos_enable_options.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda
# resolved but we keep the option around a bit longer to be safe.
KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)")
KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler")
KOKKOS_ENABLE_OPTION(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA")
KOKKOS_ENABLE_OPTION(IMPL_CUDA_EMULATE_UNIFIED_MEMORY OFF "Whether to emulate unified memory architectures for CUDA")

KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" )
KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" )
KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP")
Expand Down
34 changes: 30 additions & 4 deletions core/src/Cuda/Kokkos_CudaSpace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,24 @@ void *impl_allocate_common(const int device_id,
cudaError_t error_code = cudaSuccess;
#ifndef CUDART_VERSION
#error CUDART_VERSION undefined!
#elif defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY)
// This is inteded to simulate Grace-Hopper like behavior
error_code = cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal);
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
// This is intended for Grace-Hopper (and future unified memory architectures)
// The idea is to use host allocator and then adivce to keep it in HBM on
// device, but that requires CUDA 12.2
static_assert(CUDART_VERSION >= 12020);
ptr = malloc(arg_alloc_size);
// TODO: what error do we want to throw here if it fails, i.e. ptr == nullptr?
// One would thing cudaMemLocation{device_id, cudaMemLocationTypeDeivce} would
// work but it doesn't. I.e. the order of members doesn't seem to be defined.
cudaMemLocation loc;
loc.id = device_id;
loc.type = cudaMemLocationTypeDevice;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemAdvise_v2(
ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, loc));
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
if (arg_alloc_size >= memory_threshold_g) {
error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream);
Expand All @@ -197,8 +215,10 @@ void *impl_allocate_common(const int device_id,
}
}
} else
#endif
#else
{ error_code = cudaMalloc(&ptr, arg_alloc_size); }
#endif

if (error_code != cudaSuccess) { // TODO tag as unlikely branch
// This is the only way to clear the last error, which
// we should do here since we're turning it into an
Expand Down Expand Up @@ -344,6 +364,11 @@ void CudaSpace::impl_deallocate(
try {
#ifndef CUDART_VERSION
#error CUDART_VERSION undefined!
#elif defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY)
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
free(arg_alloc_ptr);
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
if (arg_alloc_size >= memory_threshold_g) {
Impl::cuda_device_synchronize(
Expand All @@ -356,9 +381,6 @@ void CudaSpace::impl_deallocate(
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
}
#else
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
#endif
} catch (...) {
}
Expand Down Expand Up @@ -463,8 +485,12 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,

#include <impl/Kokkos_SharedAlloc_timpl.hpp>

#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
Kokkos::CudaSpace);
#else
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::CudaSpace);
#endif
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
Kokkos::CudaUVMSpace);
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
Expand Down
23 changes: 22 additions & 1 deletion core/src/Cuda/Kokkos_CudaSpace.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,19 @@ class CudaSpace {
void* allocate(const char* arg_label, const size_t arg_alloc_size,
const size_t arg_logical_size = 0) const;

#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
template <typename ExecutionSpace>
void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const {
return allocate(arg_alloc_size);
}
template <typename ExecutionSpace>
void* allocate(const ExecutionSpace&, const char* arg_label,
const size_t arg_alloc_size,
const size_t arg_logical_size = 0) const {
return allocate(arg_label, arg_alloc_size, arg_logical_size);
}
#endif

/**\brief Deallocate untracked memory in the cuda space */
void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const;
void deallocate(const char* arg_label, void* const arg_alloc_ptr,
Expand Down Expand Up @@ -337,7 +350,11 @@ static_assert(
template <>
struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaSpace> {
enum : bool { assignable = false };
enum : bool { accessible = false };
#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
enum : bool{accessible = false};
#else
enum : bool { accessible = true };
#endif
enum : bool { deepcopy = true };
};

Expand Down Expand Up @@ -558,8 +575,12 @@ struct DeepCopy<HostSpace, MemSpace, ExecutionSpace,
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION(
Kokkos::CudaSpace);
#else
KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaSpace);
#endif
KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaUVMSpace);
KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaHostPinnedSpace);

Expand Down
6 changes: 6 additions & 0 deletions core/src/Cuda/Kokkos_Cuda_Instance.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,13 @@ class CudaInternal {
template <bool setCudaDevice = true>
cudaError_t cuda_malloc_wrapper(void** devPtr, size_t size) const {
if constexpr (setCudaDevice) set_cuda_device();
#if !defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY)
return cudaMalloc(devPtr, size);
#else
auto ptr = cudaMallocManaged(devPtr, size, cudaMemAttachGlobal);
cudaDeviceSynchronize();
return ptr;
#endif
}

template <bool setCudaDevice = true>
Expand Down
9 changes: 9 additions & 0 deletions core/src/Kokkos_Macros.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,15 @@ static constexpr bool kokkos_omp_on_host() { return false; }
#define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
#endif

#if defined(KOKKOS_ENABLE_IMPL_CUDA_EMULATE_UNIFIED_MEMORY)
#define KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
#endif

// TODO: enable the following when we are sure it is the right thing to do
//#if defined(KOKKOS_ARCH_ARMV9_GRACE) && defined(KOKKOS_ARCH_HOPPER90)
//#define KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
//#endif

#define KOKKOS_INVALID_INDEX (~std::size_t(0))

#define KOKKOS_IMPL_CTOR_DEFAULT_ARG KOKKOS_INVALID_INDEX
Expand Down

0 comments on commit f24357e

Please sign in to comment.