Cuda: Allocate using the correct device (kokkos#6392)
* Cuda: Allocate using the correct device

* Avoid warning about uninitialized variable

* exec_space_provided -> stream_sync_only

* Fix up ASYNC support

* Only check for errors in synchronization if allocation was successful

* Rename arguments and make constructor private

---------

Co-authored-by: Daniel Arndt <arndtd@ornl.com>
masterleinad and Daniel Arndt committed on Jan 3, 2024
Parent: 02b46c0 · Commit: 4078a0d
Showing 4 changed files with 103 additions and 70 deletions.
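Taken together, the changes route every allocation through the device and stream stored in the memory space. A minimal standalone sketch of that flow, with the size-threshold logic of the real code omitted (allocate_on_device is an illustrative name, not Kokkos API):

// Illustrative sketch only: a trimmed-down version of the allocation path
// this commit introduces; allocate_on_device is not a Kokkos function.
#include <cuda_runtime.h>
#include <cstddef>
#include <stdexcept>

void *allocate_on_device(int device_id, [[maybe_unused]] cudaStream_t stream,
                         std::size_t bytes,
                         [[maybe_unused]] bool stream_sync_only) {
  void *ptr = nullptr;
  // The core fix: switch to the device this space was created for before
  // touching the allocator.
  cudaSetDevice(device_id);

  cudaError_t error_code = cudaSuccess;
#if defined(CUDART_VERSION) && CUDART_VERSION >= 11020
  // Async path: allocate on the given stream, and synchronize only if the
  // allocation actually succeeded.
  error_code = cudaMallocAsync(&ptr, bytes, stream);
  if (error_code == cudaSuccess) {
    if (stream_sync_only)
      cudaStreamSynchronize(stream);  // an execution space was provided
    else
      cudaDeviceSynchronize();        // no execution space: fence the device
  }
#else
  error_code = cudaMalloc(&ptr, bytes);
#endif

  if (error_code != cudaSuccess) {
    cudaGetLastError();  // clear the sticky error before raising an exception
    throw std::runtime_error("CUDA allocation failed");
  }
  return ptr;
}

The real impl_allocate_common additionally falls back to plain cudaMalloc for allocations below memory_threshold_g, and only takes the async path when KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC is enabled and CUDART_VERSION >= 11020.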
core/src/Cuda/Kokkos_CudaSpace.cpp (109 changes: 54 additions, 55 deletions)
@@ -135,11 +135,23 @@ void kokkos_impl_cuda_set_pin_uvm_to_host(bool val) {

namespace Kokkos {

CudaSpace::CudaSpace() : m_device(Kokkos::Cuda().cuda_device()) {}

CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {}

CudaHostPinnedSpace::CudaHostPinnedSpace() {}
CudaSpace::CudaSpace()
: m_device(Kokkos::Cuda().cuda_device()),
m_stream(Kokkos::Cuda().cuda_stream()) {}
CudaSpace::CudaSpace(int device_id, cudaStream_t stream)
: m_device(device_id), m_stream(stream) {}

CudaUVMSpace::CudaUVMSpace()
: m_device(Kokkos::Cuda().cuda_device()),
m_stream(Kokkos::Cuda().cuda_stream()) {}
CudaUVMSpace::CudaUVMSpace(int device_id, cudaStream_t stream)
: m_device(device_id), m_stream(stream) {}

CudaHostPinnedSpace::CudaHostPinnedSpace()
: m_device(Kokkos::Cuda().cuda_device()),
m_stream(Kokkos::Cuda().cuda_stream()) {}
CudaHostPinnedSpace::CudaHostPinnedSpace(int device_id, cudaStream_t stream)
: m_device(device_id), m_stream(stream) {}

size_t memory_threshold_g = 40000; // 40 kB

@@ -161,52 +173,38 @@ void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size,
}

namespace {
void *impl_allocate_common(const Cuda &exec_space, const char *arg_label,
const size_t arg_alloc_size,
void *impl_allocate_common(const int device_id,
[[maybe_unused]] const cudaStream_t stream,
const char *arg_label, const size_t arg_alloc_size,
const size_t arg_logical_size,
const Kokkos::Tools::SpaceHandle arg_handle,
bool exec_space_provided) {
[[maybe_unused]] bool stream_sync_only) {
void *ptr = nullptr;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(device_id));

cudaError_t error_code = cudaSuccess;
#ifndef CUDART_VERSION
#error CUDART_VERSION undefined!
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
cudaError_t error_code;
if (arg_alloc_size >= memory_threshold_g) {
if (exec_space_provided) {
error_code =
exec_space.impl_internal_space_instance()->cuda_malloc_async_wrapper(
&ptr, arg_alloc_size);
exec_space.fence("Kokkos::Cuda: backend fence after async malloc");
} else {
error_code = Impl::CudaInternal::singleton().cuda_malloc_async_wrapper(
&ptr, arg_alloc_size);
Impl::cuda_device_synchronize(
"Kokkos::Cuda: backend fence after async malloc");
error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream);

if (error_code == cudaSuccess) {
if (stream_sync_only) {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
} else {
Impl::cuda_device_synchronize(
"Kokkos::Cuda: backend fence after async malloc");
}
}
} else {
error_code =
(exec_space_provided
? exec_space.impl_internal_space_instance()->cuda_malloc_wrapper(
&ptr, arg_alloc_size)
: Impl::CudaInternal::singleton().cuda_malloc_wrapper(
&ptr, arg_alloc_size));
}
#else
cudaError_t error_code;
if (exec_space_provided) {
error_code = exec_space.impl_internal_space_instance()->cuda_malloc_wrapper(
&ptr, arg_alloc_size);
} else {
error_code = Impl::CudaInternal::singleton().cuda_malloc_wrapper(
&ptr, arg_alloc_size);
}
} else
#endif
{ error_code = cudaMalloc(&ptr, arg_alloc_size); }
if (error_code != cudaSuccess) { // TODO tag as unlikely branch
// This is the only way to clear the last error, which
// we should do here since we're turning it into an
// exception here
exec_space.impl_internal_space_instance()->cuda_get_last_error_wrapper();
cudaGetLastError();
throw Experimental::CudaRawMemoryAllocationFailure(
arg_alloc_size, error_code,
Experimental::RawMemoryAllocationFailure::AllocationMechanism::
@@ -226,16 +224,17 @@ void *CudaSpace::impl_allocate(
const char *arg_label, const size_t arg_alloc_size,
const size_t arg_logical_size,
const Kokkos::Tools::SpaceHandle arg_handle) const {
return impl_allocate_common(Kokkos::Cuda{}, arg_label, arg_alloc_size,
return impl_allocate_common(m_device, m_stream, arg_label, arg_alloc_size,
arg_logical_size, arg_handle, false);
}

void *CudaSpace::impl_allocate(
const Cuda &exec_space, const char *arg_label, const size_t arg_alloc_size,
const size_t arg_logical_size,
const Kokkos::Tools::SpaceHandle arg_handle) const {
return impl_allocate_common(exec_space, arg_label, arg_alloc_size,
arg_logical_size, arg_handle, true);
return impl_allocate_common(
exec_space.cuda_device(), exec_space.cuda_stream(), arg_label,
arg_alloc_size, arg_logical_size, arg_handle, true);
}

void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const {
@@ -256,28 +255,27 @@ void *CudaUVMSpace::impl_allocate(
if (arg_alloc_size > 0) {
Kokkos::Impl::num_uvm_allocations++;

auto error_code =
Impl::CudaInternal::singleton().cuda_malloc_managed_wrapper(
&ptr, arg_alloc_size, cudaMemAttachGlobal);

#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host())
KOKKOS_IMPL_CUDA_SAFE_CALL(
(Impl::CudaInternal::singleton().cuda_mem_advise_wrapper(
ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation,
cudaCpuDeviceId)));
#endif
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device));
cudaError_t error_code =
cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal);

if (error_code != cudaSuccess) { // TODO tag as unlikely branch
// This is the only way to clear the last error, which
// we should do here since we're turning it into an
// exception here
Impl::CudaInternal::singleton().cuda_get_last_error_wrapper();
cudaGetLastError();
throw Experimental::CudaRawMemoryAllocationFailure(
arg_alloc_size, error_code,
Experimental::RawMemoryAllocationFailure::AllocationMechanism::
CudaMallocManaged);
}

#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host())
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaMemAdvise(ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation,
cudaCpuDeviceId));
#endif
}
Cuda::impl_static_fence(
"Kokkos::CudaUVMSpace::impl_allocate: Post UVM Allocation");
@@ -302,13 +300,14 @@ void *CudaHostPinnedSpace::impl_allocate(
const Kokkos::Tools::SpaceHandle arg_handle) const {
void *ptr = nullptr;

auto error_code = Impl::CudaInternal::singleton().cuda_host_alloc_wrapper(
&ptr, arg_alloc_size, cudaHostAllocDefault);
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device));
cudaError_t error_code =
cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault);
if (error_code != cudaSuccess) { // TODO tag as unlikely branch
// This is the only way to clear the last error, which
// we should do here since we're turning it into an
// exception here
Impl::CudaInternal::singleton().cuda_get_last_error_wrapper();
cudaGetLastError();
throw Experimental::CudaRawMemoryAllocationFailure(
arg_alloc_size, error_code,
Experimental::RawMemoryAllocationFailure::AllocationMechanism::
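The CudaUVMSpace and CudaHostPinnedSpace paths above follow the same pattern: select the space's device, call the plain runtime allocator, and clear the sticky CUDA error before throwing. A self-contained sketch of the managed-memory variant (illustrative names; the pin-to-host advice is shown as a plain boolean instead of the KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST macro):

// Sketch of the UVM allocation path (illustrative, not Kokkos code).
#include <cuda_runtime.h>
#include <cstddef>
#include <stdexcept>

void *allocate_managed(int device_id, std::size_t bytes, bool pin_to_host) {
  void *ptr = nullptr;
  cudaSetDevice(device_id);  // allocate against the space's device

  cudaError_t error_code =
      cudaMallocManaged(&ptr, bytes, cudaMemAttachGlobal);
  if (error_code != cudaSuccess) {
    cudaGetLastError();  // clear the error state before throwing
    throw std::runtime_error("cudaMallocManaged failed");
  }

  // As in the diff, the memory advice is applied only after a successful
  // allocation.
  if (pin_to_host)
    cudaMemAdvise(ptr, bytes, cudaMemAdviseSetPreferredLocation,
                  cudaCpuDeviceId);
  return ptr;
}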
core/src/Cuda/Kokkos_CudaSpace.hpp (36 changes: 34 additions, 2 deletions)
@@ -68,6 +68,11 @@ class CudaSpace {
/*--------------------------------*/

CudaSpace();

private:
CudaSpace(int device_id, cudaStream_t stream);

public:
CudaSpace(CudaSpace&& rhs) = default;
CudaSpace(const CudaSpace& rhs) = default;
CudaSpace& operator=(CudaSpace&& rhs) = default;
@@ -89,6 +94,10 @@ class CudaSpace {
const size_t arg_alloc_size,
const size_t arg_logical_size = 0) const;

static CudaSpace impl_create(int device_id, cudaStream_t stream) {
return CudaSpace(device_id, stream);
}

private:
void* impl_allocate(const Cuda& exec_space, const char* arg_label,
const size_t arg_alloc_size,
@@ -110,7 +119,8 @@ class CudaSpace {
static constexpr const char* name() { return m_name; }

private:
int m_device; ///< Which Cuda device
int m_device;
cudaStream_t m_stream;

static constexpr const char* m_name = "Cuda";
friend class Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
@@ -147,6 +157,11 @@ class CudaUVMSpace {
/*--------------------------------*/

CudaUVMSpace();

private:
CudaUVMSpace(int device_id, cudaStream_t stream);

public:
CudaUVMSpace(CudaUVMSpace&& rhs) = default;
CudaUVMSpace(const CudaUVMSpace& rhs) = default;
CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default;
@@ -185,8 +200,13 @@ class CudaUVMSpace {
#endif
/*--------------------------------*/

static CudaUVMSpace impl_create(int device_id, cudaStream_t stream) {
return CudaUVMSpace(device_id, stream);
}

private:
int m_device; ///< Which Cuda device
int m_device;
cudaStream_t m_stream;

#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
static bool kokkos_impl_cuda_pin_uvm_to_host_v;
@@ -219,6 +239,11 @@ class CudaHostPinnedSpace {
/*--------------------------------*/

CudaHostPinnedSpace();

private:
CudaHostPinnedSpace(int device_id, cudaStream_t stream);

public:
CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default;
CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default;
CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default;
@@ -236,6 +261,10 @@ class CudaHostPinnedSpace {
const size_t arg_alloc_size,
const size_t arg_logical_size = 0) const;

static CudaHostPinnedSpace impl_create(int device_id, cudaStream_t stream) {
return CudaHostPinnedSpace(device_id, stream);
}

private:
void* impl_allocate(const char* arg_label, const size_t arg_alloc_size,
const size_t arg_logical_size = 0,
@@ -252,6 +281,9 @@ class CudaHostPinnedSpace {
static constexpr const char* name() { return m_name; }

private:
int m_device;
cudaStream_t m_stream;

static constexpr const char* m_name = "CudaHostPinned";

/*--------------------------------*/
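The header changes apply one pattern to each space: the default constructor stays public, the (device, stream) constructor becomes private, and internal callers reach it through a static impl_create factory. A toy standalone version of that pattern (ToySpace is illustrative, not Kokkos code):

// Toy version of the constructor/factory pattern used for the Cuda spaces.
#include <cuda_runtime.h>

class ToySpace {
 public:
  // Public default constructor: bind to whatever device is current.
  ToySpace() : m_stream(nullptr) { cudaGetDevice(&m_device); }

  // Internal factory: lets backend-internal code construct a space for an
  // explicit device/stream pair without making that constructor public.
  static ToySpace impl_create(int device_id, cudaStream_t stream) {
    return ToySpace(device_id, stream);
  }

  int device() const { return m_device; }
  cudaStream_t stream() const { return m_stream; }

 private:
  // Private: user code cannot construct a space from raw handles directly.
  ToySpace(int device_id, cudaStream_t stream)
      : m_device(device_id), m_stream(stream) {}

  int m_device;
  cudaStream_t m_stream;
};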
core/src/Cuda/Kokkos_Cuda_Instance.cpp (26 changes: 14 additions, 12 deletions)
@@ -26,10 +26,10 @@

#include <Kokkos_Core.hpp>

#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
#include <Cuda/Kokkos_Cuda_Instance.hpp>
#include <Cuda/Kokkos_Cuda_UniqueToken.hpp>
//#include <Cuda/Kokkos_Cuda_Error.hpp>
//#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
//#include <Cuda/Kokkos_Cuda_Instance.hpp>
//#include <Cuda/Kokkos_Cuda_UniqueToken.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Tools.hpp>
#include <impl/Kokkos_CheckedIntegerOps.hpp>
@@ -454,8 +454,9 @@ Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const {

std::size_t alloc_size =
multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain);
Record *const r = Record::allocate(
Kokkos::CudaSpace(), "Kokkos::InternalScratchFlags", alloc_size);
Record *const r =
Record::allocate(CudaSpace::impl_create(m_cudaDev, m_stream),
"Kokkos::InternalScratchFlags", alloc_size);

Record::increment(r);

@@ -480,8 +481,9 @@ Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const {

std::size_t alloc_size =
multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain);
Record *const r = Record::allocate(
Kokkos::CudaSpace(), "Kokkos::InternalScratchSpace", alloc_size);
Record *const r =
Record::allocate(CudaSpace::impl_create(m_cudaDev, m_stream),
"Kokkos::InternalScratchSpace", alloc_size);

Record::increment(r);

@@ -505,7 +507,7 @@ Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const {
std::size_t alloc_size =
multiply_overflow_abort(m_scratchUnifiedCount, sizeScratchGrain);
Record *const r =
Record::allocate(Kokkos::CudaHostPinnedSpace(),
Record::allocate(CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream),
"Kokkos::InternalScratchUnified", alloc_size);

Record::increment(r);
@@ -526,9 +528,9 @@ Cuda::size_type *CudaInternal::scratch_functor(const std::size_t size) const {
if (m_scratchFunctor)
Record::decrement(Record::get_record(m_scratchFunctor));

Record *const r =
Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFunctor",
m_scratchFunctorSize);
Record *const r = Record::allocate(
CudaSpace::impl_create(m_cudaDev, m_stream),
"Kokkos::InternalScratchFunctor", m_scratchFunctorSize);

Record::increment(r);

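Routing the internal scratch allocations through CudaSpace::impl_create(m_cudaDev, m_stream) means they land on the instance's device even when another device happens to be current. A small standalone demonstration of that effect with the plain CUDA runtime (treating device 1 as the instance's device is an assumption for illustration):

// Standalone illustration (not Kokkos code): the allocation ends up on the
// intended device regardless of which device was current beforehand.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  int count = 0;
  cudaGetDeviceCount(&count);
  const int target = (count > 1) ? 1 : 0;  // stand-in for the instance's device

  cudaSetDevice(0);  // some other device happens to be current

  // What the new code does, in spirit: switch to the instance's device first.
  cudaSetDevice(target);
  void *scratch = nullptr;
  cudaMalloc(&scratch, 1 << 20);

  cudaPointerAttributes attr{};
  cudaPointerGetAttributes(&attr, scratch);
  std::printf("scratch lives on device %d (expected %d)\n", attr.device,
              target);

  cudaFree(scratch);
  return 0;
}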
core/src/Cuda/Kokkos_Cuda_Instance.hpp (2 changes: 1 addition, 1 deletion)
@@ -22,7 +22,7 @@
#include <atomic>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <cuda_runtime_api.h>

#include "Kokkos_CudaSpace.hpp"
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// These functions fulfill the purpose of allowing to work around
