Skip to content

Commit

Permalink
multi-GPU support: Add test for all policies (kokkos#6782)
Browse files Browse the repository at this point in the history
* Cuda multi-GPU support: Test with managed and unmanaged Views

* Move check for cuda_device

* Also test copying between devices

* Refactor using StreamsAndDevices

* Don't use shared_ptr
  • Loading branch information
masterleinad committed Feb 28, 2024
1 parent bb73401 commit 16a5ebe
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 20 deletions.
2 changes: 1 addition & 1 deletion core/unit_test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -789,7 +789,7 @@ if(Kokkos_ENABLE_CUDA)
KOKKOS_ADD_EXECUTABLE_AND_TEST(
CoreUnitTest_CudaInterOpStreamsMultiGPU
SOURCES
UnitTestMain.cpp
UnitTestMainInit.cpp
cuda/TestCuda_InterOp_StreamsMultiGPU.cpp
)
KOKKOS_ADD_EXECUTABLE_AND_TEST(
Expand Down
160 changes: 141 additions & 19 deletions core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,32 +18,154 @@
#include <Test_InterOp_Streams.hpp>

namespace {
TEST(cuda, multi_gpu) {
Kokkos::initialize();

int n_devices;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&n_devices));
struct StreamsAndDevices {
std::array<cudaStream_t, 2> streams;
std::array<int, 2> devices;

KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(0));
cudaStream_t stream0;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream0));
StreamsAndDevices() {
int n_devices;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&n_devices));

KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(n_devices - 1));
cudaStream_t stream;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream));
devices = {0, n_devices - 1};
for (int i = 0; i < 2; ++i) {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(devices[i]));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&streams[i]));
}
}
StreamsAndDevices(const StreamsAndDevices &) = delete;
StreamsAndDevices &operator=(const StreamsAndDevices &) = delete;
~StreamsAndDevices() {
for (int i = 0; i < 2; ++i) {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(devices[i]));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(streams[i]));
}
}
};

std::array<TEST_EXECSPACE, 2> get_execution_spaces(
const StreamsAndDevices &streams_and_devices) {
TEST_EXECSPACE exec0(streams_and_devices.streams[0]);
TEST_EXECSPACE exec1(streams_and_devices.streams[1]);

// Must return void to use ASSERT_EQ
[&]() {
ASSERT_EQ(exec0.cuda_device(), streams_and_devices.devices[0]);
ASSERT_EQ(exec1.cuda_device(), streams_and_devices.devices[1]);
}();

return {exec0, exec1};
}

// Test Interoperability with Cuda Streams
void test_policies(TEST_EXECSPACE exec0, Kokkos::View<int *, TEST_EXECSPACE> v0,
TEST_EXECSPACE exec, Kokkos::View<int *, TEST_EXECSPACE> v) {
using MemorySpace = typename TEST_EXECSPACE::memory_space;

Kokkos::deep_copy(exec, v, 5);
Kokkos::deep_copy(exec0, v0, 5);

Kokkos::deep_copy(v, v0);

int sum;
int sum0;

Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range_0",
Kokkos::RangePolicy<TEST_EXECSPACE>(exec0, 0, 100),
Test::FunctorRange<MemorySpace>(v0));
Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range",
Kokkos::RangePolicy<TEST_EXECSPACE>(exec, 0, 100),
Test::FunctorRange<MemorySpace>(v));
Kokkos::parallel_reduce(
"Test::cuda::raw_cuda_stream::RangeReduce_0",
Kokkos::RangePolicy<TEST_EXECSPACE, Kokkos::LaunchBounds<128, 2>>(exec0,
0, 100),
Test::FunctorRangeReduce<MemorySpace>(v0), sum0);
Kokkos::parallel_reduce(
"Test::cuda::raw_cuda_stream::RangeReduce",
Kokkos::RangePolicy<TEST_EXECSPACE, Kokkos::LaunchBounds<128, 2>>(exec, 0,
100),
Test::FunctorRangeReduce<MemorySpace>(v), sum);
ASSERT_EQ(600, sum0);
ASSERT_EQ(600, sum);

Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange_0",
Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
exec0, {0, 0}, {10, 10}),
Test::FunctorMDRange<MemorySpace>(v0));
Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange",
Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(
exec, {0, 0}, {10, 10}),
Test::FunctorMDRange<MemorySpace>(v));
Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce_0",
Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>,
Kokkos::LaunchBounds<128, 2>>(
exec0, {0, 0}, {10, 10}),
Test::FunctorMDRangeReduce<MemorySpace>(v0), sum0);
Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce",
Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>,
Kokkos::LaunchBounds<128, 2>>(
exec, {0, 0}, {10, 10}),
Test::FunctorMDRangeReduce<MemorySpace>(v), sum);
ASSERT_EQ(700, sum0);
ASSERT_EQ(700, sum);

Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team_0",
Kokkos::TeamPolicy<TEST_EXECSPACE>(exec0, 10, 10),
Test::FunctorTeam<MemorySpace, TEST_EXECSPACE>(v0));
Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team",
Kokkos::TeamPolicy<TEST_EXECSPACE>(exec, 10, 10),
Test::FunctorTeam<MemorySpace, TEST_EXECSPACE>(v));
Kokkos::parallel_reduce(
"Test::cuda::raw_cuda_stream::Team_0",
Kokkos::TeamPolicy<TEST_EXECSPACE, Kokkos::LaunchBounds<128, 2>>(exec0,
10, 10),
Test::FunctorTeamReduce<MemorySpace, TEST_EXECSPACE>(v0), sum0);
Kokkos::parallel_reduce(
"Test::cuda::raw_cuda_stream::Team",
Kokkos::TeamPolicy<TEST_EXECSPACE, Kokkos::LaunchBounds<128, 2>>(exec, 10,
10),
Test::FunctorTeamReduce<MemorySpace, TEST_EXECSPACE>(v), sum);
ASSERT_EQ(800, sum0);
ASSERT_EQ(800, sum);
}

TEST(cuda_multi_gpu, managed_views) {
StreamsAndDevices streams_and_devices;
{
TEST_EXECSPACE space0(stream0);
ASSERT_EQ(space0.cuda_device(), 0);
TEST_EXECSPACE space(stream);
ASSERT_EQ(space.cuda_device(), n_devices - 1);
std::array<TEST_EXECSPACE, 2> execs =
get_execution_spaces(streams_and_devices);

Kokkos::View<int *, TEST_EXECSPACE> view0(
Kokkos::view_alloc("v0", execs[0]), 100);
Kokkos::View<int *, TEST_EXECSPACE> view(Kokkos::view_alloc("v", execs[1]),
100);

test_policies(execs[0], view0, execs[1], view);
}
Kokkos::finalize();
}

TEST(cuda_multi_gpu, unmanaged_views) {
StreamsAndDevices streams_and_devices;
{
std::array<TEST_EXECSPACE, 2> execs =
get_execution_spaces(streams_and_devices);

KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(0));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(stream0));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(execs[0].cuda_device()));
int *p0;
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaMalloc(reinterpret_cast<void **>(&p0), sizeof(int) * 100));
Kokkos::View<int *, TEST_EXECSPACE> view0(p0, 100);

KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(n_devices - 1));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(stream));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(execs[1].cuda_device()));
int *p;
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaMalloc(reinterpret_cast<void **>(&p), sizeof(int) * 100));
Kokkos::View<int *, TEST_EXECSPACE> view(p, 100);

test_policies(execs[0], view0, execs[1], view);
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p0));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p));
}
}
} // namespace

0 comments on commit 16a5ebe

Please sign in to comment.