Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert CUDA tests to use Kokkos #14628

Merged
merged 7 commits into from
Jan 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 0 additions & 4 deletions include/deal.II/base/array_view.h
Original file line number Diff line number Diff line change
Expand Up @@ -575,10 +575,6 @@ inline typename ArrayView<ElementType, MemorySpaceType>::value_type &
ArrayView<ElementType, MemorySpaceType>::operator[](const std::size_t i) const
{
AssertIndexRange(i, n_elements);
Assert(
(std::is_same<MemorySpaceType, MemorySpace::Host>::value),
ExcMessage(
"Accessing elements is only allowed if the data is stored in CPU memory!"));

return *(starting_element + i);
}
Expand Down
52 changes: 24 additions & 28 deletions include/deal.II/lac/affine_constraints.templates.h
Original file line number Diff line number Diff line change
Expand Up @@ -2256,24 +2256,12 @@ namespace internal
vec.zero_out_ghost_values();
}

#ifdef DEAL_II_WITH_CUDA
template <typename Number>
__global__ void
set_zero_kernel(const size_type * constrained_dofs,
const unsigned int n_constrained_dofs,
Number * dst)
{
const unsigned int index = threadIdx.x + blockDim.x * blockIdx.x;
if (index < n_constrained_dofs)
dst[constrained_dofs[index]] = 0;
}

template <typename number>
void
set_zero_parallel(
const std::vector<size_type> & cm,
LinearAlgebra::distributed::Vector<number, MemorySpace::CUDA> &vec,
size_type shift = 0)
const std::vector<size_type> & cm,
LinearAlgebra::distributed::Vector<number, MemorySpace::Default> &vec,
size_type shift = 0)
{
Assert(shift == 0, ExcNotImplemented());
(void)shift;
Expand All @@ -2285,22 +2273,30 @@ namespace internal
constrained_local_dofs_host.push_back(
vec.get_partitioner()->global_to_local(global_index));

const int n_constraints = constrained_local_dofs_host.size();
size_type *constrained_local_dofs_device;
Utilities::CUDA::malloc(constrained_local_dofs_device, n_constraints);
Utilities::CUDA::copy_to_dev(constrained_local_dofs_host,
constrained_local_dofs_device);

const int n_blocks = 1 + (n_constraints - 1) / CUDAWrappers::block_size;
set_zero_kernel<<<n_blocks, CUDAWrappers::block_size>>>(
constrained_local_dofs_device, n_constraints, vec.get_values());
AssertCudaKernel();

Utilities::CUDA::free(constrained_local_dofs_device);
const int n_constraints = constrained_local_dofs_host.size();
Kokkos::View<size_type *, MemorySpace::Default::kokkos_space>
constrained_local_dofs_device(
Kokkos::view_alloc(Kokkos::WithoutInitializing,
"constrained_local_dofs_device"),
n_constraints);
Kokkos::deep_copy(constrained_local_dofs_device,
Kokkos::View<size_type *, Kokkos::HostSpace>(
constrained_local_dofs_host.data(),
constrained_local_dofs_host.size()));

using ExecutionSpace =
MemorySpace::Default::kokkos_space::execution_space;
ExecutionSpace exec;
auto local_values = vec.get_values();
Kokkos::parallel_for(
masterleinad marked this conversation as resolved.
Show resolved Hide resolved
"set_zero_parallel",
Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n_constraints),
KOKKOS_LAMBDA(int i) {
local_values[constrained_local_dofs_device[i]] = 0;
});

vec.zero_out_ghost_values();
}
#endif

template <class VectorType>
void
Expand Down
1 change: 0 additions & 1 deletion source/lac/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ if(DEAL_II_WITH_CUDA)
set(_separate_src
${_separate_src}
vector_memory_cuda.cc
affine_constraints_cuda.cc
)
endif()

Expand Down
28 changes: 28 additions & 0 deletions source/lac/affine_constraints.cc
Original file line number Diff line number Diff line change
Expand Up @@ -150,4 +150,32 @@ dealii::AffineConstraints<double>::distribute<
# endif
#endif

#ifndef DOXYGEN
namespace internal
{
namespace AffineConstraintsImplementation
{
template void
set_zero_all(
const std::vector<types::global_dof_index> & cm,
LinearAlgebra::distributed::Vector<float, MemorySpace::Default> &vec);

template void
set_zero_all(
const std::vector<types::global_dof_index> & cm,
LinearAlgebra::distributed::Vector<double, MemorySpace::Default> &vec);
} // namespace AffineConstraintsImplementation
} // namespace internal

template void
AffineConstraints<float>::set_zero<
LinearAlgebra::distributed::Vector<float, MemorySpace::Default>>(
LinearAlgebra::distributed::Vector<float, MemorySpace::Default> &) const;

template void
AffineConstraints<double>::set_zero<
LinearAlgebra::distributed::Vector<double, MemorySpace::Default>>(
LinearAlgebra::distributed::Vector<double, MemorySpace::Default> &) const;
#endif

DEAL_II_NAMESPACE_CLOSE
50 changes: 0 additions & 50 deletions source/lac/affine_constraints_cuda.cc

This file was deleted.

138 changes: 62 additions & 76 deletions tests/cuda/cuda_point.cc → tests/base/kokkos_point.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,84 +19,71 @@

#include "../tests.h"

template <int dim, typename Number>
__global__ void
miscellaneous_kernel(Number check[16])
{
Point<dim, Number> p_1;
check[0] = p_1.norm_square();
Point<dim, Number> p_2(Tensor<1, dim, Number>{});
check[1] = p_2.norm_square();
if (dim == 1)
{
Point<dim, Number> p(1.);
check[2] = p.norm_square();
}
if (dim == 2)
{
Point<dim, Number> p(.6, .8);
check[2] = p.norm_square();
}
if (dim == 3)
{
Point<dim, Number> p(.48, .64, .6);
check[2] = p.norm_square();
}

auto p_3 = Point<dim, Number>::unit_vector(0);
check[3] = p_3.norm_square();

auto entry_1 = p_1(0);
check[4] = entry_1;
p_1(0) = Number{1.};
check[5] = p_1.norm_square();
auto p_4 = p_1 + Tensor<1, dim, Number>{};
check[6] = p_4.norm_square();
auto p_5 = p_1 - Tensor<1, dim, Number>{};
check[7] = p_5.norm_square();
auto t_1 = p_1 - p_2;
check[8] = t_1.norm_square();
auto p_6 = -p_3;
check[9] = p_6.norm_square();
auto p_7 = p_4 / 2.;
check[10] = p_7.norm_square();
auto p_8 = p_7 * 5.;
check[11] = p_8.norm_square();

auto s_1 = p_1 * t_1;
check[12] = s_1;
auto s_2 = p_2.square();
check[13] = s_2;
auto s_3 = p_3.distance(p_5);
check[14] = s_3;
auto s_4 = p_4.distance_square(p_1);
check[15] = s_4;
}

template <int dim, typename Number>
void
test_gpu()
{
Number * check;
const unsigned int n_tests = 16;

auto cuda_error = cudaMalloc(&check, n_tests * sizeof(Number));
AssertCuda(cuda_error);

// Miscellaneous
miscellaneous_kernel<dim, Number><<<1, 1>>>(check);
// Check that the kernel was launched correctly
AssertCuda(cudaPeekAtLastError());
// Check that there was no problem during the execution of the kernel
AssertCuda(cudaDeviceSynchronize());

std::vector<Number> check_host(n_tests);

cuda_error = cudaMemcpy(check_host.data(),
check,
n_tests * sizeof(Number),
cudaMemcpyDeviceToHost);
AssertCuda(cuda_error);
Kokkos::View<Number *, MemorySpace::Default::kokkos_space> check("check",
n_tests);

using ExecutionSpace = MemorySpace::Default::kokkos_space::execution_space;
ExecutionSpace exec;
Kokkos::parallel_for(
Kokkos::RangePolicy<ExecutionSpace>(exec, 0, 1), KOKKOS_LAMBDA(int) {
Point<dim, Number> p_1;
check[0] = p_1.norm_square();
Point<dim, Number> p_2(Tensor<1, dim, Number>{});
check[1] = p_2.norm_square();
if (dim == 1)
{
Point<dim, Number> p(1.);
check[2] = p.norm_square();
}
if (dim == 2)
{
Point<dim, Number> p(.6, .8);
check[2] = p.norm_square();
}
if (dim == 3)
{
Point<dim, Number> p(.48, .64, .6);
check[2] = p.norm_square();
}

auto p_3 = Point<dim, Number>::unit_vector(0);
check[3] = p_3.norm_square();

auto entry_1 = p_1(0);
check[4] = entry_1;
p_1(0) = Number{1.};
check[5] = p_1.norm_square();
auto p_4 = p_1 + Tensor<1, dim, Number>{};
check[6] = p_4.norm_square();
auto p_5 = p_1 - Tensor<1, dim, Number>{};
check[7] = p_5.norm_square();
auto t_1 = p_1 - p_2;
check[8] = t_1.norm_square();
auto p_6 = -p_3;
check[9] = p_6.norm_square();
auto p_7 = p_4 / 2.;
check[10] = p_7.norm_square();
auto p_8 = p_7 * 5.;
check[11] = p_8.norm_square();

auto s_1 = p_1 * t_1;
check[12] = s_1;
auto s_2 = p_2.square();
check[13] = s_2;
auto s_3 = p_3.distance(p_5);
check[14] = s_3;
auto s_4 = p_4.distance_square(p_1);
check[15] = s_4;
});

auto check_host =
Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, check);

const double tolerance = 1.e-8;
AssertThrow(std::abs(check_host[0] - 0.) < tolerance, ExcInternalError());
Expand All @@ -116,9 +103,6 @@ test_gpu()
AssertThrow(std::abs(check_host[14] - 0.) < tolerance, ExcInternalError());
AssertThrow(std::abs(check_host[15] - 0.) < tolerance, ExcInternalError());

cuda_error = cudaFree(check);
AssertCuda(cuda_error);

deallog << "OK" << std::endl;
}

Expand All @@ -127,12 +111,14 @@ main()
{
initlog();

init_cuda();
Kokkos::initialize();

test_gpu<1, double>();
test_gpu<2, double>();
test_gpu<3, double>();
test_gpu<1, float>();
test_gpu<2, float>();
test_gpu<3, float>();

Kokkos::finalize();
}
File renamed without changes.