dealii · Rombur · Jan 24, 2023 · Dec 31, 2022 · Dec 31, 2022 · Dec 30, 2022
diff --git a/include/deal.II/base/array_view.h b/include/deal.II/base/array_view.h
@@ -575,10 +575,6 @@ inline typename ArrayView<ElementType, MemorySpaceType>::value_type &
 ArrayView<ElementType, MemorySpaceType>::operator[](const std::size_t i) const
 {
   AssertIndexRange(i, n_elements);
-  Assert(
-    (std::is_same<MemorySpaceType, MemorySpace::Host>::value),
-    ExcMessage(
-      "Accessing elements is only allowed if the data is stored in CPU memory!"));
 
   return *(starting_element + i);
 }

diff --git a/include/deal.II/lac/affine_constraints.templates.h b/include/deal.II/lac/affine_constraints.templates.h
@@ -2256,24 +2256,12 @@ namespace internal
       vec.zero_out_ghost_values();
     }
 
-#ifdef DEAL_II_WITH_CUDA
-    template <typename Number>
-    __global__ void
-    set_zero_kernel(const size_type *  constrained_dofs,
-                    const unsigned int n_constrained_dofs,
-                    Number *           dst)
-    {
-      const unsigned int index = threadIdx.x + blockDim.x * blockIdx.x;
-      if (index < n_constrained_dofs)
-        dst[constrained_dofs[index]] = 0;
-    }
-
     template <typename number>
     void
     set_zero_parallel(
-      const std::vector<size_type> &                                 cm,
-      LinearAlgebra::distributed::Vector<number, MemorySpace::CUDA> &vec,
-      size_type                                                      shift = 0)
+      const std::vector<size_type> &                                    cm,
+      LinearAlgebra::distributed::Vector<number, MemorySpace::Default> &vec,
+      size_type shift = 0)
     {
       Assert(shift == 0, ExcNotImplemented());
       (void)shift;
@@ -2285,22 +2273,30 @@ namespace internal
           constrained_local_dofs_host.push_back(
             vec.get_partitioner()->global_to_local(global_index));
 
-      const int  n_constraints = constrained_local_dofs_host.size();
-      size_type *constrained_local_dofs_device;
-      Utilities::CUDA::malloc(constrained_local_dofs_device, n_constraints);
-      Utilities::CUDA::copy_to_dev(constrained_local_dofs_host,
-                                   constrained_local_dofs_device);
-
-      const int n_blocks = 1 + (n_constraints - 1) / CUDAWrappers::block_size;
-      set_zero_kernel<<<n_blocks, CUDAWrappers::block_size>>>(
-        constrained_local_dofs_device, n_constraints, vec.get_values());
-      AssertCudaKernel();
-
-      Utilities::CUDA::free(constrained_local_dofs_device);
+      const int n_constraints = constrained_local_dofs_host.size();
+      Kokkos::View<size_type *, MemorySpace::Default::kokkos_space>
+        constrained_local_dofs_device(
+          Kokkos::view_alloc(Kokkos::WithoutInitializing,
+                             "constrained_local_dofs_device"),
+          n_constraints);
+      Kokkos::deep_copy(constrained_local_dofs_device,
+                        Kokkos::View<size_type *, Kokkos::HostSpace>(
+                          constrained_local_dofs_host.data(),
+                          constrained_local_dofs_host.size()));
+
+      using ExecutionSpace =
+        MemorySpace::Default::kokkos_space::execution_space;
+      ExecutionSpace exec;
+      auto           local_values = vec.get_values();
+      Kokkos::parallel_for(
+        "set_zero_parallel",
+        Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n_constraints),
+        KOKKOS_LAMBDA(int i) {
+          local_values[constrained_local_dofs_device[i]] = 0;
+        });
 
       vec.zero_out_ghost_values();
     }
-#endif
 
     template <class VectorType>
     void

diff --git a/source/lac/CMakeLists.txt b/source/lac/CMakeLists.txt
@@ -66,7 +66,6 @@ if(DEAL_II_WITH_CUDA)
   set(_separate_src
     ${_separate_src}
     vector_memory_cuda.cc
-    affine_constraints_cuda.cc
     )
 endif()
 

diff --git a/source/lac/affine_constraints.cc b/source/lac/affine_constraints.cc
@@ -150,4 +150,32 @@ dealii::AffineConstraints<double>::distribute<
 #  endif
 #endif
 
+#ifndef DOXYGEN
+namespace internal
+{
+  namespace AffineConstraintsImplementation
+  {
+    template void
+    set_zero_all(
+      const std::vector<types::global_dof_index> &                     cm,
+      LinearAlgebra::distributed::Vector<float, MemorySpace::Default> &vec);
+
+    template void
+    set_zero_all(
+      const std::vector<types::global_dof_index> &                      cm,
+      LinearAlgebra::distributed::Vector<double, MemorySpace::Default> &vec);
+  } // namespace AffineConstraintsImplementation
+} // namespace internal
+
+template void
+AffineConstraints<float>::set_zero<
+  LinearAlgebra::distributed::Vector<float, MemorySpace::Default>>(
+  LinearAlgebra::distributed::Vector<float, MemorySpace::Default> &) const;
+
+template void
+AffineConstraints<double>::set_zero<
+  LinearAlgebra::distributed::Vector<double, MemorySpace::Default>>(
+  LinearAlgebra::distributed::Vector<double, MemorySpace::Default> &) const;
+#endif
+
 DEAL_II_NAMESPACE_CLOSE
diff --git a/source/lac/affine_constraints_cuda.cc b/source/lac/affine_constraints_cuda.cc
diff --git a/tests/cuda/cuda_point.cc → tests/base/kokkos_point.cc b/tests/cuda/cuda_point.cc → tests/base/kokkos_point.cc
@@ -19,84 +19,71 @@
 
 #include "../tests.h"
 
-template <int dim, typename Number>
-__global__ void
-miscellaneous_kernel(Number check[16])
-{
-  Point<dim, Number> p_1;
-  check[0] = p_1.norm_square();
-  Point<dim, Number> p_2(Tensor<1, dim, Number>{});
-  check[1] = p_2.norm_square();
-  if (dim == 1)
-    {
-      Point<dim, Number> p(1.);
-      check[2] = p.norm_square();
-    }
-  if (dim == 2)
-    {
-      Point<dim, Number> p(.6, .8);
-      check[2] = p.norm_square();
-    }
-  if (dim == 3)
-    {
-      Point<dim, Number> p(.48, .64, .6);
-      check[2] = p.norm_square();
-    }
-
-  auto p_3 = Point<dim, Number>::unit_vector(0);
-  check[3] = p_3.norm_square();
-
-  auto entry_1 = p_1(0);
-  check[4]     = entry_1;
-  p_1(0)       = Number{1.};
-  check[5]     = p_1.norm_square();
-  auto p_4     = p_1 + Tensor<1, dim, Number>{};
-  check[6]     = p_4.norm_square();
-  auto p_5     = p_1 - Tensor<1, dim, Number>{};
-  check[7]     = p_5.norm_square();
-  auto t_1     = p_1 - p_2;
-  check[8]     = t_1.norm_square();
-  auto p_6     = -p_3;
-  check[9]     = p_6.norm_square();
-  auto p_7     = p_4 / 2.;
-  check[10]    = p_7.norm_square();
-  auto p_8     = p_7 * 5.;
-  check[11]    = p_8.norm_square();
-
-  auto s_1  = p_1 * t_1;
-  check[12] = s_1;
-  auto s_2  = p_2.square();
-  check[13] = s_2;
-  auto s_3  = p_3.distance(p_5);
-  check[14] = s_3;
-  auto s_4  = p_4.distance_square(p_1);
-  check[15] = s_4;
-}
-
 template <int dim, typename Number>
 void
 test_gpu()
 {
-  Number *           check;
   const unsigned int n_tests = 16;
 
-  auto cuda_error = cudaMalloc(&check, n_tests * sizeof(Number));
-  AssertCuda(cuda_error);
-
-  // Miscellaneous
-  miscellaneous_kernel<dim, Number><<<1, 1>>>(check);
-  // Check that the kernel was launched correctly
-  AssertCuda(cudaPeekAtLastError());
-  // Check that there was no problem during the execution of the kernel
-  AssertCuda(cudaDeviceSynchronize());
-
-  std::vector<Number> check_host(n_tests);
-
-  cuda_error = cudaMemcpy(check_host.data(),
-                          check,
-                          n_tests * sizeof(Number),
-                          cudaMemcpyDeviceToHost);
-  AssertCuda(cuda_error);
+  Kokkos::View<Number *, MemorySpace::Default::kokkos_space> check("check",
+                                                                   n_tests);
+
+  using ExecutionSpace = MemorySpace::Default::kokkos_space::execution_space;
+  ExecutionSpace exec;
+  Kokkos::parallel_for(
+    Kokkos::RangePolicy<ExecutionSpace>(exec, 0, 1), KOKKOS_LAMBDA(int) {
+      Point<dim, Number> p_1;
+      check[0] = p_1.norm_square();
+      Point<dim, Number> p_2(Tensor<1, dim, Number>{});
+      check[1] = p_2.norm_square();
+      if (dim == 1)
+        {
+          Point<dim, Number> p(1.);
+          check[2] = p.norm_square();
+        }
+      if (dim == 2)
+        {
+          Point<dim, Number> p(.6, .8);
+          check[2] = p.norm_square();
+        }
+      if (dim == 3)
+        {
+          Point<dim, Number> p(.48, .64, .6);
+          check[2] = p.norm_square();
+        }
+
+      auto p_3 = Point<dim, Number>::unit_vector(0);
+      check[3] = p_3.norm_square();
+
+      auto entry_1 = p_1(0);
+      check[4]     = entry_1;
+      p_1(0)       = Number{1.};
+      check[5]     = p_1.norm_square();
+      auto p_4     = p_1 + Tensor<1, dim, Number>{};
+      check[6]     = p_4.norm_square();
+      auto p_5     = p_1 - Tensor<1, dim, Number>{};
+      check[7]     = p_5.norm_square();
+      auto t_1     = p_1 - p_2;
+      check[8]     = t_1.norm_square();
+      auto p_6     = -p_3;
+      check[9]     = p_6.norm_square();
+      auto p_7     = p_4 / 2.;
+      check[10]    = p_7.norm_square();
+      auto p_8     = p_7 * 5.;
+      check[11]    = p_8.norm_square();
+
+      auto s_1  = p_1 * t_1;
+      check[12] = s_1;
+      auto s_2  = p_2.square();
+      check[13] = s_2;
+      auto s_3  = p_3.distance(p_5);
+      check[14] = s_3;
+      auto s_4  = p_4.distance_square(p_1);
+      check[15] = s_4;
+    });
+
+  auto check_host =
+    Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, check);
 
   const double tolerance = 1.e-8;
   AssertThrow(std::abs(check_host[0] - 0.) < tolerance, ExcInternalError());
@@ -116,9 +103,6 @@ test_gpu()
   AssertThrow(std::abs(check_host[14] - 0.) < tolerance, ExcInternalError());
   AssertThrow(std::abs(check_host[15] - 0.) < tolerance, ExcInternalError());
 
-  cuda_error = cudaFree(check);
-  AssertCuda(cuda_error);
-
   deallog << "OK" << std::endl;
 }
 
@@ -127,12 +111,14 @@ main()
 {
   initlog();
 
-  init_cuda();
+  Kokkos::initialize();
 
   test_gpu<1, double>();
   test_gpu<2, double>();
   test_gpu<3, double>();
   test_gpu<1, float>();
   test_gpu<2, float>();
   test_gpu<3, float>();
+
+  Kokkos::finalize();
 }
diff --git a/tests/cuda/cuda_point.output → tests/base/kokkos_point.output b/tests/cuda/cuda_point.output → tests/base/kokkos_point.output