Merge pull request #14621 from masterleinad/remove_compiler_cuda_aware

dealii · Dec 30, 2022 · e20814c · e20814c
2 parents 25d263f + 3604e2f
commit e20814c
Show file tree

Hide file tree

Showing 20 changed files with 48 additions and 45 deletions.
diff --git a/doc/doxygen/options.dox.in b/doc/doxygen/options.dox.in
@@ -200,7 +200,6 @@ PREDEFINED             = DOXYGEN=1 \
                          DEAL_II_WITH_TASKFLOW=1 \
                          DEAL_II_WITH_COMPLEX_VALUES=1 \
                          DEAL_II_WITH_CUDA=1 \
-                         DEAL_II_COMPILER_CUDA_AWARE=1 \
                          DEAL_II_WITH_GINKGO=1 \
                          DEAL_II_WITH_GMSH=1 \
                          DEAL_II_GMSH_WITH_API=1 \

diff --git a/include/deal.II/base/config.h.in b/include/deal.II/base/config.h.in
@@ -103,14 +103,6 @@
 #cmakedefine DEAL_II_RESTRICT @DEAL_II_RESTRICT@
 #cmakedefine DEAL_II_COMPILER_HAS_DIAGNOSTIC_PRAGMA
 
-/*
- * A variable to tell if the compiler used in the current compilation process
- * understands CUDA code.
- */
-#if defined(DEAL_II_WITH_CUDA) && defined(__CUDACC__)
-#  define DEAL_II_COMPILER_CUDA_AWARE
-#endif
-
 /***********************************************************************
  * CPU features:
  *

diff --git a/include/deal.II/base/cuda.h b/include/deal.II/base/cuda.h
@@ -21,7 +21,7 @@
 #include <deal.II/base/array_view.h>
 #include <deal.II/base/exceptions.h>
 
-#ifdef DEAL_II_COMPILER_CUDA_AWARE
+#ifdef DEAL_II_WITH_CUDA
 #  include <cusolverDn.h>
 #  include <cusolverSp.h>
 #  include <cusparse.h>

diff --git a/include/deal.II/base/numbers.h b/include/deal.II/base/numbers.h
@@ -21,7 +21,7 @@
 
 #include <deal.II/base/types.h>
 
-#ifdef DEAL_II_COMPILER_CUDA_AWARE
+#ifdef DEAL_II_WITH_CUDA
 #  include <cuComplex.h>
 #endif
 
@@ -809,7 +809,7 @@ namespace internal
     }
   };
 
-#ifdef DEAL_II_COMPILER_CUDA_AWARE
+#ifdef DEAL_II_WITH_CUDA
   template <>
   struct NumberType<cuComplex>
   {

diff --git a/include/deal.II/base/partitioner.templates.h b/include/deal.II/base/partitioner.templates.h
@@ -107,8 +107,7 @@ namespace Utilities
         }
 
       Number *temp_array_ptr = temporary_storage.data();
-#    if defined(DEAL_II_COMPILER_CUDA_AWARE) && \
-      defined(DEAL_II_MPI_WITH_CUDA_SUPPORT)
+#    if defined(DEAL_II_WITH_CUDA) && defined(DEAL_II_MPI_WITH_CUDA_SUPPORT)
       // When using CUDA-aware MPI, the set of local indices that are ghost
       // indices on other processors is expanded in arrays. This is for
       // performance reasons as this can significantly decrease the number of
@@ -121,8 +120,7 @@ namespace Utilities
 
       for (unsigned int i = 0; i < n_import_targets; ++i)
         {
-#    if defined(DEAL_II_COMPILER_CUDA_AWARE) && \
-      defined(DEAL_II_MPI_WITH_CUDA_SUPPORT)
+#    if defined(DEAL_II_WITH_CUDA) && defined(DEAL_II_MPI_WITH_CUDA_SUPPORT)
           if (std::is_same<MemorySpaceType, MemorySpace::CUDA>::value)
             {
               const auto chunk_size = import_indices_plain_dev[i].second;
@@ -228,7 +226,7 @@ namespace Utilities
                     }
                   else
                     {
-#    ifdef DEAL_II_COMPILER_CUDA_AWARE
+#    ifdef DEAL_II_WITH_CUDA
                       cudaError_t cuda_error =
                         cudaMemcpy(ghost_array.data() + ghost_range.first,
                                    ghost_array.data() + offset,
@@ -378,7 +376,7 @@ namespace Utilities
                         }
                       else
                         {
-#    ifdef DEAL_II_COMPILER_CUDA_AWARE
+#    ifdef DEAL_II_WITH_CUDA
                           cudaError_t cuda_error =
                             cudaMemcpy(ghost_array_ptr + offset,
                                        ghost_array.data() + my_ghosts->first,
@@ -414,8 +412,7 @@ namespace Utilities
             ExcMessage("Index overflow: Maximum message size in MPI is 2GB. "
                        "The number of ghost entries times the size of 'Number' "
                        "exceeds this value. This is not supported."));
-#    if defined(DEAL_II_COMPILER_CUDA_AWARE) && \
-      defined(DEAL_II_MPI_WITH_CUDA_SUPPORT)
+#    if defined(DEAL_II_WITH_CUDA) && defined(DEAL_II_MPI_WITH_CUDA_SUPPORT)
           if (std::is_same<MemorySpaceType, MemorySpace::CUDA>::value)
             cudaDeviceSynchronize();
 #    endif
@@ -526,7 +523,7 @@ namespace Utilities
                    "import_from_ghosted_array_start as is passed "
                    "to import_from_ghosted_array_finish."));
 
-#      ifdef DEAL_II_COMPILER_CUDA_AWARE
+#      ifdef DEAL_II_WITH_CUDA
           if (std::is_same<MemorySpaceType, MemorySpace::CUDA>::value)
             {
               cudaMemset(ghost_array.data(),
@@ -560,8 +557,7 @@ namespace Utilities
       const unsigned int n_import_targets = import_targets_data.size();
       const unsigned int n_ghost_targets  = ghost_targets_data.size();
 
-#    if (defined(DEAL_II_COMPILER_CUDA_AWARE) && \
-         defined(DEAL_II_MPI_WITH_CUDA_SUPPORT))
+#    if (defined(DEAL_II_WITH_CUDA) && defined(DEAL_II_MPI_WITH_CUDA_SUPPORT))
       // When using CUDA-aware MPI, the set of local indices that are ghost
       // indices on other processors is expanded in arrays. This is for
       // performance reasons as this can significantly decrease the number of
@@ -583,8 +579,7 @@ namespace Utilities
           AssertThrowMPI(ierr);
 
           const Number *read_position = temporary_storage.data();
-#    if !(defined(DEAL_II_COMPILER_CUDA_AWARE) && \
-          defined(DEAL_II_MPI_WITH_CUDA_SUPPORT))
+#    if !(defined(DEAL_II_WITH_CUDA) && defined(DEAL_II_MPI_WITH_CUDA_SUPPORT))
           // If the operation is no insertion, add the imported data to the
           // local values. For insert, nothing is done here (but in debug mode
           // we assert that the specified value is either zero or matches with
@@ -730,8 +725,7 @@ namespace Utilities
         {
           Assert(ghost_array.begin() != nullptr, ExcInternalError());
 
-#    if defined(DEAL_II_COMPILER_CUDA_AWARE) && \
-      defined(DEAL_II_MPI_WITH_CUDA_SUPPORT)
+#    if defined(DEAL_II_WITH_CUDA) && defined(DEAL_II_MPI_WITH_CUDA_SUPPORT)
           if (std::is_same<MemorySpaceType, MemorySpace::CUDA>::value)
             {
               Assert(std::is_trivial<Number>::value, ExcNotImplemented());

diff --git a/include/deal.II/base/tensor.h b/include/deal.II/base/tensor.h
@@ -1432,8 +1432,12 @@ constexpr DEAL_II_ALWAYS_INLINE
   DEAL_II_HOST_DEVICE const typename Tensor<rank_, dim, Number>::value_type &
   Tensor<rank_, dim, Number>::operator[](const unsigned int i) const
 {
-#  ifndef DEAL_II_COMPILER_CUDA_AWARE
+#  if KOKKOS_VERSION < 30700
+#    ifdef KOKKOS_ACTIVE_MEMORY_SPACE_HOST
   AssertIndexRange(i, dim);
+#    endif
+#  else
+  KOKKOS_IF_ON_HOST((AssertIndexRange(i, dim);))
 #  endif
 
   return values[i];
@@ -1444,9 +1448,16 @@ template <int rank_, int dim, typename Number>
 constexpr inline DEAL_II_ALWAYS_INLINE const Number &
 Tensor<rank_, dim, Number>::operator[](const TableIndices<rank_> &indices) const
 {
-#  ifndef DEAL_II_COMPILER_CUDA_AWARE
+#  if KOKKOS_VERSION < 30700
+#    ifdef KOKKOS_ACTIVE_MEMORY_SPACE_HOST
   Assert(dim != 0,
          ExcMessage("Cannot access an object of type Tensor<rank_,0,Number>"));
+#    endif
+#  else
+  KOKKOS_IF_ON_HOST(
+    (Assert(dim != 0,
+            ExcMessage(
+              "Cannot access an object of type Tensor<rank_,0,Number>"));))
 #  endif
 
   return TensorAccessors::extract<rank_>(*this, indices);
@@ -1458,9 +1469,16 @@ template <int rank_, int dim, typename Number>
 constexpr inline DEAL_II_ALWAYS_INLINE Number &
 Tensor<rank_, dim, Number>::operator[](const TableIndices<rank_> &indices)
 {
-#  ifndef DEAL_II_COMPILER_CUDA_AWARE
+#  if KOKKOS_VERSION < 30700
+#    ifdef KOKKOS_ACTIVE_MEMORY_SPACE_HOST
   Assert(dim != 0,
          ExcMessage("Cannot access an object of type Tensor<rank_,0,Number>"));
+#    endif
+#  else
+  KOKKOS_IF_ON_HOST(
+    (Assert(dim != 0,
+            ExcMessage(
+              "Cannot access an object of type Tensor<rank_,0,Number>"));))
 #  endif
 
   return TensorAccessors::extract<rank_>(*this, indices);

diff --git a/include/deal.II/lac/affine_constraints.templates.h b/include/deal.II/lac/affine_constraints.templates.h
@@ -2256,7 +2256,7 @@ namespace internal
       vec.zero_out_ghost_values();
     }
 
-#ifdef DEAL_II_COMPILER_CUDA_AWARE
+#ifdef DEAL_II_WITH_CUDA
     template <typename Number>
     __global__ void
     set_zero_kernel(const size_type *  constrained_dofs,

diff --git a/include/deal.II/lac/cuda_atomic.h b/include/deal.II/lac/cuda_atomic.h
@@ -18,7 +18,7 @@
 
 #include <deal.II/base/config.h>
 
-#ifdef DEAL_II_COMPILER_CUDA_AWARE
+#ifdef DEAL_II_WITH_CUDA
 
 DEAL_II_NAMESPACE_OPEN
 

diff --git a/include/deal.II/lac/cuda_kernels.h b/include/deal.II/lac/cuda_kernels.h
@@ -18,7 +18,7 @@
 
 #include <deal.II/base/config.h>
 
-#ifdef DEAL_II_COMPILER_CUDA_AWARE
+#ifdef DEAL_II_WITH_CUDA
 
 
 #  include <deal.II/base/cuda_size.h>

diff --git a/include/deal.II/lac/cuda_kernels.templates.h b/include/deal.II/lac/cuda_kernels.templates.h
@@ -20,7 +20,7 @@
 
 #include <deal.II/lac/cuda_kernels.h>
 
-#ifdef DEAL_II_COMPILER_CUDA_AWARE
+#ifdef DEAL_II_WITH_CUDA
 
 DEAL_II_NAMESPACE_OPEN
 

diff --git a/include/deal.II/lac/cuda_precondition.h b/include/deal.II/lac/cuda_precondition.h
@@ -23,7 +23,7 @@
 
 #include <memory>
 
-#ifdef DEAL_II_COMPILER_CUDA_AWARE
+#ifdef DEAL_II_WITH_CUDA
 
 DEAL_II_NAMESPACE_OPEN
 

diff --git a/include/deal.II/lac/cuda_solver_direct.h b/include/deal.II/lac/cuda_solver_direct.h
@@ -18,7 +18,7 @@
 
 #include <deal.II/base/config.h>
 
-#ifdef DEAL_II_COMPILER_CUDA_AWARE
+#ifdef DEAL_II_WITH_CUDA
 #  include <deal.II/base/cuda.h>
 
 #  include <deal.II/lac/cuda_sparse_matrix.h>

diff --git a/include/deal.II/lac/cuda_sparse_matrix.h b/include/deal.II/lac/cuda_sparse_matrix.h
@@ -22,7 +22,7 @@
 
 #include <iomanip>
 
-#ifdef DEAL_II_COMPILER_CUDA_AWARE
+#ifdef DEAL_II_WITH_CUDA
 #  include <deal.II/base/cuda.h>
 
 #  include <deal.II/lac/cuda_vector.h>

diff --git a/include/deal.II/lac/precondition.h b/include/deal.II/lac/precondition.h
@@ -3386,7 +3386,7 @@ namespace internal
     }
 
 
-#  ifdef DEAL_II_COMPILER_CUDA_AWARE
+#  ifdef DEAL_II_WITH_CUDA
     template <typename Number>
     __global__ void
     set_initial_guess_kernel(const types::global_dof_index offset,
@@ -3425,7 +3425,7 @@ namespace internal
       const Number mean_value = vector.mean_value();
       vector.add(-mean_value);
     }
-#  endif // DEAL_II_COMPILER_CUDA_AWARE
+#  endif // DEAL_II_WITH_CUDA
 
     struct EigenvalueTracker
     {

diff --git a/include/deal.II/matrix_free/cuda_fe_evaluation.h b/include/deal.II/matrix_free/cuda_fe_evaluation.h
@@ -18,7 +18,7 @@
 
 #include <deal.II/base/config.h>
 
-#ifdef DEAL_II_COMPILER_CUDA_AWARE
+#ifdef DEAL_II_WITH_CUDA
 
 #  include <deal.II/base/tensor.h>
 #  include <deal.II/base/utilities.h>

diff --git a/include/deal.II/matrix_free/cuda_hanging_nodes_internal.h b/include/deal.II/matrix_free/cuda_hanging_nodes_internal.h
@@ -18,7 +18,7 @@
 
 #include <deal.II/base/config.h>
 
-#ifdef DEAL_II_COMPILER_CUDA_AWARE
+#ifdef DEAL_II_WITH_CUDA
 
 #  include <deal.II/base/cuda_size.h>
 

diff --git a/include/deal.II/matrix_free/cuda_matrix_free.h b/include/deal.II/matrix_free/cuda_matrix_free.h
@@ -19,7 +19,7 @@
 
 #include <deal.II/base/config.h>
 
-#ifdef DEAL_II_COMPILER_CUDA_AWARE
+#ifdef DEAL_II_WITH_CUDA
 
 #  include <deal.II/base/cuda_size.h>
 #  include <deal.II/base/mpi_stub.h>

diff --git a/include/deal.II/matrix_free/cuda_matrix_free.templates.h b/include/deal.II/matrix_free/cuda_matrix_free.templates.h
@@ -21,7 +21,7 @@
 
 #include <deal.II/matrix_free/cuda_matrix_free.h>
 
-#ifdef DEAL_II_COMPILER_CUDA_AWARE
+#ifdef DEAL_II_WITH_CUDA
 
 #  include <deal.II/base/cuda.h>
 #  include <deal.II/base/cuda_size.h>

diff --git a/include/deal.II/matrix_free/cuda_tensor_product_kernels.h b/include/deal.II/matrix_free/cuda_tensor_product_kernels.h
@@ -23,7 +23,7 @@
 
 #include <deal.II/matrix_free/cuda_matrix_free.templates.h>
 
-#ifdef DEAL_II_COMPILER_CUDA_AWARE
+#ifdef DEAL_II_WITH_CUDA
 
 DEAL_II_NAMESPACE_OPEN
 

diff --git a/tests/tests.h b/tests/tests.h
@@ -618,7 +618,7 @@ struct MPILogInitAll
 };
 
 
-#ifdef DEAL_II_COMPILER_CUDA_AWARE
+#ifdef DEAL_II_WITH_CUDA
 // By default, all the ranks will try to access the device 0.
 // If we are running with MPI support it is better to address different graphic
 // cards for different processes even if only one node is used. The choice below