grid: Fix wrong results on Volta due to thread divergence
oschuett committed Mar 20, 2021
1 parent 79d1c86 commit 814da85
Showing 3 changed files with 33 additions and 11 deletions.
2 changes: 1 addition & 1 deletion src/grid/Makefile
@@ -6,7 +6,7 @@ clean:
rm -fv *.o */*.o *.x

CFLAGS := -fopenmp -g -O3 -march=native -Wall -Wextra
-NVFLAGS := -g -O3 -lineinfo -arch sm_50 -Wno-deprecated-gpu-targets -Xcompiler "$(CFLAGS)" -D__GRID_CUDA
+NVFLAGS := -g -O3 -lineinfo -arch sm_70 -Wno-deprecated-gpu-targets -Xcompiler "$(CFLAGS)" -D__GRID_CUDA
LIBS := -lm -lblas

ALL_HEADERS := $(shell find . -name "*.h")
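Bumping -arch from sm_50 to sm_70 makes nvcc target compute capability 7.0 (Volta), which sets __CUDA_ARCH__ to 700 during device compilation and thereby enables the labeled_partition branch added in grid_gpu_collint.h below. A minimal sketch of how the flag selects that branch (file and kernel names here are illustrative, not part of the commit):

// arch_probe.cu -- compile with: nvcc -arch sm_70 arch_probe.cu -o arch_probe
#include <cstdio>

__global__ void arch_probe_kernel(int *arch) {
#if __CUDA_ARCH__ >= 700
  *arch = 700; // Volta+ path: cooperative-groups labeled_partition available
#else
  *arch = 0;   // pre-Volta path
#endif
}

int main() {
  int *d_arch = nullptr;
  int h_arch = -1;
  cudaMalloc(&d_arch, sizeof(int));
  arch_probe_kernel<<<1, 1>>>(d_arch);
  cudaMemcpy(&h_arch, d_arch, sizeof(int), cudaMemcpyDeviceToHost);
  printf("device code path: %d\n", h_arch);
  cudaFree(d_arch);
  return 0;
}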
33 changes: 27 additions & 6 deletions src/grid/gpu/grid_gpu_collint.h
@@ -67,9 +67,18 @@ __device__ static void atomicAddDouble(double *address, double val) {
* \brief Sums first within a warp and then issues a single atomicAdd per warp.
* \author Ole Schuett
******************************************************************************/
-__device__ static inline void coalescedAtomicAdd(double *address, double val) {
+__device__ static inline void coalescedAtomicAdd(double *base_address, int idx,
+                                                 double val) {

+#if __CUDA_ARCH__ >= 700
+  // The labeled_partition should not be needed because of the earlier
+  // __syncwarp, but it has no noticeable performance impact, so we play it safe.
+  const cg::coalesced_group active =
+      cg::labeled_partition(cg::coalesced_threads(), idx);
+  // assert(active.meta_group_size()==1); // too costly for production
+#else
+  const cg::coalesced_group active = cg::coalesced_threads();
+#endif

#if (CUDA_VERSION >= 11000)
  // Reduce from the CUDA 11+ library is around 30% faster than the solution below.
@@ -91,9 +100,9 @@ __device__ static inline void coalescedAtomicAdd(double *address, double val) {
const double sum = sum1 + sum2;
#endif

-  // A single atomic add to avoid shared memory bank conflicts.
+  // Use only a single atomic add to avoid collisions.
  if (active.thread_rank() == 0) {
-    atomicAddDouble(address, sum);
+    atomicAddDouble(&base_address[idx], sum);
}
}
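The function above implements warp-aggregated atomics: the active threads that target the same element first reduce their contributions within the warp, and only one thread per group issues the atomic. For reference, a standalone sketch of the same pattern, assuming CUDA 11+ and compute capability 7.0+ (warp_aggregated_add is an illustrative name, not the committed code):

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;

__device__ void warp_aggregated_add(double *base_address, int idx, double val) {
  // Group the currently active threads by their target index, so a group
  // never mixes contributions destined for different addresses.
  const cg::coalesced_group active =
      cg::labeled_partition(cg::coalesced_threads(), idx);
  // Sum val across the group (cg::reduce requires CUDA 11+).
  const double sum = cg::reduce(active, val, cg::plus<double>());
  // One atomic per group instead of one per thread.
  if (active.thread_rank() == 0) {
    atomicAdd(&base_address[idx], sum); // native double atomicAdd on sm_60+
  }
}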

@@ -204,7 +213,7 @@ __device__ static void gridpoint_to_cxyz(const double dx, const double dy,
if (cxyz_index < GRID_N_CXYZ_LOCAL_MEM) {
cxyz_local_mem[cxyz_index] += val;
} else {
-      coalescedAtomicAdd(&cxyz[cxyz_index], val);
+      coalescedAtomicAdd(cxyz, cxyz_index, val);
}
pow_dx *= dx; // pow_dx = pow(dx, lxp)
}
@@ -411,6 +420,12 @@ ortho_cxyz_to_grid(const kernel_params *params, const smem_task *task,
const double jremain = kremain - jr * jr;
const int imin =
ceil(-1e-8 - sqrt(fmax(0.0, jremain)) * params->dh_inv[0][0]);

+#if (!GRID_DO_COLLOCATE)
+      // Sync threads within warp to have a single group in coalescedAtomicAdd.
+      __syncwarp(); // assumes blockDim.x==32
+#endif

for (int i = threadIdx.x + imin; i <= 1 - imin; i += blockDim.x) {
const int ia = cubecenter[0] + i - params->shift_local[0];
const int ig =
@@ -510,6 +525,12 @@ general_cxyz_to_grid(const kernel_params *params, const smem_task *task,
if (jg < bounds_j[0] || bounds_j[1] < jg) {
continue;
}

+#if (!GRID_DO_COLLOCATE)
+      // Sync threads within warp to have a single group in coalescedAtomicAdd.
+      __syncwarp(); // assumes blockDim.x==32
+#endif

const int i_start = threadIdx.x + index_min[0];
for (int i = i_start; i <= index_max[0]; i += blockDim.x) {
const int ig =
@@ -585,7 +606,7 @@ __device__ static void cxyz_to_grid(const kernel_params *params,
#if (!GRID_DO_COLLOCATE)
// integrate
for (int i = 0; i < GRID_N_CXYZ_LOCAL_MEM; i++) {
-    coalescedAtomicAdd(&cxyz[i], cxyz_local_mem[i]);
+    atomicAddDouble(&cxyz[i], cxyz_local_mem[i]);
}
__syncthreads(); // because of concurrent writes to cxyz
#endif
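For context on why these changes are needed: before Volta, the threads of a warp executed in lockstep, so every thread reaching coalescedAtomicAdd in a given iteration targeted the same element, and cg::coalesced_threads() always formed a single consistent group. With Volta's independent thread scheduling (sm_70), threads of one warp can diverge, for example at the bounds checks above, and reach coalescedAtomicAdd at different loop iterations; a plain coalesced group may then mix contributions meant for different cxyz elements and credit their whole sum to a single address, which is the wrong result this commit fixes. The __syncwarp() calls reconverge the warp before the inner loop, and labeled_partition additionally groups threads by index as a safety net. A sketch of the failure mode in the old interface (illustrative only, not the committed code):

// Old-style interface: the caller passes the final address.
__device__ void unsafe_coalesced_add(double *address, double val) {
  // If the warp has diverged, this group can contain threads whose
  // "address" arguments differ ...
  const cg::coalesced_group g = cg::coalesced_threads();
  const double sum = cg::reduce(g, val, cg::plus<double>());
  if (g.thread_rank() == 0) {
    // ... yet the combined sum is written to rank 0's address only.
    atomicAddDouble(address, sum);
  }
}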
9 changes: 5 additions & 4 deletions src/grid/gpu/grid_gpu_integrate.cu
@@ -87,11 +87,11 @@ __device__ static void store_forces_and_virial(const kernel_params *params,
for (int k = 0; k < 3; k++) {
const double force_a = get_force_a(a, b, k, task->zeta, task->zetb,
task->n1, cab, COMPUTE_TAU);
-      coalescedAtomicAdd(&task->forces_a[k], force_a * pabval);
+      atomicAddDouble(&task->forces_a[k], force_a * pabval);
const double force_b =
get_force_b(a, b, k, task->zeta, task->zetb, task->rab,
task->n1, cab, COMPUTE_TAU);
-      coalescedAtomicAdd(&task->forces_b[k], force_b * pabval);
+      atomicAddDouble(&task->forces_b[k], force_b * pabval);
}
if (params->virial != NULL) {
for (int k = 0; k < 3; k++) {
@@ -103,7 +103,7 @@
get_virial_b(a, b, k, l, task->zeta, task->zetb, task->rab,
task->n1, cab, COMPUTE_TAU);
const double virial = pabval * (virial_a + virial_b);
-          coalescedAtomicAdd(&params->virial[k * 3 + l], virial);
+          atomicAddDouble(&params->virial[k * 3 + l], virial);
}
}
}
@@ -275,7 +275,8 @@ void grid_gpu_integrate_one_grid_level(

// Launch !
const int nblocks = ntasks;
-  const dim3 threads_per_block(4, 8, 8);
+  const dim3 threads_per_block(32, 2, 1);
+  assert(threads_per_block.x == 32); // needed for __syncwarp

if (!compute_tau && !calculate_forces) {
grid_integrate_density<<<nblocks, threads_per_block, smem_per_block,
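The new block shape matches the blockDim.x == 32 assumption noted at the __syncwarp() calls: CUDA forms warps from consecutive linear thread ids with x running fastest, so only with blockDim.x == 32 does one warp correspond exactly to one x-row of the block, i.e. the set of threads that iterate over i together and must reconverge before coalescedAtomicAdd. With the old (4, 8, 8) shape a warp spanned several y/z rows and that assumption could not hold. A small sketch of the relationship (illustrative only, not part of the commit):

__device__ int lane_id_from_block_layout() {
  // Warps are built from consecutive linear thread ids, x running fastest.
  const int linear_tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  const int lane = linear_tid % 32; // position within the warp
  // With blockDim = (32, 2, 1): lane == threadIdx.x, so __syncwarp()
  // synchronizes exactly the threads that share threadIdx.y and the i-loop.
  // With blockDim = (4, 8, 8): one warp covers 8 different threadIdx.y values.
  return lane;
}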
