grid: Fix wrong results on Volta due to thread divergence
oschuett committed Mar 20, 2021
1 parent 79d1c86 commit 814da85
Showing 3 changed files with 33 additions and 11 deletions.
2 changes: 1 addition & 1 deletion src/grid/Makefile
@@ -6,7 +6,7 @@ clean:
rm -fv *.o */*.o *.x

CFLAGS := -fopenmp -g -O3 -march=native -Wall -Wextra
-NVFLAGS := -g -O3 -lineinfo -arch sm_50 -Wno-deprecated-gpu-targets -Xcompiler "$(CFLAGS)" -D__GRID_CUDA
+NVFLAGS := -g -O3 -lineinfo -arch sm_70 -Wno-deprecated-gpu-targets -Xcompiler "$(CFLAGS)" -D__GRID_CUDA
LIBS := -lm -lblas

ALL_HEADERS := $(shell find . -name "*.h")
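Bumping -arch from sm_50 to sm_70 makes nvcc target compute capability 7.0 (Volta), which sets __CUDA_ARCH__ to 700 during device compilation and thereby enables the labeled_partition branch added in grid_gpu_collint.h below. A minimal sketch of how the flag selects that branch (file and kernel names here are illustrative, not part of the commit):

// arch_probe.cu -- compile with: nvcc -arch sm_70 arch_probe.cu -o arch_probe
#include <cstdio>

__global__ void arch_probe_kernel(int *arch) {
#if __CUDA_ARCH__ >= 700
  *arch = 700; // Volta+ path: cooperative-groups labeled_partition available
#else
  *arch = 0;   // pre-Volta path
#endif
}

int main() {
  int *d_arch = nullptr;
  int h_arch = -1;
  cudaMalloc(&d_arch, sizeof(int));
  arch_probe_kernel<<<1, 1>>>(d_arch);
  cudaMemcpy(&h_arch, d_arch, sizeof(int), cudaMemcpyDeviceToHost);
  printf("device code path: %d\n", h_arch);
  cudaFree(d_arch);
  return 0;
}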
33 changes: 27 additions & 6 deletions src/grid/gpu/grid_gpu_collint.h
@@ -67,9 +67,18 @@ __device__ static void atomicAddDouble(double *address, double val) {
* \brief Sums first within a warp and then issues a single atomicAdd per warp.
* \author Ole Schuett
******************************************************************************/
-__device__ static inline void coalescedAtomicAdd(double *address, double val) {
+__device__ static inline void coalescedAtomicAdd(double *base_address, int idx,
+                                                 double val) {

+#if __CUDA_ARCH__ >= 700
+  // The labeled_partition should not be needed because of the earlier
+  // __syncwarp, but it has no noticeable performance impact, so we play it safe.
+  const cg::coalesced_group active =
+      cg::labeled_partition(cg::coalesced_threads(), idx);
+  // assert(active.meta_group_size()==1); // too costly for production
+#else
+  const cg::coalesced_group active = cg::coalesced_threads();
+#endif

#if (CUDA_VERSION >= 11000)
  // Reduce from the CUDA 11+ library is around 30% faster than the solution below.
@@ -91,9 +100,9 @@ __device__ static inline void coalescedAtomicAdd(double *address, double val) {
const double sum = sum1 + sum2;
#endif

-  // A single atomic add to avoid shared memory bank conflicts.
+  // Use only a single atomic add to avoid collisions.
  if (active.thread_rank() == 0) {
-    atomicAddDouble(address, sum);
+    atomicAddDouble(&base_address[idx], sum);
}
}
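The function above implements warp-aggregated atomics: the active threads that target the same element first reduce their contributions within the warp, and only one thread per group issues the atomic. For reference, a standalone sketch of the same pattern, assuming CUDA 11+ and compute capability 7.0+ (warp_aggregated_add is an illustrative name, not the committed code):

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;

__device__ void warp_aggregated_add(double *base_address, int idx, double val) {
  // Group the currently active threads by their target index, so a group
  // never mixes contributions destined for different addresses.
  const cg::coalesced_group active =
      cg::labeled_partition(cg::coalesced_threads(), idx);
  // Sum val across the group (cg::reduce requires CUDA 11+).
  const double sum = cg::reduce(active, val, cg::plus<double>());
  // One atomic per group instead of one per thread.
  if (active.thread_rank() == 0) {
    atomicAdd(&base_address[idx], sum); // native double atomicAdd on sm_60+
  }
}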

@@ -204,7 +213,7 @@ __device__ static void gridpoint_to_cxyz(const double dx, const double dy,
if (cxyz_index < GRID_N_CXYZ_LOCAL_MEM) {
cxyz_local_mem[cxyz_index] += val;
} else {
-      coalescedAtomicAdd(&cxyz[cxyz_index], val);
+      coalescedAtomicAdd(cxyz, cxyz_index, val);
}
pow_dx *= dx; // pow_dx = pow(dx, lxp)
}
@@ -411,6 +420,12 @@ ortho_cxyz_to_grid(const kernel_params *params, const smem_task *task,
const double jremain = kremain - jr * jr;
const int imin =
ceil(-1e-8 - sqrt(fmax(0.0, jremain)) * params->dh_inv[0][0]);

+#if (!GRID_DO_COLLOCATE)
+      // Sync threads within warp to have a single group in coalescedAtomicAdd.
+      __syncwarp(); // assumes blockDim.x==32
+#endif

for (int i = threadIdx.x + imin; i <= 1 - imin; i += blockDim.x) {
const int ia = cubecenter[0] + i - params->shift_local[0];
const int ig =
@@ -510,6 +525,12 @@ general_cxyz_to_grid(const kernel_params *params, const smem_task *task,
if (jg < bounds_j[0] || bounds_j[1] < jg) {
continue;
}

+#if (!GRID_DO_COLLOCATE)
+      // Sync threads within warp to have a single group in coalescedAtomicAdd.
+      __syncwarp(); // assumes blockDim.x==32
+#endif

const int i_start = threadIdx.x + index_min[0];
for (int i = i_start; i <= index_max[0]; i += blockDim.x) {
const int ig =
@@ -585,7 +606,7 @@ __device__ static void cxyz_to_grid(const kernel_params *params,
#if (!GRID_DO_COLLOCATE)
// integrate
for (int i = 0; i < GRID_N_CXYZ_LOCAL_MEM; i++) {
-    coalescedAtomicAdd(&cxyz[i], cxyz_local_mem[i]);
+    atomicAddDouble(&cxyz[i], cxyz_local_mem[i]);
}
__syncthreads(); // because of concurrent writes to cxyz
#endif
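For context on why these changes are needed: before Volta, the threads of a warp executed in lockstep, so every thread reaching coalescedAtomicAdd in a given iteration targeted the same element, and cg::coalesced_threads() always formed a single consistent group. With Volta's independent thread scheduling (sm_70), threads of one warp can diverge, for example at the bounds checks above, and reach coalescedAtomicAdd at different loop iterations; a plain coalesced group may then mix contributions meant for different cxyz elements and credit their whole sum to a single address, which is the wrong result this commit fixes. The __syncwarp() calls reconverge the warp before the inner loop, and labeled_partition additionally groups threads by index as a safety net. A sketch of the failure mode in the old interface (illustrative only, not the committed code):

// Old-style interface: the caller passes the final address.
__device__ void unsafe_coalesced_add(double *address, double val) {
  // If the warp has diverged, this group can contain threads whose
  // "address" arguments differ ...
  const cg::coalesced_group g = cg::coalesced_threads();
  const double sum = cg::reduce(g, val, cg::plus<double>());
  if (g.thread_rank() == 0) {
    // ... yet the combined sum is written to rank 0's address only.
    atomicAddDouble(address, sum);
  }
}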
9 changes: 5 additions & 4 deletions src/grid/gpu/grid_gpu_integrate.cu
@@ -87,11 +87,11 @@ __device__ static void store_forces_and_virial(const kernel_params *params,
for (int k = 0; k < 3; k++) {
const double force_a = get_force_a(a, b, k, task->zeta, task->zetb,
task->n1, cab, COMPUTE_TAU);
-      coalescedAtomicAdd(&task->forces_a[k], force_a * pabval);
+      atomicAddDouble(&task->forces_a[k], force_a * pabval);
const double force_b =
get_force_b(a, b, k, task->zeta, task->zetb, task->rab,
task->n1, cab, COMPUTE_TAU);
-      coalescedAtomicAdd(&task->forces_b[k], force_b * pabval);
+      atomicAddDouble(&task->forces_b[k], force_b * pabval);
}
if (params->virial != NULL) {
for (int k = 0; k < 3; k++) {
@@ -103,7 +103,7 @@
get_virial_b(a, b, k, l, task->zeta, task->zetb, task->rab,
task->n1, cab, COMPUTE_TAU);
const double virial = pabval * (virial_a + virial_b);
-          coalescedAtomicAdd(&params->virial[k * 3 + l], virial);
+          atomicAddDouble(&params->virial[k * 3 + l], virial);
}
}
}
@@ -275,7 +275,8 @@ void grid_gpu_integrate_one_grid_level(

// Launch !
const int nblocks = ntasks;
-  const dim3 threads_per_block(4, 8, 8);
+  const dim3 threads_per_block(32, 2, 1);
+  assert(threads_per_block.x == 32); // needed for __syncwarp

if (!compute_tau && !calculate_forces) {
grid_integrate_density<<<nblocks, threads_per_block, smem_per_block,
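The new block shape matches the blockDim.x == 32 assumption noted at the __syncwarp() calls: CUDA forms warps from consecutive linear thread ids with x running fastest, so only with blockDim.x == 32 does one warp correspond exactly to one x-row of the block, i.e. the set of threads that iterate over i together and must reconverge before coalescedAtomicAdd. With the old (4, 8, 8) shape a warp spanned several y/z rows and that assumption could not hold. A small sketch of the relationship (illustrative only, not part of the commit):

__device__ int lane_id_from_block_layout() {
  // Warps are built from consecutive linear thread ids, x running fastest.
  const int linear_tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  const int lane = linear_tid % 32; // position within the warp
  // With blockDim = (32, 2, 1): lane == threadIdx.x, so __syncwarp()
  // synchronizes exactly the threads that share threadIdx.y and the i-loop.
  // With blockDim = (4, 8, 8): one warp covers 8 different threadIdx.y values.
  return lane;
}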
