Skip to content

Commit

Permalink
grid: Reduce GPU shared memory by processing Cab piecewise
Browse files Browse the repository at this point in the history
  • Loading branch information
oschuett committed May 16, 2023
1 parent 17d3cc5 commit 98d2d7a
Show file tree
Hide file tree
Showing 7 changed files with 19,805 additions and 57 deletions.
10 changes: 5 additions & 5 deletions src/grid/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ Resource usage:
Common:
GLOBAL:328 CONSTANT[3]:18848
Function _Z24collocate_kernel_anyfunc13kernel_params:
REG:134 STACK:0 SHARED:416 LOCAL:0 CONSTANT[0]:600 CONSTANT[2]:312 TEXTURE:0 SURFACE:0 SAMPLER:0
REG:147 STACK:0 SHARED:416 LOCAL:0 CONSTANT[0]:600 CONSTANT[2]:320 TEXTURE:0 SURFACE:0 SAMPLER:0
Function _Z24collocate_kernel_density13kernel_params:
REG:71 STACK:0 SHARED:416 LOCAL:0 CONSTANT[0]:600 CONSTANT[2]:104 TEXTURE:0 SURFACE:0 SAMPLER:0

Expand Down Expand Up @@ -155,13 +155,13 @@ Resource usage:
Common:
GLOBAL:328 CONSTANT[3]:18848
Function _Z25grid_integrate_tau_forces13kernel_params:
REG:132 STACK:64 SHARED:432 LOCAL:0 CONSTANT[0]:616 CONSTANT[2]:208 TEXTURE:0 SURFACE:0 SAMPLER:0
REG:135 STACK:64 SHARED:432 LOCAL:0 CONSTANT[0]:616 CONSTANT[2]:208 TEXTURE:0 SURFACE:0 SAMPLER:0
Function _Z29grid_integrate_density_forces13kernel_params:
REG:128 STACK:16 SHARED:432 LOCAL:0 CONSTANT[0]:616 CONSTANT[2]:208 TEXTURE:0 SURFACE:0 SAMPLER:0
REG:130 STACK:16 SHARED:432 LOCAL:0 CONSTANT[0]:616 CONSTANT[2]:208 TEXTURE:0 SURFACE:0 SAMPLER:0
Function _Z18grid_integrate_tau13kernel_params:
REG:80 STACK:0 SHARED:432 LOCAL:0 CONSTANT[0]:616 CONSTANT[2]:216 TEXTURE:0 SURFACE:0 SAMPLER:0
REG:88 STACK:0 SHARED:432 LOCAL:0 CONSTANT[0]:616 CONSTANT[2]:208 TEXTURE:0 SURFACE:0 SAMPLER:0
Function _Z22grid_integrate_density13kernel_params:
REG:74 STACK:0 SHARED:432 LOCAL:0 CONSTANT[0]:616 CONSTANT[2]:208 TEXTURE:0 SURFACE:0 SAMPLER:0
REG:78 STACK:0 SHARED:432 LOCAL:0 CONSTANT[0]:616 CONSTANT[2]:208 TEXTURE:0 SURFACE:0 SAMPLER:0

Fatbin ptx code:
================
Expand Down
17 changes: 13 additions & 4 deletions src/grid/gpu/grid_gpu_collint.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ __device__ static void atomicAddDouble(double *address, double val) {
typedef struct {
double *data;
const int n1;
const int length;
const int mask[2];
} cab_store;

/*******************************************************************************
Expand All @@ -97,7 +97,12 @@ typedef struct {
******************************************************************************/
__device__ static inline double cab_get(const cab_store *cab, const orbital a,
const orbital b) {
return cab->data[idx(b) * cab->n1 + idx(a)];
const int i = idx(b) * cab->n1 + idx(a);
if (cab->mask[0] <= i && i < cab->mask[1]) {
return cab->data[i - cab->mask[0]];
} else {
return 0.0;
}
}

/*******************************************************************************
Expand All @@ -106,7 +111,10 @@ __device__ static inline double cab_get(const cab_store *cab, const orbital a,
******************************************************************************/
__device__ static inline void cab_add(cab_store *cab, const orbital a,
const orbital b, const double value) {
atomicAddDouble(&cab->data[idx(b) * cab->n1 + idx(a)], value);
const int i = idx(b) * cab->n1 + idx(a);
if (cab->mask[0] <= i && i < cab->mask[1]) {
atomicAddDouble(&cab->data[i - cab->mask[0]], value);
}
}

/*******************************************************************************
Expand Down Expand Up @@ -477,8 +485,9 @@ __device__ static void cab_to_cxyz(const smem_task *task, const double *alpha,
* \author Ole Schuett
******************************************************************************/
__device__ static void zero_cab(cab_store *cab) {
const int n = cab->mask[1] - cab->mask[0];
if (threadIdx.z == 0 && threadIdx.y == 0) {
for (int i = threadIdx.x; i < cab->length; i += blockDim.x) {
for (int i = threadIdx.x; i < n; i += blockDim.x) {
cab->data[i] = 0.0;
}
}
Expand Down
33 changes: 15 additions & 18 deletions src/grid/gpu/grid_gpu_collocate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -282,13 +282,17 @@ __device__ static void collocate_kernel(const kernel_params *params) {
double *smem_alpha = &shared_memory[params->smem_alpha_offset];
double *smem_cxyz = &shared_memory[params->smem_cxyz_offset];

// For large basis sets the Cab matrix does not fit into shared memory.
// Therefore, we're doing multiple passes while masking different parts.
compute_alpha(&task, smem_alpha);
for (int lb = 0; lb < task.n1 * task.n2; lb += params->smem_cab_length) {
const int ub = lb + params->smem_cab_length;
cab_store cab = {.data = smem_cab, .n1 = task.n1, .mask = {lb, ub}};
zero_cab(&cab);
block_to_cab<IS_FUNC_AB>(params, &task, &cab);
cab_to_cxyz(&task, smem_alpha, &cab, smem_cxyz);
}

cab_store cab = {
.data = smem_cab, .n1 = task.n1, .length = params->smem_cab_length};
zero_cab(&cab);
block_to_cab<IS_FUNC_AB>(params, &task, &cab);
cab_to_cxyz(&task, smem_alpha, &cab, smem_cxyz);
cxyz_to_grid(params, &task, smem_cxyz, params->grid);
}

Expand Down Expand Up @@ -333,22 +337,15 @@ void grid_gpu_collocate_one_grid_level(
init_constant_memory();

// Compute required shared memory.
// TODO: Currently, cab's indices run over 0...ncoset(lmax),
// however only ncoset(lmin)...ncoset(lmax) are actually needed.
const int cab_len = ncoset(lb_max) * ncoset(la_max);
const size_t available_dynamic_smem = 48 * 1024 - sizeof(smem_task);
const int alpha_len = 3 * (lb_max + 1) * (la_max + 1) * (lp_max + 1);
const int cxyz_len = ncoset(lp_max);
const int leftover_smem_len =
(available_dynamic_smem / sizeof(double)) - alpha_len - cxyz_len;
const int cab_full_len = ncoset(lb_max) * ncoset(la_max);
const int cab_len = imin(cab_full_len, leftover_smem_len);
const size_t smem_per_block =
(cab_len + alpha_len + cxyz_len) * sizeof(double);

if (smem_per_block > 48 * 1024) {
fprintf(stderr, "ERROR: Not enough shared memory in grid_gpu_collocate.\n");
fprintf(stderr, "cab_len: %i, ", cab_len);
fprintf(stderr, "alpha_len: %i, ", alpha_len);
fprintf(stderr, "cxyz_len: %i, ", cxyz_len);
fprintf(stderr, "total smem_per_block: %f kb\n\n", smem_per_block / 1024.0);
abort();
}
(alpha_len + cxyz_len + cab_len) * sizeof(double);

// kernel parameters
kernel_params params;
Expand Down
42 changes: 18 additions & 24 deletions src/grid/gpu/grid_gpu_integrate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -392,20 +392,21 @@ __device__ static void integrate_kernel(const kernel_params *params) {
double *smem_alpha = &shared_memory[params->smem_alpha_offset];
double *smem_cxyz = &shared_memory[params->smem_cxyz_offset];

compute_alpha(&task, smem_alpha);

zero_cxyz(&task, smem_cxyz);
grid_to_cxyz(params, &task, params->grid, smem_cxyz);

cab_store cab = {
.data = smem_cab, .n1 = task.n1, .length = params->smem_cab_length};
zero_cab(&cab);
cab_to_cxyz(&task, smem_alpha, &cab, smem_cxyz);

store_hab<COMPUTE_TAU>(&task, &cab);

if (CALCULATE_FORCES) {
store_forces_and_virial<COMPUTE_TAU>(params, &task, &cab);
// For large basis sets the Cab matrix does not fit into shared memory.
// Therefore, we're doing multiple passes while applying a mask.
compute_alpha(&task, smem_alpha);
for (int lb = 0; lb < task.n1 * task.n2; lb += params->smem_cab_length) {
const int ub = lb + params->smem_cab_length;
cab_store cab = {.data = smem_cab, .n1 = task.n1, .mask = {lb, ub}};
zero_cab(&cab);
cab_to_cxyz(&task, smem_alpha, &cab, smem_cxyz);
store_hab<COMPUTE_TAU>(&task, &cab);
if (CALCULATE_FORCES) {
store_forces_and_virial<COMPUTE_TAU>(params, &task, &cab);
}
}
}

Expand Down Expand Up @@ -472,22 +473,15 @@ void grid_gpu_integrate_one_grid_level(
init_constant_memory();

// Compute required shared memory.
// TODO: Currently, cab's indices run over 0...ncoset(lmax),
// however only ncoset(lmin)...ncoset(lmax) are actually needed.
const int cab_len = ncoset(lb_max) * ncoset(la_max);
const size_t available_dynamic_smem = 48 * 1024 - sizeof(smem_task);
const int alpha_len = 3 * (lb_max + 1) * (la_max + 1) * (lp_max + 1);
const int cxyz_len = ncoset(lp_max);
const int leftover_smem_len =
(available_dynamic_smem / sizeof(double)) - alpha_len - cxyz_len;
const int cab_full_len = ncoset(lb_max) * ncoset(la_max);
const int cab_len = imin(cab_full_len, leftover_smem_len);
const size_t smem_per_block =
(cab_len + alpha_len + cxyz_len) * sizeof(double);

if (smem_per_block > 48 * 1024) {
fprintf(stderr, "ERROR: Not enough shared memory in grid_gpu_integrate.\n");
fprintf(stderr, "cab_len: %i, ", cab_len);
fprintf(stderr, "alpha_len: %i, ", alpha_len);
fprintf(stderr, "cxyz_len: %i, ", cxyz_len);
fprintf(stderr, "total smem_per_block: %f kb\n\n", smem_per_block / 1024.0);
abort();
}
(alpha_len + cxyz_len + cab_len) * sizeof(double);

// kernel parameters
kernel_params params;
Expand Down
12 changes: 6 additions & 6 deletions src/grid/grid_replay.c
Original file line number Diff line number Diff line change
Expand Up @@ -437,8 +437,8 @@ double grid_replay(const char *filename, const int cycles, const bool collocate,
max_rel_diff = fmax(max_rel_diff, rel_diff);
max_value = fmax(max_value, fabs(test_value));
if (max_rel_diff > 1e-14) {
printf("%i %i ref: %le test: %le diff:%le rel_diff: %le\n", i, j,
ref_value, test_value, diff, rel_diff);
printf("hab[%i, %i] ref: %le test: %le diff:%le rel_diff: %le\n", i,
j, ref_value, test_value, diff, rel_diff);
}
}
}
Expand All @@ -452,8 +452,8 @@ double grid_replay(const char *filename, const int cycles, const bool collocate,
const double rel_diff = diff / fmax(1.0, fabs(ref_value));
max_rel_diff = fmax(max_rel_diff, rel_diff * forces_fudge_factor);
if (max_rel_diff > 1e-14) {
printf("forces %i %i ref: %le test: %le diff:%le rel_diff: %le\n", i,
j, ref_value, test_value, diff, rel_diff);
printf("forces[%i, %i] ref: %le test: %le diff:%le rel_diff: %le\n",
i, j, ref_value, test_value, diff, rel_diff);
}
}
}
Expand All @@ -467,8 +467,8 @@ double grid_replay(const char *filename, const int cycles, const bool collocate,
const double rel_diff = diff / fmax(1.0, fabs(ref_value));
max_rel_diff = fmax(max_rel_diff, rel_diff * virial_fudge_factor);
if (max_rel_diff > 1e-14) {
printf("virial %i %i ref: %le test: %le diff:%le rel_diff: %le\n", i,
j, ref_value, test_value, diff, rel_diff);
printf("virial[ %i, %i] ref: %le test: %le diff:%le rel_diff: %le\n",
i, j, ref_value, test_value, diff, rel_diff);
}
}
}
Expand Down
1 change: 1 addition & 0 deletions src/grid/grid_unittest.c
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ int main(int argc, char *argv[]) {
errors += run_test(argv[1], "ortho_density_l2200.task");
errors += run_test(argv[1], "ortho_density_l3300.task");
errors += run_test(argv[1], "ortho_density_l3333.task");
errors += run_test(argv[1], "ortho_density_l0505.task");
errors += run_test(argv[1], "ortho_non_periodic.task");
errors += run_test(argv[1], "ortho_tau.task");
errors += run_test(argv[1], "general_density.task");
Expand Down

0 comments on commit 98d2d7a

Please sign in to comment.