Skip to content

Commit

Permalink
grid: Add GPU backend to stats
Browse files Browse the repository at this point in the history
  • Loading branch information
oschuett committed Dec 23, 2020
1 parent 9ecfabf commit c2d8b10
Show file tree
Hide file tree
Showing 9 changed files with 119 additions and 44 deletions.
72 changes: 42 additions & 30 deletions src/grid/common/grid_library.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,23 @@ void grid_library_set_config(const enum grid_backend backend,
grid_library_config grid_library_get_config() { return config; }

/*******************************************************************************
* \brief Increment specified counter, see grid_library.h for details.
* \brief Adds given increment to counter specified by lp, backend, and kernel.
* \author Ole Schuett
******************************************************************************/
void grid_library_increment_counter(int lp, int kern, int op) {
lp = imin(lp, 19);
per_thread_globals[omp_get_thread_num()]->counters[lp][kern][op]++;
void grid_library_counter_add(const int lp, const enum grid_backend backend,
const enum grid_library_kernel kernel,
const int increment) {
const int back = backend - GRID_BACKEND_REF;
const int idx = back * 4 * 20 + kernel * 20 + imin(lp, 19);
per_thread_globals[omp_get_thread_num()]->counters[idx] += increment;
}

/*******************************************************************************
* \brief Comperator passed to qsort to compare two counters.
* \author Ole Schuett
******************************************************************************/
static int compare_counters(const void *a, const void *b) {
return *(long *)b - *(long *)a;
}

/*******************************************************************************
Expand All @@ -118,21 +129,20 @@ void grid_library_print_stats(void (*mpi_sum_func)(long *, int),
}

// Sum all counters across threads and mpi ranks.
long counters[20][2][2] = {0};
long counters[320][2] = {0};
double total = 0.0;
for (int lp = 0; lp < 20; lp++) {
for (int kern = 0; kern < 2; kern++) {
for (int op = 0; op < 2; op++) {
for (int i = 0; i < omp_get_max_threads(); i++) {
counters[lp][kern][op] +=
per_thread_globals[i]->counters[lp][kern][op];
}
mpi_sum_func(&counters[lp][kern][op], mpi_comm);
total += counters[lp][kern][op];
}
for (int i = 0; i < 320; i++) {
counters[i][1] = i;
for (int j = 0; j < omp_get_max_threads(); j++) {
counters[i][0] += per_thread_globals[j]->counters[i];
}
mpi_sum_func(&counters[i][0], mpi_comm);
total += counters[i][0];
}

// Sort counters.
qsort(counters, 320, 2 * sizeof(long), &compare_counters);

// Print counters.
print_func("\n", output_unit);
print_func(" ----------------------------------------------------------------"
Expand All @@ -150,24 +160,26 @@ void grid_library_print_stats(void (*mpi_sum_func)(long *, int),
print_func(" ----------------------------------------------------------------"
"---------------\n",
output_unit);
print_func(" LP KERNEL OPERATION "
print_func(" LP KERNEL BACKEND "
"COUNT PERCENT\n",
output_unit);

for (int lp = 0; lp < 20; lp++) {
for (int kern = 0; kern < 2; kern++) {
for (int op = 0; op < 2; op++) {
if (counters[lp][kern][op] == 0)
continue; // skip empty counters
const char *op_str = (op == 1) ? "collocate" : "integrate";
const char *kern_str = (kern == 1) ? "ortho" : "general";
char buffer[100];
double percent = 100.0 * counters[lp][kern][op] / total;
snprintf(buffer, sizeof(buffer), " %-5i %-9s %-12s %38li %10.2f%%\n",
lp, kern_str, op_str, counters[lp][kern][op], percent);
print_func(buffer, output_unit);
}
}
const char *kernel_names[] = {"collocate ortho", "integrate ortho",
"collocate general", "integrate general"};
const char *backend_names[] = {"REF", "CPU", "GPU", "HYBRID"};

for (int i = 0; i < 320; i++) {
if (counters[i][0] == 0)
continue; // skip empty counters
const double percent = 100.0 * counters[i][0] / total;
const int idx = counters[i][1];
const int back = idx / 80;
const int kern = (idx % 80) / 20;
const int lp = (idx % 80) % 20;
char buffer[100];
snprintf(buffer, sizeof(buffer), " %-5i %-17s %-6s %34li %10.2f%%\n", lp,
kernel_names[kern], backend_names[back], counters[i][0], percent);
print_func(buffer, output_unit);
}

print_func(" ----------------------------------------------------------------"
Expand Down
22 changes: 16 additions & 6 deletions src/grid/common/grid_library.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,23 +68,33 @@ void grid_library_print_stats(void (*mpi_sum_func)(long *, int), int mpi_comm,
******************************************************************************/
typedef struct {
grid_sphere_cache sphere_cache;
long counters[20][2][2]; // [lp][kernel][operation]
long counters[4 * 4 * 20]; // [backend][kernel][lp]
} grid_library_globals;

/*******************************************************************************
* \brief Various kernels provided by the grid library.
* \author Ole Schuett
******************************************************************************/
enum grid_library_kernel {
GRID_COLLOCATE_ORTHO = 0,
GRID_INTEGRATE_ORTHO = 1,
GRID_COLLOCATE_GENERAL = 2,
GRID_INTEGRATE_GENERAL = 3,
};

/*******************************************************************************
* \brief Returns a pointer to the thread local sphere cache.
* \author Ole Schuett
******************************************************************************/
grid_sphere_cache *grid_library_get_sphere_cache();

/*******************************************************************************
* \brief Increment specified counter.
* \param lp Total angular momentum: la_max + lb_max.
* \param kern Kernel: 0=ortho, 1=general.
* \param op Operation: 0=integrate, 1=collocate.
* \brief Adds given increment to counter specified by lp, backend, and kernel.
* \author Ole Schuett
******************************************************************************/
void grid_library_increment_counter(int collocate, int ortho, int lp);
void grid_library_counter_add(const int lp, const enum grid_backend backend,
const enum grid_library_kernel kern,
const int increment);

#ifdef __cplusplus
}
Expand Down
4 changes: 3 additions & 1 deletion src/grid/gpu/grid_gpu_collocate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,8 @@ void grid_gpu_collocate_one_grid_level(
const int last_task, const bool orthorhombic, const enum grid_func func,
const int npts_global[3], const int npts_local[3], const int shift_local[3],
const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
const cudaStream_t stream, const double *pab_blocks_dev, double *grid_dev) {
const cudaStream_t stream, const double *pab_blocks_dev, double *grid_dev,
int *lp_diff) {

const int ntasks = last_task - first_task + 1;
if (ntasks == 0) {
Expand All @@ -153,6 +154,7 @@ void grid_gpu_collocate_one_grid_level(

// Compute max angular momentum.
const prepare_ldiffs ldiffs = prepare_get_ldiffs(func);
*lp_diff = ldiffs.la_max_diff + ldiffs.lb_max_diff; // for reporting stats
const int la_max = task_list->lmax + ldiffs.la_max_diff;
const int lb_max = task_list->lmax + ldiffs.lb_max_diff;
const int lp_max = la_max + lb_max;
Expand Down
3 changes: 2 additions & 1 deletion src/grid/gpu/grid_gpu_collocate.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ void grid_gpu_collocate_one_grid_level(
const int last_task, const bool orthorhombic, const enum grid_func func,
const int npts_global[3], const int npts_local[3], const int shift_local[3],
const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
const cudaStream_t stream, const double *pab_blocks_dev, double *grid_dev);
const cudaStream_t stream, const double *pab_blocks_dev, double *grid_dev,
int *lp_diff);

#ifdef __cplusplus
}
Expand Down
3 changes: 2 additions & 1 deletion src/grid/gpu/grid_gpu_integrate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ void grid_gpu_integrate_one_grid_level(
const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
const cudaStream_t stream, const double *pab_blocks_dev,
const double *grid_dev, double *hab_blocks_dev, double *forces_dev,
double *virial_dev) {
double *virial_dev, int *lp_diff) {

const int ntasks = last_task - first_task + 1;
if (ntasks == 0) {
Expand All @@ -222,6 +222,7 @@ void grid_gpu_integrate_one_grid_level(
assert(!calculate_virial || calculate_forces);
const process_ldiffs ldiffs =
process_get_ldiffs(calculate_forces, calculate_virial, compute_tau);
*lp_diff = ldiffs.la_max_diff + ldiffs.lb_max_diff; // for reporting stats
const int la_max = task_list->lmax + ldiffs.la_max_diff;
const int lb_max = task_list->lmax + ldiffs.lb_max_diff;
const int lp_max = la_max + lb_max;
Expand Down
2 changes: 1 addition & 1 deletion src/grid/gpu/grid_gpu_integrate.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ void grid_gpu_integrate_one_grid_level(
const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
const cudaStream_t stream, const double *pab_blocks_dev,
const double *grid_dev, double *hab_blocks_dev, double *forces_dev,
double *virial_dev);
double *virial_dev, int *lp_diff);

#ifdef __cplusplus
}
Expand Down
50 changes: 48 additions & 2 deletions src/grid/gpu/grid_gpu_task_list.cu
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,22 @@ void grid_gpu_create_task_list(
}
}

// collect stats
memset(task_list->stats, 0, 2 * 20 * sizeof(int));
for (int itask = 0; itask < ntasks; itask++) {
const int iatom = iatom_list[itask] - 1;
const int jatom = jatom_list[itask] - 1;
const int ikind = atom_kinds[iatom] - 1;
const int jkind = atom_kinds[jatom] - 1;
const int iset = iset_list[itask] - 1;
const int jset = jset_list[itask] - 1;
const int la_max = basis_sets[ikind]->lmax[iset];
const int lb_max = basis_sets[jkind]->lmax[jset];
const int lp = imin(la_max + lb_max, 19);
const bool has_border_mask = (border_mask_list[itask] != 0);
task_list->stats[has_border_mask][lp]++;
}

// allocate main cuda stream
CHECK(cudaStreamCreate(&task_list->main_stream));

Expand Down Expand Up @@ -265,6 +281,7 @@ void grid_gpu_collocate_task_list(
CHECK(cudaEventCreate(&input_ready_event));
CHECK(cudaEventRecord(input_ready_event, task_list->main_stream));

int lp_diff;
int first_task = 0;
assert(task_list->nlevels == nlevels);
for (int level = 0; level < task_list->nlevels; level++) {
Expand Down Expand Up @@ -292,11 +309,25 @@ void grid_gpu_collocate_task_list(
task_list, first_task, last_task, orthorhombic, func,
npts_global[level], npts_local[level], shift_local[level],
border_width[level], dh[level], dh_inv[level], level_stream,
pab_blocks->device_buffer, task_list->grid_dev[level]);
pab_blocks->device_buffer, task_list->grid_dev[level], &lp_diff);

first_task = last_task + 1;
}

// update counters while we wait for kernels to finish
for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
for (int lp = 0; lp < 20; lp++) {
const int count = task_list->stats[has_border_mask][lp];
if (orthorhombic && !has_border_mask) {
grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,
GRID_COLLOCATE_ORTHO, count);
} else {
grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,
GRID_COLLOCATE_GENERAL, count);
}
}
}

// download result from device to host.
// TODO: Make these mem copies actually async by page locking the grid buffers
for (int level = 0; level < task_list->nlevels; level++) {
Expand Down Expand Up @@ -361,6 +392,7 @@ void grid_gpu_integrate_task_list(
CHECK(cudaEventCreate(&input_ready_event));
CHECK(cudaEventRecord(input_ready_event, task_list->main_stream));

int lp_diff;
int first_task = 0;
assert(task_list->nlevels == nlevels);
for (int level = 0; level < task_list->nlevels; level++) {
Expand Down Expand Up @@ -389,7 +421,7 @@ void grid_gpu_integrate_task_list(
npts_global[level], npts_local[level], shift_local[level],
border_width[level], dh[level], dh_inv[level], level_stream,
pab_blocks_dev, task_list->grid_dev[level], hab_blocks->device_buffer,
forces_dev, virial_dev);
forces_dev, virial_dev, &lp_diff);

// Have main stream wait for level to complete before downloading results.
cudaEvent_t level_done_event;
Expand All @@ -401,6 +433,20 @@ void grid_gpu_integrate_task_list(
first_task = last_task + 1;
}

// update counters while we wait for kernels to finish
for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
for (int lp = 0; lp < 20; lp++) {
const int count = task_list->stats[has_border_mask][lp];
if (orthorhombic && !has_border_mask) {
grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,
GRID_INTEGRATE_ORTHO, count);
} else {
grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,
GRID_INTEGRATE_GENERAL, count);
}
}
}

// download result from device to host using main stream.
CHECK(cudaMemcpyAsync(hab_blocks->host_buffer, hab_blocks->device_buffer,
hab_blocks->size, cudaMemcpyDeviceToHost,
Expand Down
1 change: 1 addition & 0 deletions src/grid/gpu/grid_gpu_task_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ typedef struct {
cudaStream_t *level_streams;
cudaStream_t main_stream;
int lmax;
int stats[2][20]; // [has_border_mask][lp]
// device pointers
int *block_offsets_dev;
double *atom_positions_dev;
Expand Down
6 changes: 4 additions & 2 deletions src/grid/ref/grid_ref_collint.h
Original file line number Diff line number Diff line change
Expand Up @@ -787,16 +787,18 @@ cxyz_to_grid(const bool orthorhombic, const int border_mask, const int lp,
const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz,
GRID_CONST_WHEN_INTEGRATE double *grid) {

enum grid_library_kernel k;
if (orthorhombic && border_mask == 0) {
grid_library_increment_counter(lp, 1, GRID_DO_COLLOCATE);
k = (GRID_DO_COLLOCATE) ? GRID_COLLOCATE_ORTHO : GRID_INTEGRATE_ORTHO;
ortho_cxyz_to_grid(lp, zetp, dh, dh_inv, rp, npts_global, npts_local,
shift_local, radius, cxyz, grid);
} else {
grid_library_increment_counter(lp, 0, GRID_DO_COLLOCATE);
k = (GRID_DO_COLLOCATE) ? GRID_COLLOCATE_GENERAL : GRID_INTEGRATE_GENERAL;
general_cxyz_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
npts_local, shift_local, border_width, radius, cxyz,
grid);
}
grid_library_counter_add(lp, GRID_BACKEND_REF, k, 1);
}

/*******************************************************************************
Expand Down

0 comments on commit c2d8b10

Please sign in to comment.