grid: Add GPU backend to stats

cp2k · Dec 23, 2020 · c2d8b10 · c2d8b10
1 parent 9ecfabf
commit c2d8b10
Show file tree

Hide file tree

Showing 9 changed files with 119 additions and 44 deletions.
diff --git a/src/grid/common/grid_library.c b/src/grid/common/grid_library.c
@@ -96,12 +96,23 @@ void grid_library_set_config(const enum grid_backend backend,
 grid_library_config grid_library_get_config() { return config; }
 
 /*******************************************************************************
- * \brief Increment specified counter, see grid_library.h for details.
+ * \brief Adds given increment to counter specified by lp, backend, and kernel.
  * \author Ole Schuett
  ******************************************************************************/
-void grid_library_increment_counter(int lp, int kern, int op) {
-  lp = imin(lp, 19);
-  per_thread_globals[omp_get_thread_num()]->counters[lp][kern][op]++;
+void grid_library_counter_add(const int lp, const enum grid_backend backend,
+                              const enum grid_library_kernel kernel,
+                              const int increment) {
+  const int back = backend - GRID_BACKEND_REF;
+  const int idx = back * 4 * 20 + kernel * 20 + imin(lp, 19);
+  per_thread_globals[omp_get_thread_num()]->counters[idx] += increment;
+}
+
+/*******************************************************************************
+ * \brief Comperator passed to qsort to compare two counters.
+ * \author Ole Schuett
+ ******************************************************************************/
+static int compare_counters(const void *a, const void *b) {
+  return *(long *)b - *(long *)a;
 }
 
 /*******************************************************************************
@@ -118,21 +129,20 @@ void grid_library_print_stats(void (*mpi_sum_func)(long *, int),
   }
 
   // Sum all counters across threads and mpi ranks.
-  long counters[20][2][2] = {0};
+  long counters[320][2] = {0};
   double total = 0.0;
-  for (int lp = 0; lp < 20; lp++) {
-    for (int kern = 0; kern < 2; kern++) {
-      for (int op = 0; op < 2; op++) {
-        for (int i = 0; i < omp_get_max_threads(); i++) {
-          counters[lp][kern][op] +=
-              per_thread_globals[i]->counters[lp][kern][op];
-        }
-        mpi_sum_func(&counters[lp][kern][op], mpi_comm);
-        total += counters[lp][kern][op];
-      }
+  for (int i = 0; i < 320; i++) {
+    counters[i][1] = i;
+    for (int j = 0; j < omp_get_max_threads(); j++) {
+      counters[i][0] += per_thread_globals[j]->counters[i];
     }
+    mpi_sum_func(&counters[i][0], mpi_comm);
+    total += counters[i][0];
   }
 
+  // Sort counters.
+  qsort(counters, 320, 2 * sizeof(long), &compare_counters);
+
   // Print counters.
   print_func("\n", output_unit);
   print_func(" ----------------------------------------------------------------"
@@ -150,24 +160,26 @@ void grid_library_print_stats(void (*mpi_sum_func)(long *, int),
   print_func(" ----------------------------------------------------------------"
              "---------------\n",
              output_unit);
-  print_func(" LP    KERNEL    OPERATION                                     "
+  print_func(" LP    KERNEL             BACKEND                              "
              "COUNT     PERCENT\n",
              output_unit);
 
-  for (int lp = 0; lp < 20; lp++) {
-    for (int kern = 0; kern < 2; kern++) {
-      for (int op = 0; op < 2; op++) {
-        if (counters[lp][kern][op] == 0)
-          continue; // skip empty counters
-        const char *op_str = (op == 1) ? "collocate" : "integrate";
-        const char *kern_str = (kern == 1) ? "ortho" : "general";
-        char buffer[100];
-        double percent = 100.0 * counters[lp][kern][op] / total;
-        snprintf(buffer, sizeof(buffer), " %-5i %-9s %-12s %38li %10.2f%%\n",
-                 lp, kern_str, op_str, counters[lp][kern][op], percent);
-        print_func(buffer, output_unit);
-      }
-    }
+  const char *kernel_names[] = {"collocate ortho", "integrate ortho",
+                                "collocate general", "integrate general"};
+  const char *backend_names[] = {"REF", "CPU", "GPU", "HYBRID"};
+
+  for (int i = 0; i < 320; i++) {
+    if (counters[i][0] == 0)
+      continue; // skip empty counters
+    const double percent = 100.0 * counters[i][0] / total;
+    const int idx = counters[i][1];
+    const int back = idx / 80;
+    const int kern = (idx % 80) / 20;
+    const int lp = (idx % 80) % 20;
+    char buffer[100];
+    snprintf(buffer, sizeof(buffer), " %-5i %-17s  %-6s  %34li %10.2f%%\n", lp,
+             kernel_names[kern], backend_names[back], counters[i][0], percent);
+    print_func(buffer, output_unit);
   }
 
   print_func(" ----------------------------------------------------------------"

diff --git a/src/grid/common/grid_library.h b/src/grid/common/grid_library.h
@@ -68,23 +68,33 @@ void grid_library_print_stats(void (*mpi_sum_func)(long *, int), int mpi_comm,
  ******************************************************************************/
 typedef struct {
   grid_sphere_cache sphere_cache;
-  long counters[20][2][2]; // [lp][kernel][operation]
+  long counters[4 * 4 * 20]; // [backend][kernel][lp]
 } grid_library_globals;
 
+/*******************************************************************************
+ * \brief Various kernels provided by the grid library.
+ * \author Ole Schuett
+ ******************************************************************************/
+enum grid_library_kernel {
+  GRID_COLLOCATE_ORTHO = 0,
+  GRID_INTEGRATE_ORTHO = 1,
+  GRID_COLLOCATE_GENERAL = 2,
+  GRID_INTEGRATE_GENERAL = 3,
+};
+
 /*******************************************************************************
  * \brief Returns a pointer to the thread local sphere cache.
  * \author Ole Schuett
  ******************************************************************************/
 grid_sphere_cache *grid_library_get_sphere_cache();
 
 /*******************************************************************************
- * \brief Increment specified counter.
- * \param lp    Total angular momentum: la_max + lb_max.
- * \param kern  Kernel: 0=ortho, 1=general.
- * \param op    Operation: 0=integrate, 1=collocate.
+ * \brief Adds given increment to counter specified by lp, backend, and kernel.
  * \author Ole Schuett
  ******************************************************************************/
-void grid_library_increment_counter(int collocate, int ortho, int lp);
+void grid_library_counter_add(const int lp, const enum grid_backend backend,
+                              const enum grid_library_kernel kern,
+                              const int increment);
 
 #ifdef __cplusplus
 }

diff --git a/src/grid/gpu/grid_gpu_collocate.cu b/src/grid/gpu/grid_gpu_collocate.cu
@@ -142,7 +142,8 @@ void grid_gpu_collocate_one_grid_level(
     const int last_task, const bool orthorhombic, const enum grid_func func,
     const int npts_global[3], const int npts_local[3], const int shift_local[3],
     const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
-    const cudaStream_t stream, const double *pab_blocks_dev, double *grid_dev) {
+    const cudaStream_t stream, const double *pab_blocks_dev, double *grid_dev,
+    int *lp_diff) {
 
   const int ntasks = last_task - first_task + 1;
   if (ntasks == 0) {
@@ -153,6 +154,7 @@ void grid_gpu_collocate_one_grid_level(
 
   // Compute max angular momentum.
   const prepare_ldiffs ldiffs = prepare_get_ldiffs(func);
+  *lp_diff = ldiffs.la_max_diff + ldiffs.lb_max_diff; // for reporting stats
   const int la_max = task_list->lmax + ldiffs.la_max_diff;
   const int lb_max = task_list->lmax + ldiffs.lb_max_diff;
   const int lp_max = la_max + lb_max;

diff --git a/src/grid/gpu/grid_gpu_collocate.h b/src/grid/gpu/grid_gpu_collocate.h
@@ -25,7 +25,8 @@ void grid_gpu_collocate_one_grid_level(
     const int last_task, const bool orthorhombic, const enum grid_func func,
     const int npts_global[3], const int npts_local[3], const int shift_local[3],
     const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
-    const cudaStream_t stream, const double *pab_blocks_dev, double *grid_dev);
+    const cudaStream_t stream, const double *pab_blocks_dev, double *grid_dev,
+    int *lp_diff);
 
 #ifdef __cplusplus
 }

diff --git a/src/grid/gpu/grid_gpu_integrate.cu b/src/grid/gpu/grid_gpu_integrate.cu
@@ -207,7 +207,7 @@ void grid_gpu_integrate_one_grid_level(
     const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
     const cudaStream_t stream, const double *pab_blocks_dev,
     const double *grid_dev, double *hab_blocks_dev, double *forces_dev,
-    double *virial_dev) {
+    double *virial_dev, int *lp_diff) {
 
   const int ntasks = last_task - first_task + 1;
   if (ntasks == 0) {
@@ -222,6 +222,7 @@ void grid_gpu_integrate_one_grid_level(
   assert(!calculate_virial || calculate_forces);
   const process_ldiffs ldiffs =
       process_get_ldiffs(calculate_forces, calculate_virial, compute_tau);
+  *lp_diff = ldiffs.la_max_diff + ldiffs.lb_max_diff; // for reporting stats
   const int la_max = task_list->lmax + ldiffs.la_max_diff;
   const int lb_max = task_list->lmax + ldiffs.lb_max_diff;
   const int lp_max = la_max + lb_max;

diff --git a/src/grid/gpu/grid_gpu_integrate.h b/src/grid/gpu/grid_gpu_integrate.h
@@ -27,7 +27,7 @@ void grid_gpu_integrate_one_grid_level(
     const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
     const cudaStream_t stream, const double *pab_blocks_dev,
     const double *grid_dev, double *hab_blocks_dev, double *forces_dev,
-    double *virial_dev);
+    double *virial_dev, int *lp_diff);
 
 #ifdef __cplusplus
 }

diff --git a/src/grid/gpu/grid_gpu_task_list.cu b/src/grid/gpu/grid_gpu_task_list.cu
@@ -167,6 +167,22 @@ void grid_gpu_create_task_list(
     }
   }
 
+  // collect stats
+  memset(task_list->stats, 0, 2 * 20 * sizeof(int));
+  for (int itask = 0; itask < ntasks; itask++) {
+    const int iatom = iatom_list[itask] - 1;
+    const int jatom = jatom_list[itask] - 1;
+    const int ikind = atom_kinds[iatom] - 1;
+    const int jkind = atom_kinds[jatom] - 1;
+    const int iset = iset_list[itask] - 1;
+    const int jset = jset_list[itask] - 1;
+    const int la_max = basis_sets[ikind]->lmax[iset];
+    const int lb_max = basis_sets[jkind]->lmax[jset];
+    const int lp = imin(la_max + lb_max, 19);
+    const bool has_border_mask = (border_mask_list[itask] != 0);
+    task_list->stats[has_border_mask][lp]++;
+  }
+
   // allocate main cuda stream
   CHECK(cudaStreamCreate(&task_list->main_stream));
 
@@ -265,6 +281,7 @@ void grid_gpu_collocate_task_list(
   CHECK(cudaEventCreate(&input_ready_event));
   CHECK(cudaEventRecord(input_ready_event, task_list->main_stream));
 
+  int lp_diff;
   int first_task = 0;
   assert(task_list->nlevels == nlevels);
   for (int level = 0; level < task_list->nlevels; level++) {
@@ -292,11 +309,25 @@ void grid_gpu_collocate_task_list(
         task_list, first_task, last_task, orthorhombic, func,
         npts_global[level], npts_local[level], shift_local[level],
         border_width[level], dh[level], dh_inv[level], level_stream,
-        pab_blocks->device_buffer, task_list->grid_dev[level]);
+        pab_blocks->device_buffer, task_list->grid_dev[level], &lp_diff);
 
     first_task = last_task + 1;
   }
 
+  // update counters while we wait for kernels to finish
+  for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
+    for (int lp = 0; lp < 20; lp++) {
+      const int count = task_list->stats[has_border_mask][lp];
+      if (orthorhombic && !has_border_mask) {
+        grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,
+                                 GRID_COLLOCATE_ORTHO, count);
+      } else {
+        grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,
+                                 GRID_COLLOCATE_GENERAL, count);
+      }
+    }
+  }
+
   // download result from device to host.
   // TODO: Make these mem copies actually async by page locking the grid buffers
   for (int level = 0; level < task_list->nlevels; level++) {
@@ -361,6 +392,7 @@ void grid_gpu_integrate_task_list(
   CHECK(cudaEventCreate(&input_ready_event));
   CHECK(cudaEventRecord(input_ready_event, task_list->main_stream));
 
+  int lp_diff;
   int first_task = 0;
   assert(task_list->nlevels == nlevels);
   for (int level = 0; level < task_list->nlevels; level++) {
@@ -389,7 +421,7 @@ void grid_gpu_integrate_task_list(
         npts_global[level], npts_local[level], shift_local[level],
         border_width[level], dh[level], dh_inv[level], level_stream,
         pab_blocks_dev, task_list->grid_dev[level], hab_blocks->device_buffer,
-        forces_dev, virial_dev);
+        forces_dev, virial_dev, &lp_diff);
 
     // Have main stream wait for level to complete before downloading results.
     cudaEvent_t level_done_event;
@@ -401,6 +433,20 @@ void grid_gpu_integrate_task_list(
     first_task = last_task + 1;
   }
 
+  // update counters while we wait for kernels to finish
+  for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
+    for (int lp = 0; lp < 20; lp++) {
+      const int count = task_list->stats[has_border_mask][lp];
+      if (orthorhombic && !has_border_mask) {
+        grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,
+                                 GRID_INTEGRATE_ORTHO, count);
+      } else {
+        grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,
+                                 GRID_INTEGRATE_GENERAL, count);
+      }
+    }
+  }
+
   // download result from device to host using main stream.
   CHECK(cudaMemcpyAsync(hab_blocks->host_buffer, hab_blocks->device_buffer,
                         hab_blocks->size, cudaMemcpyDeviceToHost,

diff --git a/src/grid/gpu/grid_gpu_task_list.h b/src/grid/gpu/grid_gpu_task_list.h
@@ -51,6 +51,7 @@ typedef struct {
   cudaStream_t *level_streams;
   cudaStream_t main_stream;
   int lmax;
+  int stats[2][20]; // [has_border_mask][lp]
   // device pointers
   int *block_offsets_dev;
   double *atom_positions_dev;

diff --git a/src/grid/ref/grid_ref_collint.h b/src/grid/ref/grid_ref_collint.h
@@ -787,16 +787,18 @@ cxyz_to_grid(const bool orthorhombic, const int border_mask, const int lp,
              const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz,
              GRID_CONST_WHEN_INTEGRATE double *grid) {
 
+  enum grid_library_kernel k;
   if (orthorhombic && border_mask == 0) {
-    grid_library_increment_counter(lp, 1, GRID_DO_COLLOCATE);
+    k = (GRID_DO_COLLOCATE) ? GRID_COLLOCATE_ORTHO : GRID_INTEGRATE_ORTHO;
     ortho_cxyz_to_grid(lp, zetp, dh, dh_inv, rp, npts_global, npts_local,
                        shift_local, radius, cxyz, grid);
   } else {
-    grid_library_increment_counter(lp, 0, GRID_DO_COLLOCATE);
+    k = (GRID_DO_COLLOCATE) ? GRID_COLLOCATE_GENERAL : GRID_INTEGRATE_GENERAL;
     general_cxyz_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
                          npts_local, shift_local, border_width, radius, cxyz,
                          grid);
   }
+  grid_library_counter_add(lp, GRID_BACKEND_REF, k, 1);
 }
 
 /*******************************************************************************