
Commit

refactoring, minor improvements, integrate GPU
- removed dynamic allocation when computing the coefficients for collocate
- removed unused functions from the tensor structure
- do precomputations for collocate during task initialization (cube dimensions are not precomputed); a sketch of the precomputed fields follows the file summary below
- added a function computing pab from the coefficients given by integrate
- start moving the coefficient calculations to the GPU; it is essentially the same code as for the CPU, adapted to GPU computation
- split the GPU source files for collocate and integrate
- use parts of the reference implementation for forces and stresses, without VLAs (CPU side)
mtaillefumier authored and dev-zero committed Dec 17, 2020
1 parent 1785898 commit e09b9ef
Showing 12 changed files with 1,989 additions and 944 deletions.
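
For orientation, a minimal sketch of the per-task data this commit starts precomputing in update_task_lists. The field names are taken from the accesses in the diff below (task->zeta, task->zetp, task->prefactor, task->ra/rb/rp, task->lmin, task->lmax, task->offset, task->update_block_); the actual _task definition lives in private_header.h, which is not part of this page, so the exact layout here is an assumption.

#include <stdbool.h>

/* Sketch only: the real struct may contain more members and a different order. */
typedef struct {
  int level, iatom, jatom, iset, jset, ipgf, jpgf;
  int block_num, border_mask;
  double radius;
  double rab[3];              /* vector between the two atoms */
  /* precomputed once per task instead of once per collocation call: */
  double zeta[2];             /* Gaussian exponents of the primitive pair */
  double zetp;                /* zeta[0] + zeta[1] */
  double prefactor;           /* (iatom == jatom ? 1.0 : 2.0) * exp(-zeta[0] * f * rab2) */
  double ra[3], rb[3], rp[3]; /* atom positions and product-Gaussian center */
  int lmin[2], lmax[2];       /* angular momentum ranges of the two sets */
  int offset[2];              /* ipgf * ncoseta, jpgf * ncosetb */
  bool update_block_;         /* fetch a new pab block only when block/set changed */
} _task;

With these fields filled in once, compute_coefficients no longer recomputes zetp, prefactor and rp on every call, and the task loop can pass task->zetp and task->rp straight to grid_collocate, as the diff in grid_collocate_dgemm.c shows.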
96 changes: 95 additions & 1 deletion src/grid/cpu/coefficients.c
@@ -68,7 +68,7 @@ void transform_triangular_to_xyz(const double *const coef_xyz,
}

// *****************************************************************************
void grid_prepare_coef_dgemm(
void grid_prepare_coef_dgemm_bis(
const int *lmin, const int *lmax, const int lp, const double prefactor,
const tensor *const alpha, // [3][lb_max+1][la_max+1][lp+1]
const tensor *const pab,
@@ -138,6 +138,100 @@ void grid_prepare_coef_dgemm(
grid_free_scratch(coef_xtt);
}

void grid_prepare_coef_dgemm(
const int *lmin, const int *lmax, const int lp, const double prefactor,
const tensor *const alpha, // [3][lb_max+1][la_max+1][lp+1]
const tensor *const pab,
tensor *coef_xyz) //[lp+1][lp+1][lp+1]
{
/* can be done with dgemms as well, since it is a change of basis from (x -
* x1) (x - x2) to (x - x12)^alpha */

assert(alpha != NULL);
assert(coef_xyz != NULL);
assert(coef_xyz->data != NULL);
memset(coef_xyz->data, 0, coef_xyz->alloc_size_ * sizeof(double));
// we need a proper fix for that. We can use the tensor structure for this

for (int lzb = 0; lzb <= lmax[1]; lzb++) {
for (int lyb = 0; lyb <= lmax[1] - lzb; lyb++) {
const int lxb_min = imax(lmin[1] - lzb - lyb, 0);
for (int lxb = lxb_min; lxb <= lmax[1] - lzb - lyb; lxb++) {
const int jco = coset(lxb, lyb, lzb);
for (int lza = 0; lza <= lmax[0]; lza++) {
for (int lya = 0; lya <= lmax[0] - lza; lya++) {
const int lxa_min = imax(lmin[0] - lza - lya, 0);
for (int lxa = lxa_min; lxa <= lmax[0] - lza - lya; lxa++) {
const int ico = coset(lxa, lya, lza);
const double pab_ = idx2(pab[0], jco, ico);
for (int lzp = 0; lzp <= lza + lzb; lzp++) {
double p = prefactor * pab_ * idx4(alpha[0], 2, lzb, lza, lzp);
for (int lxp = 0; lxp <= lp - lza - lzb; lxp++) {
double p1 = p * idx4(alpha[0], 0, lxb, lxa, lxp);
double *restrict dst = &idx3(coef_xyz[0], lxp, lzp, 0);
const double *restrict const src = &idx4(alpha[0], 1, lyb, lya, 0);
#pragma GCC ivdep
for (int lyp = 0; lyp <= lp - lza - lzb - lxp; lyp++) {
dst[lyp] += p1 * src[lyp]; // collocate
}
}
}
}
}
}
}
}
}
}

/* opposite of the previous function. Extract pab from coef */
void grid_compute_pab_dgemm(
const int *lmin, const int *lmax, const int lp, const double prefactor,
const tensor *const alpha, // [3][lb_max+1][la_max+1][lp+1]
const tensor *const coef_xyz,
tensor *pab) // [ncoset(lmax[1])][ncoset(lmax[0])]
{
/* can be done with dgemms as well, since it is a change of basis from (x -
* x1) (x - x2) to (x - x12)^alpha */

assert(alpha != NULL);
assert(coef_xyz != NULL);
assert(coef_xyz->data != NULL);
assert(pab != NULL);
assert(pab->data != NULL);
/* pab is accumulated into below, so clear the output, not the const input */
memset(pab->data, 0, pab->alloc_size_ * sizeof(double));
// we need a proper fix for that. We can use the tensor structure for this

for (int lzb = 0; lzb <= lmax[1]; lzb++) {
for (int lyb = 0; lyb <= lmax[1] - lzb; lyb++) {
const int lxb_min = imax(lmin[1] - lzb - lyb, 0);
for (int lxb = lxb_min; lxb <= lmax[1] - lzb - lyb; lxb++) {
const int jco = coset(lxb, lyb, lzb);
for (int lza = 0; lza <= lmax[0]; lza++) {
for (int lya = 0; lya <= lmax[0] - lza; lya++) {
const int lxa_min = imax(lmin[0] - lza - lya, 0);
for (int lxa = lxa_min; lxa <= lmax[0] - lza - lya; lxa++) {
const int ico = coset(lxa, lya, lza);
double pab_ = 0.0;
for (int lyp = 0; lyp <= lya + lyb; lyp++) {
double p = prefactor * idx4(alpha[0], 1, lyb, lya, lyp);
for (int lxp = 0; lxp <= lp - lya - lyb; lxp++) {
double p1 = p * idx4(alpha[0], 0, lxb, lxa, lxp);
const double *restrict const src1 = &idx3(coef_xyz[0], lyp, lxp, 0);
/* z-direction coefficients of alpha (direction index 2) */
const double *restrict const src2 = &idx4(alpha[0], 2, lzb, lza, 0);
#pragma GCC ivdep
for (int lzp = 0; lzp <= lp - lya - lyb - lxp; lzp++) {
pab_ += src1[lzp] * p1 * src2[lzp]; // integrate
}
}
}
idx2(pab[0], jco, ico) += pab_;
}
}
}
}
}
}
}

// *****************************************************************************
void grid_prepare_alpha_dgemm(const double ra[3], const double rb[3],
const double rp[3], const int *lmax,
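
For reference, the "change of basis" mentioned in the comments of grid_prepare_coef_dgemm and grid_compute_pab_dgemm is the standard binomial expansion of a product of two shifted monomials around the product-Gaussian center, per Cartesian direction. A sketch in conventional notation (the symbols are not taken from the code):

$$(x - R_a)^{l_a}\,(x - R_b)^{l_b} \;=\; \sum_{p=0}^{l_a+l_b} \alpha^{\,l_a l_b}_{p}\,(x - R_p)^{p},
\qquad
\alpha^{\,l_a l_b}_{p} \;=\; \sum_{k_a+k_b=p} \binom{l_a}{k_a}\binom{l_b}{k_b}\,(R_p-R_a)^{\,l_a-k_a}\,(R_p-R_b)^{\,l_b-k_b}.$$

grid_prepare_alpha_dgemm tabulates these alpha coefficients for each direction; grid_prepare_coef_dgemm contracts them with pab to obtain coef_xyz (collocate), and grid_compute_pab_dgemm performs the reverse contraction (integrate).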
3 changes: 3 additions & 0 deletions src/grid/cpu/collocation_integration.h
@@ -22,6 +22,7 @@ extern "C" {
#endif

#include "../cpu/tensor_local.h"
#include "private_header.h"
#ifdef __GRID_CUDA
typedef struct pgf_list_gpu_ {
/* number of devices */
@@ -86,6 +87,8 @@ typedef struct pgf_list_gpu_ {
int *border_mask_cpu_;
int *border_mask_gpu_;

_task *task_list_cpu_;
_task *task_list_gpu_;
int3 border_width;

struct pgf_list_gpu_ *next;
102 changes: 38 additions & 64 deletions src/grid/cpu/grid_collocate_dgemm.c
@@ -876,7 +876,6 @@ void grid_collocate_pgf_product_cpu_dgemm(

handler->grid.ld_ = grid_local_size[0];
handler->grid.data = grid_;
handler->blocked_grid.blocked_decomposition = false;

setup_global_grid_size(&handler->grid, (const int *const)grid_global_size);

@@ -936,12 +935,12 @@ void grid_collocate_pgf_product_cpu_dgemm(

initialize_tensor_4(&(handler->alpha), 3, lmax_prep[1] + 1, lmax_prep[0] + 1,
lmax_prep[0] + lmax_prep[1] + 1);
alloc_tensor(&(handler->alpha));
realloc_tensor(&(handler->alpha));

const int lp = lmax_prep[0] + lmax_prep[1];

initialize_tensor_3(&(handler->coef), lp + 1, lp + 1, lp + 1);
alloc_tensor(&(handler->coef));
realloc_tensor(&(handler->coef));

// initialy cp2k stores coef_xyz as coef[z][y][x]. this is fine but I
// need them to be stored as
@@ -965,38 +964,28 @@ void grid_collocate_pgf_product_cpu_dgemm(
free(pab_prep.data);
}

double compute_coefficients(grid_context *const ctx,
struct collocation_integration_ *handler,
const _task *task, tensor *const pab,
tensor *const work, tensor *const pab_prep,
int *const prev_block_num, int *const prev_iset,
int *const prev_jset, const grid_buffer *pab_blocks,
double *rp) {

void compute_coefficients(grid_context *const ctx,
struct collocation_integration_ *handler,
const _task *task,
const grid_buffer *pab_blocks,
tensor *const pab,
tensor *const work,
tensor *const pab_prep) {
const int iatom = task->iatom - 1;
const int jatom = task->jatom - 1;
const int iset = task->iset - 1;
const int jset = task->jset - 1;
const int ipgf = task->ipgf - 1;
const int jpgf = task->jpgf - 1;
const int ikind = ctx->atom_kinds[iatom] - 1;
const int jkind = ctx->atom_kinds[jatom] - 1;
const grid_basis_set *ibasis = ctx->basis_sets[ikind];
const grid_basis_set *jbasis = ctx->basis_sets[jkind];
const int ncoseta = ncoset(ibasis->lmax[iset]);
const int ncosetb = ncoset(jbasis->lmax[jset]);
/* const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian
* set */
/* const int ncob = jbasis->npgf[jset] * ncosetb; */

const int block_num = task->block_num - 1;

// Load subblock from buffer and decontract into Cartesian sublock pab.
// The previous pab can be reused when only ipgf or jpgf has changed.
if (block_num != *prev_block_num || iset != *prev_iset ||
jset != *prev_jset) {
*prev_block_num = block_num;
*prev_iset = iset;
*prev_jset = jset;

if (task->update_block_) {
// Locate current matrix block within the buffer. This block
// contains the weights of the gaussian pairs in the spherical
// harmonic basis, but we do computation in the cartesian
@@ -1010,36 +999,15 @@ double compute_coefficients(grid_context *const ctx,
block, work, pab);
} // end of block loading

const double zeta[2] = {ibasis->zet[iset * ibasis->maxpgf + ipgf],
jbasis->zet[jset * jbasis->maxpgf + jpgf]};

const double *ra = &ctx->atom_positions[3 * iatom];
int offset[2] = {ipgf * ncoseta, jpgf * ncosetb};

int lmax[2] = {ibasis->lmax[iset], jbasis->lmax[jset]};
int lmin[2] = {ibasis->lmin[iset], jbasis->lmin[jset]};

const double zetp = zeta[0] + zeta[1];
const double f = zeta[1] / zetp;
const double rab2 = task->rab[0] * task->rab[0] +
task->rab[1] * task->rab[1] + task->rab[2] * task->rab[2];
const double prefactor =
((iatom == jatom) ? 1.0 : 2.0) * exp(-zeta[0] * f * rab2);

double rb[3];
for (int i = 0; i < 3; i++) {
rp[i] = ra[i] + f * task->rab[i];
rb[i] = ra[i] + task->rab[i];
}

int lmin_prep[2];
int lmax_prep[2];

lmin_prep[0] = imax(lmin[0] + handler->lmin_diff[0], 0);
lmin_prep[1] = imax(lmin[1] + handler->lmin_diff[1], 0);
lmin_prep[0] = imax(task->lmin[0] + handler->lmin_diff[0], 0);
lmin_prep[1] = imax(task->lmin[1] + handler->lmin_diff[1], 0);

lmax_prep[0] = lmax[0] + handler->lmax_diff[0];
lmax_prep[1] = lmax[1] + handler->lmax_diff[1];
lmax_prep[0] = task->lmax[0] + handler->lmax_diff[0];
lmax_prep[1] = task->lmax[1] + handler->lmax_diff[1];

const int n1_prep = ncoset(lmax_prep[0]);
const int n2_prep = ncoset(lmax_prep[1]);
@@ -1050,7 +1018,12 @@ double compute_coefficients(grid_context *const ctx,
initialize_tensor_2(pab_prep, n2_prep, n1_prep);
realloc_tensor(pab_prep);

grid_prepare_pab_dgemm(handler->func, offset, lmin, lmax, &zeta[0], pab,
grid_prepare_pab_dgemm(handler->func,
task->offset,
task->lmin,
task->lmax,
&task->zeta[0],
pab,
pab_prep);

// *** initialise the coefficient matrix, we transform the sum
@@ -1081,18 +1054,23 @@ double compute_coefficients(grid_context *const ctx,
initialize_tensor_3(&handler->coef, lp + 1, lp + 1, lp + 1);
realloc_tensor(&handler->coef);

// trese two functions can be done with dgemm again....

// initialy cp2k stores coef_xyz as coef[z][y][x]. this is fine but I
// need them to be stored as
// these two functions can be done with dgemm again....

grid_prepare_alpha_dgemm(ra, rb, rp, lmax_prep, &handler->alpha);
grid_prepare_alpha_dgemm(task->ra,
task->rb,
task->rp,
lmax_prep,
&handler->alpha);

// compute the coefficients after applying the function of interest
// coef[x][z][y]
grid_prepare_coef_dgemm(lmin_prep, lmax_prep, lp, prefactor, &handler->alpha,
pab_prep, &handler->coef);
return zetp;
grid_prepare_coef_dgemm(lmin_prep,
lmax_prep,
lp,
task->prefactor,
&handler->alpha,
pab_prep,
&handler->coef);
}

void collocate_one_grid_level_dgemm(grid_context *const ctx,
@@ -1147,8 +1125,6 @@ void collocate_one_grid_level_dgemm(grid_context *const ctx,
memset(handler->grid.data, 0, sizeof(double) * grid->alloc_size_);
}

// Initialize variables to detect when a new subblock has to be fetched.
int prev_block_num = -1, prev_iset = -1, prev_jset = -1;

#pragma omp for schedule(static)
for (int itask = 0; itask < ctx->tasks_per_level[level]; itask++) {
@@ -1170,12 +1146,10 @@ void collocate_one_grid_level_dgemm(grid_context *const ctx,
task->border_mask);
}

double rp[3];
double zetp = compute_coefficients(ctx, handler, task, &pab, &work,
&pab_prep, &prev_block_num, &prev_iset,
&prev_jset, pab_blocks, rp);
compute_coefficients(ctx, handler, task, pab_blocks, &pab, &work,
&pab_prep);

grid_collocate(handler, ctx->orthorhombic, zetp, rp, task->radius);
grid_collocate(handler, ctx->orthorhombic, task->zetp, task->rp, task->radius);
}

// Merge thread local grids into shared grid. Could be improved though....
55 changes: 55 additions & 0 deletions src/grid/cpu/grid_context_cpu.c
@@ -151,6 +151,10 @@ void update_task_lists(const int nlevels, const int ntasks,
ctx->tasks[i] = ctx->tasks[i - 1] + ctx->tasks_per_level[i - 1];
}

int prev_block_num = -1;
int prev_iset = -1;
int prev_jset = -1;

_task *task = ctx->tasks[0];
for (int i = 0; i < ntasks; i++) {
task->level = level_list[i] - 1;
@@ -166,6 +170,57 @@ void update_task_lists(const int nlevels, const int ntasks,
task->rab[0] = rab_list[i][0];
task->rab[1] = rab_list[i][1];
task->rab[2] = rab_list[i][2];
const int iatom = task->iatom - 1;
const int jatom = task->jatom - 1;
const int iset = task->iset - 1;
const int jset = task->jset - 1;
const int ipgf = task->ipgf - 1;
const int jpgf = task->jpgf - 1;
const int ikind = ctx->atom_kinds[iatom] - 1;
const int jkind = ctx->atom_kinds[jatom] - 1;
const grid_basis_set *ibasis = ctx->basis_sets[ikind];
const grid_basis_set *jbasis = ctx->basis_sets[jkind];
const int ncoseta = ncoset(ibasis->lmax[iset]);
const int ncosetb = ncoset(jbasis->lmax[jset]);

task->zeta[0] = ibasis->zet[iset * ibasis->maxpgf + ipgf];
task->zeta[1] = jbasis->zet[jset * jbasis->maxpgf + jpgf];

const double *ra = &ctx->atom_positions[3 * iatom];
const double zetp = task->zeta[0] + task->zeta[1];
const double f = task->zeta[1] / zetp;
const double rab2 = task->rab[0] * task->rab[0] +
task->rab[1] * task->rab[1] + task->rab[2] * task->rab[2];

task->prefactor =
((iatom == jatom) ? 1.0 : 2.0) * exp(-task->zeta[0] * f * rab2);
task->zetp = zetp;

const int block_num = task->block_num - 1;

for (int i = 0; i < 3; i++) {
task->ra[i] = ra[i];
task->rp[i] = ra[i] + f * task->rab[i];
task->rb[i] = ra[i] + task->rab[i];
}

task->lmax[0] = ibasis->lmax[iset];
task->lmax[1] = jbasis->lmax[jset];
task->lmin[0] = ibasis->lmin[iset];
task->lmin[1] = jbasis->lmin[jset];

if ((block_num != prev_block_num) || (iset != prev_iset) ||
(jset != prev_jset)) {
task->update_block_ = true;
prev_block_num = block_num;
prev_iset = iset;
prev_jset = jset;
} else {
task->update_block_ = false;
}

task->offset[0] = ipgf * ncoseta;
task->offset[1] = jpgf * ncosetb;
task++;
}

