
Commit

refactoring, minor improvements, integrate GPU
- removed dynamic allocation when computing the coefficients for collocate
- removed unused functions from the tensor structure
- do precomputations for collocate during task initialization (cube dimensions are not precomputed); a sketch of the precomputed fields follows the file summary below
- added a function computing pab from the coefficients given by integrate
- start moving the coefficient calculations to the GPU; it is essentially the same code as for the CPU, adapted to GPU computation
- split the GPU source files for collocate and integrate
- use parts of the reference implementation for forces and stresses, without VLAs (CPU side)
mtaillefumier authored and dev-zero committed Dec 17, 2020
1 parent 1785898 commit e09b9ef
Showing 12 changed files with 1,989 additions and 944 deletions.
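
For orientation, a minimal sketch of the per-task data this commit starts precomputing in update_task_lists. The field names are taken from the accesses in the diff below (task->zeta, task->zetp, task->prefactor, task->ra/rb/rp, task->lmin, task->lmax, task->offset, task->update_block_); the actual _task definition lives in private_header.h, which is not part of this page, so the exact layout here is an assumption.

#include <stdbool.h>

/* Sketch only: the real struct may contain more members and a different order. */
typedef struct {
  int level, iatom, jatom, iset, jset, ipgf, jpgf;
  int block_num, border_mask;
  double radius;
  double rab[3];              /* vector between the two atoms */
  /* precomputed once per task instead of once per collocation call: */
  double zeta[2];             /* Gaussian exponents of the primitive pair */
  double zetp;                /* zeta[0] + zeta[1] */
  double prefactor;           /* (iatom == jatom ? 1.0 : 2.0) * exp(-zeta[0] * f * rab2) */
  double ra[3], rb[3], rp[3]; /* atom positions and product-Gaussian center */
  int lmin[2], lmax[2];       /* angular momentum ranges of the two sets */
  int offset[2];              /* ipgf * ncoseta, jpgf * ncosetb */
  bool update_block_;         /* fetch a new pab block only when block/set changed */
} _task;

With these fields filled in once, compute_coefficients no longer recomputes zetp, prefactor and rp on every call, and the task loop can pass task->zetp and task->rp straight to grid_collocate, as the diff in grid_collocate_dgemm.c shows.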
96 changes: 95 additions & 1 deletion src/grid/cpu/coefficients.c
@@ -68,7 +68,7 @@ void transform_triangular_to_xyz(const double *const coef_xyz,
}

// *****************************************************************************
void grid_prepare_coef_dgemm(
void grid_prepare_coef_dgemm_bis(
const int *lmin, const int *lmax, const int lp, const double prefactor,
const tensor *const alpha, // [3][lb_max+1][la_max+1][lp+1]
const tensor *const pab,
@@ -138,6 +138,100 @@ void grid_prepare_coef_dgemm(
grid_free_scratch(coef_xtt);
}

void grid_prepare_coef_dgemm(
const int *lmin, const int *lmax, const int lp, const double prefactor,
const tensor *const alpha, // [3][lb_max+1][la_max+1][lp+1]
const tensor *const pab,
tensor *coef_xyz) //[lp+1][lp+1][lp+1]
{
/* can be done with dgemms as well, since it is a change of basis from (x -
* x1) (x - x2) to (x - x12)^alpha */

assert(alpha != NULL);
assert(coef_xyz != NULL);
assert(coef_xyz->data != NULL);
memset(coef_xyz->data, 0, coef_xyz->alloc_size_ * sizeof(double));
// we need a proper fix for that. We can use the tensor structure for this

for (int lzb = 0; lzb <= lmax[1]; lzb++) {
for (int lyb = 0; lyb <= lmax[1] - lzb; lyb++) {
const int lxb_min = imax(lmin[1] - lzb - lyb, 0);
for (int lxb = lxb_min; lxb <= lmax[1] - lzb - lyb; lxb++) {
const int jco = coset(lxb, lyb, lzb);
for (int lza = 0; lza <= lmax[0]; lza++) {
for (int lya = 0; lya <= lmax[0] - lza; lya++) {
const int lxa_min = imax(lmin[0] - lza - lya, 0);
for (int lxa = lxa_min; lxa <= lmax[0] - lza - lya; lxa++) {
const int ico = coset(lxa, lya, lza);
const double pab_ = idx2(pab[0], jco, ico);
for (int lzp = 0; lzp <= lza + lzb; lzp++) {
double p = prefactor * pab_ * idx4(alpha[0], 2, lzb, lza, lzp);
for (int lxp = 0; lxp <= lp - lza - lzb; lxp++) {
double p1 = p * idx4(alpha[0], 0, lxb, lxa, lxp);
double *restrict dst = &idx3(coef_xyz[0], lxp, lzp, 0);
const double *restrict const src = &idx4(alpha[0], 1, lyb, lya, 0);
#pragma GCC ivdep
for (int lyp = 0; lyp <= lp - lza - lzb - lxp; lyp++) {
dst[lyp] += p1 * src[lyp]; // collocate
}
}
}
}
}
}
}
}
}
}

/* opposite of the previous function. Extract pab from coef */
void grid_compute_pab_dgemm(
const int *lmin, const int *lmax, const int lp, const double prefactor,
const tensor *const alpha, // [3][lb_max+1][la_max+1][lp+1]
const tensor *const coef_xyz,
tensor *pab) // [ncoset(lmax[1])][ncoset(lmax[0])]
{
/* can be done with dgemms as well, since it is a change of basis from (x -
* x1) (x - x2) to (x - x12)^alpha */

assert(alpha != NULL);
assert(coef_xyz != NULL);
assert(coef_xyz->data != NULL);
assert(pab != NULL);
assert(pab->data != NULL);
/* pab is accumulated into below, so clear the output, not the const input */
memset(pab->data, 0, pab->alloc_size_ * sizeof(double));
// we need a proper fix for that. We can use the tensor structure for this

for (int lzb = 0; lzb <= lmax[1]; lzb++) {
for (int lyb = 0; lyb <= lmax[1] - lzb; lyb++) {
const int lxb_min = imax(lmin[1] - lzb - lyb, 0);
for (int lxb = lxb_min; lxb <= lmax[1] - lzb - lyb; lxb++) {
const int jco = coset(lxb, lyb, lzb);
for (int lza = 0; lza <= lmax[0]; lza++) {
for (int lya = 0; lya <= lmax[0] - lza; lya++) {
const int lxa_min = imax(lmin[0] - lza - lya, 0);
for (int lxa = lxa_min; lxa <= lmax[0] - lza - lya; lxa++) {
const int ico = coset(lxa, lya, lza);
double pab_ = 0.0;
for (int lyp = 0; lyp <= lya + lyb; lyp++) {
double p = prefactor * idx4(alpha[0], 1, lyb, lya, lyp);
for (int lxp = 0; lxp <= lp - lya - lyb; lxp++) {
double p1 = p * idx4(alpha[0], 0, lxb, lxa, lxp);
const double *restrict const src1 = &idx3(coef_xyz[0], lyp, lxp, 0);
/* z-direction coefficients of alpha (direction index 2) */
const double *restrict const src2 = &idx4(alpha[0], 2, lzb, lza, 0);
#pragma GCC ivdep
for (int lzp = 0; lzp <= lp - lya - lyb - lxp; lzp++) {
pab_ += src1[lzp] * p1 * src2[lzp]; // integrate
}
}
}
idx2(pab[0], jco, ico) += pab_;
}
}
}
}
}
}
}

// *****************************************************************************
void grid_prepare_alpha_dgemm(const double ra[3], const double rb[3],
const double rp[3], const int *lmax,
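
For reference, the "change of basis" mentioned in the comments of grid_prepare_coef_dgemm and grid_compute_pab_dgemm is the standard binomial expansion of a product of two shifted monomials around the product-Gaussian center, per Cartesian direction. A sketch in conventional notation (the symbols are not taken from the code):

$$(x - R_a)^{l_a}\,(x - R_b)^{l_b} \;=\; \sum_{p=0}^{l_a+l_b} \alpha^{\,l_a l_b}_{p}\,(x - R_p)^{p},
\qquad
\alpha^{\,l_a l_b}_{p} \;=\; \sum_{k_a+k_b=p} \binom{l_a}{k_a}\binom{l_b}{k_b}\,(R_p-R_a)^{\,l_a-k_a}\,(R_p-R_b)^{\,l_b-k_b}.$$

grid_prepare_alpha_dgemm tabulates these alpha coefficients for each direction; grid_prepare_coef_dgemm contracts them with pab to obtain coef_xyz (collocate), and grid_compute_pab_dgemm performs the reverse contraction (integrate).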
3 changes: 3 additions & 0 deletions src/grid/cpu/collocation_integration.h
@@ -22,6 +22,7 @@ extern "C" {
#endif

#include "../cpu/tensor_local.h"
#include "private_header.h"
#ifdef __GRID_CUDA
typedef struct pgf_list_gpu_ {
/* number of devices */
@@ -86,6 +87,8 @@ typedef struct pgf_list_gpu_ {
int *border_mask_cpu_;
int *border_mask_gpu_;

_task *task_list_cpu_;
_task *task_list_gpu_;
int3 border_width;

struct pgf_list_gpu_ *next;
102 changes: 38 additions & 64 deletions src/grid/cpu/grid_collocate_dgemm.c
@@ -876,7 +876,6 @@ void grid_collocate_pgf_product_cpu_dgemm(

handler->grid.ld_ = grid_local_size[0];
handler->grid.data = grid_;
handler->blocked_grid.blocked_decomposition = false;

setup_global_grid_size(&handler->grid, (const int *const)grid_global_size);

@@ -936,12 +935,12 @@ void grid_collocate_pgf_product_cpu_dgemm(

initialize_tensor_4(&(handler->alpha), 3, lmax_prep[1] + 1, lmax_prep[0] + 1,
lmax_prep[0] + lmax_prep[1] + 1);
alloc_tensor(&(handler->alpha));
realloc_tensor(&(handler->alpha));

const int lp = lmax_prep[0] + lmax_prep[1];

initialize_tensor_3(&(handler->coef), lp + 1, lp + 1, lp + 1);
alloc_tensor(&(handler->coef));
realloc_tensor(&(handler->coef));

// initialy cp2k stores coef_xyz as coef[z][y][x]. this is fine but I
// need them to be stored as
@@ -965,38 +964,28 @@ void grid_collocate_pgf_product_cpu_dgemm(
free(pab_prep.data);
}

double compute_coefficients(grid_context *const ctx,
struct collocation_integration_ *handler,
const _task *task, tensor *const pab,
tensor *const work, tensor *const pab_prep,
int *const prev_block_num, int *const prev_iset,
int *const prev_jset, const grid_buffer *pab_blocks,
double *rp) {

void compute_coefficients(grid_context *const ctx,
struct collocation_integration_ *handler,
const _task *task,
const grid_buffer *pab_blocks,
tensor *const pab,
tensor *const work,
tensor *const pab_prep) {
const int iatom = task->iatom - 1;
const int jatom = task->jatom - 1;
const int iset = task->iset - 1;
const int jset = task->jset - 1;
const int ipgf = task->ipgf - 1;
const int jpgf = task->jpgf - 1;
const int ikind = ctx->atom_kinds[iatom] - 1;
const int jkind = ctx->atom_kinds[jatom] - 1;
const grid_basis_set *ibasis = ctx->basis_sets[ikind];
const grid_basis_set *jbasis = ctx->basis_sets[jkind];
const int ncoseta = ncoset(ibasis->lmax[iset]);
const int ncosetb = ncoset(jbasis->lmax[jset]);
/* const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian
* set */
/* const int ncob = jbasis->npgf[jset] * ncosetb; */

const int block_num = task->block_num - 1;

// Load subblock from buffer and decontract into Cartesian sublock pab.
// The previous pab can be reused when only ipgf or jpgf has changed.
if (block_num != *prev_block_num || iset != *prev_iset ||
jset != *prev_jset) {
*prev_block_num = block_num;
*prev_iset = iset;
*prev_jset = jset;

if (task->update_block_) {
// Locate current matrix block within the buffer. This block
// contains the weights of the gaussian pairs in the spherical
// harmonic basis, but we do computation in the cartesian
@@ -1010,36 +999,15 @@ double compute_coefficients(grid_context *const ctx,
block, work, pab);
} // end of block loading

const double zeta[2] = {ibasis->zet[iset * ibasis->maxpgf + ipgf],
jbasis->zet[jset * jbasis->maxpgf + jpgf]};

const double *ra = &ctx->atom_positions[3 * iatom];
int offset[2] = {ipgf * ncoseta, jpgf * ncosetb};

int lmax[2] = {ibasis->lmax[iset], jbasis->lmax[jset]};
int lmin[2] = {ibasis->lmin[iset], jbasis->lmin[jset]};

const double zetp = zeta[0] + zeta[1];
const double f = zeta[1] / zetp;
const double rab2 = task->rab[0] * task->rab[0] +
task->rab[1] * task->rab[1] + task->rab[2] * task->rab[2];
const double prefactor =
((iatom == jatom) ? 1.0 : 2.0) * exp(-zeta[0] * f * rab2);

double rb[3];
for (int i = 0; i < 3; i++) {
rp[i] = ra[i] + f * task->rab[i];
rb[i] = ra[i] + task->rab[i];
}

int lmin_prep[2];
int lmax_prep[2];

lmin_prep[0] = imax(lmin[0] + handler->lmin_diff[0], 0);
lmin_prep[1] = imax(lmin[1] + handler->lmin_diff[1], 0);
lmin_prep[0] = imax(task->lmin[0] + handler->lmin_diff[0], 0);
lmin_prep[1] = imax(task->lmin[1] + handler->lmin_diff[1], 0);

lmax_prep[0] = lmax[0] + handler->lmax_diff[0];
lmax_prep[1] = lmax[1] + handler->lmax_diff[1];
lmax_prep[0] = task->lmax[0] + handler->lmax_diff[0];
lmax_prep[1] = task->lmax[1] + handler->lmax_diff[1];

const int n1_prep = ncoset(lmax_prep[0]);
const int n2_prep = ncoset(lmax_prep[1]);
@@ -1050,7 +1018,12 @@ double compute_coefficients(grid_context *const ctx,
initialize_tensor_2(pab_prep, n2_prep, n1_prep);
realloc_tensor(pab_prep);

grid_prepare_pab_dgemm(handler->func, offset, lmin, lmax, &zeta[0], pab,
grid_prepare_pab_dgemm(handler->func,
task->offset,
task->lmin,
task->lmax,
&task->zeta[0],
pab,
pab_prep);

// *** initialise the coefficient matrix, we transform the sum
@@ -1081,18 +1054,23 @@ double compute_coefficients(grid_context *const ctx,
initialize_tensor_3(&handler->coef, lp + 1, lp + 1, lp + 1);
realloc_tensor(&handler->coef);

// trese two functions can be done with dgemm again....

// initialy cp2k stores coef_xyz as coef[z][y][x]. this is fine but I
// need them to be stored as
// these two functions can be done with dgemm again....

grid_prepare_alpha_dgemm(ra, rb, rp, lmax_prep, &handler->alpha);
grid_prepare_alpha_dgemm(task->ra,
task->rb,
task->rp,
lmax_prep,
&handler->alpha);

// compute the coefficients after applying the function of interest
// coef[x][z][y]
grid_prepare_coef_dgemm(lmin_prep, lmax_prep, lp, prefactor, &handler->alpha,
pab_prep, &handler->coef);
return zetp;
grid_prepare_coef_dgemm(lmin_prep,
lmax_prep,
lp,
task->prefactor,
&handler->alpha,
pab_prep,
&handler->coef);
}

void collocate_one_grid_level_dgemm(grid_context *const ctx,
@@ -1147,8 +1125,6 @@ void collocate_one_grid_level_dgemm(grid_context *const ctx,
memset(handler->grid.data, 0, sizeof(double) * grid->alloc_size_);
}

// Initialize variables to detect when a new subblock has to be fetched.
int prev_block_num = -1, prev_iset = -1, prev_jset = -1;

#pragma omp for schedule(static)
for (int itask = 0; itask < ctx->tasks_per_level[level]; itask++) {
@@ -1170,12 +1146,10 @@ void collocate_one_grid_level_dgemm(grid_context *const ctx,
task->border_mask);
}

double rp[3];
double zetp = compute_coefficients(ctx, handler, task, &pab, &work,
&pab_prep, &prev_block_num, &prev_iset,
&prev_jset, pab_blocks, rp);
compute_coefficients(ctx, handler, task, pab_blocks, &pab, &work,
&pab_prep);

grid_collocate(handler, ctx->orthorhombic, zetp, rp, task->radius);
grid_collocate(handler, ctx->orthorhombic, task->zetp, task->rp, task->radius);
}

// Merge thread local grids into shared grid. Could be improved though....
55 changes: 55 additions & 0 deletions src/grid/cpu/grid_context_cpu.c
@@ -151,6 +151,10 @@ void update_task_lists(const int nlevels, const int ntasks,
ctx->tasks[i] = ctx->tasks[i - 1] + ctx->tasks_per_level[i - 1];
}

int prev_block_num = -1;
int prev_iset = -1;
int prev_jset = -1;

_task *task = ctx->tasks[0];
for (int i = 0; i < ntasks; i++) {
task->level = level_list[i] - 1;
@@ -166,6 +170,57 @@ void update_task_lists(const int nlevels, const int ntasks,
task->rab[0] = rab_list[i][0];
task->rab[1] = rab_list[i][1];
task->rab[2] = rab_list[i][2];
const int iatom = task->iatom - 1;
const int jatom = task->jatom - 1;
const int iset = task->iset - 1;
const int jset = task->jset - 1;
const int ipgf = task->ipgf - 1;
const int jpgf = task->jpgf - 1;
const int ikind = ctx->atom_kinds[iatom] - 1;
const int jkind = ctx->atom_kinds[jatom] - 1;
const grid_basis_set *ibasis = ctx->basis_sets[ikind];
const grid_basis_set *jbasis = ctx->basis_sets[jkind];
const int ncoseta = ncoset(ibasis->lmax[iset]);
const int ncosetb = ncoset(jbasis->lmax[jset]);

task->zeta[0] = ibasis->zet[iset * ibasis->maxpgf + ipgf];
task->zeta[1] = jbasis->zet[jset * jbasis->maxpgf + jpgf];

const double *ra = &ctx->atom_positions[3 * iatom];
const double zetp = task->zeta[0] + task->zeta[1];
const double f = task->zeta[1] / zetp;
const double rab2 = task->rab[0] * task->rab[0] +
task->rab[1] * task->rab[1] + task->rab[2] * task->rab[2];

task->prefactor =
((iatom == jatom) ? 1.0 : 2.0) * exp(-task->zeta[0] * f * rab2);
task->zetp = zetp;

const int block_num = task->block_num - 1;

for (int i = 0; i < 3; i++) {
task->ra[i] = ra[i];
task->rp[i] = ra[i] + f * task->rab[i];
task->rb[i] = ra[i] + task->rab[i];
}

task->lmax[0] = ibasis->lmax[iset];
task->lmax[1] = jbasis->lmax[jset];
task->lmin[0] = ibasis->lmin[iset];
task->lmin[1] = jbasis->lmin[jset];

if ((block_num != prev_block_num) || (iset != prev_iset) ||
(jset != prev_jset)) {
task->update_block_ = true;
prev_block_num = block_num;
prev_iset = iset;
prev_jset = jset;
} else {
task->update_block_ = false;
}

task->offset[0] = ipgf * ncoseta;
task->offset[1] = jpgf * ncosetb;
task++;
}

