From 987f79fad8d4a28aeeb7aa091de942ac1ce1cf5b Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 15 Dec 2021 21:15:46 -0500 Subject: [PATCH 1/3] enable OpenMP for `prod_force` and `prod_virial` About 1 ms can be saved in each training step. --- source/lib/src/prod_force.cc | 2 ++ source/lib/src/prod_force_grad.cc | 2 ++ source/lib/src/prod_virial.cc | 2 ++ source/lib/src/prod_virial_grad.cc | 2 ++ 4 files changed, 8 insertions(+) diff --git a/source/lib/src/prod_force.cc b/source/lib/src/prod_force.cc index e9784d3409..b457ddb368 100644 --- a/source/lib/src/prod_force.cc +++ b/source/lib/src/prod_force.cc @@ -36,6 +36,7 @@ prod_force_a_cpu( memset(force, 0.0, sizeof(FPTYPE) * nall * 3); // compute force of a frame + #pragma omp parallel for for (int i_idx = 0; i_idx < nloc; ++i_idx) { // deriv wrt center atom for (int aa = 0; aa < ndescrpt; ++aa) { @@ -105,6 +106,7 @@ prod_force_r_cpu( } // compute force of a frame + #pragma omp parallel for for (int ii = 0; ii < nloc; ++ii){ int i_idx = ii; // deriv wrt center atom diff --git a/source/lib/src/prod_force_grad.cc b/source/lib/src/prod_force_grad.cc index 110bf790f4..78bad3c9ca 100644 --- a/source/lib/src/prod_force_grad.cc +++ b/source/lib/src/prod_force_grad.cc @@ -42,6 +42,7 @@ prod_force_grad_a_cpu( } // compute grad of one frame + #pragma omp parallel for for (int ii = 0; ii < nloc; ++ii){ int i_idx = ii; @@ -120,6 +121,7 @@ prod_force_grad_r_cpu( } // compute grad of one frame + #pragma omp parallel for for (int ii = 0; ii < nloc; ++ii){ int i_idx = ii; diff --git a/source/lib/src/prod_virial.cc b/source/lib/src/prod_virial.cc index f1c598c807..57a8369181 100644 --- a/source/lib/src/prod_virial.cc +++ b/source/lib/src/prod_virial.cc @@ -44,6 +44,7 @@ prod_virial_a_cpu( } // compute virial of a frame + #pragma omp parallel for for (int ii = 0; ii < nloc; ++ii){ int i_idx = ii; @@ -120,6 +121,7 @@ prod_virial_r_cpu( } // compute virial of a frame + #pragma omp parallel for for (int ii = 0; ii < nloc; ++ii){ int i_idx = ii; diff --git a/source/lib/src/prod_virial_grad.cc b/source/lib/src/prod_virial_grad.cc index 8e225c0793..0f8495c90e 100644 --- a/source/lib/src/prod_virial_grad.cc +++ b/source/lib/src/prod_virial_grad.cc @@ -41,6 +41,7 @@ prod_virial_grad_a_cpu( } // compute grad of one frame + #pragma omp parallel for for (int ii = 0; ii < nloc; ++ii){ int i_idx = ii; @@ -117,6 +118,7 @@ prod_virial_grad_r_cpu( } // compute grad of one frame + #pragma omp parallel for for (int ii = 0; ii < nloc; ++ii){ int i_idx = ii; From f9d9fc88ab7f6b8941285b9e0b106634519388d2 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 15 Dec 2021 22:00:56 -0500 Subject: [PATCH 2/3] bugfix --- source/lib/src/prod_force.cc | 4 ++-- source/lib/src/prod_virial.cc | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/source/lib/src/prod_force.cc b/source/lib/src/prod_force.cc index b457ddb368..1d09d88972 100644 --- a/source/lib/src/prod_force.cc +++ b/source/lib/src/prod_force.cc @@ -36,7 +36,6 @@ prod_force_a_cpu( memset(force, 0.0, sizeof(FPTYPE) * nall * 3); // compute force of a frame - #pragma omp parallel for for (int i_idx = 0; i_idx < nloc; ++i_idx) { // deriv wrt center atom for (int aa = 0; aa < ndescrpt; ++aa) { @@ -45,6 +44,7 @@ prod_force_a_cpu( force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; } // deriv wrt neighbors + #pragma omp parallel for for (int jj = 0; jj < nnei; ++jj) { int j_idx = nlist[i_idx * nnei + jj]; if (j_idx < 0) continue; @@ -106,7 +106,6 @@ prod_force_r_cpu( } // compute force of a frame - #pragma omp parallel for for (int ii = 0; ii < nloc; ++ii){ int i_idx = ii; // deriv wrt center atom @@ -116,6 +115,7 @@ prod_force_r_cpu( force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; } // deriv wrt neighbors + #pragma omp parallel for for (int jj = 0; jj < nnei; ++jj){ int j_idx = nlist[i_idx * nnei + jj]; // if (j_idx > nloc) j_idx = j_idx % nloc; diff --git a/source/lib/src/prod_virial.cc b/source/lib/src/prod_virial.cc index 57a8369181..d715cf9e5b 100644 --- a/source/lib/src/prod_virial.cc +++ b/source/lib/src/prod_virial.cc @@ -59,7 +59,9 @@ prod_virial_a_cpu( for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd0]; + #pragma omp atomic virial[dd0 * 3 + dd1] -= tmp_v; + #pragma omp atomic atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v; } } @@ -133,7 +135,9 @@ prod_virial_r_cpu( for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * env_deriv[i_idx * ndescrpt * 3 + jj * 3 + dd0]; + #pragma omp atomic virial[dd0 * 3 + dd1] -= tmp_v; + #pragma omp atomic atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v; } } From 3b19ddfee49a660765343dee0dd440d351971787 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 15 Dec 2021 22:17:35 -0500 Subject: [PATCH 3/3] spawn the threads only once --- source/lib/src/prod_force.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/source/lib/src/prod_force.cc b/source/lib/src/prod_force.cc index 1d09d88972..b1286e7b14 100644 --- a/source/lib/src/prod_force.cc +++ b/source/lib/src/prod_force.cc @@ -36,15 +36,17 @@ prod_force_a_cpu( memset(force, 0.0, sizeof(FPTYPE) * nall * 3); // compute force of a frame + #pragma omp parallel for (int i_idx = 0; i_idx < nloc; ++i_idx) { // deriv wrt center atom + #pragma omp single for (int aa = 0; aa < ndescrpt; ++aa) { force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; } // deriv wrt neighbors - #pragma omp parallel for + #pragma omp for for (int jj = 0; jj < nnei; ++jj) { int j_idx = nlist[i_idx * nnei + jj]; if (j_idx < 0) continue; @@ -106,16 +108,18 @@ prod_force_r_cpu( } // compute force of a frame + #pragma omp parallel for (int ii = 0; ii < nloc; ++ii){ int i_idx = ii; // deriv wrt center atom + #pragma omp single for (int aa = 0; aa < ndescrpt; ++aa){ force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; } // deriv wrt neighbors - #pragma omp parallel for + #pragma omp for for (int jj = 0; jj < nnei; ++jj){ int j_idx = nlist[i_idx * nnei + jj]; // if (j_idx > nloc) j_idx = j_idx % nloc;