From 987f79fad8d4a28aeeb7aa091de942ac1ce1cf5b Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 15 Dec 2021 21:15:46 -0500
Subject: [PATCH 1/3] enable OpenMP for `prod_force` and `prod_virial`

About 1 ms can be saved in each training step.
---
 source/lib/src/prod_force.cc       | 2 ++
 source/lib/src/prod_force_grad.cc  | 2 ++
 source/lib/src/prod_virial.cc      | 2 ++
 source/lib/src/prod_virial_grad.cc | 2 ++
 4 files changed, 8 insertions(+)

diff --git a/source/lib/src/prod_force.cc b/source/lib/src/prod_force.cc
index e9784d3409..b457ddb368 100644
--- a/source/lib/src/prod_force.cc
+++ b/source/lib/src/prod_force.cc
@@ -36,6 +36,7 @@ prod_force_a_cpu(
 
   memset(force, 0.0, sizeof(FPTYPE) * nall * 3);
   // compute force of a frame
+  #pragma omp parallel for
   for (int i_idx = 0; i_idx < nloc; ++i_idx) {
     // deriv wrt center atom
     for (int aa = 0; aa < ndescrpt; ++aa) {
@@ -105,6 +106,7 @@ prod_force_r_cpu(
   }
 
   // compute force of a frame
+  #pragma omp parallel for
   for (int ii = 0; ii < nloc; ++ii){
     int i_idx = ii;	
     // deriv wrt center atom
diff --git a/source/lib/src/prod_force_grad.cc b/source/lib/src/prod_force_grad.cc
index 110bf790f4..78bad3c9ca 100644
--- a/source/lib/src/prod_force_grad.cc
+++ b/source/lib/src/prod_force_grad.cc
@@ -42,6 +42,7 @@ prod_force_grad_a_cpu(
   }      
 
   // compute grad of one frame
+  #pragma omp parallel for
   for (int ii = 0; ii < nloc; ++ii){
     int i_idx = ii;
 	
@@ -120,6 +121,7 @@ prod_force_grad_r_cpu(
   }      
 
   // compute grad of one frame
+  #pragma omp parallel for
   for (int ii = 0; ii < nloc; ++ii){
     int i_idx = ii;
 	
diff --git a/source/lib/src/prod_virial.cc b/source/lib/src/prod_virial.cc
index f1c598c807..57a8369181 100644
--- a/source/lib/src/prod_virial.cc
+++ b/source/lib/src/prod_virial.cc
@@ -44,6 +44,7 @@ prod_virial_a_cpu(
   }
 
   // compute virial of a frame
+  #pragma omp parallel for
   for (int ii = 0; ii < nloc; ++ii){
     int i_idx = ii;
 
@@ -120,6 +121,7 @@ prod_virial_r_cpu(
   }
 
   // compute virial of a frame
+  #pragma omp parallel for
   for (int ii = 0; ii < nloc; ++ii){
     int i_idx = ii;
 
diff --git a/source/lib/src/prod_virial_grad.cc b/source/lib/src/prod_virial_grad.cc
index 8e225c0793..0f8495c90e 100644
--- a/source/lib/src/prod_virial_grad.cc
+++ b/source/lib/src/prod_virial_grad.cc
@@ -41,6 +41,7 @@ prod_virial_grad_a_cpu(
   }      
 
   // compute grad of one frame
+  #pragma omp parallel for
   for (int ii = 0; ii < nloc; ++ii){
     int i_idx = ii;
 	
@@ -117,6 +118,7 @@ prod_virial_grad_r_cpu(
   }      
 
   // compute grad of one frame
+  #pragma omp parallel for
   for (int ii = 0; ii < nloc; ++ii){
     int i_idx = ii;
 	

From f9d9fc88ab7f6b8941285b9e0b106634519388d2 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 15 Dec 2021 22:00:56 -0500
Subject: [PATCH 2/3] fix OpenMP data races in `prod_force` and `prod_virial`

---
 source/lib/src/prod_force.cc  | 4 ++--
 source/lib/src/prod_virial.cc | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/source/lib/src/prod_force.cc b/source/lib/src/prod_force.cc
index b457ddb368..1d09d88972 100644
--- a/source/lib/src/prod_force.cc
+++ b/source/lib/src/prod_force.cc
@@ -36,7 +36,6 @@ prod_force_a_cpu(
 
   memset(force, 0.0, sizeof(FPTYPE) * nall * 3);
   // compute force of a frame
-  #pragma omp parallel for
   for (int i_idx = 0; i_idx < nloc; ++i_idx) {
     // deriv wrt center atom
     for (int aa = 0; aa < ndescrpt; ++aa) {
@@ -45,6 +44,7 @@ prod_force_a_cpu(
       force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2];
     }
     // deriv wrt neighbors
+    #pragma omp parallel for
     for (int jj = 0; jj < nnei; ++jj) {
       int j_idx = nlist[i_idx * nnei + jj];
       if (j_idx < 0) continue;
@@ -106,7 +106,6 @@ prod_force_r_cpu(
   }
 
   // compute force of a frame
-  #pragma omp parallel for
   for (int ii = 0; ii < nloc; ++ii){
     int i_idx = ii;	
     // deriv wrt center atom
@@ -116,6 +115,7 @@ prod_force_r_cpu(
       force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2];
     }
     // deriv wrt neighbors
+    #pragma omp parallel for
     for (int jj = 0; jj < nnei; ++jj){
       int j_idx = nlist[i_idx * nnei + jj];
       // if (j_idx > nloc) j_idx = j_idx % nloc;
diff --git a/source/lib/src/prod_virial.cc b/source/lib/src/prod_virial.cc
index 57a8369181..d715cf9e5b 100644
--- a/source/lib/src/prod_virial.cc
+++ b/source/lib/src/prod_virial.cc
@@ -59,7 +59,9 @@ prod_virial_a_cpu(
 	for (int dd0 = 0; dd0 < 3; ++dd0){
 	  for (int dd1 = 0; dd1 < 3; ++dd1){
 	    FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] *  env_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd0];
+      #pragma omp atomic
 	    virial[dd0 * 3 + dd1] -= tmp_v;
+      #pragma omp atomic
 	    atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v;
 	  }
 	}
@@ -133,7 +135,9 @@ prod_virial_r_cpu(
       for (int dd0 = 0; dd0 < 3; ++dd0){
 	for (int dd1 = 0; dd1 < 3; ++dd1){
 	  FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] *  env_deriv[i_idx * ndescrpt * 3 + jj * 3 + dd0];
+    #pragma omp atomic
 	  virial[dd0 * 3 + dd1] -= tmp_v;
+    #pragma omp atomic
 	  atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v;
 	}
       }

From 3b19ddfee49a660765343dee0dd440d351971787 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 15 Dec 2021 22:17:35 -0500
Subject: [PATCH 3/3] spawn the OpenMP threads only once per `prod_force` call

---
 source/lib/src/prod_force.cc | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/source/lib/src/prod_force.cc b/source/lib/src/prod_force.cc
index 1d09d88972..b1286e7b14 100644
--- a/source/lib/src/prod_force.cc
+++ b/source/lib/src/prod_force.cc
@@ -36,15 +36,17 @@ prod_force_a_cpu(
 
   memset(force, 0.0, sizeof(FPTYPE) * nall * 3);
   // compute force of a frame
+  #pragma omp parallel
   for (int i_idx = 0; i_idx < nloc; ++i_idx) {
     // deriv wrt center atom
+    #pragma omp single
     for (int aa = 0; aa < ndescrpt; ++aa) {
       force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0];
       force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1];
       force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2];
     }
     // deriv wrt neighbors
-    #pragma omp parallel for
+    #pragma omp for
     for (int jj = 0; jj < nnei; ++jj) {
       int j_idx = nlist[i_idx * nnei + jj];
       if (j_idx < 0) continue;
@@ -106,16 +108,18 @@ prod_force_r_cpu(
   }
 
   // compute force of a frame
+  #pragma omp parallel
   for (int ii = 0; ii < nloc; ++ii){
     int i_idx = ii;	
     // deriv wrt center atom
+    #pragma omp single
     for (int aa = 0; aa < ndescrpt; ++aa){
       force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0];
       force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1];
       force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2];
     }
     // deriv wrt neighbors
-    #pragma omp parallel for
+    #pragma omp for
     for (int jj = 0; jj < nnei; ++jj){
       int j_idx = nlist[i_idx * nnei + jj];
       // if (j_idx > nloc) j_idx = j_idx % nloc;