Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enable OpenMP for prod_force and prod_virial #1360

Merged
merged 3 commits into from
Dec 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions source/lib/src/prod_force.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,17 @@ prod_force_a_cpu(

memset(force, 0.0, sizeof(FPTYPE) * nall * 3);
// compute force of a frame
#pragma omp parallel
Copy link
Collaborator

@wanghan-iapcm wanghan-iapcm Dec 16, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two threads with different i_idx may write on the same force force[j_idx * 3 + xxx], which gives unpredictable result.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I realized that, so that pragma omp for is inside this loop and before another loop. The current version can pass the tests.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK I understand you compute the neighbors of the same atom using multi-threading.

What I do not understand is why you need omp parallel to generate threads here, but not at L49 where you really need multi-threading

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

image

for (int i_idx = 0; i_idx < nloc; ++i_idx) {
// deriv wrt center atom
#pragma omp single
for (int aa = 0; aa < ndescrpt; ++aa) {
force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0];
force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1];
force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2];
}
// deriv wrt neighbors
#pragma omp for
for (int jj = 0; jj < nnei; ++jj) {
int j_idx = nlist[i_idx * nnei + jj];
if (j_idx < 0) continue;
Expand Down Expand Up @@ -105,15 +108,18 @@ prod_force_r_cpu(
}

// compute force of a frame
#pragma omp parallel
for (int ii = 0; ii < nloc; ++ii){
int i_idx = ii;
// deriv wrt center atom
#pragma omp single
for (int aa = 0; aa < ndescrpt; ++aa){
force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0];
force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1];
force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2];
}
// deriv wrt neighbors
#pragma omp for
for (int jj = 0; jj < nnei; ++jj){
int j_idx = nlist[i_idx * nnei + jj];
// if (j_idx > nloc) j_idx = j_idx % nloc;
Expand Down
2 changes: 2 additions & 0 deletions source/lib/src/prod_force_grad.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ prod_force_grad_a_cpu(
}

// compute grad of one frame
#pragma omp parallel for
for (int ii = 0; ii < nloc; ++ii){
int i_idx = ii;

Expand Down Expand Up @@ -120,6 +121,7 @@ prod_force_grad_r_cpu(
}

// compute grad of one frame
#pragma omp parallel for
for (int ii = 0; ii < nloc; ++ii){
int i_idx = ii;

Expand Down
6 changes: 6 additions & 0 deletions source/lib/src/prod_virial.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ prod_virial_a_cpu(
}

// compute virial of a frame
#pragma omp parallel for
for (int ii = 0; ii < nloc; ++ii){
int i_idx = ii;

Expand All @@ -58,7 +59,9 @@ prod_virial_a_cpu(
for (int dd0 = 0; dd0 < 3; ++dd0){
for (int dd1 = 0; dd1 < 3; ++dd1){
FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * env_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd0];
#pragma omp atomic
virial[dd0 * 3 + dd1] -= tmp_v;
#pragma omp atomic
atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v;
}
}
Expand Down Expand Up @@ -120,6 +123,7 @@ prod_virial_r_cpu(
}

// compute virial of a frame
#pragma omp parallel for
for (int ii = 0; ii < nloc; ++ii){
int i_idx = ii;

Expand All @@ -131,7 +135,9 @@ prod_virial_r_cpu(
for (int dd0 = 0; dd0 < 3; ++dd0){
for (int dd1 = 0; dd1 < 3; ++dd1){
FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * env_deriv[i_idx * ndescrpt * 3 + jj * 3 + dd0];
#pragma omp atomic
virial[dd0 * 3 + dd1] -= tmp_v;
#pragma omp atomic
atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v;
}
}
Expand Down
2 changes: 2 additions & 0 deletions source/lib/src/prod_virial_grad.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ prod_virial_grad_a_cpu(
}

// compute grad of one frame
#pragma omp parallel for
for (int ii = 0; ii < nloc; ++ii){
int i_idx = ii;

Expand Down Expand Up @@ -117,6 +118,7 @@ prod_virial_grad_r_cpu(
}

// compute grad of one frame
#pragma omp parallel for
for (int ii = 0; ii < nloc; ++ii){
int i_idx = ii;

Expand Down