From 3b96011132517b505efc1c0391d2a2b05fdad1f0 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 21 Nov 2021 19:43:00 -0500 Subject: [PATCH] update guidelines for the number of threads (#1291) * update guidelines for the number of threads Setting `OMP_NUM_THREADS` to 3 may be faster than 6. It needs to be confirmed. * add warnings if not adjusting threads --- deepmd/env.py | 8 ++++++++ doc/train/parallel-training.md | 2 ++ doc/train/training-advanced.md | 14 ++++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/deepmd/env.py b/deepmd/env.py index 6e6543697e..214302ab6b 100644 --- a/deepmd/env.py +++ b/deepmd/env.py @@ -126,6 +126,14 @@ def set_tf_default_nthreads(): `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS` control TF configuration of multithreading. """ + if "OMP_NUM_THREADS" not in os.environ or \ + "TF_INTRA_OP_PARALLELISM_THREADS" not in os.environ or \ + "TF_INTER_OP_PARALLELISM_THREADS" not in os.environ: + logging.warning( + "To get the best performance, it is recommended to adjust " + "the number of threads by setting the environment variables " + "OMP_NUM_THREADS, TF_INTRA_OP_PARALLELISM_THREADS, and " + "TF_INTER_OP_PARALLELISM_THREADS.") set_env_if_empty("TF_INTRA_OP_PARALLELISM_THREADS", "0", verbose=False) set_env_if_empty("TF_INTER_OP_PARALLELISM_THREADS", "0", verbose=False) diff --git a/doc/train/parallel-training.md b/doc/train/parallel-training.md index b252446971..7fecd364c2 100644 --- a/doc/train/parallel-training.md +++ b/doc/train/parallel-training.md @@ -39,6 +39,8 @@ CUDA_VISIBLE_DEVICES=4,5,6,7 horovodrun -np 4 \ Need to mention, environment variable `CUDA_VISIBLE_DEVICES` must be set to control parallelism on the occupied host where one process is bound to one GPU card. +Note that `OMP_NUM_THREADS`, `TF_INTRA_OP_PARALLELISM_THREADS`, and `TF_INTER_OP_PARALLELISM_THREADS` should be carefully adjusted to achieve the best performance. 
+ When using MPI with Horovod, `horovodrun` is a simple wrapper around `mpirun`. In the case where fine-grained control over options passed to `mpirun`, [`mpirun` can be invoked directly](https://horovod.readthedocs.io/en/stable/mpi_include.html), and it will be detected automatically by Horovod, e.g., ```bash CUDA_VISIBLE_DEVICES=4,5,6,7 mpirun -l -launcher=fork -hosts=localhost -np 4 \ diff --git a/doc/train/training-advanced.md b/doc/train/training-advanced.md index ea9e1e8075..082569712f 100644 --- a/doc/train/training-advanced.md +++ b/doc/train/training-advanced.md @@ -111,16 +111,26 @@ optional arguments: **`--init-frz-model frozen_model.pb`**, initializes the training with an existing model that is stored in `frozen_model.pb`. -On some resources limited machines, one may want to control the number of threads used by DeePMD-kit. This is achieved by three environmental variables: `OMP_NUM_THREADS`, `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS`. `OMP_NUM_THREADS` controls the multithreading of DeePMD-kit implemented operations. `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS` controls `intra_op_parallelism_threads` and `inter_op_parallelism_threads`, which are Tensorflow configurations for multithreading. An explanation is found [here](https://stackoverflow.com/questions/41233635/meaning-of-inter-op-parallelism-threads-and-intra-op-parallelism-threads). +To get the best performance, one should control the number of threads used by DeePMD-kit. This is achieved by three environmental variables: `OMP_NUM_THREADS`, `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS`. `OMP_NUM_THREADS` controls the multithreading of DeePMD-kit implemented operations. `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS` controls `intra_op_parallelism_threads` and `inter_op_parallelism_threads`, which are Tensorflow configurations for multithreading. 
An explanation is found [here](https://www.intel.com/content/www/us/en/developer/articles/technical/maximize-tensorflow-performance-on-cpu-considerations-and-recommendations-for-inference.html). For example, if you wish to use 3 cores of 2 CPUs on one node, you may set the environmental variables and run DeePMD-kit as follows: ```bash -export OMP_NUM_THREADS=6 +export OMP_NUM_THREADS=3 export TF_INTRA_OP_PARALLELISM_THREADS=3 export TF_INTER_OP_PARALLELISM_THREADS=2 dp train input.json ``` +For a node with 128 cores, it is recommended to start with the following variables: + +```bash +export OMP_NUM_THREADS=16 +export TF_INTRA_OP_PARALLELISM_THREADS=16 +export TF_INTER_OP_PARALLELISM_THREADS=8 +``` + +You are encouraged to adjust these configurations based on empirical testing. + One can set other environmental variables: | Environment variables | Allowed value | Default value | Usage |