From 3b96011132517b505efc1c0391d2a2b05fdad1f0 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 21 Nov 2021 19:43:00 -0500 Subject: [PATCH] update guidelines for the number of threads (#1291) * update guidelines for the number of threads Setting `OMP_NUM_THREADS` to 3 may be faster than 6. It needs to be confirmed. * add warnings if not adjusting threads --- deepmd/env.py | 8 ++++++++ doc/train/parallel-training.md | 2 ++ doc/train/training-advanced.md | 14 ++++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/deepmd/env.py b/deepmd/env.py index 6e6543697e..214302ab6b 100644 --- a/deepmd/env.py +++ b/deepmd/env.py @@ -126,6 +126,14 @@ def set_tf_default_nthreads(): `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS` control TF configuration of multithreading. """ + if "OMP_NUM_THREADS" not in os.environ or \ + "TF_INTRA_OP_PARALLELISM_THREADS" not in os.environ or \ + "TF_INTER_OP_PARALLELISM_THREADS" not in os.environ: + logging.warning( + "To get the best performance, it is recommended to adjust " + "the number of threads by setting the environment variables " + "OMP_NUM_THREADS, TF_INTRA_OP_PARALLELISM_THREADS, and " + "TF_INTER_OP_PARALLELISM_THREADS.") set_env_if_empty("TF_INTRA_OP_PARALLELISM_THREADS", "0", verbose=False) set_env_if_empty("TF_INTER_OP_PARALLELISM_THREADS", "0", verbose=False) diff --git a/doc/train/parallel-training.md b/doc/train/parallel-training.md index b252446971..7fecd364c2 100644 --- a/doc/train/parallel-training.md +++ b/doc/train/parallel-training.md @@ -39,6 +39,8 @@ CUDA_VISIBLE_DEVICES=4,5,6,7 horovodrun -np 4 \ Need to mention, environment variable `CUDA_VISIBLE_DEVICES` must be set to control parallelism on the occupied host where one process is bound to one GPU card. +Note that `OMP_NUM_THREADS`, `TF_INTRA_OP_PARALLELISM_THREADS`, and `TF_INTER_OP_PARALLELISM_THREADS` should be carefully adjusted to achieve the best performance. 
+ When using MPI with Horovod, `horovodrun` is a simple wrapper around `mpirun`. In the case where fine-grained control over options passed to `mpirun`, [`mpirun` can be invoked directly](https://horovod.readthedocs.io/en/stable/mpi_include.html), and it will be detected automatically by Horovod, e.g., ```bash CUDA_VISIBLE_DEVICES=4,5,6,7 mpirun -l -launcher=fork -hosts=localhost -np 4 \ diff --git a/doc/train/training-advanced.md b/doc/train/training-advanced.md index ea9e1e8075..082569712f 100644 --- a/doc/train/training-advanced.md +++ b/doc/train/training-advanced.md @@ -111,16 +111,26 @@ optional arguments: **`--init-frz-model frozen_model.pb`**, initializes the training with an existing model that is stored in `frozen_model.pb`. -On some resources limited machines, one may want to control the number of threads used by DeePMD-kit. This is achieved by three environmental variables: `OMP_NUM_THREADS`, `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS`. `OMP_NUM_THREADS` controls the multithreading of DeePMD-kit implemented operations. `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS` controls `intra_op_parallelism_threads` and `inter_op_parallelism_threads`, which are Tensorflow configurations for multithreading. An explanation is found [here](https://stackoverflow.com/questions/41233635/meaning-of-inter-op-parallelism-threads-and-intra-op-parallelism-threads). +To get the best performance, one should control the number of threads used by DeePMD-kit. This is achieved by three environmental variables: `OMP_NUM_THREADS`, `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS`. `OMP_NUM_THREADS` controls the multithreading of DeePMD-kit implemented operations. `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS` controls `intra_op_parallelism_threads` and `inter_op_parallelism_threads`, which are Tensorflow configurations for multithreading. 
An explanation is found [here](https://www.intel.com/content/www/us/en/developer/articles/technical/maximize-tensorflow-performance-on-cpu-considerations-and-recommendations-for-inference.html). For example, if you wish to use 3 cores of 2 CPUs on one node, you may set the environmental variables and run DeePMD-kit as follows: ```bash -export OMP_NUM_THREADS=6 +export OMP_NUM_THREADS=3 export TF_INTRA_OP_PARALLELISM_THREADS=3 export TF_INTER_OP_PARALLELISM_THREADS=2 dp train input.json ``` +For a node with 128 cores, it is recommended to start with the following variables: + +```bash +export OMP_NUM_THREADS=16 +export TF_INTRA_OP_PARALLELISM_THREADS=16 +export TF_INTER_OP_PARALLELISM_THREADS=8 +``` + +You are encouraged to adjust these configurations based on empirical testing. + One can set other environmental variables: | Environment variables | Allowed value | Default value | Usage |