From a646389e12953ed3ec66b0276cd94115cd9d8533 Mon Sep 17 00:00:00 2001
From: Hanyu Cui
Date: Tue, 16 Jul 2019 18:08:24 -0700
Subject: [PATCH] Adds runtime version info for single-node distributed training

---
 python/sparkdl/horovod/runner_base.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/sparkdl/horovod/runner_base.py b/python/sparkdl/horovod/runner_base.py
index f668b1cb..83e9ad6f 100644
--- a/python/sparkdl/horovod/runner_base.py
+++ b/python/sparkdl/horovod/runner_base.py
@@ -47,11 +47,12 @@ def __init__(self, np):
             which maps to a GPU on a GPU cluster or a CPU core on a CPU cluster.
             Accepted values are:
 
-            - If <0, this will spawn -np subprocesses on the driver node to run Horovod locally.
+            - If <0, this will spawn `-np` subprocesses on the driver node to run Horovod locally.
               Training stdout and stderr messages go to the notebook cell output, and are also
               available in driver logs in case the cell output is truncated. This is useful for
               debugging and we recommend testing your code under this mode first. However, be
               careful of heavy use of the Spark driver on a shared Databricks cluster.
+              Note that `np < -1` is only supported on Databricks Runtime 5.5 ML and above.
             - If >0, this will launch a Spark job with `np` tasks starting all together and run the
               Horovod job on the task nodes.
               It will wait until `np` task slots are available to launch the job.