From 174b23cba16bfc3bb20be4a4533ffe1d8c0855b4 Mon Sep 17 00:00:00 2001 From: Scott Sievert Date: Thu, 9 May 2019 15:52:47 -0500 Subject: [PATCH 01/10] DOC: adds CONTRIBUTING.md --- CONTRIBUTING.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000000..0ea8325501b --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,29 @@ +For more information, see https://docs.dask.org/en/latest/develop.html#contributing-to-code + + +## Style +Distributed conforms with the [flake8] and [black] styles. To make sure your +code conforms with these styles, run + +``` shell +$ pip install black flake8 +$ cd path/to/distributed +$ black . +$ flake8 . +``` + +[flake8]:http://flake8.pycqa.org/en/latest/ +[black]:https://github.com/python/black + +## Docstrings + +Dask Distributed roughly follows the [numpydoc] standard. More information is +available at https://docs.dask.org/en/latest/develop.html#docstrings. + +[numpydoc]:https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt + +## Tests + +Dask employs extensive unit tests to ensure correctness of code both for today +and for the future. Test coverage is expected for all code contributions. 
More +detail is at https://docs.dask.org/en/latest/develop.html#test From 8ae3a191ce6a8b62e962473bbeab2e7e7e7c4079 Mon Sep 17 00:00:00 2001 From: Scott Date: Mon, 8 Jun 2020 15:17:01 -0500 Subject: [PATCH 02/10] Add usage example to SSHCluster docstring --- distributed/deploy/ssh.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/distributed/deploy/ssh.py b/distributed/deploy/ssh.py index 1f35d7e9672..bbac0c79c67 100644 --- a/distributed/deploy/ssh.py +++ b/distributed/deploy/ssh.py @@ -278,6 +278,9 @@ def SSHCluster( Examples -------- + First, a basic example that works as long as your machine accepts SSH + connections (aka if port 22 is open): + >>> from dask.distributed import Client, SSHCluster >>> cluster = SSHCluster( ... ["localhost", "localhost", "localhost", "localhost"], @@ -287,6 +290,31 @@ def SSHCluster( ... ) >>> client = Client(cluster) + Now, an example with a remote cluser you have SSH access to as + user ``foo``: + + >>> from dask.distributed import Client, SSHCluster + >>> import os + >>> + >>> auth = {"username": "foo", "password": os.environ.get("PASSWORD")} + >>> cluster = SSHCluster( + ... ["machine1", "machine2", "machine3"] + ... connect_options=auth, + ... worker_options={"nthreads": 2}, + ... scheduler_options={"port": 0, "dashboard_address": ":8797"} + ... ) + >>> client = Client(cluster) + + This example assumes the password is set as an environment variable in the + current shell, with ``export PASSWORD=bar``, + which could be specified in shell initialization (e.g, ``.profile``). + + .. warning:: + + Best practice is NOT to specify any password in the Python script. + If it's specified in the Python script there's a (very) strong chance + it'll leak unintentionally. + An example using a different worker module, in particular the ``dask-cuda-worker`` command from the ``dask-cuda`` project. 
From 0d6c06ca14d03653755cf471767502a6c388e1e2 Mon Sep 17 00:00:00 2001 From: Scott Date: Tue, 9 Jun 2020 11:10:13 -0500 Subject: [PATCH 03/10] Show key generation for SSH --- distributed/deploy/ssh.py | 47 ++++++++++++++------------------------- 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/distributed/deploy/ssh.py b/distributed/deploy/ssh.py index bbac0c79c67..19e93a75aa0 100644 --- a/distributed/deploy/ssh.py +++ b/distributed/deploy/ssh.py @@ -278,43 +278,28 @@ def SSHCluster( Examples -------- - First, a basic example that works as long as your machine accepts SSH - connections (aka if port 22 is open): + The most relevant example is with a remote cluster you have SSH + access to as user ``foo``. Best practice is to generate a key-pair + following the `SSH keygen tutorial`_: - >>> from dask.distributed import Client, SSHCluster - >>> cluster = SSHCluster( - ... ["localhost", "localhost", "localhost", "localhost"], - ... connect_options={"known_hosts": None}, - ... worker_options={"nthreads": 2}, - ... scheduler_options={"port": 0, "dashboard_address": ":8797"} - ... ) - >>> client = Client(cluster) + .. code:: bash + + $ # Generate a key pair + $ ssh-keygen -t rsa -b 4096 -f ~/.ssh/dask-ssh -P "" + $ # Copy to remote machine + $ ssh-copy-id -i ~/.ssh/dask-ssh user@machine - Now, an example with a remote cluser you have SSH access to as - user ``foo``: + Now it's possible to login to ``machine`` without entering a + password via ``ssh -i ~/.ssh-dask-ssh user@machine``. Let's + create an ``SSHCluster``: >>> from dask.distributed import Client, SSHCluster - >>> import os - >>> - >>> auth = {"username": "foo", "password": os.environ.get("PASSWORD")} >>> cluster = SSHCluster( - ... ["machine1", "machine2", "machine3"] - ... connect_options=auth, - ... worker_options={"nthreads": 2}, - ... scheduler_options={"port": 0, "dashboard_address": ":8797"} - ... ) + ... ["machine1", "machine1"], + ... 
scheduler_options={"port": 0, "dashboard_address": ":8797"}, + ... connect_options={...}) >>> client = Client(cluster) - This example assumes the password is set as an environment variable in the - current shell, with ``export PASSWORD=bar``, - which could be specified in shell initialization (e.g, ``.profile``). - - .. warning:: - - Best practice is NOT to specify any password in the Python script. - If it's specified in the Python script there's a (very) strong chance - it'll leak unintentionally. - An example using a different worker module, in particular the ``dask-cuda-worker`` command from the ``dask-cuda`` project. @@ -331,6 +316,8 @@ def SSHCluster( dask.distributed.Scheduler dask.distributed.Worker asyncssh.connect + + .. _SSH keygen tutorial: https://www.ssh.com/ssh/keygen/ """ if set(kwargs) & old_cluster_kwargs: from .old_ssh import SSHCluster as OldSSHCluster From 5cb5016cc2375bf4ab4088ec85ce518fc2a71af2 Mon Sep 17 00:00:00 2001 From: Scott Sievert Date: Tue, 9 Jun 2020 16:38:57 +0000 Subject: [PATCH 04/10] Update distributed/deploy/ssh.py Co-authored-by: Jacob Tomlinson --- distributed/deploy/ssh.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distributed/deploy/ssh.py b/distributed/deploy/ssh.py index 19e93a75aa0..db06d1fb510 100644 --- a/distributed/deploy/ssh.py +++ b/distributed/deploy/ssh.py @@ -297,7 +297,7 @@ def SSHCluster( >>> cluster = SSHCluster( ... ["machine1", "machine1"], ... scheduler_options={"port": 0, "dashboard_address": ":8797"}, - ... connect_options={...}) + ... 
connect_options={"username": "user", "client_keys": "~/.ssh/dask-ssh"}) >>> client = Client(cluster) An example using a different worker module, in particular the From 01142a5dd2df5eb004adde3aac388c878753bf0e Mon Sep 17 00:00:00 2001 From: Scott Date: Wed, 10 Jun 2020 09:48:13 -0500 Subject: [PATCH 05/10] Add firewall complications --- distributed/deploy/ssh.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/distributed/deploy/ssh.py b/distributed/deploy/ssh.py index db06d1fb510..b13136dca10 100644 --- a/distributed/deploy/ssh.py +++ b/distributed/deploy/ssh.py @@ -287,19 +287,28 @@ def SSHCluster( $ # Generate a key pair $ ssh-keygen -t rsa -b 4096 -f ~/.ssh/dask-ssh -P "" $ # Copy to remote machine - $ ssh-copy-id -i ~/.ssh/dask-ssh user@machine + $ ssh-copy-id -i ~/.ssh/dask-ssh foo@machine Now it's possible to login to ``machine`` without entering a - password via ``ssh -i ~/.ssh-dask-ssh user@machine``. Let's + password via ``ssh -i ~/.ssh/dask-ssh foo@machine``. Let's create an ``SSHCluster``: >>> from dask.distributed import Client, SSHCluster >>> cluster = SSHCluster( ... ["machine1", "machine1"], ... scheduler_options={"port": 0, "dashboard_address": ":8797"}, - ... connect_options={"username": "user", "client_keys": "~/.ssh/dask-ssh"}) + ... connect_options={"username": "foo", "client_keys": "~/.ssh/dask-ssh"}) >>> client = Client(cluster) + This depends on a successful connection between the your machine + and the Dask scheduler. Firewalls can complicate this, which results in a + timeout because a connection can't be made. This snippet will resolve any + networking if only port 22 is open for SSH access: + + .. code:: python + + SSHCluster(..., scheduler_options={"port": 22}) + An example using a different worker module, in particular the ``dask-cuda-worker`` command from the ``dask-cuda`` project. 
From 909ea423cb8611f6ef08cc11d42d636c7e846092 Mon Sep 17 00:00:00 2001 From: Scott Date: Wed, 10 Jun 2020 09:48:29 -0500 Subject: [PATCH 06/10] Add note on remote_python about conda envs --- distributed/deploy/ssh.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/distributed/deploy/ssh.py b/distributed/deploy/ssh.py index b13136dca10..9df6042fd40 100644 --- a/distributed/deploy/ssh.py +++ b/distributed/deploy/ssh.py @@ -274,7 +274,8 @@ def SSHCluster( worker_module: str, optional Python module to call to start the worker. remote_python: str, optional - Path to Python on remote nodes. + Path to Python on remote nodes. This can be specified to use the + Python executable of a conda environment. Examples -------- From 2aa59525ea2f45e9a768409f8604d63fcbab9e79 Mon Sep 17 00:00:00 2001 From: Scott Date: Wed, 10 Jun 2020 09:51:45 -0500 Subject: [PATCH 07/10] Manually rip out untested code snippet --- distributed/deploy/ssh.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/distributed/deploy/ssh.py b/distributed/deploy/ssh.py index 9df6042fd40..2cd73183b7d 100644 --- a/distributed/deploy/ssh.py +++ b/distributed/deploy/ssh.py @@ -303,12 +303,7 @@ def SSHCluster( This depends on a successful connection between the your machine and the Dask scheduler. Firewalls can complicate this, which results in a - timeout because a connection can't be made. This snippet will resolve any - networking if only port 22 is open for SSH access: - - .. code:: python - - SSHCluster(..., scheduler_options={"port": 22}) + timeout because a connection can't be made. An example using a different worker module, in particular the ``dask-cuda-worker`` command from the ``dask-cuda`` project. 
From 93cb34294f4c3fc4048ed4450aba06bbeae2c76f Mon Sep 17 00:00:00 2001 From: Scott Date: Thu, 11 Jun 2020 13:10:48 -0500 Subject: [PATCH 08/10] Add SSH port forwarding --- distributed/deploy/ssh.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/distributed/deploy/ssh.py b/distributed/deploy/ssh.py index 2cd73183b7d..b3401ff2e93 100644 --- a/distributed/deploy/ssh.py +++ b/distributed/deploy/ssh.py @@ -303,7 +303,26 @@ def SSHCluster( This depends on a successful connection between the your machine and the Dask scheduler. Firewalls can complicate this, which results in a - timeout because a connection can't be made. + timeout because a connection can't be made. An alternative approach to + circumvent this issue is to start the Dask scheduler and its workers on + the cluster, then port-forward the Dask scheduler and dashboard to your + local machine: + + .. code:: bash + + $ # Remote setup: dask-scheduler (default ports of 8786 and 8787) + $ # On local machine: + $ ssh -L 8796:localhost:8786 -L 8797:localhost:8787 foo@machine + $ python + >>> from distributed import Client + >>> client = Client("localhost:8796") + >>> # Perform simple computation on remote machine: + >>> client.submit(sum, [1, 2]) + + Now, all computation submitted to Dask scheduler will happen on the remote + cluster but development will take place on your local machine. The + dashboard will be available from the local machine at + ``http://localhost:8797`` to track the computation. An example using a different worker module, in particular the ``dask-cuda-worker`` command from the ``dask-cuda`` project. 
From 952b202fa28b55b5fa4d1c58fc338a289edbea53 Mon Sep 17 00:00:00 2001 From: Scott Date: Thu, 11 Jun 2020 14:43:33 -0500 Subject: [PATCH 09/10] small edit to example --- distributed/deploy/ssh.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/distributed/deploy/ssh.py b/distributed/deploy/ssh.py index b3401ff2e93..e0a670831e2 100644 --- a/distributed/deploy/ssh.py +++ b/distributed/deploy/ssh.py @@ -310,7 +310,9 @@ def SSHCluster( .. code:: bash - $ # Remote setup: dask-scheduler (default ports of 8786 and 8787) + $ # Remote setup: Dask cluster with dask-scheduler and dask-worker + $ # (by default, scheduler uses port 8786 and dashboard at 8787) + $ $ # On local machine: $ ssh -L 8796:localhost:8786 -L 8797:localhost:8787 foo@machine $ python From dfb20fd93783080dc64def3d4197394fa1fa4d15 Mon Sep 17 00:00:00 2001 From: Scott Date: Thu, 11 Jun 2020 14:53:54 -0500 Subject: [PATCH 10/10] Add SO link --- distributed/deploy/ssh.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/distributed/deploy/ssh.py b/distributed/deploy/ssh.py index e0a670831e2..e0719cdeb59 100644 --- a/distributed/deploy/ssh.py +++ b/distributed/deploy/ssh.py @@ -313,8 +313,10 @@ def SSHCluster( $ # Remote setup: Dask cluster with dask-scheduler and dask-worker $ # (by default, scheduler uses port 8786 and dashboard at 8787) $ - $ # On local machine: + $ # On local machine, SSH & port forward into remote machine $ ssh -L 8796:localhost:8786 -L 8797:localhost:8787 foo@machine + $ + $ # In a separate shell (or https://stackoverflow.com/q/2241063) $ python >>> from distributed import Client >>> client = Client("localhost:8796")