# idact - Prometheus sandbox

## Initial setup

Add `idact` to path:

In [7]:
import sys
import os
import bitmath
import getpass
import contextlib
import fabric
import logging
from pprint import pprint

def append_idact_path():
    """Make the directory one level above the notebook importable.

    Resolves the parent of the current working directory (presumably the
    idact checkout this notebook lives in) and appends it to ``sys.path``.
    The entry is added at most once, so re-running the cell does not pile
    up duplicate ``sys.path`` entries.
    """
    idact_path = os.path.realpath(os.path.join(os.getcwd(), '../'))
    # Guard against duplicates: notebook cells are routinely executed twice.
    if idact_path not in sys.path:
        sys.path.append(idact_path)
append_idact_path()

from idact import *
from idact.detail.auth.set_password import set_password

# Keep keys generated by this notebook in a dedicated directory next to the
# notebooks folder; the commented line below switches to the user's ~/.ssh.
os.environ['IDACT_KEY_LOCATION'] = os.path.join(os.getcwd(), '../.notebook-ssh')
# os.environ['IDACT_KEY_LOCATION'] = os.path.expanduser('~/.ssh')
# exist_ok=True keeps the cell re-runnable without a separate existence check
# (no check-then-create race), and fails loudly if the path is not a directory.
os.makedirs(os.environ['IDACT_KEY_LOCATION'], exist_ok=True)

# Login name used for the cluster account below; replace with your own.
USER = 'plggarstka'

Hide debug information, set up context manager stack (for testing purposes)

## Add cluster (only first run)

In [8]:
# Option 1 (active): have a new RSA key pair generated when the cluster is added.
key = KeyType.RSA  # Generate RSA key
# Option 2: point to an existing private key file instead.
# key = os.path.join(os.path.expanduser('~/.ssh'), 'id_rsa')

In [9]:
# Register the cluster under the name "pro", authenticating with a public key.
# `key` comes from the previous cell; with install_key=True the key is
# installed on first connect using password authentication (see the log
# output of node.connect() further down).
cluster = add_cluster(name="pro",
                      user=USER,
                      host="pro.cyfronet.pl",
                      port=22,
                      auth=AuthMethod.PUBLIC_KEY,
                      key=key,
                      install_key=True,
                      scratch="$SCRATCH")
# Persist the environment so that subsequent runs can load it from '.idact-env'.
save_environment('.idact-env')

2018-10-20 21:41:50 INFO: Generating public-private key pair.


## Load cluster (subsequent runs)

In [10]:
# Restore the environment saved in '.idact-env' and look up the cluster
# that was registered under the name "pro".
load_environment('.idact-env')
cluster = show_cluster("pro")
cluster  # bare expression: display the cluster's repr as the cell output

Cluster(pro.cyfronet.pl, 22, plggarstka, auth=AuthMethod.PUBLIC_KEY, key='E:\\shared\\uni\\eng-project\\notebooks\\../.notebook-ssh\\id_rsa_qk', install_key=True, disable_sshd=False)

In [11]:
# Log at INFO level; swap in the DEBUG line below when troubleshooting.
set_log_level(logging.INFO)
#set_log_level(logging.DEBUG)
# Save again, presumably so the chosen log level is stored in '.idact-env'
# (the environment JSON shown later contains a "logLevel" entry).
save_environment('.idact-env')

In [12]:
node = cluster.get_access_node()
node

Node(pro.cyfronet.pl:22, None)

On your first action, you will be asked for a password to install the key.
You can connect explicitly (optional) to do this right now:

In [13]:
node.connect()

2018-10-20 21:41:50 INFO: Installing key using password authentication.
Password for plggarstka@pro.cyfronet.pl:22: 


In [14]:
node.run('whoami')

'plggarstka'

In [15]:
node.run('hostname')

'login01.pro.cyfronet.pl'

## Allocate nodes

In [16]:
# Request 2 nodes, each with 2 cores and 10 GiB of memory, for 20 minutes.
# The call returns before the allocation is granted: the nodes show up as
# NotAllocated until nodes.wait() is called in a later cell.
# native_args look like Slurm options (partition/account) and are presumably
# forwarded to the scheduler as-is - adjust them to your own grant.
nodes = cluster.allocate_nodes(nodes=2,
                               cores=2,
                               memory_per_node=bitmath.GiB(10),
                               walltime=Walltime(minutes=20),
                               native_args={
                                   '--partition': 'plgrid-testing',
                                   '--account': 'intdata'
                               })

2018-10-20 21:42:17 INFO: Creating the ssh directory.


In [17]:
nodes

Nodes([Node(NotAllocated),Node(NotAllocated)], SlurmAllocation(job_id=13775764))

In [18]:
nodes.wait()
nodes

Nodes([Node(p1875:51901, 2018-10-20 20:02:27.757235+00:00),Node(p1885:55819, 2018-10-20 20:02:27.757235+00:00)], SlurmAllocation(job_id=13775764))

## Run commands

In [19]:
nodes[0].run('whoami')

'plggarstka'

In [20]:
nodes[0].run('hostname')

'p1875'

In [21]:
nodes[1].run('squeue')

'JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)\n          13775764 plgrid-te     wrap plggarst  R       0:13      2 p[1875,1885]'

In [22]:
nodes[1].run('hostname')

'p1885'

## Examine node resources

In [23]:
nodes[0].resources.memory_total

GiB(10.0)

In [24]:
nodes[0].resources.memory_usage

GiB(0.022434234619140625)

In [25]:
nodes[0].resources.cpu_cores

2

In [26]:
nodes[0].resources.cpu_usage

1.0

## Tunnel

In [27]:
tunnel = nodes[0].tunnel(here=9000, there=10000)

In [28]:
tunnel

MultiHopTunnel(9000:10000)

In [29]:
tunnel.close()

## Deploy notebook

One-time config step (cluster-specific):

In [30]:
# Shell commands for the Jupyter deployment (presumably run on the node before
# Jupyter starts); here a cluster-provided Python 3.6.2 module is loaded.
cluster.config.setup_actions.jupyter = ['module load plgrid/tools/python-intel/3.6.2']
# Persist the config change so this step is needed only once.
save_environment('.idact-env')

To run Jupyter Notebook on the cluster:

In [31]:
nb = nodes[0].deploy_notebook(local_port=8080)
nb



JupyterDeployment(8080 -> Node(p1875:51901, 2018-10-20 20:02:27.757235+00:00)

In [32]:
nodes[0].resources.memory_usage

GiB(0.08092498779296875)

In [33]:
nb.local_port

8080

To open the deployed notebook server in a new tab:

In [34]:
nb.open_in_browser()

In [35]:
nodes[0].resources.memory_usage

GiB(0.08106613159179688)

### Push and pull notebook

You can access the deployed notebook from multiple places by first pushing it:

In [36]:
cluster.push_deployment(nb)

2018-10-20 21:43:15 INFO: Pushing deployment: JupyterDeployment(8080 -> Node(p1875:51901, 2018-10-20 20:02:27.757235+00:00)


And then pulling:

In [37]:
deployments = cluster.pull_deployments()
deployments.jupyter_deployments

2018-10-20 21:43:25 INFO: Pulling deployments.


2018-10-20 21:43:30,650| ERROR   | Could not establish connection from ('127.0.0.1', 54604) to remote side of the tunnel
2018-10-20 21:43:30,658| ERROR   | Exception: Error reading SSH protocol banner
2018-10-20 21:43:30,765| ERROR   | Traceback (most recent call last):
2018-10-20 21:43:30,766| ERROR   |   File "E:\Anaconda3\envs\idact-dev\lib\site-packages\paramiko\transport.py", line 2044, in _check_banner
2018-10-20 21:43:30,767| ERROR   |     buf = self.packetizer.readline(timeout)
2018-10-20 21:43:30,768| ERROR   |   File "E:\Anaconda3\envs\idact-dev\lib\site-packages\paramiko\packet.py", line 353, in readline
2018-10-20 21:43:30,769| ERROR   |     buf += self._read_timeout(timeout)
2018-10-20 21:43:30,770| ERROR   |   File "E:\Anaconda3\envs\idact-dev\lib\site-packages\paramiko\packet.py", line 542, in _read_timeout
2018-10-20 21:43:30,771| ERROR   |     raise EOFError()
2018-10-20 21:43:30,772| ERROR   | EOFError
2018-10-20 21:43:30,774| ERROR   | 
2018-10-20 21:43:30,775| ERROR

2018-10-20 21:43:36 INFO: Pulled Jupyter deployment: JupyterDeployment(54614 -> Node(p1875:51901, 2018-10-20 20:02:27.757235+00:00)


[JupyterDeployment(54614 -> Node(p1875:51901, 2018-10-20 20:02:27.757235+00:00)]

In [38]:
nb_2 = deployments.jupyter_deployments[0]
nb_2

JupyterDeployment(54614 -> Node(p1875:51901, 2018-10-20 20:02:27.757235+00:00)

In [39]:
nb_2.open_in_browser()

In [40]:
nb_2.cancel()

2018-10-20 21:43:58 INFO: Cancelling Jupyter deployment.


More on pushing and pulling deployments in the next section.

## Push and pull nodes

In order to work with Dask, you would usually need a notebook running on the cluster, as shown above.

To access the allocated nodes from the cluster, you need to push their deployment first, same as the notebook deployment above:

In [41]:
cluster.push_deployment(nodes)

2018-10-20 21:44:22 INFO: Pushing deployment: Nodes([Node(p1875:51901, 2018-10-20 20:02:27.757235+00:00),Node(p1885:55819, 2018-10-20 20:02:27.757235+00:00)], SlurmAllocation(job_id=13775764))


Then, you would pull the deployment on the cluster:

In [42]:
deployments = cluster.pull_deployments()
deployments

2018-10-20 21:44:31 INFO: Pulling deployments.
2018-10-20 21:44:36 INFO: Pulled allocation deployment: Nodes([Node(p1875:51901, 2018-10-20 20:02:27.757235+00:00),Node(p1885:55819, 2018-10-20 20:02:27.757235+00:00)], SlurmAllocation(job_id=13775764))
2018-10-20 21:44:36 INFO: Pulled Jupyter deployment: JupyterDeployment(54644 -> Node(p1875:51901, 2018-10-20 20:02:27.757235+00:00)


2018-10-20 21:44:37,989| ERROR   | Secsh channel 0 open FAILED: Connection refused: Connect failed
2018-10-20 21:44:37,996| ERROR   | Could not establish connection from ('127.0.0.1', 54644) to remote side of the tunnel




2018-10-20 21:44:40,048| ERROR   | Secsh channel 1 open FAILED: Connection refused: Connect failed
2018-10-20 21:44:40,051| ERROR   | Could not establish connection from ('127.0.0.1', 54644) to remote side of the tunnel




2018-10-20 21:44:42,101| ERROR   | Secsh channel 2 open FAILED: Connection refused: Connect failed
2018-10-20 21:44:42,104| ERROR   | Could not establish connection from ('127.0.0.1', 54644) to remote side of the tunnel




2018-10-20 21:44:44,151| ERROR   | Secsh channel 3 open FAILED: Connection refused: Connect failed
2018-10-20 21:44:44,154| ERROR   | Could not establish connection from ('127.0.0.1', 54644) to remote side of the tunnel




SynchronizedDeployments(nodes=1, jupyter_deployments=0)

In [43]:
nodes = deployments.nodes[0]
nodes

Nodes([Node(p1875:51901, 2018-10-20 20:02:27.757235+00:00),Node(p1885:55819, 2018-10-20 20:02:27.757235+00:00)], SlurmAllocation(job_id=13775764))

Essentially, this feature is intended for using an allocation in multiple notebooks at once.

Deployments are cleared automatically if they are expired or cancelled. They can also be cleared manually by running:

In [44]:
cluster.clear_pushed_deployments()

2018-10-20 21:45:06 INFO: Clearing deployments.


## Deploy Dask

One-time config step (cluster-specific):

In [45]:
# Shell commands for the Dask deployment (presumably run on each node before
# Dask starts); the same Python module as for Jupyter is loaded.
cluster.config.setup_actions.dask = ['module load plgrid/tools/python-intel/3.6.2']
# Same value as the scratch argument passed to add_cluster() earlier; setting
# it here also covers environments where it was not given at that point.
cluster.config.scratch = '$SCRATCH'
# Persist the config change so this step is needed only once.
save_environment('.idact-env')

In [46]:
dd = deploy_dask(nodes)
dd

2018-10-20 21:45:14 INFO: Deploying Dask on 2 nodes.
2018-10-20 21:45:14 INFO: Connecting to p1875:51901 (1/2).
2018-10-20 21:45:15 INFO: Connecting to p1885:55819 (2/2).
2018-10-20 21:45:15 INFO: Deploying scheduler on the first node: p1875.


2018-10-20 21:45:38,619| ERROR   | Could not establish connection from ('127.0.0.1', 54674) to remote side of the tunnel
2018-10-20 21:45:38,624| ERROR   | Exception: Error reading SSH protocol banner
2018-10-20 21:45:38,630| ERROR   | Traceback (most recent call last):
2018-10-20 21:45:38,631| ERROR   |   File "E:\Anaconda3\envs\idact-dev\lib\site-packages\paramiko\transport.py", line 2044, in _check_banner
2018-10-20 21:45:38,632| ERROR   |     buf = self.packetizer.readline(timeout)
2018-10-20 21:45:38,634| ERROR   |   File "E:\Anaconda3\envs\idact-dev\lib\site-packages\paramiko\packet.py", line 353, in readline
2018-10-20 21:45:38,636| ERROR   |     buf += self._read_timeout(timeout)
2018-10-20 21:45:38,637| ERROR   |   File "E:\Anaconda3\envs\idact-dev\lib\site-packages\paramiko\packet.py", line 542, in _read_timeout
2018-10-20 21:45:38,638| ERROR   |     raise EOFError()
2018-10-20 21:45:38,639| ERROR   | EOFError
2018-10-20 21:45:38,641| ERROR   | 
2018-10-20 21:45:38,642| ERROR

2018-10-20 21:45:44 INFO: Checking scheduler connectivity from p1875 (1/2).
2018-10-20 21:45:44 INFO: Checking scheduler connectivity from p1885 (2/2).
2018-10-20 21:45:44 INFO: Deploying workers.
2018-10-20 21:45:44 INFO: Deploying worker 1/2.


2018-10-20 21:45:56,135| ERROR   | Could not establish connection from ('127.0.0.1', 54686) to remote side of the tunnel
2018-10-20 21:45:56,142| ERROR   | Exception: Error reading SSH protocol banner
2018-10-20 21:45:56,153| ERROR   | Traceback (most recent call last):
2018-10-20 21:45:56,155| ERROR   |   File "E:\Anaconda3\envs\idact-dev\lib\site-packages\paramiko\transport.py", line 2044, in _check_banner
2018-10-20 21:45:56,158| ERROR   |     buf = self.packetizer.readline(timeout)
2018-10-20 21:45:56,161| ERROR   |   File "E:\Anaconda3\envs\idact-dev\lib\site-packages\paramiko\packet.py", line 353, in readline
2018-10-20 21:45:56,162| ERROR   |     buf += self._read_timeout(timeout)
2018-10-20 21:45:56,163| ERROR   |   File "E:\Anaconda3\envs\idact-dev\lib\site-packages\paramiko\packet.py", line 542, in _read_timeout
2018-10-20 21:45:56,164| ERROR   |     raise EOFError()
2018-10-20 21:45:56,166| ERROR   | EOFError
2018-10-20 21:45:56,167| ERROR   | 
2018-10-20 21:45:56,168| ERROR

2018-10-20 21:46:01 INFO: Deploying worker 2/2.
2018-10-20 21:46:18 INFO: Validating worker 1/2.
2018-10-20 21:46:18 INFO: Validating worker 2/2.


DaskDeployment(scheduler=tcp://localhost:54672/tcp://172.20.71.90:36580, workers=2)

In [47]:
nodes[0].resources.memory_usage

GiB(0.38315582275390625)

Get Dask client:

In [48]:
client = dd.get_client()
client

0,1
Client  Scheduler: tcp://localhost:54672  Dashboard: http://localhost:37815/status,Cluster  Workers: 2  Cores: 4  Memory: 21.47 GB


In [49]:
nodes[0].resources.cpu_usage

4.0

Computation will work only if Python and library versions match:

In [50]:
# Left commented out: this only succeeds when the local and cluster Python and
# library versions match. Uncomment to try it.
# client.submit passes its extra arguments to the function, so the lambda must
# accept the submitted value as a parameter.
#x = client.submit(lambda value: value + 1, 10)
#x.result() == 11

Diagnostics servers are tunnelled:

In [51]:
dd.diagnostics.addresses

['http://localhost:37815', 'http://localhost:54696', 'http://localhost:54703']

To open diagnostics servers in new tabs:

In [52]:
dd.diagnostics.open_all()

In [53]:
dd.cancel()

2018-10-20 21:46:57 INFO: Cancelling worker deployment on p1885.
2018-10-20 21:47:05 INFO: Cancelling worker deployment on p1875.
2018-10-20 21:47:13 INFO: Cancelling scheduler deployment on p1875.


## Close

In [54]:
nodes.running()

True

In [55]:
nodes.cancel()

2018-10-20 21:47:22 INFO: Cancelling job 13775764.


In [56]:
nodes.running()

False

In [57]:
node.run('squeue')

'JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)\n          13775764 plgrid-te     wrap plggarst CG       4:53      2 p[1875,1885]'

## Push and pull the environment

When working on a cluster, it may be useful to synchronize idact config with the local machine. Pushing the environment will merge the local environment into the remote environment.

In [58]:
push_environment(cluster, path='~/.idact-notebook-env')

2018-10-20 21:47:32 INFO: Pushing the environment to cluster.
2018-10-20 21:47:33 ERROR: Failure: Getting file from node pro.cyfronet.pl: /net/people/plggarstka/.idact-notebook-env
2018-10-20 21:47:33 ERROR: Failure: Deserializing the environment from cluster.
2018-10-20 21:47:33 INFO: Remote environment is missing, current environment will be copied to cluster.


In [59]:
print(node.run('cat ~/.idact-notebook-env'))

{
    "clusters": {
        "pro": {
            "auth": "PUBLIC_KEY",
            "disableSshd": false,
            "host": "pro.cyfronet.pl",
            "installKey": true,
            "key": null,
            "port": 22,
            "portInfoRetries": 5,
            "scratch": "$SCRATCH",
            "setupActions": {
                "dask": [
                    "module load plgrid/tools/python-intel/3.6.2"
                ],
                "jupyter": [
                    "module load plgrid/tools/python-intel/3.6.2"
                ]
            },
            "user": "plggarstka"
        }
    },
    "logLevel": 20
}


The reverse operation is pulling the environment, which merges the remote environment into the local environment. Machine-specific information like the private key path is skipped when pushing or pulling.

In [60]:
pull_environment(cluster, path='~/.idact-notebook-env')

2018-10-20 21:47:38 INFO: Pulling the environment from cluster.


The `path` parameter is optional. It defaults to `~/.idact.conf`, or to the value of the remote `IDACT_CONFIG_PATH` environment variable if it is set.

In [61]:
node.run('rm -v ~/.idact-notebook-env')

'removed ‘/net/people/plggarstka/.idact-notebook-env’'

## Remove cluster

A cluster can be removed from the environment.

In [62]:
# Register a throwaway cluster entry purely to demonstrate remove_cluster().
# No auth method is passed, so the default is used (see the log output below).
add_cluster(name='fake',
            user='fakeuser',
            host='fakehost',
            port=2222)

2018-10-20 21:47:45 INFO: No auth method specified, defaulting to password-based.


Cluster(fakehost, 2222, fakeuser, auth=AuthMethod.ASK, key=None, install_key=True, disable_sshd=False)

In [63]:
show_clusters()

{'pro': Cluster(pro.cyfronet.pl, 22, plggarstka, auth=AuthMethod.PUBLIC_KEY, key='E:\\shared\\uni\\eng-project\\notebooks\\../.notebook-ssh\\id_rsa_qk', install_key=False, disable_sshd=False),
 'fake': Cluster(fakehost, 2222, fakeuser, auth=AuthMethod.ASK, key=None, install_key=True, disable_sshd=False)}

In [64]:
remove_cluster('fake')

In [65]:
show_clusters()

{'pro': Cluster(pro.cyfronet.pl, 22, plggarstka, auth=AuthMethod.PUBLIC_KEY, key='E:\\shared\\uni\\eng-project\\notebooks\\../.notebook-ssh\\id_rsa_qk', install_key=False, disable_sshd=False)}