In [1]:
import dask
# Display the active Dask configuration (a nested dict of settings).
dask.config.config


{'temporary-directory': None,
 'dataframe': {'shuffle-compression': None},
 'array': {'svg': {'size': 120}}}

In [3]:
from dask_jobqueue import LSFCluster

In [4]:
# Resources requested per LSF job (each job asks for one node).
# NOTE(review): the job script recorded below shows "-W 00:30", so that output
# was produced with a different walltime than the value set here -- re-run to refresh.
cluster = LSFCluster(
    # --- worker sizing ---
    cores=8,
    # The default is about sqrt(cores) processes; a single process gives the
    # worker the maximum number of threads per machine, which is better for NumPy.
    # See https://docs.dask.org/en/latest/setup/single-machine.html
    processes=1,
    memory="4 GB",
    # --- LSF submission ---
    project="VEN201",
    walltime="00:10",
    job_extra=["-nnodes 1"],          # shows up as "#BSUB -nnodes 1" in the job script
    header_skip=["-R", "-n ", "-M"],  # keep these generated #BSUB options out of the header
    use_stdin=False,
    # --- networking ---
    interface='ib0',
    scheduler_options={"dashboard_address": ":3762"},
)

## Let's see what is sent to LSF

In [5]:
# Print the generated job script: the #BSUB header plus the dask-worker launch command.
print(cluster.job_script())

#!/usr/bin/env bash

#BSUB -J dask-worker
#BSUB -P VEN201
#BSUB -W 00:30
#BSUB -nnodes 1

/ccs/home/vanstee/.conda/envs/powerai-ornl/bin/python -m distributed.cli.dask_worker tcp://10.41.0.32:36525 --nthreads 8 --memory-limit 4.00GB --name name --nanny --death-timeout 60 --interface ib0



In [6]:
from dask.distributed import Client
# Connect this notebook session to the cluster's scheduler.
client = Client(cluster)

In [6]:
# client.restart()

In [7]:
# Show the client summary: scheduler address, dashboard link, and worker/core/memory totals.
client

0,1
Client  Scheduler: tcp://10.41.0.32:36525  Dashboard: http://10.41.0.32:3762/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [8]:
# Request 2 workers; the bjobs listing further down shows two dask-worker jobs.
cluster.scale(2)

In [None]:
# Re-display the client summary to check the worker count after scaling.
client

In [12]:
# List the current LSF jobs to confirm the dask-worker jobs are running.
!bjobs

JOBID   USER       STAT   SLOTS    QUEUE       START_TIME    FINISH_TIME   JOB_NAME                      
376497  vanstee    RUN    43       batch       Sep 29 16:23  Sep 29 16:53  dask-worker                   
376498  vanstee    RUN    43       batch       Sep 29 16:23  Sep 29 16:53  dask-worker                   


# Numpy simple example ...

In [14]:
import dask.array as da
# 25 M element array (5000 x 5000 float64, ~200 MB) split into 400 chunks of 250 x 250
x = da.random.random([5000,5000], chunks=[250,250])


In [18]:
# Scale the cluster up to 8 workers.
# NOTE(review): this cell's execution count (18) is higher than that of the cells
# below it, so it was run after them -- re-run top to bottom to confirm the order.
cluster.scale(8)

In [15]:
# Trigger computation of x and rebind the name to the persisted array,
# then display its summary (shape, chunking, size).
x = x.persist()
x

Unnamed: 0,Array,Chunk
Bytes,200.00 MB,500.00 kB
Shape,"(5000, 5000)","(250, 250)"
Count,400 Tasks,400 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 200.00 MB 500.00 kB Shape (5000, 5000) (250, 250) Count 400 Tasks 400 Chunks Type float64 numpy.ndarray",5000  5000,

Unnamed: 0,Array,Chunk
Bytes,200.00 MB,500.00 kB
Shape,"(5000, 5000)","(250, 250)"
Count,400 Tasks,400 Chunks
Type,float64,numpy.ndarray


In [16]:
# Element-wise: transpose of x raised to the power x, minus the mean of x.
# This only defines y; the persist/compute cells below trigger the work.
y = x.T ** x - x.mean()

In [17]:
# persist() returns a new collection and leaves y itself unchanged, so rebind y
# to the persisted result (as is done for x above). Later cells then reuse the
# computed chunks instead of the original lazy graph.
y = y.persist()
y

Unnamed: 0,Array,Chunk
Bytes,200.00 MB,500.00 kB
Shape,"(5000, 5000)","(250, 250)"
Count,400 Tasks,400 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 200.00 MB 500.00 kB Shape (5000, 5000) (250, 250) Count 400 Tasks 400 Chunks Type float64 numpy.ndarray",5000  5000,

Unnamed: 0,Array,Chunk
Bytes,200.00 MB,500.00 kB
Shape,"(5000, 5000)","(250, 250)"
Count,400 Tasks,400 Chunks
Type,float64,numpy.ndarray


In [38]:
# compute() returns the full result to the notebook process; per the array
# summary above that is a 200 MB numpy.ndarray.
# NOTE(review): the recorded run of this cell failed with
# "AttributeError: 'str' object has no attribute 'shape'"; the cause is not
# visible here -- re-run on a fresh kernel with live workers to confirm.
#del(y)
y.compute()

AttributeError: 'str' object has no attribute 'shape'

In [None]:
# Persist vs. compute: https://distributed.dask.org/en/latest/memory.html
# Use compute() when the return value is small and you want to feed the result into other analyses.
# Use persist() (similar to cache in Spark) to trigger computation and pin the results in memory.
# Subsequent operations still build task graphs, but those graphs start from the persisted values instead of recomputing everything that came before.