# Benchmarking: VIKING20X workload

In [1]:
import xarray
import dask.distributed

In [2]:
import subprocess

## Preparations
We need a local SSH key pair for SSH-based worker spawning to work properly.

In [3]:
!ssh-keygen -b 4096 -t rsa -f id_rsa_esmvfc -C esmvfc -P ''

Generating public/private rsa key pair.
Your identification has been saved in id_rsa_esmvfc
Your public key has been saved in id_rsa_esmvfc.pub
The key fingerprint is:
SHA256:w8w4JvlRtTsVzT0G4i6bwuvQO/HVmHTRxhz/kLRuglI esmvfc
The key's randomart image is:
+---[RSA 4096]----+
|          ...+*o.|
|         ....+oXo|
|        . E.. B o|
|     . * ..= o ..|
|    o = S.=.* o .|
|     +o+ o+= +   |
|     ..+oo.      |
|      ..+.       |
|      .+.        |
+----[SHA256]-----+


In [4]:
!ls id_rsa* -la

-rw------- 1 khoeflich domänen-benutzer 3369 Jan 30 12:45 id_rsa_esmvfc
-rw-r--r-- 1 khoeflich domänen-benutzer  732 Jan 30 12:45 id_rsa_esmvfc.pub


In [5]:
!cp ${HOME}/.ssh/authorized_keys ${HOME}/.ssh/authorized_keys_$(date '+%s')

In [6]:
!cat id_rsa_esmvfc.pub >> ${HOME}/.ssh/authorized_keys

## Open Dask cluster

In [7]:
# This instantiates a Dask scheduler without Dask workers, which will be spawned manually.
cluster = dask.distributed.LocalCluster(
    n_workers=0, ip='0.0.0.0'
)

In [8]:
client = dask.distributed.Client(cluster)

In [9]:
client.scheduler_info

<bound method Client.scheduler_info of <Client: 'tcp://10.199.124.103:44545' processes=0 threads=0, memory=0 B>>

In [10]:
client

0,1
Connection method: Cluster object,Cluster type: LocalCluster
Dashboard: http://10.199.124.103:8787/status,

0,1
Status: running,Using processes: True
Dashboard: http://10.199.124.103:8787/status,Workers: 0
Total threads:  0,Total memory:  0 B

0,1
Comm: tcp://10.199.124.103:44545,Workers: 0
Dashboard: http://10.199.124.103:8787/status,Total threads:  0
Started:  Just now,Total memory:  0 B


Prepare Dask worker machines to be added,

In [11]:
scheduler_ip='10.199.124.103:44545' # see client.scheduler_info above!
nthreads='7'; memory_limit='32GiB' # should be comparable to gcloud Dask worker machines!

In [12]:
worker_targets = ["khoeflich@scalc{:02d}.geomar.de".format(machine_id) for machine_id in range(4,14)]

In [13]:
import random
random.seed(56)

In [18]:
for _ in range(89):
    random.shuffle(worker_targets)

In [19]:
worker_targets

['khoeflich@scalc06.geomar.de',
 'khoeflich@scalc11.geomar.de',
 'khoeflich@scalc09.geomar.de',
 'khoeflich@scalc13.geomar.de',
 'khoeflich@scalc04.geomar.de',
 'khoeflich@scalc05.geomar.de',
 'khoeflich@scalc10.geomar.de',
 'khoeflich@scalc08.geomar.de',
 'khoeflich@scalc12.geomar.de',
 'khoeflich@scalc07.geomar.de']

In [20]:
#worker_targets = ['khoeflich@scalc04.geomar.de']

In [21]:
def add_dask_worker(ssh_target, scheduler_ip, nthreads, memory_limit):
    worker_command = "ssh "+ssh_target+" -i id_rsa_esmvfc singularity run --bind /data/user/khoeflich/github/ESM-VFC-cloud-project/ /home/khoeflich/ESM-VFC-cloud-project/performance/pangeo-notebook_2021.07.17.sif dask-worker "+scheduler_ip+" --nthreads "+nthreads+" --memory-limit "+memory_limit
    subprocess.Popen(worker_command, shell=True)

In [22]:
#for ssh_target in worker_targets:
#    add_dask_worker(ssh_target, scheduler_ip, nthreads)

In [23]:
len(cluster.scheduler_info.get('workers').keys())

0

## Specify VIKING20X dataset

In [24]:
ds_tgrid = xarray.open_zarr('/data/user/khoeflich/github/ESM-VFC-cloud-project/VIKING20X.L46-KFS003_1m_grid_T.zarr')

In [25]:
ds_ugrid = xarray.open_zarr('/data/user/khoeflich/github/ESM-VFC-cloud-project/VIKING20X.L46-KFS003_1m_grid_U.zarr')

In [26]:
ds_vgrid = xarray.open_zarr('/data/user/khoeflich/github/ESM-VFC-cloud-project/VIKING20X.L46-KFS003_1m_grid_V.zarr')

Uncompressed size,

In [27]:
print(ds_tgrid.nbytes/1e9, 'in GB')
print(ds_ugrid.nbytes/1e9, 'in GB')
print(ds_vgrid.nbytes/1e9, 'in GB')

403.110823688 in GB
201.561307016 in GB
201.561307016 in GB


Compressed size,

In [28]:
!du -sh /data/user/khoeflich/github/ESM-VFC-cloud-project/VIKING20X.L46-KFS003_1m_grid_T.zarr/
!du -sh /data/user/khoeflich/github/ESM-VFC-cloud-project/VIKING20X.L46-KFS003_1m_grid_U.zarr/
!du -sh /data/user/khoeflich/github/ESM-VFC-cloud-project/VIKING20X.L46-KFS003_1m_grid_V.zarr/

113G	/data/user/khoeflich/github/ESM-VFC-cloud-project/VIKING20X.L46-KFS003_1m_grid_T.zarr/
79G	/data/user/khoeflich/github/ESM-VFC-cloud-project/VIKING20X.L46-KFS003_1m_grid_U.zarr/
81G	/data/user/khoeflich/github/ESM-VFC-cloud-project/VIKING20X.L46-KFS003_1m_grid_V.zarr/


## Run performance experiment

In [None]:
import time, datetime

exp_name='scalc'
remaining_worker_targets = worker_targets.copy()
no_of_realizations = 25

now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
walltime = ['machine,workers,walltime']

# Measuring.

while len(remaining_worker_targets) > 0:
    
    # Start additional workers.

    for _ in range(2):
        add_dask_worker(remaining_worker_targets.pop(0), scheduler_ip, nthreads, memory_limit)
        
    # Wait for workers to be available.
    
    while len(cluster.scheduler_info.get('workers').keys()) < len(worker_targets)-len(remaining_worker_targets):
        time.sleep(5)
    
    # Warming up.
    
    for _ in range(3):
        ds_tgrid['votemper'].mean(['deptht', 'time_counter', 'x', 'y']).compute()
        ds_tgrid['vosaline'].mean(['deptht', 'time_counter', 'x', 'y']).compute()
        ds_ugrid['vozocrtx'].mean(['depthu', 'time_counter', 'x', 'y']).compute()
        ds_vgrid['vomecrty'].mean(['depthv', 'time_counter', 'x', 'y']).compute()

    # Do the calculation.

    for _ in range(no_of_realizations):
        
        workers=len(cluster.scheduler_info.get('workers').keys())

        start_time_in_sec = time.time() # time in seconds

        ds_tgrid['votemper'].mean(['deptht', 'time_counter', 'x', 'y']).compute()
        ds_tgrid['vosaline'].mean(['deptht', 'time_counter', 'x', 'y']).compute()
        ds_ugrid['vozocrtx'].mean(['depthu', 'time_counter', 'x', 'y']).compute()
        ds_vgrid['vomecrty'].mean(['depthv', 'time_counter', 'x', 'y']).compute()

        end_time_in_sec = time.time() # time in seconds

        output_row = "{},{},{}".format(exp_name, workers, end_time_in_sec-start_time_in_sec)
        walltime.append(output_row)

distributed.nanny - INFO -         Start Nanny at: 'tcp://10.199.124.111:43919'
distributed.nanny - INFO -         Start Nanny at: 'tcp://10.199.124.106:43895'
distributed.worker - INFO -       Start worker at: tcp://10.199.124.111:41607
distributed.worker - INFO -          Listening to: tcp://10.199.124.111:41607
distributed.worker - INFO -          dashboard at:       10.199.124.111:46663
distributed.worker - INFO - Waiting to connect to: tcp://10.199.124.103:44545
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO -               Threads:                          7
distributed.worker - INFO -                Memory:                  32.00 GiB
distributed.worker - INFO -       Local Directory: /home/khoeflich/dask-worker-space/worker-e483k12z
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO -         Registered to: tcp://10.199.124.103:44545
distributed.worker - INFO - ---------

In [32]:
client.close(); cluster.close()

distributed.worker - INFO - Stopping worker at tcp://10.199.124.104:39785
distributed.worker - INFO - Stopping worker at tcp://10.199.124.113:45693
distributed.worker - INFO - Stopping worker at tcp://10.199.124.107:43043
distributed.worker - INFO - Stopping worker at tcp://10.199.124.105:38509
distributed.worker - INFO - Stopping worker at tcp://10.199.124.108:41167
distributed.worker - INFO - Stopping worker at tcp://10.199.124.112:43189
distributed.worker - INFO - Stopping worker at tcp://10.199.124.109:37227
distributed.worker - INFO - Stopping worker at tcp://10.199.124.110:40675
distributed.worker - INFO - Stopping worker at tcp://10.199.124.111:41607
distributed.worker - INFO - Stopping worker at tcp://10.199.124.106:36727
distributed.nanny - INFO - Worker closed
distributed.nanny - INFO - Worker closed
distributed.nanny - INFO - Worker closed
distributed.nanny - INFO - Worker closed
distributed.nanny - INFO - Worker closed
distributed.nanny - INFO - Worker closed
distributed.na

Write results to disk,

In [33]:
import csv

with open('./viking20x_logs/'+now+'_'+exp_name+'.log', 'w') as file:
    wr = csv.writer(file, delimiter='\n')
    wr.writerow(walltime)

## Python environment

In [34]:
pip list

Package                       Version
----------------------------- --------------------------
adal                          1.2.7
adlfs                         2021.7.1
affine                        2.3.0
aiobotocore                   1.3.3
aiohttp                       3.7.4.post0
aioitertools                  0.7.1
alembic                       1.6.5
amqp                          5.0.6
anyio                         3.2.1
appdirs                       1.4.4
argon2-cffi                   20.1.0
asciitree                     0.3.3
asgiref                       3.4.1
asn1crypto                    1.4.0
astropy                       4.2.1
async-generator               1.10
async-timeout                 3.0.1
attrs                         21.2.0
av                            8.0.3
awscli                        1.19.106
azure-core                    1.14.0
azure-datalake-store          0.0.51
azure-identity                1.6.0
azure-storage-blob            12.8.1
Babel                    

In [35]:
!conda list --explicit

# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: linux-64
@EXPLICIT
https://conda.anaconda.org/conda-forge/linux-64/gh-1.12.1-ha8f183a_1.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/nomkl-1.0-h5ca1d4c_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2021.5.30-ha878542_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-hab24e00_0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-2.6.32-he073ed8_14.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2