In [1]:
import copy
import fnmatch
import json
import getpass
import os
import pathlib
import datetime
                    
from dask.distributed import LocalCluster, SSHCluster 
from laserfarm import Retiler, DataProcessing, GeotiffWriter, MacroPipeline
from laserfarm.remote_utils import get_wdclient, get_info_remote, list_remote

def last_modified(opts, remote_path):
    """Return the modification time reported for a path on the remote storage.

    Parameters
    ----------
    opts
        Options passed to ``get_wdclient`` to create the WebDAV client.
    remote_path
        Remote path to query; must provide ``as_posix()`` (e.g. a
        ``pathlib.Path``).

    Returns
    -------
    datetime.datetime
        Naive datetime parsed from the ``'modified'`` entry of the remote
        info, which is expected to look like ``'Mon, 23 Mar 2020 10:00:00 GMT'``.

    Raises
    ------
    ValueError
        If the ``'modified'`` entry does not match the expected format.
    """
    client = get_wdclient(opts)
    remote_info = get_info_remote(client, remote_path.as_posix())
    # The literal 'GMT' suffix is matched as text, so no timezone is attached
    timestamp_format = '%a, %d %b %Y %H:%M:%S GMT'
    return datetime.datetime.strptime(remote_info['modified'], timestamp_format)

# Macro-Pipeline Workflow - GeoTIFF Export (Mask)

## Set Run-Specific Input

Set the output path on the SURF dCache and, when prompted, enter the WebDAV username and password used to access it.

In [2]:
# Root directory of the project on the remote storage
remote_path_root = pathlib.Path(
    '/pnfs/grid.sara.nl/data/projects.nl/eecolidar/01_Escience/'
)

# We have mounted the dCache system to read input, so no remote input path is set here
#remote_path_input = remote_path_root / 'ALS/Netherlands/ahn3_current/ahn3_current_TOP10NL_ud20200323_targets_all'

# dCache directory that receives the copied geotiff files
remote_path_output = remote_path_root.joinpath(
    'ALS/Netherlands/ahn3_current/ahn3_current_TOP10NL_ud20200323_geotiff_classified'
)

In [3]:
# Ask for the WebDAV credentials interactively so that they are not stored in the notebook.
# getpass prompts for the password without echoing what is typed.
webdav_login = input('WebDAV username: ')
webdav_password = getpass.getpass('WebDAV password: ')

WebDAV username:  fnattin4
WebDAV password:  ············


## Check Connection to Remote Storage

In [4]:
# Options used to create the WebDAV client that talks to the remote storage
wd_opts = {
    'webdav_hostname': 'https://webdav.grid.surfsara.nl:2880',
    'webdav_login': webdav_login,
    'webdav_password': webdav_password,
    'webdav_timeout': 200
}
# Fail early, with an explanatory message, if the remote root cannot be accessed
assert get_wdclient(wd_opts).check(remote_path_root.as_posix()), (
    f'Cannot access {remote_path_root.as_posix()} on {wd_opts["webdav_hostname"]}: '
    'check the WebDAV credentials and the connection to the remote storage'
)

## Setup Cluster

Set up the Dask cluster used for all the macro-pipeline calculations.

In [5]:
# Local scratch directory; the Dask worker space is created inside it
local_tmp = pathlib.Path('/data/local/tmp')

# Number of worker processes started on each worker host
nprocs_per_node = 1

# start the cluster
scheduler_node = 'node1'

# hosts = [f'node{i}' for i in range(1, 11)]
hosts = ['node2']
# The scheduler node is listed first, followed by the worker hosts
cluster = SSHCluster(hosts=[scheduler_node] + hosts,
                     # NOTE(review): 'known_hosts': None switches off SSH host-key
                     # verification - acceptable only on a trusted network
                     connect_options={'known_hosts': None,
                                      'username': 'ubuntu',
                                      'client_keys': '/home/ubuntu/.ssh/id_rsa'},
                     worker_options={'nthreads': 1,
                                     'nprocs': nprocs_per_node,
                                     'memory_limit': 0,
                                     # Pass a plain string: the worker options are forwarded to
                                     # the remote hosts and a pathlib.Path may not be serializable
                                     'local_directory': (local_tmp/'dask-worker-space').as_posix()},
                     scheduler_options={'dashboard_address': '8787'})
cluster

distributed.deploy.ssh - INFO - distributed.scheduler - INFO - -----------------------------------------------
distributed.deploy.ssh - INFO - distributed.scheduler - INFO - -----------------------------------------------
distributed.deploy.ssh - INFO - distributed.scheduler - INFO - Clear task state
distributed.deploy.ssh - INFO - distributed.scheduler - INFO -   Scheduler at: tcp://145.100.59.123:8786
distributed.deploy.ssh - INFO - distributed.nanny - INFO -         Start Nanny at: 'tcp://145.100.59.182:37843'
distributed.deploy.ssh - INFO - distributed.worker - INFO -       Start worker at: tcp://145.100.59.182:46851


## GeoTIFF Export

Export the rasterized features from the target grid to GeoTIFF files.

In [7]:
# Handle used for the output files: AHN3 dataset, mask for building, road and water, 10m target grid spacing
output_handle = 'ahn3_mask-building-road-water_10m'

# Input dictionary that configures the geotiff export pipeline
# NOTE: the input folder lives on the dCache storage, which we have mounted with rclone
geotiff_export_input_classification = dict(
    setup_local_fs=dict(
        input_folder='/data/local/home/eecolidar_webdav/01_Escience/ALS/Netherlands/ahn3_current/ahn3_current_TOP10NL_ud20200323_targets_classified',
        output_folder=local_tmp.as_posix(),
    ),
    parse_point_cloud=dict(),
    data_split=dict(xSub=1, ySub=1),
    create_subregion_geotiffs=dict(output_handle=output_handle),
    pushremote=remote_path_output.as_posix(),
    # 'cleanlocalfs' is deliberately left out: DO NOT CLEAN - it would erase remote input..
)


# Store the input dictionary as a JSON file
with open('geotiff_export_input_classification.json', 'w') as json_file:
    json_file.write(json.dumps(geotiff_export_input_classification))

In [None]:
# Macro-pipeline holding the single GeoTIFF export task
macro = MacroPipeline()

# Writer for the 'ground_type' band, configured with the WebDAV options and the export input
gw = (
    GeotiffWriter(bands='ground_type', label='ground_type')
    .setup_webdav_client(wd_opts)
    .config(geotiff_export_input_classification)
)
macro.add_task(gw)

# Execute on the Dask cluster created earlier in the notebook
macro.setup_cluster(cluster=cluster)

# run!
macro.run()

# Write the outcome to file, then record the labels of any failed pipelines and stop with an error
macro.print_outcome(to_file='geotiff_export_classification.out')
failed = macro.get_failed_pipelines()
if failed:
    with open('geotiff_export_classification_failed.json', 'w') as failed_file:
        failed_labels = [pipeline.label for pipeline in failed]
        json.dump(failed_labels, failed_file)
    raise RuntimeError('Some of the pipelines have failed')

In [None]:
from dask.distributed import Client, Future

# Cancel every task whose key the scheduler still reports in who_has().
# The scheduler address is taken from the running cluster rather than hard-coded,
# so the cell keeps working when the cluster is deployed on other nodes.
# The client is closed on exit from the `with` block: a client left open keeps
# sending heartbeats and logs CommClosedError once the scheduler is shut down.
with Client(cluster.scheduler_address) as client:
    futures = [Future(key, client=client) for key in client.who_has().keys()]
    client.cancel(futures)
    # Drop the futures while the client is still connected, so that they are not
    # released against a closed client later on
    del futures

## Terminate cluster

In [9]:
# Shut down the macro-pipeline; going by the section title this is expected to
# terminate the Dask cluster attached to it as well - TODO confirm
macro.shutdown()

tornado.application - ERROR - Exception in callback <bound method Client._heartbeat of <Client: 'tcp://145.100.59.123:8786' processes=1 threads=1>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/tornado/ioloop.py", line 907, in _run
    return self.callback()
  File "/usr/local/lib/python3.7/site-packages/distributed/client.py", line 1157, in _heartbeat
    self.scheduler_comm.send({"op": "heartbeat-client"})
  File "/usr/local/lib/python3.7/site-packages/distributed/batched.py", line 117, in send
    raise CommClosedError
distributed.comm.core.CommClosedError
tornado.application - ERROR - Exception in callback <bound method Client._heartbeat of <Client: 'tcp://145.100.59.123:8786' processes=1 threads=1>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/tornado/ioloop.py", line 907, in _run
    return self.callback()
  File "/usr/local/lib/python3.7/site-packages/distributed/client.py", line 1157, in _heartbeat
    sel