In [1]:
import copy
import fnmatch
import json
import getpass
import os
import pathlib
import datetime
                    
from dask.distributed import LocalCluster, SSHCluster 
from laserfarm import Retiler, DataProcessing, GeotiffWriter, MacroPipeline, Classification
from laserfarm.remote_utils import get_wdclient, get_info_remote, list_remote

def last_modified(opts, remote_path):
    """Return the modification time of a remote file.

    :param opts: options forwarded to `get_wdclient` to build the WebDAV client
    :param remote_path: path object of the remote file; its `as_posix()` form
        is the one queried
    :return: naive `datetime.datetime` parsed from the 'modified' entry of the
        info record returned by `get_info_remote`
    """
    wdclient = get_wdclient(opts)
    modified = get_info_remote(wdclient, remote_path.as_posix())['modified']
    # expected layout: abbreviated weekday, day, abbreviated month, year, time, literal 'GMT'
    return datetime.datetime.strptime(modified, '%a, %d %b %Y %H:%M:%S GMT')

# Macro-Pipeline Workflow - Classify Points Using Cadastre Data

## Set Run-Specific Input

Fill in the username/password for the SURF dCache. Choose whether you want to i) run all input files, ii) run only the input files listed in `filename`, or iii) run only the input files that were updated since the last workflow run.

In [2]:
# root of the project area on dCache; all remote paths below are relative to it
remote_path_root = pathlib.Path('/pnfs/grid.sara.nl/data/projects.nl/eecolidar/01_Escience/')

# dCache path to a set of targets
remote_path_input = remote_path_root.joinpath(
    'ALS/Netherlands/ahn3_current/ahn3_current_TOP10NL_ud20200323_targets_all/point_density'
)

# dCache path where to copy the classified targets
remote_path_output = remote_path_root.joinpath(
    'ALS/Netherlands/ahn3_current/ahn3_current_TOP10NL_ud20200323_targets_classified'
)

# run mode, one of: 'all', 'updated', 'from_file'
run = 'from_file'
# if run is 'from_file', set name of file with input file names
filename = 'classification_failed.json'
assert run in ('all', 'updated', 'from_file')

In [3]:
# ask for the WebDAV credentials interactively so they never end up in the notebook source
webdav_login = input('WebDAV username: ')
webdav_password = getpass.getpass('WebDAV password: ')
if run == 'updated':
    # only the 'updated' mode needs a reference date to compare modification times against
    last_run_text = input('Date last run (YYYY-MM-DD): ')
    last_run = datetime.datetime.strptime(last_run_text, '%Y-%m-%d')

WebDAV username:  fnattin4
WebDAV password:  ············


## Check Connection to Remote Storage

In [4]:
# options handed to get_wdclient whenever a WebDAV client is needed
wd_opts = dict(
    webdav_hostname='https://webdav.grid.surfsara.nl:2880',
    webdav_login=webdav_login,
    webdav_password=webdav_password,
    webdav_timeout=200,
)
# fail early if the client's check on the remote root path does not succeed
wd_client = get_wdclient(wd_opts)
assert wd_client.check(remote_path_root.as_posix())

In [5]:
# list the remote input directory and keep only the entries named like tile files
tiles = [t for t in list_remote(get_wdclient(wd_opts), remote_path_input.as_posix())
         if fnmatch.fnmatch(t, 'tile_*_*.ply')]
print('Found: {} tiles'.format(len(tiles)))
if run == 'updated':
    # determine which tiles have been updated since last run
    tiles = [t for t in tiles if last_modified(wd_opts, remote_path_input/t) > last_run]
elif run == 'from_file':
    with open(filename, 'r') as f:
        tiles_read = json.load(f)
    # check whether all files are available on dCache; a set keeps each lookup
    # constant-time, and collecting the missing names makes the failure actionable
    tiles_available = set(tiles)
    tiles_missing = [t for t in tiles_read if t not in tiles_available]
    assert not tiles_missing, (f'Some of the files in {filename} are not in remote dir: '
                               f'{tiles_missing}')
    tiles = tiles_read
print('Retrieve and classify: {} tiles'.format(len(tiles)))

Found: 37457 tiles
Retrieve and classify: 1 tiles


## Setup Cluster

Set up the Dask cluster used for all the macro-pipeline calculations.

In [6]:
# Connect to custom client
from dask.distributed import Client

# address of the Dask scheduler to attach to
scheduler_address = 'node1:8786'
# directory handed to the pipelines as their temporary folder
local_tmp = pathlib.Path('/data/local/tmp')

client = Client(scheduler_address)
# leave the client as last expression so the notebook displays its summary
client

0,1
Client  Scheduler: tcp://node1:8786  Dashboard: /proxy/8787/status,Cluster  Workers: 20  Cores: 20  Memory: 337.31 GB


In [None]:
# local_tmp = pathlib.Path('/data/local/tmp')

# nprocs_per_node = 2  

# # start the cluster
# scheduler_node = 'node1'

# hosts = [f'node{i}' for i in range(1, 11)]
# cluster = SSHCluster(hosts=[scheduler_node] + hosts, 
#                      connect_options={'known_hosts': None, 
#                                       'username': 'ubuntu', 
#                                       'client_keys': '/home/ubuntu/.ssh/id_rsa'},
#                      worker_options={'nthreads': 1, 
#                                      'nprocs': nprocs_per_node,
#                                      'memory_limit': 0,
#                                      'local_directory': local_tmp/'dask-worker-space'}, 
#                      scheduler_options={'dashboard_address': '8787'})
# cluster

## Classify Target Points

Classify the target points according to the ground type, using Cadastre data.

In [7]:
# path where the shapefiles extracted from the cadastre data are available
shp_dir = '/data/local/home/eecolidar_webdav/01_Escience/TOP10NL_GML_50d_Blokken_september_2018/TOP10NL_50d_Blokken_september_2018_shapefiles'

# setup input dictionary to configure the classification pipeline
# NOTE: for the classification we have mounted the dCache storage with rclone to access shp files
# (keyword order is kept as-is, since it fixes the order of the entries in the dictionary)
classification_input = dict(
    setup_local_fs={'tmp_folder': local_tmp.as_posix()},
    pullremote=remote_path_input.as_posix(),
    locate_shp={'shp_dir': shp_dir},
    classification={'ground_type': 1},
    export_point_cloud={},
    pushremote=remote_path_output.as_posix(),
    cleanlocalfs={},
)

# write input dictionary to JSON file
with open('classification.json', 'w') as json_file:
    json_file.write(json.dumps(classification_input))

In [8]:
macro = MacroPipeline()

# build one classification pipeline per tile: configure it with the shared
# input dictionary, then attach the WebDAV client options
pipelines = []
for tile in tiles:
    pipeline = Classification(tile)
    pipeline = pipeline.config(classification_input)
    pipeline = pipeline.setup_webdav_client(wd_opts)
    pipelines.append(pipeline)

# add pipeline list to macro-pipeline object and set the corresponding labels
# (labels are the tile file names without extension)
macro.tasks = pipelines
labels = [os.path.splitext(name)[0] for name in tiles]
macro.set_labels(labels)

#macro.setup_cluster(cluster=cluster)
macro.client = client

# run!
macro.run()

# save outcome results and write name of failed pipelines to file
macro.print_outcome(to_file='classification.out')
failed = macro.get_failed_pipelines()
if failed:
    with open('classification_failed.json', 'w') as failed_file:
        # labels had the extension stripped, so add it back to get file names
        failed_tiles = [pip.label + '.ply' for pip in failed]
        json.dump(failed_tiles, failed_file)
    # stop the notebook here so that the failure is not overlooked
    raise RuntimeError('Some of the pipelines have failed')

In [None]:
# Utility cell (not part of the regular run): cancel the work known to a scheduler.
# NOTE(review): this rebinds the global `client` to a new connection with a
# hard-coded address that differs from the 'node1:8786' used earlier in the
# notebook - confirm that both point to the same scheduler.
from dask.distributed import Client, Future
client = Client('tcp://145.100.59.123:8786')
# wrap every key reported by who_has() in a Future so that it can be passed to cancel()
# NOTE(review): Future(key) is given no explicit client - presumably it binds to
# the most recently created one; verify against the dask.distributed docs.
futures = [Future(key) for key in client.who_has().keys()]
client.cancel(futures)

## Terminate cluster

In [None]:
# macro.shutdown()

In [9]:
# close the Dask client that was attached to the macro-pipeline
macro.client.close()