In [1]:
import copy
import json
import pathlib

from dask.distributed import Client, LocalCluster
from urllib.request import urlretrieve

from lc_macro_pipeline import Retiler, DataProcessing, GeotiffWriter
from lc_macro_pipeline import MacroPipeline

# Macro-ecology LiDAR point-cloud processing pipeline 

## 0. Data Retrieval and Cluster Setup

Files produced by the pipeline will be saved in the `tmp_folder` directory. 

In [2]:
# root directory under which every file produced by the pipeline is written
tmp_folder = pathlib.Path('/var', 'tmp')

We start by checking whether the test data set is available locally; if it is not, we retrieve it from the AHN3 repository.

In [3]:
# names of the point-cloud files that make up the test data set
testdata_files = ['C_41CZ2.LAZ']

# local paths where the test files are expected (or will be downloaded to)
file_paths = [tmp_folder/f for f in testdata_files]

# base URL of the remote repository; the file name is appended per download
ahn3_url = 'https://geodata.nationaalgeoregister.nl/ahn3/extract/ahn3_laz'

for file_path in file_paths:
    if file_path.is_file():
        # already available locally: nothing to retrieve
        continue
    # Download to a temporary name and rename only once the transfer has
    # completed. Writing directly to `file_path` would leave a truncated file
    # behind if the download is interrupted, and that file would then pass
    # the `is_file()` check above on the next run and never be re-fetched.
    partial_path = file_path.with_name(file_path.name + '.part')
    urlretrieve('/'.join([ahn3_url, file_path.name]), partial_path)
    partial_path.replace(file_path)

We then set up the cluster that we will use for the computation using `dask`. For this example, the cluster consists of 2 single-threaded worker processes. Note: it is important that single-threaded workers are employed for the tasks that require `laserchicken`!

In [4]:
# Options of the local dask cluster: separate worker processes, two workers,
# one thread per worker, scratch space kept under the pipeline's tmp folder.
cluster_options = {
    'processes': True,
    'n_workers': 2,
    'threads_per_worker': 1,
    'local_directory': tmp_folder/'dask-worker-space',
}
cluster = LocalCluster(**cluster_options)

## 1. Retiling

The first step in the pipeline is to retile the retrieved point-cloud files to a regular grid, splitting the original data into smaller chunks that are easier to handle for data processing. The boundaries of the grid and the number of tiles along each axis are set to:

In [5]:
# Retiling grid: bounding box of the grid and number of tiles along each axis.
grid = {}
# extent along x
grid['min_x'] = -113107.8100
grid['max_x'] = 398892.1900
# extent along y
grid['min_y'] = 214783.8700
grid['max_y'] = 726783.87
# number of tiles per side
grid['n_tiles_side'] = 256

The retiling of multiple input files consists of independent tasks, which are thus efficiently parallelized. The input controlling all the steps of the retiling is organized in a dictionary.

In [6]:
# directory that receives the retiled point-cloud data
retiling_out_path = tmp_folder/'retiled'

# Input of the retiling pipeline, assembled entry by entry (insertion order
# is the same as in a single dict literal).
retiling_input = {}
retiling_input['setup_local_fs'] = {'output_folder': retiling_out_path}
retiling_input['set_grid'] = grid
retiling_input['split_and_redistribute'] = {}
retiling_input['validate'] = {}

In [7]:
retiling_macro = MacroPipeline()

# one retiling task per input file, labelled with the file name (no suffix)
for laz_path in file_paths:
    retiling_task = Retiler(input_file=laz_path, label=laz_path.stem)
    retiling_task.config(retiling_input)
    retiling_macro.add_task(retiling_task)

# attach the macro-pipeline to the cluster created above
retiling_macro.setup_client(cluster=cluster)

# execute all tasks, then report the outcome of each one
retiling_macro.run()
retiling_macro.print_outcome()

001 C_41CZ2                        Completed


## 2. Feature Extraction

Once the files are split into tiles of a manageable size, we proceed to the feature extraction stage, which is performed using `laserchicken`. We choose the following two example features:

In [8]:
# features to be extracted from the point cloud (one GeoTIFF per feature later)
feature_names = [
    'mean_normalized_height',
    'std_normalized_height',
]

The base input dictionary for this step looks like:

In [9]:
# directory that receives the extracted features ("targets")
dp_out_path = tmp_folder/'targets'

# arguments of the individual data-processing steps
normalize_args = {'cell_size': 1}

# target-generation arguments, completed with the retiling grid definition
generate_targets_args = {'tile_mesh_size' : 10.0, 'validate' : True}
generate_targets_args.update(grid)

extract_features_args = {
    'feature_names': feature_names,
    'volume_type': 'cell',
    'volume_size': 10,
}

# base input dictionary of the data-processing pipeline
dp_input = {
    'setup_local_fs': {'output_folder': dp_out_path},
    'load': {},
    'normalize': normalize_args,
    'generate_targets': generate_targets_args,
    'extract_features': extract_features_args,
    'export_targets': {},
}

The tiles to which the original input file has been retiled are listed in a record file located in the retiling output directory:

In [10]:
# Collect the tile directories listed in the retiling record of each input file.
tiles = []
for laz_path in file_paths:
    record_name = '_'.join([laz_path.stem, 'retile_record.js'])
    record_path = retiling_out_path/record_name
    # the record file is JSON; read and parse it in one go
    record = json.loads(record_path.read_text())
    # only use tiles whose retiling has been validated
    assert record['validated']
    tiles.extend([retiling_out_path/tile_name
                  for tile_name in record['redistributed_to']])
print([tile_path.as_posix() for tile_path in tiles])

['/var/tmp/retiled/tile_169_107', '/var/tmp/retiled/tile_169_108', '/var/tmp/retiled/tile_169_106', '/var/tmp/retiled/tile_171_108', '/var/tmp/retiled/tile_170_108', '/var/tmp/retiled/tile_171_107', '/var/tmp/retiled/tile_170_107']


Each tile can be processed independently, so that again one can run the tasks in a parallel fashion.

In [11]:
dp_macro = MacroPipeline()

for tile in tiles:
    # tile directories are named like 'tile_169_107': skip the first
    # underscore-separated field and convert the remaining ones to integers
    index_fields = tile.name.split('_')[1:]
    tile_index = list(map(int, index_fields))
    processing_task = DataProcessing(input=tile,
                                     label=tile.name,
                                     tile_index=tile_index)
    processing_task.config(dp_input)
    dp_macro.add_task(processing_task)

# attach the macro-pipeline to the cluster created above
dp_macro.setup_client(cluster=cluster)

# execute all tasks, then report the outcome of each one
dp_macro.run()
dp_macro.print_outcome()

001 tile_169_107                   Completed
002 tile_169_108                   Completed
003 tile_169_106                   Completed
004 tile_171_108                   Completed
005 tile_170_108                   Completed
006 tile_171_107                   Completed
007 tile_170_107                   Completed


## 3. GeoTIFF Export

The last step of the pipeline is the transformation of the features extracted from the point-cloud data and 'rasterized' in the target grid to a GeoTIFF file. In this case, the construction of the geotiffs (one per feature) can be performed in parallel: 

In [12]:
# directory that receives the GeoTIFF files
gw_out_path = tmp_folder/'geotiffs'

# Input of the GeoTIFF export, assembled entry by entry (insertion order is
# the same as in a single dict literal). The features written by the previous
# stage are read from `dp_out_path`.
gw_input = {}
gw_input['setup_local_fs'] = {'input_folder': dp_out_path,
                              'output_folder': gw_out_path}
gw_input['parse_point_cloud'] = {}
gw_input['data_split'] = [1, 1]
gw_input['create_subregion_geotiffs'] = {'output_handle': 'geotiff'}

In [13]:
geotiff_macro = MacroPipeline()

# one export task per extracted feature, labelled with the feature name
for band in feature_names:
    export_task = GeotiffWriter(bands=band, label=band)
    export_task.config(gw_input)
    geotiff_macro.add_task(export_task)

# attach the macro-pipeline to the cluster created above
geotiff_macro.setup_client(cluster=cluster)

# execute all tasks, then report the outcome of each one
geotiff_macro.run()
geotiff_macro.print_outcome()

001 mean_normalized_height         Completed
002 std_normalized_height          Completed


Finally, we stop the client and the scheduler of the cluster.

In [14]:
# Shut down the local dask cluster used throughout the notebook.
# NOTE(review): the output recorded for this cell shows clients failing to
# reconnect to the scheduler after this call — presumably the clients created
# via `setup_client` are still open at this point. Consider closing them
# before the cluster; TODO confirm how MacroPipeline exposes its client.
cluster.close()

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError
