In [None]:
import copy
import json 

from dask.distributed import Client, LocalCluster
from pathlib import Path
from urllib.request import urlretrieve


from lc_macro_pipeline.retiler import Retiler
from lc_macro_pipeline.data_processing import DataProcessing
from lc_macro_pipeline.geotiff_writer import Geotiff_writer
from lc_macro_pipeline.macro_pipeline import MacroPipeline

# Macro-ecology LiDAR point-cloud processing pipeline 

## 0. Data Retrieval and Cluster Setup

Files produced by the pipeline will be saved in the `temp_folder` directory. 

In [None]:
# Working directory: the test data set is looked up (or downloaded) here and
# the files produced by the pipeline are saved beneath it.
temp_folder = Path('/var/tmp')

We start by checking whether the test data set is available locally; if it is not, we retrieve it from the AHN3 repository.

In [None]:
# Test data set: AHN3 point-cloud file(s), expected in (or downloaded to)
# `temp_folder`.
testdata_files = ['C_41CZ2.LAZ']

file_paths = [temp_folder.joinpath(f) for f in testdata_files]

for file_path in file_paths:
    if file_path.is_file():
        # Already available locally: nothing to retrieve.
        continue
    file_url = '/'.join(['https://geodata.nationaalgeoregister.nl/ahn3/extract/ahn3_laz',
                         file_path.name])
    # Download under a temporary name and rename only on success, so that an
    # interrupted transfer never leaves a truncated file that the `is_file()`
    # check above would accept as valid data on the next run.
    partial_path = file_path.with_name(file_path.name + '.part')
    try:
        urlretrieve(file_url, partial_path)
        partial_path.replace(file_path)
    finally:
        # Only still present if the download or the rename failed.
        if partial_path.exists():
            partial_path.unlink()

We then set up the cluster that we will use for the computation using `dask`. For this example, the cluster consists of 3 processes. Note: it is important that single-threaded workers are employed for the tasks that require `laserchicken`!

In [None]:
# Local dask cluster of three worker processes, with `dask-worker-space`
# under `temp_folder` passed as the workers' local directory.
cluster = LocalCluster(
    n_workers=3,
    # single-threaded workers are required by the tasks that use `laserchicken`
    threads_per_worker=1,
    processes=True,
    local_directory=temp_folder / 'dask-worker-space',
)
client = Client(cluster)

## 1. Retiling

The first step in the pipeline is to retile the retrieved point-cloud files to a regular grid, splitting the original data into smaller chunks that are easier to handle for data processing. The boundaries of the grid and the number of tiles along each axis are set to:

In [None]:
# Boundaries of the regular retiling grid and number of tiles along each axis.
grid = dict(
    min_x=-113107.81,
    max_x=398892.19,
    min_y=214783.87,
    max_y=726783.87,
    n_tiles_side=256,
)

The retiling of multiple input files consists of independent tasks, which are thus efficiently parallelized. The input controlling all the steps of the retiling is organized in a dictionary.

In [None]:
# One Retiler task per input file; the tasks are collected in a MacroPipeline
# and run through the dask client.
retiling_macro = MacroPipeline()

for file_path in file_paths:
    print(file_path)

    retiler = Retiler()
    # Input dictionary controlling the retiling steps for this file.
    retiler.input = dict(
        localfs=dict(
            input_folder=file_path.parent.as_posix(),
            input_file=file_path.name,
            temp_folder=temp_folder,
        ),
        tiling=grid,
        split_and_redistribute=dict(),
        validate=dict(),
    )
    retiling_macro.add_task(retiler)

res = retiling_macro.run(client)

## 2. Feature Extraction

Once the files have been split into tiles of a manageable size, we proceed to the feature extraction stage, which is performed using `laserchicken`. We choose the following two example features:

In [None]:
# Names of the example features to extract from the point cloud.
feature_names = [
    "mean_normalized_height",
    "std_normalized_height",
]

The base input dictionary for this step looks like:

In [None]:
# Base (tile-independent) input for the data-processing tasks; tile-specific
# entries are added per tile when the tasks are created.
dp_input = {
    "normalize": {
        "cell_size": 1
    },
    "generate_targets": {
        # Reuse the retiling grid (bounds and number of tiles per side) instead
        # of repeating the literals, so the two definitions cannot drift apart.
        **grid,
        "tile_mesh_size": 10.0,
        "validate": True,
    },
    "extract_features": {
        "feature_names": feature_names,
        "volume_type": "cell",
        "volume_size": 10
    }
}

The tiles to which the original input file has been retiled are listed in a record file located in the temporary directory:

In [None]:
# Collect the paths of the tiles listed in the retiling record file of each
# input file.
tiles = []
for file_path in file_paths:
    tile_folder = temp_folder / file_path.stem
    record_file = '_'.join([file_path.stem, 'retile_record.js'])
    with (tile_folder / record_file).open() as f:
        record = json.load(f)
    # Explicit check instead of `assert`, which is skipped when Python runs
    # with optimizations enabled (`-O`).
    if not record['validated']:
        raise RuntimeError(
            'Retiling of {} is not marked as validated in {}'.format(
                file_path.name, record_file))
    tiles.extend(tile_folder / tile for tile in record['redistributed_to'])
print([t.as_posix() for t in tiles])

Each tile can be processed independently, so that again one can run the tasks in a parallel fashion.

In [None]:
# One DataProcessing task per tile; the tasks are collected in a MacroPipeline
# and run through the dask client.
dp_macro = MacroPipeline()

for tile in tiles:
    print(tile)

    dp = DataProcessing()

    # Work on a private copy of the base input, so that the shared `dp_input`
    # dictionary is never polluted with entries of a specific tile.
    tile_input = copy.deepcopy(dp_input)

    # add tile-specific input to the dictionary
    tile_input['load'] = {'path': tile.as_posix()}
    tile_input['export_targets'] = {'path': tile.with_suffix('.ply').as_posix(), 'overwrite': True}
    # The second and third `_`-separated fields of the tile name are used as
    # the x and y indices of the tile.
    name_fields = tile.name.split('_')
    tile_input['generate_targets']['index_tile_x'] = int(name_fields[1])
    tile_input['generate_targets']['index_tile_y'] = int(name_fields[2])
    dp.input = tile_input

    dp_macro.add_task(dp)

res = dp_macro.run(client)

## 3. GeoTIFF Export

The last step of the pipeline is the transformation of the features extracted from the point-cloud data and 'rasterized' in the target grid to a GeoTIFF file. We set the path where input will be read and output written:

In [None]:
# Folder named after the first input file; the GeoTIFF export reads its input
# from it and writes its output to it.
path = temp_folder / file_paths[0].stem

In this case, the construction of the geotiffs (one per feature) can be performed in parallel: 

In [None]:
# One Geotiff_writer task per feature; the tasks are collected in a
# MacroPipeline and run through the dask client.
geotiff_macro = MacroPipeline()

for feature_name in feature_names:
    print(feature_name)

    gw = Geotiff_writer()
    # Data is read from and GeoTIFFs are written to the same `path` folder;
    # each task exports a single band, the current feature.
    gw.input = dict(
        parse_point_cloud=dict(data_directory=path.as_posix()),
        data_split=dict(xSub=1, ySub=1),
        create_subregion_geotiffs=dict(
            outputdir=path.as_posix(),
            outputhandle="geotiff",
            band_export=[feature_name],
        ),
    )
    geotiff_macro.add_task(gw)

res = geotiff_macro.run(client)

Finally, we stop the client and the scheduler of the cluster.

In [None]:
# All pipeline stages have been run: stop the client and the scheduler of the
# cluster.
client.shutdown()