In [1]:
import logging
import os

import click
import datacube
import fsspec
import geopandas as gpd
import pandas as pd
from deafrica_waterbodies.cli.logs import logging_setup
from deafrica_waterbodies.io import (
    check_dir_exists,
    check_file_exists,
    check_if_s3_uri,
    find_parquet_files,
)
from deafrica_waterbodies.make_polygons import (
    set_wetness_thresholds,
    process_raster_polygons,
    merge_polygons_at_tile_boundaries
)
from deafrica_waterbodies.tiling import (
    get_tiles,
)

In [2]:
# Default AWS configuration that the Analysis Sandbox sets through
# environment variables. Only the keys are used below.
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# A public bucket can only be accessed once these credentials are removed
# from the environment variables; otherwise the following error occurs:
# PermissionError: The AWS Access Key Id you provided does not exist in our records.
for key in aws_default_config:
    # pop() with a default removes the variable when set and is a no-op otherwise.
    os.environ.pop(key, None)

In [3]:
# Verbosity passed to the logging setup.
verbose = 1

# Input files. Use None instead of a path to run without an area of interest.
# aoi_vector_file = None
aoi_vector_file = "data/SenegalBasin.geojson"
land_sea_mask_fp = "data/af_msk_3s.tif"

# Tiling settings.
tile_size_factor = 4
num_workers = 16

# Thresholds used when generating the water body polygons.
detection_threshold: float = 0.1
extent_threshold: float = 0.05
min_valid_observations: int = 128

# Where results are written, and whether existing per-tile outputs are regenerated.
# output_directory = "s3://deafrica-waterbodies-dev/test_out_dir/raster_processing/continental"
output_directory = "s3://deafrica-waterbodies-dev/0-0-1/shapefile/"
overwrite = True

In [4]:
import xarray as xr
def filter_hydrosheds_land_mask(hydrosheds_land_mask: xr.DataArray) -> xr.DataArray:
    """
    Convert the HydroSHEDS Land Mask into a boolean mask.

    Indicator values: 1 = land, 2 = ocean sink, 3 = inland sink, 255 = no data.
    Pixels equal to 2 or 255 are False in the result; all other pixels are True.
    """
    is_no_data = hydrosheds_land_mask == 255
    is_ocean_sink = hydrosheds_land_mask == 2
    # Keep everything that is neither no-data nor ocean sink.
    return ~(is_no_data | is_ocean_sink)

In [5]:
# Set up logging using the `verbose` setting, then create the logger
# that the rest of the notebook writes to.
logging_setup(verbose=verbose)
_log = logging.getLogger(__name__)

In [6]:
# Support pathlib Paths by normalising both locations to plain strings.
# An unset area of interest (None) is left as None.
aoi_vector_file = None if aoi_vector_file is None else str(aoi_vector_file)
output_directory = str(output_directory)

In [7]:
# Chunk sizes, per dimension, to use when loading datasets.
dask_chunks = dict(x=3200, y=3200, time=1)

In [8]:
# Load the area of interest as a GeoDataFrame. When no vector file is
# configured there is nothing to read and aoi_gdf is None.
if aoi_vector_file is None:
    aoi_gdf = None
else:
    try:
        aoi_gdf = gpd.read_file(aoi_vector_file)
    except Exception as read_error:
        # Log the failure with its traceback, then let it propagate.
        _log.exception(f"Could not read the file {aoi_vector_file}")
        raise read_error

In [9]:
# Get the tiles for the wofs_ls_summary_alltime product.
tiles, grid_workflow = get_tiles(
    aoi_gdf=aoi_gdf,
    tile_size_factor=tile_size_factor,
    num_workers=num_workers,
)

[2023-10-18 17:18:47,412] {tiling.py:113} INFO - New tile size is (384000.0, 384000.0).
[2023-10-18 17:18:49,335] {tiling.py:132} INFO - Number of wofs_ls_summary_alltime tiles: 329


329it [00:01, 222.54it/s]


[2023-10-18 17:18:51,006] {tiling.py:167} INFO - Filtered out 320 tiles.
[2023-10-18 17:18:51,007] {tiling.py:168} INFO - Number of wofs_ls_summary_alltime tiles covering the area of interest: 9


In [10]:
# For testing only: uncomment the lines below to work with the first N tiles.
# import itertools
#
# Number of tiles to keep.
# N = 5
# Keep only the first N items of the tiles dictionary.
# tiles = dict(itertools.islice(tiles.items(), N))

In [11]:
# Directory, inside the output directory, to write the generated
# waterbody polygons to.
# NOTE(review): os.path.join uses the operating system's path separator and
# output_directory may be an S3 URI — confirm this only runs on POSIX systems.
polygons_from_thresholds_dir = os.path.join(output_directory, "polygons_from_thresholds")

In [12]:
# Pick the fsspec filesystem that matches the output location:
# "s3" for S3 URIs, otherwise the local "file" filesystem.
fs = fsspec.filesystem("s3" if check_if_s3_uri(polygons_from_thresholds_dir) else "file")

In [13]:
# Create the polygons directory when it is not there yet, and log that
# it was created.
output_dir_missing = not check_dir_exists(polygons_from_thresholds_dir)
if output_dir_missing:
    fs.mkdirs(polygons_from_thresholds_dir, exist_ok=True)
    _log.info(f"Created directory {polygons_from_thresholds_dir}")

[2023-10-18 17:18:51,230] {credentials.py:620} INFO - Found credentials in shared credentials file: ~/.aws/credentials
[2023-10-18 17:18:51,551] {3896921575.py:4} INFO - Created directory s3://deafrica-waterbodies-dev/0-0-1/shapefile/polygons_from_thresholds


In [14]:
# Derive the wetness thresholds from the configured detection and
# extent thresholds.
min_wet_thresholds = set_wetness_thresholds(
    detection_threshold=detection_threshold,
    extent_threshold=extent_threshold,
)

[2023-10-18 17:18:51,556] {make_polygons.py:70} INFO - We will be running a hybrid wetness threshold.
        You have set 0.1 as the location threshold, which will define the location of the water body polygons.
        You have set 0.05 as the extent threshold, which will define the extent/shape of the water body polygons.


In [15]:
# Generate the first set of polygons for each of the tiles.
# Every tile is handled independently: a failure is logged and the loop moves
# on to the next tile, so a single failing tile does not stop the run.
for tile in tiles.items():
    # Each item is a (tile_id, tile) pair; tile_id is indexed as a 2-element id.
    tile_id = tile[0]
    raster_polygons_fp = os.path.join(
        polygons_from_thresholds_dir,
        f"{tile_id[0]}_{tile_id[1]}_raster_polygons.parquet",
    )

    # Existing output only matters when it must not be overwritten. `exists`
    # is always bound so the skip decision never relies on short-circuiting.
    exists = False
    if not overwrite:
        _log.info(f"Checking existence of {raster_polygons_fp}")
        exists = check_file_exists(raster_polygons_fp)

    if exists:
        _log.info(f"{raster_polygons_fp} exists! \n Skipping generating water body polygons for {tile_id}.")
        continue

    try:
        _log.info(f"Generating water body polygons for tile {tile_id}.")
        raster_polygons = process_raster_polygons(
            tile=tile,
            grid_workflow=grid_workflow,
            dask_chunks=dask_chunks,
            min_valid_observations=min_valid_observations,
            min_wet_thresholds=min_wet_thresholds,
            land_sea_mask_fp=land_sea_mask_fp,
            filter_land_sea_mask=filter_hydrosheds_land_mask,
        )
        if raster_polygons.empty:
            _log.info(f"Tile {str(tile_id)} contains no water body polygons.")
        else:
            # Drop the attributes column (when present) without mutating the
            # frame returned by process_raster_polygons.
            raster_polygons = raster_polygons.drop(columns=["attribute"], errors="ignore")
            # Write the polygons to parquet files.
            raster_polygons.to_parquet(raster_polygons_fp)
            _log.info(f"Tile {str(tile_id)} water body polygons written to {raster_polygons_fp}")
    except Exception:
        # _log.exception appends the active traceback itself, so one call
        # records both the failing tile and the error that caused it.
        _log.exception(
            f"\nTile {str(tile_id)} did not run. \n"
        )

[2023-10-18 17:18:51,563] {2831178259.py:14} INFO - Generating water body polygons for tile (42, 24).
[2023-10-18 17:20:10,495] {2831178259.py:29} INFO - Tile (42, 24) water body polygons written to s3://deafrica-waterbodies-dev/0-0-1/shapefile/polygons_from_thresholds/42_24_raster_polygons.parquet
[2023-10-18 17:20:10,496] {2831178259.py:14} INFO - Generating water body polygons for tile (42, 25).
[2023-10-18 17:21:22,068] {2831178259.py:29} INFO - Tile (42, 25) water body polygons written to s3://deafrica-waterbodies-dev/0-0-1/shapefile/polygons_from_thresholds/42_25_raster_polygons.parquet
[2023-10-18 17:21:22,069] {2831178259.py:14} INFO - Generating water body polygons for tile (42, 23).
[2023-10-18 17:22:36,065] {2831178259.py:29} INFO - Tile (42, 23) water body polygons written to s3://deafrica-waterbodies-dev/0-0-1/shapefile/polygons_from_thresholds/42_23_raster_polygons.parquet
[2023-10-18 17:22:36,067] {2831178259.py:14} INFO - Generating water body polygons for tile (43, 23)

In [16]:
# Collect the extent geometry of every tile into a GeoDataFrame that uses
# the grid's CRS, then write it next to the other outputs.
crs = grid_workflow.grid_spec.crs
tile_ids = list(tiles.keys())
tile_extents_geoms = [tile_obj.geobox.extent.geom for tile_obj in tiles.values()]
tile_extents_gdf = gpd.GeoDataFrame(
    {"tile_id": tile_ids, "geometry": tile_extents_geoms},
    crs=crs,
)

tile_extents_fp = os.path.join(output_directory, "tile_boundaries.parquet")
tile_extents_gdf.to_parquet(tile_extents_fp)
_log.info(f"Tile boundaries written to {tile_extents_fp}")

[2023-10-18 17:30:39,592] {1768085930.py:12} INFO - Tile boundaries written to s3://deafrica-waterbodies-dev/0-0-1/shapefile/tile_boundaries.parquet


In [17]:
# Locate the per-tile parquet files written for the first set of polygons.
raster_polygon_paths = find_parquet_files(
    path=polygons_from_thresholds_dir,
    pattern=".*raster_polygons.*",
)
_log.info(f"Found {len(raster_polygon_paths)} parquet files for the raster polygons.")

# Read every file and stack the results into a single GeoDataFrame.
_log.info("Loading the raster polygons parquet files..")
raster_polygon_polygons_list = [
    gpd.read_parquet(parquet_path) for parquet_path in raster_polygon_paths
]
raster_polygons = pd.concat(raster_polygon_polygons_list, ignore_index=True)
_log.info(f"Found {len(raster_polygons)} raster polygons.")

# Merge the polygons located at tile boundaries, using the tile extents.
_log.info("Merging raster waterbody polygons located at tile boundaries...")
raster_polygons_merged = merge_polygons_at_tile_boundaries(raster_polygons, tile_extents_gdf)
_log.info(f"Raster polygons count after merging polygons at tile boundaries {len(raster_polygons_merged)}.")

# Persist the merged polygons in the output directory.
_log.info("Writing raster polygons merged at tile boundaries to disk..")
raster_polygons_output_fp = os.path.join(
    output_directory,
    "raster_polygons_merged_at_tile_boundaries.parquet",
)
raster_polygons_merged.to_parquet(raster_polygons_output_fp)
_log.info(f"Polygons written to {raster_polygons_output_fp}")

[2023-10-18 17:30:39,655] {1335075486.py:3} INFO - Found 8 parquet files for the raster polygons.
[2023-10-18 17:30:39,656] {1335075486.py:6} INFO - Loading the raster polygons parquet files..
[2023-10-18 17:30:41,159] {1335075486.py:13} INFO - Found 54030 raster polygons.
[2023-10-18 17:30:41,160] {1335075486.py:15} INFO - Merging raster waterbody polygons located at tile boundaries...
[2023-10-18 17:30:46,298] {1335075486.py:19} INFO - Raster polygons count after merging polygons at tile boundaries 53977.
[2023-10-18 17:30:46,299] {1335075486.py:21} INFO - Writing raster polygons merged at tile boundaries to disk..
[2023-10-18 17:30:46,974] {1335075486.py:27} INFO - Polygons written to s3://deafrica-waterbodies-dev/0-0-1/shapefile/raster_polygons_merged_at_tile_boundaries.parquet
