In [None]:
import logging
import os

import click
import datacube
import fsspec
import geopandas as gpd
import pandas as pd
from deafrica_waterbodies.cli.logs import logging_setup
from deafrica_waterbodies.io import (
    check_dir_exists,
    check_file_exists,
    check_if_s3_uri,
    find_parquet_files,
)
from deafrica_waterbodies.make_polygons import (
    set_wetness_thresholds,
    process_raster_polygons,
    merge_polygons_at_tile_boundaries
)
from deafrica_waterbodies.tiling import (
    filter_tiles,
    get_tiles_ids,
    tile_wofs_ls_summary_alltime,
)

In [None]:
# Default AWS configuration of the Analysis Sandbox, as set in its
# environment variables.
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# Public buckets cannot be read while these credentials are present in the
# environment; requests fail with
# PermissionError: The AWS Access Key Id you provided does not exist in our records.
# Remove each of them from the environment if it is set.
for credential_name in aws_default_config:
    os.environ.pop(credential_name, None)

In [None]:
# --- Run parameters -------------------------------------------------------
# Logging verbosity handed to logging_setup.
verbose = 1

# Area of interest. Set to None to skip loading an area of interest.
# aoi_vector_file = None
aoi_vector_file = "data/SenegalBasin.geojson"

# Tiling and parallelism (passed to tile_wofs_ls_summary_alltime and filter_tiles).
tile_size_factor = 4
num_workers = 16

# Thresholds (passed to set_wetness_thresholds and process_raster_polygons).
detection_threshold: float = 0.1
extent_threshold: float = 0.05
min_valid_observations: int = 128

# Land/sea mask raster passed to process_raster_polygons.
land_sea_mask_fp = "data/af_msk_3s.tif"

# Output location and whether existing per-tile outputs are regenerated.
# output_directory = "s3://deafrica-waterbodies-dev/test_out_dir/raster_processing/continental"
output_directory = "s3://deafrica-waterbodies-dev/0-0-1/shapefile/"
overwrite = False

In [None]:
import xarray as xr
def filter_hydrosheds_land_mask(hydrosheds_land_mask: xr.DataArray) -> xr.DataArray:
    """
    Convert the HydroSHEDS Land Mask into a boolean mask.

    The land mask uses the indicator values 1 = land, 2 = ocean sink,
    3 = inland sink and 255 = no data.

    Parameters
    ----------
    hydrosheds_land_mask : xr.DataArray
        The HydroSHEDS Land Mask indicator values.

    Returns
    -------
    xr.DataArray
        Mask that is True wherever the input is neither no data (255)
        nor ocean sink (2), and False elsewhere.
    """
    has_data = hydrosheds_land_mask != 255
    not_ocean_sink = hydrosheds_land_mask != 2
    return has_data & not_ocean_sink

In [None]:
# Set up logger.
# Logging is configured first through the project's logging_setup helper using
# the `verbose` value from the parameters cell; `_log` is the logger used by
# every cell below.
logging_setup(verbose=verbose)
_log = logging.getLogger(__name__)

In [None]:
# Accept pathlib Paths for both settings by converting them to plain strings.
# A missing area of interest (None) is left as None.
aoi_vector_file = None if aoi_vector_file is None else str(aoi_vector_file)
output_directory = str(output_directory)

In [None]:
# Chunk sizes used when loading datasets (passed on as `dask_chunks`).
dask_chunks = dict(x=3200, y=3200, time=1)

In [None]:
# Read the area of interest into a GeoDataFrame.
# Without an area of interest file, `aoi_gdf` is None.
if aoi_vector_file is None:
    aoi_gdf = None
else:
    try:
        aoi_gdf = gpd.read_file(aoi_vector_file)
    except Exception:
        # Log which file failed (with traceback), then let the error propagate.
        _log.exception(f"Could not read the file {aoi_vector_file}")
        raise

In [None]:
# Tile the wofs_ls_summary_alltime product.
# `tiles` is iterated below with .items() as (tile id, tile) pairs, and
# `grid_workflow` supplies the grid spec whose CRS is used for the tile extents.
# NOTE(review): presumably tile_size_factor scales the product's default tile
# size - confirm against tile_wofs_ls_summary_alltime.
tiles, grid_workflow = tile_wofs_ls_summary_alltime(tile_size_factor)

In [None]:
# Keep only the tiles whose ids filter_tiles returns for the area of interest.
filtered_tile_ids = filter_tiles(tiles, aoi_gdf, num_workers)
filtered_tiles = {
    tile_id: tile
    for tile_id, tile in tiles.items()
    if tile_id in filtered_tile_ids
}

# Report how many tiles were dropped and how many remain.
_log.info(f"Filtered out {len(tiles) - len(filtered_tiles)} tiles.")
_log.info(
    f"Number of wofs_ls_summary_alltime tiles covering the area of interest: {len(filtered_tiles)}"
)

In [None]:
# Directory to write generated waterbody polygons to.
# One "<x>_<y>_raster_polygons.parquet" file per tile is written into it by the
# polygon generation loop further down.
# NOTE(review): os.path.join is applied even when output_directory is an S3
# URI, which relies on the host using "/" as path separator - confirm this
# notebook only runs on POSIX hosts.
polygons_from_thresholds_dir = os.path.join(output_directory, "polygons_from_thresholds")

In [None]:
# Pick the fsspec filesystem matching the output location:
# "s3" for S3 URIs, the local "file" filesystem otherwise.
fs = fsspec.filesystem(
    "s3" if check_if_s3_uri(polygons_from_thresholds_dir) else "file"
)

In [None]:
# Check if the directory exists. If it does not, create it.
# The directory is created through the filesystem object selected above;
# exist_ok=True is passed so that mkdirs does not fail if the directory
# already exists by the time it is called.
if not check_dir_exists(polygons_from_thresholds_dir):
    fs.mkdirs(polygons_from_thresholds_dir, exist_ok=True)
    _log.info(f"Created directory {polygons_from_thresholds_dir}")

In [None]:
# Set the wetness thresholds from the detection and extent thresholds.
# The result is passed to process_raster_polygons as `min_wet_thresholds`.
# NOTE(review): presumably set_wetness_thresholds also validates the two
# values - confirm against its implementation.
min_wet_thresholds = set_wetness_thresholds(detection_threshold=detection_threshold, extent_threshold=extent_threshold)

In [None]:
# Generate the first set of polygons for each of the tiles.
#
# For every tile one parquet file is written to `polygons_from_thresholds_dir`.
# When `overwrite` is False, tiles whose output file already exists are skipped.
# A tile that raises is logged and skipped so that one bad tile does not stop
# the whole run; the ids of such tiles are collected in `failed_tile_ids` and
# reported once the loop has finished, because the merge step below would
# otherwise silently run without them.
failed_tile_ids = []
for tile in filtered_tiles.items():
    # `tile` stays the (tile id, tile) pair: process_raster_polygons is called
    # with the pair rather than with the unpacked tile.
    tile_id = tile[0]
    raster_polygons_fp = os.path.join(
        polygons_from_thresholds_dir,
        f"{tile_id[0]}_{tile_id[1]}_raster_polygons.parquet",
    )

    # Only look for an existing output when it may be reused. `exists` is
    # always bound so the skip decision does not depend on evaluation order.
    exists = False
    if not overwrite:
        _log.info(f"Checking existence of {raster_polygons_fp}")
        exists = check_file_exists(raster_polygons_fp)

    if exists:
        _log.info(f"{raster_polygons_fp} exists! \n Skipping generating water body polygons for {tile_id}.")
        continue

    try:
        _log.info(f"Generating water body polygons for tile {tile_id}")
        tile_raster_polygons = process_raster_polygons(
            tile=tile,
            grid_workflow=grid_workflow,
            dask_chunks=dask_chunks,
            min_valid_observations=min_valid_observations,
            min_wet_thresholds=min_wet_thresholds,
            land_sea_mask_fp=land_sea_mask_fp,
            filter_land_sea_mask=filter_hydrosheds_land_mask,
        )

        # Write the polygons to parquet files.
        tile_raster_polygons.to_parquet(raster_polygons_fp)

    except Exception:
        # Logger.exception already appends the active exception and its
        # traceback, so a single call is enough.
        _log.exception(f"\nDataset {str(tile_id)} did not run. \n")
        failed_tile_ids.append(tile_id)

if failed_tile_ids:
    _log.warning(
        f"{len(failed_tile_ids)} of {len(filtered_tiles)} tiles did not run "
        f"and have no polygons written: {failed_tile_ids}"
    )


In [None]:
# Collect the extent geometry of every filtered tile into a GeoDataFrame,
# using the CRS of the grid spec, and write it next to the other outputs.
crs = grid_workflow.grid_spec.crs
filtered_tiles_ids = list(filtered_tiles.keys())
filtered_tiles_extents_geoms = [
    tile_obj.geobox.extent.geom for tile_obj in filtered_tiles.values()
]
filtered_tiles_extents_gdf = gpd.GeoDataFrame(
    {"tile_id": filtered_tiles_ids, "geometry": filtered_tiles_extents_geoms},
    crs=crs,
)

filtered_tiles_extents_fp = os.path.join(output_directory, "tile_boundaries.parquet")

filtered_tiles_extents_gdf.to_parquet(filtered_tiles_extents_fp)
_log.info(f"Tile boundaries written to {filtered_tiles_extents_fp}")

In [None]:
# Find all parquet files for the first set of polygons.
# NOTE(review): presumably every matching file in the directory is returned,
# including files left by earlier runs over other areas - confirm that
# merging them with the current tile boundaries is intended.
raster_polygon_paths = find_parquet_files(path=polygons_from_thresholds_dir, pattern=".*raster_polygons.*")
_log.info(f"Found {len(raster_polygon_paths)} parquet files for the raster polygons.")

# Fail early with a clear message when there is nothing to merge (for example
# when every tile failed). pd.concat would otherwise raise a ValueError that
# does not mention the directory.
if len(raster_polygon_paths) == 0:
    no_files_msg = f"No raster polygons parquet files found in {polygons_from_thresholds_dir}"
    _log.error(no_files_msg)
    raise ValueError(no_files_msg)

# Load all polygons into a single GeoDataFrame.
_log.info("Loading the raster polygons parquet files..")
raster_polygon_polygons_list = [gpd.read_parquet(path) for path in raster_polygon_paths]

raster_polygons = pd.concat(raster_polygon_polygons_list, ignore_index=True)
_log.info(f"Found {len(raster_polygons)} raster polygons.")

# Merge the polygons at tile boundaries using the tile extents built above.
_log.info("Merging raster waterbody polygons located at tile boundaries...")
raster_polygons_merged = merge_polygons_at_tile_boundaries(
    raster_polygons, filtered_tiles_extents_gdf
)
_log.info(f"Raster polygons count {len(raster_polygons_merged)}.")

# Write the merged polygons to the output directory.
_log.info("Writing raster polygons merged at tile boundaries to disk..")
raster_polygons_output_fp = os.path.join(
    output_directory, "raster_polygons_merged_at_tile_boundaries.parquet"
)

raster_polygons_merged.to_parquet(raster_polygons_output_fp)
_log.info(f"Polygons written to {raster_polygons_output_fp}")