In [None]:
import logging
import os

import click
import datacube
import fsspec
import geopandas as gpd
import pandas as pd
from deafrica_waterbodies.cli.logs import logging_setup
from deafrica_waterbodies.io import (
    check_dir_exists,
    check_file_exists,
    check_if_s3_uri,
    find_parquet_files,
)
from deafrica_waterbodies.make_polygons import (
    check_wetness_thresholds,
    get_polygons_from_tile,
    merge_polygons_at_tile_boundaries
)
from deafrica_waterbodies.tiling import (
    filter_tiles,
    get_tiles_ids,
    tile_wofs_ls_summary_alltime,
)

In [None]:
import os

# Default AWS settings that the Analysis Sandbox sets in the environment
# variables. Only the keys are used below; the values are placeholders.
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# A public bucket can only be accessed once these credentials are removed
# from the environment; otherwise the following error occurs:
# PermissionError: The AWS Access Key Id you provided does not exist in our records.
for credential_name in aws_default_config:
    # pop() with a default is a no-op when the variable is not set.
    os.environ.pop(credential_name, None)

In [None]:
# --- Run parameters (edit these to change what the notebook produces) ---

# Where results are written, and whether existing per-tile files are regenerated.
output_directory = "s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile4"
overwrite = False

# Area of interest and tiling settings.
aoi_vector_file = "data/SenegalBasin.geojson"
tile_size_factor = 4  # passed to tile_wofs_ls_summary_alltime
num_workers = 16  # passed to filter_tiles

# Thresholds handed to the polygon generation step.
primary_threshold: float = 0.1
secondary_threshold: float = 0.05
minimum_valid_observations: int = 128

# Verbosity passed to logging_setup.
verbose = 1

In [None]:
# Set up logger.
# Configure logging with the verbosity chosen in the parameters cell, then
# fetch the logger that every later cell uses for its progress messages.
logging_setup(verbose=verbose)
_log = logging.getLogger(__name__)

In [None]:
# Support pathlib Paths by converting the inputs to plain strings.
# A missing area of interest must stay None: str(None) is the string "None",
# which would defeat the `aoi_vector_file is not None` check made when the
# area of interest is loaded and make the reader try to open a file named "None".
if aoi_vector_file is not None:
    aoi_vector_file = str(aoi_vector_file)
output_directory = str(output_directory)

In [None]:
# Chunking parameters to use when loading datasets
# (passed on as `dask_chunks` to the polygon generation step).
dask_chunks = dict(x=3200, y=3200, time=1)

In [None]:
# Read the area of interest into a GeoDataFrame.
# Without an area of interest file there is nothing to read, so the
# GeoDataFrame is left as None.
if aoi_vector_file is None:
    aoi_gdf = None
else:
    try:
        aoi_gdf = gpd.read_file(aoi_vector_file)
    except Exception as read_error:
        # Log the failure together with its traceback, then let it propagate.
        _log.exception(f"Could not read the file {aoi_vector_file}")
        raise read_error

In [None]:
# Tile the wofs_ls_summary_alltime product.
# `tiles` is iterated with .items() further down (tile id -> tile), and
# `grid_workflow` is handed to the polygon generation step and supplies the CRS.
tiles, grid_workflow = tile_wofs_ls_summary_alltime(tile_size_factor)

In [None]:
# Restrict the tiles to those selected for the area of interest.
filtered_tile_ids = filter_tiles(tiles, aoi_gdf, num_workers)

# Keep the (tile id -> tile) entries whose id is among the selected ids.
filtered_tiles = dict(
    (candidate_id, candidate_tile)
    for candidate_id, candidate_tile in tiles.items()
    if candidate_id in filtered_tile_ids
)

print(len(filtered_tiles))

In [None]:
# Directory to write generated waterbody polygons to.
# This is a sub-directory of the output directory; the per-tile parquet files
# for both thresholds are written here and searched for again later.
polygons_from_thresholds_dir = os.path.join(output_directory, "polygons_from_thresholds")

In [None]:
# Choose the fsspec filesystem matching the location of the polygons directory:
# "s3" when the path is an S3 URI, otherwise the local "file" filesystem.
fs_protocol = "s3" if check_if_s3_uri(polygons_from_thresholds_dir) else "file"
fs = fsspec.filesystem(fs_protocol)

In [None]:
# Make sure the polygons directory exists, creating it (and logging that) if needed.
polygons_dir_exists = check_dir_exists(polygons_from_thresholds_dir)
if not polygons_dir_exists:
    fs.mkdirs(polygons_from_thresholds_dir, exist_ok=True)
    _log.info(f"Created directory {polygons_from_thresholds_dir}")

In [None]:
# Validate the wetness thresholds and log whatever the check returns.
# The list is ordered secondary threshold first, primary threshold second.
minimum_wet_thresholds = [secondary_threshold, primary_threshold]
thresholds_check_result = check_wetness_thresholds(minimum_wet_thresholds)
_log.info(thresholds_check_result)

In [None]:
# Generate the primary and secondary threshold polygons for each of the tiles
# and store them as one parquet file per tile and per threshold.
for tile in filtered_tiles.items():
    # Each item is a (tile id, tile) pair; the first two id components name the files.
    tile_key = tile[0]
    primary_fp = os.path.join(
        polygons_from_thresholds_dir, f"{tile_key[0]}_{tile_key[1]}_primary_threshold_polygons.parquet"
    )
    secondary_fp = os.path.join(
        polygons_from_thresholds_dir, f"{tile_key[0]}_{tile_key[1]}_secondary_threshold_polygons.parquet"
    )

    # Unless overwriting, skip a tile whose two output files are both already present.
    if not overwrite:
        _log.info(f"Checking existence of {primary_fp} and {secondary_fp}")
        if check_file_exists(primary_fp) and check_file_exists(secondary_fp):
            continue

    tile_primary_polygons, tile_secondary_polygons = get_polygons_from_tile(
        tile=tile,
        grid_workflow=grid_workflow,
        dask_chunks=dask_chunks,
        min_valid_observations=minimum_valid_observations,
        primary_threshold=primary_threshold,
        secondary_threshold=secondary_threshold,
    )

    # Write the polygons to parquet files.
    tile_primary_polygons.to_parquet(primary_fp)
    tile_secondary_polygons.to_parquet(secondary_fp)

In [None]:
# Collect the extent geometry of every filtered tile into a GeoDataFrame
# that uses the CRS of the grid workflow's grid spec.
filtered_tiles_extents_geoms = [
    filtered_tile.geobox.extent.geom for filtered_tile in filtered_tiles.values()
]
crs = grid_workflow.grid_spec.crs
filtered_tiles_extents_gdf = gpd.GeoDataFrame(geometry=filtered_tiles_extents_geoms, crs=crs)

In [None]:
# Locate the per-tile parquet files holding primary threshold polygons.
primary_threshold_polygons_paths = find_parquet_files(
    path=polygons_from_thresholds_dir,
    pattern=".*primary.*",
)
primary_files_count = len(primary_threshold_polygons_paths)
_log.info(f"Found {primary_files_count} parquet files for the primary threshold polygons.")

In [None]:
# Read every primary threshold parquet file and stack the results into a
# single frame with a fresh, continuous index.
_log.info("Loading the primary threshold polygons parquet files..")
primary_threshold_polygons_list = [
    gpd.read_parquet(parquet_path) for parquet_path in primary_threshold_polygons_paths
]

primary_threshold_polygons = pd.concat(primary_threshold_polygons_list, ignore_index=True)
_log.info(f"Found {len(primary_threshold_polygons)} primary threshold polygons.")

In [None]:
# Merge the primary threshold polygons using the filtered tiles' extents,
# then report how many polygons remain.
_log.info("Merging primary threshold waterbody polygons located at tile boundaries...")
primary_threshold_polygons_merged = merge_polygons_at_tile_boundaries(
    primary_threshold_polygons,
    filtered_tiles_extents_gdf,
)
primary_merged_count = len(primary_threshold_polygons_merged)
_log.info(f"Primary threshold polygons count {primary_merged_count}.")

In [None]:
# Persist the merged primary threshold polygons as a single parquet file
# placed directly in the output directory (not in the per-tile sub-directory).
_log.info("Writing primary threshold polygons merged at tile boundaries to disk..")
primary_threshold_polygons_output_fp = os.path.join(
    output_directory, "primary_threshold_polygons_merged_at_tile_boundaries.parquet"
)

primary_threshold_polygons_merged.to_parquet(primary_threshold_polygons_output_fp)
_log.info(f"Polygons written to {primary_threshold_polygons_output_fp}")

In [None]:
# Locate the per-tile parquet files holding secondary threshold polygons.
secondary_threshold_polygons_paths = find_parquet_files(
    path=polygons_from_thresholds_dir,
    pattern=".*secondary.*",
)
secondary_files_count = len(secondary_threshold_polygons_paths)
_log.info(f"Found {secondary_files_count} parquet files for the secondary threshold polygons.")

In [None]:
# Read every secondary threshold parquet file and stack the results into a
# single frame with a fresh, continuous index.
_log.info("Loading the secondary threshold polygons parquet files...")
secondary_threshold_polygons_list = [
    gpd.read_parquet(parquet_path) for parquet_path in secondary_threshold_polygons_paths
]

secondary_threshold_polygons = pd.concat(secondary_threshold_polygons_list, ignore_index=True)
_log.info(f"Found {len(secondary_threshold_polygons)} secondary threshold polygons.")

In [None]:
# Merge the secondary threshold polygons using the filtered tiles' extents,
# then report how many polygons remain.
# The message names tile boundaries because the merge is done by
# merge_polygons_at_tile_boundaries with tile extents, the same as the primary
# threshold step; the earlier "dataset/scene boundaries" wording was misleading.
_log.info("Merging secondary threshold waterbody polygons located at tile boundaries...")
secondary_threshold_polygons_merged = merge_polygons_at_tile_boundaries(
    secondary_threshold_polygons, filtered_tiles_extents_gdf
)
_log.info(f"Secondary threshold polygons count {len(secondary_threshold_polygons_merged)}.")

In [None]:
# Persist the merged secondary threshold polygons as a single parquet file
# placed directly in the output directory (not in the per-tile sub-directory).
_log.info("Writing secondary threshold polygons merged at tile boundaries to disk..")
# NOTE(review): this file name says "ds_boundaries" while the log message and the
# primary threshold output say "tile_boundaries". Left unchanged because other
# steps may look the file up by this name; confirm before renaming.
secondary_threshold_polygons_output_fp = os.path.join(
    output_directory, "secondary_threshold_polygons_merged_at_ds_boundaries.parquet"
)

secondary_threshold_polygons_merged.to_parquet(secondary_threshold_polygons_output_fp)

_log.info(f"Polygons written to {secondary_threshold_polygons_output_fp}")