In [None]:
import logging
import os
import posixpath

import click
import geopandas as gpd
import pandas as pd
from deafrica_waterbodies.cli.logs import logging_setup
from deafrica_waterbodies.io import find_parquet_files
from deafrica_waterbodies.make_polygons import merge_polygons_at_tile_boundaries

In [None]:
# Placeholder AWS credentials that the Analysis Sandbox sets by default
# in the environment variables.
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# Reading from a public bucket fails while these credentials are present:
#   PermissionError: The AWS Access Key Id you provided does not exist in our records.
# so drop each of them from the environment when it is set.
for credential_name in aws_default_config:
    os.environ.pop(credential_name, None)

In [None]:
# Verbosity level handed to `logging_setup` when the logger is configured.
verbose = 1
# Root output location; the input and output parquet paths used in this
# notebook are built by joining file or folder names onto it.
output_directory = "s3://deafrica-waterbodies-dev/test_out_dir/raster_processing/continental"

In [None]:
# Set up logger.
# `logging_setup` is a project helper imported from deafrica_waterbodies.cli.logs;
# it is called with the `verbose` value configured earlier in the notebook.
logging_setup(verbose=verbose)
# Logger used for the progress messages emitted by the following cells.
_log = logging.getLogger(__name__)

In [None]:
# Support pathlib paths: normalise `output_directory` to a plain string so it
# can be combined with file and folder names by string-based path functions.
# NOTE(review): a pathlib.Path built from an "s3://" URL collapses the "//"
# to "/", so S3 locations should be supplied as strings — confirm before
# relying on pathlib input for remote paths.
output_directory = str(output_directory)

In [None]:
# Directory containing the water body polygons generated from
# thresholding WOfS All time summary datasets.
# posixpath.join is used rather than os.path.join because the output directory
# is an s3:// URI, which must be separated with "/" on every operating system.
polygons_from_thresholds_dir = posixpath.join(output_directory, "polygons_from_thresholds")
# Parquet file holding the tile boundaries that are loaded for the merge step.
filtered_tiles_extents_gdf_fp = posixpath.join(output_directory, "tile_boundaries.parquet")

In [None]:
# Find all parquet files for the raster processed polygons.
# The search is rooted at `polygons_from_thresholds_dir` and is restricted by
# the pattern ".*raster_polygons.*".
# NOTE(review): presumably the pattern is a regular expression matched against
# the file paths — confirm against `find_parquet_files`.
raster_polygon_paths = find_parquet_files(path=polygons_from_thresholds_dir, pattern=".*raster_polygons.*")
_log.info(f"Found {len(raster_polygon_paths)} parquet files for the raster polygons.")

In [None]:
# Load all the polygons into a single GeoDataFrame.
# pd.concat raises an opaque "No objects to concatenate" ValueError when given
# an empty list, so fail early with a message that names the searched directory.
if len(raster_polygon_paths) == 0:
    raise ValueError(
        f"No raster polygons parquet files found in {polygons_from_thresholds_dir}."
    )

_log.info("Loading the raster polygons parquet files..")
# Read each parquet file into its own GeoDataFrame.
raster_polygon_polygons_list = [gpd.read_parquet(path) for path in raster_polygon_paths]

# Stack the per-file frames and renumber the rows with a fresh 0..n-1 index.
raster_polygons = pd.concat(raster_polygon_polygons_list, ignore_index=True)
_log.info(f"Found {len(raster_polygons)} raster polygons.")

In [None]:
# Load the tile boundaries from the parquet file in the output directory.
_log.info("Loading tile boundaries....")
filtered_tiles_extents_gdf = gpd.read_parquet(filtered_tiles_extents_gdf_fp)

# Hand the loaded polygons and the tile boundaries to the project helper
# `merge_polygons_at_tile_boundaries`.
# NOTE(review): presumably polygons that were split across neighbouring tiles
# are joined into single polygons — confirm against the helper's implementation.
_log.info("Merging raster waterbody polygons located at tile boundaries...")
raster_polygons_merged = merge_polygons_at_tile_boundaries(
    raster_polygons, filtered_tiles_extents_gdf
)
_log.info(f"Raster polygons count {len(raster_polygons_merged)}.")

In [None]:
# Save the merged polygons as a parquet file inside the output directory.
_log.info("Writing raster polygons merged at tile boundaries to disk..")
# posixpath.join always separates with "/", which the s3:// output URI
# requires regardless of the operating system the notebook runs on.
raster_polygons_output_fp = posixpath.join(
    output_directory, "raster_polygons_merged_at_tile_boundaries.parquet"
)

raster_polygons_merged.to_parquet(raster_polygons_output_fp)
_log.info(f"Polygons written to {raster_polygons_output_fp}")