In [1]:
import logging
import math
import os

import pandas as pd
import geopandas as gpd
from deafrica_waterbodies.cli.logs import logging_setup
from deafrica_waterbodies.filters import fill_holes, filter_by_area

In [2]:
# These are the default AWS configurations for the Analysis Sandbox
# that are set in the environment variables.
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# Public buckets can only be read once the AWS credentials listed above are
# gone from the environment variables; leaving them set produces:
# PermissionError: The AWS Access Key Id you provided does not exist in our records.

# Only the variable names matter here; pop() with a default is a no-op for
# any name that is not currently set.
for key in aws_default_config:
    os.environ.pop(key, None)

In [3]:
# Verbosity level handed to logging_setup when the logger is configured.
verbose = 1

# Location the merged raster polygons are read from and the hole-filled
# polygons are written back to.
output_directory = "s3://deafrica-waterbodies-dev/test_out_dir/raster_processing/continental"
# output_directory = "s3://deafrica-waterbodies-dev/0-0-1/shapefile/"

# Minimum polygon size passed to filter_by_area to select the "large"
# polygons (reported as m2 in the log messages).
large_polygons_threshold = 1_000_000

In [4]:
# Set up logger.
# logging_setup is the project's logging helper and receives the `verbose`
# setting chosen in the configuration cell; the notebook-level logger `_log`
# is fetched afterwards and is used for every progress message below.
# NOTE(review): the call order is kept as-is because what logging_setup
# configures is not visible from this notebook.
logging_setup(verbose=verbose)
_log = logging.getLogger(__name__)

In [5]:
# Support pathlib paths.
# Normalise the setting to a plain str; later cells join file names onto it
# with os.path.join.
# NOTE(review): an "s3://" location built with pathlib.Path would presumably
# lose one slash of the scheme separator when converted - confirm before
# passing a Path for S3 locations.
output_directory = str(output_directory)

In [6]:
# Read the merged raster polygons from the output directory.
_log.info("Loading polygons...")

raster_polygons_fp = os.path.join(output_directory, "raster_polygons_merged_at_tile_boundaries.parquet")

# Discard the "attribute" column when the file has one; errors="ignore"
# makes the drop a no-op when the column is absent.
raster_polygons = gpd.read_parquet(raster_polygons_fp).drop(
    columns=["attribute"],
    errors="ignore",
)

_log.info(f"Raster polygons count {len(raster_polygons)}.")

[2023-10-23 11:19:40,565] {3544693789.py:2} INFO - Loading polygons...
[2023-10-23 11:19:51,472] {3544693789.py:11} INFO - Raster polygons count 1075799.


In [7]:
# Select the polygons at or above the configured size threshold; the upper
# bound is infinite so nothing is excluded for being too big.
# NOTE(review): whether the lower bound is inclusive is decided inside
# filter_by_area - confirm there.
large_polygons = filter_by_area(
    raster_polygons,
    min_polygon_size=large_polygons_threshold,
    max_polygon_size=math.inf,
)
_log.info(f"Count for polygons larger than {large_polygons_threshold} m2: {len(large_polygons)}")

[2023-10-23 11:19:51,477] {filters.py:117} INFO - Filtering 1075799 polygons by minimum area 1000000 and max area inf...
[2023-10-23 11:19:51,796] {filters.py:130} INFO - Filtered out 1056504 polygons.
[2023-10-23 11:19:51,796] {640283564.py:3} INFO - Count for polygons larger than 1000000 m2: 19295


In [8]:
# Split the large polygons off from the rest, matching rows by index label.
# NOTE(review): this relies on filter_by_area returning rows that keep the
# index labels of raster_polygons - confirm in filters.py.
large_polygons_idx = large_polygons.index.values
raster_polygons_large_removed = raster_polygons.drop(
    labels=large_polygons_idx,
    axis="index",
)
_log.info(f"{len(raster_polygons) - len(raster_polygons_large_removed)} large polygons removed from raster polygons.")

[2023-10-23 11:19:51,929] {2675327616.py:4} INFO - 19295 large polygons removed from raster polygons.


In [9]:
%%time
# Run the project's fill_holes helper over every large-polygon geometry and
# store the results back on the same frame.
_log.info("Filling holes in large polygons...")
large_polygons.geometry = large_polygons.geometry.apply(fill_holes)

[2023-10-23 11:19:51,934] {<timed exec>:2} INFO - Filling holes in large polygons...
CPU times: user 17.8 s, sys: 353 ms, total: 18.2 s
Wall time: 18.1 s


In [10]:
# Recombine the untouched polygons with the hole-filled large polygons under
# a fresh 0..n-1 index, discarding the "area_m2" column if either input
# carried one.
raster_polygons_with_holes_filled = pd.concat(
    [raster_polygons_large_removed, large_polygons],
    ignore_index=True,
).drop(columns=["area_m2"], errors="ignore")

_log.info(f"Polygon count after filling holes in large polygons {len(raster_polygons_with_holes_filled)}.")

# Persist the result in the output directory.
holes_filled_fp = os.path.join(output_directory, "raster_polygons_with_holes_filled.parquet")
raster_polygons_with_holes_filled.to_parquet(holes_filled_fp)

[2023-10-23 11:20:10,201] {3685114858.py:6} INFO - Polygon count after filling holes in large polygons 1075799.
