In [1]:
import math
import logging
import pandas as pd
import geopandas as gpd
    
from deafrica_waterbodies.cli.logs import logging_setup
from deafrica_waterbodies.filters import split_large_polygons

In [2]:
import os

# Credential variables that the Analysis Sandbox sets by default in the
# environment. Only the keys are used below; the values are placeholders.
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# A public bucket has to be read without AWS credentials. If these
# variables are left in the environment the following error occurs:
# PermissionError: The AWS Access Key Id you provided does not exist in our records.
for key in aws_default_config:
    # pop() with a default does nothing when the variable is not set.
    os.environ.pop(key, None)

In [3]:
# Directory the parquet files in this notebook are read from and written to.
output_directory = "s3://deafrica-waterbodies-dev/test_out_dir/raster_processing/continental"
# Alternative output location:
# output_directory = "s3://deafrica-waterbodies-dev/0-0-1/shapefile/"

# Verbosity level passed to `logging_setup`.
verbose = 1

# Method name and threshold passed to `split_large_polygons`.
handle_large_polygons = "erode-dilate-v2"
pp_test_threshold = 0.005

In [4]:
# Set up logging with the verbosity chosen in the parameters cell, then
# fetch the logger named after this module for use in the cells below.
logging_setup(verbose=verbose)
_log = logging.getLogger(__name__)

In [5]:
# Support pathlib paths: convert to a plain string, because the directory
# is combined with file names through os.path.join in the cells below.
output_directory = str(output_directory)

In [6]:
# Read the merged raster polygons file from the output directory.
_log.info("Loading raster polygons...")

raster_polygons_fp = os.path.join(
    output_directory, "raster_polygons_merged_at_tile_boundaries.parquet"
)

# Drop the "attribute" column if it is present; errors="ignore" makes this
# a no-op when the column is missing.
raster_polygons = gpd.read_parquet(raster_polygons_fp).drop(
    columns=["attribute"], errors="ignore"
)

_log.info(f"Raster polygons count {len(raster_polygons)}.")

[2023-10-17 20:18:00,846] {4168478290.py:2} INFO - Loading raster polygons...
[2023-10-17 20:18:09,829] {4168478290.py:13} INFO - Raster polygons count 1075799.


### Run split on large polygons

In [7]:
# Polygons whose area is at least this value are treated as large.
# NOTE(review): the log message below reports m2, which assumes the
# polygons are in a projected CRS with metre units - confirm.
large_polygons_threshold = 10**8

is_large = raster_polygons.area >= large_polygons_threshold
large_polygons = gpd.GeoDataFrame(data=raster_polygons.loc[is_large])
_log.info(f"Count for polygons larger than {large_polygons_threshold} m2: {len(large_polygons)}")

# Write the selected polygons to the output directory.
large_polygons_fp = os.path.join(output_directory, "large_polygons.parquet")
large_polygons.to_parquet(large_polygons_fp)

[2023-10-17 20:18:30,496] {434086015.py:4} INFO - Count for polygons larger than 100000000 m2: 239


In [8]:
# Take the large polygons out of the full set, matching rows by index label.
large_polygons_idx = large_polygons.index.values
raster_polygons_large_removed = raster_polygons.drop(index=large_polygons_idx)

removed_count = len(raster_polygons) - len(raster_polygons_large_removed)
_log.info(f"{removed_count} large polygons removed from raster polygons.")

[2023-10-17 20:18:32,621] {2675327616.py:4} INFO - 239 large polygons removed from raster polygons.


In [None]:
%%time
# Split the large polygons with the method and threshold set in the
# parameters cell.
large_polygons_handled = split_large_polygons(
    waterbody_polygons=large_polygons,
    pp_test_threshold=pp_test_threshold,
    method=handle_large_polygons,
)

[2023-10-17 20:19:00,107] {filters.py:622} INFO - Splitting large polygons using the `erode-dilate-v2` method, using the threshold 0.005.
[2023-10-17 20:19:00,192] {filters.py:507} INFO - Splitting 73 polygons.


In [None]:
# Recombine the polygons that were left untouched with the output of the
# large-polygon handling; ignore_index=True gives the result a fresh index.
raster_polygons_large_poly_split = pd.concat(
    [raster_polygons_large_removed, large_polygons_handled],
    ignore_index=True,
)
_log.info(f"Polygon count after handling large polygons {len(raster_polygons_large_poly_split)}.")

# Write the combined polygons to the output directory.
raster_polygons_large_poly_split_fp = os.path.join(
    output_directory, "raster_polygons_large_polygons_handled.parquet"
)
raster_polygons_large_poly_split.to_parquet(raster_polygons_large_poly_split_fp)