In [1]:
import logging
import math
import os

import geopandas as gpd
import pandas as pd

from deafrica_waterbodies.cli.logs import logging_setup
from deafrica_waterbodies.filters import split_large_polygons, filter_by_area

In [2]:
# Default AWS credentials that the Analysis Sandbox sets in the
# environment variables.
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# To access a public bucket, these credentials must be removed from the
# environment variables, otherwise the following error will occur:
# PermissionError: The AWS Access Key Id you provided does not exist in our records.
for key in aws_default_config:
    # pop() with a default is a no-op when the variable is not set, so the
    # cell can be re-run safely on a kernel where the keys are already gone.
    os.environ.pop(key, None)

In [3]:
# --- Run parameters -------------------------------------------------------

# Verbosity level handed to `logging_setup`.
verbose = 1

# Location the input parquet file is read from and the outputs are written to.
# Alternative location:
# output_directory = "s3://deafrica-waterbodies-dev/0-0-1/shapefile/"
output_directory = "s3://deafrica-waterbodies-dev/test_out_dir/raster_processing/continental"

# Polygons with an area of at least this many m2 are treated as "large".
large_polygons_threshold = 100_000_000  # 10**8

# Splitting method and threshold passed to `split_large_polygons`.
handle_large_polygons = "erode-dilate-v1"
pp_test_threshold = 0.005

In [4]:
# Set up logger.
# Configure logging through the project helper, using the `verbose` level
# chosen in the parameters cell, then get the logger used by the rest of
# this notebook.
logging_setup(verbose=verbose)
_log = logging.getLogger(__name__)

In [5]:
# Support pathlib paths.
# Coerce to `str` so the `os.path.join` calls below always receive a string.
# NOTE(review): a `pathlib.Path` built from an "s3://" URL collapses the
# double slash, so a URL passed as a Path would likely not survive this
# conversion intact - confirm before passing S3 locations as Path objects.
output_directory = str(output_directory)

In [6]:
# Load the raster polygons from the output directory.
_log.info("Loading raster polygons...")

raster_polygons_fp = os.path.join(output_directory, "raster_polygons_with_holes_filled.parquet")

# Read the file and drop the "attribute" column if it exists; the drop
# returns a new frame, so re-running this cell always starts from the file.
raster_polygons = gpd.read_parquet(raster_polygons_fp).drop(columns=["attribute"], errors="ignore")

_log.info(f"Raster polygons count {len(raster_polygons)}.")

[2023-10-18 20:57:38,824] {98882911.py:2} INFO - Loading raster polygons...
[2023-10-18 20:57:39,321] {98882911.py:8} INFO - Raster polygons count 53977.


### Run split on large polygons

In [7]:
# Select the large polygons: everything from `large_polygons_threshold`
# upwards, with no upper bound on the area.
large_polygons = filter_by_area(
    raster_polygons,
    min_polygon_size=large_polygons_threshold,
    max_polygon_size=math.inf,
)
_log.info(f"Count for polygons larger than {large_polygons_threshold} m2: {len(large_polygons)}")

# Save the large polygons next to the other outputs.
large_polygons_fp = os.path.join(output_directory, "large_polygons.parquet")
large_polygons.to_parquet(large_polygons_fp)

[2023-10-18 20:57:42,803] {filters.py:117} INFO - Filtering 53977 polygons by minimum area 100000000 and max area inf...
[2023-10-18 20:57:42,824] {filters.py:130} INFO - Filtered out 53961 polygons.
[2023-10-18 20:57:42,824] {2569074451.py:3} INFO - Count for polygons larger than 100000000 m2: 16


In [8]:
# Take the large polygons out of the raster polygons by index label, then
# drop the "area_m2" column from what remains if it is present.
large_polygons_idx = large_polygons.index.values
raster_polygons_large_removed = (
    raster_polygons
    .drop(index=large_polygons_idx)
    .drop(columns=["area_m2"], errors="ignore")
)
_log.info(f"{len(raster_polygons) - len(raster_polygons_large_removed)} large polygons removed from raster polygons.")

[2023-10-18 20:57:53,816] {36683088.py:5} INFO - 16 large polygons removed from raster polygons.


In [9]:
%%time
# Split the large polygons with the method and threshold configured in the
# parameters cell.
large_polygons_handled = split_large_polygons(
    waterbody_polygons=large_polygons,
    pp_test_threshold=pp_test_threshold,
    method=handle_large_polygons,
)

[2023-10-18 20:57:56,801] {filters.py:605} INFO - Splitting large polygons using the `erode-dilate-v1` method, using the threshold 0.005.
[2023-10-18 20:57:56,811] {filters.py:416} INFO - Splitting 7 polygons.
[2023-10-18 20:59:41,346] {filters.py:436} INFO - Polygon count after splitting using erode-dilate-v1 method: 1142
CPU times: user 1min 44s, sys: 174 ms, total: 1min 44s
Wall time: 1min 44s


In [10]:
# Recombine the remaining raster polygons with the newly split polygons,
# renumbering the index from zero.
raster_polygons_large_poly_split = pd.concat(
    [raster_polygons_large_removed, large_polygons_handled],
    ignore_index=True,
)
_log.info(f"Polygon count after handling large polygons {len(raster_polygons_large_poly_split)}.")

# Save the combined polygons to the output directory.
large_polygons_handled_fp = os.path.join(output_directory, "raster_polygons_large_polygons_handled.parquet")
raster_polygons_large_poly_split.to_parquet(large_polygons_handled_fp)

[2023-10-18 20:59:41,358] {2280308349.py:3} INFO - Polygon count after handling large polygons 55103.
