In [1]:
import logging
import os

import geopandas as gpd
from deafrica_waterbodies.cli.logs import logging_setup
from deafrica_waterbodies.id_field import guess_id_field
from tqdm import tqdm

In [2]:
import os

# These are the default AWS configurations for the Analysis Sandbox.
# that are set in the environmnet variables.
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# To access public bucket, need to remove the AWS credentials in
# the environment variables or the following error will occur.
# PermissionError: The AWS Access Key Id you provided does not exist in our records.

for key in aws_default_config.keys():
    if key in os.environ:
        del os.environ[key]

In [3]:
# Set up logger.
logging_setup(verbose=1)
_log = logging.getLogger(__name__)

In [4]:
polygons_vector_file = "s3://deafrica-waterbodies-dev/test_out_dir/raster_processing/continental/continentalwaterbodies.parquet"
use_id = "UID"

In [5]:
# Read the vector file.
try:
    polygons_gdf = gpd.read_file(polygons_vector_file)
except Exception as error:
    _log.exception(f"Could not read file {polygons_vector_file}")
    _log.error(error)
    raise error
else:
    _log.info(f"Polygon count {len(polygons_gdf)}")

[2023-10-19 18:55:19,696] {3451265644.py:9} INFO - Polygon count 11654


In [6]:
id_field = guess_id_field(polygons_gdf, use_id)

# Set the ID field as the index.
polygons_gdf.set_index(id_field, inplace=True)

[2023-10-19 18:55:19,702] {id_field.py:64} INFO - Values in the column UID are unique.


In [7]:
%%time
polygons_to_delete = []
for row in polygons_gdf.itertuples():
    row_id = row.Index
    row_geom = row.geometry

    polygons_to_check_against = polygons_gdf.loc[polygons_gdf.index != row_id]

    # Check if the row geometry is within any of the other polygons.
    if polygons_to_check_against.geometry.contains(row_geom).any():
        polygons_to_delete.append(row_id)

CPU times: user 20.4 s, sys: 1.29 s, total: 21.7 s
Wall time: 21.7 s


In [8]:
# Get the parent directory of the polygons vector file.
dir_name = os.path.dirname(polygons_vector_file)
# Get the file name of the polygons vector file without the file extenstion.
base_name = os.path.splitext(os.path.basename(polygons_vector_file))[0]

In [9]:
polygons_to_delete_gdf = polygons_gdf.loc[polygons_gdf.index.isin(polygons_to_delete)]
_log.info(f"Found {len(polygons_to_delete_gdf)} polygons within polygons.")

polygons_to_delete_fp = os.path.join(dir_name, f"{base_name}_polygons_to_delete.parquet")
polygons_to_delete_gdf.to_parquet(polygons_to_delete_fp)

[2023-10-19 18:55:41,381] {2775308144.py:2} INFO - Found 1 polygons within polygons.


In [10]:
polygons_within_polygons_removed = polygons_gdf.loc[~polygons_gdf.index.isin(polygons_to_delete)]
_log.info(f"Polygon count after handling polygons within polygons {len(polygons_within_polygons_removed)}.")

polygons_within_polygons_removed_fp = os.path.join(dir_name, f"{base_name}_polygons_within_polygons_removed.parquet")
polygons_within_polygons_removed.to_parquet(polygons_within_polygons_removed_fp)

[2023-10-19 18:55:41,409] {421957777.py:2} INFO - Polygon count after handling polygons within polygons 11653.
