In [1]:
import logging

import geopandas as gpd

from deafrica_waterbodies.cli.logs import logging_setup
from deafrica_waterbodies.filters import filter_using_land_sea_mask

In [2]:
import os
# Default AWS settings that the Analysis Sandbox exports as environment
# variables. Only the keys are used below; the values are placeholders.
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# The public bucket is only reachable once these credentials are removed from
# the environment; leaving them in place produces:
# PermissionError: The AWS Access Key Id you provided does not exist in our records.
for key in aws_default_config:
    # pop() with a default is a no-op for variables that are not set.
    os.environ.pop(key, None)

In [3]:
# Verbosity level intended for the logging setup.
verbose = 1

# Directory (S3 prefix) under which the filtered polygons file is written.
output_directory = "s3://deafrica-waterbodies-dev/0-0-1/shapefile/"

# Vector file holding the water body polygons to be filtered.
polygons_vector_file = (
    "s3://deafrica-waterbodies-dev/0-0-1/shapefile/"
    "senegalbasinwaterbodiesv0_polygons_within_polygons_removed.parquet"
)

# Path of the land/sea mask passed to filter_using_land_sea_mask.
land_sea_mask_fp = "data/goas_v01.shp"

In [4]:
# Set up logger.
# Use the notebook-level `verbose` setting rather than a hard-coded 1, so that
# editing the configuration cell actually changes the logging verbosity.
logging_setup(verbose=verbose)
_log = logging.getLogger(__name__)

In [5]:
# Load the polygons
_log.info("Loading polygons...")

try:
    polygons_gdf = gpd.read_file(polygons_vector_file)
except Exception:
    # Logger.exception logs at ERROR level and appends the active traceback
    # (which includes the exception message), so no second error log is needed.
    _log.exception(f"Could not read file {polygons_vector_file}")
    # A bare `raise` re-raises the active exception unchanged.
    raise
else:
    _log.info(f"Polygons count {len(polygons_gdf)}.")

[2023-10-19 20:38:52,414] {3539245125.py:2} INFO - Loading polygons...
[2023-10-19 20:39:00,883] {3539245125.py:11} INFO - Polygons count 35009.


In [6]:
# Filter the loaded water body polygons using the land/sea mask file at
# `land_sea_mask_fp`; the helper logs how many polygons it filtered out.
filtered_polygons_gdf = filter_using_land_sea_mask(polygons_gdf, land_sea_mask_fp)

[2023-10-19 20:39:00,890] {filters.py:159} INFO - Filtering out ocean polygons from the water body polygons...
[2023-10-19 20:39:56,842] {filters.py:174} INFO - Filtered out 1111 water body polygons.


In [7]:
# Build the output location with an explicit "/" separator. `output_directory`
# may be an S3 URL, and os.path.join would insert the host OS separator
# (a backslash on Windows) whenever the directory lacks a trailing slash.
filtered_polygons_fp = f"{output_directory.rstrip('/')}/filtered_using_land_sea_mask.parquet"

In [8]:
# Write the filtered polygons to `filtered_polygons_fp` in Parquet format.
filtered_polygons_gdf.to_parquet(filtered_polygons_fp)