In [1]:
import logging
import os
import math

import geopandas as gpd

from deafrica_waterbodies.filters import filter_by_area
from deafrica_waterbodies.attributes import (
    add_area_and_perimeter_attributes,
    add_timeseries_attribute,
    assign_unique_ids,
)
from deafrica_waterbodies.cli.logs import logging_setup
from deafrica_waterbodies.io import write_waterbodies_to_file

In [2]:
import os

# Credential variables that the Analysis Sandbox sets by default in the
# environment. Only the keys of this mapping are used below.
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# A public bucket can only be accessed once these credentials have been
# removed from the environment; otherwise the following error occurs:
# PermissionError: The AWS Access Key Id you provided does not exist in our records.
for credential_name in aws_default_config:
    # pop() with a default does nothing when the variable is not set.
    os.environ.pop(credential_name, None)

In [3]:
# Logging verbosity handed to logging_setup().
verbose = 1

# Version string and file name prefix of the output product.
product_version = "0.0.1"
file_name_prefix = "senegalbasinwaterbodies"

# Directory the polygons are read from and the outputs are written to.
# Alternative location (disabled):
# output_directory = "s3://deafrica-waterbodies-dev/test_out_dir/raster_processing/continental"
output_directory = "s3://deafrica-waterbodies-dev/0-0-1/shapefile/"

# Bucket name passed on when the timeseries attribute is added.
timeseries_bucket = "deafrica-waterbodies-dev"

# Area bounds used when filtering the polygons.
min_polygon_size = 4500  # 5 pixels
max_polygon_size = math.inf  # no upper bound

In [4]:
# Set up logger.
# Configure logging using the `verbose` level chosen in the configuration cell.
logging_setup(verbose=verbose)
# Logger used by the remaining cells of this notebook.
_log = logging.getLogger(__name__)

In [5]:
# Support pathlib paths: coerce to a plain string before the value is handed
# to os.path.join() and write_waterbodies_to_file() in later cells.
# NOTE(review): str() of a pathlib path built from an "s3://..." URL collapses
# the double slash to "s3:/..." — confirm that only local paths are ever
# supplied as pathlib objects.
output_directory = str(output_directory)

In [6]:
_log.info("Loading raster polygons...")

# The raster waterbody polygons are stored as a parquet file inside the
# output directory.
raster_polygons_file_name = "raster_polygons_large_polygons_handled.parquet"
raster_polygons_fp = os.path.join(output_directory, raster_polygons_file_name)
raster_polygons = gpd.read_parquet(raster_polygons_fp)

# Report how many polygons were loaded.
_log.info(f"Raster polygons count {len(raster_polygons)}.")

[2023-10-18 21:10:20,808] {810598675.py:1} INFO - Loading raster polygons...
[2023-10-18 21:10:21,586] {810598675.py:7} INFO - Raster polygons count 55103.


In [7]:
# Filter the polygons by area using the configured minimum and maximum sizes.
area_filtered_raster_polygons = filter_by_area(
    raster_polygons,
    min_polygon_size=min_polygon_size,
    max_polygon_size=max_polygon_size,
)

# Save the filtered polygons as parquet in the output directory.
area_filtered_raster_polygons_fp = os.path.join(output_directory, "area_filtered_raster_polygons.parquet")
area_filtered_raster_polygons.to_parquet(area_filtered_raster_polygons_fp)

[2023-10-18 21:10:21,804] {filters.py:117} INFO - Filtering 55103 polygons by minimum area 4500 and max area inf...
[2023-10-18 21:10:21,832] {filters.py:130} INFO - Filtered out 18801 polygons.


In [8]:
# Add the attributes in three steps; each step takes the previous step's
# result as its `polygons` input.
waterbodies_with_ids = assign_unique_ids(polygons=area_filtered_raster_polygons)
waterbodies_with_measurements = add_area_and_perimeter_attributes(polygons=waterbodies_with_ids)
waterbodies_gdf = add_timeseries_attribute(
    polygons=waterbodies_with_measurements,
    product_version=product_version,
    timeseries_bucket=timeseries_bucket,
)

In [9]:
# Reproject to EPSG:4326
# The result is bound to a new name; it is the object written out in the
# final cell.
waterbodies_gdf_4326 = waterbodies_gdf.to_crs("EPSG:4326")

In [10]:
# Write to disk.
# Writes the EPSG:4326 waterbodies to `output_directory`; the product version
# and file name prefix are the values set in the configuration cell.
write_waterbodies_to_file(
    waterbodies_gdf=waterbodies_gdf_4326,
    product_version=product_version,
    output_directory=output_directory,
    file_name_prefix=file_name_prefix,
)

[2023-10-18 21:11:07,812] {io.py:240} INFO - Waterbody polygons written to s3://deafrica-waterbodies-dev/0-0-1/shapefile/senegalbasinwaterbodiesv0.shp
