In [1]:
import logging
import os
import math

import geopandas as gpd

from deafrica_waterbodies.filters import filter_by_area
from deafrica_waterbodies.attributes import (
    add_area_and_perimeter_attributes,
    add_timeseries_attribute,
    assign_unique_ids,
)
from deafrica_waterbodies.cli.logs import logging_setup
from deafrica_waterbodies.io import write_waterbodies_to_file

In [2]:
import os

# Placeholder AWS credentials that the Analysis Sandbox sets by default
# in the environment variables.
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# A public bucket can only be accessed once these credentials are removed
# from the environment; otherwise the following error occurs:
# PermissionError: The AWS Access Key Id you provided does not exist in our records.
for key in aws_default_config:
    # pop() with a default is a no-op when the variable is not set.
    os.environ.pop(key, None)

In [3]:
# --- Run parameters -------------------------------------------------------

# Verbosity level handed to the logging setup.
verbose = 1

# Directory the notebook reads its input polygons from and writes results to.
output_directory = "s3://deafrica-waterbodies-dev/test_out_dir/raster_processing/continental"
# output_directory = "s3://deafrica-waterbodies-dev/0-0-1/shapefile/"

# Area bounds applied by the polygon area filter.
min_polygon_size, max_polygon_size = 4500, math.inf  # minimum is 5 pixels

# Product version and bucket used when building the timeseries attribute.
product_version = "0.0.1"
timeseries_bucket = "deafrica-waterbodies-dev"

# Prefix passed to the writer for the output file name.
file_name_prefix = "continental_waterbodies"

In [4]:
# Set up logger.
# Configure logging via the project helper using the `verbose` parameter,
# then fetch the logger used by the remaining cells (`_log`).
logging_setup(verbose=verbose)
_log = logging.getLogger(__name__)

In [5]:
# Support pathlib paths.
# Cast to str so the os.path.join calls below always receive a string.
# Re-running this cell is harmless: str() of a str returns the same value.
output_directory = str(output_directory)

In [6]:
_log.info("Loading raster polygons...")

# Path of the raster waterbody polygons inside the output directory.
raster_polygons_fp = os.path.join(
    output_directory,
    "raster_polygons_large_polygons_handled.parquet",
)

# Read the polygons and discard the "attribute" column when it is present;
# errors="ignore" turns the drop into a no-op if the column is missing.
raster_polygons = gpd.read_parquet(raster_polygons_fp).drop(
    columns=["attribute"],
    errors="ignore",
)

_log.info(f"Raster polygons count {len(raster_polygons)}.")

[2023-10-23 11:29:58,697] {13206279.py:1} INFO - Loading raster polygons...
[2023-10-23 11:30:04,868] {13206279.py:10} INFO - Raster polygons count 1078401.


In [7]:
# Preview the first rows of the loaded raster polygons.
raster_polygons.head()

Unnamed: 0,geometry
0,"POLYGON ((-1641390.000 1823040.000, -1641390.0..."
1,"POLYGON ((-1634430.000 1817610.000, -1634400.0..."
2,"POLYGON ((-1634100.000 1817100.000, -1634070.0..."
3,"POLYGON ((-1635450.000 1817370.000, -1635330.0..."
4,"POLYGON ((-1635630.000 1816980.000, -1635540.0..."


In [8]:
# Apply the area filter using the configured minimum and maximum sizes.
area_filtered_raster_polygons = filter_by_area(
    raster_polygons,
    min_polygon_size=min_polygon_size,
    max_polygon_size=max_polygon_size,
)

# Save the filtered polygons as parquet in the output directory.
area_filtered_fp = os.path.join(output_directory, "area_filtered_raster_polygons.parquet")
area_filtered_raster_polygons.to_parquet(area_filtered_fp)

[2023-10-23 11:30:04,886] {filters.py:117} INFO - Filtering 1078401 polygons by minimum area 4500 and max area inf...
[2023-10-23 11:30:05,249] {filters.py:130} INFO - Filtered out 387303 polygons.


In [9]:
# Build the attributed waterbodies in a single expression. The calls run
# innermost first, i.e. in the same order as applying them one by one:
#   1. assign unique ids to the area-filtered polygons,
#   2. add the area and perimeter attributes,
#   3. add the timeseries attribute for this product version and bucket.
waterbodies_gdf = add_timeseries_attribute(
    polygons=add_area_and_perimeter_attributes(
        polygons=assign_unique_ids(polygons=area_filtered_raster_polygons),
    ),
    product_version=product_version,
    timeseries_bucket=timeseries_bucket,
)

In [10]:
# Preview the first rows of the waterbodies with their added attributes.
waterbodies_gdf.head()

Unnamed: 0,geometry,area_m2,UID,WB_ID,perim_m,timeseries
0,"POLYGON ((-1087200.000 849749.999, -1087080.00...",11700.0,e9pvzx5t2,0,780.0,https://deafrica-waterbodies-dev.s3.af-south-1...
1,"POLYGON ((-1087140.000 850709.999, -1087080.00...",147599.9999,e9pvzxcs1,1,4260.0,https://deafrica-waterbodies-dev.s3.af-south-1...
2,"POLYGON ((-1085580.000 849989.999, -1085580.00...",6300.0,e9pvzzx9v,2,360.0,https://deafrica-waterbodies-dev.s3.af-south-1...
3,"POLYGON ((-1119570.000 888719.999, -1119570.00...",11700.0,e9pxxzp4y,3,720.0,https://deafrica-waterbodies-dev.s3.af-south-1...
4,"POLYGON ((-1119990.000 889679.999, -1119960.00...",44100.0,e9pxzb5ze,4,1680.0,https://deafrica-waterbodies-dev.s3.af-south-1...


In [11]:
# Show the full (untruncated) timeseries value of the first waterbody.
waterbodies_gdf["timeseries"].iloc[0]

'https://deafrica-waterbodies-dev.s3.af-south-1.amazonaws.com/0-0-1/timeseries/e9pv/e9pvzx5t2_v0.csv'

In [12]:
# Reproject the waterbodies to EPSG:4326; the result is a new GeoDataFrame
# and `waterbodies_gdf` keeps its original CRS.
waterbodies_gdf_4326 = waterbodies_gdf.to_crs(epsg=4326)

In [13]:
# Write to disk.
# The reprojected (EPSG:4326) waterbodies are handed to the project writer,
# which builds the output file name from `file_name_prefix` and
# `product_version` and saves it under `output_directory`.
# NOTE(review): the recorded log shows the name "continental_waterbodiesv0_0_1.shp",
# i.e. no separator between the prefix and the version — confirm this is intended.
write_waterbodies_to_file(
    waterbodies_gdf=waterbodies_gdf_4326,
    product_version=product_version,
    output_directory=output_directory,
    file_name_prefix=file_name_prefix,
)

[2023-10-23 11:33:29,557] {io.py:240} INFO - Waterbody polygons written to s3://deafrica-waterbodies-dev/test_out_dir/raster_processing/continental/continental_waterbodiesv0_0_1.shp
