In [1]:
import os

# Default AWS configuration that the Analysis Sandbox sets in the
# environment variables.
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# Public buckets cannot be accessed while these credentials are present in
# the environment; requests fail with:
# PermissionError: The AWS Access Key Id you provided does not exist in our records.
# Remove each variable, silently skipping the ones that are not set.
for key in aws_default_config:
    os.environ.pop(key, None)

In [2]:
import logging
import math
import os
from importlib import import_module

import click
import fsspec
import geopandas as gpd
import pandas as pd

from deafrica_waterbodies.attributes import (
    add_area_and_perimeter_attributes,
    add_timeseries_attribute,
    assign_unique_ids,
)
from deafrica_waterbodies.cli.logs import logging_setup
from deafrica_waterbodies.filters import filter_by_area
from deafrica_waterbodies.io import (
    check_dir_exists,
    check_file_exists,
    check_if_s3_uri,
    find_parquet_files,
    write_waterbodies_to_file,
)
from deafrica_waterbodies.make_polygons import (
    merge_polygons_at_tile_boundaries,
    process_raster_polygons,
    set_wetness_thresholds,
)
from deafrica_waterbodies.plugins.utils import run_plugin, validate_plugin
from deafrica_waterbodies.tiling import get_wofs_ls_summary_alltime_tiles

In [3]:
# Run parameters for this notebook.

# Logging verbosity passed to logging_setup.
verbose = 1
# Vector file delimiting the area of interest; when None no area of interest
# is loaded and None is passed to the tiling step.
aoi_vector_file = None
# Tiling options passed to get_wofs_ls_summary_alltime_tiles.
tile_size_factor = 4
num_workers = 8
# Wetness thresholds passed to set_wetness_thresholds.
detection_threshold = 0.1
extent_threshold = 0.05
# Passed to process_raster_polygons for every tile.
min_valid_observations = 60
# Name of a module under deafrica_waterbodies.plugins, or None to run without
# a raster processing plugin.
raster_processing_plugin_name = "ocean_filtering_using_hydrosheds"
# Directory that receives the outputs written by this notebook.
output_directory = "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/historical_extent/"
# Whether existing per-tile polygon files are regenerated.
# This must be a real boolean: the flag is tested with `not overwrite`, and any
# non-empty string (including "False") is truthy, so a string value could
# never switch overwriting off.
overwrite = True
# Bounds passed to filter_by_area.
min_polygon_size = 4500
max_polygon_size = math.inf
# Passed to add_timeseries_attribute.
timeseries_directory = "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/surface_area_change/"
# Prefix used for the names of the final output files.
file_name_prefix = "waterbodies"
# Land/sea mask raster passed to process_raster_polygons.
land_sea_mask_fp = "/g/data/deafrica-waterbodies/masks/af_msk_3s.tif"

In [4]:
# Set up logger.
# Logging is configured through the project's logging_setup helper using the
# `verbose` parameter, then a logger named after the current module
# (`__name__`) is obtained for use by the remaining cells.
logging_setup(verbose=verbose)
_log = logging.getLogger(__name__)

In [5]:
# Chunk sizes, keyed by dimension name, handed to process_raster_polygons
# when the datasets are loaded.
dask_chunks = dict(x=3200, y=3200, time=1)

In [6]:
# Accept pathlib Paths by converting path-like parameters to plain strings.
# An unset area of interest stays None.
aoi_vector_file = None if aoi_vector_file is None else str(aoi_vector_file)
output_directory = str(output_directory)

In [7]:
# Choose the fsspec filesystem for the outputs: "s3" when check_if_s3_uri
# reports that the output directory is an S3 URI, the local "file"
# filesystem otherwise.
fs_protocol = "s3" if check_if_s3_uri(output_directory) else "file"
fs = fsspec.filesystem(fs_protocol)

In [8]:
# The per-tile water body polygons are written to a sub-directory of the
# output directory.
polygons_from_thresholds_dir = os.path.join(output_directory, "polygons_from_thresholds")

# Create the directory only when it is not there yet.
polygons_dir_exists = check_dir_exists(polygons_from_thresholds_dir)
if not polygons_dir_exists:
    fs.mkdirs(polygons_from_thresholds_dir, exist_ok=True)
    _log.info(f"Created directory {polygons_from_thresholds_dir}")

[2023-11-17 07:41:18,344] {credentials.py:611} INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [9]:
# Read the area of interest into a GeoDataFrame; without an area of interest
# file, aoi_gdf is None.
if aoi_vector_file is None:
    aoi_gdf = None
else:
    try:
        aoi_gdf = gpd.read_file(aoi_vector_file)
    except Exception as error:
        # Log the failure with its traceback, then let the error stop the cell.
        _log.exception(f"Could not read the file {aoi_vector_file}")
        raise error

In [10]:
# Get the tiles for the wofs_ls_summary_alltime product.
# `tiles` is iterated with `.items()` by later cells, which read a tile id
# from each key and a `geobox` from each value; `grid_workflow` provides the
# grid spec CRS and is passed on to the per-tile processing.
tiles, grid_workflow = get_wofs_ls_summary_alltime_tiles(
    aoi_gdf=aoi_gdf, tile_size_factor=tile_size_factor, num_workers=num_workers
)

[2023-11-17 07:41:18,606] {tiling.py:113} INFO - New tile size is (384000.0, 384000.0).
[2023-11-17 07:41:20,674] {tiling.py:132} INFO - Number of wofs_ls_summary_alltime tiles: 329


In [11]:
# Set the wetness thresholds.
# The detection and extent thresholds are combined by set_wetness_thresholds;
# the result is passed to process_raster_polygons for every tile.
min_wet_thresholds = set_wetness_thresholds(
    detection_threshold=detection_threshold, extent_threshold=extent_threshold
)

[2023-11-17 07:41:20,680] {make_polygons.py:69} INFO - We will be running a hybrid wetness threshold.
        You have set 0.1 as the location threshold, which will define the location of the water body polygons.
        You have set 0.05 as the extent threshold, which will define the extent/shape of the water body polygons.


In [12]:
# Resolve the optional raster processing plugin that supplies the filters
# applied during raster processing.
if raster_processing_plugin_name is None:
    plugin = None
else:
    # Locate the plugin's source file by importing it as a Python module.
    plugin_file = import_module(
        f"deafrica_waterbodies.plugins.{raster_processing_plugin_name}"
    ).__file__
    plugin = run_plugin(plugin_file)
    _log.info(f"Using plugin {plugin_file}")
    validate_plugin(plugin)

[2023-11-17 07:41:20,708] {136687532.py:7} INFO - Using plugin /home/jovyan/dev/deafrica-waterbodies/deafrica_waterbodies/plugins/ocean_filtering_using_hydrosheds.py


In [None]:
# Generate the first set of polygons for each of the tiles.
for tile in tiles.items():
    # Each item is a (tile id, tile) pair. The whole pair is what
    # process_raster_polygons receives; the id is only used for naming and logs.
    tile_id = tile[0]
    raster_polygons_fp = os.path.join(
        polygons_from_thresholds_dir, f"{tile_id[0]}_{tile_id[1]}_raster_polygons.parquet"
    )

    # When not overwriting, skip the tiles whose output file already exists.
    if not overwrite:
        _log.info(f"Checking existence of {raster_polygons_fp}")
        if check_file_exists(raster_polygons_fp):
            _log.info(
                f"{raster_polygons_fp} exists! \n Skipping generating water body polygons for {tile_id}."
            )
            continue

    try:
        _log.info(f"Generating water body polygons for tile {tile_id}.")
        raster_polygons = process_raster_polygons(
            tile=tile,
            grid_workflow=grid_workflow,
            plugin=plugin,
            dask_chunks=dask_chunks,
            min_valid_observations=min_valid_observations,
            min_wet_thresholds=min_wet_thresholds,
            land_sea_mask_fp=land_sea_mask_fp,
        )
        if raster_polygons.empty:
            _log.info(f"Tile {str(tile_id)} contains no water body polygons.")
        else:
            # Drop the attributes column if it exists.
            raster_polygons.drop(columns=["attribute"], errors="ignore", inplace=True)
            # Write the polygons to parquet files.
            raster_polygons.to_parquet(raster_polygons_fp)
            _log.info(
                f"Tile {str(tile_id)} water body polygons written to {raster_polygons_fp}"
            )
    except Exception:
        # Best effort: a failing tile must not stop the remaining tiles.
        # Logger.exception already appends the active exception and its
        # traceback, so one call is enough to record the failure.
        _log.exception(f"\nTile {str(tile_id)} did not run. \n")

[2023-11-17 07:41:20,715] {1473515291.py:18} INFO - Generating water body polygons for tile (42, 24).


In [None]:
# Collect the id and the extent geometry of every tile into a GeoDataFrame
# that uses the CRS of the grid spec.
crs = grid_workflow.grid_spec.crs
tile_ids = [tile_key for tile_key, _ in tiles.items()]
tile_extents_geoms = [tile_value.geobox.extent.geom for _, tile_value in tiles.items()]
tile_extents_data = {"tile_id": tile_ids, "geometry": tile_extents_geoms}
tile_extents_gdf = gpd.GeoDataFrame(tile_extents_data, crs=crs)

# Save the tile boundaries next to the other outputs.
tile_extents_fp = os.path.join(output_directory, "tile_boundaries.parquet")
tile_extents_gdf.to_parquet(tile_extents_fp)
_log.info(f"Tile boundaries written to {tile_extents_fp}")

In [None]:
# Find all parquet files for the first set of polygons.
# The search runs in the per-tile polygons directory; `pattern` is written as
# a regular expression matching names that contain "raster_polygons"
# (presumably applied to the file names or paths — confirm in find_parquet_files).
raster_polygon_paths = find_parquet_files(
    path=polygons_from_thresholds_dir, pattern=".*raster_polygons.*"
)
_log.info(f"Found {len(raster_polygon_paths)} parquet files for the raster polygons.")

In [None]:
# Read every per-tile parquet file and stack the results into one
# GeoDataFrame with a fresh index.
_log.info("Loading the raster polygons parquet files..")
raster_polygon_polygons_list = [
    gpd.read_parquet(parquet_path) for parquet_path in raster_polygon_paths
]

raster_polygons = pd.concat(raster_polygon_polygons_list, ignore_index=True)
_log.info(f"Found {len(raster_polygons)} raster polygons.")

In [None]:
# Destination of the merged polygons.
raster_polygons_merged_fp = os.path.join(
    output_directory, "raster_polygons_merged_at_tile_boundaries.parquet"
)

# Merge the polygons located at tile boundaries, using the tile extents.
_log.info("Merging raster waterbody polygons located at tile boundaries...")
raster_polygons_merged = merge_polygons_at_tile_boundaries(raster_polygons, tile_extents_gdf)
# Remove the "attribute" column when present; a missing column is not an error.
raster_polygons_merged.drop(columns=["attribute"], errors="ignore", inplace=True)
merged_polygon_count = len(raster_polygons_merged)
_log.info(
    f"Raster polygons count after merging polygons at tile boundaries {merged_polygon_count}."
)

# Persist the merged polygons.
_log.info("Writing raster polygons merged at tile boundaries to disk..")
raster_polygons_merged.to_parquet(raster_polygons_merged_fp)
_log.info(f"Polygons written to {raster_polygons_merged_fp}")

In [None]:
# Drop the references to the unmerged polygons and the tile extents to
# conserve memory; only the merged polygons are used from here on.
del raster_polygons, tile_extents_gdf

In [None]:
# Keep the merged polygons selected by filter_by_area for the configured
# minimum and maximum polygon size, then save the result.
area_filtered_raster_polygons = filter_by_area(
    raster_polygons_merged,
    min_polygon_size=min_polygon_size,
    max_polygon_size=max_polygon_size,
)
area_filtered_raster_polygons_fp = os.path.join(
    output_directory, "area_filtered_raster_polygons.parquet"
)
area_filtered_raster_polygons.to_parquet(area_filtered_raster_polygons_fp)

In [None]:
# Add the attributes of the final water bodies to the area-filtered polygons,
# in three steps: unique ids, area and perimeter, and a timeseries attribute.
# Each step takes the previous result and rebinds `waterbodies_gdf`.
waterbodies_gdf = assign_unique_ids(polygons=area_filtered_raster_polygons)
waterbodies_gdf = add_area_and_perimeter_attributes(polygons=waterbodies_gdf)
# NOTE(review): the timeseries attribute presumably points at per-waterbody
# files under `timeseries_directory` in the given AWS region — confirm in
# add_timeseries_attribute.
waterbodies_gdf = add_timeseries_attribute(
    polygons=waterbodies_gdf,
    timeseries_directory=timeseries_directory,
    region_code="af-south-1"
)

In [None]:
# Reproject to EPSG:4326
# The reprojected water bodies get a new name; `waterbodies_gdf` itself is
# not rebound by this cell.
waterbodies_gdf_4326 = waterbodies_gdf.to_crs("EPSG:4326")

In [None]:
# Write to disk.
# The EPSG:4326 water bodies are handed to the project's writer together with
# the output directory and the file name prefix.
# NOTE(review): the output format(s) are decided inside
# write_waterbodies_to_file and are not visible here.
write_waterbodies_to_file(
    waterbodies_gdf=waterbodies_gdf_4326,
    output_directory=output_directory,
    file_name_prefix=file_name_prefix,
)

In [None]:
# Also save the EPSG:4326 water bodies as "<file_name_prefix>.parquet" in the
# output directory.
# NOTE(review): confirm this does not duplicate a file already produced by
# write_waterbodies_to_file.
waterbodies_gdf_4326.to_parquet(os.path.join(output_directory, f"{file_name_prefix}.parquet"))