In [None]:
import logging
import math
import os

import click
import geopandas as gpd
import pandas as pd
from deafrica_waterbodies.cli.logs import logging_setup
from deafrica_waterbodies.filters import (
    #filter_by_area,
    filter_using_land_sea_mask,
    filter_using_major_rivers_mask,
    filter_using_urban_mask,
    merge_primary_and_secondary_threshold_polygons,
    split_large_polygons,
)

In [None]:
# Default AWS settings that the Analysis Sandbox places in the
# environment variables.
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# A public bucket can only be read once these credentials are gone from
# the environment; if they are left in place the request fails with:
# PermissionError: The AWS Access Key Id you provided does not exist in our records.
# Only the variable names matter here: each one is removed if it is set,
# whatever its current value.
for env_var in aws_default_config:
    os.environ.pop(env_var, None)

In [None]:
# --- Run configuration -------------------------------------------------
# Logging verbosity handed to logging_setup.
verbose = 1

# Directory the raster polygons are read from and the outputs are written to.
output_directory = "s3://deafrica-waterbodies-dev/test_out_dir/raster_processing/continental"

# Area bounds applied when filtering polygons by area.
min_polygon_size = 4500  # 5 pixels
max_polygon_size = math.inf

# Settings for splitting large polygons.
handle_large_polygons = "erode-dilate-v2"
pp_test_threshold = 0.005

# Mask file paths, left empty here.
# NOTE(review): not referenced by the cells in this notebook section —
# presumably inputs for the imported filter_using_*_mask helpers; confirm.
land_sea_mask_fp = ""
major_rivers_mask_fp = ""
urban_mask_fp = ""

In [None]:
# Set up logger.
# Logging is configured through the project's logging_setup helper using the
# `verbose` value from the configuration cell; `_log` is the logger that the
# later cells (and the notebook-local filter_by_area) write their messages to.
logging_setup(verbose=verbose)
_log = logging.getLogger(__name__)

In [None]:
# Support pathlib paths.
# Normalise output_directory to a plain string; later cells build file paths
# from it with os.path.join. Re-running this cell is harmless because str()
# of a string returns the same string.
output_directory = str(output_directory)

In [None]:
# Load the raster polygons from the parquet file in the output directory.
_log.info("Loading raster polygons...")

# The input file sits directly under output_directory.
raster_polygons_fp = os.path.join(output_directory, "raster_polygons_merged_at_tile_boundaries.parquet")
raster_polygons = gpd.read_parquet(raster_polygons_fp)

# Report how many polygons were read.
_log.info(f"Raster polygons count {len(raster_polygons)}.")

## To Do

1. Run the existing "erode-dilate-v2" splitting method on polygons with an area of at least 10^8. The code for this is below.
2. Fix holes in large polygons -- consider restricting this to the largest polygons only. You can see an example of the issue over Lake Victoria. This Stack Overflow page might be of use: https://stackoverflow.com/questions/63317410/how-to-fill-holes-in-multi-polygons-created-when-dissolving-geodataframe-with-ge
3. Remove polygons that don't meet the area requirements.
4. Ensure the polygons have all the required attributes (area, perimeter, timeseries csv)

### Run split on large polygons

In [None]:
# Pick out the polygons whose area is at least 10**8 and take an independent
# copy, so that working on the selection does not touch raster_polygons.
is_large_polygon = raster_polygons.area >= 10**8
larger_10tothe8 = raster_polygons.loc[is_large_polygon, :].copy()

In [None]:
# Split the large polygons. The threshold and the splitting method come from
# the configuration cell (pp_test_threshold, handle_large_polygons) rather
# than being repeated here as literals, so changing the configuration
# actually changes this step.
large_polygons_handled = split_large_polygons(
    waterbody_polygons=larger_10tothe8,
    pp_thresh=pp_test_threshold,
    method=handle_large_polygons,
)

In [None]:
# To Do: drop the selected polygons from the continental dataset 
# and add back in the newly split polygons. Export to s3

### Fix holes in large polygons

In [None]:
# To Do: investigate geopandas approach, otherwise, could do this in GIS

### Remove small polygons

The `filter_by_area` function has been redefined in this notebook so that it works on the combined raster polygons. The original function in `filters.py` could be updated once we know this version works as expected.

In [None]:
def filter_by_area(
    raster_polygons: gpd.GeoDataFrame | None,
    min_polygon_size: float = 4500,
    max_polygon_size: float = math.inf,
) -> gpd.GeoDataFrame | None:
    """
    Filter the raster polygons using the minimum and maximum area.

    A polygon is kept when its area is strictly greater than
    `min_polygon_size` and less than or equal to `max_polygon_size`.
    The input GeoDataFrame is left untouched; the returned GeoDataFrame
    carries an "area" column with the area of each retained polygon and has
    a fresh 0..n-1 index.

    Parameters
    ----------
    raster_polygons : gpd.GeoDataFrame | None
        The polygons to filter. Areas are taken from the `.area` attribute,
        so they are in the units of the GeoDataFrame's CRS.
        If None, nothing is filtered and None is returned.
    min_polygon_size : float, optional
        Exclusive lower bound on the area of a waterbody polygon to be
        included in the output polygons, by default 4500
    max_polygon_size : float, optional
        Inclusive upper bound on the area of a waterbody polygon to be
        included in the output polygons, by default math.inf

    Returns
    -------
    gpd.GeoDataFrame | None
        The area filtered raster polygons, or None if `raster_polygons`
        is None.
    """
    if raster_polygons is None:
        return None

    _log.info(
        f"Filtering raster polygons by minimum area {min_polygon_size} and max area {max_polygon_size}..."
    )

    # Compute the areas once, without writing them back onto the caller's frame.
    polygon_areas = pd.to_numeric(raster_polygons.area)
    within_bounds = (polygon_areas > min_polygon_size) & (polygon_areas <= max_polygon_size)

    # assign() and reset_index() both return new objects, so neither the input
    # nor an intermediate slice is modified in place. The areas are passed as a
    # plain array to avoid any index re-alignment.
    area_filtered_raster_polygons = (
        raster_polygons.loc[within_bounds]
        .assign(area=polygon_areas.loc[within_bounds].to_numpy())
        .reset_index(drop=True)
    )

    _log.info(
        f"Filtered out {len(raster_polygons) - len(area_filtered_raster_polygons)} raster polygons."
    )

    return area_filtered_raster_polygons

In [None]:
# To Do: Filter final cleaned polygons by area and export parquet to s3

# Keep only the polygons that fall within the configured area bounds.
area_filtered_raster_polygons = filter_by_area(
    raster_polygons=raster_polygons,
    min_polygon_size=min_polygon_size,
    max_polygon_size=max_polygon_size,
)

# Write the filtered polygons next to the input file in the output directory.
area_filtered_raster_polygons_fp = os.path.join(output_directory, "area_filtered_raster_polygons.parquet")
area_filtered_raster_polygons.to_parquet(area_filtered_raster_polygons_fp)

_log.info(f"Area filtered primary threshold polygons written to {area_filtered_raster_polygons_fp}")


### Add required attributes and then export to s3
Once the file is generated with the necessary attributes, ask Leon to upload it to the GeoServer (replace the existing senegal basin file, keeping the same name). 