# Notebook 1: Prepare postcard and load data to CSV

The objective of this notebook is to prepare a GeoMAD postcard for your area of interest (AOI) by masking, scaling, and adding band ratios and spectral indices, and then to sample all of the datasets into a CSV at the locations in your training data GeoDataFrame.

## Step 1.1: Configure the environment

In [None]:
import os

import geopandas as gpd
import pandas as pd
import xarray as xr
from dask.distributed import Client as DaskClient
from odc.geo.geom import BoundingBox
from odc.stac import configure_s3_access, load
from pystac.client import Client

from masking import all_masks
from utils import calculate_band_indices, scale, texture

In [None]:
# Reload scripts and imports: with autoreload mode 2, edits to imported
# modules (e.g. the local `utils` and `masking` scripts) are picked up live,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# from pathlib import Path

# folder = Path("~/data/seagrass").expanduser()

# files = folder.glob("*.geojson")

# geodataframes = []
# for f in files:
#     gdf = gpd.read_file(f)
#     # Add the file name as a column
#     gdf["site_name"] = f.name.replace("_postcard.geojson", "")
#     geodataframes.append(gdf.to_crs("epsg:4326"))

# merged_gdf = pd.concat(geodataframes, ignore_index=True)
# merged_gdf["coastal_class"] = merged_gdf["coastal_class"].fillna(merged_gdf["observed"])

# merged_gdf = merged_gdf.drop(columns=["observed", "date", "observed_id", "id", "fid", "uuid"])

# # Update anything that is `algae` so that cc_id is 11
# merged_gdf.loc[merged_gdf["coastal_class"] == "algae", "cc_id"] = 11
# # Update anything that is `seagrass` to be class 4
# merged_gdf.loc[merged_gdf["coastal_class"] == "seagrass", "cc_id"] = 4
# # Land is 10
# merged_gdf.loc[merged_gdf["coastal_class"] == "land", "cc_id"] = 10
# # Deeps is 12
# merged_gdf.loc[merged_gdf["coastal_class"] == "deeps", "cc_id"] = 12
# # Sediment is 1
# merged_gdf.loc[merged_gdf["coastal_class"] == "sediment", "cc_id"] = 1
# # Seaweed is 5
# merged_gdf.loc[merged_gdf["coastal_class"] == "seaweed", "cc_id"] = 5
# # Rock is 13
# merged_gdf.loc[merged_gdf["coastal_class"] == "rock", "cc_id"] = 13
# # Sand is 2
# merged_gdf.loc[merged_gdf["coastal_class"] == "sand", "cc_id"] = 2
# # Coral is 6
# merged_gdf.loc[merged_gdf["coastal_class"] == "coral", "cc_id"] = 6
# # Mangrove is 9
# merged_gdf.loc[merged_gdf["coastal_class"] == "mangrove", "cc_id"] = 9

# merged_gdf.to_file("training_all.gpkg")

In [None]:
# Load the field training points.
field_points = gpd.read_file("~/data/seagrass/all_field_tdata_11092025.geojson")
# tulagi = gpd.read_file("~/data/seagrass/sols_tulagi_ws_postcard.geojson")
# tulagi["file_name"] = "sols_tulagi_ws"
# tulagi["coastal_class"] = tulagi["observed"]
# tulagi.drop(columns=["observed", "id"], inplace=True)

# Load the rivers points and copy their site label into `file_name`, the
# column that is renamed to `site_name` for the merged frame below.
rivers = gpd.read_file("~/data/seagrass/rivers_tdata_11092025_postcard.geojson")
rivers["file_name"] = rivers["site_name"]

# Merge both sources, drop the bookkeeping columns (including the old
# `site_name`), then expose `file_name` as the new `site_name`.
# Built as a single chained expression (no in-place mutation) so the cell
# gives the same result every time it is re-run.
training_data_all = (
    pd.concat([field_points, rivers], ignore_index=True)
    .drop(columns=["id", "index", "site_name"])
    .rename(columns={"file_name": "site_name"})
)

# Interactive map of the merged points, coloured by site.
training_data_all.explore(column="site_name")


In [None]:
# Preview the first rows of the training data to check its columns.
training_data_all.head()

In [None]:
# Report how many training rows are available.
print(len(training_data_all))

In [None]:
# Load the training data from the GeoPackage `all_data.gpkg`.
# NOTE(review): this rebinds `training_data_all`, so any value assigned to it
# by an earlier cell is discarded from here on. Confirm which source is the
# intended input for the rest of the notebook.
training_data_all = gpd.read_file("all_data.gpkg", driver="GPKG")

In [None]:
# Pair every cc_id with its `observed` label so that inconsistent
# code/label combinations stand out.
# NOTE(review): other parts of this notebook refer to a `coastal_class`
# column; confirm `observed` is the intended label column here.
cc_id_text = training_data_all["cc_id"].astype(str)
observed_text = training_data_all["observed"].astype(str)
test = cc_id_text + "_" + observed_text

# Count each distinct combination and display them ordered by label
unique = test.value_counts()
unique.sort_index()

## Step 1.2: Configure STAC access and search parameters

In [None]:
# Use unsigned (anonymous) requests for S3 access.
configure_s3_access(aws_unsigned=True)

# Open the STAC catalog; `client` is used by the search in `process_site`.
catalog = "https://stac.digitalearthpacific.org"
client = Client.open(catalog)

In [None]:
def process_site(site_name, training_data, mask=True, datetime="2024"):
    """Sample GeoMAD bands, indices and texture at one site's training points.

    Searches the notebook-level STAC ``client`` for ``dep_s2_geomad`` items
    covering the bounds of ``training_data``, loads and scales them, adds band
    indices and a texture layer computed from the blue band, optionally applies
    ``all_masks``, and extracts the nearest-pixel value of every layer at each
    training point.

    Parameters
    ----------
    site_name : str
        Label for the site; only used in progress messages.
    training_data : geopandas.GeoDataFrame
        Point geometries with a ``cc_id`` column. ``total_bounds`` is passed
        straight through as the search/load ``bbox``, so the frame is assumed
        to be in lon/lat (EPSG:4326) -- TODO confirm with callers.
    mask : bool, default True
        When true the combined dataset is passed through ``all_masks`` before
        sampling; otherwise it is sampled unmasked.
    datetime : str, default "2024"
        Date filter forwarded to both the STAC search and ``load``.

    Returns
    -------
    pandas.DataFrame
        ``cc_id`` joined column-wise (on the training data index) with the
        sampled values. Rows containing any missing value are dropped.
    """
    extent = training_data.total_bounds

    items = client.search(
        collections=["dep_s2_geomad"], bbox=extent, datetime=datetime
    ).item_collection()

    print(f"Found {len(items)} items in for {datetime} for {site_name}")

    data = load(
        items,
        bbox=extent,
        # NOTE(review): 2024 equals the default year string; possibly 2048 was
        # intended for the chunk size -- confirm.
        chunks=dict(x=2024, y=2024),
        fail_on_error=False,
        datetime=datetime,
        # Each band is requested exactly once ("green" used to be listed twice).
        measurements=[
            "nir",
            "red",
            "blue",
            "green",
            "emad",
            "smad",
            "bcmad",
            "nir08",
            "nir09",
            "swir16",
            "swir22",
            "coastal",
            "rededge1",
            "rededge2",
            "rededge3",
        ],
    )

    print(f"Loading array that is {data.dims}")

    # Scale, drop any length-1 dimensions, then pull the result into memory.
    scaled_data = scale(data).squeeze(drop=True)
    loaded_data = scaled_data.compute()

    print("Loaded data into memory")

    data_indices = calculate_band_indices(loaded_data)
    texture_data = texture(data_indices.blue, levels=32).compute()
    combined_data = xr.merge([data_indices, texture_data])
    final_data = all_masks(combined_data) if mask else combined_data

    print("Finished preparing indices and masking data")

    # Reproject training data to the GeoMAD CRS and convert to xarray
    training_reprojected = training_data.to_crs(final_data.odc.crs)
    training_da = training_reprojected.assign(
        x=training_reprojected.geometry.x, y=training_reprojected.geometry.y
    ).to_xarray()

    # Extract the nearest-pixel values at every training point
    sampled = final_data.sel(training_da[["x", "y"]], method="nearest")

    # Squeeze only length-1 dimensions that are NOT the point dimension.
    # A bare ``squeeze()`` would also collapse the point dimension when a site
    # has a single training point, so ``to_pandas()`` would return a Series and
    # the join below would misalign.
    point_dims = set(training_da["x"].dims)
    extra_dims = [
        dim
        for dim, size in sampled.sizes.items()
        if size == 1 and dim not in point_dims
    ]
    if extra_dims:
        sampled = sampled.squeeze(extra_dims)

    training_values = sampled.compute().to_pandas()

    print("Finished extracting training values")

    # Join the class ids with the extracted values (aligned on the index)
    training_array = pd.concat([training_data["cc_id"], training_values], axis=1)

    # Drop rows where there was no data available
    training_array = training_array.dropna()

    # training_array.drop(columns=["spatial_ref", "x", "y"], inplace=True)

    return training_array

In [None]:
# Whether the sampled data should be masked. This also selects the output
# file suffix and is reused when the combined CSV is written.
masked = False

# Create the output folder up front so the write cannot fail after the
# expensive per-site computation has finished.
os.makedirs("training-data", exist_ok=True)

all_training_data = []
with DaskClient(n_workers=2, threads_per_worker=16, memory_limit='12GB') as dc:
    print(dc.dashboard_link)
    for site_name, group in training_data_all.groupby("site_name"):
        print(f"PROCESSING: {site_name}")
        out_file = f"training-data/{site_name}{'' if masked else '-unmasked'}-training.csv"

        # Report the size of the site's bounding box (lon/lat bounds
        # reprojected to EPSG:6933 so the area is in square metres).
        extent = group.total_bounds
        bbox = BoundingBox(*extent, crs="epsg:4326")
        print(f"Area is: {bbox.polygon.to_crs('epsg:6933').area / 10000:.2f} ha")

        if os.path.exists(out_file):
            print(f"Skipping {site_name} as {out_file} already exists")
            # Read the cached table with pandas so numeric columns keep their
            # numeric dtype. Reading a CSV through geopandas goes via the OGR
            # CSV driver, which returns every column as text.
            training_data = pd.read_csv(out_file)
        else:
            training_data = process_site(site_name, group, mask=masked)
            # Only freshly computed results need to be written; cached files
            # are left untouched.
            training_data.to_csv(out_file, index=False)

        all_training_data.append(training_data)

# Combine all training data into a single DataFrame
combined_training_data = pd.concat(all_training_data, axis=0)
print(len(combined_training_data))

In [None]:
# Save the combined training table as a single CSV; the file name carries the
# same masked/unmasked suffix as the per-site files.
combined_out_file = f"training-data/combined{'' if masked else '-unmasked'}-training.csv"
combined_training_data.to_csv(combined_out_file, index=False)