# Notebook 1 prepare postcard and load data to csv

The objective of this notebook is to prepare a geomad postcard for your AOI (masking, scaling and loading additional band ratios and spectral indices) and sampling all the datasets into a csv based on your training data geodataframe.

## Step 1.1: Configure the environment

In [1]:
import geopandas as gpd
import pandas as pd
from dask.distributed import Client as DaskClient
from odc.stac import configure_s3_access, load
from pystac.client import Client
from utils import calculate_band_indices, scale, texture
from masking import all_masks
import xarray as xr

In [2]:
# Reload scripts and imports
%load_ext autoreload
%autoreload 2

In [6]:
from pathlib import Path

folder = Path("training-data/turbidity/").expanduser()

files = folder.glob("*.geojson")

# geodataframes = []
# for f in files:
#     # gdf = gpd.read_file(f)
#     # Add the file name as a column
#     gdf = gpd.read_file(f).to_crs("epsg:4326").copy()
#     gdf["site_name"] = f.name.rstrip("_turbid.geojson")
#     geodataframes.append(gdf.to_crs("epsg:4326"))



In [7]:
for f in files:
    try:
        # --- METHOD 1: Using .copy() to break the chain ---
        # Read the file, transform the CRS, and explicitly create a new copy.
        gdf_copy_method = gpd.read_file(f).to_crs("epsg:4326").copy()
        
        # Now, you can safely add a new column to the copy.
        gdf_copy_method["site_name"] = f.name.rstrip("_turbid.geojson")
        
        geodataframes.append(gdf_copy_method)

        # --- METHOD 2: Using .loc to safely assign a value ---
        # This method is often preferred as it is the "correct" way to do this in Pandas.
        gdf_loc_method = gpd.read_file(f).to_crs("epsg:4326")
        
        # We use .loc to set the entire new column, which prevents the warning.
        gdf_loc_method.loc[:, "site_name"] = f.name.rstrip("_turbid.geojson")
        
        geodataframes.append(gdf_loc_method)

    except Exception as e:
        print(f"Failed to process {f.name}: {e}")

Failed to process png_gulf_region_west.geojson: name 'geodataframes' is not defined
Failed to process fj_labasa_turbid.geojson: name 'geodataframes' is not defined
Failed to process png_gulf_central_turbid.geojson: name 'geodataframes' is not defined
Failed to process fj_sigatoka_turbid.geojson: name 'geodataframes' is not defined
Failed to process png_east_sepik_turbid.geojson: name 'geodataframes' is not defined
Failed to process sols_western_turbid.geojson: name 'geodataframes' is not defined
Failed to process sols_mataniko_turbid.geojson: name 'geodataframes' is not defined
Failed to process png_fly_river.geojson: name 'geodataframes' is not defined
Failed to process png_gulf_central.geojson: name 'geodataframes' is not defined
Failed to process png_merauke.geojson: name 'geodataframes' is not defined
Failed to process png_fly_river_turbid.geojson: name 'geodataframes' is not defined
Failed to process png_west_sepik_turbid.geojson: name 'geodataframes' is not defined
Failed to proc

In [None]:
merged_gdf = pd.concat(geodataframes, ignore_index=True)
merged_gdf["coastal_class"] = merged_gdf["coastal_class"].fillna(merged_gdf["observed"])
merged_gdf

In [None]:
merged_gdf = merged_gdf.drop(columns=["id"])
merged_gdf.explore()

In [None]:
tdata = merged_gdf

In [None]:
tdata = tdata.rename(columns={'Class': 'coastal_class', 'coastal_cl': 'coastal_class', 'class': 'coastal_class'}, inplace=False)
print(tdata.columns.unique) 
tdata

In [None]:
# If your DataFrame has three columns all called 'coastal_class'
# we select them by filtering on column name
coastal_class = tdata.loc[:, tdata.columns == 'coastal_class']
 
# Merge them into one column (take the first non-null value per row)
tdata['coastal_class'] = coastal_class.bfill(axis=1).iloc[:, 0]
 
# Drop the duplicate ones (keeping only the new single column)
tdata = tdata.loc[:, ~tdata.columns.duplicated()]

In [None]:
tdata

In [None]:
print(tdata.columns.unique)

In [None]:
print(tdata['coastal_class'].value_counts())

In [None]:
# tdata = training_data_all 
tdata['observed'] = tdata['coastal_class'].replace('Land', 'land') #: This method is called on the Series (the column) and replaces all occurrences of 'old_value' with 'new_value'.
tdata['observed'] = tdata['coastal_class'].replace('Deeps', 'deeps') #: This method is called on the Series (the column) and replaces all occurrences of 'old_value' with 'new_value'.
tdata['observed'] = tdata['coastal_class'].replace('Turbidity', 'sediment') #: This method is called on the Series (the column) and replaces all occurrences of 'old_value' with 'new_value'.
# print(tdata.columns.unique)
tdata

In [None]:
tdata

In [62]:
# Define two lists
old_categories = ['sediment', 'sand', 'rubble', 'seagrass', 'algae', 'coral', 'rock', 'deeps', 'mangrove', 'land']
new_groups = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Create the mapping dictionary
category_mapping = dict(zip(old_categories, new_groups))

# Apply mapping
# tdata['observed_id'] = tdata['observed'].map(category_mapping)
tdata['observed'] = tdata['coastal_class'].map(category_mapping)

# Print the result
print(tdata.columns)

print(tdata['observed'].value_counts())
print('total gps points',(len(tdata)))

Index(['geometry', 'site_name', 'coastal_class', 'cc_id', 'observed'], dtype='object')
observed
1.0     194
8.0      85
10.0     32
Name: count, dtype: int64
total gps points 3046


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [13]:
# training_data_all = gpd.read_file("training-data/all_tdata_082025.geojson")
# training_data_all.explore(column="coastal_class")


In [14]:
training_data_all.head()

NameError: name 'training_data_all' is not defined

In [None]:
# merged_gdf = pd.concat(geodataframes, ignore_index=True)
# tdata["coastal_class"] = tdata["coastal_class"].fillna(tdata["observed"])

# tdata = tdata.drop(columns=["observed", "fid"])
tdata.to_file("all_tdata_10092025.geojson")

In [None]:
# Combine the cc_id and coastal_class to find any mistakes
test = training_data_all["cc_id"].astype(str) + "_" + training_data_all["coastal_class"].astype(str)

# Show unique combinations with count
unique = test.value_counts()
unique.sort_index()

## Step 1.2: Configure STAC access and search parameters

In [None]:
catalog = "https://stac.digitalearthpacific.org"
client = Client.open(catalog)

configure_s3_access(aws_unsigned=True)

In [None]:
def process_site(site_name, training_data, mask=True):
    site_name = site_name
    extent = training_data.total_bounds
    datetime = "2024"

    items = client.search(
        collections=["dep_s2_geomad"], bbox=extent, datetime=datetime
    ).item_collection()

    print(f"Found {len(items)} items in for {datetime} for {site_name}")

    data = load(
        items,
        bbox=extent,
        chunks=dict(x=2024, y=2024),
        fail_on_error=False,
        datetime=datetime,
        measurements=[
            "nir",
            "red",
            "blue",
            "green",
            "emad",
            "smad",
            "bcmad",
            "green",
            "nir08",
            "nir09",
            "swir16",
            "swir22",
            "coastal",
            "rededge1",
            "rededge2",
            "rededge3",
        ],
    )

    scaled_data = scale(data).squeeze(drop=True)
    loaded_data = scaled_data.compute()

    print("Loaded data into memory")

    data_indices = calculate_band_indices(loaded_data)
    texture_data = texture(data_indices.blue, levels=32).compute()
    combined_data = xr.merge([data_indices, texture_data])
    if mask:
        final_data = all_masks(combined_data)
    else:
        final_data = combined_data

    print("Finished preparing indices and masking data")

    # Reproject training data to the GeoMAD CRS and convert to xarray
    training_reprojected = training_data.to_crs(final_data.odc.crs)
    training_da = training_reprojected.assign(
        x=training_reprojected.geometry.x, y=training_reprojected.geometry.y
    ).to_xarray()

    # Extract training values from the masked dataset
    training_values = (
        final_data.sel(training_da[["x", "y"]], method="nearest")
        .squeeze()
        .compute()
        .to_pandas()
    )

    print("Finished extracting training values")

    # Join the training data with the extracted values and remove unnecessary columns
    training_array = pd.concat([training_data["cc_id"], training_values], axis=1)

    # Drop rows where there was no data available
    training_array = training_array.dropna()

    # training_array.drop(columns=["spatial_ref", "x", "y"], inplace=True)

    # Preview our resulting training array
    return training_array

In [None]:
masked = True

all_training_data = []
with DaskClient(n_workers=2, threads_per_worker=16, memory_limit='12GB') as dc:
    print(dc.dashboard_link)
    for site_name, group in training_data_all.groupby("site_name"):
        print(f"PROCESSING: {site_name}")
        extent = group.total_bounds
        training_data = process_site(site_name, group, mask=masked)
        all_training_data.append(training_data)
        training_data.to_csv(f"training-data/{site_name}{'' if masked else '-unmasked'}-training.csv", index=False)

# Combine all training data into a single DataFrame
combined_training_data = pd.concat(all_training_data, axis=0)
print(len(combined_training_data))

In [None]:
# Write out the combined training data
combined_training_data.to_csv(f"training-data/combined{'' if masked else '-unmasked'}-training.csv", index=False)