<img src="https://github.com/nicholasmetherall/digital-earth-pacific-macblue-activities/blob/main/attachments/images/DE_Pacific_banner.JPG?raw=true" width="900"/>

Figure 1.1.a. Jupyter environment + Python notebooks

# Digital Earth Pacific Notebook 1 prepare postcard and load data to csv

The objective of this notebook is to prepare a geomad postcard for your AOI (masking, scaling and loading additional band ratios and spectral indices) and sampling all the datasets into a csv based on your training data geodataframe.

## Step 1.1: Configure the environment

In [43]:
from datetime import datetime
from shapely.geometry import Polygon
import folium
import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr
from ipyleaflet import basemaps
from numpy.lib.stride_tricks import sliding_window_view
import pystac_client
import planetary_computer
from odc.stac import load
from pystac.client import Client
from skimage.feature import graycomatrix, graycoprops
from utils import scale, do_prediction, calculate_band_indices, apply_masks, threshold_calc_land, threshold_calc_ds

In [32]:
# Predefined variable for title and version

# Enter your initials
initials = "agl"

# Enter your site name
site = "bootless"

# Date
date = datetime.now()

# Make a clean version string
version = f"{initials}-{site}-{date.strftime('%d%m%Y')}"
print(version)

agl-bootless-08072025


## Step 1.2: Configure STAC access and search parameters

In [33]:
catalog = "https://stac.digitalearthpacific.org"
client = Client.open(catalog)

mspc_catalogue = "https://planetarycomputer.microsoft.com/api/stac/v1/"
dem_collection = "cop-dem-glo-30"
elevation_threshold: float = 10.0

In [45]:
## Use training data bounds

training = gpd.read_file("training-data/bootless_bay.geojson")
training = training.to_crs("EPSG:4326")
min_lon, min_lat, max_lon, max_lat = training.total_bounds

bbox = [min_lon, min_lat, max_lon, max_lat]

min_lon, min_lat, max_lon, max_lat = bbox
bbox_polygon = Polygon([
    (min_lon, min_lat),
    (max_lon, min_lat),
    (max_lon, max_lat),
    (min_lon, max_lat),
    (min_lon, min_lat) # Close the polygon
])

# TODO: configure colours...
training.explore(column="observed")

In [35]:
## Use bounding box

# min_lon = 160.08855
# min_lat = -9.12915
# max_lon = 160.17137
# max_lat = -9.08003

# bbox = [min_lon, min_lat, max_lon, max_lat]

In [41]:
dem_stac_client = pystac_client.Client.open(mspc_catalogue)

search = dem_stac_client.search(
    collections=[dem_collection],
    intersects=bbox,
    max_items=1
)

item = next(search.get_items())
print(f"STAC item ID: {item.id}")

# Get signed asset URL
asset_href = planetary_computer.sign(item.assets["data"].href)
print(asset_href)


Exception: intersects must be of type None, str, dict, or an object that implements __geo_interface__

In [None]:
datetime = "2024"

items = client.search(
    collections=["dep_s2_geomad"],
    datetime=datetime,
    bbox=bbox
).item_collection()

print(f"Found {len(items)} items in for {datetime}")

In [7]:
data = load(
    items,
    measurements=[
        "nir",
        "red",
        "blue",
        "green",
        "emad",
        "smad",
        "bcmad",
        "count",
        "green",
        "nir08",
        "nir09",
        "swir16",
        "swir22",
        "coastal",
        "rededge1",
        "rededge2",
        "rededge3",
    ],
    bbox=bbox,
    chunks={"x": 2048, "y": 2048},
    groupby="solar_day",
)

scaled = (data.where(data != 0) * 0.0001).clip(0, 1)

# Load into memory
scaled = scaled.compute().squeeze()

In [8]:
# Incorporate band ratios and indices

# Modified Normalised Difference Water Index (MNDWI)
scaled["mndwi"] = (scaled["green"] - scaled["swir16"]) / (scaled["green"] + scaled["swir16"])

# Normalised Difference Turbidity Index (NDTI)
scaled["ndti"] = (scaled["red"] - scaled["green"]) / (scaled["red"] + scaled["green"])

# coastal aerosol index
scaled["cai"] = (scaled["coastal"] - scaled["blue"]) / (
    scaled["coastal"] + scaled["blue"]
)
# vegetation index (NDVI)
scaled["ndvi"] = (scaled["nir"] - scaled["red"]) / (
    scaled["nir"] + scaled["red"]
)
# enhanced vegetation index
scaled["evi"] = (2.5 * scaled["nir"] - scaled["red"]) / (
    scaled["nir"] + (6 * scaled["red"]) - (7.5 * scaled["blue"]) + 1
)
# soil adjusted vegetation index
scaled["savi"] = (scaled["nir"] - scaled["red"]) / (
    scaled["nir"] + scaled["red"]
)
# water index (NDWI)
scaled["ndwi"] = (
    (scaled["green"] - scaled["nir"])
    / (scaled["green"] + scaled["nir"] + 0.428)
    * (1 + 0.428)
)
# blue to green ratio
scaled["b_g"] = (scaled["blue"]) / (scaled["green"])
# blue to red ratio
scaled["b_r"] = (scaled["blue"]) / (scaled["red"])
# max chlorophlyll index (MCI)
scaled["mci"] = (scaled["nir"]) / (scaled["rededge1"])
# normalised difference chlorophyll index (NDCI)
scaled["ndci"] = (scaled["rededge1"] - scaled["red"]) / (
    scaled["rededge1"] + scaled["red"]
)
# Natural log of blue/green
scaled["ln_bg"] = np.log(scaled.blue / scaled.green)

In [9]:
# Explore the site we are working on
scaled.odc.explore(vmin=0, vmax=0.3, bands=["red", "green", "blue"], name=site)

In [10]:
## Moderate land mask
MNDWI_THRESHOLD = -0.2
mndwi_mask = scaled.mndwi > MNDWI_THRESHOLD

mndwi_mask.odc.explore()

In [22]:
# NDTI_THRESHOLD = -0.2
# ndti_mask = scaled.ndti > NDTI_THRESHOLD
# ndti_mask.odc.explore()

In [None]:
ln_bg = scaled_data['ln_bg'].values.flatten()  # flatten in case it's multi-dimensional
# Remove NaNs if present
ln_bg = ln_bg[~np.isnan(ln_bg)]
# Calculate mean and std
mean = ln_bg.mean()
std = ln_bg.std()


In [12]:
# Deep ocean mask
ln_bg_mask = scaled["ln_bg"] < 0

ln_bg_mask.odc.explore()

In [13]:
all_masks = mndwi_mask & ln_bg_mask

centroid = scaled.odc.geobox.geographic_extent.centroid.coords[0][::-1]
m = folium.Map(location=centroid, zoom_start=14)

scaled.odc.to_rgba(bands=["red", "green", "blue"], vmin=0, vmax=0.3).odc.add_to(m, name="RGB")
mndwi_mask.where(mndwi_mask == 0).odc.add_to(m, name="MNDWI Mask", vmin=0, vmax=1)
ndti_mask.where(ndti_mask == 0).odc.add_to(m, name="NDTI Mask", vmin=0, vmax=1)
ln_bg_mask.where(ln_bg_mask == 0).odc.add_to(m, name="ln_bg Mask", vmin=0, vmax=1)
all_masks.where(all_masks == 0).odc.add_to(m, name="All Masks", vmin=0, vmax=1)

folium.LayerControl().add_to(m)

m

In [14]:
# Now apply the mask, and view the masked region

masked = scaled.where(all_masks)
masked.odc.explore(
    vmin=0, vmax=0.3, bands=["red", "green", "blue"], name=f"{site}-masked", tiles=basemaps.Esri.WorldImagery
)

  return x.astype("uint8")


### GLCM texture analysis

The objective of this notebook was to train the machine learning model that will allow us to classify an area with land cover classes defined through the training data.

Step 1.2. Input the training data to sample geomad data from the postcard

In [15]:
WINDOW_SIZE = 9
LEVELS = 32

# Input
max = masked.blue.max().values
min = masked.blue.min().values
# Scale to 0-LEVELS for GLCM
img = ((masked.blue - min) / (max - min) * (LEVELS - 1)).clip(0, LEVELS - 1).values.astype(np.uint8)

# Extract overlapping windows
patches = sliding_window_view(img, (WINDOW_SIZE, WINDOW_SIZE))
# Shape: (rows, cols, win_y, win_x)

# Your patch function
def glcm_features(patch):
    glcm = graycomatrix(
        patch,
        distances=[1],
        angles=[0],
        levels=LEVELS,
        symmetric=True,
        normed=True
    )
    out = np.empty(7, dtype=np.float32)
    out[0] = graycoprops(glcm, "contrast")[0, 0]
    out[1] = graycoprops(glcm, "homogeneity")[0, 0]
    out[2] = graycoprops(glcm, "energy")[0, 0]
    out[3] = graycoprops(glcm, "ASM")[0, 0]
    out[4] = graycoprops(glcm, "correlation")[0, 0]
    out[5] = graycoprops(glcm, "mean")[0, 0]

            
            # glcm_p = glcm[:, :, 0, 0]
            # entropy[i, j] = -np.sum(glcm_p * np.log2(glcm_p + 1e-10))
    
    glcm_p = glcm[:, :, 0, 0]
    out[6] = -np.sum(glcm_p * np.log2(glcm_p + 1e-10))
    return out

# Use apply_ufunc to vectorize over (row, col) dimensions
result = xr.apply_ufunc(
    glcm_features,
    xr.DataArray(patches, dims=["y", "x", "win_y", "win_x"]),
    input_core_dims=[["win_y", "win_x"]],
    output_core_dims=[["feature"]],
    vectorize=True,
    dask="parallelized",
    output_dtypes=[np.float32]
)

# Add coordinates & names
pad = WINDOW_SIZE - 1
result = result.assign_coords({
    "y": masked.y[: -pad],
    "x": masked.x[: -pad],
    "feature": ["contrast", "homogeneity", "energy", "ASM", "correlation", "mean", "entropy"]
})

result_bands = result.to_dataset(dim="feature")

# Combine with original
masked_plus = masked.copy()
masked_plus = masked_plus.assign(result_bands)

masked_plus

  img = ((masked.blue - min) / (max - min) * (LEVELS - 1)).clip(0, LEVELS - 1).values.astype(np.uint8)


In [16]:
# Re-apply the mask
masked_plus = masked_plus.where(all_masks)

In [17]:
masked_plus.correlation.odc.explore()

### Postcard csv

The objective of this notebook was to train the machine learning model that will allow us to classify an area with land cover classes defined through the training data.

Step 1.2. Input the training data to sample geomad data from the postcard

In [18]:
# Reproject training data to the GeoMAD CRS and convert to xarray
training_reprojected = training.to_crs(masked_plus.odc.crs)
training_da = training_reprojected.assign(
    x=training_reprojected.geometry.x, y=training_reprojected.geometry.y
).to_xarray()

# Extract training values from the masked dataset
training_values = (
    masked_plus.sel(training_da[["x", "y"]], method="nearest")
    .squeeze()
    .compute()
    .to_pandas()
)
training_values

Unnamed: 0_level_0,nir,red,blue,green,emad,smad,bcmad,count,nir08,nir09,...,homogeneity,energy,ASM,correlation,mean,entropy,y,x,spatial_ref,time
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0602,0.0742,0.0926,0.1005,0.152102,0.000002,0.000034,0.0031,0.0637,0.0988,...,0.888889,0.575007,0.330633,0.552448,3.541667,1.757751,-1057915.0,-302095.0,3832,2024-01-01
1,0.0598,0.0750,0.0989,0.1071,0.161220,0.000003,0.000034,0.0031,0.0648,0.0689,...,0.868056,0.661656,0.437789,0.282831,3.756944,1.547091,-1057915.0,-302075.0,3832,2024-01-01
2,0.0597,0.0750,0.0949,0.1040,0.161838,0.000003,0.000034,0.0031,0.0667,0.0682,...,0.951389,0.839043,0.703993,0.533117,3.881944,0.910556,-1057945.0,-302075.0,3832,2024-01-01
3,0.0597,0.0752,0.0952,0.1040,0.158307,0.000002,0.000034,0.0031,0.0662,0.0687,...,0.895833,0.636696,0.405382,0.515152,3.687500,1.604869,-1057925.0,-302085.0,3832,2024-01-01
4,0.0580,0.0723,0.0995,0.1074,0.159002,0.000003,0.000034,0.0031,0.0638,0.0677,...,0.875000,0.694722,0.482639,0.242105,3.791667,1.438722,-1057915.0,-302065.0,3832,2024-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
815,,,,,,,,,,,...,,,,,,,-1059575.0,-303115.0,3832,2024-01-01
816,,,,,,,,,,,...,,,,,,,-1059535.0,-303115.0,3832,2024-01-01
817,,,,,,,,,,,...,,,,,,,-1059485.0,-303115.0,3832,2024-01-01
818,,,,,,,,,,,...,,,,,,,-1059435.0,-303135.0,3832,2024-01-01


In [19]:
# Join the training data with the extracted values and remove unnecessary columns
training_array = pd.concat([training["cc_id"], training_values], axis=1)

# Drop rows where there was no data available
training_array = training_array.dropna()

# Preview our resulting training array
training_array.head()

Unnamed: 0,cc_id,nir,red,blue,green,emad,smad,bcmad,count,nir08,...,homogeneity,energy,ASM,correlation,mean,entropy,y,x,spatial_ref,time
0,4,0.0602,0.0742,0.0926,0.1005,0.152102,2e-06,3.4e-05,0.0031,0.0637,...,0.888889,0.575007,0.330633,0.552448,3.541667,1.757751,-1057915.0,-302095.0,3832,2024-01-01
1,4,0.0598,0.075,0.0989,0.1071,0.16122,3e-06,3.4e-05,0.0031,0.0648,...,0.868056,0.661656,0.437789,0.282831,3.756944,1.547091,-1057915.0,-302075.0,3832,2024-01-01
2,4,0.0597,0.075,0.0949,0.104,0.161838,3e-06,3.4e-05,0.0031,0.0667,...,0.951389,0.839043,0.703993,0.533117,3.881944,0.910556,-1057945.0,-302075.0,3832,2024-01-01
3,4,0.0597,0.0752,0.0952,0.104,0.158307,2e-06,3.4e-05,0.0031,0.0662,...,0.895833,0.636696,0.405382,0.515152,3.6875,1.604869,-1057925.0,-302085.0,3832,2024-01-01
4,4,0.058,0.0723,0.0995,0.1074,0.159002,3e-06,3.4e-05,0.0031,0.0638,...,0.875,0.694722,0.482639,0.242105,3.791667,1.438722,-1057915.0,-302065.0,3832,2024-01-01


In [20]:
# Write the training data to a CSV file
training_array.to_csv(f"training-data/{version}-training.csv", index=False)