<img src="https://github.com/nicholasmetherall/digital-earth-pacific-macblue-activities/blob/main/attachments/images/DE_Pacific_banner.JPG?raw=true" width="900"/>
Figure 1.1.a. Jupyter environment + Python notebooks

### Digital Earth Pacific Notebook 1: prepare postcard and load data to CSV

<font color='green'>The objective of this notebook is to prepare a geomad postcard for your AOI (masking, scaling and loading additional band ratios and spectral indices) and to sample all the datasets into a csv based on your training data geodataframe. </font>

<font color='blue'>Step 1.1</font>

In [3]:
from pystac_client import Client
from dask.distributed import Client as DaskClient
from odc.stac import load, configure_s3_access
import rasterio as rio
import geopandas as gpd
import pandas as pd
import numpy as np
import xarray as xr
import folium
# import postcards
from utils import load_data
from utils import mask_and_scale
from sklearn.ensemble import RandomForestClassifier
import odc.geo.xr
import rioxarray
import matplotlib.pyplot as plt
import joblib
from shapely.geometry import box

<font color='blue'>Define catalogue</font>

In [4]:
# Root URL of the Digital Earth Pacific STAC API
catalog = "https://stac.digitalearthpacific.org"
# Open a client on that API; it is used for the item search further down
client = Client.open(url=catalog)

<font color='red'>Define your area of interest - copy and paste the bottom left latitude (min_lat) and the bottom left longitude (min_lon) and the top right latitude (max_lat) and the top right longitude (max_lon)</font>

<font color='red'>In this activity you can use the following examples:</font>
`-18.1313, -18.1553, 177.4347, 177.4032`
  
<font color='red'>  Note we have reordered these into the wrong order so you will have to define them correctly below:</font>

In [5]:
# Read the training features and reproject them to EPSG:4326 in one chain
utanglang_gdf = (
    gpd.read_file("training_data/utanglang_data_12032025.geojson")
    .to_crs("EPSG:4326")
)
# total_bounds is ordered (minx, miny, maxx, maxy); x is longitude and y is latitude here.
# NOTE(review): the bounds cover every feature in the file, so a single far-away
# point enlarges the area of interest — confirm the training data has no outliers.
min_lon, min_lat, max_lon, max_lat = utanglang_gdf.total_bounds
bbox = [min_lon, min_lat, max_lon, max_lat]

In [6]:
# min_lat = -18.20459
# min_lon = 178.33041
# max_lat = -18.10000 
# max_lon = 178.53506
# bbox = [min_lon, min_lat, max_lon, max_lat]

<font color='red'>Define your time of interest - choose a range of a few months in 2024 using the syntax `datetime="YYYY-MM/YYYY-MM"`</font>

In [7]:
# Time of interest passed to the STAC search
datetime = "2024"
# Search the GeoMAD collection for items intersecting the bounding box in that period
geomad_search = client.search(
    collections=["dep_s2_geomad"],
    datetime=datetime,
    bbox=bbox,
)
items = list(geomad_search.items())

In [8]:
from pystac import Collection

In [9]:
# Read the collection-level STAC metadata for the GeoMAD product
collection = Collection.from_file(
    href="https://stac.digitalearthpacific.org/collections/dep_s2_geomad"
)

In [10]:
# Load the requested GeoMAD bands for the bounding box.
# Passing `chunks` makes the result dask-backed, so pixels are only read when a
# later cell calls .compute().
# Each band is listed exactly once; repeating a name (e.g. "green") adds nothing.
data = load(
    items,
    measurements=[
        "nir", "red", "blue", "green", "emad", "smad",
        "bcmad", "count", "nir08",
        "nir09", "swir16", "swir22", "coastal",
        "rededge1", "rededge2", "rededge3",
    ],
    bbox=bbox,
    chunks={"x": 2048, "y": 2048},
    groupby="solar_day",
)

In [11]:
# dask_client = DaskClient(n_workers=1, threads_per_worker=16, memory_limit='16GB')
# configure_s3_access(cloud_defaults=True, requester_pays=True)

In [12]:
# Step 1: turn zero-valued pixels into NaN.
scaled = data.where(data != 0)
# Step 2: multiply by 0.0001 and clamp the result to the 0-1 range.
# NOTE(review): the factor is applied to every loaded variable, including
# "emad", "smad", "bcmad" and "count" — confirm that scaling those is intended.
scaled = (scaled * 0.0001).clip(min=0, max=1)

In [13]:
# Normalised difference of the green and swir16 bands (MNDWI)
green_band = scaled["green"]
swir16_band = scaled["swir16"]
mndwi = (green_band - swir16_band) / (green_band + swir16_band)
# Keep only pixels with a positive MNDWI (presumably water — confirm);
# everything else becomes NaN in clipped_ds
mndwi_land_mask = mndwi > 0
clipped_ds = scaled.where(mndwi_land_mask)

In [14]:
# Normalised difference of the red and green bands (NDTI), computed on the
# already-masked dataset, so it is NaN wherever the previous mask removed a pixel
red_band = clipped_ds["red"]
green_masked = clipped_ds["green"]
ndti = (red_band - green_masked) / (red_band + green_masked)
# Keep pixels with NDTI below 0.2; NaN compares as False and is therefore dropped too
ndti_mask = ndti < 0.2
clipped_ds = clipped_ds.where(ndti_mask)

In [15]:
# Final mask: keep pixels whose (scaled) nir value is below 0.085.
# Pixels already set to NaN by the earlier masks compare as False and stay masked.
nir = clipped_ds["nir"]
nir_mask = nir < 0.085
clipped_ds = clipped_ds.where(cond=nir_mask)

In [16]:
# Combined mask: True only where a pixel passes all three masks.
# Use `&` (logical AND). Adding boolean arrays with `+` is a logical OR, which
# would mark a pixel as valid as soon as it passes any single mask.
all_masks = mndwi_land_mask & ndti_mask & nir_mask

In [17]:
# Incorporate other band ratios and indices
def _normalised_difference(first, second):
    """Return (first - second) / (first + second) for two bands."""
    return (first - second) / (first + second)


# coastal aerosol index
cai = _normalised_difference(clipped_ds["coastal"], clipped_ds["blue"])
# vegetation index (NDVI)
ndvi = _normalised_difference(clipped_ds["nir"], clipped_ds["red"])
# water index (NDWI)
ndwi = _normalised_difference(clipped_ds["green"], clipped_ds["nir"])
# blue to green ratio
b_g = clipped_ds["blue"] / clipped_ds["green"]
# blue to red ratio
b_r = clipped_ds["blue"] / clipped_ds["red"]
# max chlorophyll index (MCI)
mci = clipped_ds["nir"] / clipped_ds["rededge1"]
# normalised difference chlorophyll index (NDCI)
ndci = _normalised_difference(clipped_ds["rededge1"], clipped_ds["red"])


In [18]:
# Attach every derived index to the masked dataset as an extra variable.
# NOTE(review): `mndwi` was computed from `scaled` before any mask was applied, so
# it keeps values at pixels where the other variables are NaN — confirm this is intended.
clipped_ds = clipped_ds.assign(
    cai=cai,
    ndvi=ndvi,
    ndwi=ndwi,
    mndwi=mndwi,
    ndti=ndti,
    b_g=b_g,
    b_r=b_r,
    mci=mci,
    ndci=ndci,
)
# clipped_ds

In [19]:
# clipped_da = clipped_ds.squeeze().to_array(dim="band")
# clipped_da = clipped_da.astype('float32')
# clipped_da.odc.write_cog("clipped_ds_masked.tiff")

### Postcard csv

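<font color='green'>The objective of this section is to sample the postcard at the training data points and save the values to a csv. That csv will later be used to train the machine learning model that will allow us to classify an area with land cover classes defined through the training data. </font>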

<font color='blue'>Step 1.2. Input the training data to sample geomad data from the postcard</font>

In [20]:
# Define training data
utanglang_gdf = gpd.read_file("training_data/utanglang_data_12032025.geojson")
utanglang_gdf = utanglang_gdf.to_crs("EPSG:4326")
# utanglang_gdf.explore(column="cc_id", legend=True)

In [21]:
# The postcard is the masked dataset with the derived indices attached
utanglang_postcard = clipped_ds  # .where(all_masks)
# utanglang_postcard =clipped_ds.to_array(dim="band")
# Reproject the training points into the CRS of the postcard so that their
# coordinates can be looked up directly on the postcard's x / y axes
postcard_crs = utanglang_postcard.odc.geobox.crs
utanglang_training = utanglang_gdf.to_crs(postcard_crs)

In [22]:
# Show which attribute columns came with the training data
print(utanglang_training.columns)
# Drop the bookkeeping columns that are not needed for sampling.
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
utanglang_training = utanglang_training.drop(
    columns=['fid', 'date', 'uuid'],
    errors="ignore",
)

Index(['fid', 'observed', 'date', 'uuid', 'x', 'y', 'cc_id', 'geometry'], dtype='object')


In [23]:
utanglang_training

Unnamed: 0,observed,x,y,cc_id,geometry
0,CEPA boats,,,6,POINT (-302223.131 -1056287.192)
1,deeps,,,8,POINT (2039620.732 -1956908.207)
2,deeps,,,8,POINT (2039515.155 -1957069.507)
3,deeps,,,8,POINT (2039662.963 -1957271.134)
4,deeps,,,8,POINT (2039479.963 -1957406.775)
...,...,...,...,...,...
275,coral,,,6,POINT (2039861.612 -1957466.612)
276,coral,,,6,POINT (2040069.584 -1957925.362)
277,coral,,,6,POINT (2039863.607 -1958086.421)
278,coral,,,6,POINT (2039836.177 -1958105.644)


In [56]:
# Copy each point's coordinates into plain x / y columns, then convert the table
# to xarray so those columns can be used as indexers on the postcard
training_with_xy = utanglang_training.assign(
    x=utanglang_training.geometry.x,
    y=utanglang_training.geometry.y,
)
utanglang_training_da = training_with_xy.to_xarray()

In [57]:
# For every training point, pick the postcard pixel nearest to its x / y coordinate
nearest_pixels = utanglang_postcard.sel(
    utanglang_training_da[["x", "y"]],
    method="nearest",
)
# Drop length-1 dimensions, evaluate the lazy result and turn it into a table
# with one row per training point
utanglang_training_values = nearest_pixels.squeeze().compute().to_pandas()
utanglang_training_values

Unnamed: 0_level_0,nir,red,blue,green,emad,smad,bcmad,count,nir08,nir09,...,time,cai,ndvi,ndwi,mndwi,ndti,b_g,b_r,mci,ndci
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,2024-01-01,,,,-0.024780,,,,,
1,0.0304,0.0322,0.0654,0.0417,0.087864,0.000003,0.000036,0.0060,0.0304,0.0367,...,2024-01-01,0.043160,-0.028754,0.156727,0.176305,-0.128552,1.568345,2.031056,0.910180,0.018293
2,0.0309,0.0335,0.0576,0.0399,0.089898,0.000003,0.000040,0.0060,0.0333,0.0435,...,2024-01-01,0.095761,-0.040373,0.127119,0.109875,-0.087193,1.443609,1.719403,0.865546,0.031792
3,0.0272,0.0305,0.0665,0.0401,0.080707,0.000004,0.000039,0.0059,0.0277,0.0363,...,2024-01-01,0.045911,-0.057192,0.191679,0.165698,-0.135977,1.658354,2.180328,0.860759,0.017713
4,0.0281,0.0323,0.0578,0.0393,0.079498,0.000004,0.000037,0.0055,0.0297,0.0354,...,2024-01-01,0.074460,-0.069536,0.166172,0.144105,-0.097765,1.470738,1.789474,0.854103,0.009202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,0.0313,0.0338,0.0846,0.0519,0.108376,0.000007,0.000041,0.0058,0.0343,0.0474,...,2024-01-01,0.005875,-0.038402,0.247596,0.209790,-0.211202,1.630058,2.502959,0.857534,0.038407
276,0.0300,0.0400,0.0550,0.0649,0.075024,0.000006,0.000026,0.0052,0.0253,0.0285,...,2024-01-01,0.054983,-0.142857,0.367756,0.537915,-0.237369,0.847458,1.375000,0.456621,0.243141
277,0.0267,0.0358,0.0684,0.0714,0.075918,0.000006,0.000034,0.0055,0.0260,0.0343,...,2024-01-01,-0.010340,-0.145600,0.455658,0.470649,-0.332090,0.957983,1.910614,0.760684,-0.009873
278,0.0257,0.0323,0.0693,0.0708,0.083596,0.000006,0.000033,0.0057,0.0269,0.0389,...,2024-01-01,-0.001445,-0.113793,0.467358,0.452308,-0.373424,0.978814,2.145511,0.725989,0.045790


In [58]:
# Put the class label (cc_id) next to the sampled values; rows are matched on the index
class_labels = utanglang_training["cc_id"]
labelled_samples = pd.concat([class_labels, utanglang_training_values], axis="columns")
# Drop rows where there was no data available (any NaN in the row)
utanglang_training_array = labelled_samples.dropna()
# Preview our resulting training array
utanglang_training_array.head()

Unnamed: 0,cc_id,nir,red,blue,green,emad,smad,bcmad,count,nir08,...,time,cai,ndvi,ndwi,mndwi,ndti,b_g,b_r,mci,ndci
1,8,0.0304,0.0322,0.0654,0.0417,0.087864,3e-06,3.6e-05,0.006,0.0304,...,2024-01-01,0.04316,-0.028754,0.156727,0.176305,-0.128552,1.568345,2.031056,0.91018,0.018293
2,8,0.0309,0.0335,0.0576,0.0399,0.089898,3e-06,4e-05,0.006,0.0333,...,2024-01-01,0.095761,-0.040373,0.127119,0.109875,-0.087193,1.443609,1.719403,0.865546,0.031792
3,8,0.0272,0.0305,0.0665,0.0401,0.080707,4e-06,3.9e-05,0.0059,0.0277,...,2024-01-01,0.045911,-0.057192,0.191679,0.165698,-0.135977,1.658354,2.180328,0.860759,0.017713
4,8,0.0281,0.0323,0.0578,0.0393,0.079498,4e-06,3.7e-05,0.0055,0.0297,...,2024-01-01,0.07446,-0.069536,0.166172,0.144105,-0.097765,1.470738,1.789474,0.854103,0.009202
5,8,0.0343,0.0369,0.0616,0.0435,0.102791,4e-06,4.1e-05,0.006,0.0357,...,2024-01-01,0.068079,-0.036517,0.118252,0.099874,-0.08209,1.416092,1.669377,0.890909,0.02122


In [60]:
# Save the labelled samples (index included) to a csv file
utanglang_training_array.to_csv(
    path_or_buf="training_data/postcard_utanglang_dataframe.csv"
)