<img src="https://github.com/nicholasmetherall/digital-earth-pacific-macblue-activities/blob/main/attachments/images/DE_Pacific_banner.JPG?raw=true" width="900"/>
Figure 1.1.a. Jupyter environment + Python notebooks

### Digital Earth Pacific Notebook predictions



In [26]:
from pystac_client import Client
from dask.distributed import Client as DaskClient
from odc.stac import load, configure_s3_access
import rasterio as rio
import geopandas as gpd
import pandas as pd
import numpy as np
import xarray as xr
import folium
from utils import load_data
from utils import scale
from utils import apply_masks
from utils import calculate_band_indices
from utils import add_spectral_indices
from sklearn.ensemble import RandomForestClassifier
import odc.geo.xr
import rioxarray
import matplotlib.pyplot as plt
import joblib
from shapely.geometry import box

<font color='blue'>Define catalogue</font>

In [27]:
# Root URL of the STAC API that the later search cell queries.
catalog = "https://stac.digitalearthpacific.org"
# Open a pystac-client session against that catalogue.
client = Client.open(catalog)

<font color='red'>Define your area of interest: copy and paste the bottom-left latitude (min_lat) and bottom-left longitude (min_lon), followed by the top-right latitude (max_lat) and top-right longitude (max_lon).</font>

<font color='red'>In this activity you can use the following example values:</font>
`-18.1313, -18.1553, 177.4347, 177.4032`
  
<font color='red'>Note: these values are deliberately listed in the wrong order, so you will need to assign each one to the correct variable below.</font>

In [28]:
# Area of interest, given as its bottom-left and top-right corners
# in decimal degrees.
min_lon, min_lat = 168.319607, -17.432796  # bottom-left corner
max_lon, max_lat = 168.329603, -17.418208  # top-right corner

# Bounding box in the [min_lon, min_lat, max_lon, max_lat] order used by
# the search and load cells.
bbox = [min_lon, min_lat, max_lon, max_lat]

In [29]:
# Time filter for the search (passed through as the STAC `datetime` parameter).
datetime = "2024"

# Query the "dep_s2_geomad" collection for items intersecting the bounding
# box and materialise the results into a list.
items = list(
    client.search(
        collections=["dep_s2_geomad"],
        datetime=datetime,
        bbox=bbox,
    ).items()
)

In [30]:
from pystac import Collection

In [31]:
# Read the dep_s2_geomad collection definition from the STAC endpoint.
# NOTE(review): `collection` is not referenced by any later cell visible in
# this notebook — confirm whether this fetch is still needed.
collection = Collection.from_file("https://stac.digitalearthpacific.org/collections/dep_s2_geomad")

In [32]:
# Load the requested bands of the matched items, cropped to the bounding box,
# as a dask-chunked xarray Dataset.
# Each band is listed exactly once; the relative order of the bands is
# unchanged because later cells stack the variables in Dataset order before
# passing them to the model (presumably the order the model was trained on —
# confirm against the training notebook).
data = load(
    items,
    measurements=[
        "nir", "red", "blue", "green", "emad", "smad",
        "bcmad", "count", "nir08",
        "nir09", "swir16", "swir22", "coastal",
        "rededge1", "rededge2", "rededge3",
    ],
    bbox=bbox,
    chunks={"x": 2048, "y": 2048},
    groupby="solar_day",
)

In [33]:
# Pre-process the loaded bands with the project helpers from `utils`.
# Each step prints the type of its result as a quick sanity check.
scaled_data = scale(data)
print(type(scaled_data))
# NOTE(review): `indices` is not referenced again in the visible cells.
# Whether add_spectral_indices also modifies `scaled_data` in place cannot be
# determined from this notebook, so the statement order is kept as is.
indices = add_spectral_indices(scaled_data)
print(type(indices))
# The masks are applied to `scaled_data`, not to `indices`.
masked_data = apply_masks(scaled_data)
print(type(masked_data))


<class 'xarray.core.dataset.Dataset'>
<class 'xarray.core.dataset.Dataset'>
<class 'xarray.core.dataset.Dataset'>


In [34]:
# Derive the band indices from the masked dataset; the result is the
# dataset used by all later cells.
clipped_ds = calculate_band_indices(masked_data)

In [35]:
# Normalised difference of the green and swir16 bands (MNDWI).
mndwi = (clipped_ds["green"] - clipped_ds["swir16"]) / (clipped_ds["green"] + clipped_ds["swir16"])
# Boolean mask that is True where the index is positive. It is later passed to
# `predicted_da.where(...)`, so True marks the pixels that are KEPT.
# NOTE(review): positive MNDWI is usually read as water, which would make this
# a mask that removes land rather than one that selects it — confirm intent.
land_mask = mndwi > 0

In [36]:
# Alias for the prepared dataset; the prediction cells refer to it as `postcard`.
postcard = clipped_ds

In [37]:
# Load the previously trained classifier from disk; it is used below via
# `model.predict`.
# SECURITY: joblib.load unpickles the file and can execute arbitrary code, so
# only load model files that come from a trusted source.
model = joblib.load("models/model-geomad-joined-data-rf-04032025.model")

In [38]:
# Drop length-1 dimensions, then combine all data variables of the dataset
# into one DataArray with a "variable" dimension.
stacked_arrays = postcard.squeeze().to_dataarray()

In [39]:
# Display the combined array to inspect its shape, chunking and dtype.
stacked_arrays

Unnamed: 0,Array,Chunk
Bytes,1.83 MiB,75.04 kiB
Shape,"(25, 170, 113)","(1, 170, 113)"
Dask graph,25 chunks in 229 graph layers,25 chunks in 229 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.83 MiB 75.04 kiB Shape (25, 170, 113) (1, 170, 113) Dask graph 25 chunks in 229 graph layers Data type float32 numpy.ndarray",113  170  25,

Unnamed: 0,Array,Chunk
Bytes,1.83 MiB,75.04 kiB
Shape,"(25, 170, 113)","(1, 170, 113)"
Dask graph,25 chunks in 229 graph layers,25 chunks in 229 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [40]:
# Flatten the two spatial dimensions into a single `new_dim` dimension,
# giving one entry per (y, x) pixel.
stacked_arrays_2d = stacked_arrays.stack(new_dim=("y", "x")) 

In [41]:
# Put pixels on the first axis and variables on the second, i.e. the
# (samples, features) layout that is handed to `model.predict`.
reordered_data_array = stacked_arrays_2d.transpose('new_dim', 'variable')

In [42]:
# Predict one class per pixel from the (pixels, variables) feature array.
predicted = model.predict(reordered_data_array)

# Fold the flat predictions back onto the (y, x) grid. The pixels were stacked
# in (y, x) order, so a plain reshape restores the original layout.
# The reshaped grid is passed straight to the DataArray rather than being
# assigned to `reordered_data_array`: that name must keep holding the feature
# array so this cell gives the same result when it is run again.
predicted_da = xr.DataArray(
    predicted.reshape(len(postcard.y), len(postcard.x)),
    coords={"y": postcard.y, "x": postcard.x},
    dims=["y", "x"],
)


In [43]:
# Show the dtype the model produced, then convert to float32.
print(predicted_da.dtype)
predicted_da = predicted_da.astype("float32")

# If any prediction is missing, report it and substitute 0.
# NOTE(review): 0 is not one of the class ids (1-10) listed in the class
# table further down — confirm it is the intended fill value.
if bool(predicted_da.isnull().any()):
    print("NaN values found in the data")
    predicted_da = predicted_da.fillna(0)

object


In [44]:
# Display the prediction grid for a quick visual check.
predicted_da

In [45]:
from matplotlib import colors

# Class table: one [class id, label, hex colour] row per habitat class.
classes = [
    [1, "sediment", "#8c8c8c"],
    [2, "sand", "#fedd24"],
    [3, "rubble", "#f8ffb4"],
    [4, "seagrass", "#6df7dc"],
    [5, "seaweed", "#b9df6f"],
    [6, "coral", "#a011c3"],
    [7, "rock", "#804600"],
    [8, "deeps", "#011b61"],
    [9, "mangrove", "#086a39"],
    [10, "land", "#ffffff"],
]

# Split the table into parallel lists of ids and colours.
values_list = [class_id for class_id, _label, _colour in classes]
color_list = [colour for _class_id, _label, colour in classes]

# Discrete colormap with one colour per class, plus a norm whose bin edges
# are the class ids followed by a closing upper edge of 14.
c_map = colors.ListedColormap(color_list)
bounds = [*values_list, 14]
norm = colors.BoundaryNorm(bounds, c_map.N)

# predicted_da.plot.imshow(cmap=c_map, norm=norm, size=10)

In [46]:
# Keep predictions only where `land_mask` is True; every other pixel becomes NaN.
# Known issue: not all masks are applied here — only the land mask, not the
# surf mask — and a strange effect appears along one side of the result.
predicted_da = predicted_da.where(land_mask)

In [47]:
# masked_data = data.where(all_masks)

In [48]:
from matplotlib import colors

# Class table used for the map display: [class id, label, hex colour].
classes = [
    [1, "sediment", "#8c8c8c"],
    [2, "sand", "#fedd24"],
    [3, "rubble", "#f8ffb4"],
    [4, "seagrass", "#6df7dc"],
    [5, "seaweed", "#b9df6f"],
    [6, "coral", "#a011c3"],
    [7, "rock", "#804600"],
    [8, "deeps", "#011b61"],
    [9, "mangrove", "#086a39"],
    # Matplotlib reads 8-digit hex colours as #RRGGBBAA, so fully transparent
    # white is "#FFFFFF00" ("#00FFFFFF" would be parsed as opaque cyan).
    [10, "land", "#FFFFFF00"],
]

values_list = [c[0] for c in classes]
color_list = [c[2] for c in classes]

# Build a listed colormap with one colour per class.
c_map = colors.ListedColormap(color_list)
# Bin edges: the class ids plus one closing upper edge, so the number of bins
# equals the number of colours.
bounds = values_list + [14]
norm = colors.BoundaryNorm(bounds, c_map.N)

# predicted_da.plot.imshow(cmap=c_map, norm=norm, size=10)

In [49]:
# Show the masked prediction on an interactive map using the class colormap.
# NOTE(review): only `cmap` is passed; the `norm` built alongside `c_map` is
# not used here, so the value-to-colour mapping relies on the default scaling.
predicted_da.odc.explore(cmap=c_map)

In [50]:
# Save the prediction as a Cloud Optimized GeoTIFF, replacing any existing file.
# NOTE(review): presumably the `predictions/` directory must already exist — confirm.
predicted_da.odc.write_cog("predictions/predicted_utanglang_joined_data_postcard_04032025.tiff", overwrite=True)

PosixPath('predicted_utanglang_joined_data_postcard_04032025.tiff')

In [52]:
# Read the test data from GeoJSON with geopandas.
# NOTE(review): every later cell that would use `test_data` is commented out.
test_data = gpd.read_file("testing-data/utanglang_postcard.geojson")

In [55]:
# # First transform the training points to the same CRS as the data
# test_data = test_data.to_crs(postcard.odc.geobox.crs)
# # Next get the X and Y values out of the point geometries
# training_da = test_data.assign(x=test_data.geometry.x, y=test_data.geometry.y).to_xarray()
# # Now we can use the x and y values (lon, lat) to extract values from the median composite
# training_values = (
#     postcard.sel(training_da[["x", "y"]], method="nearest").squeeze().compute().to_pandas()
# )
# len(training_values)

161

In [None]:
# # Join the training data with the extracted values and remove unnecessary columns
# training_array = pd.concat([training["cc_id"], training_values], axis=1)
# training_array = training_array.drop(
#     columns=[
#         "y",
#         "x",
#         "spatial_ref",
#     ]
# )

# # # Drop rows where there was no data available
# # training_array = training_array.dropna()

# # Preview our resulting training array
# training_array.head()

In [None]:
# print(predicted_da.dtype)  # Check the dtype of your DataArray
# predicted_da = predicted_da.astype('float32')  # Convert to float32

In [None]:
# # Check for NaN values
# if np.isnan(median).any():
#     print("NaN values found in the data")
#     # Handle NaN values, e.g. by filling them
#     median = median.fillna(0)  # Replace NaN with 0 or appropriate value

In [None]:
# # Check if CRS is set, if not set it
# if not median.rio.crs:
#     median = median.rio.write_crs("EPSG:4326")  # Set to appropriate CRS