<img src="https://github.com/nicholasmetherall/digital-earth-pacific-macblue-activities/blob/main/attachments/images/DE_Pacific_banner.JPG?raw=true" width="900"/>
Figure 1.1.a. Jupyter environment + Python notebooks

### Digital Earth Pacific Notebook predictions



In [22]:
from pystac_client import Client
from dask.distributed import Client as DaskClient
from odc.stac import load, configure_s3_access
import rasterio as rio
import geopandas as gpd
import pandas as pd
import numpy as np
import xarray as xr
import folium
from utils import load_data
from utils import scale
from utils import apply_masks
from utils import calculate_band_indices
from utils import add_spectral_indices
from sklearn.ensemble import RandomForestClassifier
import odc.geo.xr
import rioxarray
import matplotlib.pyplot as plt
import joblib
from shapely.geometry import box

<font color='blue'>Define catalogue</font>

In [23]:
# Root URL of the Digital Earth Pacific STAC API, and a client bound to it.
catalog = "https://stac.digitalearthpacific.org"
client = Client.open(url=catalog)

In [24]:
# Disabled alternative bounding box, kept for reference; nothing in this cell executes.
# min_lat = -17.432796
# min_lon = 168.319607
# max_lat = -17.418208
# max_lon = 168.329603
# bbox = [min_lon, min_lat, max_lon, max_lat]

In [25]:
# Ba Estuary: area of interest in decimal degrees.
min_lon, min_lat = 177.51971, -17.49416
max_lon, max_lat = 177.68452, -17.34430
# Order is west, south, east, north.
bbox = [min_lon, min_lat, max_lon, max_lat]

In [26]:
# Find every dep_s2_geomad item for 2024 that intersects the bounding box.
# NOTE(review): the variable name `datetime` would shadow the stdlib module of
# the same name if that module were ever imported in this notebook.
datetime = "2024"
search = client.search(collections=["dep_s2_geomad"], datetime=datetime, bbox=bbox)
items = list(search.items())

In [27]:
from pystac import Collection

In [28]:
# Read the collection-level STAC document for dep_s2_geomad.
# NOTE(review): `collection` is not referenced by any later cell visible here — confirm it is still needed.
collection = Collection.from_file("https://stac.digitalearthpacific.org/collections/dep_s2_geomad")

In [29]:
# Load the requested GeoMAD measurements for the area of interest as a
# Dask-chunked dataset. Each measurement is listed exactly once: "green" used
# to appear twice, although the sampled table further down only ever shows a
# single "green" column.
data = load(
    items,
    measurements=[
        "nir", "red", "blue", "green", "emad", "smad",
        "bcmad", "count", "nir08",
        "nir09", "swir16", "swir22", "coastal",
        "rededge1", "rededge2", "rededge3",
    ],
    bbox=bbox,
    chunks={"x": 2048, "y": 2048},
    groupby="solar_day",
)

In [30]:
# Run the pre-processing helpers imported from utils, then print the type of
# each result as a sanity check (all three are expected to be xarray Datasets).
# NOTE(review): `indices` is not referenced by any later cell visible here, and
# masking starts from `scaled_data`, not from `indices` — confirm that is intended.
scaled_data = scale(data)
indices = add_spectral_indices(scaled_data)
masked_data = apply_masks(scaled_data)
for stage in (scaled_data, indices, masked_data):
    print(type(stage))


<class 'xarray.core.dataset.Dataset'>
<class 'xarray.core.dataset.Dataset'>
<class 'xarray.core.dataset.Dataset'>


In [31]:
# Add the band indices to the masked dataset; the result is the working
# dataset used by the cells that follow.
clipped_ds = calculate_band_indices(masked_data)

In [61]:
# Define training data: read the labelled points and express them in EPSG:4326.
utanglang_gdf = gpd.read_file("utanglang_data_12032025.geojson").to_crs("EPSG:4326")
# utanglang_gdf.explore(column="cc_id", legend=True)

In [62]:
# Use the processed dataset as the raster to sample from, and reproject the
# training points into that raster's CRS so x / y are directly comparable.
utanglang_postcard = clipped_ds  # .where(all_masks)
raster_crs = utanglang_postcard.odc.geobox.crs
tdata = utanglang_gdf.to_crs(raster_crs)

In [63]:
# Show the available columns, then discard the ones that are not needed for
# sampling (the geometry column already carries the coordinates).
print(tdata.columns)
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
tdata = tdata.drop(columns=['x', 'y', 'fid', 'date', 'uuid'], errors="ignore")

Index(['fid', 'observed', 'date', 'uuid', 'x', 'y', 'cc_id', 'geometry'], dtype='object')


In [64]:
# Display the reprojected training points (label, class id and geometry).
tdata

Unnamed: 0,observed,cc_id,geometry
0,deeps,8,POINT (2039620.732 -1956908.207)
1,deeps,8,POINT (2039515.155 -1957069.507)
2,deeps,8,POINT (2039662.963 -1957271.134)
3,deeps,8,POINT (2039479.963 -1957406.775)
4,deeps,8,POINT (2039455.328 -1957245.472)
...,...,...,...
274,coral,6,POINT (2039861.612 -1957466.612)
275,coral,6,POINT (2040069.584 -1957925.362)
276,coral,6,POINT (2039863.607 -1958086.421)
277,coral,6,POINT (2039836.177 -1958105.644)


In [60]:
# Sample the raster at every training point. x and y are passed together as
# arrays that share the "index" dimension, so the lookup is pointwise (one
# pixel per point).
tdata_da = tdata.assign(x=tdata.geometry.x, y=tdata.geometry.y).to_xarray()
tdata_values = (
    utanglang_postcard.sel(tdata_da[["x", "y"]], method="nearest").squeeze().compute().to_pandas()
)

# method="nearest" never fails: a point outside the raster is silently snapped
# to the closest edge pixel. Compare the requested coordinates with the pixel
# coordinates that were actually sampled and report every point that landed
# more than one pixel away.
pixel_size_x = abs(float(utanglang_postcard.x[1] - utanglang_postcard.x[0]))
pixel_size_y = abs(float(utanglang_postcard.y[1] - utanglang_postcard.y[0]))
offset_x = np.abs(tdata_values["x"].to_numpy() - tdata.geometry.x.to_numpy())
offset_y = np.abs(tdata_values["y"].to_numpy() - tdata.geometry.y.to_numpy())
snapped = (offset_x > pixel_size_x) | (offset_y > pixel_size_y)
if snapped.any():
    print(
        f"WARNING: {int(snapped.sum())} of {len(snapped)} points lie outside the raster "
        "extent; their sampled values come from edge pixels."
    )
tdata_values


Unnamed: 0_level_0,nir,red,blue,green,emad,smad,bcmad,count,nir08,nir09,...,ndvi,ndwi,b_g,b_r,mci,ndci,y,x,spatial_ref,time
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0219,0.0239,0.0455,0.0344,0.071448,0.000011,0.000049,0.0039,0.0222,0.0234,...,-0.043668,0.222025,1.322675,1.903766,0.893878,0.012397,-1956905.0,3063485.0,3832,2024-01-01
1,0.0208,0.0227,0.0459,0.0343,0.068675,0.000008,0.000049,0.0039,0.0209,0.0210,...,-0.043678,0.245009,1.338192,2.022027,0.892704,0.013043,-1957065.0,3063485.0,3832,2024-01-01
2,0.0206,0.0228,0.0449,0.0337,0.067857,0.000010,0.000049,0.0039,0.0211,0.0209,...,-0.050691,0.241252,1.332344,1.969298,0.854772,0.027719,-1957275.0,3063485.0,3832,2024-01-01
3,0.0207,0.0228,0.0447,0.0338,0.065804,0.000008,0.000051,0.0039,0.0209,0.0207,...,-0.048276,0.240367,1.322485,1.960526,0.866109,0.023555,-1957405.0,3063485.0,3832,2024-01-01
4,0.0211,0.0229,0.0442,0.0339,0.068640,0.000008,0.000050,0.0039,0.0212,0.0225,...,-0.040909,0.232727,1.303835,1.930131,0.897872,0.012931,-1957245.0,3063485.0,3832,2024-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274,0.0205,0.0228,0.0449,0.0338,0.067228,0.000008,0.000051,0.0039,0.0201,0.0196,...,-0.053118,0.244936,1.328402,1.969298,0.876068,0.012987,-1957465.0,3063485.0,3832,2024-01-01
275,0.0243,0.0749,0.1019,0.1310,0.082121,0.000003,0.000025,0.0034,0.0214,0.0188,...,-0.510081,0.687057,0.777863,1.360481,0.322709,0.002663,-1957925.0,3063485.0,3832,2024-01-01
276,0.0179,0.0828,0.1154,0.1490,0.061150,0.000002,0.000015,0.0032,0.0142,0.0134,...,-0.644489,0.785500,0.774497,1.393720,0.232770,-0.036944,-1958085.0,3063485.0,3832,2024-01-01
277,0.0181,0.0834,0.1181,0.1522,0.062750,0.000002,0.000014,0.0032,0.0142,0.0134,...,-0.643350,0.787434,0.775953,1.416067,0.235984,-0.041849,-1958105.0,3063485.0,3832,2024-01-01


In [11]:
# Normalised difference of the green and swir16 bands (MNDWI).
green_band = clipped_ds["green"]
swir16_band = clipped_ds["swir16"]
mndwi = (green_band - swir16_band) / (green_band + swir16_band)
# Boolean mask that is True where MNDWI is positive.
# NOTE(review): despite its name, `land_mask` is True on the positive-MNDWI
# side, so `.where(land_mask)` keeps those pixels — confirm the intended polarity.
land_mask = mndwi > 0

In [12]:
# Alias only: `postcard` refers to the same object as `clipped_ds` (no copy is made).
postcard = clipped_ds

In [13]:
# Load the previously trained classifier from disk.
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that come from a trusted source.
# NOTE(review): presumably the model expects the same feature columns, in the
# same order, as it was trained on — verify against the training notebook.
model = joblib.load("models/model-geomad-joined-data-rf-04032025.model")

In [14]:
# Convert to a stacked array of observations: drop length-1 dimensions, then
# turn the Dataset's variables into one DataArray with a leading "variable" axis.
# stacked_arrays = stacked_arrays.squeeze()
stacked_arrays = postcard.squeeze().to_dataarray()  # .stack(dims=["y", "x"])#.transpose()

In [15]:
# Display the array summary (shape, chunking, dtype) without computing it.
stacked_arrays

Unnamed: 0,Array,Chunk
Bytes,304.32 MiB,12.17 MiB
Shape,"(25, 1739, 1835)","(1, 1739, 1835)"
Dask graph,25 chunks in 229 graph layers,25 chunks in 229 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 304.32 MiB 12.17 MiB Shape (25, 1739, 1835) (1, 1739, 1835) Dask graph 25 chunks in 229 graph layers Data type float32 numpy.ndarray",1835  1739  25,

Unnamed: 0,Array,Chunk
Bytes,304.32 MiB,12.17 MiB
Shape,"(25, 1739, 1835)","(1, 1739, 1835)"
Dask graph,25 chunks in 229 graph layers,25 chunks in 229 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [16]:
# Collapse the two spatial axes into a single "new_dim" axis (one entry per pixel).
stacked_arrays_2d = stacked_arrays.stack({"new_dim": ("y", "x")})

In [17]:
# Put pixels on the rows and variables on the columns, i.e. the
# (n_samples, n_features) layout that the classifier's predict() expects.
reordered_data_array = stacked_arrays_2d.transpose('new_dim', 'variable')

In [18]:
# Predict the classes
# predict() returns one label per pixel, in the same flattened (y, x) order
# that the earlier .stack(new_dim=("y", "x")) produced.
predicted = model.predict(reordered_data_array)
# Keep the prediction grid in its own variable. Reassigning
# `reordered_data_array` here would replace the feature matrix, so re-running
# this cell would feed the predictions back into model.predict.
predicted_grid = predicted.reshape(len(postcard.y), len(postcard.x))
# Re-attach the spatial coordinates so the grid lines up with the source raster.
predicted_da = xr.DataArray(
    predicted_grid, coords={"y": postcard.y, "x": postcard.x}, dims=["y", "x"]
)

In [19]:
# Report the dtype that came back from the classifier, then work in float32.
print(predicted_da.dtype)
predicted_da = predicted_da.astype('float32')

# If any prediction is NaN, say so and substitute 0 for it.
contains_nan = np.isnan(predicted_da).any()
if contains_nan:
    print("NaN values found in the data")
    predicted_da = predicted_da.fillna(0)

object


In [20]:
# Display the prediction array summary.
predicted_da

In [21]:
# issue where not all masks are being included - only land but not surf / also strange effect on side
# Squeeze the mask first: it is derived from the unsqueezed dataset, so any
# length-1 dimension it still carries (presumably time — confirm) would
# otherwise be broadcast onto the 2-D (y, x) prediction grid.
# NOTE(review): land_mask is defined as `mndwi > 0`, so this keeps the
# positive-MNDWI pixels — confirm that is the intended side of the threshold.
predicted_da = predicted_da.where(land_mask.squeeze())

In [22]:
# Disabled experiment, kept for reference; nothing in this cell executes.
# masked_data = data.where(all_masks)

In [24]:
from matplotlib import colors

# One row per class: [class id, class name, colour].
classes = [
    [1, "sediment", "#8c8c8c"],
    [2, "sand", "#fedd24"],
    [3, "rubble", "#f8ffb4"],
    [4, "seagrass", "#6df7dc"],
    [5, "seaweed", "#b9df6f"],
    [6, "coral", "#a011c3"],
    [7, "rock", "#804600"],
    [8, "deeps", "#011b61"],
    [9, "mangrove", "#086a39"],
    # matplotlib reads 8-digit hex as #RRGGBBAA, so the previous "#00FFFFFF"
    # was opaque cyan. It was presumably written ARGB-style to mean fully
    # transparent white; "#FFFFFF00" is that colour in matplotlib's notation.
    [10, "land", "#FFFFFF00"],
]

values_list = [c[0] for c in classes]
color_list = [c[2] for c in classes]

# Build a listed colormap.
c_map = colors.ListedColormap(color_list)
# One bin per class id: the edges are the ids themselves plus a closing edge
# just above the last id (derived, instead of a hard-coded 14).
bounds = values_list + [values_list[-1] + 1]
norm = colors.BoundaryNorm(bounds, c_map.N)

# predicted_da.plot.imshow(cmap=c_map, norm=norm, size=10)

In [25]:
# Show the prediction on an interactive map.
# The colour scale is pinned to the class-id range so that every id always
# maps to its own entry in c_map; without vmin/vmax the scale is derived from
# whichever ids happen to be present in the data.
predicted_da.odc.explore(cmap=c_map, vmin=min(values_list), vmax=max(values_list))

In [None]:
# Save the prediction as a Cloud Optimized GeoTIFF; overwrite=True replaces an existing file.
# NOTE(review): assumes the predictions/ directory already exists — confirm.
predicted_da.odc.write_cog("predictions/predicted_ba_estuary_joined_data_postcard_04032025.tiff", overwrite=True)

In [None]:
# Read the held-out test points.
test_data = gpd.read_file("testing-data/test-utanglang.geojson")

In [None]:
# Reproject the test points into the raster's CRS.
# (The `training_*` names below are historical; they hold test data here.)
test = test_data.to_crs(postcard.odc.geobox.crs)
# Expose each point's projected x / y as columns and convert to xarray so the
# coordinates can drive a pointwise lookup.
training_da = test.assign(x=test.geometry.x, y=test.geometry.y).to_xarray()
# Pick the nearest pixel for every point and bring the values into pandas.
# NOTE(review): method="nearest" snaps points outside the raster to an edge
# pixel without any error — verify the test points fall inside the extent.
point_coords = training_da[["x", "y"]]
sampled = postcard.sel(point_coords, method="nearest").squeeze()
training_values = sampled.compute().to_pandas()
len(training_values)

In [None]:
# Put the observed class id next to the sampled band values (joined on the
# shared index) and discard the coordinate bookkeeping columns.
training_array = pd.concat([test["observed_id"], training_values], axis=1).drop(
    columns=["y", "x", "spatial_ref"]
)
# # Drop rows where there was no data available
# training_array = training_array.dropna()
# Preview the first rows of the joined table.
training_array.head()

In [None]:
# Repeats the dtype report and float32 cast performed right after prediction.
print(predicted_da.dtype)  # Check the dtype of your DataArray
predicted_da = predicted_da.astype('float32')  # Convert to float32

In [None]:
# The joined table and the test points should contain the same number of rows.
print(len(training_array), len(test))  # Check the lengths of both arrays

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# NOTE: everything in this cell runs on synthetic random data, so the
# confusion matrix (and any metric computed from y_test / y_pred afterwards)
# says nothing about the loaded model or the real test points.
# The synthetic objects use their own demo_* names so that they no longer
# overwrite `model`, `training_array` and `test`.
np.random.seed(42)
demo_features = np.random.rand(100, 5)
demo_labels = pd.DataFrame({"observed_id": np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 100)})

# Split the data (test_size=0.9: 10 rows are used for fitting, 90 for scoring)
X_train, X_test, y_train, y_test = train_test_split(
    demo_features, demo_labels.observed_id, test_size=0.9, random_state=42
)

# Replace None values with a default value, e.g., 0 or the most frequent value
y_train = y_train.fillna(0)  # or y_train.fillna(y_train.mode()[0])
y_test = y_test.fillna(0)    # Ensure y_test also has no None values
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Train a throw-away classifier on the synthetic rows
demo_model = RandomForestClassifier()
demo_model.fit(X_train, y_train)

# Make predictions
y_pred = demo_model.predict(X_test)

# Generate and display confusion matrix.
# The label set is the sorted union of observed and predicted ids (what
# confusion_matrix uses by default); passing it to the display as well makes
# the axes show class ids instead of 0..n-1 positions.
cm_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)
# Create the axes explicitly and hand them to plot(): plot() otherwise makes
# its own figure, leaving a separate plt.figure(figsize=...) empty and the
# requested size unused.
fig, ax = plt.subplots(figsize=(12, 10))  # Adjust width and height for larger plot
cm_display.plot(ax=ax, cmap=plt.cm.Blues)
plt.show()

In [None]:
# Display the raw confusion-matrix counts.
cm

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of samples whose predicted label equals the observed label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

In [None]:
from sklearn.metrics import classification_report

# report = classification_report(true_labels, predicted_labels, target_names=class_labels)
# print(report)

# Class names in class-id order (id 1 = sediment ... id 10 = land).
class_names = ['sediment', 'sand', 'rubble', 'seagrass', 'seaweed', 'coral', 'rock', 'deeps', 'mangrove', 'land']
class_ids = list(range(1, len(class_names) + 1))

# Pass `labels` explicitly: with target_names alone the call raises ValueError
# whenever fewer than ten classes occur in y_test / y_pred. zero_division=0
# reports 0 for classes that have no predicted or observed samples.
report = classification_report(
    y_test, y_pred, labels=class_ids, target_names=class_names, zero_division=0
)
print(report)

In [None]:
# List which class ids actually occur in the observed and predicted labels.
for series_name, series in (("y_test", y_test), ("y_pred", y_pred)):
    print(f"Unique labels in {series_name}:", np.unique(series))

In [None]:
# Observed and predicted label vectors must have the same number of entries.
for series_name, series in (("y_test", y_test), ("y_pred", y_pred)):
    print(f"Length of {series_name}:", len(series))