In [None]:
import geopandas as gpd
import joblib
import numpy as np
import odc.geo.xr  # noqa: F401
import pandas as pd
import xarray as xr
from odc.stac import load
from pystac_client import Client
from shapely import geometry
from sklearn.ensemble import RandomForestClassifier
from dask.distributed import Client as dask_client

from utils import get_image_values

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules (such as
# the local `utils` helper) are re-imported before each cell executes and edits
# to them take effect without restarting the kernel.
%reload_ext autoreload
%autoreload 2

## Load study area

Define the study area, then load the Sentinel-2 GeoMAD imagery as the data array from which values are extracted at the training points.

In [None]:
# Up-front configuration shared by the rest of the notebook
chunks = {"x": 100, "y": 100}  # chunk sizes handed to the loader
datetime = "2023"  # temporal filter used for the STAC search

# Study-area bounding box as [min x, min y, max x, max y] in lon/lat
bbox = [177.24, -18.28, 178.7, -17.27]
bbox_geometry = geometry.box(*bbox)

# Wrap the box in a GeoDataFrame so it can be previewed on an interactive map
gdf = gpd.GeoDataFrame(geometry=[bbox_geometry], crs="EPSG:4326")
gdf.explore()

In [None]:
# Open the STAC catalogue that hosts the imagery
catalog = "https://stac.staging.digitalearthpacific.org"
client = Client.open(catalog)

# Search for Sentinel-2 GeoMAD items covering the study area and period
search = client.search(collections=["dep_s2_geomad"], bbox=bbox, datetime=datetime)
items = search.items()

# Load the matching items as a chunked array, then squeeze out the "time"
# dimension (squeeze only succeeds when that dimension has length one)
loaded = load(items, chunks=chunks, bbox=bbox)
data = loaded.squeeze("time")
data


In [None]:
# GeoPackage that holds the training points
training_file = "training_data/fj_lulc_data_points_ce.gpkg"

# Read only the features that intersect the study-area box.
# NOTE(review): a bare shapely geometry carries no CRS, so this presumably
# relies on the file being stored in the same CRS as `bbox` -- confirm.
training_data = gpd.read_file(training_file, bbox=bbox_geometry)

# Preview the points over Esri satellite imagery, coloured by the "Class" column
esri_basemap = {
    "tiles": "https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}",
    "attr": "Esri",
    "name": "Esri Satellite",
}
training_data.explore(column="Class", **esri_basemap)

In [None]:
# Tally how many training points carry each value of the "Class" column
class_counts = training_data["Class"].value_counts()

# Show the tallies as a bar chart
class_counts.plot(kind="bar")

In [None]:
# Subset the training data to a smaller sample size.
# The request is capped at the number of available points, because `sample`
# raises when asked for more rows than exist (sampling without replacement),
# and the draw is seeded so that a re-run selects the same points.
sample_size = min(1000, len(training_data))
subset = training_data.sample(n=sample_size, random_state=42)

# Run `get_image_values` (from `utils`) on the sampled points and the loaded
# imagery inside a temporary dask client; the context manager closes the
# client again once the call returns.
# NOTE(review): 16 workers x 16 threads, with the memory limit applied per
# worker, may exceed what the host machine offers -- confirm the sizing.
with dask_client(n_workers=16, threads_per_worker=16, memory_limit="10GB"):
    variables = get_image_values(subset, data)

variables.head()

In [None]:
# Sort the variables by name, so they're in a consistent order
sorted_variables = variables.sort_index(axis=1)

# Build the training table in one pass:
#   1. join the class labels of the *sampled* points to their variables
#      (aligned on the index); joining the labels of every training point
#      would only add rows that are all-NaN and get dropped again below
#   2. drop the non-required coordinate/metadata columns
#   3. drop rows where there are any NaNs
#   4. sort by index so the row order does not depend on the sampling order
# NOTE(review): assumes `get_image_values` keeps the index of the points it
# was given -- confirm against `utils`.
training_array = (
    pd.concat([subset["ClassId"], sorted_variables], axis=1)
    .drop(columns=["time", "x", "y", "spatial_ref"])
    .dropna()
    .sort_index()
)

# Explore our data
training_array.head()

In [None]:
# Random forest with a fixed seed so repeated fits on the same data agree
classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=42,
)

# Convert the table to a NumPy array once, then split it by position:
# column 0 holds the class labels, every remaining column is a feature.
# The feature matrix gets its own name so the `training_data` GeoDataFrame is
# not overwritten and the earlier cells that use it can still be re-run.
training_matrix = np.array(training_array)
features = training_matrix[:, 1:]
classes = training_matrix[:, 0]

# `fit` returns the fitted estimator itself
model = classifier.fit(features, classes)

In [None]:
# Pair every feature column (everything after the label column) with its
# importance score from the fitted classifier
importance_pairs = zip(training_array.columns[1:], classifier.feature_importances_)

# Rank the pairs from most to least important
fields_importances = sorted(importance_pairs, key=lambda pair: pair[1], reverse=True)

# Print as a two-column table: name padded to 11 characters, score to
# 2 decimal places
for field, importance in fields_importances:
    print(f"{field:<11}| {importance:.2f}")


## Export the model for use in the prediction notebook

In [None]:
# Persist the fitted model to disk with joblib so another notebook can load it
model_path = "nfi_model_ce.dump"
joblib.dump(model, model_path)