In [None]:
import geopandas as gpd
import joblib
import numpy as np
import odc.geo  # noqa: F401
import pandas as pd
from shapely import geometry
from sklearn.ensemble import RandomForestClassifier

from utils import load_data

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules (such as
# the local `utils` module) are reloaded before each cell is executed and edits
# to them are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

## Find and load S2 data

Load the data and set up the array that will be used for prediction.

In [None]:
# Settings shared by the cells below.
chunks = {"x": 256, "y": 256}  # chunk sizes along the x and y dimensions
datetime = "2023"  # passed to load_data as its `datetime` argument

# Area of interest as [minx, miny, maxx, maxy]; the values are treated as
# EPSG:4326 coordinates when the preview frame is built below.
bbox = [177.14, -18.41, 179.80, -16.01]
bbox_geometry = geometry.box(*bbox)

# Single-row GeoDataFrame holding the bounding box, shown on an interactive
# map as a visual check of the area of interest.
gdf = gpd.GeoDataFrame(geometry=[bbox_geometry], crs="EPSG:4326")
gdf.explore()

In [None]:
# Load the data for the area of interest with the project helper `load_data`.
# The chunk sizes come from the `chunks` variable defined in the configuration
# cell (previously the same dict was re-typed here, so editing `chunks` had no
# effect on loading).
# NOTE(review): `load_data` is defined in utils and not visible here; later
# cells use `merged.odc.crs` and `merged.sel(...)`, so it presumably returns
# an xarray object with x/y coordinates - confirm against utils.
merged = load_data(bbox, chunks=chunks, datetime=datetime, resolution=10)
merged

## Train and predict

When you change your training data, you can re-train and predict here.

In [None]:
# Path to the vector file holding the training features.
training_file = "training_data/MRD_dissagregated_31.geojson"

# Read only the features that fall within the area-of-interest box, then show
# them on an interactive map.
# NOTE(review): a plain shapely geometry passed as `bbox` carries no CRS, so
# this presumably relies on the file being stored in the same CRS as `bbox`
# (EPSG:4326) - confirm.
tdata = gpd.read_file(training_file, bbox=bbox_geometry)
tdata.explore()

In [None]:
# Get values for each of the image bands at each of the points.
# The training points are first reprojected into the CRS of the loaded data so
# that their coordinates are comparable with the x/y coordinates of `merged`.
pts_proj = tdata.to_crs(merged.odc.crs)

# an xarray object with x & y coords for every point (uses geometry.x /
# geometry.y, so the training geometries are assumed to be points - TODO confirm)
pts_da = pts_proj.assign(x=pts_proj.geometry.x, y=pts_proj.geometry.y).to_xarray()

# Nearest-neighbour lookup of `merged` at every point's x/y, with length-1
# dimensions squeezed away and the result computed and converted to pandas:
# a dataframe or series (for a single point)
pt_values_i = (
    merged.sel(pts_da[["x", "y"]], method="nearest").squeeze().compute().to_pandas()
)

# A single point comes back as a Series; turn it into a one-row frame and give
# it tdata's index so it lines up with tdata when the two are concatenated in
# the next cell.
if isinstance(pt_values_i, pd.Series):
    pt_values_i = pt_values_i.to_frame().transpose()
    pt_values_i.index = tdata.index

In [None]:
# Bookkeeping / non-feature columns that must not reach the classifier.
_non_feature_columns = [
    "y",
    "x",
    "spatial_ref",
    "time",
    "fid",
    "index",
    "lulc_class",
    "path",
    "geometry",
    "layer",
    "id",
]

# Build the training table in one pass: put the sampled values next to the
# training attributes, reproject to EPSG:4326, strip the non-feature columns,
# then discard every row that still contains a NaN.
# The training cell reads column 0 of this table as the class label and the
# remaining columns as features, so the column order matters.
training_array = (
    pd.concat([tdata, pt_values_i], axis=1)
    .to_crs(4326)
    .drop(columns=_non_feature_columns)
    .dropna()
)

training_array.head()

In [None]:
# Random forest with a fixed seed so repeated runs give the same model;
# n_jobs=-1 lets scikit-learn use every available core.
classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=42,
)

# Convert the table to an array once, then split it by position:
# column 0 is the class label, every remaining column is a feature.
_training_values = np.array(training_array)
classes = _training_values[:, 0]
training_data = _training_values[:, 1:]

# fit() returns the fitted estimator itself, so `model` and `classifier`
# refer to the same object.
model = classifier.fit(training_data, classes)


In [None]:
# Persist the fitted model to "test_model.dump" in the current working
# directory so it can be loaded again later with joblib.load.
joblib.dump(model, "test_model.dump")

In [None]:
# Pair every feature column with its importance in the fitted forest and rank
# the pairs from most to least important. Column 0 of training_array is
# skipped because it is the class label, not a feature.
fields_importances = sorted(
    zip(training_array.columns[1:], classifier.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print as a two-column table: name padded to 11 characters, importance shown
# to 2 decimal places.
for column_name, importance in fields_importances:
    print(f"{column_name:<11}| {importance:.2f}")
