In [None]:
from pystac_client import Client
from odc.stac import load

from sklearn.ensemble import RandomForestRegressor

import geopandas as gpd
import pandas as pd
import numpy as np
import xarray as xr

import folium

In [None]:
# GeoJSON of point observations; each point carries a `depth` attribute.
in_points = "nadi.geojson"
gdf = gpd.read_file(in_points)

# Interactive preview of the points, coloured by their depth value.
gdf.explore(column="depth")

In [None]:
catalog = "https://earth-search.aws.element84.com/v1"
client = Client.open(catalog)

# Bounding box of the points in lon/lat, padded by `buffer` degrees on
# every side so the imagery extends slightly beyond the outermost points.
buffer = 0.05
min_x, min_y, max_x, max_y = gdf.to_crs("epsg:4326").total_bounds
bbox = [min_x - buffer, min_y - buffer, max_x + buffer, max_y + buffer]

# Scenes intersecting the padded box, limited by date range and by the
# scene-level cloud cover property.
search = client.search(
    collections=["sentinel-2-c1-l2a"],
    bbox=bbox,
    datetime="2024-07/2024-09",
    query={"eo:cloud_cover": {"lt": 30}},
)
items = search.item_collection()

print(f"Found {len(items)} items")

In [None]:
# Lazily load the selected bands (plus the scene classification layer) for
# the padded bounding box, merging items from the same solar day.
data = load(
    items,
    chunks={},
    bbox=bbox,
    groupby="solar_day",
    measurements=[
        "red",
        "green",
        "blue",
        "nir",
        "nir09",
        "swir16",
        "swir22",
        "coastal",
        "rededge1",
        "rededge2",
        "rededge3",
        "scl"
    ]
)

# SCL classes to discard: no data (0), saturated/defective (1),
# cloud shadow (3), medium-probability cloud (8), high-probability cloud (9).
# Class 0 must be listed explicitly: `cloud_mask` is reused later to mask the
# predictions, and without it pixels with no observation would be kept.
mask_flags = [0, 1, 3, 8, 9]
cloud_mask = ~data.scl.isin(mask_flags)
masked = data.where(cloud_mask).drop_vars("scl")

# Treat remaining zeros as missing, apply the 0.0001 scale factor and clamp
# the result to [0, 1].
# NOTE(review): only a scale factor is applied here; confirm whether this
# collection also defines an additive offset for its reflectance bands.
scaled = (masked.where(masked != 0) * 0.0001).clip(0, 1)

# Materialise the lazy computation in memory.
scaled = scaled.compute()

In [None]:
# True-colour preview: one panel per time step, two panels per row.
rgb = scaled[["red", "green", "blue"]].to_array()
rgb.plot.imshow(col="time", col_wrap=2, vmin=0, vmax=0.2)

In [None]:
# Training uses the first time step only.
subset = scaled.isel(time=0)

# Put the points in the raster's CRS so their x/y match the pixel grid.
reprojected_gdf = gdf.to_crs(subset.odc.crs)
point_geometry = reprojected_gdf.geometry

# Point table as an xarray Dataset, with projected x/y as extra variables.
pts_da = gdf.assign(x=point_geometry.x, y=point_geometry.y).to_xarray()

# Nearest-pixel lookup of every band at each point, returned as a pandas
# table with one row per point.
nearest_pixels = subset.sel(pts_da[["x", "y"]], method="nearest")
pt_values_i = nearest_pixels.squeeze().compute().to_pandas()

In [None]:
# Columns that came along with the point lookup but are not model inputs.
non_feature_columns = ["y", "x", "spatial_ref", "time", "geometry"]

# Join point attributes with the sampled band values side by side, remove the
# bookkeeping columns, and keep only rows without any NaN.
training_array = (
    pd.concat([gdf, pt_values_i], axis=1)
    .drop(columns=non_feature_columns)
    .dropna()
)

training_array.head()

In [None]:
# Select features and target by name rather than by column position, so the
# split does not depend on `depth` being the first column of the table.
# The feature columns are taken in the order of `scaled.data_vars`, which is
# the same band order the prediction step gets from `to_array()`.
feature_names = list(scaled.data_vars)
training_data = training_array[feature_names].to_numpy()
values = training_array["depth"].to_numpy()

In [None]:
# Forest hyperparameters; `criterion` is not passed, so the library default
# applies. The fixed seed makes repeated fits reproducible.
rf_params = {
    "n_estimators": 1000,
    "max_depth": 10,
    "random_state": 0,
}
regr = RandomForestRegressor(**rf_params)

# `fit` hands back the estimator it was called on; keep it under `model`.
model = regr.fit(training_data, values)

In [None]:
# Predict depth for every time step, one 2-D (y, x) array per step.
predictions = []

n_rows, n_cols = scaled.sizes["y"], scaled.sizes["x"]

for i in range(scaled.sizes["time"]):
    one_time = scaled.isel(time=i)

    # Flatten the scene to a (pixel, band) table. Stacking y before x keeps
    # the pixels in row-major order, which the reshape below relies on.
    stacked_arrays = (
        one_time.to_array().stack(dims=["y", "x"]).transpose("dims", "variable")
    )
    features = stacked_arrays.values

    # Only predict where every band has a value. Masked pixels are NaN;
    # passing them to the model either raises (scikit-learn releases without
    # missing-value support) or yields a value not based on real observations.
    valid = np.isfinite(features).all(axis=1)
    p = np.full(features.shape[0], np.nan)
    if valid.any():
        p[valid] = model.predict(features[valid])

    array = p.reshape(n_rows, n_cols)
    predictions.append(xr.DataArray(
        array, coords={"x": scaled.x, "y": scaled.y}, dims=["y", "x"]
    ))

print(f"Completed predicting {len(scaled.time)} time slices")

In [None]:
# Stack the per-date predictions along the time coordinate of the imagery,
# name the variable "depth", then blank out pixels rejected by the mask.
depth_stack = xr.concat(predictions, dim=scaled.time)
predicted = depth_stack.to_dataset(name="depth").where(cloud_mask)

In [None]:
# Predicted depth, one panel per time step, two panels per row.
depth_by_time = predicted["depth"]
depth_by_time.plot.imshow(col="time", col_wrap=2, cmap="viridis")

In [None]:
# Collapse the time dimension into a single per-pixel average depth.
mean = predicted["depth"].mean(dim="time")

In [None]:
# Build the single 2-D depth surface used for the histogram below and for the
# comparison against the observed points. The previous version reshaped a
# variable (`out`) that is never defined in this notebook, so the cell failed
# on a fresh kernel; the time-mean surface is used as the source instead.
out_da = mean.transpose("y", "x").values

prediction = xr.DataArray(
    out_da, coords={"x": mean.x, "y": mean.y}, dims=["y", "x"]
).to_dataset(name="depth")

# Distribution of predicted depths; `_ =` suppresses the returned tuple.
_ = prediction.depth.plot.hist(bins=50)

In [None]:
# Score of the fitted model on the very samples it was trained on
# (closer to 1 is better). This is an in-sample figure, not a held-out one.
training_score = model.score(training_data, values)
training_score

In [None]:
# Centre the map on the middle of the padded bounding box (lat, lon order).
min_lon, min_lat, max_lon, max_lat = bbox
coords = (min_lat + max_lat) / 2, (min_lon + max_lon) / 2
m = folium.Map(location=coords, zoom_start=14, layer_control=True)

# True-colour image of the scene used for training.
visual = subset.odc.to_rgba(["red", "green", "blue"], vmin=0, vmax=0.3)
visual.odc.add_to(m, name="RGB")

# Depth for the first time step, and the time-averaged depth.
first_depth = predicted.isel(time=0).depth
first_depth.odc.add_to(m, name="Depth", cmap="Blues_r")
mean.odc.add_to(m, name="Mean Depth", cmap="Blues_r")

# Toggle for switching the layers on and off.
folium.LayerControl().add_to(m)

m

In [None]:
# Point table as an xarray Dataset with projected x/y coordinates attached.
point_geometry = reprojected_gdf.geometry
pts_da = gdf.assign(x=point_geometry.x, y=point_geometry.y).to_xarray()

# Read the predicted depth at the pixel nearest to each point and bring it
# back to pandas as a named series.
depth_at_points = prediction.depth.sel(pts_da[["x", "y"]], method="nearest")
compare_depths = (
    depth_at_points.squeeze().compute().to_pandas().rename("depth_computed")
)

# Observed and predicted depth side by side, plus their difference.
appended = pd.concat([gdf, compare_depths], axis=1)
appended["error"] = appended["depth"] - appended["depth_computed"]

appended["error"].hist(bins=20)

In [None]:
# Save the time-averaged depth surface as a Cloud Optimized GeoTIFF,
# replacing any file already present at that path.
mean.odc.write_cog(
    "depth_random_forest_nadi.tif",
    overwrite=True,
)