In [None]:
# Imports, configuration, and helpers
import os
import numpy as np
import xarray as xr
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from skimage.transform import resize
from skimage.filters import threshold_otsu
from scipy.ndimage import binary_dilation, binary_closing
import joblib
import folium

# optional rioxarray for GeoTIFF saving
# RIO_AVAILABLE records whether the import worked; the "Save outputs" cell checks it
# and writes NetCDF instead of GeoTIFF when it is False.
try:
    import rioxarray  # noqa: F401
    RIO_AVAILABLE = True
except Exception:
    RIO_AVAILABLE = False

# User configuration (edit)
# STAC endpoint opened with pystac_client in the search cell.
STAC_CATALOG = "https://earth-search.aws.element84.com/v1"
AOI_PATH = "AOI/EfateAOI.geojson"            # set to your local path or URL
# Training points read with geopandas; the label column used throughout is "randomforest".
TRAINING_PATH = "Training_Data/InvasiveClean6.geojson"
# Passed unchanged as the `datetime` argument of the STAC search.
DATETIME = "2024-05/2024-09"
SENTINEL_COLLECTION = ["sentinel-2-c1-l2a"]

# Name given to the combined water/building/road mask DataArray and its output variable.
MASK_NAME = "combined_mask"
INCLUDE_MASK_AS_FEATURE = False   # True to include mask as feature, False to only exclude pixels
# RandomForestClassifier settings (n_estimators / random_state).
RF_N_ESTIMATORS = 100
RF_RANDOM_STATE = 42
# Output file names: joblib model dump, prediction GeoTIFF, NetCDF fallback dataset.
MODEL_OUT = "rf_model.joblib"
PRED_GTIFF = "predicted.tif"
OUT_DS_NETCDF = "predicted_dataset.nc"

def info(*args):
    """Print the given values on one line, prefixed with the ``[INFO]`` tag."""
    tagged = ("[INFO]",) + args
    print(*tagged)

In [None]:
# STAC client, Dask, and robust AOI loading
from pystac_client import Client
from dask.distributed import Client as DaskClient
from odc.stac import load, configure_s3_access

# Local Dask cluster for the lazy raster work (tune the sizes to your machine)
dask_settings = {"n_workers": 1, "threads_per_worker": 8, "memory_limit": "16GB"}
dask_client = DaskClient(**dask_settings)
info("Started Dask client:", dask_client)

# S3 access for ODC readers: cloud defaults, requester-pays enabled
configure_s3_access(cloud_defaults=True, requester_pays=True)
info("Configured S3 access")

# Robust AOI load: try a local path, then a URL, else build a small bbox fallback
def load_aoi(aoi_path, fallback_bounds=(168.2, -17.6, 168.7, -17.1)):
    """Load the area of interest as a GeoDataFrame.

    Resolution order:
      1. ``aoi_path`` exists on disk -> read it with geopandas.
      2. ``aoi_path`` starts with ``"http"`` -> read it as a URL.
      3. Otherwise (or if the URL read fails) -> a single-polygon GeoDataFrame
         covering ``fallback_bounds`` in EPSG:4326.

    Parameters
    ----------
    aoi_path : str or os.PathLike
        Local file path or URL of the AOI vector file.
    fallback_bounds : tuple of float, optional
        ``(minx, miny, maxx, maxy)`` in lon/lat used when the AOI cannot be loaded.

    Returns
    -------
    geopandas.GeoDataFrame
        The AOI read from ``aoi_path``, or the fallback bbox polygon.
    """
    # Normalise once so Path objects work for both the filesystem and the URL check
    location = str(aoi_path)
    if Path(location).exists():
        return gpd.read_file(location)

    if location.startswith("http"):
        try:
            return gpd.read_file(location)
        except Exception as exc:
            # Best effort: keep going to the fallback, but say why the URL read failed
            info("Could not read AOI from URL", location, "; reason:", exc)

    # fallback: create a simple bbox AOI (pass fallback_bounds to change the coords)
    info("AOI not found at", aoi_path)
    from shapely.geometry import box
    minx, miny, maxx, maxy = fallback_bounds
    aoi = gpd.GeoDataFrame({"id": [1]}, geometry=[box(minx, miny, maxx, maxy)], crs="EPSG:4326")
    info("Using fallback bbox AOI")
    return aoi

aoi_gdf = load_aoi(AOI_PATH)
info("AOI CRS:", aoi_gdf.crs)
aoi_gdf.plot(edgecolor="red", facecolor="none")
plt.title("AOI check")
plt.show()

# The bbox is handed to the STAC search and to odc.stac.load, both of which expect
# lon/lat. Reproject first so an AOI stored in a projected CRS still yields a valid bbox.
# An AOI without CRS metadata cannot be reprojected and is used as-is.
if aoi_gdf.crs is None:
    info("AOI has no CRS; assuming its coordinates are already lon/lat")
    aoi_lonlat = aoi_gdf
else:
    aoi_lonlat = aoi_gdf.to_crs(epsg=4326)
bbox = aoi_lonlat.total_bounds  # [minx, miny, maxx, maxy] in lon/lat
info("BBox:", bbox)

In [None]:
# Load training points (only those intersecting the AOI bounding box)
if not (Path(TRAINING_PATH).exists() or TRAINING_PATH.startswith("http")):
    raise FileNotFoundError(f"Training file not found: {TRAINING_PATH}")

gdf = gpd.read_file(TRAINING_PATH, bbox=tuple(bbox))
info("Loaded training points:", len(gdf))
if gdf.empty:
    # Not fatal here, but every later sampling/training step needs at least one point
    info("Warning: no training points fall inside the AOI bounding box")

# quick interactive check
# A map created inside a try-block is not the cell's last expression, so it has to be
# displayed explicitly. Outside IPython `display` is undefined; the resulting NameError
# lands in the fallback below.
try:
    display(gdf.explore(column="randomforest", legend=True))
except Exception as exc:
    info("gdf.explore not available; here's head():", exc)
    print(gdf.head())

In [None]:
# Query the STAC catalogue for low-cloud scenes over the AOI, then load them lazily with ODC
client = Client.open(STAC_CATALOG)
search = client.search(
    collections=SENTINEL_COLLECTION,
    bbox=bbox,
    datetime=DATETIME,
    query={"eo:cloud_cover": {"lt": 25}},
)
items = search.item_collection()
info("Found STAC items:", len(items))

# Spectral bands used downstream plus the scene-classification layer (scl) for cloud masking
band_names = ["red", "green", "blue", "nir08", "swir16", "scl"]
chunk_sizes = {"x": 2048, "y": 2048}
data = load(items, measurements=band_names, bbox=bbox, chunks=chunk_sizes, groupby="solar_day")
info("Loaded data variables:", list(data.data_vars))
data

In [None]:
# cloud masking, scaling, indices, and median composite
mask_flags = [1, 3, 9, 10]  # SCL values to remove
cloud_mask = ~data.scl.isin(mask_flags)

# SCL is a categorical layer that is only needed to build the mask. Drop it before
# scaling so a meaningless "median of class codes * 0.0001" band does not end up in
# `median` and therefore in the classifier's feature stack.
masked = data.drop_vars("scl").where(cloud_mask)

# scale to 0-1 and clip (zeros are treated as nodata)
# NOTE(review): some Sentinel-2 L2A collections also declare a reflectance offset in
# their asset metadata; confirm whether one applies to this collection.
scaled = (masked.where(masked != 0) * 0.0001).clip(0, 1)

# add NDVI for reference
scaled["ndvi"] = (scaled.nir08 - scaled.red) / (scaled.nir08 + scaled.red)

info("Computing median composite (this may take a while)...")
median = scaled.median("time").compute()
info("Median computed. Variables:", list(median.data_vars))

# quick visual check
# The map is not the cell's last expression, so display it explicitly. If `display`
# or the odc helper is missing, the except-branch reports it.
try:
    display(median.odc.explore(vmin=0, vmax=0.3))
except Exception as exc:
    info("median.odc.explore unavailable in this environment; reason:", exc)

In [None]:
# Spectral indices -> water / building / road masks -> one combined exclusion mask
green, red = median["green"], median["red"]
nir, swir = median["nir08"], median["swir16"]

# Normalised-difference indices computed on the median composite
ndwi = (green - nir) / (green + nir)
ndbi = (swir - nir) / (swir + nir)
ndbai = (swir - red) / (swir + red)

# Finite NDBI samples: used for the percentile report, Otsu, and the histogram below
ndbi_vals = ndbi.values.flatten()
ndbi_vals = ndbi_vals[np.isfinite(ndbi_vals)]
info("NDBI percentiles (0,5,50,95,100):", np.nanpercentile(ndbi_vals, [0,5,50,95,100]))

# NDBI threshold: default 0.1, replaced by Otsu's value when there are enough samples
ndbi_thr = 0.1
try:
    if len(ndbi_vals) <= 2000:
        info("Too few samples for Otsu; using default", ndbi_thr)
    else:
        ndbi_thr = float(threshold_otsu(ndbi_vals))
        info("Otsu selected NDBI threshold:", ndbi_thr)
except Exception as e:
    info("Otsu failed, using default NDBI threshold", ndbi_thr, "; reason:", e)

# Fixed thresholds for water (NDWI) and roads (NDBAI)
ndwi_thr = 0.2
ndbai_thr = 0.15

water_mask = ndwi > ndwi_thr
building_mask = ndbi > ndbi_thr
road_mask = (ndbai > ndbai_thr) & (ndwi < 0) & (ndbi < 0.2)

# Grow the building mask by one 3x3 dilation, then close small gaps, to catch settlement edges
y_dim, x_dim = ndwi.dims[-2], ndwi.dims[-1]
kernel = np.ones((3,3))
bm_np = building_mask.values.astype(bool)
bm_np = binary_dilation(bm_np, structure=kernel, iterations=1)
bm_np = binary_closing(bm_np, structure=kernel, iterations=1)
building_mask_expanded = xr.DataArray(
    bm_np,
    coords=[median[y_dim], median[x_dim]],
    dims=(y_dim, x_dim),
)

# 1 = excluded (water, expanded buildings, or roads); 0 = keep
combined_mask_da = (water_mask | building_mask_expanded | road_mask).astype("uint8")
combined_mask_da.name = MASK_NAME
info("Combined mask counts (value:count):", np.unique(combined_mask_da.values, return_counts=True))

# Side-by-side check: the mask itself and the NDBI histogram with the chosen threshold
plt.figure(figsize=(10,4))
plt.subplot(1,2,1)
plt.title("combined_mask (0=keep,1=masked)")
plt.imshow(combined_mask_da.values, cmap="gray")
plt.axis("off")
plt.subplot(1,2,2)
plt.title("ndbi histogram")
plt.hist(ndbi_vals, bins=120)
plt.axvline(ndbi_thr, color='r')
plt.show()

In [None]:
# Either keep the mask aside for filtering, or attach it to `median` as an extra band
if not INCLUDE_MASK_AS_FEATURE:
    mask_da = combined_mask_da
    info("Mask will be used to exclude pixels but not included as feature.")
else:
    mask_band = combined_mask_da.astype("int8")
    median = median.assign(**{MASK_NAME: mask_band})
    info("Mask added as band to median; variables now:", list(median.data_vars))

In [None]:
# sample median bands at training points, filter by mask, drop NaNs
gdf_pts = gdf.to_crs(median.odc.geobox.crs)
gx = gdf_pts.geometry.x.values
gy = gdf_pts.geometry.y.values
info("Training points reprojected to raster CRS:", median.odc.geobox.crs)

arr = median.to_array()  # dims: variable, y, x (inspect if different)
info("arr dims:", arr.dims)
y_dim, x_dim = arr.dims[-2], arr.dims[-1]
points_dim = "points"

# Vectorized (pointwise) sampling: both indexers share the `points` dimension, so
# .sel() returns one value per point instead of the y-by-x outer product.
gx_da = xr.DataArray(gx, dims=points_dim)
gy_da = xr.DataArray(gy, dims=points_dim)
sampled = arr.sel({y_dim: gy_da, x_dim: gx_da}, method="nearest")
info("sampled dims after sel:", sampled.dims)

# squeeze singleton dims if present
sampled = sampled.squeeze(drop=True)
info("sampled dims after squeeze:", sampled.dims, "ndim:", sampled.ndim)

# If still not 2D, fallback to explicit nearest-index lookup
if sampled.ndim == 2:
    sampled_df = sampled.transpose(points_dim, "variable").to_pandas()
    sampled_df = pd.DataFrame(sampled_df).reset_index(drop=True)
else:
    info("Vectorized sampling produced ndim != 2; falling back to explicit nearest-index sampling (slower).")
    y_coords = arr.coords[y_dim].values
    x_coords = arr.coords[x_dim].values
    iy = np.array([np.abs(y_coords - yy).argmin() for yy in gy])
    ix = np.array([np.abs(x_coords - xx).argmin() for xx in gx])
    vals = np.stack([arr.isel({y_dim: iy_k, x_dim: ix_k}).values.ravel() for iy_k, ix_k in zip(iy, ix)], axis=0)
    var_names = list(arr.coords["variable"].values)
    sampled_df = pd.DataFrame(vals, columns=var_names).reset_index(drop=True)

# Combine labels + features (first column = label, remaining columns = predictors)
training_df = pd.concat([gdf_pts.reset_index(drop=True)["randomforest"], sampled_df.reset_index(drop=True)], axis=1)

# Filter out training points that fall in masked areas (if not including mask as a feature)
if not INCLUDE_MASK_AS_FEATURE:
    # Use the DataArray indexers here as well: plain numpy arrays would trigger
    # orthogonal indexing and return an (n_points, n_points) grid, which cannot be
    # used as a per-row boolean filter.
    mask_at_pts = combined_mask_da.sel({y_dim: gy_da, x_dim: gx_da}, method="nearest").values
    keep = (mask_at_pts == 0)
    training_df = training_df.loc[keep].reset_index(drop=True)
    info(f"Kept {training_df.shape[0]} training points after mask filtering (out of {len(gx)})")

# Drop NaNs (points on cloud-only / nodata pixels)
training_df = training_df.dropna()
info("Training rows after dropna:", len(training_df))
training_df.head()

In [None]:
# Fit the RandomForest: first column of training_df holds labels, the rest are features
classes = training_df.iloc[:, 0].values
observations = training_df.iloc[:, 1:].values
info("Observations shape:", observations.shape)

rf_settings = {
    "n_estimators": RF_N_ESTIMATORS,
    "random_state": RF_RANDOM_STATE,
    "n_jobs": -1,
}
clf = RandomForestClassifier(**rf_settings)
clf.fit(observations, classes)
info("RandomForest trained")

# Persist the fitted model to disk
joblib.dump(clf, MODEL_OUT)
info("Saved model to", MODEL_OUT)

In [None]:
# Flatten the composite to a (pixels x variable) table, classify valid pixels, reshape back
arr = median.to_array()  # variable, y, x
y_dim, x_dim = arr.dims[-2], arr.dims[-1]
ny = arr.sizes[y_dim]
nx = arr.sizes[x_dim]
stacked = arr.stack(pixels=(y_dim, x_dim))
stacked = stacked.transpose("pixels", "variable")  # pixels x variable
X = stacked.values
info("Full predictor array shape:", X.shape)

# A pixel is usable only if none of its features is NaN
valid_data_mask = ~np.isnan(X).any(axis=1)

# When the mask is not a feature, masked pixels are skipped entirely
if INCLUDE_MASK_AS_FEATURE:
    predict_mask = valid_data_mask
else:
    mask_flat = combined_mask_da.stack(pixels=(y_dim, x_dim)).values.astype(bool)
    predict_mask = valid_data_mask & (~mask_flat)

info("Pixels to predict:", int(predict_mask.sum()), "out of", X.shape[0])

# Unpredicted pixels stay NaN
pred_flat = np.full(X.shape[0], np.nan, dtype=np.float32)
if predict_mask.any():
    pred_flat[predict_mask] = clf.predict(X[predict_mask]).astype(np.float32)

pred_2d = pred_flat.reshape(ny, nx)
predicted_da = xr.DataArray(
    pred_2d,
    coords={y_dim: median[y_dim], x_dim: median[x_dim]},
    dims=(y_dim, x_dim),
)
predicted_da.name = "predicted_class"
info("Prediction finished, dtype:", predicted_da.dtype)

In [None]:
# Blank out water/settlement/road pixels in the prediction for visualisation
unmasked_pixels = ~combined_mask_da.astype(bool)
predicted_masked = predicted_da.where(unmasked_pixels, other=np.nan)
predicted_masked.name = "predicted_masked"
info("Applied combined mask to predicted results.")

# Bundle raw prediction, masked prediction and the mask into one output dataset
output_vars = {
    "predicted": predicted_da,
    "predicted_masked": predicted_masked,
    MASK_NAME: combined_mask_da,
}
out_ds = xr.Dataset(output_vars)
info("Output dataset prepared with variables:", list(out_ds.data_vars))

In [None]:
# Save outputs: GeoTIFF when rioxarray is importable, NetCDF otherwise (or on failure)
if RIO_AVAILABLE:
    try:
        # out_ds["predicted"] builds a fresh DataArray on every access, so an in-place
        # write_crs on it would be lost before to_raster runs. Keep the CRS-tagged
        # copy in a variable and write that one.
        predicted_with_crs = out_ds["predicted"].rio.write_crs(median.odc.geobox.crs)
        predicted_with_crs.rio.to_raster(PRED_GTIFF)
        info("Saved predicted GeoTIFF to", PRED_GTIFF)
    except Exception as e:
        info("Could not save GeoTIFF:", e)
        out_ds.to_netcdf(OUT_DS_NETCDF)
        info("Saved NetCDF to", OUT_DS_NETCDF)
else:
    out_ds.to_netcdf(OUT_DS_NETCDF)
    info("rioxarray not available; saved NetCDF to", OUT_DS_NETCDF)

# The model itself was already saved in the training cell; to also keep the full
# dataset as NetCDF alongside the GeoTIFF:
# out_ds.to_netcdf("out_with_mask.nc")

In [None]:
# Interactive map (folium)
# compute center in WGS84 from the training points; fall back to the AOI centroid
try:
    gdf_wgs84 = gdf_pts.to_crs(epsg=4326)
    center = [float(gdf_wgs84.geometry.y.mean()), float(gdf_wgs84.geometry.x.mean())]
    if not np.all(np.isfinite(center)):
        # e.g. no training points: the mean is NaN and folium cannot centre on it
        raise ValueError("training points give a non-finite map centre")
except Exception as exc:
    info("Could not centre map on training points; using AOI centroid. Reason:", exc)
    gdf_wgs84 = None
    aoi_wgs84 = aoi_gdf.to_crs(epsg=4326)  # reproject once, reuse for both coordinates
    aoi_centroids = aoi_wgs84.geometry.centroid
    center = [float(aoi_centroids.y.mean()), float(aoi_centroids.x.mean())]

m = folium.Map(location=center, zoom_start=11)

# Each layer is best-effort: a failure is reported (with its reason) and the map is
# still built from whatever layers did work.
try:
    median.odc.to_rgba(vmin=0, vmax=0.3).odc.add_to(m, name="Median Composite")
except Exception as exc:
    info("ODC RGBA helper not available for median layer; reason:", exc)

try:
    predicted_masked.odc.add_to(m, name="Predicted (masked)")
except Exception as exc:
    info("ODC helper not available for predicted layer; consider exporting GeoTIFF and adding as tile layer; reason:", exc)

# mask overlay (semi-transparent red for masked areas)
try:
    combined_mask_da.astype("uint8").odc.to_rgba(palette=["none", "red"], alpha=0.3).odc.add_to(m, name="Combined Mask")
except Exception as exc:
    info("ODC helper not available for mask display; reason:", exc)

# Add training points (converted to WGS84); gdf_wgs84 is None when reprojection failed above
try:
    gdf_wgs84.explore(m=m, column="randomforest", legend=True, name="Training Data")
except Exception as exc:
    info("Could not add training points to map via .explore(); reason:", exc)

folium.LayerControl().add_to(m)
m

In [None]:
# Cell A: sanity check on the prediction layers: value counts and number of NaN pixels
import numpy as np

# Raw (unmasked) prediction
if "predicted_da" in globals():
    print("predicted_da dtype:", predicted_da.dtype)
    # NaN cannot be counted by np.unique in a useful way, so map it to a sentinel first
    vals, counts = np.unique(np.nan_to_num(predicted_da.values, nan=-9999), return_counts=True)
    print("Unique values (nan replaced by -9999):")
    for v, c in zip(vals, counts):
        print(f"  {v}: {c}")
else:
    print("predicted_da not found. Do you have `predicted_da` variable? (run the prediction cell first)")

# Masked prediction
if "predicted_masked" not in globals():
    print("predicted_masked not present yet.")
else:
    vals_m, counts_m = np.unique(np.nan_to_num(predicted_masked.values, nan=-9999), return_counts=True)
    print("\npredicted_masked unique values (nan->-9999):")
    for v, c in zip(vals_m, counts_m):
        print(f"  {v}: {c}")
    # NaN here means "masked or never predicted"
    n_masked = np.sum(np.isnan(predicted_masked.values))
    print(f"\nMasked pixels in predicted_masked (NaN): {int(n_masked)}")

In [None]:
# Cell A (repeat): same value-count / NaN checks as the preceding "Cell A" cell.
# NOTE(review): this cell repeats the previous one and can probably be deleted.
import numpy as np

have_raw = "predicted_da" in globals()
have_masked = "predicted_masked" in globals()

if not have_raw:
    print("predicted_da not found. Do you have `predicted_da` variable? (run the prediction cell first)")
else:
    print("predicted_da dtype:", predicted_da.dtype)
    raw_filled = np.nan_to_num(predicted_da.values, nan=-9999)  # NaN -> sentinel for counting
    vals, counts = np.unique(raw_filled, return_counts=True)
    print("Unique values (nan replaced by -9999):")
    for v, c in zip(vals, counts):
        print(f"  {v}: {c}")

if have_masked:
    masked_filled = np.nan_to_num(predicted_masked.values, nan=-9999)
    vals_m, counts_m = np.unique(masked_filled, return_counts=True)
    print("\npredicted_masked unique values (nan->-9999):")
    for v, c in zip(vals_m, counts_m):
        print(f"  {v}: {c}")
    # total NaN pixels in the masked layer
    n_masked = np.sum(np.isnan(predicted_masked.values))
    print(f"\nMasked pixels in predicted_masked (NaN): {int(n_masked)}")
else:
    print("predicted_masked not present yet.")

In [None]:
# Cell C: static three-panel figure: median RGB, raw prediction, combined mask
import matplotlib.pyplot as plt
import numpy as np

# RGB composite from the median bands, clipped to the 0-1 range imshow expects for floats
rgb_display = None
if all(k in median for k in ("red","green","blue")):
    r = median["red"].values
    g = median["green"].values
    b = median["blue"].values
    rgb = np.dstack([r, g, b])
    rgb_display = np.clip(rgb, 0, 1)

fig, axs = plt.subplots(1, 3, figsize=(15,5))

# Panel 1: RGB
if rgb_display is None:
    axs[0].text(0.5,0.5,"RGB not available", ha="center")
else:
    axs[0].imshow(rgb_display)
axs[0].set_title("RGB median")

# Panel 2: raw prediction (NaN shown as -1)
if "predicted_da" not in globals():
    axs[1].text(0.5,0.5,"predicted_da missing", ha="center")
else:
    im = axs[1].imshow(np.nan_to_num(predicted_da.values, nan=-1), cmap="viridis")
    axs[1].set_title("predicted_da (numeric classes)")
    plt.colorbar(im, ax=axs[1], fraction=0.046, pad=0.04)

# Panel 3: combined mask
if "combined_mask_da" not in globals():
    axs[2].text(0.5,0.5,"combined_mask_da missing", ha="center")
else:
    axs[2].imshow(combined_mask_da.values, cmap="gray")
    axs[2].set_title("combined_mask (1=masked)")

for ax in axs:
    ax.axis("off")
plt.show()

In [None]:
# Cell D: evaluate the prediction at the training point locations: per-class counts of
# masked / correctly predicted points, plus a classification report.
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

if "gdf" in globals():
    # Put the points in the raster CRS; if that fails, use them as they are
    try:
        gdf_pts = gdf.to_crs(median.odc.geobox.crs)
    except Exception:
        gdf_pts = gdf

    gx = gdf_pts.geometry.x.values
    gy = gdf_pts.geometry.y.values

    if "predicted_da" in globals():
        # Pointwise nearest-pixel lookup: both indexers share the "pts" dimension
        import xarray as xr
        pts = "pts"
        gx_da = xr.DataArray(gx, dims=pts)
        gy_da = xr.DataArray(gy, dims=pts)
        pred_indexers = {predicted_da.dims[-2]: gy_da, predicted_da.dims[-1]: gx_da}
        sampled_pred = predicted_da.sel(pred_indexers, method="nearest").values

        # Mask value under each point (all False when no mask is available)
        if "combined_mask_da" in globals():
            mask_indexers = {combined_mask_da.dims[-2]: gy_da, combined_mask_da.dims[-1]: gx_da}
            sampled_mask = combined_mask_da.sel(mask_indexers, method="nearest").values.astype(bool)
        else:
            sampled_mask = np.zeros_like(sampled_pred, dtype=bool)

        # Label column is assumed to be 'randomforest'; adjust if your data differs
        lab_col = "randomforest"
        if lab_col in gdf_pts.columns:
            y_true = gdf_pts[lab_col].values
            # If labels are strings while predictions are numeric, a mapping is needed first
            print("Unique training labels:", np.unique(y_true))
            print("Unique predictions at training points (nan shown for missing):", np.unique(sampled_pred[~np.isnan(sampled_pred)]))

            # One row per training point: true label, predicted value, masked flag
            import pandas as pd
            dfpts = pd.DataFrame({
                "true": y_true,
                "pred": sampled_pred,
                "masked": sampled_mask
            })

            def _count_matches(pred_series):
                # number of points in this group whose prediction equals the true label
                return np.sum(pred_series == dfpts.loc[pred_series.index,"true"])

            summary = dfpts.groupby("true").agg(
                total=("true","size"),
                masked=("masked","sum"),
                predicted_same=("pred", _count_matches),
            )
            print("\nPer-class summary at training point locations:")
            print(summary)

            # Report only where a prediction exists (NaN = masked / not predicted)
            valid_idx = ~np.isnan(sampled_pred)
            if valid_idx.sum() > 0:
                try:
                    print("\nClassification report (at training points):")
                    print(classification_report(y_true[valid_idx], sampled_pred[valid_idx]))
                except Exception as e:
                    print("Could not print classification report (maybe labels mismatch); here's confusion matrix:")
                    print(confusion_matrix(y_true[valid_idx], sampled_pred[valid_idx]))
            else:
                print("No valid predictions at training points (all NaN).")
        else:
            print(f"Training label column '{lab_col}' not found in gdf. Columns: {list(gdf_pts.columns)}")
    else:
        print("predicted_da not found (run prediction step)")
else:
    print("gdf (training GeoDataFrame) not found")

In [None]:
# Cell E: Sensitivity sweep across candidate thresholds to see masked area %
# The sweep is read-only with respect to the pipeline: the loop uses its own variable
# names so it does not overwrite ndbi_thr / ndwi_thr / ndbai_thr or the water, building
# and road masks that the masking cell produced.
# Note: the percentages here do not include the building-mask dilation/closing step.
import numpy as np

# compute ndwi/ndbi/ndbai arrays if not in workspace
ndwi = (median["green"] - median["nir08"]) / (median["green"] + median["nir08"])
ndbi = (median["swir16"] - median["nir08"]) / (median["swir16"] + median["nir08"])
ndbai = (median["swir16"] - median["red"]) / (median["swir16"] + median["red"])

y_dim, x_dim = ndwi.dims[-2], ndwi.dims[-1]
ndbi_vals = ndbi.values.flatten()
ndbi_vals = ndbi_vals[np.isfinite(ndbi_vals)]

# Sweeps
ndbi_thr_candidates = np.linspace(np.percentile(ndbi_vals,5), np.percentile(ndbi_vals,95), 9)
ndwi_thr_candidates = [0.1, 0.15, 0.2, 0.25]
ndbai_thr_candidates = [0.1, 0.15, 0.2]

# Part of the road rule that does not depend on any swept threshold
road_precondition = (ndwi < 0) & (ndbi < 0.2)

print("Testing combinations (ndbi, ndwi, ndbai) -> %masked")
for ndbi_cand in ndbi_thr_candidates:
    building_cand = ndbi > ndbi_cand
    for ndwi_cand in ndwi_thr_candidates:
        water_or_building = (ndwi > ndwi_cand) | building_cand
        for ndbai_cand in ndbai_thr_candidates:
            road_cand = (ndbai > ndbai_cand) & road_precondition
            combined = (water_or_building | road_cand).values.astype(bool)
            pct_masked = 100.0 * combined.sum() / combined.size
            print(f"ndbi={ndbi_cand:.3f}, ndwi={ndwi_cand:.2f}, ndbai={ndbai_cand:.2f} -> masked={pct_masked:.2f}%")
    print("------")

In [None]:
# Cell F: Quick recipes to try: pick one and rerun the prediction cell afterwards

# Option 1: Mask only water (keep settlements and roads unmasked)
mask_only_water = ( ( (median["green"] - median["nir08"]) / (median["green"] + median["nir08"]) ) > 0.2 )
mask_only_water = mask_only_water.astype("uint8")
print("Mask only water: masked% =", 100*mask_only_water.values.sum()/mask_only_water.size)

# Option 2: Reduce NDBI threshold (try 0.05 instead of 0.1)
ndbi_thr_try = 0.05
building_mask_lo = ( (median["swir16"] - median["nir08"]) / (median["swir16"] + median["nir08"]) ) > ndbi_thr_try
# The later "sample mask at training points" and "retrain with mask as feature" cells
# read this mask as `combined_mask_try`, so define it under that name. `combined_try`
# is kept as an alias for anything still using the shorter name.
combined_mask_try = (mask_only_water | building_mask_lo).astype("uint8")
combined_try = combined_mask_try
print("Mask water + building (ndbi 0.05) masked% =", 100*combined_mask_try.values.sum()/combined_mask_try.size)

# Option 3: Reduce morphological dilation (if you used dilation iterations>1)
# If you used bm_np dilation iterations=1 earlier, try removing dilation or set iterations=0.

# Option 4: Use mask as a predictor instead of excluding pixels
print("To use the mask as a predictor, set INCLUDE_MASK_AS_FEATURE = True and re-run training/prediction cells.")

In [None]:
# Cell 1: re-mask the existing prediction with a water-only mask (no retraining).
# Needs `median` and `predicted_da` in memory.
import numpy as np
import matplotlib.pyplot as plt

# Water = NDWI above 0.2; 0=keep, 1=water(mask)
ndwi = (median["green"] - median["nir08"]) / (median["green"] + median["nir08"])
mask_only_water = (ndwi > 0.2).astype("uint8")
mask_only_water.name = "mask_only_water"
pct_water = 100.0 * mask_only_water.values.sum() / mask_only_water.size
print("Mask-only-water percent masked: {:.4f}%".format(pct_water))

# Water pixels become NaN; everything else keeps its predicted class
not_water = ~mask_only_water.astype(bool)
predicted_masked_water = predicted_da.where(not_water, other=np.nan)
predicted_masked_water.name = "predicted_masked_water"

# Value counts (NaN mapped to a sentinel so it is counted as one value)
vals, counts = np.unique(np.nan_to_num(predicted_masked_water.values, nan=-9999), return_counts=True)
print("Unique values in predicted_masked_water (nan->-9999):")
for v, c in zip(vals, counts):
    print(v, c)

# Quick-look plot (NaN drawn as -1)
plt.figure(figsize=(8,6))
plt.title("Predicted (masked only water)")
plt.imshow(np.nan_to_num(predicted_masked_water.values, nan=-1))
plt.colorbar()
plt.axis("off")
plt.show()

In [None]:
# Sample `combined_mask_try` under each training point and summarise masking per class
import numpy as np
import pandas as pd
import xarray as xr

# Training points in the raster CRS
gdf_pts = gdf.to_crs(median.odc.geobox.crs)
gx = gdf_pts.geometry.x.values
gy = gdf_pts.geometry.y.values
n_points = len(gx)
print("n_points:", n_points)

# Indexers sharing one "points" dimension so .sel() can return one value per point
points_dim = "points"
gx_da = xr.DataArray(gx, dims=points_dim)
gy_da = xr.DataArray(gy, dims=points_dim)

# Nearest-pixel mask value for every point
mask_indexers = {combined_mask_try.dims[-2]: gy_da, combined_mask_try.dims[-1]: gx_da}
mask_sel = combined_mask_try.sel(mask_indexers, method="nearest")
print("mask_sel.dims:", mask_sel.dims, "mask_sel.shape:", mask_sel.shape)

# Drop singleton dims and move to numpy
mask_at_pts = mask_sel.squeeze(drop=True).values

# Unexpected multi-dimensional result: try to recover one value per point
if mask_at_pts.ndim > 1:
    shapes = mask_at_pts.shape
    print("mask_at_pts.ndim > 1; shapes:", shapes)
    # first axis whose length equals the number of points, if any
    axis_with_points = next((i for i, s in enumerate(shapes) if s == n_points), None)
    if axis_with_points is None:
        # last resort: flatten to 1D but warn (this likely misaligns points)
        mask_at_pts = mask_at_pts.ravel()
        print("Warning: mask_at_pts had no axis matching n_points; flattened array length:", mask_at_pts.size)
    else:
        # bring the points axis to the front, then keep the first element along every
        # other axis (covers shapes like (n_points, 1) or (1, n_points))
        mask_at_pts = np.moveaxis(mask_at_pts, axis_with_points, 0)
        while mask_at_pts.ndim > 1:
            mask_at_pts = mask_at_pts[:, 0]
        mask_at_pts = mask_at_pts.ravel()

# Final shape must be 1-D with one entry per point
mask_at_pts = np.asarray(mask_at_pts).ravel()
print("Final mask_at_pts.shape:", mask_at_pts.shape)

if mask_at_pts.shape[0] != n_points:
    raise ValueError(f"mask_at_pts length ({mask_at_pts.shape[0]}) does not match number of points ({n_points}). "
                     "Check the selection dims and that combined_mask_try has spatial dims (y,x) compatible with median.")

# Per-class totals and number of masked points
dfpts = pd.DataFrame({"true": gdf_pts["randomforest"].values, "masked": mask_at_pts.astype(bool)})
per_class = dfpts.groupby("true")
summary = per_class.agg(total=("true", "size"), masked=("masked", "sum"))
print("Per-class training-point masking summary:")
print(summary)

In [None]:
# Cell 3: retrain the RandomForest with the mask as an extra predictor (instead of
# excluding masked pixels), then predict over the whole image.
# Runs entirely in memory; expects gx, gy, gdf_pts and combined_mask_try from earlier cells.

# 1) Attach the chosen mask (combined_mask_try here; combined_mask_da works too) as int8 band.
median_with_mask = median.assign(combined_mask = combined_mask_try.astype("int8"))

# 2) Sample every band (including the mask) at the training points
arr = median_with_mask.to_array()  # dims: variable, y, x
row_dim, col_dim = arr.dims[-2], arr.dims[-1]
points_dim = "points"
gx_da = xr.DataArray(gx, dims=points_dim)
gy_da = xr.DataArray(gy, dims=points_dim)
sampled = arr.sel({row_dim: gy_da, col_dim: gx_da}, method="nearest")
sampled = sampled.squeeze(drop=True)
if sampled.ndim != 2:
    # fallback nearest-index loop (should be rare)
    y_coords = arr.coords[row_dim].values
    x_coords = arr.coords[col_dim].values
    iy = np.array([np.abs(y_coords - yy).argmin() for yy in gy])
    ix = np.array([np.abs(x_coords - xx).argmin() for xx in gx])
    vals = np.stack(
        [arr.isel({row_dim: iy_k, col_dim: ix_k}).values.ravel() for iy_k, ix_k in zip(iy, ix)],
        axis=0,
    )
    var_names = list(arr.coords["variable"].values)
    sampled_df = pd.DataFrame(vals, columns=var_names)
else:
    sampled_df = sampled.transpose(points_dim, "variable").to_pandas()

# Labels in the first column, features after; rows with any NaN are dropped
label_series = gdf_pts.reset_index(drop=True)["randomforest"]
feature_frame = pd.DataFrame(sampled_df).reset_index(drop=True)
training_df2 = pd.concat([label_series, feature_frame], axis=1)
training_df2 = training_df2.dropna()
print("Training rows available for retrain (mask as feature):", len(training_df2))

# 3) Fit a second forest with the same hyper-parameters
from sklearn.ensemble import RandomForestClassifier
clf2 = RandomForestClassifier(n_estimators=RF_N_ESTIMATORS, random_state=RF_RANDOM_STATE, n_jobs=-1)
y_train = training_df2.iloc[:,0].values
X_train = training_df2.iloc[:,1:].values
clf2.fit(X_train, y_train)
print("Retrained RF with mask as feature")

# 4) Predict every pixel that has no NaN feature; others stay NaN
arr_full = median_with_mask.to_array().stack(pixels=(row_dim, col_dim)).transpose("pixels","variable")
X_full = arr_full.values
valid_mask_full = ~np.isnan(X_full).any(axis=1)
pred_flat2 = np.full(X_full.shape[0], np.nan, dtype=np.float32)
if valid_mask_full.any():
    pred_flat2[valid_mask_full] = clf2.predict(X_full[valid_mask_full]).astype(np.float32)
pred_2d_2 = pred_flat2.reshape(median[row_dim].size, median[col_dim].size)
predicted_da_with_mask_feature = xr.DataArray(
    pred_2d_2,
    coords={row_dim: median[row_dim], col_dim: median[col_dim]},
    dims=(row_dim, col_dim),
)
predicted_da_with_mask_feature.name = "predicted_with_mask_feature"
finite_predictions = predicted_da_with_mask_feature.values[~np.isnan(predicted_da_with_mask_feature.values)]
print("Predicted with mask included as feature. Unique values:", np.unique(finite_predictions))

In [None]:
# Per-class masking summary: value of combined_mask_try under each training point.
# Reuses gdf_pts when it already exists, otherwise reprojects the training points.
import numpy as np
import pandas as pd
import xarray as xr

try:
    gdf_pts
except NameError:
    gdf_pts = gdf.to_crs(median.odc.geobox.crs)

gx = gdf_pts.geometry.x.values
gy = gdf_pts.geometry.y.values
n_points = len(gx)
print("n_points:", n_points)

# Diagnostics: what the mask looks like before sampling
print("combined_mask_try dims:", combined_mask_try.dims, "shape:", combined_mask_try.shape)

# Pointwise selection through indexers that share the 'points' dimension
points_dim = "points"
gx_da = xr.DataArray(gx, dims=points_dim)
gy_da = xr.DataArray(gy, dims=points_dim)

mask_row_dim = combined_mask_try.dims[-2]
mask_col_dim = combined_mask_try.dims[-1]
mask_sel = combined_mask_try.sel({mask_row_dim: gy_da, mask_col_dim: gx_da}, method="nearest")
print("mask_sel.dims:", mask_sel.dims, "mask_sel.shape:", mask_sel.shape)

# Remove singleton dims
mask_vals = mask_sel.squeeze(drop=True).values
print("After squeeze, mask_vals.shape:", getattr(mask_vals, "shape", None), "ndim:", getattr(mask_vals, "ndim", None))

# More than one dimension left: look for the axis that enumerates the points
if np.ndim(mask_vals) > 1:
    shapes = mask_vals.shape
    print("mask_vals multi-dim shapes:", shapes)
    axis_with_points = None
    for i, s in enumerate(shapes):
        if s == n_points:
            axis_with_points = i
            break
    if axis_with_points is None:
        # nothing matches: flatten anyway; the length check below will catch a mismatch
        mask_vals = mask_vals.ravel()
        print("Warning: no axis matches n_points; flattened mask_vals length:", mask_vals.size)
    else:
        # points axis first, then take the first element along each remaining axis
        mask_vals = np.moveaxis(mask_vals, axis_with_points, 0)
        while mask_vals.ndim > 1:
            mask_vals = mask_vals[:, 0]
        mask_vals = mask_vals.ravel()
        print("Reduced mask_vals via axis_with_points -> shape:", mask_vals.shape)

# One mask value per point, or fail loudly
mask_at_pts = np.asarray(mask_vals).ravel()
print("Final mask_at_pts.shape:", mask_at_pts.shape)

if mask_at_pts.shape[0] != n_points:
    raise ValueError(f"mask_at_pts length ({mask_at_pts.shape[0]}) != number of points ({n_points}). "
                     "Check combined_mask_try dims and that it is spatial (y,x). See printed diagnostics above.")

# Totals and masked counts grouped by the training label
dfpts = pd.DataFrame({"true": gdf_pts["randomforest"].values, "masked": mask_at_pts.astype(bool)})
summary = dfpts.groupby("true").agg(total=("true", "size"), masked=("masked", "sum"))
print("Per-class training-point masking summary:")
print(summary)

In [None]:
# Quick diagnostics: dims/shape of both prediction layers, coordinates and CRS of the composite
raw_dims = getattr(predicted_da, "dims", None)
print("predicted_da:", raw_dims, predicted_da.shape)
masked_dims = getattr(predicted_masked, "dims", None)
masked_shape = getattr(predicted_masked, "shape", None)
print("predicted_masked:", masked_dims, masked_shape)
print("median coords:", list(median.coords))
print("CRS (odc geobox):", median.odc.geobox.crs)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

# Three panels: median RGB | raw prediction | masked prediction
fig, axs = plt.subplots(1,3, figsize=(18,6))
rgb_ax, raw_ax, masked_ax = axs[0], axs[1], axs[2]

# Median RGB, clipped to 0-1 for display
if not all(b in median for b in ("red","green","blue")):
    rgb_ax.text(0.5,0.5,"RGB not available", ha="center")
else:
    band_stack = [median["red"].values, median["green"].values, median["blue"].values]
    rgb = np.dstack(band_stack)
    rgb = np.clip(rgb, 0, 1)
    rgb_ax.imshow(rgb)
    rgb_ax.set_title("Median RGB")

# Raw prediction (NaN drawn as -1)
im = raw_ax.imshow(np.nan_to_num(predicted_da.values, nan=-1), cmap="viridis")
raw_ax.set_title("Predicted (raw classes)")
plt.colorbar(im, ax=raw_ax, fraction=0.046)

# Masked prediction: classes visible only where the combined mask is 0
im2 = masked_ax.imshow(np.nan_to_num(predicted_masked.values, nan=-1), cmap="tab20")
masked_ax.set_title("Predicted (masked)")
plt.colorbar(im2, ax=masked_ax, fraction=0.046)

for ax in axs:
    ax.axis("off")
plt.show()

In [None]:
# save predicted_masked and the combined mask as LZW-compressed GeoTIFFs using rioxarray
import rioxarray
# CRS comes from the median composite's ODC geobox
raster_crs = median.odc.geobox.crs

# write_crs(inplace=False) returns a CRS-tagged copy, so the in-memory layers that other
# cells use are left untouched and this cell can be re-run safely.
da = predicted_masked  # xarray.DataArray (y,x)
da = da.rio.write_crs(raster_crs, inplace=False)
da.rio.to_raster("predicted_masked.tif", compress="LZW")
print("Saved predicted_masked.tif")

# save combined mask (same non-inplace pattern as above)
mask_with_crs = combined_mask_da.rio.write_crs(raster_crs, inplace=False)
mask_with_crs.rio.to_raster("combined_mask.tif", compress="LZW")
print("Saved combined_mask.tif")