In [1]:
import xarray as xr
import numpy as np
import pandas as pd
from pathlib import Path
import os

# --- Configuration -----------------------------------------------------------
ZARR_PATH = Path("data/gold/IberFire_coarse8_time1.zarr")  # relative to the project root
LABEL_NAME = "is_fire"    # name of the target variable inside the store
DOWNSAMPLE = 1            # spatial stride applied to every 2-D slice (same as training)
MAX_TIMES = None          # None = use every time step; set an int (e.g. 365) to subsample

# The kernel is expected to start one level below the project root, so the root
# is normally the parent of the working directory. Re-running this cell after
# the chdir below must NOT climb another level, so only move up while the
# dataset is not reachable from the current directory.
project_root = Path.cwd()
if not ZARR_PATH.exists():
    project_root = project_root.parent
os.chdir(project_root)

# Open the store without relying on consolidated metadata.
ds = xr.open_zarr(ZARR_PATH, consolidated=False)

# Target variable; expected shape: [time, y, x]
label_da = ds[LABEL_NAME]

# Every data variable other than the label is a candidate feature.
all_features = [name for name in ds.data_vars if name != LABEL_NAME]

# Partition the candidates in one pass: variables carrying a "time" dimension
# are dynamic, everything else is treated as a static field.
dynamic_vars, static_vars = [], []
for name in all_features:
    target_list = dynamic_vars if "time" in ds[name].dims else static_vars
    target_list.append(name)

print("Dynamic features:", dynamic_vars)
print("Static features:", static_vars)

# Start from every index along "time"; thin it out only when a cap is
# configured and the cap is actually smaller than the number of steps.
time_indices = np.arange(ds.sizes["time"])
needs_subsample = MAX_TIMES is not None and MAX_TIMES < len(time_indices)
if needs_subsample:
    # Fixed seed -> the same subset on every run; sorted into ascending order.
    rng = np.random.default_rng(42)
    picked = rng.choice(time_indices, size=MAX_TIMES, replace=False)
    time_indices = np.sort(picked)

print(f"Will use {len(time_indices)} time steps for correlation.")


def streaming_corr_dynamic(feature_da, label_da, time_indices, down=1):
    """
    Pearson correlation between a dynamic feature [time,y,x]
    and label [time,y,x], streamed over time + space.

    Only one time slice of each array is held in memory at a time. All
    (t, y, x) points where both values are finite are pooled into a single
    sample.

    Parameters
    ----------
    feature_da, label_da : objects exposing ``.isel(time=i).values``
        (e.g. ``xarray.DataArray``). Each selected slice is indexed as a
        2-D array.
    time_indices : iterable of int
        Positions along the time dimension to include.
    down : int, default 1
        Stride applied to both axes of every slice.

    Returns
    -------
    float
        Pearson r, or ``nan`` when there are no valid pairs or when either
        variable has zero variance over the pooled sample.
    """
    n = 0
    sum_x = sum_y = sum_x2 = sum_y2 = sum_xy = 0.0
    # Offsets subtracted before accumulating. Variance and covariance are
    # shift-invariant, and centring near the data mean avoids catastrophic
    # cancellation in E[x^2] - E[x]^2 for variables whose mean is much larger
    # than their spread.
    shift_x = shift_y = None

    for t in time_indices:
        # Cast to float64 up front: reductions on lower-precision slices
        # (e.g. float32, if that is how the store is encoded) would otherwise
        # be carried out in that lower precision.
        x = np.asarray(
            feature_da.isel(time=int(t)).values[::down, ::down], dtype=np.float64
        ).ravel()
        y = np.asarray(
            label_da.isel(time=int(t)).values[::down, ::down], dtype=np.float64
        ).ravel()

        # Keep only positions where both members of the pair are usable.
        mask = np.isfinite(x) & np.isfinite(y)
        if not mask.any():
            continue
        x = x[mask]
        y = y[mask]

        if shift_x is None:
            # First non-empty slice fixes the offsets for the whole stream.
            shift_x = x.mean()
            shift_y = y.mean()
        x = x - shift_x
        y = y - shift_y

        n += x.size
        sum_x += x.sum()
        sum_y += y.sum()
        sum_x2 += np.dot(x, x)
        sum_y2 += np.dot(y, y)
        sum_xy += np.dot(x, y)

    if n == 0:
        return np.nan

    mean_x = sum_x / n
    mean_y = sum_y / n
    var_x = (sum_x2 / n) - mean_x**2
    var_y = (sum_y2 / n) - mean_y**2
    cov_xy = (sum_xy / n) - mean_x * mean_y

    # A constant variable has no defined correlation.
    if var_x <= 0 or var_y <= 0:
        return np.nan

    return cov_xy / np.sqrt(var_x * var_y)


def streaming_corr_static(feature_da, label_da, time_indices, down=1):
    """
    Pearson correlation between a static feature [y,x]
    and a time-varying label [time,y,x].

    The static field is conceptually repeated for every selected time step
    and correlated with the label over all (t, y, x) points, but only one
    label slice is held in memory at a time.

    Parameters
    ----------
    feature_da : object exposing ``.values`` as a 2-D array
        (e.g. ``xarray.DataArray``).
    label_da : object exposing ``.isel(time=i).values``; each selected slice
        is indexed as a 2-D array matching the static field.
    time_indices : iterable of int
        Positions along the time dimension to include.
    down : int, default 1
        Stride applied to both spatial axes.

    Returns
    -------
    float
        Pearson r, or ``nan`` when there are no valid pairs or when either
        variable has zero variance over the pooled sample.
    """
    # Static field: same x for all times. float64 so that the running sums
    # are not carried in a lower-precision storage dtype.
    x_full = np.asarray(feature_da.values[::down, ::down], dtype=np.float64).ravel()
    valid_x = np.isfinite(x_full)
    if not valid_x.any():
        return np.nan

    n = 0
    sum_x = sum_y = sum_x2 = sum_y2 = sum_xy = 0.0
    # Offsets subtracted before accumulating. Variance and covariance are
    # shift-invariant, and centring near the data mean avoids catastrophic
    # cancellation in E[x^2] - E[x]^2.
    shift_x = x_full[valid_x].mean()
    shift_y = None

    for t in time_indices:
        y_full = np.asarray(
            label_da.isel(time=int(t)).values[::down, ::down], dtype=np.float64
        ).ravel()

        # Mask x and y jointly so each label value stays paired with the
        # static value at the same pixel, even when the label has gaps.
        mask = valid_x & np.isfinite(y_full)
        if not mask.any():
            continue

        x = x_full[mask] - shift_x
        y = y_full[mask]
        if shift_y is None:
            # First non-empty slice fixes the label offset for the stream.
            shift_y = y.mean()
        y = y - shift_y

        n += x.size
        sum_x += x.sum()
        sum_y += y.sum()
        sum_x2 += np.dot(x, x)
        sum_y2 += np.dot(y, y)
        sum_xy += np.dot(x, y)

    if n == 0:
        return np.nan

    mean_x = sum_x / n
    mean_y = sum_y / n
    var_x = (sum_x2 / n) - mean_x**2
    var_y = (sum_y2 / n) - mean_y**2
    cov_xy = (sum_xy / n) - mean_x * mean_y

    # A constant variable has no defined correlation.
    if var_x <= 0 or var_y <= 0:
        return np.nan

    return cov_xy / np.sqrt(var_x * var_y)


# One record per candidate feature: its name and its correlation with the label.
results = []
for feature_name in all_features:
    print(f"Processing {feature_name}...")
    feature_da = ds[feature_name]

    # The static routine is used only for variables listed as static and not
    # as dynamic; anything else goes through the dynamic routine.
    is_static = feature_name not in dynamic_vars and feature_name in static_vars
    corr_fn = streaming_corr_static if is_static else streaming_corr_dynamic
    corr = corr_fn(feature_da, label_da, time_indices, down=DOWNSAMPLE)

    results.append({"variable": feature_name, "corr": corr})

# Rank by absolute correlation, strongest first.
corr_df = pd.DataFrame(results).sort_values("corr", key=np.abs, ascending=False)

display(corr_df.head(30))
corr_df.to_csv("feature_target_correlations.csv", index=False)

Dynamic features: ['FAPAR', 'FWI', 'LAI', 'LST', 'NDVI', 'RH_max', 'RH_mean', 'RH_min', 'RH_range', 'SWI_001', 'SWI_005', 'SWI_010', 'SWI_020', 'is_holiday', 'is_near_fire', 'surface_pressure_max', 'surface_pressure_mean', 'surface_pressure_min', 'surface_pressure_range', 't2m_max', 't2m_mean', 't2m_min', 't2m_range', 'total_precipitation_mean', 'wind_direction_at_max_speed', 'wind_direction_mean', 'wind_speed_max', 'wind_speed_mean']
Static features: ['AutonomousCommunities', 'CLC_2006_1', 'CLC_2006_10', 'CLC_2006_11', 'CLC_2006_12', 'CLC_2006_13', 'CLC_2006_14', 'CLC_2006_15', 'CLC_2006_16', 'CLC_2006_17', 'CLC_2006_18', 'CLC_2006_19', 'CLC_2006_2', 'CLC_2006_20', 'CLC_2006_21', 'CLC_2006_22', 'CLC_2006_23', 'CLC_2006_24', 'CLC_2006_25', 'CLC_2006_26', 'CLC_2006_27', 'CLC_2006_28', 'CLC_2006_29', 'CLC_2006_3', 'CLC_2006_30', 'CLC_2006_31', 'CLC_2006_32', 'CLC_2006_33', 'CLC_2006_34', 'CLC_2006_35', 'CLC_2006_36', 'CLC_2006_37', 'CLC_2006_38', 'CLC_2006_39', 'CLC_2006_4', 'CLC_2006_40

Unnamed: 0,variable,corr
222,is_near_fire,0.088216
20,CLC_2006_27,0.02678
146,CLC_2018_27,0.024774
83,CLC_2012_27,0.024704
219,elevation_stdev,0.016741
241,slope_mean,0.015818
239,roughness_mean,0.015746
123,CLC_2012_scrub_proportion,0.014222
186,CLC_2018_scrub_proportion,0.014117
60,CLC_2006_scrub_proportion,0.013991


In [5]:
# Write the ranked correlation table under the project root and report the
# fully resolved location of the file.
output_path = project_root.joinpath("feature_target_correlations.csv")
corr_df.to_csv(output_path, index=False)
print(f"Saved to: {output_path.resolve()}")

Saved to: /Users/vladimir/catalonia-wildfire-prediction/feature_target_correlations.csv
