# Subsampling window log exploration

Helpers to inspect window-level logs (e.g., `igbp_CRO_BE-Lon_windows_*.csv`) and plot time series, histograms, and cumulative sums with consistent gaps across reference and degraded fluxes.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "talk" plotting context for every figure created below.
sns.set_context("talk")
# IPython magic (not plain Python): selects matplotlib's inline backend.
%matplotlib inline

In [None]:
def load_window_log(path):
    """Read a window-level log CSV into a DataFrame.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or file-like
            object).

    Returns:
        The parsed DataFrame, using ``read_csv`` defaults (no date parsing).
    """
    frame = pd.read_csv(path)
    return frame


def log_summary(df):
    """Print a quick overview of a window-level log.

    Prints, in order: the row count, the known metadata columns that are
    present, the distinct subsample labels, the min/max of ``window_start``
    and the number of numeric columns. A section is skipped when the column
    it relies on is absent.

    Args:
        df: DataFrame holding the window log.
    """
    known_meta = (
        "ecosystem",
        "site",
        "theta_index",
        "rotation_mode",
        "subsample_label",
        "subsample_mode",
    )
    columns = df.columns

    print(f"rows: {len(df):,}")

    meta_cols = [name for name in known_meta if name in columns]
    if meta_cols:
        print("meta cols:", meta_cols)

    if "subsample_label" in columns:
        print("labels:", df["subsample_label"].unique())

    if "window_start" in columns:
        starts = df["window_start"]
        # Right arrow written as an escape: the literal character had been
        # mangled into mojibake, and the escape survives file re-encoding.
        print("time span:", starts.min(), "\u2192", starts.max())

    print("numeric columns:", len(df.select_dtypes(include="number").columns))


def preview_columns(df, cols=None, n=5):
    """Show the first rows of selected columns.

    Args:
        df: DataFrame holding the window log.
        cols: Column selection passed to ``df[...]``. ``None`` or an empty
            selection falls back to the commonly used metric columns that
            exist in ``df``. Array-likes such as ``df.columns[:3]`` are
            accepted.
        n: Number of rows to show.
    """
    default_cols = [
        "window_start", "is_day", "subsample_label", "rotation_mode",
        "F_CO2_ref", "F_CO2_deg", "res_CO2",
        "F_LE_ref", "F_LE_deg", "res_LE",
        "F_H_ref", "F_H_deg", "res_H",
        "kept_fraction", "effective_fs", "target_fs", "ogive_stop_time_sec",
    ]
    # Explicit None/empty test instead of `cols or ...`: truth-testing a
    # pandas Index or numpy array raises "truth value is ambiguous".
    if cols is None or len(cols) == 0:
        cols = [c for c in default_cols if c in df.columns]

    # `display` is injected by IPython/Jupyter; fall back to plain printing
    # so the helper also works in a regular Python session.
    try:
        show = display
    except NameError:
        show = print
    show(df[cols].head(n))


def hist_metric(df, metric="res_CO2", by="subsample_label", bins=50):
    """Plot a histogram of ``metric``, faceted by ``by`` when possible.

    When ``by`` is truthy and names a column of ``df``, one panel is drawn
    per value of that column (three panels per row, independent x axes).
    Otherwise a single histogram of the whole column is drawn.

    Args:
        df: DataFrame holding the window log.
        metric: Column whose distribution is plotted.
        by: Column used for faceting; a falsy value disables faceting.
        bins: Number of histogram bins.
    """
    data = df.copy()
    faceted = bool(by) and by in data.columns

    if not faceted:
        # Single-panel fallback.
        plt.figure(figsize=(6, 4))
        sns.histplot(data[metric], bins=bins, kde=False)
        plt.title(f"{metric} histogram")
        plt.tight_layout()
        plt.show()
        return

    grid = sns.displot(
        data,
        x=metric,
        col=by,
        col_wrap=3,
        bins=bins,
        facet_kws={"sharex": False},
    )
    figure = grid.fig
    # Leave headroom at the top for the figure-level title.
    figure.subplots_adjust(top=0.9)
    figure.suptitle(f"{metric} histograms by {by}")
    plt.show()


def timeseries(df, metric="res_CO2", by_label=True):
    """Plot ``metric`` against ``window_start`` in chronological order.

    Args:
        df: DataFrame holding the window log; left unmodified.
        metric: Column plotted on the y axis.
        by_label: When true and a ``subsample_label`` column exists, draw one
            coloured line per label; otherwise draw a single line.

    Raises:
        ValueError: If ``df`` has no ``window_start`` column.
    """
    time_col = "window_start"
    if time_col not in df.columns:
        raise ValueError("window_start column required for timeseries")

    # Parse timestamps and order rows on a new frame, not on the caller's.
    parsed = df.assign(**{time_col: pd.to_datetime(df[time_col])})
    ordered = parsed.sort_values(time_col)

    plt.figure(figsize=(10, 4))
    split_by_label = by_label and "subsample_label" in ordered.columns
    if split_by_label:
        sns.lineplot(
            data=ordered,
            x="window_start",
            y=metric,
            hue="subsample_label",
            marker="o",
            lw=0.7,
        )
    else:
        plt.plot(ordered[time_col], ordered[metric], marker="o", lw=0.7)
    plt.title(f"{metric} over time")
    plt.tight_layout()
    plt.show()


def cumsum_flux(df, flux="CO2", rotation_mode=None, include_raw=False):
    """Plot cumulative reference and degraded flux, one curve set per label.

    Within each ``subsample_label`` group (or the whole frame when that
    column is absent), rows with a NaN in any plotted series are dropped
    before accumulating, so all curves of a group share the same gaps.

    Args:
        df: DataFrame holding the window log; left unmodified.
        flux: Flux name used to build the column names ``F_{flux}_ref``,
            ``F_{flux}_deg`` and, optionally, ``F_{flux}_raw``.
        rotation_mode: If not None and a ``rotation_mode`` column exists,
            keep only the rows equal to this value.
        include_raw: Also plot the ``F_{flux}_raw`` series.

    Raises:
        ValueError: If ``window_start`` or any required flux column is
            missing.
    """
    if "window_start" not in df.columns:
        raise ValueError("window_start column required for cumulative sums")

    work = df
    if rotation_mode is not None and "rotation_mode" in work.columns:
        work = work[work["rotation_mode"] == rotation_mode]

    # Parse and order timestamps on a new frame; the input stays untouched.
    work = work.assign(window_start=pd.to_datetime(work["window_start"]))
    work = work.sort_values("window_start")

    kinds = ["ref", "deg"]
    if include_raw:
        kinds.append("raw")
    columns_by_kind = {kind: f"F_{flux}_{kind}" for kind in kinds}
    value_cols = list(columns_by_kind.values())

    absent = [col for col in value_cols if col not in work.columns]
    if absent:
        raise ValueError(f"Missing columns for {flux}: {absent}")

    if "subsample_label" in work.columns:
        groups = work.groupby("subsample_label")
    else:
        groups = [("all", work)]

    plt.figure(figsize=(10, 5))
    for label, rows in groups:
        # Harmonize gaps: keep only windows where every plotted series is set.
        complete = rows.dropna(subset=value_cols)
        if complete.empty:
            continue
        for kind, col in columns_by_kind.items():
            running = complete[col].cumsum().rename(f"cum_{kind}")
            plt.plot(complete["window_start"], running, label=f"{label}-{kind}")
    plt.title(f"Cumulative {flux} flux (ref vs degraded)")
    plt.xlabel("window_start")
    plt.ylabel(f"cumsum F_{flux}")
    plt.legend(bbox_to_anchor=(1, 1))
    plt.tight_layout()
    plt.show()


In [None]:
# Example usage
# NOTE: machine-specific absolute path; point it at your own window log CSV.
log_path = r"C:\Users\geryatejina\dev\minspecs_eddy\igbp_CRO_BE-Lon_windows_20251223124544_temporarry.csv"
log_df = load_window_log(log_path)

# Text overview, then the first rows of the common metric columns.
log_summary(log_df)
preview_columns(log_df)
# Distribution and time evolution of the `res_CO2` column.
hist_metric(log_df, metric="res_CO2")
timeseries(log_df, metric="res_CO2")
# Cumulative ref vs degraded CO2 flux, restricted to rotation_mode == "double".
cumsum_flux(log_df, flux="CO2", rotation_mode="double", include_raw=False)
