In [4]:
import warnings
warnings.filterwarnings('ignore')

import tqdm

In [26]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.multitest import multipletests

# =========================
# Helpers
# =========================

MODEL_ORDER = ['hist', 'oper1', 'oper2', 'oper3', 'oper4']

def _ensure_types(df):
    df = df.copy()
    df['lead'] = pd.to_numeric(df['lead'], errors='coerce')
    df = df.dropna(subset=['prediction_huc','lead','model'])
    df['model'] = df['model'].str.strip().str.lower()
    df = df[df['model'].isin(MODEL_ORDER)]
    return df

def paired_wilcoxon_effects(x, y):
    """Paired Wilcoxon signed-rank test with rank-biserial r_rb and paired CLES.

    Parameters
    ----------
    x, y : array-like
        Paired observations of identical shape. Pairs whose difference is NaN
        (either value missing) are dropped before anything is computed.

    Returns
    -------
    tuple of float
        ``(W, p, r_rb, cles)`` where
        * ``W``, ``p`` come from ``scipy.stats.wilcoxon`` (two-sided, Pratt
          handling of zero differences, no continuity correction),
        * ``r_rb = (W+ - W-) / (W+ + W-)`` uses ranks of ``|x - y|`` computed
          with zeros included, matching the Pratt convention,
        * ``cles`` is the share of pairs with ``x > y``, counting ties as 0.5.
        ``(0.0, 1.0, 0.0, 0.5)`` is returned when no usable pair remains or
        all differences are numerically zero.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same shape.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.shape != y.shape:
        raise ValueError(
            f"x and y must be paired (same shape); got {x.shape} and {y.shape}"
        )

    # Drop incomplete pairs so a single NaN cannot poison the test or the ranks.
    usable = ~np.isnan(x - y)
    x, y = x[usable], y[usable]
    diff = x - y
    if diff.size == 0 or np.allclose(diff, 0):
        return 0.0, 1.0, 0.0, 0.5

    # The exact/approximate choice is left at SciPy's default ('auto'); the
    # keyword is omitted because its name differs across SciPy releases.
    W, p = stats.wilcoxon(x, y, zero_method='pratt', alternative='two-sided', correction=False)

    ranks = stats.rankdata(np.abs(diff))
    pos = diff > 0
    neg = diff < 0
    W_plus = ranks[pos].sum()
    W_minus = ranks[neg].sum()
    denom = W_plus + W_minus
    r_rb = 0.0 if denom == 0 else (W_plus - W_minus) / denom

    ties = (diff == 0)
    cles = (pos.sum() + 0.5 * ties.sum()) / diff.size
    return float(W), float(p), float(r_rb), float(cles)

def holm_correct(pvals):
    """Holm-Bonferroni adjusted p-values in original order.

    Parameters
    ----------
    pvals : sequence of float, numpy array, or None
        Raw p-values forming one family of tests.

    Returns
    -------
    list or numpy.ndarray
        ``[]`` when ``pvals`` is None or empty; otherwise the adjusted
        p-values returned by ``multipletests(..., method='holm')``, aligned
        with the input order.
    """
    # Test emptiness via len(): truth-testing a NumPy array with more than one
    # element raises "truth value is ambiguous".
    if pvals is None or len(pvals) == 0:
        return []
    return multipletests(pvals, method='holm')[1]

def friedman_from_wide(df_cases, cols):
    """Friedman chi-square test over the columns ``cols`` of a wide table.

    Each row of ``df_cases`` is one subject and each listed column one
    treatment. Returns ``(chi2, p)`` as floats, or ``(nan, nan)`` when any
    column holds fewer than 3 values.
    """
    samples = tuple(df_cases[name].values for name in cols)
    for sample in samples:
        if len(sample) < 3:  # the test needs at least 3 subjects
            return np.nan, np.nan
    result = stats.friedmanchisquare(*samples)
    return float(result[0]), float(result[1])

def theil_sen_slope(y, x):
    """Return the Theil-Sen slope estimate of ``y`` against ``x`` as a float.

    Only the slope from ``scipy.stats.theilslopes`` is kept; the intercept and
    the confidence bounds it also returns are discarded.
    """
    fit = stats.theilslopes(y, x=x, alpha=0.95)
    return float(fit[0])

def bootstrap_ci_median(data, n_boot=100, alpha=0.05, seed=42):
    """Percentile-bootstrap confidence interval for the median of ``data``.

    Draws ``n_boot`` resamples (with replacement, same size as ``data``) from
    a generator seeded with ``seed``, takes the median of each, and returns
    the ``alpha/2`` and ``1 - alpha/2`` percentiles of those medians as
    ``(lo, hi)``. Empty input yields ``(nan, nan)``.
    """
    rng = np.random.default_rng(seed)
    sample = np.asarray(data)
    n_obs = len(sample)
    if n_obs == 0:
        return (np.nan, np.nan)

    # One resample per iteration, drawn in sequence from the same generator.
    medians = np.array(
        [np.median(rng.choice(sample, size=n_obs, replace=True)) for _ in range(n_boot)],
        dtype=float,
    )
    lower_q = 100*(alpha/2)
    upper_q = 100*(1 - alpha/2)
    return float(np.percentile(medians, lower_q)), float(np.percentile(medians, upper_q))

# =========================
# A) Lead-wise comparisons
# =========================

def leadwise_tests(df_region, metric='NSE', leads=(1,10)):
    """
    Lead-wise model comparison for one metric.

    For each lead in ``leads``:
      - Friedman omnibus test across the models in MODEL_ORDER, paired over
        HUC and restricted to HUCs that have a value for every model.
      - Paired Wilcoxon test for every pair of models, each on the HUCs that
        have both models (at least 3 required); p-values are Holm-adjusted
        within the lead.

    Parameters
    ----------
    df_region : DataFrame
        Needs columns ``prediction_huc``, ``lead``, ``model`` and ``metric``.
    metric : str
        Name of the score column to compare.
    leads : iterable
        Lead values to analyse, one family of tests per lead.

    Returns
    -------
    DataFrame
        Tidy table ready for to_markdown/to_csv: per lead, one omnibus row
        followed by one row per tested pair. Statistics are pre-formatted
        strings; ``n`` is the number of HUCs used by that row's test.
    """
    models = list(MODEL_ORDER)
    model_pairs = [(a, b) for i, a in enumerate(models) for b in models[i + 1:]]
    friedman_df = len(models) - 1

    out_rows = []
    for L in tqdm.tqdm(leads):
        dL = df_region[df_region['lead'] == L].dropna(subset=[metric])

        # Build the HUC x model table once per lead (duplicate rows are
        # averaged); the omnibus and every pairwise test slice this table
        # instead of re-pivoting. Models absent at this lead become NaN columns.
        wide = dL.pivot_table(index='prediction_huc', columns='model', values=metric, aggfunc='mean')
        wide = wide.reindex(columns=models)

        # Omnibus: only HUCs with a value for every model.
        wide_complete = wide.dropna(how='any')
        chi2, p = (np.nan, np.nan)
        if wide_complete.shape[0] >= 3:
            chi2, p = friedman_from_wide(wide_complete, models)

        out_rows.append({
            'lead': L, 'metric': metric, 'Friedman (5 models)': f"χ²={chi2:.3g}, df={friedman_df}, p={p:.3g}",
            'comparison': '—', 'W': '—', 'p_adj': '—', 'r_rb': '—', 'CLES': '—', 'n': wide_complete.shape[0]
        })

        # Post-hoc: every pair on its own HUC intersection. A model missing at
        # this lead yields an empty intersection and is skipped by the size check.
        pairs = []
        for a, b in model_pairs:
            both = wide[[a, b]].dropna(how='any')
            x, y = both[a].values, both[b].values
            if len(x) < 3:
                continue
            W, p_raw, r_rb, cles = paired_wilcoxon_effects(x, y)
            pairs.append((f"{a} vs {b}", len(x), W, p_raw, r_rb, cles))

        # Holm adjust within this (region×metric×lead) family
        p_adj = holm_correct([p_raw for (_, _, _, p_raw, _, _) in pairs])
        for (comp, n, W, p_raw, r_rb, cles), pa in zip(pairs, p_adj):
            out_rows.append({
                'lead': L, 'metric': metric, 'Friedman (5 models)': '',
                'comparison': comp, 'W': f"{W:.3g}", 'p_adj': f"{pa:.3g}",
                'r_rb': f"{r_rb:.3f}", 'CLES': f"{cles:.3f}", 'n': n
            })
    return pd.DataFrame(out_rows)

# =========================
# B) Trend across leads vs Historical
# =========================

def build_delta_vs_hist(df_region, metric, baseline='hist', opers=('oper1', 'oper2', 'oper3', 'oper4')):
    """Return Δ = metric(oper) - metric(baseline) for each (HUC, lead, oper).

    Parameters
    ----------
    df_region : DataFrame
        Needs columns ``prediction_huc``, ``lead``, ``model`` and ``metric``.
    metric : str
        Name of the score column.
    baseline : str
        Model label whose score is subtracted (default ``'hist'``).
    opers : iterable of str
        Model labels compared against the baseline.

    Returns
    -------
    DataFrame
        Columns ``prediction_huc``, ``lead``, ``model``, ``delta``. Only
        (HUC, lead) combinations present for both the oper and the baseline
        appear (inner join). An empty frame with these columns is returned
        when ``opers`` is empty.
    """
    keys = ['prediction_huc', 'lead']
    out_cols = keys + ['model', 'delta']

    base = (
        df_region.loc[df_region['model'] == baseline, keys + [metric]]
        .rename(columns={metric: 'base'})
    )

    frames = []
    for oper in opers:
        vals = (
            df_region.loc[df_region['model'] == oper, keys + [metric]]
            .rename(columns={metric: 'val'})
        )
        mrg = pd.merge(vals, base, on=keys, how='inner')
        mrg['delta'] = mrg['val'] - mrg['base']
        mrg['model'] = oper
        frames.append(mrg[out_cols])

    # pd.concat rejects an empty list, so handle "no opers requested" explicitly.
    if not frames:
        return pd.DataFrame(columns=out_cols)
    return pd.concat(frames, ignore_index=True)

def slope_summary_vs_hist(df_region, metric='NSE'):
    """
    Summarise how Δ = metric(operK) - metric(hist) trends with lead.

    A Theil–Sen slope of Δ against lead is fitted for every (operK, HUC) that
    has Δ at two or more distinct leads. From those slopes:

      table2: one row per oper with the HUC count, median slope, bootstrap
              95% CI of the median, and a Wilcoxon signed-rank test of the
              slopes against 0 (W, p, rank-biserial r_rb). It is preceded by
              one 'Across oper trends' row whose 'p' cell carries the omnibus
              Friedman note (empty string when the omnibus was not run).
      table3: post-hoc paired Wilcoxon between oper slopes, Holm-corrected.
              Filled only when all of oper1–oper4 have slopes and at least 3
              HUCs have a slope for each; otherwise an empty frame with the
              same columns.

    Returns
    -------
    (table2, table3) : tuple of DataFrame
    """
    opers = ['oper1', 'oper2', 'oper3', 'oper4']
    table2_cols = ['Case', 'HUCs', 'Median Theil–Sen Δ/lead', '95% CI (bootstrap)',
                   'Wilcoxon W (vs 0)', 'p', 'r_rb']
    table3_cols = ['Comparison on slopes (Δ)', 'HUCs', 'W', 'p_adj (Holm)', 'r_rb', 'CLES']

    dD = build_delta_vs_hist(df_region, metric)

    # Per-HUC slopes per oper
    slopes = []
    for (oper, huc), g_h in dD.groupby(['model', 'prediction_huc']):
        g_h = g_h[['lead', 'delta']].dropna()
        # A slope needs two distinct leads; with a single lead value there is
        # no pairwise slope and the estimate would be NaN.
        if g_h['lead'].nunique() < 2:
            continue
        m = theil_sen_slope(g_h['delta'].values, g_h['lead'].values)
        slopes.append({'prediction_huc': huc, 'model': oper, 'slope': m})
    # Explicit columns keep the frame usable (groupby/pivot) when no slope exists.
    slopes_df = pd.DataFrame(slopes, columns=['prediction_huc', 'model', 'slope'])

    # Per-oper summary & Wilcoxon vs 0
    rows = []
    for oper, g in slopes_df.groupby('model'):
        s = g['slope'].to_numpy(dtype=float)
        ci_lo, ci_hi = bootstrap_ci_median(s, n_boot=2000, alpha=0.05, seed=42)
        # Same helper as the pairwise tests, so r_rb is rank-based here too
        # (pairing each slope with 0 makes this the one-sample test).
        W, p, r_rb, _ = paired_wilcoxon_effects(s, np.zeros_like(s))
        rows.append({
            'Case': oper, 'HUCs': len(s),
            'Median Theil–Sen Δ/lead': float(np.median(s)),
            '95% CI (bootstrap)': f"[{ci_lo:.4g}, {ci_hi:.4g}]",
            'Wilcoxon W (vs 0)': W, 'p': p, 'r_rb': r_rb
        })
    table2 = pd.DataFrame(rows, columns=table2_cols).sort_values('Case')
    print("Table 2 done")

    # Omnibus on slopes across oper1–4 (paired over HUC), then post-hoc pairs
    omnibus_note = ""
    table3 = pd.DataFrame(columns=table3_cols)
    if not slopes_df.empty:
        wide = slopes_df.pivot_table(index='prediction_huc', columns='model', values='slope', aggfunc='mean')
        if all(m in wide.columns for m in opers):
            wide4 = wide[opers].dropna(how='any')
            if wide4.shape[0] >= 3:
                chi2, p = friedman_from_wide(wide4, opers)
                omnibus_note = f"Friedman χ²={chi2:.3g}, df=3, p={p:.3g}"

                tested = []
                for i, a in enumerate(opers):
                    for b in opers[i + 1:]:
                        x, y = wide4[a].values, wide4[b].values
                        W, p0, r_rb, cles = paired_wilcoxon_effects(x, y)
                        tested.append((f"{a} vs {b}", len(x), W, p0, r_rb, cles))
                p_adj = holm_correct([t[3] for t in tested])
                table3 = pd.DataFrame([{
                    'Comparison on slopes (Δ)': comp, 'HUCs': n, 'W': W,
                    'p_adj (Holm)': pa, 'r_rb': r_rb, 'CLES': cles
                } for (comp, n, W, p0, r_rb, cles), pa in zip(tested, p_adj)], columns=table3_cols)

    print("Table 3 done")

    # Add omnibus note to summary table
    omnibus_row = pd.DataFrame([{
        'Case': 'Across oper trends', 'HUCs': '',
        'Median Theil–Sen Δ/lead': '', '95% CI (bootstrap)': '',
        'Wilcoxon W (vs 0)': '', 'p': omnibus_note, 'r_rb': ''
    }])
    table2 = pd.concat([omnibus_row, table2], ignore_index=True)
    return table2, table3

# =========================
# Region runner
# =========================

def analyze_region_csv(path_csv, region_name=None, leads=(1,10), include_slopes=False):
    """
    Run the statistical battery for NSE and F1 on one region CSV.

    Preparation applied to the loaded data:
      - NSE: missing values are set to -10 and values below -10 are raised to -10.
      - F1: missing values are set to 0 and negative values are raised to 0.
      - Rows are cleaned by ``_ensure_types`` and then restricted to ``leads``.

    Parameters
    ----------
    path_csv : str
        CSV with columns ``prediction_huc``, ``lead``, ``model``, ``NSE``, ``F1``.
    region_name : str, optional
        Label used in progress messages and, for the slope tables, in the
        ``Region`` column (falls back to ``path_csv``).
    leads : iterable
        Lead values to keep and to test.
    include_slopes : bool
        When True, also run ``slope_summary_vs_hist`` per metric. Off by
        default, in which case only the lead-wise table is produced.

    Returns
    -------
    dict
        ``metric -> {'leadwise': DataFrame}``; with ``include_slopes=True``
        each inner dict additionally holds ``'slope_summary'`` and
        ``'slope_posthoc'`` (both labelled with ``Region`` and ``Metric``).
    """
    df = pd.read_csv(path_csv)

    df['NSE'] = df['NSE'].fillna(-10).clip(lower=-10)
    df['F1'] = df['F1'].fillna(0).clip(lower=0)

    # Coerce types first so the lead filter compares numbers; filtering a
    # string-typed 'lead' column against numeric leads would match nothing.
    df = _ensure_types(df)
    df = df[df['lead'].isin(leads)]

    label = region_name or path_csv
    out = {}
    for metric in ['NSE','F1']:
        dmet = df.dropna(subset=[metric]).copy()

        # A) Lead-wise
        tables = {'leadwise': leadwise_tests(dmet, metric=metric, leads=leads)}
        print(f"Completed lead-wise tests for {region_name} | {metric}")

        # B) Across-lead trend vs hist (opt-in)
        if include_slopes:
            table2, table3 = slope_summary_vs_hist(dmet, metric=metric)
            table2.insert(0, 'Region', label)
            table2.insert(1, 'Metric', metric)
            table3.insert(0, 'Region', label)
            table3.insert(1, 'Metric', metric)
            tables['slope_summary'] = table2
            tables['slope_posthoc'] = table3

        out[metric] = tables

    return out

In [17]:
# Directory holding the per-region CSVs; later cells reuse csv_path.
csv_path = '/home/sarth/rootdir/workdir/projects/Paper_Data_Latency/Revised_Statistical_Testing/Figure04-05'
camelsus = analyze_region_csv(
    f"{csv_path}/camelsus.csv",
    region_name="CAMELS-US",
    leads=[1,5,10],
)

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  5.18it/s]


Completed lead-wise tests for CAMELS-US | NSE
Table 2 done
Table 3 done


100%|██████████| 3/3 [00:00<00:00,  6.97it/s]


Completed lead-wise tests for CAMELS-US | F1
Table 2 done
Table 3 done


In [18]:
# Print every table available for CAMELS-US. Keys that are absent from the
# result (e.g. the slope tables when only the lead-wise analysis was run) are
# skipped instead of raising KeyError.
table_titles = [
    ('leadwise', 'Lead-wise'),
    ('slope_summary', 'Δ vs hist Slope Summary'),
    ('slope_posthoc', 'Δ vs hist Slope Post-hoc'),
]
for metric, tabs in camelsus.items():
    for key, title in table_titles:
        if key not in tabs:
            continue
        print(f"\n== CAMELS-US | {metric} | {title} ==")
        print(tabs[key].to_markdown(index=False))


== CAMELS-US | NSE | Lead-wise ==
| Region    |   lead | metric   | Friedman (5 models)       | comparison     | W   | p_adj    | r_rb   | CLES   |   n |
|:----------|-------:|:---------|:--------------------------|:---------------|:----|:---------|:-------|:-------|----:|
| CAMELS-US |      1 | NSE      | χ²=54.7, df=4, p=3.73e-11 | —              | —   | —        | —      | —      |  18 |
| CAMELS-US |      1 | NSE      |                           | hist vs oper1  | 3   | 0.000191 | -0.965 | 0.111  |  18 |
| CAMELS-US |      1 | NSE      |                           | hist vs oper2  | 1   | 0.000107 | -0.988 | 0.056  |  18 |
| CAMELS-US |      1 | NSE      |                           | hist vs oper3  | 42  | 0.145    | -0.509 | 0.222  |  18 |
| CAMELS-US |      1 | NSE      |                           | hist vs oper4  | 0   | 7.63e-05 | -1.000 | 0.000  |  18 |
| CAMELS-US |      1 | NSE      |                           | oper1 vs oper2 | 1   | 0.000107 | -0.988 | 0.056  |  18 |
| CAM

In [27]:
# Same analysis for the HYSETS region, at leads 1, 5 and 10.
hysets = analyze_region_csv(
    f"{csv_path}/hysets.csv",
    region_name="HYSETS",
    leads=[1,5,10],
)

100%|██████████| 3/3 [00:01<00:00,  1.97it/s]


Completed lead-wise tests for HYSETS | NSE


100%|██████████| 3/3 [00:00<00:00,  5.66it/s]

Completed lead-wise tests for HYSETS | F1





In [28]:
# Show the lead-wise table of each metric for HYSETS as Markdown.
# The slope tables ('slope_summary', 'slope_posthoc') are not printed here.
for metric in hysets:
    print(f"\n== HYSETS | {metric} | Lead-wise ==")
    leadwise_md = hysets[metric]['leadwise'].to_markdown(index=False)
    print(leadwise_md)


== HYSETS | NSE | Lead-wise ==
|   lead | metric   | Friedman (5 models)     | comparison     | W   | p_adj   | r_rb   | CLES   |   n |
|-------:|:---------|:------------------------|:---------------|:----|:--------|:-------|:-------|----:|
|      1 | NSE      | χ²=7.51, df=4, p=0.111  | —              | —   | —       | —      | —      |  18 |
|      1 | NSE      |                         | hist vs oper1  | 60  | 1       | 0.298  | 0.611  |  18 |
|      1 | NSE      |                         | hist vs oper2  | 72  | 1       | 0.158  | 0.500  |  18 |
|      1 | NSE      |                         | hist vs oper3  | 38  | 0.308   | 0.556  | 0.667  |  18 |
|      1 | NSE      |                         | hist vs oper4  | 35  | 0.268   | 0.591  | 0.778  |  18 |
|      1 | NSE      |                         | oper1 vs oper2 | 74  | 1       | -0.135 | 0.444  |  18 |
|      1 | NSE      |                         | oper1 vs oper3 | 45  | 0.489   | 0.474  | 0.722  |  18 |
|      1 | NSE      |  

In [29]:
# Same analysis for the CAMELS-IND region, at leads 1, 5 and 10.
camelsind = analyze_region_csv(
    f"{csv_path}/camelsind.csv",
    region_name="CAMELS-IND",
    leads=[1,5,10],
)

100%|██████████| 3/3 [00:00<00:00,  5.99it/s]


Completed lead-wise tests for CAMELS-IND | NSE


100%|██████████| 3/3 [00:00<00:00,  6.41it/s]

Completed lead-wise tests for CAMELS-IND | F1





In [31]:
# Show the lead-wise table of each metric for CAMELS-IND as Markdown.
# The slope tables ('slope_summary', 'slope_posthoc') are not printed here.
for metric in camelsind:
    print(f"\n== CAMELS-IND | {metric} | Lead-wise ==")
    leadwise_md = camelsind[metric]['leadwise'].to_markdown(index=False)
    print(leadwise_md)


== CAMELS-IND | NSE | Lead-wise ==
|   lead | metric   | Friedman (5 models)       | comparison     | W   | p_adj    | r_rb   | CLES   |   n |
|-------:|:---------|:--------------------------|:---------------|:----|:---------|:-------|:-------|----:|
|      1 | NSE      | χ²=49.9, df=4, p=3.75e-10 | —              | —   | —        | —      | —      |  15 |
|      1 | NSE      |                           | hist vs oper1  | 1   | 0.000732 | 0.983  | 0.933  |  15 |
|      1 | NSE      |                           | hist vs oper2  | 6   | 0.00256  | 0.900  | 0.867  |  15 |
|      1 | NSE      |                           | hist vs oper3  | 0   | 0.00061  | 1.000  | 1.000  |  15 |
|      1 | NSE      |                           | hist vs oper4  | 0   | 0.00061  | 1.000  | 1.000  |  15 |
|      1 | NSE      |                           | oper1 vs oper2 | 15  | 0.0167   | -0.750 | 0.200  |  15 |
|      1 | NSE      |                           | oper1 vs oper3 | 0   | 0.00061  | 1.000  | 1.000  

In [None]:
# Show the lead-wise table of each metric for CAMELS-US as Markdown.
# The slope tables ('slope_summary', 'slope_posthoc') are not printed here.
for metric in camelsus:
    print(f"\n== CAMELS-US | {metric} | Lead-wise ==")
    leadwise_md = camelsus[metric]['leadwise'].to_markdown(index=False)
    print(leadwise_md)


== CAMELS-US | NSE | Lead-wise ==
| Region    |   lead | metric   | Friedman (5 models)       | comparison     | W   | p_adj    | r_rb   | CLES   |   n |
|:----------|-------:|:---------|:--------------------------|:---------------|:----|:---------|:-------|:-------|----:|
| CAMELS-US |      1 | NSE      | χ²=54.7, df=4, p=3.73e-11 | —              | —   | —        | —      | —      |  18 |
| CAMELS-US |      1 | NSE      |                           | hist vs oper1  | 3   | 0.000191 | -0.965 | 0.111  |  18 |
| CAMELS-US |      1 | NSE      |                           | hist vs oper2  | 1   | 0.000107 | -0.988 | 0.056  |  18 |
| CAMELS-US |      1 | NSE      |                           | hist vs oper3  | 42  | 0.145    | -0.509 | 0.222  |  18 |
| CAMELS-US |      1 | NSE      |                           | hist vs oper4  | 0   | 7.63e-05 | -1.000 | 0.000  |  18 |
| CAMELS-US |      1 | NSE      |                           | oper1 vs oper2 | 1   | 0.000107 | -0.988 | 0.056  |  18 |
| CAM

In [None]:
# Show the lead-wise table of each metric for HYSETS as Markdown.
for metric in hysets:
    print(f"\n== HYSETS | {metric} | Lead-wise ==")
    leadwise_md = hysets[metric]['leadwise'].to_markdown(index=False)
    print(leadwise_md)

100%|██████████| 2/2 [00:00<00:00,  4.92it/s]


Completed lead-wise tests for HYSETS | NSE


100%|██████████| 2/2 [00:00<00:00,  6.16it/s]

Completed lead-wise tests for HYSETS | F1

== HYSETS | NSE | Lead-wise ==
| Region   |   lead | metric   | Friedman (5 models)     | comparison     | W   | p_adj   | r_rb   | CLES   |   n |
|:---------|-------:|:---------|:------------------------|:---------------|:----|:--------|:-------|:-------|----:|
| HYSETS   |      1 | NSE      | χ²=7.51, df=4, p=0.111  | —              | —   | —       | —      | —      |  18 |
| HYSETS   |      1 | NSE      |                         | hist vs oper1  | 60  | 1       | 0.298  | 0.611  |  18 |
| HYSETS   |      1 | NSE      |                         | hist vs oper2  | 72  | 1       | 0.158  | 0.500  |  18 |
| HYSETS   |      1 | NSE      |                         | hist vs oper3  | 38  | 0.308   | 0.556  | 0.667  |  18 |
| HYSETS   |      1 | NSE      |                         | hist vs oper4  | 35  | 0.268   | 0.591  | 0.778  |  18 |
| HYSETS   |      1 | NSE      |                         | oper1 vs oper2 | 74  | 1       | -0.135 | 0.444  |  18 




In [None]:
# Show the lead-wise table of each metric for CAMELS-IND as Markdown.
for metric in camelsind:
    print(f"\n== CAMELS-IND | {metric} | Lead-wise ==")
    leadwise_md = camelsind[metric]['leadwise'].to_markdown(index=False)
    print(leadwise_md)

100%|██████████| 2/2 [00:00<00:00,  6.21it/s]


Completed lead-wise tests for CAMELS-IND | NSE


100%|██████████| 2/2 [00:00<00:00,  6.45it/s]

Completed lead-wise tests for CAMELS-IND | F1

== CAMELS-IND | NSE | Lead-wise ==
| Region     |   lead | metric   | Friedman (5 models)       | comparison     | W   | p_adj    | r_rb   | CLES   |   n |
|:-----------|-------:|:---------|:--------------------------|:---------------|:----|:---------|:-------|:-------|----:|
| CAMELS-IND |      1 | NSE      | χ²=49.9, df=4, p=3.75e-10 | —              | —   | —        | —      | —      |  15 |
| CAMELS-IND |      1 | NSE      |                           | hist vs oper1  | 1   | 0.000732 | 0.983  | 0.933  |  15 |
| CAMELS-IND |      1 | NSE      |                           | hist vs oper2  | 6   | 0.00256  | 0.900  | 0.867  |  15 |
| CAMELS-IND |      1 | NSE      |                           | hist vs oper3  | 0   | 0.00061  | 1.000  | 1.000  |  15 |
| CAMELS-IND |      1 | NSE      |                           | hist vs oper4  | 0   | 0.00061  | 1.000  | 1.000  |  15 |
| CAMELS-IND |      1 | NSE      |                           | oper1 vs




In [None]:


# =========================
# Example usage:
# =========================


# # Print as Markdown:
# for metric, tabs in camelsus.items():
#     print(f"\n== CAMELS-US | {metric} | Lead-wise ==")
#     print(tabs['leadwise'].to_markdown(index=False))
#     print(f"\n== CAMELS-US | {metric} | Δ vs hist Slope Summary ==")
#     print(tabs['slope_summary'].to_markdown(index=False))
#     print(f"\n== CAMELS-US | {metric} | Δ vs hist Slope Post-hoc ==")
#     print(tabs['slope_posthoc'].to_markdown(index=False))

# # Optionally save:
# for region_name, res in [('CAMELS-US', camelsus), ('HYSETS', hysets), ('CAMELS-IND', camelsind)]:
#     for metric, tabs in res.items():
#         tabs['leadwise'].to_csv(f"{region_name}_{metric}_leadwise.csv", index=False)
#         tabs['slope_summary'].to_csv(f"{region_name}_{metric}_slope_summary.csv", index=False)
#         tabs['slope_posthoc'].to_csv(f"{region_name}_{metric}_slope_posthoc.csv", index=False)
