# Chronos-2 SFT+LoRA — Statistical Significance Tests (Global vs Industry-Specific)

This notebook loads the evaluation dumps generated by `chronos2_sft_lora_eval_dump.ipynb` and tests whether performance differences between models are statistically robust.

We treat each **ticker** as one statistical unit (paired setting):
1. Aggregate metrics per ticker (averaged over sampled evaluation windows).
2. Compute paired deltas between models for the same tickers.
3. Run a Wilcoxon signed-rank test and a bootstrap 95% confidence interval for the mean delta.
4. For sector-level results (multiple sectors), apply Benjamini–Hochberg (FDR) correction across sectors, separately for each metric and model pair.

**Sign convention:** Δ = metric(model B) − metric(model A). For MAE/MQL, lower is better, so **negative Δ means model B improves over model A**.


In [1]:
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import wilcoxon

# Repository root (to keep paths consistent across machines).
# The notebook is expected to run from a direct child of the repository
# (e.g. <repo>/notebooks), so the root is the parent of the working directory.
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)
REPO_ROOT = Path(current_dir).parent

# Make repo-local modules importable. The membership check keeps this cell
# idempotent: re-running it no longer stacks duplicate entries on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

# Support both layouts (repo root vs notebooks working dir)
cand1 = REPO_ROOT / "notebooks" / "outputs"
cand2 = REPO_ROOT / "outputs"
OUTPUTS_BASE = cand1 if cand1.exists() else cand2

# Input: evaluation dumps. Fail fast when they are missing.
DUMPS_DIR = OUTPUTS_BASE / "eval_dumps" / "sft_lora"
if not DUMPS_DIR.exists():
    raise FileNotFoundError(f"Dump directory not found: {DUMPS_DIR}")

# Output: results of the statistical tests (created on demand).
OUT_DIR = OUTPUTS_BASE / "stats_results" / "sft_lora"
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("REPO_ROOT:", REPO_ROOT)
print("DUMPS_DIR:", DUMPS_DIR)
print("OUT_DIR:", OUT_DIR)


REPO_ROOT: c:\Users\rosar\chronos_dnlp
DUMPS_DIR: c:\Users\rosar\chronos_dnlp\notebooks\outputs\eval_dumps\sft_lora
OUT_DIR: c:\Users\rosar\chronos_dnlp\notebooks\outputs\stats_results\sft_lora


In [2]:
# Read every Parquet dump in DUMPS_DIR and stack them into a single frame.
# Each row is tagged with the file it came from ("source_file").

paths = sorted(DUMPS_DIR.glob("*.parquet"))
print("n_parquet:", len(paths))
if not paths:
    raise RuntimeError("No parquet files found in DUMPS_DIR")

dfs = []
for p in paths:
    # Keep the file name so rows can later be filtered by dump type.
    df = pd.read_parquet(p).assign(source_file=p.name)
    dfs.append(df)

df_all = pd.concat(dfs, ignore_index=True)
print("df_all:", df_all.shape)
df_all.head()


n_parquet: 42
df_all: (72000, 21)


Unnamed: 0,window_id,start_idx,date,ticker,group,model,context_length,y_true,y_pred_q10,y_pred_q20,...,y_pred_q40,y_pred_q50,y_pred_q60,y_pred_q70,y_pred_q80,y_pred_q90,mae,mse,mql,source_file
0,0,184,2022-05-10 00:00:00+00:00,CMCSA,communication_services,baseline,128,0.013581,-0.051625,-0.033667,...,-0.020293,-0.016465,-0.013366,-0.010725,-0.008386,-0.006176,0.030046,0.000903,0.013864,communication_services__baseline.parquet
1,0,184,2022-05-10 00:00:00+00:00,CRM,communication_services,baseline,128,0.021699,-0.085641,-0.055584,...,-0.034463,-0.028186,-0.023013,-0.018546,-0.014516,-0.01066,0.049885,0.002488,0.022887,communication_services__baseline.parquet
2,0,184,2022-05-10 00:00:00+00:00,GOOGL,communication_services,baseline,128,0.016745,-0.065451,-0.043664,...,-0.027775,-0.022832,-0.018718,-0.01514,-0.011928,-0.008861,0.039577,0.001566,0.01808,communication_services__baseline.parquet
3,0,184,2022-05-10 00:00:00+00:00,T,communication_services,baseline,128,-0.009719,-0.056527,-0.031768,...,-0.016174,-0.012397,-0.009495,-0.007089,-0.005084,-0.003258,0.002678,7e-06,0.002119,communication_services__baseline.parquet
4,0,184,2022-05-10 00:00:00+00:00,TMUS,communication_services,baseline,128,0.010433,-0.06339,-0.040039,...,-0.022706,-0.017846,-0.013924,-0.010556,-0.007595,-0.004807,0.028278,0.0008,0.012693,communication_services__baseline.parquet


In [3]:
# -----------------------------
# 2) Aggregate per ticker (unit of analysis)
# -----------------------------
# One row per (group, model, ticker): the mean of each metric over all rows
# of df_all sharing that key.
metrics = ["mae", "mql"]
group_keys = ["group", "model", "ticker"]
agg = df_all.groupby(group_keys, as_index=False)[metrics].mean()
print("agg:", agg.shape)
agg.head()


agg: (549, 5)


Unnamed: 0,group,model,ticker,mae,mql
0,communication_services,baseline,CMCSA,0.014223,0.006565
1,communication_services,baseline,CRM,0.02221,0.00998
2,communication_services,baseline,GOOGL,0.018694,0.008304
3,communication_services,baseline,T,0.012219,0.005621
4,communication_services,baseline,TMUS,0.01258,0.005571


In [4]:
# Statistical helpers

def bootstrap_mean_ci(delta: np.ndarray, n_boot: int = 2000, ci: float = 0.95, seed: int = 123):
    """Percentile bootstrap confidence interval for the mean of ``delta``.

    Args:
        delta: Observations to resample (with replacement).
        n_boot: Number of bootstrap resamples.
        ci: Confidence level, e.g. 0.95 for a 95% interval.
        seed: Seed for the random generator, making the result reproducible.

    Returns:
        ``(lo, hi)`` as Python floats, or ``(nan, nan)`` when ``delta`` is empty.
    """
    rng = np.random.default_rng(seed)
    size = len(delta)
    if size == 0:
        return (np.nan, np.nan)

    # One resample of the original size per iteration; keep only its mean.
    resampled_means = np.array(
        [rng.choice(delta, size=size, replace=True).mean() for _ in range(n_boot)]
    )

    # Symmetric percentile interval: cut (1 - ci) / 2 from each tail.
    lower = np.quantile(resampled_means, (1 - ci) / 2)
    upper = np.quantile(resampled_means, 1 - (1 - ci) / 2)
    return float(lower), float(upper)

def bh_fdr(pvals: np.ndarray):
    """Benjamini–Hochberg FDR correction.

    NaN p-values (e.g. from a test that could not be computed) are excluded
    from the correction: they do not count toward the number of tests and
    their adjusted value stays NaN, instead of being silently reported as 1.0.

    Args:
        pvals: 1-D array-like of raw p-values.

    Returns:
        Float array of BH-adjusted p-values in the same order as ``pvals``,
        clipped to [0, 1], with NaN wherever the input was NaN.
    """
    pvals = np.asarray(pvals, dtype=float)
    out = np.full(pvals.shape, np.nan, dtype=float)

    valid = ~np.isnan(pvals)
    m = int(valid.sum())  # family size = number of usable p-values
    if m == 0:
        return out

    p_valid = pvals[valid]
    order = np.argsort(p_valid)
    # p_(i) * m / i for ascending ranks i = 1..m
    scaled = p_valid[order] * m / np.arange(1, m + 1)
    # Enforce monotonicity: running minimum taken from the largest rank down.
    monotone = np.minimum.accumulate(scaled[::-1])[::-1]

    adj = np.empty(m, dtype=float)
    adj[order] = np.clip(monotone, 0, 1)
    out[valid] = adj
    return out

def paired_test_group(agg_df: pd.DataFrame, group: str, model_a: str, model_b: str, metric: str):
    """Paired comparison of two models on ticker-level metric averages.

    Rows of ``agg_df`` for ``group`` are split by model and joined on
    ``ticker``; the per-ticker delta is ``metric(model_b) - metric(model_a)``.

    Returns:
        A dict with the number of paired tickers, mean and median delta, a
        bootstrap 95% CI for the mean delta and the Wilcoxon signed-rank
        p-value (NaN if the test raises), or ``None`` when fewer than two
        tickers are shared by both models.
    """
    in_group = agg_df["group"] == group
    wanted_cols = ["ticker", metric]
    rows_a = agg_df.loc[in_group & (agg_df["model"] == model_a), wanted_cols]
    rows_b = agg_df.loc[in_group & (agg_df["model"] == model_b), wanted_cols]

    # Inner join: only tickers evaluated under both models form a pair.
    paired = rows_a.merge(rows_b, on="ticker", suffixes=("_a", "_b"))
    n_pairs = len(paired)
    if n_pairs < 2:
        return None

    delta = (paired[f"{metric}_b"] - paired[f"{metric}_a"]).to_numpy()
    ci_lo, ci_hi = bootstrap_mean_ci(delta, n_boot=5000, ci=0.95, seed=123)

    # Wilcoxon signed-rank test (paired, non-parametric); best-effort, a
    # failure is recorded as a NaN p-value rather than aborting the run.
    try:
        p_value = float(wilcoxon(delta).pvalue)
    except Exception:
        p_value = np.nan

    result = {
        "group": group,
        "metric": metric,
        "model_a": model_a,
        "model_b": model_b,
        "n_tickers": int(n_pairs),
        "mean_delta_b_minus_a": float(np.mean(delta)),
        "median_delta_b_minus_a": float(np.median(delta)),
        "ci95_lo": ci_lo,
        "ci95_hi": ci_hi,
        "p_wilcoxon": p_value,
    }
    return result


In [5]:
# Build comparisons: one result dict per (group, metric, model pair).

comparisons = []

# GLOBAL: baseline vs LoRA general
for metric in metrics:
    r = paired_test_group(agg, "global", "baseline", "lora_general", metric)
    if r:
        comparisons.append(r)

# CATEGORY-LEVEL tests:
# We use the FAIR subset for categories:
#  - baseline:            __baseline.parquet
#  - lora_general (fair): __lora_general_ctx_cat.parquet
#  - lora_category:       __lora_category.parquet
# The dump type is the file-name suffix, so match it literally with endswith.
# (str.contains would treat the pattern as a regex, where "." matches any
# character and the match is not anchored to the end of the name.)
source_file = df_all["source_file"]
cat_mask_general_fair = source_file.str.endswith("__lora_general_ctx_cat.parquet")
cat_mask_category     = source_file.str.endswith("__lora_category.parquet")
cat_mask_baseline     = source_file.str.endswith("__baseline.parquet") & (df_all["group"] != "global")

df_cat_fair = df_all[cat_mask_general_fair | cat_mask_category | cat_mask_baseline].copy()

# Ticker-level averages restricted to the fair subset.
agg_cat_fair = (
    df_cat_fair
    .groupby(["group", "model", "ticker"], as_index=False)[metrics]
    .mean()
)

cats = sorted([g for g in agg_cat_fair["group"].unique() if g != "global"])
print("n_categories in dumps:", len(cats))

for cat in cats:
    for metric in metrics:
        # Category-specific LoRA vs the untuned baseline.
        r1 = paired_test_group(agg_cat_fair, cat, "baseline", "lora_category", metric)
        if r1:
            comparisons.append(r1)

        # Category-specific LoRA vs the general LoRA (fair dump).
        r2 = paired_test_group(agg_cat_fair, cat, "lora_general", "lora_category", metric)
        if r2:
            comparisons.append(r2)

df_comp = pd.DataFrame(comparisons)
df_comp


n_categories in dumps: 10


Unnamed: 0,group,metric,model_a,model_b,n_tickers,mean_delta_b_minus_a,median_delta_b_minus_a,ci95_lo,ci95_hi,p_wilcoxon
0,global,mae,baseline,lora_general,114,-0.001995,-0.001843,-0.002135,-0.001862,1.918356e-20
1,global,mql,baseline,lora_general,114,-0.001005,-0.000948,-0.001065,-0.000946,1.918356e-20
2,communication_services,mae,baseline,lora_category,6,-0.001219,-0.001182,-0.001695,-0.000781,0.03125
3,communication_services,mae,lora_general,lora_category,6,0.000526,0.000471,0.000355,0.000715,0.03125
4,communication_services,mql,baseline,lora_category,6,-0.000588,-0.000546,-0.000774,-0.000425,0.03125
5,communication_services,mql,lora_general,lora_category,6,0.000226,0.000201,0.000178,0.000287,0.03125
6,consumer_discretionary,mae,baseline,lora_category,10,-0.002306,-0.002398,-0.002747,-0.001841,0.001953125
7,consumer_discretionary,mae,lora_general,lora_category,10,-4.9e-05,-4e-06,-0.000251,0.000143,0.6953125
8,consumer_discretionary,mql,baseline,lora_category,10,-0.001187,-0.00128,-0.001392,-0.000968,0.001953125
9,consumer_discretionary,mql,lora_general,lora_category,10,-8.9e-05,-6.4e-05,-0.000198,1.3e-05,0.1933594


In [6]:

# Multiple-testing correction (BH/FDR) for category comparisons.
# A family is the set of category rows sharing (metric, model_a, model_b);
# global rows are not corrected and keep NaN in "p_adj_bh".

df_comp["p_adj_bh"] = np.nan

category_rows = df_comp[df_comp["group"] != "global"]
family_keys = ["metric", "model_a", "model_b"]
for (metric, model_a, model_b), family in category_rows.groupby(family_keys):
    df_comp.loc[family.index, "p_adj_bh"] = bh_fdr(family["p_wilcoxon"].to_numpy())

sort_cols = ["metric", "model_a", "model_b", "p_adj_bh", "p_wilcoxon"]
df_comp_sorted = df_comp.sort_values(sort_cols)
df_comp_sorted.head(20)


Unnamed: 0,group,metric,model_a,model_b,n_tickers,mean_delta_b_minus_a,median_delta_b_minus_a,ci95_lo,ci95_hi,p_wilcoxon,p_adj_bh
22,health_care,mae,baseline,lora_category,18,-0.000713,-0.000562,-0.00094,-0.000494,7.629395e-06,7.6e-05
30,information_technology,mae,baseline,lora_category,17,-0.002514,-0.002633,-0.002845,-0.002172,1.525879e-05,7.6e-05
26,industrials,mae,baseline,lora_category,18,-9.5e-05,-0.00012,-0.000134,-5.6e-05,0.0005340576,0.00178
10,consumer_staples,mae,baseline,lora_category,11,-0.001297,-0.001126,-0.001744,-0.000932,0.0009765625,0.002441
6,consumer_discretionary,mae,baseline,lora_category,10,-0.002306,-0.002398,-0.002747,-0.001841,0.001953125,0.003906
18,financials,mae,baseline,lora_category,17,4.2e-05,3.8e-05,1.6e-05,6.7e-05,0.009338379,0.015564
2,communication_services,mae,baseline,lora_category,6,-0.001219,-0.001182,-0.001695,-0.000781,0.03125,0.039062
14,energy,mae,baseline,lora_category,6,-0.001921,-0.00159,-0.002576,-0.001437,0.03125,0.039062
34,materials,mae,baseline,lora_category,2,-0.00194,-0.00194,-0.002348,-0.001532,0.5,0.5
38,real_estate,mae,baseline,lora_category,2,-0.001456,-0.001456,-0.002018,-0.000894,0.5,0.5


In [7]:
# Persist the sorted comparison table as CSV.

out_csv = OUT_DIR / "paired_tests_ticker_level.csv"
df_comp_sorted.to_csv(out_csv, index=False)
print("Saved:", out_csv)

# Quick view: global results + category results for (baseline -> lora_category) on MQL
is_global = df_comp_sorted["group"] == "global"
display(df_comp_sorted[is_global])

is_mql_baseline_vs_category = (
    ~is_global
    & (df_comp_sorted["metric"] == "mql")
    & (df_comp_sorted["model_a"] == "baseline")
    & (df_comp_sorted["model_b"] == "lora_category")
)
display(df_comp_sorted[is_mql_baseline_vs_category].sort_values("p_adj_bh"))


Saved: c:\Users\rosar\chronos_dnlp\notebooks\outputs\stats_results\sft_lora\paired_tests_ticker_level.csv


Unnamed: 0,group,metric,model_a,model_b,n_tickers,mean_delta_b_minus_a,median_delta_b_minus_a,ci95_lo,ci95_hi,p_wilcoxon,p_adj_bh
0,global,mae,baseline,lora_general,114,-0.001995,-0.001843,-0.002135,-0.001862,1.918356e-20,
1,global,mql,baseline,lora_general,114,-0.001005,-0.000948,-0.001065,-0.000946,1.918356e-20,


Unnamed: 0,group,metric,model_a,model_b,n_tickers,mean_delta_b_minus_a,median_delta_b_minus_a,ci95_lo,ci95_hi,p_wilcoxon,p_adj_bh
24,health_care,mql,baseline,lora_category,18,-0.000318,-0.000265,-0.000419,-0.000226,8e-06,7.6e-05
32,information_technology,mql,baseline,lora_category,17,-0.001279,-0.001309,-0.001448,-0.001115,1.5e-05,7.6e-05
28,industrials,mql,baseline,lora_category,18,-4.2e-05,-4.9e-05,-5.8e-05,-2.6e-05,0.000534,0.00178
12,consumer_staples,mql,baseline,lora_category,11,-0.000696,-0.000596,-0.000904,-0.000526,0.000977,0.002441
8,consumer_discretionary,mql,baseline,lora_category,10,-0.001187,-0.00128,-0.001392,-0.000968,0.001953,0.003906
20,financials,mql,baseline,lora_category,17,2e-05,1.8e-05,1e-05,2.9e-05,0.002579,0.004298
4,communication_services,mql,baseline,lora_category,6,-0.000588,-0.000546,-0.000774,-0.000425,0.03125,0.039062
16,energy,mql,baseline,lora_category,6,-0.000821,-0.000685,-0.001106,-0.00059,0.03125,0.039062
36,materials,mql,baseline,lora_category,2,-0.001012,-0.001012,-0.001239,-0.000785,0.5,0.5
40,real_estate,mql,baseline,lora_category,2,-0.000833,-0.000833,-0.001069,-0.000596,0.5,0.5
