In [16]:
import re
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd
from scipy import stats

# ---------------------------
# Parsing
# ---------------------------

def parse_results_file(path: str) -> Dict[str, Dict[str, Dict[str, float]]]:
    """
    Parse one results file.

    Returns:
      data[model][metric] = {'value': float, 'lo': float or None, 'hi': float or None}
    Skips entries with "Not found".
    """
    data: Dict[str, Dict[str, Dict[str, float]]] = {}
    current_model = None

    subject_re = re.compile(r'^\s*Subject:\s*(.+?)\s*$')
    line_re = re.compile(r'^\s{2,}(.+?):\s*(.+?)\s*$')
    num_ci_re = re.compile(
        r'^\s*([+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)\s*\[\s*([+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)\s*,\s*([+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)\s*\]\s*$'
    )
    num_only_re = re.compile(r'^\s*([+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)\s*$')

    with open(path, 'r', encoding='utf-8') as f:
        for raw in f:
            line = raw.rstrip('\n')

            m_sub = subject_re.match(line)
            if m_sub:
                current_model = m_sub.group(1).strip()
                if current_model not in data:
                    data[current_model] = {}
                continue

            m_line = line_re.match(line)
            if not m_line or current_model is None:
                continue

            metric = m_line.group(1).strip()
            val_str = m_line.group(2).strip()

            # Skip "Not found"
            if val_str.lower().startswith('not found'):
                continue

            m_num_ci = num_ci_re.match(val_str)
            if m_num_ci:
                v = float(m_num_ci.group(1))
                lo = float(m_num_ci.group(2))
                hi = float(m_num_ci.group(3))
                data[current_model][metric] = {'value': v, 'lo': lo, 'hi': hi}
                continue

            m_num = num_only_re.match(val_str)
            if m_num:
                v = float(m_num.group(1))
                data[current_model][metric] = {'value': v, 'lo': None, 'hi': None}
                continue

            # Unrecognized format; ignore
    return data


# ---------------------------
# Utilities
# ---------------------------

def is_correlation_metric(metric_name: str) -> bool:
    """Heuristic: treat metrics containing 'correl' (case-insensitive) as correlations."""
    return 'correl' in metric_name.lower()

def zcrit_from_level(level: float = 0.95) -> float:
    return stats.norm.ppf(0.5 + level/2.0)

def fisher_z(x: np.ndarray) -> np.ndarray:
    x = np.clip(x, -0.999999, 0.999999)
    return np.arctanh(x)

def inv_fisher_z(z: np.ndarray) -> np.ndarray:
    return np.tanh(z)

def se_from_ci(lo: np.ndarray, hi: np.ndarray, level: float = 0.95) -> np.ndarray:
    zcrit = zcrit_from_level(level)
    return (hi - lo) / (2.0 * zcrit)

def se_from_ci_fisher_r(lo_r: np.ndarray, hi_r: np.ndarray, level: float = 0.95) -> np.ndarray:
    zcrit = zcrit_from_level(level)
    lo_z = fisher_z(lo_r)
    hi_z = fisher_z(hi_r)
    return (hi_z - lo_z) / (2.0 * zcrit)

def fixed_effect_pool(values: np.ndarray, se: np.ndarray) -> Tuple[float, float, float]:
    """Inverse-variance weighted mean and its SE and CI bounds (on the same scale as values)."""
    w = 1.0 / (se**2)
    mu = np.sum(w * values) / np.sum(w)
    se_mu = 1.0 / np.sqrt(np.sum(w))
    return mu, se_mu, w.sum()

def mean_t_ci(x: np.ndarray, level: float = 0.95) -> Tuple[float, float, float]:
    """Mean, SE, and t critical for a sample (across models)."""
    k = x.size
    m = x.mean()
    sd = x.std(ddof=1) if k > 1 else 0.0
    se = sd / np.sqrt(k) if k > 0 else np.nan
    tcrit = stats.t.ppf(0.5 + level/2.0, df=max(k-1, 1))
    return m, se, tcrit


# ---------------------------
# Collect per-metric data
# ---------------------------

def collect_metric_data(metric: str, file_paths: List[str]) -> Dict[str, pd.DataFrame]:
    """
    For each file, return a DataFrame with columns:
      model, value, lo, hi, has_ci (bool)
    Only includes models that have the metric present.
    """
    out = {}
    metric_lower = metric.lower()
    for path in file_paths:
        parsed = parse_results_file(path)
        rows = []
        for model, metrics in parsed.items():
            # case-insensitive lookup across keys
            match_key = None
            for k in metrics.keys():
                if k.lower() == metric_lower:
                    match_key = k
                    break
            if match_key is None:
                continue
            entry = metrics[match_key]
            v = entry.get('value', None)
            lo = entry.get('lo', None)
            hi = entry.get('hi', None)
            if v is None:
                continue
            has_ci = lo is not None and hi is not None
            rows.append({'model': model, 'value': float(v),
                         'lo': float(lo) if has_ci else np.nan,
                         'hi': float(hi) if has_ci else np.nan,
                         'has_ci': has_ci})
        df = pd.DataFrame(rows)
        out[path] = df.sort_values('model').reset_index(drop=True)
    return out


# ---------------------------
# Per-file summaries
# ---------------------------

def summarize_per_file(metric: str,
                       file_to_df: Dict[str, pd.DataFrame],
                       ci_level: float = 0.95) -> pd.DataFrame:
    """
    For each file, compute an overall mean and CI:
      - If per-model CIs exist: fixed-effect inverse-variance pooling.
        * If correlation metric: operate on Fisher z and back-transform for reporting.
        * Else: operate on raw scale.
      - If no per-model CIs: mean±t across models.
    Returns a DataFrame with columns:
      file, n_models, method, mean, ci_lo, ci_hi
    """
    is_corr = is_correlation_metric(metric)
    zcrit = zcrit_from_level(ci_level)

    rows = []
    for path, df in file_to_df.items():
        n = len(df)
        if n == 0:
            rows.append({'file': path, 'n_models': 0, 'method': 'no data',
                         'mean': np.nan, 'ci_lo': np.nan, 'ci_hi': np.nan})
            continue

        has_any_ci = df['has_ci'].any()
        if has_any_ci:
            # Use only models with CIs
            d = df[df['has_ci']].copy()
            if len(d) == 0:
                rows.append({'file': path, 'n_models': 0, 'method': 'no usable CI',
                             'mean': np.nan, 'ci_lo': np.nan, 'ci_hi': np.nan})
                continue

            if is_corr:
                # Pool on Fisher z
                vals_z = fisher_z(d['value'].to_numpy())
                se_z = se_from_ci_fisher_r(d['lo'].to_numpy(), d['hi'].to_numpy(), level=ci_level)
                se_z, used_floor = stabilize_se(se_z, fallback='floor')
                z_bar, se_bar, _ = fixed_effect_pool(vals_z, se_z)
                ci_z = (z_bar - zcrit*se_bar, z_bar + zcrit*se_bar)
                mean_r = float(inv_fisher_z(np.array([z_bar]))[0])
                ci_lo = float(inv_fisher_z(np.array([ci_z[0]]))[0])
                ci_hi = float(inv_fisher_z(np.array([ci_z[1]]))[0])
                method = 'fixed-effect IVW on Fisher z'
                rows.append({'file': path, 'n_models': int(len(d)), 'method': method,
                             'mean': mean_r, 'ci_lo': ci_lo, 'ci_hi': ci_hi})
            else:
                # Pool on raw scale
                vals = d['value'].to_numpy()
                se = se_from_ci(d['lo'].to_numpy(), d['hi'].to_numpy(), level=ci_level)
                se, used_floor = stabilize_se(se, fallback='floor')
                mu, se_mu, _ = fixed_effect_pool(vals, se)
                ci = (mu - zcrit*se_mu, mu + zcrit*se_mu)
                method = 'fixed-effect IVW (raw scale)'
                rows.append({'file': path, 'n_models': int(len(d)), 'method': method,
                             'mean': float(mu), 'ci_lo': float(ci[0]), 'ci_hi': float(ci[1])})
        else:
            # No per-model CIs -> mean±t across models
            vals = df['value'].to_numpy()
            m, se, tcrit = mean_t_ci(vals, level=ci_level)
            ci = (m - tcrit*se, m + tcrit*se) if len(vals) > 1 else (m, m)
            method = 'mean±t across models (no model CIs)'
            rows.append({'file': path, 'n_models': int(len(vals)), 'method': method,
                         'mean': float(m), 'ci_lo': float(ci[0]), 'ci_hi': float(ci[1])})

    summary = pd.DataFrame(rows)
    return summary


# ---------------------------
# Pairwise comparisons
# ---------------------------

def pairwise_comparisons(metric: str,
                         file_to_df: Dict[str, pd.DataFrame],
                         ci_level: float = 0.95,
                         assume_independent: bool = True) -> pd.DataFrame:
    """
    For each file pair:
      - Identify overlapping models with data for the metric in both.
      - If both sides have per-model CIs for the overlaps:
          * If correlation metric: paired IVW difference on Fisher z.
          * Else: paired IVW difference on raw scale.
      - Else (no CIs): paired t-test across models (raw differences).

    Returns DataFrame with:
      file1, file2, n_common, same_model_set, method, est, ci_lo, ci_hi, p_value
    est/CI are on:
      - Fisher z scale for correlation metrics (paired IVW),
      - raw scale otherwise,
      - raw scale difference for paired t.
    """
    is_corr = is_correlation_metric(metric)
    zcrit = zcrit_from_level(ci_level)
    files = list(file_to_df.keys())

    # Precompute model sets for equality check
    models_per_file = {f: set(file_to_df[f]['model']) for f in files}

    records = []

    for i in range(len(files)):
        for j in range(i+1, len(files)):
            f1, f2 = files[i], files[j]
            d1 = file_to_df[f1]
            d2 = file_to_df[f2]
            overlap = sorted(set(d1['model']).intersection(set(d2['model'])))

            same_set = models_per_file[f1] == models_per_file[f2]
            if len(overlap) == 0:
                records.append({'file1': f1, 'file2': f2, 'n_common': 0, 'same_model_set': same_set,
                                'method': 'no overlap', 'est': np.nan, 'ci_lo': np.nan,
                                'ci_hi': np.nan, 'p_value': np.nan})
                continue

            # Join on model
            j1 = d1.set_index('model').loc[overlap]
            j2 = d2.set_index('model').loc[overlap]

            # Identify if we can do IVW (need CIs on both)
            use_ivw = j1['has_ci'].all() and j2['has_ci'].all()

            if use_ivw:
                if is_corr:
                    # Work on Fisher z
                    v1 = fisher_z(j1['value'].to_numpy())
                    v2 = fisher_z(j2['value'].to_numpy())
                    se1 = se_from_ci_fisher_r(j1['lo'].to_numpy(), j1['hi'].to_numpy(), level=ci_level)
                    se2 = se_from_ci_fisher_r(j2['lo'].to_numpy(), j2['hi'].to_numpy(), level=ci_level)
                    # Independence assumption -> Var(d) = se1^2 + se2^2
                    var_d = se1**2 + se2**2 if assume_independent else se1**2 + se2**2
                    d = v2 - v1
                    w = 1.0 / var_d
                    D = np.sum(w * d) / np.sum(w)
                    SE_D = 1.0 / np.sqrt(np.sum(w))
                    ci = (D - zcrit*SE_D, D + zcrit*SE_D)
                    z_stat = D / SE_D
                    p = 2.0 * (1.0 - stats.norm.cdf(abs(z_stat)))
                    method = 'paired IVW on Fisher z'
                    records.append({'file1': f1, 'file2': f2, 'n_common': len(overlap),
                                    'same_model_set': same_set, 'method': method,
                                    'est': float(D), 'ci_lo': float(ci[0]),
                                    'ci_hi': float(ci[1]), 'p_value': float(p)})
                else:
                    # Raw scale IVW
                    v1 = j1['value'].to_numpy()
                    v2 = j2['value'].to_numpy()
                    se1 = se_from_ci(j1['lo'].to_numpy(), j1['hi'].to_numpy(), level=ci_level)
                    se2 = se_from_ci(j2['lo'].to_numpy(), j2['hi'].to_numpy(), level=ci_level)
                    var_d = se1**2 + se2**2 if assume_independent else se1**2 + se2**2
                    d = v2 - v1
                    w = 1.0 / var_d
                    D = np.sum(w * d) / np.sum(w)
                    SE_D = 1.0 / np.sqrt(np.sum(w))
                    ci = (D - zcrit*SE_D, D + zcrit*SE_D)
                    z_stat = D / SE_D
                    p = 2.0 * (1.0 - stats.norm.cdf(abs(z_stat)))
                    method = 'paired IVW (raw scale)'
                    records.append({'file1': f1, 'file2': f2, 'n_common': len(overlap),
                                    'same_model_set': same_set, 'method': method,
                                    'est': float(D), 'ci_lo': float(ci[0]),
                                    'ci_hi': float(ci[1]), 'p_value': float(p)})
            else:
                # Paired t on raw differences across models
                # Use only models with values (ignore CI fields)
                v1 = j1['value'].to_numpy()
                v2 = j2['value'].to_numpy()
                d = v2 - v1
                k = d.size
                Dbar = d.mean()
                sd = d.std(ddof=1) if k > 1 else 0.0
                se = sd / np.sqrt(k) if k > 0 else np.nan
                tcrit = stats.t.ppf(0.5 + ci_level/2.0, df=max(k-1, 1))
                ci = (Dbar - tcrit*se, Dbar + tcrit*se) if k > 1 else (Dbar, Dbar)
                t_stat = Dbar / se if se > 0 else np.inf * np.sign(Dbar)
                p = 2.0 * (1.0 - stats.t.cdf(abs(t_stat), df=max(k-1, 1))) if k > 1 else 0.0
                method = 'paired t across models (no model CIs)'
                records.append({'file1': f1, 'file2': f2, 'n_common': int(k),
                                'same_model_set': same_set, 'method': method,
                                'est': float(Dbar), 'ci_lo': float(ci[0]),
                                'ci_hi': float(ci[1]), 'p_value': float(p)})

    return pd.DataFrame.from_records(records)


# ---------------------------
# Main entry point
# ---------------------------

def analyze_metric_across_files(metric_name: str,
                                file_paths: List[str],
                                ci_level: float = 0.95,
                                assume_independent: bool = True):
    """
    High-level function:
      - Parses files
      - Builds per-file summaries
      - Builds pairwise comparison table

    Returns (summary_df, pairwise_df)
    Also prints both tables.
    """
    file_to_df = collect_metric_data(metric_name, file_paths)

    # Per-file summaries
    summary = summarize_per_file(metric_name, file_to_df, ci_level=ci_level)

    # Pairwise comparisons
    pairwise = pairwise_comparisons(metric_name, file_to_df,
                                    ci_level=ci_level,
                                    assume_independent=assume_independent)

    # Pretty print
    pd.set_option('display.max_colwidth', 120)
    print(f"\nMetric: {metric_name}")
    print("\nPer-file summaries:")
    display(summary)

    print("\nPairwise comparisons:")
    display(pairwise)

    return summary, pairwise

def difference_of_pooled_test(summary_df: pd.DataFrame, ci_level: float = 0.95) -> pd.DataFrame:
    """
    For each file pair in a summary table (from summarize_per_file), test the
    difference between pooled means using their pooled SEs.

    Returns: file1, file2, est, ci_lo, ci_hi, p_value
    """
    zcrit = zcrit_from_level(ci_level)
    rows = []
    for i in range(len(summary_df)):
        for j in range(i+1, len(summary_df)):
            a = summary_df.iloc[i]
            b = summary_df.iloc[j]
            m1, m2 = float(a['mean']), float(b['mean'])
            se1 = (float(a['ci_hi']) - m1) / zcrit
            se2 = (float(b['ci_hi']) - m2) / zcrit
            diff = m1 - m2
            se = np.sqrt(se1**2 + se2**2)
            ci = (diff - zcrit*se, diff + zcrit*se)
            z_stat = diff / se if se > 0 else np.inf * np.sign(diff)
            p = 2.0 * (1.0 - stats.norm.cdf(abs(z_stat)))
            rows.append({'file1': a['file'], 'file2': b['file'],
                         'est': diff, 'ci_lo': ci[0], 'ci_hi': ci[1], 'p_value': p})
    return pd.DataFrame(rows)

def stabilize_se(se: np.ndarray, fallback: str = 'floor') -> Tuple[np.ndarray, bool]:
    """
    Replace nonpositive or nonfinite SEs.
    fallback:
      - 'floor': set to 0.5 * min positive SE (or 1e-12 if none).
      - 'drop': return SEs and a flag indicating there were bad SEs (caller can drop rows).
    Returns (se_fixed, used_floor)
    """
    se = np.asarray(se, float)
    pos = np.isfinite(se) & (se > 0)
    if pos.any():
        floor = max(1e-12, 0.5 * float(np.min(se[pos])))
        bad = ~pos
        if fallback == 'floor':
            se[bad] = floor
            return se, True if bad.any() else False
        elif fallback == 'drop':
            return se, bad.any()
    else:
        # No positive SEs at all
        if fallback == 'floor':
            se[:] = 1e-12
            return se, True
        elif fallback == 'drop':
            return se, True
    return se, False

In [None]:
files = ["analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt", "analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt", "analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt", "analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt"]

# Metrics you listed
metrics = [
    "Delegation rate",
    "Top Prob Mean",
    "Raw introspection score",
    "Raw self-acc lift",
    "Correctness Coef Cntl",
    "Capent Correl Cntl",            # treated as correlation (Fisher z)
    "Capent Correl Prob Cntl",       # treated as correlation (Fisher z)
    "Calibration AUC",
    "ECE",
    "Brier",
    "Brier Resolution",
    "Brier Reliability",
    "Pseudo R2 Cntl",                # no CIs -> paired t
]

for m in metrics:
    summarize, compare = analyze_metric_across_files(m, files, ci_level=0.95, assume_independent=True)


Metric: Raw introspection score

Per-file summaries:


Unnamed: 0,file,n_models,method,mean,ci_lo,ci_hi
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,15,fixed-effect IVW (raw scale),0.130885,0.106539,0.155231
1,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,15,fixed-effect IVW (raw scale),0.103908,0.080447,0.127368
2,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,15,fixed-effect IVW (raw scale),0.141092,0.117037,0.165148
3,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,15,fixed-effect IVW (raw scale),0.144308,0.117596,0.17102



Pairwise comparisons:


Unnamed: 0,file1,file2,n_common,same_model_set,method,est,ci_lo,ci_hi,p_value
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),-0.029313,-0.063249,0.004622,0.09045
1,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),0.015235,-0.019553,0.050024,0.390709
2,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),0.017392,-0.019354,0.054137,0.353591
3,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),0.038349,0.004176,0.072521,0.027844
4,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),0.044254,0.008179,0.080329,0.016202
5,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),0.00208,-0.034259,0.038419,0.910677



Metric: Raw self-acc lift

Per-file summaries:


Unnamed: 0,file,n_models,method,mean,ci_lo,ci_hi
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,15,fixed-effect IVW (raw scale),0.106005,0.076924,0.135087
1,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,15,fixed-effect IVW (raw scale),0.059782,0.026587,0.092978
2,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,15,fixed-effect IVW (raw scale),0.022227,0.003565,0.040889
3,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,15,fixed-effect IVW (raw scale),0.056633,0.031857,0.08141



Pairwise comparisons:


Unnamed: 0,file1,file2,n_common,same_model_set,method,est,ci_lo,ci_hi,p_value
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),-0.036931,-0.082694,0.008832,0.113721
1,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),-0.072294,-0.108737,-0.035852,0.000101
2,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),-0.028114,-0.071076,0.014848,0.199636
3,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),-0.033802,-0.075166,0.007561,0.109223
4,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),0.009137,-0.039472,0.057745,0.712569
5,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),0.043966,0.010727,0.077205,0.009528



Metric: Correctness Coef Cntl

Per-file summaries:


Unnamed: 0,file,n_models,method,mean,ci_lo,ci_hi
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,15,fixed-effect IVW (raw scale),1.879967,1.373909,2.386026
1,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,15,fixed-effect IVW (raw scale),2.064486,1.558425,2.570548
2,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,15,fixed-effect IVW (raw scale),2.511758,2.005695,3.017821
3,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,15,fixed-effect IVW (raw scale),2.803849,2.29779,3.309909



Pairwise comparisons:


Unnamed: 0,file1,file2,n_common,same_model_set,method,est,ci_lo,ci_hi,p_value
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),0.184517,-0.53116,0.900194,0.613333
1,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),0.631797,-0.083881,1.347475,0.083586
2,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),0.923878,0.208202,1.639553,0.011401
3,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),0.44728,-0.2684,1.16296,0.220604
4,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),0.739363,0.023686,1.455041,0.042885
5,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,15,True,paired IVW (raw scale),0.292076,-0.423602,1.007755,0.423779



Metric: Capent Correl Cntl

Per-file summaries:


Unnamed: 0,file,n_models,method,mean,ci_lo,ci_hi
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,8,fixed-effect IVW on Fisher z,0.198383,0.164198,0.232092
1,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,8,fixed-effect IVW on Fisher z,0.214195,0.182312,0.245627
2,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,fixed-effect IVW on Fisher z,0.115257,0.071213,0.158852
3,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,fixed-effect IVW on Fisher z,0.016945,-0.025716,0.059544



Pairwise comparisons:


Unnamed: 0,file1,file2,n_common,same_model_set,method,est,ci_lo,ci_hi,p_value
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,8,True,paired IVW on Fisher z,0.015699,-0.032794,0.064191,0.5257421
1,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,False,paired IVW on Fisher z,-0.087384,-0.146327,-0.02844,0.003665124
2,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,False,paired IVW on Fisher z,-0.191464,-0.250965,-0.131964,2.847129e-10
3,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,False,paired IVW on Fisher z,-0.09458,-0.152006,-0.037154,0.001246378
4,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,False,paired IVW on Fisher z,-0.224116,-0.281931,-0.166301,3.019807e-14
5,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,True,paired IVW on Fisher z,-0.103821,-0.16922,-0.038421,0.001861886



Metric: Capent Correl Prob Cntl

Per-file summaries:


Unnamed: 0,file,n_models,method,mean,ci_lo,ci_hi
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,7,fixed-effect IVW on Fisher z,0.211812,0.175433,0.247613
1,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,7,fixed-effect IVW on Fisher z,0.235805,0.201991,0.269057
2,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,6,fixed-effect IVW on Fisher z,0.150006,0.101762,0.197546
3,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,6,fixed-effect IVW on Fisher z,0.031396,-0.015821,0.078473



Pairwise comparisons:


Unnamed: 0,file1,file2,n_common,same_model_set,method,est,ci_lo,ci_hi,p_value
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,7,True,paired IVW on Fisher z,0.024045,-0.027829,0.075918,0.3636166
1,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,6,False,paired IVW on Fisher z,-0.061252,-0.125767,0.003264,0.06276978
2,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,6,False,paired IVW on Fisher z,-0.216153,-0.281784,-0.150521,1.082374e-10
3,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,6,False,paired IVW on Fisher z,-0.098731,-0.161599,-0.035864,0.002083554
4,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,6,False,paired IVW on Fisher z,-0.259652,-0.323436,-0.195868,1.554312e-15
5,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,6,True,paired IVW on Fisher z,-0.124912,-0.198213,-0.051611,0.0008378216



Metric: Calibration AUC

Per-file summaries:


Unnamed: 0,file,n_models,method,mean,ci_lo,ci_hi
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,9,fixed-effect IVW (raw scale),0.645787,0.627454,0.66412
1,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,9,fixed-effect IVW (raw scale),0.625355,0.60852,0.642189
2,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,fixed-effect IVW (raw scale),0.581051,0.555419,0.606683
3,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,fixed-effect IVW (raw scale),0.707129,0.685002,0.729257



Pairwise comparisons:


Unnamed: 0,file1,file2,n_common,same_model_set,method,est,ci_lo,ci_hi,p_value
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,9,True,paired IVW (raw scale),-0.017232,-0.042156,0.007693,0.1754045
1,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),-0.065233,-0.09951,-0.030957,0.0001914135
2,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),0.045431,0.014351,0.07651,0.004170081
3,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),-0.041357,-0.074424,-0.008291,0.01423031
4,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),0.068692,0.038698,0.098685,7.166608e-06
5,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,True,paired IVW (raw scale),0.106097,0.071729,0.140465,1.442722e-09



Metric: ECE

Per-file summaries:


Unnamed: 0,file,n_models,method,mean,ci_lo,ci_hi
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,9,fixed-effect IVW (raw scale),0.397627,0.382034,0.41322
1,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,9,fixed-effect IVW (raw scale),0.356105,0.341584,0.370627
2,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,fixed-effect IVW (raw scale),0.72591,0.71168,0.74014
3,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,fixed-effect IVW (raw scale),0.65924,0.645063,0.673418



Pairwise comparisons:


Unnamed: 0,file1,file2,n_common,same_model_set,method,est,ci_lo,ci_hi,p_value
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,9,True,paired IVW (raw scale),-0.037896,-0.059222,-0.01657,0.0004961525
1,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),0.313966,0.290897,0.337035,0.0
2,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),0.23705,0.213692,0.260408,0.0
3,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),0.346076,0.323849,0.368304,0.0
4,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),0.263211,0.240701,0.285721,0.0
5,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,True,paired IVW (raw scale),-0.069542,-0.089883,-0.049201,2.074851e-11



Metric: Brier

Per-file summaries:


Unnamed: 0,file,n_models,method,mean,ci_lo,ci_hi
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,9,fixed-effect IVW (raw scale),0.394659,0.382082,0.407236
1,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,9,fixed-effect IVW (raw scale),0.352324,0.341465,0.363183
2,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,fixed-effect IVW (raw scale),0.674575,0.661912,0.687238
3,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,fixed-effect IVW (raw scale),0.59926,0.586134,0.612386



Pairwise comparisons:


Unnamed: 0,file1,file2,n_common,same_model_set,method,est,ci_lo,ci_hi,p_value
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,9,True,paired IVW (raw scale),-0.028552,-0.045246,-0.011858,0.0008018612
1,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),0.260352,0.240711,0.279993,0.0
2,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),0.177302,0.157189,0.197415,0.0
3,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),0.283929,0.2654,0.302459,0.0
4,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),0.194008,0.174832,0.213183,0.0
5,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,True,paired IVW (raw scale),-0.073587,-0.091974,-0.0552,4.440892e-15



Metric: Brier Resolution

Per-file summaries:


Unnamed: 0,file,n_models,method,mean,ci_lo,ci_hi
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,9,fixed-effect IVW (raw scale),0.009552,0.006852,0.012253
1,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,9,fixed-effect IVW (raw scale),0.004021,0.002266,0.005775
2,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,fixed-effect IVW (raw scale),0.001264,0.000384,0.002144
3,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,fixed-effect IVW (raw scale),0.003958,0.00238,0.005536



Pairwise comparisons:


Unnamed: 0,file1,file2,n_common,same_model_set,method,est,ci_lo,ci_hi,p_value
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,9,True,paired IVW (raw scale),-0.001947,-0.005558,0.001665,0.290827
1,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),-0.008244,-0.011593,-0.004895,1e-06
2,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),-0.004919,-0.008781,-0.001056,0.012561
3,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),-0.002649,-0.004912,-0.000385,0.021841
4,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),0.000213,-0.002533,0.00296,0.878972
5,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,True,paired IVW (raw scale),0.002671,0.000699,0.004643,0.007939



Metric: Brier Reliability

Per-file summaries:


Unnamed: 0,file,n_models,method,mean,ci_lo,ci_hi
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,9,fixed-effect IVW (raw scale),0.156633,0.143826,0.16944
1,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,9,fixed-effect IVW (raw scale),0.087196,0.07843,0.095962
2,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,fixed-effect IVW (raw scale),0.519934,0.499416,0.540452
3,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,fixed-effect IVW (raw scale),0.426669,0.408442,0.444896



Pairwise comparisons:


Unnamed: 0,file1,file2,n_common,same_model_set,method,est,ci_lo,ci_hi,p_value
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,9,True,paired IVW (raw scale),-0.033995,-0.050151,-0.017839,3.722342e-05
1,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),0.332334,0.306586,0.358082,0.0
2,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),0.234062,0.209854,0.25827,0.0
3,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),0.358969,0.334289,0.383649,0.0
4,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,False,paired IVW (raw scale),0.254923,0.231836,0.27801,0.0
5,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,7,True,paired IVW (raw scale),-0.093574,-0.121122,-0.066027,2.780265e-11



Metric: Pseudo R2 Cntl

Per-file summaries:


Unnamed: 0,file,n_models,method,mean,ci_lo,ci_hi
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,15,mean±t across models (no model CIs),0.089953,0.071505,0.108402
1,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,15,mean±t across models (no model CIs),0.107373,0.057252,0.157494
2,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,15,mean±t across models (no model CIs),0.10326,0.06792,0.1386
3,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,15,mean±t across models (no model CIs),0.128573,0.07669,0.180457



Pairwise comparisons:


Unnamed: 0,file1,file2,n_common,same_model_set,method,est,ci_lo,ci_hi,p_value
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,15,True,paired t across models (no model CIs),0.01742,-0.027711,0.062551,0.421635
1,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,15,True,paired t across models (no model CIs),0.013307,-0.03451,0.061123,0.560127
2,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,15,True,paired t across models (no model CIs),0.03862,-0.009509,0.086749,0.107259
3,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,15,True,paired t across models (no model CIs),-0.004113,-0.068516,0.06029,0.892993
4,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,15,True,paired t across models (no model CIs),0.0212,-0.024461,0.066861,0.336243
5,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,15,True,paired t across models (no model CIs),0.025313,-0.038781,0.089408,0.411208


In [12]:
difference_of_pooled_test(summarize)

Unnamed: 0,file1,file2,est,ci_lo,ci_hi,p_value
0,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,-0.184519,-0.900196,0.531158,0.613329
1,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,-0.63179,-1.347469,0.083888,0.083589
2,analysis_log_multi_logres_dg_gpqa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,-0.923882,-1.639558,-0.208206,0.011401
3,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,-0.447271,-1.162951,0.268408,0.220613
4,analysis_log_multi_logres_dg_simplemc_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,-0.739363,-1.45504,-0.023685,0.042885
5,analysis_log_multi_logres_dg_gpsa_dg_full_hist_parsed.txt,analysis_log_multi_logres_dg_simpleqa_dg_full_hist_parsed.txt,-0.292091,-1.00777,0.423587,0.423755
