In [14]:
import os
import json
from pathlib import Path
from json import JSONDecodeError

In [15]:
# Directory that holds one sub-directory per analysis.
ROOT = Path("..") / "experiments" / "output"

# Analysis sub-directories of ROOT that are scanned by the cells below.
ANALYSES = {"entailment", "geval", "ragas"}

# Merged files are written under ROOT / "merged" / <model>.
OUT_PARENT = ROOT.joinpath("merged")

# Keys copied un-prefixed into a merged record; every other key of a result
# item is stored as "<analysis>_<key>".
SHARED_FIELDS = {
    "id", "statement", "evidences", "label",      # the claim itself
    "model_verdict", "explanation", "confidence",  # the model's answer
}

# Shared fields whose values must be identical across analyses; a differing
# value makes the merge cell raise ValueError.
STRICT_MATCH = {"id", "statement"}

In [16]:
def load_results(fp: Path):
    """Load records from a JSON or JSON-Lines file.

    The content is first parsed as one JSON document. If that fails, it is
    re-parsed line by line, which also covers JSON-Lines content stored under
    a ``.json`` name.

    Args:
        fp: Path of the file to read.

    Returns:
        list: the top-level list unchanged, any other single JSON value
        wrapped in a one-element list, or one parsed value per non-blank
        line. An empty (or whitespace-only) file yields ``[]``.

    Raises:
        JSONDecodeError: if the content is neither one valid JSON document
            nor valid JSON-Lines.
    """
    # Read as UTF-8 explicitly: without an encoding, read_text() falls back to
    # the platform locale encoding, which garbles non-ASCII text on
    # non-UTF-8 systems.
    txt = fp.read_text(encoding="utf-8").strip()
    try:
        data = json.loads(txt)
    except JSONDecodeError:
        # Not a single document -> treat every non-blank line as its own JSON value.
        return [json.loads(line) for line in txt.splitlines() if line.strip()]
    return data if isinstance(data, list) else [data]


def dataset_from_file(fname: str, model: str) -> str:
    """Return the dataset part of a result/stat file name.

    The last extension is dropped first. If what remains ends with
    ``_<model>.results`` or ``_<model>.stats``, everything before that tail is
    returned; otherwise the first two underscore-separated tokens are joined
    back together and returned.
    """
    head, dot, _ext = fname.rpartition(".")
    stem = head if dot else fname  # a name without any "." is kept whole

    tails = (f"_{model}.results", f"_{model}.stats")
    hit = next((tail for tail in tails if stem.endswith(tail)), None)
    if hit is None:
        # Unknown naming scheme: fall back to "<token1>_<token2>".
        return "_".join(stem.split("_")[:2])
    return stem[: len(stem) - len(hit)]


def ensure_dir(p: Path) -> None:
    """Create directory ``p`` together with any missing parents.

    An already existing directory is left untouched and is not an error.
    """
    p.mkdir(exist_ok=True, parents=True)

In [17]:
# Merge every analysis' result/stat files into one nested structure:
#   merged[model][dataset] = {
#       "records":       {record id -> merged record dict},
#       "dataset_stats": {analysis  -> loaded stats},
#   }
merged = {}

# ANALYSES is a set of strings, so its iteration order can change from one
# kernel to the next. Because non-strict shared fields keep the FIRST
# non-missing value, the order decides which analysis wins; sorting makes the
# merged output reproducible.
for analysis in sorted(ANALYSES):
    adir = ROOT / analysis
    if not adir.exists():
        continue

    # iterdir() yields entries in arbitrary order -> sort for the same reason.
    for model_dir in sorted(filter(Path.is_dir, adir.iterdir())):
        model = model_dir.name

        for file in sorted(model_dir.iterdir()):
            if file.suffix not in {".json", ".jsonl"}:
                continue
            is_results = file.name.endswith(".results" + file.suffix)
            is_stats   = file.name.endswith(".stats"   + file.suffix)
            if not (is_results or is_stats):
                continue

            dataset = dataset_from_file(file.name, model)
            entry = merged.setdefault(model, {}).setdefault(
                dataset, {"records": {}, "dataset_stats": {}}
            )

            if is_results:
                for item in load_results(file):
                    # One merged record per id, shared by all analyses.
                    rec = entry["records"].setdefault(item["id"], {"id": item["id"]})

                    # shared fields
                    for field in SHARED_FIELDS:
                        val = item.get(field, None)

                        # Absent and explicit null are both treated as "missing".
                        if val is None:
                            continue

                        if field in STRICT_MATCH:
                            # Strict fields must agree with what was stored before.
                            if field in rec and rec[field] is not None and rec[field] != val:
                                raise ValueError(
                                    f"Mismatch in {field} for {item['id']} " f"({analysis})"
                                )
                            rec.setdefault(field, val)
                        else:
                            # take the first non-missing value
                            if rec.get(field) is None:
                                rec[field] = val

                    # analysis-specific fields (keep all, prefixed)
                    for k, v in item.items():
                        if k not in SHARED_FIELDS:
                            rec[f"{analysis}_{k}"] = v

            else:  # stats
                entry["dataset_stats"][analysis] = load_results(file)

# ensure every shared field exists (fill with None)
for model_d in merged.values():
    for data_d in model_d.values():
        for rec in data_d["records"].values():
            for f in SHARED_FIELDS:
                rec.setdefault(f, None)

# --------------------------------------------------------------------------- #
# Write one "<dataset>_<model>_merged.json" file per (model, dataset) pair
# into OUT_PARENT / <model>, and print each path once it has been written.
for model, datasets in merged.items():
    out_dir = OUT_PARENT / model
    ensure_dir(out_dir)

    for dataset, content in datasets.items():
        # Records are stored keyed by id while merging; the file gets a plain list.
        payload = {
            "model": model,
            "dataset": dataset,
            "records": list(content["records"].values()),
            "dataset_stats": content["dataset_stats"],
        }
        out_path = out_dir / f"{dataset}_{model}_merged.json"
        with out_path.open("w") as f:
            json.dump(payload, f, indent=2)
        print("✓", out_path)

✓ ../experiments/output/merged/mistral_7b_cot/politi_hop_mistral_7b_cot_merged.json
✓ ../experiments/output/merged/mistral_7b_cot/hover_train_mistral_7b_cot_merged.json
✓ ../experiments/output/merged/mistral_7b_cot/covid_fact_mistral_7b_cot_merged.json
✓ ../experiments/output/merged/mistral_7b_non_cot/politi_hop_mistral_7b_non_cot_merged.json
✓ ../experiments/output/merged/mistral_7b_non_cot/hover_train_mistral_7b_non_cot_merged.json
✓ ../experiments/output/merged/mistral_7b_non_cot/covid_fact_mistral_7b_non_cot_merged.json
✓ ../experiments/output/merged/gpt4o_cot/covid_fact_gpt4o_cot_merged.json
✓ ../experiments/output/merged/gpt4o_cot/hover_train_gpt4o_cot_merged.json
✓ ../experiments/output/merged/gpt4o_cot/politi_hop_gpt4o_cot_merged.json
✓ ../experiments/output/merged/deepseek_r1_32b_cot/covid_fact_deepseek_r1_32b_cot_merged.json
✓ ../experiments/output/merged/deepseek_r1_32b_cot/hover_train_deepseek_r1_32b_cot_merged.json
✓ ../experiments/output/merged/deepseek_r1_32b_cot/politi_

### Check for any mismatches


In [11]:
from collections import defaultdict

In [12]:
def norm(val):
    """Turn ``val`` into a string so it can be put in a set or used as a dict key.

    Lists and dicts are serialised as JSON with sorted keys (non-ASCII
    characters kept as-is); every other value goes through ``str()``.
    """
    if not isinstance(val, (list, dict)):
        return str(val)
    return json.dumps(val, ensure_ascii=False, sort_keys=True)

In [13]:
# Collect, for every (model, dataset, record id), what each analysis stored in
# each shared field:
#   data[(model, dataset, id)][field][analysis] = value, or "<MISSING>" when
#   the key is absent from that analysis' result item.
data = {}

# ANALYSES is a set and iterdir() yields entries in arbitrary order; iterate in
# sorted order so the report lists records and analyses in the same order on
# every run.
for analysis in sorted(ANALYSES):
    adir = ROOT / analysis
    if not adir.exists():
        continue

    for model_dir in sorted(filter(Path.is_dir, adir.iterdir())):
        model = model_dir.name

        for file in sorted(model_dir.iterdir()):
            # Only result files are compared; stats files are skipped.
            if not file.name.endswith((".results.json", ".results.jsonl")):
                continue

            dataset = dataset_from_file(file.name, model)

            for obj in load_results(file):
                rec_key = (model, dataset, obj["id"])
                rec = data.setdefault(rec_key, {f: {} for f in SHARED_FIELDS})

                for field in SHARED_FIELDS:
                    rec[field][analysis] = obj.get(field, "<MISSING>")


# -----------------------------------------------------------------------------
# Report every (record, field) whose value is not the same in all analyses
# that contain the record. Values are compared through norm(); each value is
# printed in full up to 300 characters and shortened beyond that.
# -----------------------------------------------------------------------------
for (model, dataset, rec_id), fields in data.items():
    for field, by_analysis in fields.items():
        unique_vals = set(map(norm, by_analysis.values()))
        if len(unique_vals) > 1:         # at least two analyses disagree
            print(
                f"\n≠ MISMATCH | model={model} | dataset={dataset} | "
                f"id={rec_id} | field={field}"
            )
            for ana, val in by_analysis.items():
                text = str(val)
                snippet = text if len(text) <= 300 else text[:300] + " …"
                print(f"  {ana:<11}: {snippet}")


≠ MISMATCH | model=mistral_7b_cot | dataset=politi_hop | id=politihop_combined_and_grouped-000001 | field=confidence
  ragas      : <MISSING>
  entailment : 0.75
  geval      : <MISSING>

≠ MISMATCH | model=mistral_7b_cot | dataset=politi_hop | id=politihop_combined_and_grouped-000001 | field=model_verdict
  ragas      : NOT_SUPPORTED
  entailment : NOT_SUPPORTED
  geval      : <MISSING>

≠ MISMATCH | model=mistral_7b_cot | dataset=politi_hop | id=politihop_combined_and_grouped-000002 | field=confidence
  ragas      : <MISSING>
  entailment : 0.9
  geval      : <MISSING>

≠ MISMATCH | model=mistral_7b_cot | dataset=politi_hop | id=politihop_combined_and_grouped-000002 | field=model_verdict
  ragas      : NOT_SUPPORTED
  entailment : NOT_SUPPORTED
  geval      : <MISSING>

≠ MISMATCH | model=mistral_7b_cot | dataset=politi_hop | id=politihop_combined_and_grouped-000003 | field=confidence
  ragas      : <MISSING>
  entailment : 0.95
  geval      : <MISSING>

≠ MISMATCH | model=mistral_7