### Phase B — Gold Standard Finalization and Adjudication Merge (FRD 5, 6.2.1)
- This cell processes **post-adjudication** outputs to generate the finalized Gold Standard for each disease.  
- It converts each adjudicated `*_for_adjudication.xlsx` workbook to `*_for_adjudication.csv`, reads those CSV files, keeps the concepts marked `"YES"`, aligns and cleans the **Gold Standard 1 & 2** datasets, and preserves a consistent metadata + set column order.  
- The merged and cleaned outputs are saved as `<disease>_Gold_Standard_FINAL.csv`, which will be used for **accuracy evaluation, weighted/unweighted F1 calculations, and downstream Phase B analyses**.  


In [None]:
import pandas as pd
import glob
import os

import os

import pandas as pd
import glob
import os

# --------------------------------------------
# Step 0: Path setup (robust)
# --------------------------------------------
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is not defined inside a notebook kernel; use the working directory
    SCRIPT_DIR = os.getcwd()

ADJUD_DIR = os.path.join(SCRIPT_DIR, '..', 'ClinicalAdjudicationFiles')
os.makedirs(os.path.join(SCRIPT_DIR, 'results'), exist_ok=True)

# Desired order of the metadata columns
metadata_cols = [
    "original_index",
    "key",
    "conceptSetId",
    "conceptId",
    "conceptName",
    "standardConcept",
    "invalidReason",
    "conceptCode",
    "vocabularyId",
]

# --------------------------------------------
# Step 1: Recursively collect the adjudication workbooks
# --------------------------------------------
xlsx_pattern = os.path.join(ADJUD_DIR, "**", "*_for_adjudication.xlsx")
adjud_files = glob.glob(xlsx_pattern, recursive=True)

if adjud_files:
    print(f"🧩 Found {len(adjud_files)} adjudication Excel files.\n")
else:
    print("⚠️ No adjudication Excel files found.")

# --------------------------------------------
# Step 2: Write a CSV next to every workbook (skipped when the CSV is current)
# --------------------------------------------
converted_files = []
for adj_path in adjud_files:
    xlsx_name = os.path.basename(adj_path)
    disease_prefix = os.path.splitext(xlsx_name)[0].replace("_for_adjudication", "")
    csv_path = os.path.join(os.path.dirname(adj_path), f"{disease_prefix}_for_adjudication.csv")

    # A CSV is current when it exists and the workbook is not newer than it
    csv_is_current = (
        os.path.exists(csv_path)
        and os.path.getmtime(adj_path) <= os.path.getmtime(csv_path)
    )
    if csv_is_current:
        converted_files.append(csv_path)
        continue

    try:
        df = pd.read_excel(adj_path, dtype=str)
        df.to_csv(csv_path, index=False)
        print(f"✅ Converted: {xlsx_name} → {os.path.basename(csv_path)}")
    except Exception as e:
        # A failed conversion drops this file from further processing
        print(f"❌ Failed to convert {adj_path}: {e}")
        continue

    converted_files.append(csv_path)

# --------------------------------------------
# Step 3: From here on, work with the CSV paths
# --------------------------------------------
adjud_files = converted_files
# print(f"🗂️ Proceeding with {len(adjud_files)} CSV adjudication files.\n")


In [None]:
# ===============================================================
# 🧬 FINAL GOLD STANDARD GENERATION — WITH CORRECT PATHS
# ===============================================================
# Purpose:
#   1️⃣ Load each adjudicated *_for_adjudication.csv file (from ClinicalAdjudicationFiles)
#   2️⃣ Load corresponding Gold Standard 1 (from phaseA_gold)
#   3️⃣ Keep only adjudicated “YES” concepts from GS2
#   4️⃣ Merge GS1 + adjudicated GS2 → FINAL Gold Standard
#   5️⃣ Save to results/phaseA_gold_final/
#   6️⃣ Create a summary report
# ===============================================================

import os
import pandas as pd
import glob

# ---------------------------------------------------------------
# 🧭 PATH CONFIGURATION
# ---------------------------------------------------------------
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is not defined inside a notebook kernel
    SCRIPT_DIR = os.getcwd()

# Phase A Gold (GS1)
GOLD_DIR = os.path.join(SCRIPT_DIR, "results", "phaseA_gold")

# Adjudicated GS2 (from clinical adjudication folders)
ADJUD_DIR = os.path.join(SCRIPT_DIR, "..", "ClinicalAdjudicationFiles")

# Output folder for final merged GS
FINAL_DIR = os.path.join(SCRIPT_DIR, "results", "phaseA_gold_final")
os.makedirs(FINAL_DIR, exist_ok=True)

# Preserve standard metadata order
metadata_cols = [
    "original_index", "key", "Disease", "conceptSetId", "conceptId", "conceptName",
    "standardConcept", "invalidReason", "conceptCode", "vocabularyId"
]

# ---------------------------------------------------------------
# 🔍 Locate all adjudicated GS2 CSVs
# ---------------------------------------------------------------
adjud_files = sorted(glob.glob(os.path.join(ADJUD_DIR, "**", "*_for_adjudication.csv"), recursive=True))

if not adjud_files:
    print(f"⚠️ No adjudicated GS2 files found in {ADJUD_DIR}")
    raise SystemExit

# print(f"🧩 Found {len(adjud_files)} adjudicated GS2 files to merge.\n")

summary_rows = []

# ---------------------------------------------------------------
# 🧩 Process each adjudicated GS2 file
# ---------------------------------------------------------------
for adj_file in adjud_files:
    disease_prefix = os.path.basename(adj_file).replace("_for_adjudication.csv", "")
    gs1_path = os.path.join(GOLD_DIR, f"{disease_prefix}_Gold_Standard_1.csv")
    final_path = os.path.join(FINAL_DIR, f"{disease_prefix}_Gold_Standard_FINAL.csv")

    if not os.path.exists(gs1_path):
        print(f"⚠️ Skipping {disease_prefix}: Missing GS1 file → {gs1_path}")
        continue

    # -----------------------------------------------------------
    # STEP 1: Load adjudicated GS2
    # -----------------------------------------------------------
    try:
        adj_df = pd.read_csv(adj_file, dtype=str)
        adj_df.columns = adj_df.columns.str.strip()
    except Exception as e:
        print(f"❌ Error reading {adj_file}: {e}")
        continue

    if "key" not in adj_df.columns or "keepConceptSet" not in adj_df.columns:
        print(f"⚠️ Skipping {disease_prefix}: Missing 'key' or 'keepConceptSet' column.")
        continue

    total_before = len(adj_df)
    # Blank decisions are NaN under dtype=str and compare unequal to "YES"
    keep_mask = adj_df["keepConceptSet"].str.strip().str.upper() == "YES"
    adj_keep = adj_df[keep_mask].copy()
    total_after = len(adj_keep)

    print(f"=== Disease: {disease_prefix} ===")
    # print(f"   → Adjudicated YES: {total_after:,} / {total_before:,}")

    # -----------------------------------------------------------
    # STEP 2: Load GS1 (always keep all)
    # -----------------------------------------------------------
    gs1_df = pd.read_csv(gs1_path, dtype=str)
    gs1_df = gs1_df.drop(columns=["index_changed", "sum_sets"], errors="ignore")

    # -----------------------------------------------------------
    # STEP 3: Align columns dynamically
    # -----------------------------------------------------------
    # Set columns are taken from GS1 only; GS2 columns outside this list are dropped.
    set_cols = sorted(c for c in gs1_df.columns if c.startswith(("Set", "SET")))
    ordered_cols = metadata_cols + set_cols

    gs1_df = gs1_df.reindex(columns=ordered_cols, fill_value="")

    # Reindex the YES rows themselves instead of re-selecting rows by key.
    # A key lookup would also pull in non-YES rows that share a key with a
    # YES row (and every row with a missing key when any YES row lacks one).
    adj_filtered = adj_keep.reindex(columns=ordered_cols, fill_value="")

    # -----------------------------------------------------------
    # STEP 4: Merge GS1 + adjudicated GS2
    # -----------------------------------------------------------
    final_df = pd.concat([gs1_df, adj_filtered], ignore_index=True)
    final_df.to_csv(final_path, index=False)

    # print(f"   ✅ Saved FINAL Gold Standard → {os.path.basename(final_path)}")
    # print(f"   ├─ GS1 rows kept: {len(gs1_df):,}")
    # print(f"   ├─ Adjudicated GS2 rows kept: {len(adj_filtered):,}")
    print(f"    The Gold Standard concepts: {len(final_df):,}")
    print("------------------------------------------------------------")

    summary_rows.append({
        "Disease": disease_prefix,
        "GS1_Rows": len(gs1_df),
        "GS2_Adjudicated_Rows": len(adj_filtered),
        "Final_Total": len(final_df)
    })

# ---------------------------------------------------------------
# 📊 Step 5: Save Summary Table
# ---------------------------------------------------------------
if summary_rows:
    summary_df = pd.DataFrame(summary_rows)
    summary_path = os.path.join(FINAL_DIR, "Final_Gold_Standard_Summary.csv")
    summary_df.to_csv(summary_path, index=False)
    # print("\n📈 Final Gold Standard Merge Summary\n")
    # print(summary_df.to_string(index=False))
    # print(f"\n💾 Saved summary table → {summary_path}")
else:
    print("⚠️ No final gold standards created — please verify adjudicated GS2 files.")


<!-- ### Phase B — Gold Standard QA: Duplicate and Missing Key Check (FRD 6.2.1)
- This cell performs a **quality assurance check** on the post-adjudication datasets before final Gold Standard generation.  
- It identifies duplicate keys in `GS2`, compares the keys marked `"YES"` in adjudication with `GS2`, and reports missing or extra keys.  
- These checks ensure that all approved concepts are captured in the Gold Standard and highlight potential **data inconsistencies** for review prior to downstream accuracy calculations.   -->


In [None]:
# # Check for duplicate keys in GS2 #DEBUG
# dup_keys = gs1_df["key"].value_counts()
# dup_keys = dup_keys[dup_keys > 1]
# if not dup_keys.empty:
#     print(f"⚠️ Found {len(dup_keys)} duplicate keys in {gs2_file}")
#     print(dup_keys.head(10))

# # Compare key sets directly
# missing_in_gs2 = keep_keys - set(gs2_df["key"])
# extra_in_gs2 = set(gs2_df["key"]) - keep_keys

# print(f"🧾 Keys in adjudication (YES): {len(keep_keys)}")
# print(f"🧾 Keys in GS2: {len(gs2_df['key'].unique())}")
# print(f"❌ Keys in adjudication but not in GS2: {len(missing_in_gs2)}")
# print(f"⚠️ Duplicated keys in GS2: {len(dup_keys)}")

# if missing_in_gs2:
#     print("\n🔍 Missing keys (in adjudication but not in GS2):")
#     print(list(missing_in_gs2)[:20])  # show first 20 only
#     print(f"... and {len(missing_in_gs2) - 20} more" if len(missing_in_gs2) > 20 else "")
# else:
#     print("\n✅ No missing keys — all adjudicated YES keys found in GS2.")


In [None]:
# df = pd.read_csv('C02_DiabetesMelltius_Gold_Standard_FINAL.csv',dtype=str)
# df.columns #DEBUG

In [None]:
# df = pd.read_csv('C01_ObsessiveCompulsiveDisorder_Gold_Standard_FINAL.csv',dtype=str)
# df.head() #DEBUG

### Phase B — Arm vs Final Gold Standard Comparison (FRD 6.2.1, 6.2.2)
- This cell compares each original arm’s concept set (Human or AI) to the **final adjudicated Gold Standard** for each disease.  
- It computes counts of concepts overlapping with, unique to the arm, and unique to the final set, as well as the overlap percentage.  
- Outputs are saved in `gold_standard_comparison_summary.csv` for downstream **accuracy metrics, F1 calculations, and clinical impact analyses**.


<!-- ###  Phase B — Primary Objective Analysis (Weighted F1 vs TGS) (FRD 6.2.1)

1. **Compare Arm Outputs to Gold Standard (FRD 6.2.1):** This cell loads all Human and AI workflow concept sets for each disease and compares them to the adjudicated True Gold Standard (TGS), calculating both weighted and unweighted precision, recall, and F1 scores.  
2. **Weighted Metrics Using Concept Prevalence (FRD 6.2.1):** Weighted counts of True Positives (WTP), False Positives (WFP), and False Negatives (WFN) are computed using `ConceptRecordCounts.csv` to generate prevalence-weighted F1 scores for each workflow.  
3. **Output Saved (CSV):** Results are stored in `gold_standard_primary_metrics.csv`, summarizing per-disease metrics for all arms, including overlaps, Jaccard similarity, weighted/unweighted precision, recall, and F1 scores.


 -->

In [None]:
# ===============================================================
# 🧬 VALIDATION — COMPARE ORIGINAL ARMS VS FINAL GOLD STANDARD
# ===============================================================
# Purpose:
#   1️⃣ Detect correct arm folders inside ConceptSets/
#   2️⃣ Fallback to phaseA_pre_adjudication summaries if needed
#   3️⃣ Compare all arm concepts vs adjudicated gold standard
# ===============================================================

import os
import re
import glob
import pandas as pd

# ---------------------------------------------------------------
# 🧭 PATH CONFIGURATION
# ---------------------------------------------------------------
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is not defined inside a notebook kernel
    SCRIPT_DIR = os.getcwd()

# Arm concept sets live one level above this script
CONCEPTSET_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, "..", "ConceptSets"))

# Inputs and output all sit under results/
PHASEA_DIR = os.path.join(SCRIPT_DIR, "results", "phaseA_pre_adjudication")
FINAL_GOLD_DIR = os.path.join(SCRIPT_DIR, "results", "phaseA_gold_final")
OUTPUT_PATH = os.path.join(
    SCRIPT_DIR,
    "results",
    "gold_standard_comparison_summary.csv",
)

# Make sure the folder that receives OUTPUT_PATH exists
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# ---------------------------------------------------------------
# 🔹 Helper: Clean Arm Names
# ---------------------------------------------------------------
def clean_arm_name(folder_name):
    """Derive a standardized arm label from a concept-set folder name.

    Args:
        folder_name: Folder name such as ``"[C01][AI2] Diabetes"``.

    Returns:
        ``None`` when the name carries an ``[S<n>]`` tag, the upper-cased arm
        label (``"AI2"``, ``"H1"``, ...) when one is found, otherwise
        ``"UNKNOWN"``. Every spelling of the human arm (``[HUMAN]``, ``HUMAN``,
        ``H1``) maps to ``"H1"``.
    """
    # Folders tagged [S<n>] are not treated as arms
    if re.search(r"\[S\d+\]", folder_name, re.IGNORECASE):
        return None

    # Preferred form: an explicit bracketed arm tag
    bracketed = re.search(r"\[(AI\d+|H\d+|HUMAN)\]", folder_name, re.IGNORECASE)
    if bracketed:
        label = bracketed.group(1).upper()
        # Keep the bracketed spelling consistent with the unbracketed fallback
        # below, which already reports HUMAN as H1.
        return "H1" if label == "HUMAN" else label

    upper_name = folder_name.upper()
    if "AI" in upper_name:
        ai_token = re.search(r"AI\d+", upper_name)
        if ai_token:
            return ai_token.group(0)
    if "H1" in upper_name or "HUMAN" in upper_name:
        return "H1"
    return "UNKNOWN"

# ---------------------------------------------------------------
# 🔹 Helper: Load concept IDs from includedConcepts.csv
# ---------------------------------------------------------------
def load_included_concepts(file_path):
    """Read the concept IDs from an includedConcepts.csv file.

    The ID column is located by normalized header name (lower-cased, with
    non-alphanumerics removed): an exact ``conceptid`` match is preferred,
    otherwise the first header containing both "concept" and "id" but not
    "set" is used.

    Args:
        file_path: Path to the CSV file.

    Returns:
        A set of non-empty ID strings, stripped and with a trailing ``.0``
        removed. An empty set is returned, with a printed message, when the
        file cannot be read or no ID column is found.
    """
    try:
        df = pd.read_csv(file_path, dtype=str)
        df.columns = [c.strip() for c in df.columns]
        norm_map = {re.sub(r'[^a-z0-9]', '', c.lower()): c for c in df.columns}
        chosen = norm_map.get("conceptid") or next(
            (orig for norm, orig in norm_map.items()
             if "concept" in norm and "id" in norm and "set" not in norm),
            None
        )
        if not chosen:
            print(f"⚠️ No Concept ID column found in {file_path}")
            return set()
        vals = df[chosen].dropna().astype(str).str.strip().str.replace(r"\.0+$", "", regex=True)
        return set(vals[vals != ""])
    except Exception as e:
        # Best-effort loader: keep returning an empty set, but report the
        # failure so an unreadable file is not mistaken for an empty arm.
        print(f"[load_included_concepts] Error reading {file_path}: {e}")
        return set()

# ---------------------------------------------------------------
# 🔹 Analyze disease using ConceptSets (preferred)
# ---------------------------------------------------------------
def analyze_disease_conceptsets(disease_name, final_file):
    """Compare each arm's concept set with the Final Gold Standard of a disease.

    Arm folders are looked up directly under ``CONCEPTSET_DIR`` by the
    ``[Cxx]`` prefix parsed from ``disease_name``; folders whose name contains
    "ONLINE" are skipped.

    Args:
        disease_name: Disease identifier beginning with ``C<digits>``.
        final_file: Path to the ``*_Gold_Standard_FINAL.csv`` file.

    Returns:
        A DataFrame with one row per arm (sizes, overlap, arm-only and
        final-only counts, overlap percentage relative to the arm size), or
        ``None`` when the prefix cannot be parsed, no usable arm folder
        exists, or the gold file has no ``conceptId`` column.
    """
    disease_prefix = re.match(r"(C\d+)", disease_name)
    if not disease_prefix:
        print(f"⚠️ Could not parse prefix for {disease_name}")
        return None
    prefix = disease_prefix.group(1)

    # A missing ConceptSets directory must return None (not raise) so that
    # the caller can fall back to the pre-adjudication summaries.
    if not os.path.isdir(CONCEPTSET_DIR):
        print(f"⚠️ No folders found for {disease_name} in {CONCEPTSET_DIR}")
        return None

    # 🔍 Find all concept set folders matching the prefix (e.g. [C01])
    prefix_tag = re.compile(rf"\[{prefix}\]", re.IGNORECASE)
    candidate_folders = [f for f in os.listdir(CONCEPTSET_DIR) if prefix_tag.search(f)]

    if not candidate_folders:
        print(f"⚠️ No folders found for {disease_name} in {CONCEPTSET_DIR}")
        return None

    arm_data = {}
    for folder in candidate_folders:
        if re.search(r"ONLINE", folder, re.IGNORECASE):
            continue  # skip online folders
        folder_path = os.path.join(CONCEPTSET_DIR, folder)
        if not os.path.isdir(folder_path):
            continue

        csv_path = os.path.join(folder_path, "includedConcepts.csv")
        if not os.path.exists(csv_path):
            continue

        arm_name = clean_arm_name(folder)
        if arm_name and arm_name != "UNKNOWN":
            arm_data[arm_name] = load_included_concepts(csv_path)

    if not arm_data:
        print(f"⚠️ No valid arm concept sets found for {disease_name}")
        return None

    # Load Final Gold Standard
    final_df = pd.read_csv(final_file, dtype=str)
    if "conceptId" not in final_df.columns:
        print(f"⚠️ conceptId missing in {final_file}")
        return None

    # Normalize gold IDs exactly like load_included_concepts normalizes arm
    # IDs, so "123", " 123 " and "123.0" are compared as the same concept.
    final_ids = (
        final_df["conceptId"].dropna().astype(str).str.strip()
        .str.replace(r"\.0+$", "", regex=True)
    )
    final_concepts = set(final_ids[final_ids != ""])

    stats = []
    # print(f"\n📊 Disease: {disease_name} — {len(final_concepts)} concepts in Gold Standard")

    for arm, concepts in arm_data.items():
        overlap = concepts & final_concepts
        only_arm = concepts - final_concepts
        only_final = final_concepts - concepts

        stats.append({
            "Disease": disease_name,
            "Arm": arm,
            "Concepts_in_Arm": len(concepts),
            "Concepts_in_Final": len(final_concepts),
            "Overlap": len(overlap),
            "Arm_only": len(only_arm),
            "Final_only": len(only_final),
            # Percentage of the arm's concepts found in the gold standard;
            # max(..., 1) avoids dividing by zero for an empty arm.
            "Overlap_%": round(100 * len(overlap) / max(len(concepts), 1), 2)
        })

    return pd.DataFrame(stats)

# ---------------------------------------------------------------
# 🔹 Fallback: Analyze using arm_summary.csv (no concepts)
# ---------------------------------------------------------------
def analyze_disease_summary_fallback(disease_name, final_file):
    """Build a summary from arm_summary.csv when no concept-level data exists.

    Reads ``<PHASEA_DIR>/<disease_name>/arm_summary.csv`` and adds a
    ``Concepts_in_Final`` column holding the number of non-null ``conceptId``
    values in ``final_file`` (0 if that column is absent).

    Returns:
        The augmented summary DataFrame, or ``None`` if arm_summary.csv does
        not exist.
    """
    arm_summary_path = os.path.join(PHASEA_DIR, disease_name, "arm_summary.csv")
    if not os.path.exists(arm_summary_path):
        return None

    df_summary = pd.read_csv(arm_summary_path, dtype=str)
    final_df = pd.read_csv(final_file, dtype=str)

    if "conceptId" in final_df.columns:
        total_final = len(final_df["conceptId"].dropna())
    else:
        total_final = 0

    df_summary["Concepts_in_Final"] = total_final
    print(f"📄 Fallback summary used for {disease_name} (no concept-level comparison)")
    return df_summary

# ---------------------------------------------------------------
# 🚀 MAIN EXECUTION
# ---------------------------------------------------------------
def main():
    """Compare every final gold standard with its arms and save one summary CSV.

    For each ``*_Gold_Standard_FINAL.csv`` in ``FINAL_GOLD_DIR`` the
    concept-level comparison is tried first and the arm_summary.csv fallback
    second. All per-disease frames are concatenated into ``OUTPUT_PATH``.
    """
    pattern = os.path.join(FINAL_GOLD_DIR, "*_Gold_Standard_FINAL.csv")
    final_files = sorted(glob.glob(pattern))
    if not final_files:
        print(f"⚠️ No final gold standard files found in {FINAL_GOLD_DIR}")
        return

    all_results = []
    for final_file in final_files:
        file_name = os.path.basename(final_file)
        disease_name = file_name.replace("_Gold_Standard_FINAL.csv", "")

        # Preferred: concept-level comparison; otherwise the pre-adjudication summary
        summary_df = analyze_disease_conceptsets(disease_name, final_file)
        if summary_df is None:
            summary_df = analyze_disease_summary_fallback(disease_name, final_file)

        if summary_df is None:
            print(f"⚠️ No valid data found for {disease_name}")
            continue
        all_results.append(summary_df)

    if not all_results:
        print("⚠️ No comparisons generated.")
        return

    combined = pd.concat(all_results, ignore_index=True)
    combined.to_csv(OUTPUT_PATH, index=False)
    # print(f"\n✅ Saved comparison summary → {OUTPUT_PATH}")

if __name__ == "__main__":
    main()


In [None]:
# ==========================================================
# 🧠 6.2.1 — Primary Objective Analysis (Weighted F1 vs TGS)
# ==========================================================
import os
import re
import glob
import numpy as np
import pandas as pd

# --------------------------------------------
# 🧭 PATH CONFIGURATION
# --------------------------------------------
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is not defined inside a notebook kernel
    SCRIPT_DIR = os.getcwd()

# Arm concept sets live one level above this script
ROOT_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, "..", "ConceptSets"))

# Per-arm metrics table written by this cell
OUTPUT_PATH = os.path.join(
    SCRIPT_DIR,
    "results",
    "gold_standard_primary_metrics.csv",
)
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# Folder holding the *_Gold_Standard_FINAL.csv files
FINAL_DIR = os.path.join(SCRIPT_DIR, "results", "phaseA_gold_final")
# -------------------------------------------------------------
# 🧩 Helper: Clean Arm Name
# -------------------------------------------------------------
def clean_arm_name(folder_name):
    """Extract a standardized arm name like AI1, H1, etc.

    Args:
        folder_name: Concept-set folder name, e.g. ``"[C01][AI2] Diabetes"``.

    Returns:
        ``None`` for folders tagged ``[S<n>]``; otherwise the upper-cased arm
        token, with HUMAN/CLINICIAN/REVIEWER/EXPERT/MANUAL reported as
        ``"H1"``. If no arm token is present, the last bracketed alphanumeric
        tag is returned, or ``"UNKNOWN"`` when there is none.
    """
    if re.search(r"\[S\d+\]", folder_name, re.IGNORECASE):
        return None

    arm_tokens = r"AI\d+|H\d+|HUMAN|CLINICIAN|REVIEWER|EXPERT|MANUAL"
    # Prefer an explicit bracketed tag. A bare search returns the leftmost
    # token, so a disease name such as "Human papillomavirus" placed before
    # the tag would otherwise be reported as the human arm.
    match = (
        re.search(rf"\[({arm_tokens})\]", folder_name, re.IGNORECASE)
        or re.search(rf"({arm_tokens})", folder_name, re.IGNORECASE)
    )
    if match:
        name = match.group(1).upper()
        return "H1" if name in ["HUMAN", "CLINICIAN", "REVIEWER", "EXPERT", "MANUAL"] else name

    # No known arm token: fall back to the last bracketed tag, if any
    parts = re.findall(r"\[([A-Za-z0-9]+)\]", folder_name)
    if parts:
        return parts[-1].upper()
    return "UNKNOWN"

# -------------------------------------------------------------
# 🧩 Helper: Load includedConcepts.csv
# -------------------------------------------------------------
def load_included_concepts(file_path):
    """Return Concept IDs as a clean set.

    The ID column is located by normalized header name (lower-cased, with
    non-alphanumerics removed): an exact ``conceptid`` match is preferred,
    otherwise the first header containing both "concept" and "id" but not
    "set" is used.

    Args:
        file_path: Path to an includedConcepts.csv file.

    Returns:
        A set of non-empty ID strings, stripped and with a trailing ``.0``
        removed. An empty set is returned, with a printed message, when the
        file cannot be read or no ID column is found.
    """
    try:
        df = pd.read_csv(file_path, dtype=str)
        # Write the stripped headers back onto the frame: the lookup below
        # returns stripped names, which would raise KeyError on a frame that
        # still carries the padded originals (e.g. " Concept Id ").
        df.columns = [c.strip() for c in df.columns]
        norm_map = {re.sub(r"[^a-z0-9]", "", c.lower()): c for c in df.columns}

        chosen_col = (
            norm_map.get("conceptid")
            or next((orig for norm, orig in norm_map.items() if "concept" in norm and "id" in norm and "set" not in norm), None)
        )
        if not chosen_col:
            print(f"⚠️ No Concept ID column found in {file_path}")
            return set()

        vals = df[chosen_col].dropna().astype(str).str.strip().str.replace(r"\.0+$", "", regex=True)
        return set(vals[vals != ""])
    except Exception as e:
        # Best-effort loader: report the problem and treat the file as empty
        print(f"[load_included_concepts] Error reading {file_path}: {e}")
        return set()

# -------------------------------------------------------------
# 🧩 Weighted Metrics
# -------------------------------------------------------------
def compute_weighted_f1(arm_concepts, gold_standard, record_count_df):
    """Compute weighted precision, recall, and F1.

    Each concept contributes its ``record_count`` (non-numeric or missing
    counts are treated as 0) to the weighted true-positive, false-positive or
    false-negative total. Concepts without a row in ``record_count_df``
    contribute nothing.

    Args:
        arm_concepts: Iterable of concept IDs produced by the arm.
        gold_standard: Iterable of concept IDs in the gold standard.
        record_count_df: DataFrame with ``conceptId`` and ``record_count``
            columns. It is not modified.

    Returns:
        Dict with keys ``WTP``, ``WFP``, ``WFN``, ``P_W``, ``R_W``, ``F1_W``.
        A ratio is ``np.nan`` when its denominator is zero.
    """
    # Accept any iterable (list, Series, set): the set operators below would
    # raise TypeError on plain lists.
    arm_set = set(arm_concepts)
    gold_set = set(gold_standard)

    concept_ids = record_count_df["conceptId"]
    weights = pd.to_numeric(record_count_df["record_count"], errors="coerce").fillna(0)

    # Boolean masks are applied to the weights directly instead of adding
    # columns to a filtered slice, which avoids SettingWithCopyWarning.
    WTP = weights[concept_ids.isin(arm_set & gold_set)].sum()
    WFP = weights[concept_ids.isin(arm_set - gold_set)].sum()
    WFN = weights[concept_ids.isin(gold_set - arm_set)].sum()

    P_W = WTP / (WTP + WFP) if (WTP + WFP) else np.nan
    R_W = WTP / (WTP + WFN) if (WTP + WFN) else np.nan
    # NaN is truthy, so a NaN precision or recall propagates to a NaN F1
    F1_W = (2 * P_W * R_W / (P_W + R_W)) if (P_W + R_W) else np.nan

    return {"WTP": WTP, "WFP": WFP, "WFN": WFN, "P_W": P_W, "R_W": R_W, "F1_W": F1_W}

# -------------------------------------------------------------
# 🧩 Unweighted Metrics
# -------------------------------------------------------------
def compute_unweighted_PRF(arm_concepts, gold_standard):
    """Compute unweighted precision, recall, F1.

    Args:
        arm_concepts: Iterable of concept IDs produced by the arm.
        gold_standard: Iterable of concept IDs in the gold standard.

    Returns:
        Dict with ``TP``, ``FP``, ``FN`` counts and ``Precision``, ``Recall``,
        ``F1`` ratios; a ratio is ``np.nan`` when its denominator is zero.
    """
    arm_set = set(arm_concepts)
    gold_set = set(gold_standard)

    tp = len(arm_set & gold_set)
    fp = len(arm_set - gold_set)
    fn = len(gold_set - arm_set)

    predicted = tp + fp
    expected = tp + fn

    if predicted:
        P = tp / predicted
    else:
        P = np.nan

    if expected:
        R = tp / expected
    else:
        R = np.nan

    # NaN is truthy, so a NaN precision or recall yields a NaN F1 here
    if P + R:
        F1 = 2 * P * R / (P + R)
    else:
        F1 = np.nan

    return {"TP": tp, "FP": fp, "FN": fn, "Precision": P, "Recall": R, "F1": F1}

# -------------------------------------------------------------
# 🧩 Compare all arms vs Gold Standard FINAL
# -------------------------------------------------------------
def analyze_disease_vs_final(prefix, final_file, record_count_df):
    """Compare all arms for a given disease prefix.

    Every folder under ``ROOT_DIR`` whose name contains ``[<prefix>]`` and
    holds an ``includedConcepts.csv`` is scored against the final gold
    standard with weighted and unweighted precision/recall/F1.

    Args:
        prefix: Disease prefix such as ``"C01"``.
        final_file: Path to the ``*_Gold_Standard_FINAL.csv`` file.
        record_count_df: DataFrame with ``conceptId`` and ``record_count``
            columns, passed through to ``compute_weighted_f1``.

    Returns:
        A DataFrame with one row per scored arm, or ``None`` when the
        ConceptSets directory or matching folders are missing, the gold file
        lacks a ``conceptId`` column, or no arm produced a non-empty set.
    """
    # Guard against a missing ConceptSets directory instead of letting
    # os.listdir raise and abort the whole run.
    if not os.path.isdir(ROOT_DIR):
        print(f"⚠️ ConceptSets directory not found: {ROOT_DIR}")
        return None

    prefix_tag = re.compile(rf"\[{re.escape(prefix)}\]", re.IGNORECASE)
    candidate_folders = [f for f in os.listdir(ROOT_DIR) if prefix_tag.search(f)]

    if not candidate_folders:
        print(f"⚠️ No folders found for prefix {prefix} in ConceptSets/")
        return None

    # Load Final Gold Standard
    final_df = pd.read_csv(final_file, dtype=str)
    if "conceptId" not in final_df.columns:
        print(f"⚠️ conceptId missing in {final_file}")
        return None

    # Normalize gold IDs the same way load_included_concepts normalizes arm
    # IDs, so "123", " 123 " and "123.0" count as the same concept.
    final_ids = (
        final_df["conceptId"].dropna().astype(str).str.strip()
        .str.replace(r"\.0+$", "", regex=True)
    )
    final_concepts = set(final_ids[final_ids != ""])
    gold_count = len(final_concepts)

    def _rounded(value):
        """Round to 3 decimals; undefined (NaN) metrics become blank cells."""
        return round(value, 3) if pd.notna(value) else ""

    stats = []
    for folder in sorted(candidate_folders):
        folder_path = os.path.join(ROOT_DIR, folder)
        csv_path = os.path.join(folder_path, "includedConcepts.csv")
        if not os.path.exists(csv_path):
            continue

        arm_name = clean_arm_name(folder)
        if not arm_name or arm_name == "UNKNOWN":
            continue

        # Arms whose file yields no concept IDs are left out of the table
        arm_concepts = load_included_concepts(csv_path)
        if not arm_concepts:
            continue

        overlap = arm_concepts & final_concepts
        jaccard = len(overlap) / len(arm_concepts | final_concepts) if arm_concepts or final_concepts else np.nan

        weighted = compute_weighted_f1(arm_concepts, final_concepts, record_count_df)
        unweighted = compute_unweighted_PRF(arm_concepts, final_concepts)

        stats.append({
            "Disease": prefix,
            "Arm": arm_name,
            "Concepts_in_Arm": len(arm_concepts),
            "Concepts_in_GoldStandard": gold_count,
            "Overlap": len(overlap),
            "Jaccard_Similarity": round(jaccard, 3),

            # Weighted
            "WTP": weighted["WTP"], "WFP": weighted["WFP"], "WFN": weighted["WFN"],
            "Weighted_Precision": _rounded(weighted["P_W"]),
            "Weighted_Recall": _rounded(weighted["R_W"]),
            "Weighted_F1": _rounded(weighted["F1_W"]),

            # Unweighted
            "TP": unweighted["TP"], "FP": unweighted["FP"], "FN": unweighted["FN"],
            "Unweighted_Precision": _rounded(unweighted["Precision"]),
            "Unweighted_Recall": _rounded(unweighted["Recall"]),
            "Unweighted_F1": _rounded(unweighted["F1"]),
        })

    if not stats:
        print(f"⚠️ No valid concept sets for {prefix}")
        return None

    return pd.DataFrame(stats)

# -------------------------------------------------------------
# 🚀 MAIN EXECUTION
# -------------------------------------------------------------
def main():
    """Score every arm against its final gold standard and save the metrics.

    Loads ``ConceptRecordCounts.csv`` from ``SCRIPT_DIR`` (an empty table is
    used if it is missing), analyzes each ``*_Gold_Standard_FINAL.csv`` in
    ``FINAL_DIR`` by its ``C<digits>`` prefix and writes the combined table,
    sorted by disease and arm, to ``OUTPUT_PATH``.
    """
    record_count_path = os.path.join(SCRIPT_DIR, "ConceptRecordCounts.csv")
    if os.path.exists(record_count_path):
        record_count_df = pd.read_csv(record_count_path, dtype=str)
    else:
        print("⚠️ ConceptRecordCounts.csv not found — weighted metrics may be skipped.")
        record_count_df = pd.DataFrame(columns=["conceptId", "record_count"])

    final_pattern = os.path.join(FINAL_DIR, "*_Gold_Standard_FINAL.csv")
    final_files = sorted(glob.glob(final_pattern))

    all_results = []

    if not final_files:
        print("⚠️ No Gold Standard FINAL files found.")
        return

    print(f"🧩 Found {len(final_files)} FINAL gold standard files.\n")

    for final_file in final_files:
        # The disease prefix (e.g. C01) is taken from the file name
        disease_prefix = re.search(r"(C\d+)", os.path.basename(final_file))
        if disease_prefix is None:
            print(f"⚠️ Could not detect prefix in {final_file}")
            continue

        summary_df = analyze_disease_vs_final(disease_prefix.group(1), final_file, record_count_df)
        if summary_df is None:
            continue
        all_results.append(summary_df)

    if not all_results:
        print("⚠️ No data processed — check ConceptSets and FINAL files.")
        return

    combined = pd.concat(all_results, ignore_index=True).sort_values(by=["Disease", "Arm"])
    combined.to_csv(OUTPUT_PATH, index=False)
    print(f"\n✅ Saved weighted F1 summary → {OUTPUT_PATH}")
    print(f"   Rows: {combined.shape[0]} | Diseases: {combined['Disease'].nunique()}")

if __name__ == "__main__":
    main()


In [None]:
# Reload the per-arm metrics table written by the previous cell (all columns as text)
summary_df = pd.read_csv(
    os.path.join(SCRIPT_DIR, "results", 'gold_standard_primary_metrics.csv'),
    dtype=str,
)
# summary_df

In [None]:
# Load the full summary (if not already loaded)


RESULTS_DIR = os.path.join(SCRIPT_DIR, "results")
INPUT_PATH = os.path.join(RESULTS_DIR, "gold_standard_primary_metrics.csv")

OUTPUT_DIR = os.path.join(RESULTS_DIR, "phaseB_primary_objectives")
os.makedirs(OUTPUT_DIR, exist_ok=True)

OUTPUT_PATH = os.path.join(OUTPUT_DIR, "phase2_primary_objectives_summary_short.csv")

# --------------------------------------------
# 📘 LOAD INPUT DATA
# --------------------------------------------
if not os.path.exists(INPUT_PATH):
    raise FileNotFoundError(f"❌ Input file not found: {INPUT_PATH}\nPlease run the weighted F1 analysis first.")

summary_df = pd.read_csv(INPUT_PATH, dtype=str)
# print(f"📄 Loaded full summary → {INPUT_PATH} ({len(summary_df):,} rows)")

# Columns carried into the condensed table, in output order
essential_cols = [
    "Disease",
    "Arm",
    "WTP",
    "WFP",
    "WFN",
    "Weighted_Precision",
    "Weighted_Recall",
    "Weighted_F1",
    "Unweighted_F1",
]

# Any expected column that is absent is created blank so the selection below works
missing = [col for col in essential_cols if col not in summary_df.columns]
if missing:
    print(f"⚠️ Missing expected columns: {missing} — will fill with blanks.")
    for col in missing:
        summary_df[col] = ""

summary_short = summary_df[essential_cols].copy()

# Metric columns become floats rounded to 3 decimals; unparseable cells become NaN
numeric_cols = ["Weighted_Precision", "Weighted_Recall", "Weighted_F1", "Unweighted_F1"]
summary_short[numeric_cols] = (
    summary_short[numeric_cols].apply(pd.to_numeric, errors="coerce").round(3)
)

# --------------------------------------------
# 💾 SAVE OUTPUT
# --------------------------------------------
summary_short.to_csv(OUTPUT_PATH, index=False)
# print(f"✅ Saved condensed summary → {OUTPUT_PATH}")
# print(f"✅ Saved condensed summary → {OUTPUT_PATH}")


### Phase B — Primary Objective Analysis: Disease-Level Weighted F1 ($F1_W$) Across Diseases (6.2.1)

**Analysis type:** Prevalence-Weighted F1 ($F1_W$) — the primary outcome measure described in Section 6.2.1 of the SAP.  
$F1_W$ represents the harmonic mean of precision and recall, weighted by each concept’s prevalence ($V_i$) in the source data.

1. **Compare Human vs AI Performance (FRD 6.2.1):** Calculates the prevalence-weighted F1 for the Human arm (H1) and the mean across all AI arms (AI1–AI4) per disease.  
2. **Compute Differences (Δ) Between AI and Human:** Determines the difference between mean AI $F1_W$ and Human $F1_W$ to assess AI performance relative to the benchmark.  
3. **Output Saved (CSV):** The summarized, disease-level comparison is saved in `results/phaseB_primary_objectives/phase2_diseaselevel_summary_pretty.csv`, capturing per-disease weighted F1 scores and AI–Human differences for reporting and visualization.


In [None]:
# ===============================================================
# 🧠 PHASE 2 — DISEASE-LEVEL SUMMARY (AI vs Human Weighted F1)
# ===============================================================

INPUT_PATH = os.path.join(SCRIPT_DIR, "results", "gold_standard_primary_metrics.csv")
OUTPUT_DIR = os.path.join(SCRIPT_DIR, "results", "phaseB_primary_objectives")
os.makedirs(OUTPUT_DIR, exist_ok=True)
OUTPUT_PATH = os.path.join(OUTPUT_DIR, "phase2_diseaselevel_summary_pretty.csv")

# --------------------------------------------
# 📘 LOAD DATA
# --------------------------------------------
if not os.path.exists(INPUT_PATH):
    raise FileNotFoundError(f"❌ Input file not found: {INPUT_PATH}")

summary_df = pd.read_csv(INPUT_PATH, dtype=str)
# print(f"📄 Loaded weighted F1 data → {INPUT_PATH} ({len(summary_df):,} rows)")

# Weighted_F1 is read as text; blank or unparseable cells become NaN
summary_df["Weighted_F1"] = pd.to_numeric(summary_df["Weighted_F1"], errors="coerce")

# --------------------------------------------
# 🧩 SPLIT INTO HUMAN (H1) AND AI (AI1–AIn) ARMS
# --------------------------------------------
is_human = summary_df["Arm"].str.upper() == "H1"
is_ai = summary_df["Arm"].str.upper().str.startswith("AI")
human_df = summary_df[is_human].copy()
ai_df = summary_df[is_ai].copy()

if human_df.empty or ai_df.empty:
    raise ValueError("⚠️ Missing Human (H1) or AI arms in summary data — check input file.")

# Average Weighted F1 over the AI arms of each disease
ai_mean = ai_df.groupby("Disease")["Weighted_F1"].mean()

# --------------------------------------------
# 🧮 BUILD ONE TABLE INDEXED BY DISEASE
# --------------------------------------------
human_f1 = human_df.set_index("Disease")["Weighted_F1"].rename("Human_F1W")
ai_mean_f1 = ai_mean.rename("AI_Mean_F1W")

disease_summary = pd.concat([human_f1, ai_mean_f1], axis=1)

# Δ(AI–H1): mean AI score minus the human score
disease_summary["Δ(AI–H1)"] = disease_summary["AI_Mean_F1W"] - disease_summary["Human_F1W"]

# --------------------------------------------
# 🧾 FORMAT & ROUND
# --------------------------------------------
pretty_names = {
    "Human_F1W": "Human $F1_W$",
    "AI_Mean_F1W": "Mean AI $F1_W$",
    "Δ(AI–H1)": "Δ(AI–H1)",
}
disease_summary = disease_summary.reset_index()
disease_summary = disease_summary.rename(columns=pretty_names)
disease_summary = disease_summary.round(3)
# Largest AI advantage first
disease_summary = disease_summary.sort_values(by="Δ(AI–H1)", ascending=False)
disease_summary = disease_summary.reset_index(drop=True)

# --------------------------------------------
# 💾 SAVE & DISPLAY
# --------------------------------------------
disease_summary.to_csv(OUTPUT_PATH, index=False)
# print(f"✅ Saved AI vs Human F1 summary → {OUTPUT_PATH}")

# print("\n📊 AI vs Human (H1) Weighted F1 Summary by Disease:")
display(disease_summary)


In [None]:
# # Summarize by disease (mean AI vs Human)
# summary_short = summary_df[['Disease', 'Arm', 'Weighted_F1']].copy()

# # Split Human vs AI
# human = summary_short[summary_short['Arm'] == 'H1'].set_index('Disease')
# ai = summary_short[summary_short['Arm'].str.startswith('AI')]

# # Mean AI F1 per disease
# ai_mean = ai.groupby('Disease')['Weighted_F1'].mean().rename('AI_Mean_F1')

# # Merge
# disease_summary = human[['Weighted_F1']].rename(columns={'Weighted_F1': 'Human $F1_W$'}).join(ai_mean)
# disease_summary['Δ(AI–H1)'] = disease_summary['AI_Mean_F1'] - disease_summary['Mean AI $F1_W$']

# # Save concise file
# disease_summary.to_csv("phase1_primaryobjectivesummary_diseaselevel.csv")
# display(disease_summary.round(3))


---------------------------------------------------------------------

---------------------------**Cross-Disease Comparative Summary**----------------

**Cross-Disease Comparative Analysis of AI vs Human Workflows (Mean Δ(AI–H1) in Prevalence-Weighted F1)**

This table summarizes the mean difference (Δ) in prevalence-weighted F1 ($F1_W$) between each AI workflow (AI1–AI4) and the Human workflow, averaged across all diseases.
**Higher positive Δ** values indicate better average alignment of the **AI workflow** with the **True Gold Standard (TGS)** compared to the Human reference.

In [None]:
# Cross-disease summary: for each AI arm, the mean paired difference
# (AI - H1) in Weighted F1 across diseases, with a normal-approximation
# 95% confidence interval (mean ± 1.96 · SD / sqrt(n)).
import numpy as np  # imported here so the cell does not rely on a later cell's import

# One row per disease, one column per arm. `pivot` raises if a
# (Disease, Arm) pair occurs more than once.
pivot = summary_df.pivot(index='Disease', columns='Arm', values='Weighted_F1')
comparisons = []
for ai in ['AI1','AI2','AI3','AI4']:
    # Drop diseases where either arm is missing so that n below is the
    # number of pairs that mean()/std() actually use (both skip NaN).
    diffs = (pivot[ai] - pivot['H1']).dropna()
    n_pairs = len(diffs)
    mean_delta = diffs.mean()
    # The SD is undefined for fewer than two pairs; report NaN bounds then.
    half_width = 1.96 * diffs.std() / np.sqrt(n_pairs) if n_pairs > 1 else np.nan
    comparisons.append({
        'AI_arm': ai,
        'Mean_Δ': mean_delta,
        '95%_CI_lower': mean_delta - half_width,
        '95%_CI_upper': mean_delta + half_width
    })
comparisons = pd.DataFrame(comparisons)
comparisons

In [None]:
# print("🔍 Checking disease–arm availability:")
# pivot_check = summary_df.pivot_table(index='Disease', columns='Arm', values='Weighted_F1', aggfunc='count', fill_value=0)
# print(pivot_check)
# print("\nNumber of diseases with complete arm data:", (pivot_check > 0).all(axis=1).sum())


### Phase B — Paired Comparative Analysis with Permutation FWER Correction (6.2.1)

1. **Descriptive Statistics per Arm (FRD 6.2.1, SAP 2.1.3):** Calculates mean, median, standard deviation, min, max, and 95% CI for Weighted F1 ($F1_W$) of Human (H1) and AI arms across diseases with complete data.  
2. **Paired Comparisons & Non-Inferiority Testing:** Performs paired t-tests or Wilcoxon signed-rank tests comparing each AI arm to H1, including bootstrapped 95% CIs and evaluation of non-inferiority with a margin δ=0.05.  
3. **Permutation-Based Familywise Error Rate (FWER):** Implements exact sign-flip permutation tests to adjust for multiple comparisons, integrating results into `paired_analysis_summary_with_permutation.csv`; this supports robust inference on AI vs. Human performance differences.


In [None]:
# ==========================================================
# 📊 PRIMARY STATISTICS (SAP 2.1.3) — with permutation FWER correction
# ==========================================================
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
import itertools

# --- 0️⃣ Clean and prepare data -------------------------------------------
# Normalise arm labels (trim, upper-case, fold the Human aliases into 'H1')
# and make sure the score column is numeric.
_arm_labels = summary_df['Arm'].astype(str).str.strip().str.upper()
summary_df['Arm'] = _arm_labels.replace({'HUMAN': 'H1', 'H1.0': 'H1'})
summary_df['Weighted_F1'] = pd.to_numeric(summary_df['Weighted_F1'], errors='coerce')

# Keep only diseases that have exactly five non-missing Weighted_F1 values
# (intended as H1 + 4 AI arms).
_scored_per_disease = summary_df.groupby('Disease')['Weighted_F1'].count()
complete_diseases = _scored_per_disease[_scored_per_disease == 5].index
_is_complete = summary_df['Disease'].isin(complete_diseases)
summary_df = summary_df.loc[_is_complete].copy()

print(f"✅ Using {len(complete_diseases)} diseases with complete arms: {list(complete_diseases)}")

# --- 1️⃣ Descriptive statistics per arm -----------------------------------
# Per-arm mean / median / SD / min / max, plus a normal-approximation 95% CI
# that uses the number of retained diseases as N for every arm.
_per_arm = summary_df.groupby('Arm')['Weighted_F1']
desc = _per_arm.agg(['mean', 'median', 'std', 'min', 'max']).reset_index()
desc['N'] = summary_df['Disease'].nunique()
_ci_half_width = 1.96 * desc['std'] / np.sqrt(desc['N'])
desc['95CI_low']  = desc['mean'] - _ci_half_width
desc['95CI_high'] = desc['mean'] + _ci_half_width
desc.to_csv("descriptive_statistics_by_arm.csv", index=False)


# --- 2️⃣ Paired comparative analysis --------------------------------------
def paired_compare(df, control='H1', delta=0.05):
    """Compare every non-control arm with the control arm, paired by disease.

    For each arm other than ``control`` the per-disease differences
    (arm - control) in ``Weighted_F1`` are collected, then summarised with:

    * mean and sample SD of the differences,
    * a Shapiro-Wilk p-value (only when there are at least 3 pairs),
    * a paired t-test when Shapiro-Wilk p > 0.05, otherwise a Wilcoxon
      signed-rank test,
    * a percentile bootstrap 95% CI of the mean difference (5000 resamples,
      fixed seed 42, so results are reproducible),
    * a non-inferiority flag: lower CI bound > ``-delta``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Disease``, ``Arm`` and ``Weighted_F1``.
        If a (disease, arm) pair occurs more than once, the first row is used.
    control : str
        Label of the reference arm.
    delta : float
        Non-inferiority margin.

    Returns
    -------
    pandas.DataFrame
        One row per arm that has at least two usable pairs. The columns are
        always present, even when no arm qualifies, so downstream merges and
        column selections keep working on an empty result.

    Notes
    -----
    The flag column is always named ``'NonInferior(δ=0.05)'``, whatever value
    ``delta`` has; the name is kept fixed because later cells select it.
    """
    columns = [
        'AI_Arm', 'N_Diseases', 'Mean_Diff_F1W', 'SD_Diff', 'Shapiro_p',
        'Test_Used', 'p_value', '95CI_Low', '95CI_High', 'NonInferior(δ=0.05)',
    ]

    diseases = df['Disease'].unique()
    ai_arms = [a for a in df['Arm'].unique() if a != control]
    # The per-disease slices do not depend on the arm, so build them once.
    per_disease = [df[df['Disease'] == d] for d in diseases]
    out = []

    for ai in ai_arms:
        diffs = []
        for ddf in per_disease:
            ai_vals = ddf.loc[ddf['Arm'] == ai, 'Weighted_F1']
            h_vals = ddf.loc[ddf['Arm'] == control, 'Weighted_F1']
            if ai_vals.empty or h_vals.empty:
                continue  # this disease lacks one side of the pair
            try:
                f_ai = float(ai_vals.iloc[0])
                f_h = float(h_vals.iloc[0])
            except (TypeError, ValueError):
                continue  # non-numeric score: treat the pair as missing
            if np.isfinite(f_ai) and np.isfinite(f_h):
                diffs.append(f_ai - f_h)

        diffs = np.array(diffs)
        n = len(diffs)
        if n < 2:
            continue  # too few pairs to test

        mean_diff = np.mean(diffs)
        sd_diff = np.std(diffs, ddof=1)
        shapiro_p = stats.shapiro(diffs).pvalue if n >= 3 else np.nan

        # Non-parametric test when normality is rejected or could not be assessed.
        if np.isnan(shapiro_p) or shapiro_p <= 0.05:
            try:
                t_stat, p_val = stats.wilcoxon(diffs)
                test_used = "Wilcoxon"
            except ValueError:
                # Raised e.g. when every difference is zero.
                t_stat, p_val = np.nan, np.nan
                test_used = "Wilcoxon (invalid)"
        else:
            t_stat, p_val = stats.ttest_rel(diffs, np.zeros_like(diffs))
            test_used = "Paired t-test"

        # Percentile bootstrap CI of the mean; the generator is re-seeded per
        # arm so each arm's interval is reproducible on its own.
        rng = np.random.default_rng(42)
        boot = [np.mean(rng.choice(diffs, n, replace=True)) for _ in range(5000)]
        ci_low, ci_high = np.percentile(boot, [2.5, 97.5])
        non_inferior = ci_low > -delta

        out.append({
            'AI_Arm': ai,
            'N_Diseases': n,
            'Mean_Diff_F1W': round(mean_diff, 3),
            'SD_Diff': round(sd_diff, 3),
            'Shapiro_p': round(shapiro_p, 3) if not np.isnan(shapiro_p) else np.nan,
            'Test_Used': test_used,
            'p_value': round(p_val, 4) if p_val is not None and np.isfinite(p_val) else np.nan,
            '95CI_Low': round(ci_low, 3),
            '95CI_High': round(ci_high, 3),
            'NonInferior(δ=0.05)': non_inferior
        })
    return pd.DataFrame(out, columns=columns)

# Run the paired AI-vs-H1 comparison on the cleaned summary table and save it.
# The CSV path is relative, so the file lands in the current working directory.
paired_df = paired_compare(summary_df, control='H1', delta=0.05)
paired_df.to_csv("paired_analysis_summary.csv", index=False)
# print("\n✅ Paired comparison saved to paired_analysis_summary.csv")

# --- 3️⃣ Exact permutation (sign-flip) familywise correction --------------
def permutation_max_stat_familywise(summary_df, control='H1'):
    """Exact sign-flip permutation test with max-statistic FWER adjustment.

    For every arm other than ``control`` the per-disease differences
    (arm - control) in ``Weighted_F1`` are formed on the diseases that have a
    value for *all* arms. All 2**n sign patterns are then enumerated (one sign
    per disease, shared by every arm); for each pattern the largest absolute
    mean difference over the arms is taken. An arm's adjusted p-value is the
    share of patterns whose maximum is at least as large as that arm's
    observed absolute mean difference.

    Parameters
    ----------
    summary_df : pandas.DataFrame
        Must contain ``Disease``, ``Arm`` and ``Weighted_F1`` (convertible to
        float). If a (disease, arm) pair is duplicated the first row is used.
    control : str
        Label of the reference arm.

    Returns
    -------
    pandas.DataFrame
        Columns ``AI_Arm``, ``Observed_Mean_Diff``, ``FWER_pvalue``; empty
        (but with these columns) when there is no arm besides ``control``.

    Raises
    ------
    ValueError
        If no disease has a value for every arm.

    Notes
    -----
    The run time grows as 2**n in the number of complete diseases; the
    patterns are streamed rather than stored, so memory stays small, but the
    enumeration itself is only practical for modest n.
    """
    result_columns = ['AI_Arm', 'Observed_Mean_Diff', 'FWER_pvalue']
    diseases = sorted(summary_df['Disease'].unique())
    ai_arms = [a for a in sorted(summary_df['Arm'].unique()) if a != control]
    if not ai_arms:
        # Nothing to compare against the control arm.
        return pd.DataFrame(columns=result_columns)

    # First score seen for each (disease, arm) pair, built in a single pass.
    scores = summary_df['Weighted_F1'].astype(float)
    first_score = {}
    for disease, arm, value in zip(summary_df['Disease'], summary_df['Arm'], scores):
        first_score.setdefault((disease, arm), value)

    # One vector per arm, ordered like `diseases`; NaN marks a missing pair.
    mat = {
        arm: np.array([first_score.get((d, arm), np.nan) for d in diseases])
        for arm in ai_arms + [control]
    }

    # keep only diseases present for all arms
    valid_mask = np.all(~np.isnan(np.column_stack(list(mat.values()))), axis=1)
    n = int(valid_mask.sum())
    if n == 0:
        raise ValueError("No complete disease rows across all arms")

    # rows = AI arms, columns = complete diseases
    diffs_matrix = np.vstack([(mat[ai][valid_mask] - mat[control][valid_mask])
                              for ai in ai_arms])
    n_arms = diffs_matrix.shape[0]

    obs_mean = diffs_matrix.mean(axis=1)
    abs_obs = np.abs(obs_mean)
    # Small tolerance so that rounding differences between `mean` and
    # `dot / n` cannot exclude the identity pattern (the observed data itself).
    tol = 1e-12 * max(1.0, float(abs_obs.max()))

    # Stream the 2**n sign patterns and count exceedances per arm.
    exceed = np.zeros(n_arms, dtype=np.int64)
    total = 0
    for signs in itertools.product((1.0, -1.0), repeat=n):
        means = diffs_matrix.dot(np.array(signs)) / n
        exceed += (np.max(np.abs(means)) >= abs_obs - tol)
        total += 1

    fw_p = exceed / total

    return pd.DataFrame({
        'AI_Arm': ai_arms,
        'Observed_Mean_Diff': obs_mean,
        'FWER_pvalue': fw_p
    })

# Compute the permutation-based familywise p-values and attach them to the
# paired-comparison table. The left join keeps every row of paired_df; arms
# without a permutation result get NaN in the added columns.
perm_df = permutation_max_stat_familywise(summary_df, control='H1')
paired_df = paired_df.merge(perm_df, on='AI_Arm', how='left')
# Relative path: the CSV is written to the current working directory.
paired_df.to_csv("paired_analysis_summary_with_permutation.csv", index=False)

# NOTE(review): only this heading is printed; the table dump below is disabled.
print("\n🔍 Paired_df check before plotting:")
# print(paired_df)

# --- 4️⃣ Visualization: forest-style plot --------------------------------
# paired_plot = paired_df.replace([np.inf, -np.inf], np.nan)
# paired_plot = paired_plot.dropna(subset=['Mean_Diff_F1W', '95CI_Low', '95CI_High'])

# if paired_plot.empty:
#     print("⚠️ No valid arms with finite CI values. Possibly missing H1 or identical scores.")
# else:
#     fig, ax = plt.subplots(figsize=(8, 5))
#     ax.errorbar(
#         paired_plot['AI_Arm'],
#         paired_plot['Mean_Diff_F1W'],
#         yerr=[
#             paired_plot['Mean_Diff_F1W'] - paired_plot['95CI_Low'],
#             paired_plot['95CI_High'] - paired_plot['Mean_Diff_F1W']
#         ],
#         fmt='o', capsize=5, color='navy'
#     )
#     ax.axhline(0, color='gray', ls='--', label='No difference')
#     ax.axhline(-0.05, color='red', ls=':', label='Non-inferiority margin (−0.05)')
#     for i, row in paired_plot.iterrows():
#         lbl = (
#             f"{row['Mean_Diff_F1W']:.2f} "
#             f"[{row['95CI_Low']:.2f}, {row['95CI_High']:.2f}]\n"
#             f"p={row['p_value'] if pd.notna(row['p_value']) else 'NA'}, "
#             f"FWER={row['FWER_pvalue'] if pd.notna(row['FWER_pvalue']) else 'NA'}"
#         )
#         ax.text(row['AI_Arm'], row['Mean_Diff_F1W'] + 0.02, lbl,
#                 ha='center', va='bottom', fontsize=9)
#     ax.set_ylabel("Mean difference in Weighted F1 (AI – Human)")
#     ax.set_title("Paired comparison across clinical ideas\n(Exact permutation FWER adjusted)")
#     ax.legend()
#     plt.tight_layout()
#     plt.savefig("paired_f1_differences_permutation.png", dpi=300)
#     plt.show()


### Phase B — Final Primary Objective Analysis Summary (SAP 6.2.1)

1. **Comprehensive Summary Across Metrics:** Combines descriptive statistics, disease-level Δ(AI–H1) comparisons, and paired inferential results for all AI and Human (H1) arms, using Weighted F1 ($F1_W$) as the primary outcome measure (FRD 6.2.1).  
2. **Non-Inferiority Assessment:** Evaluates each AI arm against H1 with 95% CIs, FWER-adjusted p-values, and interpretation of non-inferiority using δ = 0.05 to identify AI workflows performing comparably to humans.  

In [None]:
# ==========================================================
# 📄 SAP 6.2.1 — Final Primary Objective Analysis Summary
# ==========================================================

# import pandas as pd
# import numpy as np

# print("\n" + "="*70)
# print("📊 SAP 6.2.1 — Primary Objective Analysis Summary")
# print("="*70)

# # 1️⃣ DESCRIPTIVE STATISTICS ---------------------------------------------------
# print("\n1️⃣ DESCRIPTIVE STATISTICS (Weighted F1 per Arm)")
# try:
#     print(desc[['Arm', 'mean', 'median', 'std', 'min', 'max', '95CI_low', '95CI_high']].round(3))
# except Exception as e:
#     print("⚠️ Descriptive summary unavailable:", e)

# 2️⃣ DISEASE-LEVEL Δ(AI–H1) COMPARISON ---------------------------------------
# try:
#     pivot = summary_df.pivot(index='Disease', columns='Arm', values='Weighted_F1')
#     pivot['AI_Mean'] = pivot[['AI1', 'AI2', 'AI3', 'AI4']].mean(axis=1)
#     pivot['Δ(AI–H1)'] = pivot['AI_Mean'] - pivot['H1']
#     print("\n2️⃣ DISEASE-LEVEL COMPARISON (Human vs Mean AI Weighted F1)")
#     print(pivot[['H1', 'AI_Mean', 'Δ(AI–H1)']].round(3))
# except Exception as e:
#     print("⚠️ Could not compute per-disease Δ(AI–H1):", e)

# # 3️⃣ PAIRED INFERENTIAL ANALYSIS ---------------------------------------------
# print("\n3️⃣ PAIRED INFERENTIAL RESULTS (AI vs Human)")
# try:
#     print(paired_df[['AI_Arm', 'N_Diseases', 'Mean_Diff_F1W', '95CI_Low', '95CI_High',
#                      'Test_Used', 'p_value', 'FWER_pvalue', 'NonInferior(δ=0.05)']])
# except Exception as e:
#     print("⚠️ Paired inferential results unavailable:", e)

# # 4️⃣ NON-INFERIORITY INTERPRETATION ------------------------------------------
# print("\n4️⃣ NON-INFERIORITY INTERPRETATION (δ = 0.05)")
# try:
#     for _, row in paired_df.iterrows():
#         status = "✅ Non-inferior" if row['95CI_Low'] > -0.05 else "❌ Not non-inferior"
#         print(f"{row['AI_Arm']}: Δ={row['Mean_Diff_F1W']:.3f}, "
#               f"95% CI=({row['95CI_Low']:.3f}, {row['95CI_High']:.3f}), "
#               f"FWER p={row['FWER_pvalue']:.3f}, {status}")
# except Exception as e:
#     print("⚠️ Non-inferiority summary unavailable:", e)

# 5️⃣ SAVE ALL SUMMARIES ------------------------------------------------------
# print("\n5️⃣ EXPORTING RESULTS")
# try:
#     with pd.ExcelWriter("SAP_6_2_1_PrimaryObjectiveAnalysis.xlsx") as writer:
#         desc.to_excel(writer, sheet_name="Descriptive_Stats", index=False)
#         pivot[['H1', 'AI_Mean', 'Δ(AI–H1)']].to_excel(writer, sheet_name="Disease_Comparison")
#         paired_df.to_excel(writer, sheet_name="Paired_Inferential", index=False)
#     # print("✅ Saved: SAP_6_2_1_PrimaryObjectiveAnalysis.xlsx")
# except Exception as e:
#     print("⚠️ Export failed:", e)

# print("\n✅ SAP 6.2.1 summary complete.")
print("="*70)


**Exploratory Inferential Analysis — Paired Comparison of AI vs Human Workflows (Weighted F1 Δ with FWER Correction)**

This table presents the paired statistical comparison of each AI workflow against the Human workflow across all clinical ideas.

For each AI arm, the mean difference in prevalence-weighted F1 ($Δ = AI – H1$), standard deviation, 95 % bootstrap confidence interval, normality check, chosen paired test, and permutation-adjusted (FWER) p-values are reported.
Positive mean Δ values indicate higher average alignment of the AI workflow with the True Gold Standard (TGS) relative to the Human reference.

In [None]:
# Reduce the paired-comparison table to the columns reported for the SAP.
# Selecting by label raises KeyError if any of them is missing.
paired_df = paired_df.loc[:, [
    'AI_Arm',
    'Mean_Diff_F1W',
    '95CI_Low',
    '95CI_High',
    'Test_Used',
    'p_value',
    'FWER_pvalue',
    'NonInferior(δ=0.05)',
]]

# Show at most the first 10 rows as the cell output
paired_df.head(10)


<!-- ### Phase B — Derived Metrics & Weighted F1 Integration (6.2.1)

1. **Compute Standard Metrics (FRD 6.2.1):** Calculates per-arm precision, recall, and unweighted F1 based on overlap with the True Gold Standard (TGS), along with coverage rate (recall) and specificity (inverse of false positives).  
2. **Weighted F1 Calculation:** Integrates prevalence-weighted F1 ($F1_W$) using `ConceptRecordCounts.csv` if not already present, reflecting the primary outcome measure from Section 6.2.1 of the SAP.  
3. **Output Saved (CSV):** The enhanced summary, including all derived metrics and weighted F1, is saved as `gold_standard_comparison_summary_with_metrics.csv` for downstream analysis and reporting.
 -->

In [None]:
# ===============================================================
# 🧮 GOLD STANDARD COMPARISON — FINAL VERSION (No ONLINE folders)
# ===============================================================

import os
import re
import numpy as np
import pandas as pd

# --------------------------------------------
# 🧭 PATH SETUP
# --------------------------------------------
# Directory of this script, or the working directory when __file__ is not
# defined (e.g. when run as a notebook cell).
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    SCRIPT_DIR = os.getcwd()

# ConceptSets is looked up next to the script's parent first, then inside the
# script directory itself; it is an error if neither exists.
ROOT_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, '..', 'ConceptSets'))
if not os.path.exists(ROOT_DIR):
    alt_root = os.path.join(SCRIPT_DIR, 'ConceptSets')
    if not os.path.exists(alt_root):
        raise FileNotFoundError("⚠️ ConceptSets folder not found.")
    ROOT_DIR = os.path.abspath(alt_root)

# Input / output locations used by the rest of this cell.
RESULTS_DIR = os.path.join(SCRIPT_DIR, "results")
GOLD_FINAL_DIR = os.path.join(RESULTS_DIR, "phaseA_gold_final")
RECORD_COUNT_PATH = os.path.join(SCRIPT_DIR, "ConceptRecordCounts.csv")
INPUT_PATH = os.path.join(RESULTS_DIR, "gold_standard_comparison_summary.csv")
OUTPUT_PATH = os.path.join(RESULTS_DIR, "gold_standard_comparison_summary_with_metrics.csv")

# --------------------------------------------
# 🧹 Helper: Clean folder names
# --------------------------------------------
def strip_numeric_prefix(name: str) -> str:
    """Drop a leading run of digits, hyphens and underscores, then trim.

    For example '0152-Foo' becomes 'Foo' and '0031_Bar' becomes 'Bar'.
    Names without such a prefix are only whitespace-trimmed.
    """
    without_prefix = re.sub(r"^[\d\-\_]+", "", name)
    return without_prefix.strip()

# --------------------------------------------
# 📘 LOAD INPUT FILES
# --------------------------------------------
# Comparison summary (types inferred) and concept record counts (read as
# text, then only the count column is made numeric; bad/missing counts -> 0).
summary_df = pd.read_csv(INPUT_PATH)
record_count_df = pd.read_csv(RECORD_COUNT_PATH, dtype=str)
_raw_counts = pd.to_numeric(record_count_df["record_count"], errors="coerce")
record_count_df["record_count"] = _raw_counts.fillna(0)

# Guarantee the three count columns exist and are numeric with no gaps.
for col in ("Overlap", "Arm_only", "GoldStandard_only"):
    if col not in summary_df.columns:
        summary_df[col] = 0
    _as_number = pd.to_numeric(summary_df[col], errors="coerce")
    summary_df[col] = _as_number.fillna(0)

# --------------------------------------------
# 🧩 Weighted F1 function
# --------------------------------------------
def compute_weighted_f1(arm_concepts, gold_standard, record_count_df):
    """Return the record-count-weighted F1 of an arm against the gold standard.

    Each concept contributes its ``record_count`` as a weight: concepts in
    both sets are weighted true positives, arm-only concepts weighted false
    positives, gold-only concepts weighted false negatives. Concepts that do
    not appear in ``record_count_df`` contribute nothing.

    Parameters
    ----------
    arm_concepts, gold_standard : iterable
        Concept identifiers; any iterable is accepted and converted to a set.
        Identifiers must compare equal to the values in the ``conceptId``
        column (same type and formatting).
    record_count_df : pandas.DataFrame
        Needs the columns ``conceptId`` and a numeric ``record_count``.

    Returns
    -------
    float
        Weighted F1 rounded to 3 decimals. ``0.0`` when there is no weighted
        true positive but some weighted error (the arm got nothing right).
        ``nan`` when none of the concepts has a record-count row, or when all
        relevant weights are zero, because the score is then undefined.
    """
    # Convert once so lists/tuples work with the set algebra below.
    arm_set = set(arm_concepts)
    gold_set = set(gold_standard)

    relevant = record_count_df[record_count_df["conceptId"].isin(arm_set | gold_set)]
    if relevant.empty:
        return np.nan

    ids = relevant["conceptId"]
    weights = relevant["record_count"]
    wtp = weights[ids.isin(arm_set & gold_set)].sum()
    wfp = weights[ids.isin(arm_set - gold_set)].sum()
    wfn = weights[ids.isin(gold_set - arm_set)].sum()

    if not wtp:
        # No weighted hit: F1 is 0 if there is any weighted miss or false
        # alarm, and undefined if there is no weight at all.
        return 0.0 if (wfp + wfn) else np.nan

    # wtp > 0 here, so both denominators and precision + recall are positive.
    precision = wtp / (wtp + wfp)
    recall = wtp / (wtp + wfn)
    return round(2 * precision * recall / (precision + recall), 3)

# --------------------------------------------
# 🚀 MAIN LOOP
# --------------------------------------------
# For every (Disease, Arm) row of the summary: load the disease's final Gold
# Standard, find the arm's ConceptSets folder, and compute the weighted F1 of
# the arm's included concepts against the gold concepts.
weighted_f1_scores = []

# The folder listing does not change between rows, so read it once; sorting
# makes the "first matching folder" choice independent of filesystem order.
concept_set_folders = sorted(os.listdir(ROOT_DIR))

for _, row in summary_df.iterrows():
    disease = str(row["Disease"])
    arm = str(row["Arm"]).upper()

    # --- Locate Gold Standard file (per-disease subfolder first, then flat) ---
    gold_paths = [
        os.path.join(GOLD_FINAL_DIR, f"{disease}", f"{disease}_Gold_Standard_FINAL.csv"),
        os.path.join(GOLD_FINAL_DIR, f"{disease}_Gold_Standard_FINAL.csv"),
    ]
    gold_path = next((p for p in gold_paths if os.path.exists(p)), None)
    if not gold_path:
        print(f"⚠️ Skipping {disease}-{arm}: Gold Standard not found.")
        weighted_f1_scores.append(np.nan)
        continue

    gold_df = pd.read_csv(gold_path, dtype=str)
    # Normalise ids: trim and drop a trailing ".0" left by float formatting.
    gold_df["conceptId"] = gold_df["conceptId"].astype(str).str.strip().str.replace(r'\.0+$', '', regex=True)
    gold_concepts = set(gold_df["conceptId"])

    # --- Locate matching ConceptSets folder (flat structure) ---
    # The folder name must carry both "[<disease code>]" and "[<arm>]".
    # The tokens are escaped so that any regex metacharacter in them is
    # matched literally.
    disease_token = re.compile(rf"\[{re.escape(disease.split('_')[0])}\]", re.IGNORECASE)
    arm_token = re.compile(rf"\[{re.escape(arm)}\]", re.IGNORECASE)

    matched_folder = None
    for folder in concept_set_folders:
        clean_name = strip_numeric_prefix(folder)
        # ⛔ ONLINE folders are never eligible; keep looking for a regular
        # folder instead of giving up on the first ONLINE hit.
        if re.search(r'ONLINE', clean_name, re.IGNORECASE):
            continue
        if disease_token.search(clean_name) and arm_token.search(clean_name):
            matched_folder = os.path.join(ROOT_DIR, folder)
            break

    if not matched_folder:
        # print(f"⚠️ Skipping {disease}-{arm}: No valid (non-online) folder found in ConceptSets.")
        weighted_f1_scores.append(np.nan)
        continue

    arm_path = os.path.join(matched_folder, "includedConcepts.csv")
    if not os.path.exists(arm_path):
        print(f"⚠️ Missing includedConcepts.csv for {arm} in {matched_folder}")
        weighted_f1_scores.append(np.nan)
        continue

    arm_df = pd.read_csv(arm_path, dtype=str)
    # Use the first column whose name contains "conceptid"; otherwise fall
    # back to the first column of the file.
    concept_col = next((c for c in arm_df.columns if "conceptid" in c.lower()), arm_df.columns[0])
    arm_df[concept_col] = arm_df[concept_col].astype(str).str.strip().str.replace(r'\.0+$', '', regex=True)
    arm_concepts = set(arm_df[concept_col])

    f1_w = compute_weighted_f1(arm_concepts, gold_concepts, record_count_df)
    # print(f"✅ {disease}-{arm}: Weighted F1 = {f1_w}")
    weighted_f1_scores.append(f1_w)

# NOTE(review): the scores collected above are not attached to summary_df
# because the assignment below is disabled, so the saved file carries no
# Weighted_F1 column from this cell. Confirm whether that is intended.
# summary_df["Weighted_F1"] = weighted_f1_scores

# --------------------------------------------
# 💾 SAVE OUTPUT
# --------------------------------------------
summary_df.to_csv(OUTPUT_PATH, index=False)
# print(f"\n✅ Saved enriched summary → {OUTPUT_PATH}")


### Phase B — Vocabulary & Mapping Coverage Analysis (Refined)

1. **Standard vs Non-Standard Concepts (FRD 6.2.2 & 6.2.4):** This cell evaluates each workflow’s included concepts, classifying them as standard or non-standard, and summarizes the counts per arm and disease.  
2. **Vocabulary Distribution (FRD 6.2.2):** It quantifies the representation of different source vocabularies (e.g., ICD-10-CM) across arms and diseases to assess coverage and potential gaps.  
3. **Mapping Coverage (FRD 6.2.2 & 6.2.4):** Non-standard concepts are compared against mapped standards to compute the proportion successfully mapped; results are saved in `vocabulary_standard_summary.csv`, `vocabulary_distribution_summary.csv`, and `mapping_coverage_summary.csv`.


In [None]:
# ==========================================================
# 🧩 Vocabulary & Mapping Coverage Analysis — flat ConceptSets
# ==========================================================
import os
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from collections import defaultdict

# Plot styling, and silence RuntimeWarnings raised from pandas' Styler module.
sns.set(style="whitegrid", font_scale=1.0)
warnings.filterwarnings("ignore", category=RuntimeWarning, module="pandas.io.formats.style")

# --------------------------------------------
# Locate the script directory and the ConceptSets root
# --------------------------------------------
# Falls back to the working directory when __file__ is not defined
# (e.g. when run as a notebook cell).
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    SCRIPT_DIR = os.getcwd()

# Prefer ../ConceptSets, then ./ConceptSets; fail if neither exists.
ROOT_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, '..', 'ConceptSets'))
if not os.path.exists(ROOT_DIR):
    alt_root = os.path.join(SCRIPT_DIR, 'ConceptSets')
    if not os.path.exists(alt_root):
        raise FileNotFoundError("ConceptSets folder not found relative to script.")
    ROOT_DIR = os.path.abspath(alt_root)

# Output folders for tables and plots (created if missing).
OUT_DIR = os.path.join(SCRIPT_DIR, 'results', 'vocabulary_analysis')
PLOT_DIR = os.path.join(OUT_DIR, 'plots')
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(PLOT_DIR, exist_ok=True)

# -------------------------------------------------------------
# Helpers
# -------------------------------------------------------------
def strip_numeric_prefix(name: str) -> str:
    """Drop a leading run of digits, hyphens and underscores, then trim.

    For example '0152-Foo' becomes 'Foo' and '0031_Bar' becomes 'Bar'.
    Names without such a prefix are only whitespace-trimmed.
    """
    without_prefix = re.sub(r'^[\d\-\_]+', '', name)
    return without_prefix.strip()

def clean_arm_name(folder_name: str, include_subteams: bool=False):
    """
    Derive an upper-case arm label from a ConceptSets folder name.

    Returns None when the name contains 'ONLINE', or when it carries a
    subteam tag such as '[S12]' and include_subteams is False.

    Otherwise the first bracketed arm token is used: 'AI<n>', 'REVIEWER',
    'CLINICIAN', 'MANUAL' and 'EXPERT' are returned as-is, while every token
    starting with 'H' ('H<n>' or 'HUMAN') is reported as 'HUMAN'. If no such
    token exists, the last bracketed alphanumeric token is returned, and
    failing that the last whitespace/underscore/hyphen-separated word.
    """
    name = strip_numeric_prefix(folder_name)

    # Exclusions: ONLINE arms always, subteam folders unless requested.
    is_online = re.search(r'ONLINE', name, re.IGNORECASE) is not None
    is_subteam = re.search(r'\[S\d+\]', name, re.IGNORECASE) is not None
    if is_online or (is_subteam and not include_subteams):
        return None

    # Preferred source: a recognised arm token in square brackets.
    known = re.search(r'\[(AI\d+|H\d+|HUMAN|REVIEWER|CLINICIAN|MANUAL|EXPERT)\]', name, re.IGNORECASE)
    if known:
        label = known.group(1).upper()
        return 'HUMAN' if label.startswith('H') else label

    # Fallback 1: whatever alphanumeric token sits in the last bracket pair.
    bracketed = re.findall(r'\[([A-Za-z0-9]+)\]', name)
    if bracketed:
        return bracketed[-1].upper()

    # Fallback 2: the trailing word of the name.
    words = re.split(r'[\s_\-]+', name)
    return words[-1].upper() if words else name.upper()

def load_csv_safe(path):
    """Read a CSV as all-text columns with whitespace-trimmed headers.

    Any failure while reading or cleaning is reported on stdout and an empty
    DataFrame is returned instead, so callers can simply test ``.empty``.
    """
    try:
        frame = pd.read_csv(path, dtype=str)
        frame.columns = frame.columns.str.strip()
    except Exception as e:
        # Best-effort loader: report the problem and hand back an empty table.
        print(f"  ⚠️ Failed to read {path}: {e}")
        return pd.DataFrame()
    return frame

def classify_standard_status(df):
    """
    Label each row of *df* by its ``standardConcept`` flag.

    Args:
        df: DataFrame that may contain a ``standardConcept`` column.

    Returns:
        A Series aligned to ``df.index`` holding "Standard" where the flag,
        trimmed and upper-cased, equals "S" and "Non-Standard" otherwise
        (missing values count as "Non-Standard"). When the column is absent
        every row is labelled "Unknown".
    """
    if "standardConcept" not in df.columns:
        # Reuse df's own index: a fresh RangeIndex would misalign (and produce
        # NaN) when the result is assigned back to a frame indexed differently.
        return pd.Series("Unknown", index=df.index)
    flags = df["standardConcept"].fillna("").astype(str).str.strip().str.upper()
    return flags.map(lambda flag: "Standard" if flag == "S" else "Non-Standard")

# -------------------------------------------------------------
# Storage
# -------------------------------------------------------------
# Accumulators filled by the folder loop below.
all_standard_stats = []   # per-folder Standard / Non-Standard count tables
all_vocab_stats = []      # per-folder vocabularyId count tables
all_mapping_stats = []    # per-folder mapping-coverage records (dicts)

# -------------------------------------------------------------
# Main: iterate flat folders directly under ROOT_DIR
# -------------------------------------------------------------
folders = [
    entry for entry in os.listdir(ROOT_DIR)
    if os.path.isdir(os.path.join(ROOT_DIR, entry))
]

for folder in sorted(folders):
    folder_path = os.path.join(ROOT_DIR, folder)
    clean_name = strip_numeric_prefix(folder)

    # ONLINE arms are excluded from this analysis.
    if re.search(r'ONLINE', clean_name, re.IGNORECASE):
        continue

    # A disease tag such as [C01] identifies a concept-set arm folder;
    # folders without one are ignored.
    m = re.search(r'\[(C\d+)\]', clean_name, re.IGNORECASE)
    if m is None:
        continue
    disease_code = m.group(1).upper()

    # Text following the last closing bracket (not used further in this
    # section).
    disease_display = re.sub(r'^.*\]', '', clean_name).strip()

    # None means the folder is an arm we do not analyse (e.g. a subteam).
    arm = clean_arm_name(folder, include_subteams=False)
    if arm is None:
        continue

    # Expected files inside this flat folder; includedConcepts.csv is
    # mandatory, mappedConcepts.csv is optional.
    included_path = os.path.join(folder_path, "includedConcepts.csv")
    mapped_path = os.path.join(folder_path, "mappedConcepts.csv")
    if not os.path.exists(included_path):
        continue

    included_df = load_csv_safe(included_path)
    if os.path.exists(mapped_path):
        mapped_df = load_csv_safe(mapped_path)
    else:
        mapped_df = pd.DataFrame()

    # Nothing to summarise when the included-concepts table is unreadable
    # or empty.
    if included_df is None or included_df.empty:
        continue

    # ---- Standard vs Non-Standard summary
    included_df["StandardStatus"] = classify_standard_status(included_df)
    status_counts = included_df["StandardStatus"].value_counts()
    std_summary = status_counts.rename_axis("StandardStatus").reset_index(name="Count")
    std_summary["Disease"] = disease_code
    std_summary["Arm"] = arm
    std_summary["Total"] = len(included_df)
    all_standard_stats.append(std_summary)

    # ---- Vocabulary distribution (missing vocabulary ids count as UNKNOWN)
    if "vocabularyId" in included_df.columns:
        vocab_counts = included_df["vocabularyId"].fillna("UNKNOWN").value_counts()
        vocab_summary = vocab_counts.rename_axis("vocabularyId").reset_index(name="Count")
        vocab_summary["Disease"] = disease_code
        vocab_summary["Arm"] = arm
        all_vocab_stats.append(vocab_summary)

    # ---- Mapping coverage (non-standard → mapped)
    # Collect the ids of every concept not labelled "Standard".
    if "conceptId" in included_df.columns:
        non_standard_df = included_df[included_df["StandardStatus"] != "Standard"]
        non_std_ids = set(non_standard_df["conceptId"].dropna().astype(str))
    else:
        non_std_ids = set()

    if non_std_ids:
        # Coverage = share of non-standard ids that also appear in
        # mappedConcepts.csv.
        if mapped_df.empty or "conceptId" not in mapped_df.columns:
            mapped_ids = set()
        else:
            mapped_ids = set(mapped_df["conceptId"].dropna().astype(str))
        mapped_to_standard = non_std_ids & mapped_ids
        n_non_standard = len(non_std_ids)
        n_mapped = len(mapped_to_standard)
        n_unmapped = len(non_std_ids - mapped_to_standard)
        coverage_pct = round(n_mapped / n_non_standard * 100, 2)
    else:
        # Explicit all-zero record so the folder still shows up in the output.
        n_non_standard = 0
        n_mapped = 0
        n_unmapped = 0
        coverage_pct = None

    all_mapping_stats.append({
        "Disease": disease_code, "Arm": arm,
        "Non_Standard_Concepts": n_non_standard,
        "Mapped_to_Standard": n_mapped,
        "Unmapped": n_unmapped,
        "Mapping_Coverage_%": coverage_pct,
        "Folder": folder
    })

# -------------------------------------------------------------
# Combine and save
# -------------------------------------------------------------
# Stack the per-folder results into one table per analysis; an empty
# DataFrame stands in when nothing was collected.
if all_standard_stats:
    df_standard = pd.concat(all_standard_stats, ignore_index=True)
else:
    df_standard = pd.DataFrame()

if all_vocab_stats:
    df_vocab = pd.concat(all_vocab_stats, ignore_index=True)
else:
    df_vocab = pd.DataFrame()

if all_mapping_stats:
    df_mapping = pd.DataFrame(all_mapping_stats)
else:
    df_mapping = pd.DataFrame()

# Write each summary table to OUT_DIR without the row index.
for _frame, _filename in (
    (df_standard, "vocabulary_standard_summary.csv"),
    (df_vocab, "vocabulary_distribution_summary.csv"),
    (df_mapping, "mapping_coverage_summary.csv"),
):
    _frame.to_csv(os.path.join(OUT_DIR, _filename), index=False)

# print(f"\n✅ Saved results in: {OUT_DIR}")
# print(" - vocabulary_standard_summary.csv")
# print(" - vocabulary_distribution_summary.csv")
# print(" - mapping_coverage_summary.csv")

# -------------------------------------------------------------
# 🧾 Display Tables
# -------------------------------------------------------------
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)
pd.set_option('display.colheader_justify', 'center')

# NOTE(review): `display` is not defined in this section; presumably it is the
# IPython/Jupyter built-in — confirm before running this as a plain script.

# ---------- STANDARD/NON-STANDARD ----------
print("\n==============================")
print("🧩 STANDARD VS NON-STANDARD SUMMARY")
print("==============================")
# df_standard is a column-less empty DataFrame when no folder produced any
# statistics; indexing "Disease" on it would raise KeyError, so guard first.
if df_standard.empty:
    print("⚠️ No standard/non-standard statistics available.")
else:
    for disease in df_standard["Disease"].unique():
        sub = df_standard[df_standard["Disease"] == disease].copy()
        # One row per arm, one column per StandardStatus value.
        pivot = sub.pivot_table(
            index="Arm", columns="StandardStatus", values="Count", aggfunc="sum", fill_value=0
        ).reset_index()
        # numeric_only keeps the string "Arm" column out of the row total.
        pivot["Total"] = pivot.sum(axis=1, numeric_only=True)
        # .get falls back to 0 when no arm of this disease has standard concepts.
        pivot["% Standard"] = (pivot.get("Standard", 0) / pivot["Total"] * 100).round(1)
        print(f"\n=== Disease: {disease} ===")
        display(pivot.style.background_gradient(subset=["% Standard"], cmap="Blues").format(precision=1))

# ---------- VOCABULARY DISTRIBUTION ----------
print("\n==============================")
print("📚 VOCABULARY DISTRIBUTION SUMMARY")
print("==============================")
# Same guard as above: df_vocab has no "Disease" column when it is empty.
if df_vocab.empty:
    print("⚠️ No vocabulary distribution statistics available.")
else:
    for disease in df_vocab["Disease"].unique():
        sub = df_vocab[df_vocab["Disease"] == disease].copy()
        # One row per vocabulary, one column per arm.
        pivot = sub.pivot_table(index="vocabularyId", columns="Arm", values="Count", aggfunc="sum", fill_value=0)
        # "Total" is a temporary sort key: order vocabularies by overall count,
        # then drop it so only the per-arm columns are shown.
        pivot["Total"] = pivot.sum(axis=1)
        pivot = pivot.sort_values(by="Total", ascending=False).drop(columns="Total")
        print(f"\n=== Disease: {disease} ===")
        display(pivot.style.background_gradient(cmap="Purples").format(precision=0))



### Phase B — Objective 3: Assessing Clinical Impact of Workflow Disagreements (FRD 6.2.2, Objective 3)

1. **Metric Computation:** Calculates Spearman’s rank correlation between Weighted F1 ($F1_W$) and Unweighted F1 across all workflows to quantify the influence of concept prevalence on performance.  
2. **Clinical Insight:** Lower correlation values indicate that prevalence weighting changes how the workflows rank, which suggests that disagreements are concentrated in high-prevalence concepts and may therefore have clinical consequences.  
3. **Result Display:** Outputs the correlation coefficient and p-value directly.


In [None]:
# # ==========================================================
# # 🔍 Objective 3 — Clinical Impact of Disagreements
# # ==========================================================
# print("\n🔹 Objective 3: Clinical Impact of Disagreements")

# # Identify the unweighted F1 column (often labeled 'F1')
# unweighted_col = "Unweighted_F1" if "Unweighted_F1" in summary_df.columns else "F1"

# if all(c in summary_df.columns for c in ["Weighted_F1", unweighted_col]):
#     spearman_corr, spearman_p = stats.spearmanr(
#         summary_df["Weighted_F1"], summary_df[unweighted_col],
#         nan_policy='omit'
#     )
#     print(f"✅ Spearman’s ρ = {spearman_corr:.3f}, p = {spearman_p:.4f}")
# else:
#     print("⚠️ Neither 'F1' nor 'Unweighted_F1' found — cannot compute Spearman correlation.")


<!-- ## Phase B — Source Vocabulary Sensitivity Analysis (FRD 6.2.4)

This analysis evaluates how different source vocabularies affect the workflow’s ability to match the True Gold Standard (TGS). It helps identify whether certain vocabularies disproportionately influence AI or Human arm performance.

**Procedure:**  
1. Load the adjudicated Gold Standard (`*_Gold_Standard_FINAL.csv`) for each disease.  
2. Identify all source vocabularies contributing concepts to the Gold Standard.  
3. For each arm (Human or AI), compute the weighted performance metrics restricted to each vocabulary subset.  
4. Record counts of concepts that were correctly mapped (true positives), missed (false negatives), or erroneously included (false positives).  

**Deliverable:**  
The combined results for all diseases and arms are saved as `combined_source_vocab_sensitivity.csv`, summarizing the vocabulary-specific Weighted F1, Weighted Precision, Weighted Recall, and weighted counts of TP, FP, and FN.  
 -->

In [None]:
# # ==========================================================
# # 🔍 Objective 4 — Source Vocabulary Sensitivity Analysis
# # ==========================================================
# import os
# import pandas as pd

# print("\n🔹 Objective 4: Source Vocabulary Sensitivity Analysis (Standalone)")

# # Load record counts for weighting
# record_count_df = pd.read_csv("ConceptRecordCounts.csv", dtype=str)
# record_count_df["record_count"] = record_count_df["record_count"].astype(float)

# # Identify all FINAL Gold Standard files
# final_files = [f for f in os.listdir(".") if f.endswith("_Gold_Standard_FINAL.csv")]
# print(f"📂 Found {len(final_files)} FINAL gold standard files.\n")

# all_vocab_results = []

# for final_file in final_files:
#     disease_folder = final_file.replace("_Gold_Standard_FINAL.csv", "")
#     if not os.path.exists(disease_folder):
#         print(f"⚠️ Disease folder not found for {disease_folder}")
#         continue

#     # Load final adjudicated file
#     final_df = pd.read_csv(final_file, dtype=str)
#     if "vocabularyId" not in final_df.columns:
#         print(f"⚠️ No 'vocabularyId' in {final_file}, skipping.")
#         continue

#     vocab_groups = final_df["vocabularyId"].dropna().unique()
#     print(f"=== Disease: {disease_folder} ===")
#     print(f"   Found vocabularies: {', '.join(vocab_groups)}")

#     # --- Identify all arms for this disease
#     arm_data = {}
#     for subfolder in os.listdir(disease_folder):
#         subfolder_path = os.path.join(disease_folder, subfolder)
#         if os.path.isdir(subfolder_path):
#             csv_path = os.path.join(subfolder_path, "includedConcepts.csv")
#             if os.path.exists(csv_path):
#                 arm_name = clean_arm_name(subfolder)
#                 # ✅ Skip subgroups and duplicates
#                 if arm_name is None or arm_name == "UNKNOWN" or arm_name in arm_data:
#                     continue
#                 arm_data[arm_name] = set(load_included_concepts(csv_path))

#     # --- Compute vocabulary-specific weighted F1
#     for vocab in vocab_groups:
#         vocab_gs = set(
#             final_df.loc[final_df["vocabularyId"] == vocab, "conceptId"]
#             .dropna().astype(str)
#         )
#         for arm, concepts in arm_data.items():
#             # compute_weighted_f1 returns dict with F1_W, P_W, etc.
#             weighted = compute_weighted_f1(concepts & vocab_gs, vocab_gs, record_count_df)

#             all_vocab_results.append({
#                 "Disease": disease_folder,
#                 "Arm": arm,
#                 "Vocabulary": vocab,
#                 "Weighted_F1_Source": weighted["F1_W"],
#                 "Weighted_Precision_Source": weighted["P_W"],
#                 "Weighted_Recall_Source": weighted["R_W"],
#                 "WTP_Source": weighted["WTP"],
#                 "WFP_Source": weighted["WFP"],
#                 "WFN_Source": weighted["WFN"]
#             })

#             f1_str = f"{weighted['F1_W']:.3f}" if weighted["F1_W"] is not None else "NA"
#             print(f"     {arm:<5} | {vocab:<20} → Weighted_F1_Source={f1_str}")

# print("\n✅ Completed source vocabulary sensitivity analysis for all diseases.")

# # --- Combine all results (deduplicated)
# if all_vocab_results:
#     vocab_df = pd.DataFrame(all_vocab_results)
#     vocab_df = vocab_df.drop_duplicates(subset=["Disease", "Arm", "Vocabulary"], keep="first")
#     vocab_df.to_csv("combined_source_vocab_sensitivity.csv", index=False)
#     print(f"✅ Saved combined file: combined_source_vocab_sensitivity.csv "
#           f"({vocab_df.shape[0]} unique rows across {vocab_df['Disease'].nunique()} diseases)")
# else:
#     print("⚠️ No vocabulary results generated — please check inputs.")


In [None]:
# Convert Phase2.ipynb to Phase2_results.html with code input cells hidden (--no-input); only errors are logged.
!jupyter nbconvert --to html --no-input --output "Phase2_results.html" "Phase2.ipynb" --log-level=ERROR