# Minds Meet Machines (MMM) Challenge — Phase 1: Pre-Adjudication Analysis

**Date:** 09/10/2025  
**OMOP Vocabulary Version:** August 27, 2025

# Execution environment & requirements

**Python**: 3.9+ recommended.  
**Main packages**: pandas, numpy, matplotlib, seaborn, upsetplot, scikit-learn, nbformat.

Example (pip):

    pip install pandas numpy matplotlib seaborn upsetplot scikit-learn nbformat


**Abstract:**  
This notebook performs Phase 1 (pre-adjudication) analyses for the MMM challenge. It compares concept sets generated by human and GenAI workflows, quantifies agreement using Jaccard similarity and other metrics, and produces visual summaries for downstream adjudication and consensus processes.


# Execution environment & requirements

**Python**: 3.9+ recommended.  
**Main packages**: pandas, numpy, matplotlib, seaborn, upsetplot, scikit-learn, nbformat.

Example (pip):

    pip install pandas numpy matplotlib seaborn upsetplot scikit-learn nbformat

If you use conda, create an environment and install equivalents. For exact reproducibility, include a `requirements.txt` or `environment.yml` in the repo root.


In [None]:
# ------------------------------
# Import required libraries
# ------------------------------

# OS and file handling
import os
import glob
import re

# Data manipulation and analysis
import pandas as pd
import numpy as np

# For generating combinations and permutations
from itertools import combinations

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Random utilities (if needed later)
import random
import string


In [None]:
# -----------------------------
# Phase 1 — Subgroup analysis (Split & Reconciliation)
# Run this BEFORE Gold Standard (TGS) generation
# -----------------------------
import os
import re
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

# --------------------------------------------
# Robust ROOT_DIR setup (auto-detect)
# --------------------------------------------
# Resolve the directory this code runs from: the file's own folder when
# __file__ is defined, otherwise (NameError, e.g. in an interactive kernel)
# the current working directory.
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    SCRIPT_DIR = os.getcwd()

# Try ../ConceptSets relative to script
ROOT_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, '..', 'ConceptSets'))

# If not found, fallback to sibling directory ConceptSets
if not os.path.exists(ROOT_DIR):
    alt_root = os.path.join(SCRIPT_DIR, 'ConceptSets')
    if os.path.exists(alt_root):
        ROOT_DIR = os.path.abspath(alt_root)
    else:
        # Neither candidate exists: warn and leave ROOT_DIR as None.
        print("⚠️ ConceptSets folder not found relative to script.")
        ROOT_DIR = None

# print(f"\nSCRIPT_DIR: {SCRIPT_DIR}")
# print(f"Current working directory: {os.getcwd()}")
# print(f"ROOT_DIR (resolved): {ROOT_DIR}")

# Verify folder contents
if ROOT_DIR and os.path.exists(ROOT_DIR):
    # Immediate sub-directories of ROOT_DIR; only consumed by the commented
    # diagnostic print below.
    folders = [f for f in os.listdir(ROOT_DIR) if os.path.isdir(os.path.join(ROOT_DIR, f))]
    # print(f"🧩 Found {len(folders)} folders in ConceptSets: {folders[:10]}")
else:
    # NOTE(review): this only warns; execution continues with ROOT_DIR unset.
    print("⚠️ ROOT_DIR invalid or empty.")


# --------------------------------------------
# Output and constants
# --------------------------------------------
# Phase 1 results are written under <SCRIPT_DIR>/results/phase1_subgroup_analysis
# (created on the spot if missing).
OUT_DIR_PHASE1 = os.path.join(SCRIPT_DIR, 'results', 'phase1_subgroup_analysis')
os.makedirs(OUT_DIR_PHASE1, exist_ok=True)

# Record-count CSV used for weighting; a bare file name, so it is resolved
# against the current working directory when opened.
CONCEPT_COUNTS_FILE = "ConceptRecordCounts.csv"

# -------------------------
# Helper functions
# -------------------------
def strip_numeric_prefix(name):
    """Drop a leading run of digits, dashes and underscores (e.g. '0152-').

    The remainder is returned with surrounding whitespace trimmed.
    """
    without_prefix = re.sub(r"^[\d\-\_]+", "", name)
    return without_prefix.strip()

def clean_arm_name(folder_name):
    """Turn a folder name into an upper-case arm label.

    The first bracketed tag among [AI#], [H#], [Human], [Reviewer],
    [Clinician], [Manual] or [Expert] (case-insensitive) becomes the label;
    any label starting with "H" is collapsed to "HUMAN". Without such a tag
    the trimmed folder name itself is returned in upper case.
    """
    name = folder_name.strip()
    match = re.search(r"\[(AI\d+|H\d+|Human|Reviewer|Clinician|Manual|Expert)\]", name, re.IGNORECASE)
    if match is None:
        return name.upper()
    label = match.group(1).upper()
    return "HUMAN" if label.startswith("H") else label

def is_subgroup(folder_name):
    """Return True when the name has an [H#] tag followed later by an [S#] tag.

    Matching is case-insensitive and runs on the name after its numeric
    prefix has been stripped.
    """
    cleaned = strip_numeric_prefix(folder_name)
    hit = re.search(r"\[H\d+\].*\[S\d+\]", cleaned, re.IGNORECASE)
    return hit is not None

def is_final_human(folder_name):
    """Return True when the name has an [H#] tag with no [S#] tag after it.

    Matching is case-insensitive and runs on the name after its numeric
    prefix has been stripped.
    """
    cleaned = strip_numeric_prefix(folder_name)
    hit = re.search(r"\[H\d+\](?!.*\[S\d+\])", cleaned, re.IGNORECASE)
    return hit is not None

def is_ai(folder_name):
    """Return True when the name carries an [AI#] tag (case-insensitive).

    The check runs on the name after its numeric prefix has been stripped.
    """
    cleaned = strip_numeric_prefix(folder_name)
    hit = re.search(r"\[AI\d+\]", cleaned, re.IGNORECASE)
    return hit is not None

def load_included_concepts(file_path):
    """Read a concept-set CSV and return its concept IDs as a set of strings.

    Column selection, in order of preference:
      1. a header that normalises (lower-case, alphanumerics only) to
         ``conceptid``;
      2. the first header whose normalised form contains both ``concept`` and
         ``id`` but not ``set``;
      3. the first column holding at least one all-digit value.

    Values are trimmed, a trailing ``.0`` is removed and blanks are dropped.
    An unreadable file (the error is printed) or a file with no usable
    column yields an empty set.
    """
    try:
        df = pd.read_csv(file_path, dtype=str)
    except Exception as e:
        print(f"[load_included_concepts] Error reading {file_path}: {e}")
        return set()

    # Map normalised header -> the label exactly as it appears in the frame.
    # Selecting by the real label keeps headers with stray whitespace
    # (e.g. " conceptId ") from raising KeyError.
    labels = list(df.columns)
    norm_map = {re.sub(r'[^a-z0-9]', '', str(c).lower()): c for c in labels}

    chosen = norm_map.get('conceptid')
    if chosen is None:
        for norm, orig in norm_map.items():
            if 'concept' in norm and 'id' in norm and 'set' not in norm:
                chosen = orig
                break
    if chosen is None:
        # Last resort: first column that contains any purely numeric value.
        for c in labels:
            if df[c].dropna().astype(str).str.match(r'^\d+$').any():
                chosen = c
                break
    if chosen is None:
        return set()

    vals = df[chosen].dropna().astype(str).str.strip().str.replace(r'\.0+$', '', regex=True)
    return set(vals[vals != ''].tolist())


# --------------------------------------------
# Load record counts (for weighting)
# --------------------------------------------
# record_map: concept ID (str) -> record count (float), used as the weight of
# each concept in the weighted metrics. It stays empty when the counts file
# is missing or lacks the expected columns.
record_map = {}
if os.path.exists(CONCEPT_COUNTS_FILE):
    rc_df = pd.read_csv(CONCEPT_COUNTS_FILE, dtype=str)
    if {'conceptId', 'record_count'}.issubset(rc_df.columns):
        # Blank or non-numeric counts become 0.0 rather than NaN (or an
        # exception): a single NaN weight would turn every weighted sum that
        # includes that concept into NaN.
        rc_df['record_count'] = pd.to_numeric(rc_df['record_count'], errors='coerce').fillna(0.0)
        # IDs are trimmed so they compare equal to the trimmed IDs produced
        # when the concept sets are loaded.
        record_map = dict(zip(rc_df['conceptId'].astype(str).str.strip(), rc_df['record_count']))
    else:
        print(f"⚠️ {CONCEPT_COUNTS_FILE} lacks 'conceptId'/'record_count' columns; no record-count weights loaded.")
else:
    print(f"⚠️ {CONCEPT_COUNTS_FILE} not found; weighted metrics will use 0 weights.")


# -------------------------
# Metric helpers
# -------------------------
def weighted_PRF(arm_set, ref_set):
    """Record-count-weighted precision, recall and F1 of arm_set vs ref_set.

    Each concept contributes its weight from the module-level ``record_map``
    (0.0 when the concept has no entry there).

    Returns:
        (P, R, F1). P is NaN when the weighted TP+FP total is zero and R is
        NaN when the weighted TP+FN total is zero. F1 is NaN when either P or
        R is NaN, and 0.0 when both are defined but zero (no weighted
        overlap), so disjoint pairs count as 0 in later averages instead of
        being dropped as NaN.
    """
    arm, ref = set(arm_set), set(ref_set)

    def _weight(concepts):
        # Total record count of the given concepts.
        return sum(record_map.get(str(c), 0.0) for c in concepts)

    WTP = _weight(arm & ref)
    WFP = _weight(arm - ref)
    WFN = _weight(ref - arm)

    P = WTP / (WTP + WFP) if (WTP + WFP) else np.nan
    R = WTP / (WTP + WFN) if (WTP + WFN) else np.nan

    if np.isnan(P) or np.isnan(R):
        F1 = np.nan
    elif P + R == 0:
        F1 = 0.0
    else:
        F1 = 2 * P * R / (P + R)
    return P, R, F1

def unweighted_PRF(arm_set, ref_set):
    """Set-based precision, recall and F1 of arm_set against ref_set.

    Every concept counts once.

    Returns:
        (P, R, F1). P is NaN when arm_set is empty and R is NaN when ref_set
        is empty. F1 is NaN when either P or R is NaN, and 0.0 when both
        sets are non-empty but share nothing, so disjoint pairs count as 0
        in later averages instead of being dropped as NaN.
    """
    arm, ref = set(arm_set), set(ref_set)
    n_tp = len(arm & ref)
    n_fp = len(arm - ref)
    n_fn = len(ref - arm)

    P = n_tp / (n_tp + n_fp) if (n_tp + n_fp) else np.nan
    R = n_tp / (n_tp + n_fn) if (n_tp + n_fn) else np.nan

    if np.isnan(P) or np.isnan(R):
        F1 = np.nan
    elif P + R == 0:
        F1 = 0.0
    else:
        F1 = 2 * P * R / (P + R)
    return P, R, F1

def pairwise_f1_matrix(sets_dict, weighted=True):
    """Build a square DataFrame of F1 scores between every pair of sets.

    Args:
        sets_dict: mapping of label -> concept set; the labels become both
            the index and the columns of the result.
        weighted: when truthy the F1 comes from ``weighted_PRF``, otherwise
            from ``unweighted_PRF``.

    Each pair is scored once, in dictionary order (including each set with
    itself), and the value is written to both [a, b] and [b, a].
    """
    score = weighted_PRF if weighted else unweighted_PRF
    labels = list(sets_dict)
    mat = pd.DataFrame(index=labels, columns=labels, dtype=float)
    for first, second in itertools.combinations_with_replacement(labels, 2):
        f1 = score(sets_dict[first], sets_dict[second])[2]
        mat.loc[first, second] = f1
        mat.loc[second, first] = f1
    return mat


# -------------------------
# Identify disease folders (robustly)
# -------------------------
# ✅ Restrict to these diseases only
# Disease codes analysed, mapped to a short disease name. Folders tagged with
# any other [C##] code are ignored.
TARGET_DISEASES = {
    'C01': 'SLE',
    'C02': 'RheumatoidArthritis',
    'C03': 'DiabetesMacularEdema',
    'C04': 'DeepVeinThrombosis',
    'C06': 'Uveitis',
    'C07': 'SystemicSclerosis'
}

# ROOT_DIR is None when the ConceptSets folder could not be located. In that
# case continue with no folders so the "no matching disease folders" warning
# below is reached instead of a TypeError from os.path.join(None, ...).
# The listing is sorted because os.listdir returns entries in arbitrary
# order, which would otherwise make downstream processing order depend on
# the filesystem.
if ROOT_DIR and os.path.isdir(ROOT_DIR):
    raw_folders = sorted(
        f for f in os.listdir(ROOT_DIR) if os.path.isdir(os.path.join(ROOT_DIR, f))
    )
else:
    raw_folders = []

# Remove leading numbers and dashes like '0152-'
cleaned_folders = [strip_numeric_prefix(f) for f in raw_folders]

# ✅ Keep only valid concept set arms:
# - Must have a [C##] pattern
# - Must NOT contain "ONLINE" (case-insensitive)
disease_folders = [
    f for f in cleaned_folders
    if re.search(r'\[C\d+\]', f, re.IGNORECASE)
    and not re.search(r'ONLINE', f, re.IGNORECASE)
]

# print(f"\n🧩 Found {len(disease_folders)} disease-arm folders (filtered for non-ONLINE):")
# print("  Examples:", disease_folders[:10])

# ✅ Group folders by disease code (C01, C02, etc.)
# disease_groups: upper-case disease code -> list of RAW folder names (the
# on-disk names, prefix included), so they can be joined onto ROOT_DIR later.
from collections import defaultdict
disease_groups = defaultdict(list)

for raw, clean in zip(raw_folders, cleaned_folders):
    # skip ONLINE folders entirely
    if re.search(r'ONLINE', clean, re.IGNORECASE):
        continue
    m = re.search(r'\[(C\d+)\]', clean, re.IGNORECASE)
    if m:
        disease_code = m.group(1).upper()
        # Only include if it's part of TARGET_DISEASES
        if disease_code in TARGET_DISEASES:
            disease_groups[disease_code].append(raw)

# Diagnostic output
# print(f"\n🧬 Grouped into {len(disease_groups)} target disease codes:")
# for k, v in sorted(disease_groups.items()):
#     print(f"  {k} ({TARGET_DISEASES[k]}): {len(v)} folders → {v[:3]}")

# If no matching folders found, warn early
if not disease_groups:
    print("⚠️ No matching disease folders found for specified TARGET_DISEASES. Check folder names.")

# -------------------------
# Iterate diseases and compute Phase 1 stats
# -------------------------
# One summary dict per disease that has at least one sub-team arm and a final
# human arm; turned into a DataFrame and written to CSV after the loop.
phase1_summaries = []

for disease_code, disease_folders in sorted(disease_groups.items()):
    # print(f"\n🔍 === Processing {disease_code} ===")
    # Flat layout: every arm folder sits directly under ROOT_DIR.
    disease_dir = ROOT_DIR
    subfolders = disease_folders
    # print(f"  Found {len(subfolders)} arms → {subfolders[:5]}")

    # Classify arms
    subteams = {}        # folder name -> concept set, for sub-team arms
    final_human = None   # (folder name, concept set) of the final human arm
    ai_arms = {}         # folder name -> concept set, for AI arms
    for s in subfolders:
        csvp = os.path.join(disease_dir, s, 'includedConcepts.csv')
        if not os.path.exists(csvp):
            # print(f"  ⚠️ Missing includedConcepts.csv in {s}")
            continue
        # Order matters: the sub-team test runs before the final-human test.
        if is_subgroup(s):
            subteams[s] = load_included_concepts(csvp)
        elif is_final_human(s):
            # NOTE(review): if several folders qualify as final human, each
            # one overwrites the previous, so the last one iterated wins.
            final_human = (s, load_included_concepts(csvp))
        elif is_ai(s):
            ai_arms[s] = load_included_concepts(csvp)

    # print(f"  → #subteams={len(subteams)} | #AI={len(ai_arms)} | FinalHuman={bool(final_human)}")

    # Diseases without sub-teams or without a final human arm produce no row.
    if not subteams:
        # print(f"  ⚠️ No subteams detected for {disease_code}. Skipping.")
        continue
    if final_human is None:
        # print(f"  ⚠️ No final human arm found for {disease_code}. Skipping.")
        continue

    # Workflow sets
    # Keys are prefixed SUB_/FINAL_/AI_ so the arm type can be recovered from
    # the matrix labels below.
    workflow_sets = {}
    for k, sset in subteams.items():
        workflow_sets[f"SUB_{k}"] = sset
    # Always true at this point: the None case was skipped above.
    if final_human:
        workflow_sets[f"FINAL_{final_human[0]}"] = final_human[1]
    for k, aset in ai_arms.items():
        workflow_sets[f"AI_{k}"] = aset

    # Compute metrics
    # Pairwise F1 between all workflow sets, unweighted and weighted.
    sub_names = [n for n in workflow_sets.keys() if n.startswith('SUB_')]
    pw_unweighted = pairwise_f1_matrix(workflow_sets, weighted=False)
    pw_weighted = pairwise_f1_matrix(workflow_sets, weighted=True)

    # Mean F1 over all distinct sub-team pairs (NaN entries ignored); stays
    # NaN when there are fewer than two sub-teams.
    avg_pair_un = np.nan
    avg_pair_w = np.nan
    if len(sub_names) >= 2:
        pair_vals_un = [pw_unweighted.loc[a, b] for a, b in itertools.combinations(sub_names, 2)]
        pair_vals_w = [pw_weighted.loc[a, b] for a, b in itertools.combinations(sub_names, 2)]
        avg_pair_un = np.nanmean(pair_vals_un)
        avg_pair_w = np.nanmean(pair_vals_w)

    # Mean F1 between the final human set and each sub-team.
    final_name = next((n for n in workflow_sets.keys() if n.startswith('FINAL_')), None)
    mean_final_vs_sub_un = np.nan
    mean_final_vs_sub_w = np.nan
    if final_name:
        f_vs_sub_un = [pw_unweighted.loc[final_name, s] for s in sub_names]
        f_vs_sub_w = [pw_weighted.loc[final_name, s] for s in sub_names]
        mean_final_vs_sub_un = np.nanmean(f_vs_sub_un)
        mean_final_vs_sub_w = np.nanmean(f_vs_sub_w)

    # Consensus gain proxy = (mean final-vs-sub-team F1) - (mean sub-team
    # pairwise F1). It is NaN whenever either term is NaN, e.g. with a single
    # sub-team.
    consensus_gain_un = (mean_final_vs_sub_un - avg_pair_un) if not np.isnan(mean_final_vs_sub_un) else np.nan
    consensus_gain_w = (mean_final_vs_sub_w - avg_pair_w) if not np.isnan(mean_final_vs_sub_w) else np.nan

    # Save summary row
    row = {
        'Disease': disease_code,
        'N_subteams': len(sub_names),
        'N_ai_arms': sum(1 for n in workflow_sets if n.startswith('AI_')),
        # Distinct concepts across every workflow set of this disease.
        'Union_concepts': len(set().union(*workflow_sets.values())),
        'Avg_pairwise_sub_unweighted': avg_pair_un,
        'Avg_pairwise_sub_weighted': avg_pair_w,
        'Mean_final_vs_sub_unweighted': mean_final_vs_sub_un,
        'Mean_final_vs_sub_weighted': mean_final_vs_sub_w,
        'Consensus_Gain_proxy_unweighted': consensus_gain_un,
        'Consensus_Gain_proxy_weighted': consensus_gain_w
    }
    phase1_summaries.append(row)

    # print(f"  ✅ Added summary row for {disease_code}")

# -------------------------
# Save + verify
# -------------------------
# The CSV is written even when no disease produced a row; the warning below
# flags that case.
phase1_df = pd.DataFrame(phase1_summaries)
phase1_df.to_csv(os.path.join(OUT_DIR_PHASE1, "phase1_summary_across_diseases.csv"), index=False)
# print("\n📊 Saved Phase1 summary →", os.path.join(OUT_DIR_PHASE1, "phase1_summary_across_diseases.csv"))
# print("Columns:", phase1_df.columns.tolist())
# print("Shape:", phase1_df.shape)
# print(phase1_df.head())

if phase1_df.empty:
    print("⚠️ phase1_df is EMPTY — no valid subgroups or final human folders detected. Check folder naming pattern and includedConcepts.csv presence.")


### Phase 1 — Filtered Summary Table for Consensus Gain Analysis (FRD 6.2.2)
- This cell filters the Phase 1 summary to retain only the most relevant metrics for assessing human and AI subgroup variability.
- This table summarizes Phase 1 (Pre-Adjudication) human subgroup consistency and consensus improvement.
It compares how similar the individual human sub-teams were to each other and to the final reconciled human set, using pairwise F1 overlap scores (unweighted and record-count-weighted).
The Consensus Gain Proxy is the mean F1 between the final human set and each sub-team minus the mean pairwise F1 among the sub-teams — positive values mean the reconciled set agrees with the sub-teams more closely than the sub-teams agreed with each other.
- The selected columns are:  
  - **Disease:** Identifier for each clinical idea analyzed.  
  - **N_subteams:** Number of independent human subgroups.  
  - **N_ai_arms:** Number of AI workflows compared.  
  - **Union_concepts:** Total unique concepts across all workflows.  
  - **Unweighted Consensus Gain Proxy** → Measures improvement in agreement treating all concepts equally, regardless of how common or rare they are in clinical data.
→ Reflects pure conceptual overlap among sub-teams.
  - **Weighted Consensus Gain Proxy** → Gives more weight to clinically frequent (high-prevalence) concepts using record counts.
→ Reflects agreement improvement driven by important or common concepts, not rare ones. 


In [None]:
# Reload the Phase 1 summary CSV from OUT_DIR_PHASE1.
# NOTE(review): presumably fails or yields no columns when the summary was
# saved empty — confirm before relying on the column filter that follows.
df = pd.read_csv(os.path.join(OUT_DIR_PHASE1, "phase1_summary_across_diseases.csv"))
# df.head(10)

In [None]:
# Columns kept for the consensus-gain table: identifiers, arm/concept counts
# and the two consensus-gain proxies (the intermediate F1 means are dropped).
cols_to_keep = [
    'Disease',
    'N_subteams',
    'N_ai_arms',
    'Union_concepts',
    'Consensus_Gain_proxy_unweighted',
    'Consensus_Gain_proxy_weighted'
]

# Raises KeyError if any listed column is absent from df.
df_filtered = df[cols_to_keep]
# Final expression of the cell: shows up to the first 10 rows.
df_filtered.head(10)

In [None]:
# ==========================================================
# Phase A — Pre-Adjudication (AI vs Final Human)
# Combines statistical rigor + transparent numeric reporting
# ==========================================================

import os, re, itertools, warnings
import numpy as np, pandas as pd
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore", category=FutureWarning)

# --------------------------------------------
# Robust ROOT_DIR setup (auto-detect)
# --------------------------------------------
# Resolve the directory this code runs from: the file's own folder when
# __file__ is defined, otherwise (NameError) the current working directory.
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    SCRIPT_DIR = os.getcwd()

# Look for ConceptSets first one level up, then alongside SCRIPT_DIR;
# ROOT_DIR ends up None when neither exists.
ROOT_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, '..', 'ConceptSets'))
if not os.path.exists(ROOT_DIR):
    alt_root = os.path.join(SCRIPT_DIR, 'ConceptSets')
    if os.path.exists(alt_root):
        ROOT_DIR = os.path.abspath(alt_root)
    else:
        print("⚠️ ConceptSets folder not found.")
        ROOT_DIR = None

# Phase A results go under <SCRIPT_DIR>/results/phaseA_pre_adjudication
# (created if missing).
OUT_DIR_PHASEA = os.path.join(SCRIPT_DIR, 'results', 'phaseA_pre_adjudication')
os.makedirs(OUT_DIR_PHASEA, exist_ok=True)

# ----------------------------------------------------------
# Arm Detection Helpers
# ----------------------------------------------------------
def strip_numeric_prefix(name):
    """Drop a leading run of digits, dashes and underscores, then trim whitespace."""
    without_prefix = re.sub(r"^[\d\-\_]+", "", name)
    return without_prefix.strip()

def is_ai(name):
    """Return True when *name* carries an [AI#] tag (case-insensitive)."""
    return re.search(r"\[AI\d+\]", name, re.IGNORECASE) is not None

def is_final_human(name):
    """Return True when *name* has an [H#] tag with no [S#] tag anywhere after it.

    Matching is case-insensitive.
    """
    return re.search(r"\[H\d+\](?!.*\[S\d+\])", name, re.IGNORECASE) is not None

def clean_arm_name(folder_name):
    """Derive a readable arm label from a folder name.

    The first [AI#] or [H#] tag (case-insensitive) is upper-cased; an H tag
    becomes "HUMAN". A name with neither tag is returned unchanged.
    """
    match = re.search(r"\[(AI\d+|H\d+)\]", folder_name, re.IGNORECASE)
    if match is None:
        return folder_name
    label = match.group(1).upper()
    return "HUMAN" if label.startswith("H") else label

# ----------------------------------------------------------
# Load Concepts
# ----------------------------------------------------------
def load_included_concepts(file_path):
    """Read a concept-set CSV and return its concept IDs as a set of strings.

    The ID column is the header that normalises (lower-case, alphanumerics
    only) to ``conceptid``; failing that, the first header whose normalised
    form contains ``concept`` and ``id`` but not ``set`` (so a concept-SET
    identifier column is never mistaken for the concept IDs).

    Values are trimmed, a trailing ``.0`` is removed and blanks are dropped.
    An unreadable file (the error is printed) or a file without a matching
    column yields an empty set.
    """
    try:
        df = pd.read_csv(file_path, dtype=str)
    except Exception as e:
        print(f"[load_included_concepts] ❌ Error reading {file_path}: {e}")
        return set()

    # Map normalised header -> the label exactly as it appears in the frame,
    # so headers with stray whitespace can still be selected without KeyError.
    norm_map = {re.sub(r'[^a-z0-9]', '', str(c).lower()): c for c in df.columns}

    chosen = norm_map.get('conceptid')
    if chosen is None:
        for norm, orig in norm_map.items():
            if 'concept' in norm and 'id' in norm and 'set' not in norm:
                chosen = orig
                break
    if chosen is None:
        return set()

    vals = df[chosen].dropna().astype(str).str.strip().str.replace(r'\.0+$', '', regex=True)
    return set(vals[vals != ''])

# ----------------------------------------------------------
# Load record count metadata
# ----------------------------------------------------------
# record_map: concept ID (str) -> record count (float). It stays empty when
# ConceptRecordCounts.csv is absent from the working directory or lacks the
# expected columns.
record_map = {}
if os.path.exists("ConceptRecordCounts.csv"):
    rc = pd.read_csv("ConceptRecordCounts.csv", dtype=str)
    if {'conceptId', 'record_count'}.issubset(rc.columns):
        # Blank or non-numeric counts become 0.0 instead of NaN or an exception.
        rc['record_count'] = pd.to_numeric(rc['record_count'], errors='coerce').fillna(0.0)
        record_map = dict(zip(rc['conceptId'].astype(str), rc['record_count']))
    else:
        print("⚠️ ConceptRecordCounts.csv lacks 'conceptId'/'record_count' columns; no record counts loaded.")

# ----------------------------------------------------------
# Similarity Function
# ----------------------------------------------------------
def jaccard_index(a, b):
    """Jaccard similarity |a ∩ b| / |a ∪ b| of two sets; NaN when both are empty."""
    union_size = len(a | b)
    if not union_size:
        return np.nan
    return len(a & b) / union_size

# ----------------------------------------------------------
# Disease Analysis
# ----------------------------------------------------------
def analyze_disease(disease_code, disease_name, folders):
    """Compare AI arms vs Final Human for one disease (flat structure).

    Args:
        disease_code: disease identifier, used in labels and output paths.
        disease_name: disease name, appended to the code in labels and paths.
        folders: folder names (relative to ROOT_DIR) belonging to the disease.

    Side effects:
        Prints a short report and writes ``arm_summary.csv`` and
        ``jaccard_matrix.csv`` into ``OUT_DIR_PHASEA/<code>_<name>/``.

    Returns:
        A dict with the keys Disease, N_AI, Mean_AI_Human_Jaccard,
        Mean_SetSize and Mean_Jaccard_All, or None (after printing a
        warning) when fewer than two arms could be loaded.
    """
    # Arm label -> concept set.
    # NOTE(review): keys come from clean_arm_name, so folders that yield the
    # same label overwrite each other — confirm there is one folder per label.
    arm_data = {}

    # collect includedConcepts.csv for AI + Human
    for f in folders:
        if re.search(r'ONLINE', f, re.IGNORECASE):
            continue  # skip online
        subpath = os.path.join(ROOT_DIR, f, "includedConcepts.csv")
        if not os.path.exists(subpath):
            continue
        if is_ai(f) or is_final_human(f):
            arm_data[clean_arm_name(f)] = load_included_concepts(subpath)

    if len(arm_data) < 2:
        print(f"⚠️ Skipping {disease_code}_{disease_name}: not enough valid arms (AI + Human).")
        return None

    # Union and intersection over every loaded arm (at least two arms here,
    # so the conditional's else-branch is never taken).
    all_concepts = set().union(*arm_data.values())
    shared_all = set.intersection(*arm_data.values()) if len(arm_data) > 1 else set()

    print(f"\n=== Disease: {disease_code}_{disease_name} ===")
    print(f"Arms: {list(arm_data.keys())}")
    print(f"Total unique concepts: {len(all_concepts)}")
    print(f"Concepts common to all arms: {len(shared_all)}")

    # Arm-level numeric summary
    stats = []
    for arm, concepts in arm_data.items():
        # Everything any OTHER arm contains; concepts outside it are unique
        # to this arm.
        others_union = set.union(*(v for k, v in arm_data.items() if k != arm))
        stats.append({
            'Disease': f"{disease_code}_{disease_name}",
            'Arm': arm,
            'Concepts_Total': len(concepts),
            'Unique_to_Arm': len(concepts - others_union),
            'Shared_with_All': len(concepts & shared_all)
        })
    summary_df = pd.DataFrame(stats)

    # Pairwise Jaccard matrix
    arms = list(arm_data.keys())
    jmat = pd.DataFrame(np.nan, index=arms, columns=arms)
    for a, b in itertools.combinations_with_replacement(arms, 2):
        j = jaccard_index(arm_data[a], arm_data[b])
        jmat.loc[a, b] = jmat.loc[b, a] = j
    # Force self-similarity to 1.0, including for an empty arm whose
    # self-Jaccard would otherwise be NaN.
    np.fill_diagonal(jmat.values, 1.0)

    # Save results
    disease_out = os.path.join(OUT_DIR_PHASEA, f"{disease_code}_{disease_name}")
    os.makedirs(disease_out, exist_ok=True)
    summary_df.to_csv(os.path.join(disease_out, "arm_summary.csv"), index=False)
    jmat.to_csv(os.path.join(disease_out, "jaccard_matrix.csv"))

    # Mean Jaccard between each AI arm and the HUMAN arm; NaN when there is
    # no HUMAN arm or no AI arm.
    mean_ai_human = np.nan
    if "HUMAN" in arms:
        ai_arms = [a for a in arms if a.startswith("AI")]
        if ai_arms:
            mean_ai_human = np.nanmean([jmat.loc[a, "HUMAN"] for a in ai_arms])

    return {
        "Disease": f"{disease_code}_{disease_name}",
        "N_AI": sum(1 for a in arms if a.startswith("AI")),
        "Mean_AI_Human_Jaccard": mean_ai_human,
        "Mean_SetSize": np.mean([len(s) for s in arm_data.values()]),
        # NOTE(review): averages EVERY matrix cell, i.e. the 1.0 diagonal and
        # both mirrored copies of each pair — confirm that an off-diagonal
        # mean is not what was intended.
        "Mean_Jaccard_All": np.nanmean(jmat.values)
    }

# ----------------------------------------------------------
# Run analysis
# ----------------------------------------------------------
# Analyse each disease in code order and keep the summaries that were
# produced (diseases skipped by analyze_disease return a falsy result).
all_summaries = []
for disease_code in sorted(disease_groups):
    folders = disease_groups[disease_code]
    res = analyze_disease(disease_code, TARGET_DISEASES[disease_code], folders)
    if not res:
        continue
    all_summaries.append(res)

# ----------------------------------------------------------
# Save combined summary
# ----------------------------------------------------------
summary_csv = os.path.join(OUT_DIR_PHASEA, "phaseA_summary.csv")
summary_df = pd.DataFrame(all_summaries)
summary_df.to_csv(summary_csv, index=False)

# print("\n📊 Phase A summary saved →", summary_csv)
# print("Columns:", summary_df.columns.tolist())
# print("Shape:", summary_df.shape)
# print(summary_df.head())


In [None]:
# # ------------------------------
# # 5️⃣ Summary Bar Chart: Union vs Intersection across all arms (AI + Human)
# # ------------------------------
# import pandas as pd

# # print("\n📊 Calculating union and intersection concept counts across all arms (AI + Human)...\n")

# summary_records = []

# for disease, arm_data in all_arm_data.items():
#     # Compute union and intersection
#     all_union = set().union(*arm_data.values())
#     all_intersection = set.intersection(*arm_data.values())

#     summary_records.append({
#         "Disease": disease,
#         "Arm_Count": len(arm_data),
#         "Union_Concepts": len(all_union),
#         "Intersection_Concepts": len(all_intersection)
#     })

# summary_df = pd.DataFrame(summary_records)
# summary_df.to_csv("union_intersection_summary.csv", index=False)

# print("✅ Saved 'union_intersection_summary.csv'")


In [None]:
# ==========================================================
# Phase A — AI vs Final Human (Direct Overlap Statistics)
# ==========================================================


# Output folder for the direct AI-vs-Human overlap tables (created if missing).
OUT_DIR = os.path.join(SCRIPT_DIR, 'results', 'phaseA_AI_vs_Human')
os.makedirs(OUT_DIR, exist_ok=True)


# ----------------------------------------------------------
# Compute AI vs Final Human overlap
# ----------------------------------------------------------
# rows: one record per disease comparing the union of all AI arms with the
#       final human set.
# concept_summary_rows: one record per (disease, arm) with total, unique and
#       overlapping concept counts.
rows = []
concept_summary_rows = []

for disease_code, folders in sorted(disease_groups.items()):
    disease_label = f"{disease_code}_{TARGET_DISEASES[disease_code]}"

    # Load every AI / final-human arm exactly once, keyed by folder name.
    # Both the disease-level summary and the per-arm breakdown reuse these
    # sets, so no CSV is read again for each arm-vs-others comparison.
    arm_sets = {}
    for f in folders:
        if not (is_ai(f) or is_final_human(f)):
            continue
        fpath = os.path.join(ROOT_DIR, f, "includedConcepts.csv")
        if os.path.exists(fpath):
            arm_sets[f] = load_included_concepts(fpath)

    ai_sets = [concepts for folder, concepts in arm_sets.items() if is_ai(folder)]
    human_set = set()
    for folder, concepts in arm_sets.items():
        if not is_ai(folder):
            # When several final-human folders exist, the last one wins.
            human_set = concepts

    if not ai_sets or not human_set:
        print(f"⚠️ Skipping {disease_code}_{TARGET_DISEASES[disease_code]}: Missing AI or Human data.")
        continue

    ai_union = set().union(*ai_sets)
    shared = ai_union & human_set
    union_all = ai_union | human_set
    jacc = jaccard_index(ai_union, human_set)

    # print(f"\n=== Disease: {disease_code}_{TARGET_DISEASES[disease_code]} ===")
    # print(f"AI_Total={len(ai_union)}, Human_Total={len(human_set)}, Shared={len(shared)}, Union={len(union_all)}, Jaccard={round(jacc,4)}")

    # Append summary row
    rows.append({
        "Disease": disease_label,
        "AI_Total": len(ai_union),
        "Human_Total": len(human_set),
        "Shared": len(shared),
        "Union": len(union_all),
        "Jaccard": round(jacc, 4)
    })

    # --- Per-arm breakdown ---
    for f, sset in arm_sets.items():
        # Case-insensitive, matching the is_ai / is_final_human detectors, so
        # a lower-case tag such as "[ai1]" still yields the label "AI1".
        tag = re.search(r"\[(AI\d+|H\d+)\]", f, re.IGNORECASE)
        arm_name = tag.group(1).upper() if tag else f
        if arm_name.startswith("H"):
            arm_name = "HUMAN"
        # Union of every other loaded arm of this disease (empty if none).
        others_union = set().union(
            *(concepts for other, concepts in arm_sets.items() if other != f)
        )
        unique_to_arm = sset - others_union
        concept_summary_rows.append({
            "Disease": disease_label,
            "Arm": arm_name,
            "Concepts_Total": len(sset),
            "Unique_to_Arm": len(unique_to_arm),
            "Overlap_with_All": len(sset & others_union)
        })

# ----------------------------------------------------------
# Save outputs
# ----------------------------------------------------------
summary_df = pd.DataFrame(rows)
concept_df = pd.DataFrame(concept_summary_rows)

summary_csv = os.path.join(OUT_DIR, "phaseA_AI_vs_Human.csv")
concept_csv = os.path.join(OUT_DIR, "phaseA_concept_summary.csv")

summary_df.to_csv(summary_csv, index=False)
concept_df.to_csv(concept_csv, index=False)

# print("\n📊 Phase A summary saved →", summary_csv)
# print("📄 Per-arm concept summary saved →", concept_csv)
# print("Shape:", summary_df.shape)
# print(summary_df.head())


### Phase A — Filtered AI–Human Overlap Metrics (FRD 6.1.4)
- This cell extracts only the **core overlap statistics** from the combined Phase A summary for focused reporting and visualization.
- It loads concept lists from all [AI#] and [H#] arms, computes Jaccard similarity, and reports total, shared, and unique concept counts, as well as per-arm breakdowns.
- The selected columns include:  
  - **Disease:** Identifier of the analyzed clinical condition.  
  - **AI_Total / Human_Total:** Total number of included concepts per arm.  
  - **Shared / Union:** Count of overlapping and total combined concepts.  
  - **Jaccard:** Proportion of shared concepts relative to the union (AI–Human similarity).  


In [None]:
summary_df

- This cell compiles per-arm concept statistics from all AI and Human workflows into a single structured table.  
- It records, for each disease and arm, the **total concept count**, **unique concepts specific to that arm**, and **overlap with all other arms**.  
- The resulting file **`concept_comparison_summary.csv`** (saved in `results/phaseA_AI_vs_Human/`) provides granular transparency on individual arm contributions and supports reproducibility of overlap metrics.  


In [None]:
# Rebuild the per-arm table from concept_summary_rows and write it to
# OUT_DIR as concept_comparison_summary.csv.
concept_df = pd.DataFrame(concept_summary_rows)
out_concept = os.path.join(OUT_DIR, "concept_comparison_summary.csv")
concept_df.to_csv(out_concept, index=False)
# print("✅ Saved:", out_concept)


In [None]:
# concept_df

<!-- ### Phase A — Visualization of Concept Counts and Overlaps (FRD 6.1.4)
- This cell generates **bar chart visualizations** showing total, unique, and overlapping concept counts per arm (AI vs Human) for each disease.  
- Using Seaborn and Matplotlib, it standardizes color-coded metrics:
  - **Blue:** Total concepts per arm  
  - **Green:** Concepts unique to that arm  
  - **Red:** Overlap with all other arms  
- The resulting charts (one per disease) visually summarize inter-arm differences in concept inclusion, supporting transparency and reproducibility in AI–Human comparison outcomes.  
 -->

 <!-- 
 **Phase A — Visualization of Concept Counts and Overlaps (FRD 6.1.4)**

    This cell generates bar chart visualizations showing total, unique, and overlapping concept counts per arm (AI vs Human) for each disease.
    color-coded metrics:
        Blue: Total concepts per arm
        Green: Concepts unique to that arm
        Red: Overlap with all other arms
    The resulting charts (one per disease) visually summarize inter-arm differences in concept inclusion, supporting transparency and reproducibility in AI–Human comparison outcomes. 

 -->

In [None]:
# # ==========================================================
# # Visualization: Concept Counts and Overlaps per Arm
# # ==========================================================
# import seaborn as sns
# import matplotlib.pyplot as plt

# concept_summary = concept_df.copy()

# # Set Seaborn theme
# sns.set_theme(style="whitegrid", context="talk")

# # Define order and colors
# arm_order = ["HUMAN", "AI1", "AI2", "AI3", "AI4"]
# metric_palette = {
#     "Concepts_Total": "#4C72B0",       # Blue - total concepts
#     "Unique_to_Arm": "#55A868",        # Green - unique concepts
#     "Overlap_with_All": "#C44E52"      # Red - overlap
# }

# # Generate per-disease bar charts
# for disease in concept_summary["Disease"].unique():
#     sub = concept_summary[concept_summary["Disease"] == disease].copy()

#     # Keep consistent arm order
#     sub["Arm"] = pd.Categorical(
#         sub["Arm"],
#         categories=[t for t in arm_order if t in sub["Arm"].values] +
#                    [t for t in sub["Arm"].values if t not in arm_order],
#         ordered=True
#     )

#     sub_melt = sub.melt(
#         id_vars=["Arm"],
#         value_vars=["Concepts_Total", "Unique_to_Arm", "Overlap_with_All"],
#         var_name="Metric", value_name="Count"
#     )

#     plt.figure(figsize=(9, 5))
#     ax = sns.barplot(
#         data=sub_melt,
#         x="Arm", y="Count", hue="Metric",
#         hue_order=["Concepts_Total", "Unique_to_Arm", "Overlap_with_All"],
#         order=sub["Arm"].cat.categories,
#         palette=metric_palette,
#         edgecolor="black", linewidth=0.6
#     )

#     for container in ax.containers:
#         ax.bar_label(container, fmt='%d', label_type='edge', fontsize=9, padding=3)

#     plt.title(f"{disease}: Concept Counts and Overlaps per Arm", fontsize=14, weight='bold', pad=15)
#     plt.xlabel("Arm", fontsize=12)
#     plt.ylabel("Number of Concepts", fontsize=12)
#     plt.tick_params(axis='x', labelsize=12)
#     plt.tick_params(axis='y', labelsize=12)
#     plt.legend(
#         title="Metric", title_fontsize=11, fontsize=10,
#         frameon=True, facecolor='white', edgecolor='gray'
#     )

#     plt.ylim(0, sub["Concepts_Total"].max() * 1.3)
#     sns.despine(left=True, bottom=True)
#     plt.grid(axis='y', color='gray', linestyle='--', linewidth=0.4, alpha=0.6)
#     plt.tight_layout()
#     # plt.show()


<!-- ### Phase A — Pairwise & AI–Human Jaccard Analysis (FRD 6.1.3, 6.1.4)
- This cell computes **pairwise Jaccard similarity** between all arms within each disease to quantify set overlap between subteams, AI, and final human arms.  
- It also calculates **AI-vs-Human union, shared concepts, and Jaccard** for each disease, providing a direct pre-adjudication comparison.  
- Output CSVs:
  - `concept_armwise_metrics_summary.csv` → pairwise Jaccard statistics for all arm combinations.  
  - `concept_AI_vs_Human_metrics_summary.csv` → AI vs final human summary metrics including total concepts, shared, union, and Jaccard.  
 -->

In [None]:
# ==========================================================
# Phase A Extended — Pairwise Jaccard Across All Arms
# (Includes AI–Human overlap, AI–AI, Human–Human, etc.)
# ==========================================================

import os, re
import pandas as pd
from itertools import combinations
import numpy as np
from collections import defaultdict

# --------------------------------------------
# Analyze all diseases
# --------------------------------------------
# Column layouts are pinned up front so both result tables (and the CSVs
# written from them) keep their headers even when no disease produces a row.
# Without this, an empty run yields a column-less DataFrame and any later
# access such as pairwise_df["Disease"] fails with a KeyError.
pairwise_columns = ["Disease", "Arm1", "Arm2", "Shared", "Union", "Jaccard"]
ai_vs_human_columns = ["Disease", "AI_Total", "Human_Total", "Shared", "Union", "Jaccard"]

pairwise_stats = []
ai_vs_human_stats = []

for disease_code, folders in sorted(disease_groups.items()):
    disease_name = TARGET_DISEASES[disease_code]
    disease_label = f"{disease_code}_{disease_name}"

    # Map each cleaned arm label to the concepts loaded from that arm's folder.
    # NOTE(review): the set operators below assume load_included_concepts()
    # returns a set — confirm against its definition.
    arm_data = {}
    for f in folders:
        path = os.path.join(ROOT_DIR, f, "includedConcepts.csv")
        if not os.path.exists(path):
            continue  # this arm folder has no includedConcepts.csv
        arm_label = clean_arm_name(f)
        arm_data[arm_label] = load_included_concepts(path)

    # A pairwise comparison needs at least two arms.
    if len(arm_data) < 2:
        print(f"⚠️ Skipping {disease_label}: not enough arms.")
        continue

    # --------------------------
    # Pairwise Jaccard similarity
    # --------------------------
    for (arm1, set1), (arm2, set2) in combinations(arm_data.items(), 2):
        inter = len(set1 & set2)
        union = len(set1 | set2)
        # Two empty sets have an empty union; report 0 instead of dividing by zero.
        jaccard = inter / union if union else 0
        pairwise_stats.append({
            "Disease": disease_label,
            "Arm1": arm1,
            "Arm2": arm2,
            "Shared": inter,
            "Union": union,
            "Jaccard": round(jaccard, 4)
        })

    # --------------------------
    # AI vs Human summary
    # --------------------------
    # Arms whose label starts with "AI" are pooled; the human side is the
    # arm labelled exactly "HUMAN".
    ai_arms = [concepts for arm, concepts in arm_data.items() if arm.startswith("AI")]
    human_arms = [concepts for arm, concepts in arm_data.items() if arm == "HUMAN"]

    if ai_arms and human_arms:
        ai_union = set().union(*ai_arms)
        human_union = set().union(*human_arms)
        shared = len(ai_union & human_union)
        union_all = len(ai_union | human_union)
        jaccard = shared / union_all if union_all else 0
        ai_vs_human_stats.append({
            "Disease": disease_label,
            "AI_Total": len(ai_union),
            "Human_Total": len(human_union),
            "Shared": shared,
            "Union": union_all,
            "Jaccard": round(jaccard, 4)
        })

# --------------------------------------------
# Save results
# --------------------------------------------
pairwise_df = pd.DataFrame(pairwise_stats, columns=pairwise_columns)
ai_vs_human_df = pd.DataFrame(ai_vs_human_stats, columns=ai_vs_human_columns)

# Make sure the output folder exists before writing into it.
os.makedirs(OUT_DIR, exist_ok=True)
pairwise_csv = os.path.join(OUT_DIR, "concept_armwise_metrics_summary.csv")
aih_csv = os.path.join(OUT_DIR, "concept_AI_vs_Human_metrics_summary.csv")

pairwise_df.to_csv(pairwise_csv, index=False)
ai_vs_human_df.to_csv(aih_csv, index=False)

# print("\n✅ Results generated successfully.")
# print(f"Pairwise comparisons: {len(pairwise_df)} → {pairwise_csv}")
# print(f"AI vs Human summaries: {len(ai_vs_human_df)} → {aih_csv}")
# print(ai_vs_human_df.head())


### Phase A — Pairwise Jaccard Similarity Heatmaps

- **Purpose:** Visualize pairwise overlap (Jaccard similarity) between all arms for each disease.
- **Inputs:** `pairwise_df` containing all Arm1–Arm2 Jaccard values per disease.
- **Process:**
  1. Identify all arms in the disease.
  2. Build a square matrix of Jaccard values (diagonal = 1.0).
  3. Fill the matrix using pairwise Jaccard data.
  4. Plot a heatmap using Seaborn (`Blues` colormap, 0–1 scale).
- **Visualization Notes:**
  - Darker blue indicates higher overlap.
  - Numeric values annotated on the heatmap.
- **Outcome:** Quick visual inspection of agreement or divergence among AI, human, and subteam arms.


In [None]:
# ------------------------------
# Pairwise Jaccard Similarity Heatmaps
# ------------------------------
# For each disease, create a heatmap showing Jaccard similarity
# between all pairs of arms.

print("📊 Pairwise Jaccard Similarity Heatmaps: Shows Jaccard similarity (0–1) between all pairs of arms for each disease. Darker blue = higher overlap.\n")

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# One seaborn theme for every heatmap drawn below
sns.set_theme(style="whitegrid", context="talk")

# Styling shared by all heatmaps: annotated cells on a fixed 0–1 "Blues" scale
heatmap_style = dict(
    annot=True,
    annot_kws={"size": 11},  # smaller font for numbers
    cmap="Blues",
    vmin=0,
    vmax=1,
)

for disease in pairwise_df['Disease'].unique():
    # Rows of the pairwise table that belong to this disease
    disease_pairs = pairwise_df.loc[pairwise_df['Disease'] == disease]

    # Every arm that appears on either side of a comparison, in sorted order
    arm_labels = sorted(set(disease_pairs['Arm1']).union(disease_pairs['Arm2']))

    # Start from all ones (an arm compared with itself), then mirror each
    # recorded Jaccard value into both triangles of the square matrix
    similarity = pd.DataFrame(1.0, index=arm_labels, columns=arm_labels)
    for left, right, score in zip(
        disease_pairs['Arm1'], disease_pairs['Arm2'], disease_pairs['Jaccard']
    ):
        similarity.loc[left, right] = score
        similarity.loc[right, left] = score

    # ------------------------------
    # Draw the heatmap for this disease
    # ------------------------------
    plt.figure(figsize=(6, 5))
    sns.heatmap(similarity, **heatmap_style)

    plt.title(f"Pairwise Jaccard Similarity — {disease}", fontsize=10)
    plt.xticks(fontsize=10, rotation=45, ha='right')
    plt.yticks(fontsize=10)
    plt.tight_layout()
    plt.show()


<!-- ### Phase A — Upset Plot Visualization of Concept Intersections (FRD 6.1.4–6.1.5)
- This cell generates **Upset plots** to visualize intersections between concept sets across all AI and Human workflows for each disease.  
- Each plot shows how concepts are shared or unique among workflows, highlighting methodological overlap and outliers before gold standard adjudication.  
- The outputs, saved under `results/phaseA_AI_vs_Human/upset_plots/`, are per-disease PNG files (e.g., `C01_upset_plot.png`) that provide an intuitive view of workflow intersections and uniqueness.  
 -->

In [None]:
# # ==========================================================
# # Upset Plot — Concept Set Intersections Across Workflows
# # ==========================================================
# print("\n📊 Generating Upset plots (concept intersections across workflows)...\n")

# from upsetplot import UpSet, from_memberships

# UPSET_DIR = os.path.join(OUT_DIR, "upset_plots")
# os.makedirs(UPSET_DIR, exist_ok=True)

# for disease in concept_df["Disease"].unique():
#     disease_dir = os.path.join(ROOT_DIR, disease)
#     wf_sets = {}

#     # Load all AI + HUMAN sets
#     for sub in os.listdir(disease_dir):
#         subpath = os.path.join(disease_dir, sub, "includedConcepts.csv")
#         if not os.path.exists(subpath):
#             continue
#         if is_ai(sub) or is_final_human(sub):
#             arm = re.search(r"\[(AI\d+|H\d+)\]", sub)
#             arm_name = arm.group(1).upper() if arm else sub
#             if arm_name.startswith("H"):
#                 arm_name = "HUMAN"
#             wf_sets[arm_name] = load_included_concepts(subpath)

#     if not wf_sets:
#         continue

#     # Build membership list for upset plot
#     wf_cols = list(wf_sets.keys())
#     all_concepts = sorted(set().union(*wf_sets.values()))
#     memberships = [tuple(np.array(wf_cols)[[cid in wf_sets[w] for w in wf_cols]])
#                    for cid in all_concepts]

#     data = from_memberships(memberships)
#     fig = plt.figure(figsize=(9, 5))
#     UpSet(data, show_counts=True, subset_size='count').plot(fig=fig)
#     plt.suptitle(f"{disease}: Concept Set Intersections", fontsize=13, weight='bold', y=1.02)
#     fig.subplots_adjust(top=0.9, bottom=0.18, left=0.18, right=0.95)
#     out_path = os.path.join(UPSET_DIR, f"{disease}_upset_plot.png")
#     plt.savefig(out_path, dpi=300, bbox_inches="tight")
#     # plt.show()

#     # print(f"✅ Saved Upset plot for {disease} → {out_path}")


<!-- ### Phase A — Concept Agreement & Clinical Context Visualizations (FRD 6.1.5)
- This cell analyzes **how consistently concepts are selected across AI and Human workflows** by computing a Match Score for each concept (number of workflows including it).  
- Two visualizations are generated per disease:  
  1. **Histogram of Match Scores** — shows distribution of agreement levels across workflows.  
  2. **Scatter Plot (Match Score vs Concept Prevalence)** — highlights clinically significant discrepancies (high prevalence concepts with low agreement).  
- Outputs are saved in `results/phaseA_AI_vs_Human/concept_agreement/` as PNG files, providing insight into workflow consistency and clinical relevance before gold standard adjudication.  
 -->

In [None]:
# # ==========================================================
# # Concept Agreement and Clinical Context Analysis (Refined)
# # ==========================================================
# print("\n📊 Generating Concept Agreement & Clinical Context visualizations...\n")

# import os
# import re
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt

# # ----------------------------------------------------------
# # Setup
# # ----------------------------------------------------------
# AGREEMENT_DIR = os.path.join(OUT_DIR, "concept_agreement")
# os.makedirs(AGREEMENT_DIR, exist_ok=True)

# # ----------------------------------------------------------
# # Load record counts (for prevalence axis)
# # ----------------------------------------------------------
# record_map = {}
# record_count_file = os.path.join(ROOT_DIR, "ConceptRecordCounts.csv")

# if os.path.exists(record_count_file):
#     rc = pd.read_csv(record_count_file, dtype=str)
#     rc['conceptId'] = rc['conceptId'].astype(str).str.strip().str.replace(r'\.0+$', '', regex=True)
#     rc['record_count'] = pd.to_numeric(rc['record_count'], errors='coerce').fillna(0.0)
#     record_map = dict(zip(rc['conceptId'], rc['record_count']))
#     print(f"✅ Loaded {len(record_map):,} concept record counts for prevalence mapping.")
# else:
#     print("⚠️ ConceptRecordCounts.csv not found — skipping prevalence axis.")

# # ----------------------------------------------------------
# # Loop through diseases
# # ----------------------------------------------------------
# for disease in concept_df["Disease"].unique():
#     disease_dir = os.path.join(ROOT_DIR, disease)
#     wf_sets = {}

#     # Collect concept sets for AI and Human workflows
#     for sub in os.listdir(disease_dir):
#         subpath = os.path.join(disease_dir, sub, "includedConcepts.csv")
#         if not os.path.exists(subpath):
#             continue
#         if is_ai(sub) or is_final_human(sub):
#             arm = re.search(r"\[(AI\d+|H\d+)\]", sub)
#             arm_name = arm.group(1).upper() if arm else sub
#             if arm_name.startswith("H"):
#                 arm_name = "HUMAN"
#             wf_sets[arm_name] = load_included_concepts(subpath)

#     if not wf_sets:
#         continue

#     # ------------------------------------------------------
#     # Compute Match Score (agreement level)
#     # ------------------------------------------------------
#     all_concepts = sorted(set().union(*wf_sets.values()))
#     match_score = {cid: sum(1 for s in wf_sets.values() if cid in s) for cid in all_concepts}

#     df = pd.DataFrame({
#         "conceptId": all_concepts,
#         "MatchScore": [match_score[cid] for cid in all_concepts],
#         "record_count": [
#             record_map.get(str(cid).strip().replace(".0", ""), np.nan)
#             for cid in all_concepts
#         ]
#     })

#     # ------------------------------------------------------
#     # Histogram of Match Scores (Agreement distribution)
#     # ------------------------------------------------------
#     plt.figure(figsize=(5.5, 3.8))
#     plt.hist(df["MatchScore"], bins=range(1, len(wf_sets) + 2), edgecolor="black", color="#4C72B0")
#     plt.xlabel("Match Score (# of Workflows)", fontsize=11)
#     plt.ylabel("Number of Concepts", fontsize=11)
#     plt.title(f"{disease}: Match Score Distribution", fontsize=12, weight="bold", pad=8)
#     plt.grid(axis='y', linestyle='--', alpha=0.6)
#     plt.tight_layout()
#     hist_path = os.path.join(AGREEMENT_DIR, f"{disease}_matchscore_histogram.png")
#     plt.savefig(hist_path, dpi=300, bbox_inches="tight")
#     plt.show()

#     # ------------------------------------------------------
#     # Scatter: Match Score vs Concept Prevalence
#     # ------------------------------------------------------
#     if record_map:
#         plt.figure(figsize=(6.2, 4.5))
#         plt.scatter(
#             df["record_count"], df["MatchScore"],
#             alpha=0.6, s=35, edgecolor='k', linewidth=0.4, color="#55A868"
#         )
#         plt.xscale("log")
#         plt.xlabel("Concept Prevalence (Record Count, log scale)", fontsize=11)
#         plt.ylabel("Match Score (# Workflows including concept)", fontsize=11)
#         plt.title(f"{disease}: Agreement vs Clinical Prevalence", fontsize=12, weight="bold", pad=8)
#         plt.grid(alpha=0.4, linestyle='--')
#         plt.tight_layout()
#         scatter_path = os.path.join(AGREEMENT_DIR, f"{disease}_agreement_vs_prevalence.png")
#         plt.savefig(scatter_path, dpi=300, bbox_inches="tight")
#         plt.show()

# # print("\n✅ Concept Agreement & Clinical Context visualizations generated in:", AGREEMENT_DIR)


<!-- ### Phase A — AI Arms Union vs Intersection Summary (FRD 6.1.4–6.1.5)
- This cell calculates the **union and intersection** of concept sets across all AI arms for each disease to quantify agreement and variability among AI workflows.  
- The results are saved in `AI_arms_union_intersection_summary.csv`, containing per-disease counts for total concepts (union) and shared concepts (intersection).  
- A bar chart visualizes these metrics, comparing **agreement vs diversity** across AI arms and highlighting areas of high or low overlap before adjudication.  
 -->

In [None]:
# # ------------------------------
# # 5️⃣ Summary Bar Chart: Union vs Intersection across AI arms
# # ------------------------------
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd
# from itertools import combinations

# print("\n📊 Calculating union and intersection concept counts across AI arms...\n")

# summary_ai = []

# for disease, arm_data in all_arm_data.items():
#     # Filter only AI arms
#     ai_arms = {arm: concepts for arm, concepts in arm_data.items() if arm.lower().startswith("ai")}
#     if len(ai_arms) < 2:
#         continue  # skip if less than 2 AI arms

#     # Compute union and intersection
#     union_set = set().union(*ai_arms.values())
#     intersection_set = set.intersection(*ai_arms.values()) if len(ai_arms) > 1 else set()

#     summary_ai.append({
#         "Disease": disease,
#         "AI_Arm_Count": len(ai_arms),
#         "Union_Concepts": len(union_set),
#         "Intersection_Concepts": len(intersection_set)
#     })

# summary_df = pd.DataFrame(summary_ai)
# summary_df.to_csv("AI_arms_union_intersection_summary.csv", index=False)

# #print("✅ Saved 'AI_arms_union_intersection_summary.csv'")


# # ------------------------------
# # Visualization
# # ------------------------------
# sns.set_theme(style="whitegrid", context="talk")

# plt.figure(figsize=(12, 6))
# summary_melted = summary_df.melt(id_vars="Disease",
#                                  value_vars=["Union_Concepts", "Intersection_Concepts"],
#                                  var_name="Metric",
#                                  value_name="Count")

# sns.barplot(data=summary_melted, x="Disease", y="Count", hue="Metric")
# plt.title("Union vs Intersection of Concept IDs across AI Arms per Disease")
# plt.xlabel("Disease")
# plt.ylabel("Number of Concept IDs")
# plt.xticks(rotation=45, ha="right")
# plt.tight_layout()
# plt.show()


### Phase A — Union vs Intersection Summary Across AI and Human Arms (FRD 6.1.4–6.1.5)
- This cell calculates the **union and intersection of concept sets** across all AI arms and the final Human arm for each disease, highlighting agreement and variability among workflows.  
- The results are saved in **`union_intersection_summary.csv`** (under `results/phaseA_union_intersection/`), containing per-disease values for:
  - **Arm_Count:** Number of arms included.  
  - **Union_Concepts:** Total unique concepts across the included arms.  
  - **Intersection_Concepts:** Concepts shared by all included arms (reported as 0 when only one arm is available).  
  - **Intersection_to_Union_Ratio:** Intersection size divided by union size.  
- The stacked bar charts later in the notebook visualize these metrics, comparing total vs shared concepts per disease to easily identify where the workflows converge or diverge.  


In [None]:
# ==========================================================
# Phase A — Union vs Intersection Summary (AI + Human Arms)
# ==========================================================
import os, re, pandas as pd
from collections import defaultdict


# NOTE(review): this rebinds the notebook-wide OUT_DIR, so any cell executed
# after this one writes under phaseA_union_intersection — confirm that is intended.
OUT_DIR = os.path.join(SCRIPT_DIR, 'results', 'phaseA_union_intersection')
os.makedirs(OUT_DIR, exist_ok=True)


# --------------------------------------------
# Compute union vs intersection
# --------------------------------------------
# The column layout is pinned so the summary table (and its CSV) keeps its
# headers even when no disease produces a row; otherwise later cells that
# sort or index by "Union_Concepts" fail with a KeyError.
summary_columns = [
    "Disease",
    "Arm_Count",
    "Union_Concepts",
    "Intersection_Concepts",
    "Intersection_to_Union_Ratio",
]
summary_records = []

for disease_code, folders in sorted(disease_groups.items()):
    disease_name = TARGET_DISEASES[disease_code]
    disease_label = f"{disease_code}_{disease_name}"

    # Keep only the arms accepted by is_ai() / is_final_human() that actually
    # ship an includedConcepts.csv, keyed by their cleaned arm label.
    arm_data = {}
    for f in folders:
        fpath = os.path.join(ROOT_DIR, f, "includedConcepts.csv")
        if not os.path.exists(fpath):
            continue
        if is_ai(f) or is_final_human(f):
            arm_data[clean_arm_name(f)] = load_included_concepts(fpath)

    if not arm_data:
        print(f"⚠️ Skipping {disease_label}: no valid arms found.")
        continue

    all_union = set().union(*arm_data.values())
    # With a single arm there is nothing to agree with, so the intersection is
    # deliberately reported as empty rather than as the arm itself.
    all_intersection = set.intersection(*arm_data.values()) if len(arm_data) > 1 else set()

    summary_records.append({
        "Disease": disease_label,
        "Arm_Count": len(arm_data),
        "Union_Concepts": len(all_union),
        "Intersection_Concepts": len(all_intersection),
        # Guard against an empty union (every arm empty) to avoid dividing by zero.
        "Intersection_to_Union_Ratio": round(len(all_intersection) / len(all_union), 4) if all_union else 0.0
    })

# --------------------------------------------
# Save summary table
# --------------------------------------------
summary_df = pd.DataFrame(summary_records, columns=summary_columns)
out_csv = os.path.join(OUT_DIR, "union_intersection_summary.csv")
summary_df.to_csv(out_csv, index=False)

# print("\n✅ Saved union/intersection summary →", out_csv)
# print("Columns:", summary_df.columns.tolist())
# print("Shape:", summary_df.shape)
# print(summary_df.head())


In [None]:
# Show the union/intersection summary table as this cell's output.
summary_df

### Phase A — Horizontal Stacked Bars: AI Union vs Intersection (FRD 6.1.4)
- This cell visualizes, for each disease, how the **union of concepts** in `summary_df` splits into the concepts shared by every arm (intersection) and the remaining concepts (union − intersection, labelled "AI Only" in the charts).  
- Two plots are generated, each with one bar per disease:  
  1. **Percentage stacked bars:** normalized view showing the proportion of shared vs non-shared concepts.  
  2. **Absolute numbers stacked bars:** actual counts of concepts in the shared and non-shared categories.  
- These plots help **quickly identify agreement vs divergence** between AI outputs and Human references prior to adjudication.  


In [None]:
# ------------------------------
# 6️⃣ Horizontal Stacked Bars: Union vs Intersection of AI Concepts
# ------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

sns.set_theme(style="whitegrid", context="talk")


def _plot_union_intersection_bars(plot_data, title, xlabel):
    """Draw one horizontal stacked bar chart with a bar per row of ``plot_data``.

    ``plot_data`` is indexed by disease and holds two columns, in order:
    the intersection share and the union-minus-intersection share.

    The axes are created explicitly and handed to pandas via ``ax=``. When no
    ``ax`` is passed, ``DataFrame.plot`` opens its own figure, so a preceding
    ``plt.figure(figsize=...)`` is left empty and its size is never applied.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    plot_data.plot(
        kind="barh",
        stacked=True,
        color=["#A6CEE3", "#FDBF6F"],  # blue & amber
        edgecolor="black",
        linewidth=0.6,
        ax=ax,
    )

    ax.set_title(title, fontsize=15, weight="bold")
    ax.set_xlabel(xlabel, fontsize=13)
    ax.set_ylabel("Disease", fontsize=13)
    # NOTE(review): summary_df is built from AI *and* final Human arms, so
    # "AI Only" here means "not shared by every arm" — confirm the wording.
    ax.legend(
        ["In Both (Intersection)", "AI Only (Union − Intersection)"],
        title="Category",
        title_fontsize=12,
        fontsize=11,
        bbox_to_anchor=(1.04, 0.5),
        loc="center left",
        frameon=False,
    )
    ax.grid(axis="x", linestyle="--", alpha=0.6)

    # The legend sits outside the axes; silence the layout warning that
    # tight_layout may emit in that situation.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fig.tight_layout()
    plt.show()


# ---------------------------------------------
# Prepare data
# ---------------------------------------------
stack_df = summary_df.copy()
stack_df = stack_df.sort_values(by="Union_Concepts", ascending=True)  # smallest to largest for readability

# Split each union into the shared part and the remainder
stack_df["AI_Only"] = stack_df["Union_Concepts"] - stack_df["Intersection_Concepts"]
stack_df["In_Both"] = stack_df["Intersection_Concepts"]

# Same split expressed as a percentage of the union, for the normalized plot
stack_df["AI_Only_%"] = stack_df["AI_Only"] / stack_df["Union_Concepts"] * 100
stack_df["In_Both_%"] = stack_df["In_Both"] / stack_df["Union_Concepts"] * 100

# ---------------------------------------------
# Plot 1️⃣: Percentages
# ---------------------------------------------
plot_df_pct = stack_df.set_index("Disease")[["In_Both_%", "AI_Only_%"]]
_plot_union_intersection_bars(
    plot_df_pct,
    title="AI Concept Distribution Across Diseases (Percentage)",
    xlabel="Percentage of Concepts",
)

# ---------------------------------------------
# Plot 2️⃣: Absolute Numbers
# ---------------------------------------------
plot_df_abs = stack_df.set_index("Disease")[["In_Both", "AI_Only"]]
_plot_union_intersection_bars(
    plot_df_abs,
    title="Concept Distribution Across Diseases (Absolute Numbers)",
    xlabel="Number of Concept IDs",
)


### Phase A — Horizontal Stacked Bar: AI vs Human Concept Overlap (FRD 6.1.4–6.1.5)
- This cell generates a **stacked horizontal bar chart** showing concept overlap across diseases for AI and Human workflows.  
- Each bar is divided into **In Both (shared concepts), AI Only, and Human Only**, all expressed as percentages of the total union.  
- The visualization allows quick assessment of **agreement, divergence, and unique contributions** of AI and Human arms prior to adjudication.  


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

# ---------------------------------------------
# Read the per-disease AI vs Human totals
# ---------------------------------------------
df_path = os.path.join("results", "phaseA_AI_vs_Human", "phaseA_AI_vs_Human.csv")
df = pd.read_csv(df_path)

# ---------------------------------------------
# Split every disease into three disjoint parts
# ---------------------------------------------
for side in ("AI", "Human"):
    # Concepts found on one side only = that side's total minus the shared part
    df[f"{side}_Only"] = df[f"{side}_Total"] - df["Shared"]
df["In_Both"] = df["Shared"]

# Rebuild the union from the three parts so the shares below add up to 100
df["Union"] = df["AI_Only"] + df["Human_Only"] + df["In_Both"]

for part in ("AI_Only", "Human_Only", "In_Both"):
    df[f"{part}_%"] = df[part] / df["Union"] * 100

# ---------------------------------------------
# Shape the table for plotting
# ---------------------------------------------
# One row per disease, ordered by the shared share for easier comparison
share_columns = ["In_Both_%", "AI_Only_%", "Human_Only_%"]
plot_df = (
    df.set_index("Disease")[share_columns]
    .sort_values("In_Both_%", ascending=True)
)

# ---------------------------------------------
# Draw the horizontal stacked bar chart
# ---------------------------------------------
sns.set_theme(style="whitegrid", context="talk")
fig, ax = plt.subplots(figsize=(10, 6))

bar_style = dict(
    stacked=True,
    color=["#A6CEE3", "#FB9A99", "#B2DF8A"],  # Blue, Red, Green
    edgecolor="black",
    linewidth=0.6,
)
plot_df.plot(kind="barh", ax=ax, **bar_style)

ax.set_title(
    "Phase 1: Concept Overlap Between AI and Human Workflows",
    fontsize=14,
    weight="bold"
)
ax.set_xlabel("Percentage of Concepts", fontsize=13)
ax.set_ylabel("Concept Sets", fontsize=13)

# Reuse the bar handles but replace the raw column names with readable labels
handles, labels = ax.get_legend_handles_labels()
legend_style = dict(
    title="Category",
    title_fontsize=12,
    fontsize=11,
    bbox_to_anchor=(1.04, 0.5),
    loc="center left",
    frameon=False,
)
ax.legend(handles, ["In Both", "AI Only", "Human Only"], **legend_style)

# Vertical guide lines only, with the x-axis fixed to the full percentage range
ax.grid(axis="x", linestyle="--", alpha=0.6)
ax.set_xlim(0, 100)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    plt.tight_layout()

plt.show()


In [None]:
# Export Phase1.ipynb to Phase1_results.html via nbconvert; --no-input leaves
# the code cells out of the report and --log-level=ERROR keeps the log quiet.
!jupyter nbconvert --to html --no-input --output "Phase1_results.html" "Phase1.ipynb" --log-level=ERROR