### Phase A — Concept Presence/Absence Matrix (FRD 6.1.1–6.1.4)
- This cell constructs a **binary matrix** indicating the presence (1) or absence (0) of each concept across all non-subgroup AI and Human arms for each disease.  
- Each row represents a unique concept, and columns represent different arms, capturing which workflow included each concept.  
- The output CSV (`<Disease>_arm_matrix.csv`) contains both **concept metadata** and the **arm-wise binary indicators**, enabling subsequent similarity metrics, overlap calculations, and visualizations.  


In [None]:
import os
import re
import pandas as pd

def clean_arm_name(folder_name):
    """
    Derives a standardized arm label from a concept-set folder name.

    Returns None for subgroup folders (names containing a tag such as
    [S01]). Otherwise the text "Concept Set" is removed and the first
    recognised arm token (AI<n>, H1, HUMAN, Clinician, Reviewer, Manual,
    Expert; case-insensitive) is returned in upper case. When no token is
    recognised, the last whitespace-separated word is returned in title
    case, or the trimmed name itself if it contains no words.
    """
    # 🚫 Subgroup folders (e.g., [H1][S01], [AI1][S02]) are not arms
    is_subgroup = re.search(r"\[S\d+\]", folder_name, re.IGNORECASE) is not None
    if is_subgroup:
        return None

    trimmed = folder_name.strip().replace("Concept Set", "").strip()
    known_arm = re.search(
        r"(AI\d+|H1|HUMAN|Clinician|Reviewer|Manual|Expert)",
        trimmed,
        re.IGNORECASE,
    )
    if known_arm is not None:
        return known_arm.group(1).upper()

    # Fallback: use the final word of the folder name as the label
    tokens = trimmed.split()
    if not tokens:
        return trimmed
    return tokens[-1].title()


def build_concept_matrix(disease_folder):
    """
    Builds the concept presence/absence matrix for one disease folder and
    writes it to ``<folder name>_arm_matrix.csv`` (relative path, i.e. the
    current working directory).

    Every immediate subfolder that contains an ``includedConcepts.csv`` is
    treated as one arm; subfolders for which ``clean_arm_name`` returns None
    (subgroup folders) are skipped. Files that fail to load are reported and
    skipped.

    Parameters:
        disease_folder: path of the folder holding one subfolder per arm.

    Returns:
        DataFrame with a "Disease" column, the concept metadata taken from
        the first occurrence of each concept ID, and one 0/1 column per arm;
        or None when no arm file could be loaded.

    Raises:
        ValueError: if no loaded column name contains "conceptid"
        (case-insensitive).
    """
    arm_frames = []
    arm_names = []
    file_info = []

    print(f"\n🔍 Processing disease folder: {os.path.basename(disease_folder)}")

    for subfolder in os.listdir(disease_folder):
        subfolder_path = os.path.join(disease_folder, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        csv_path = os.path.join(subfolder_path, 'includedConcepts.csv')
        if not os.path.exists(csv_path):
            continue

        arm_name = clean_arm_name(subfolder)
        if arm_name is None:
            print(f"   ⏭️ Skipping subgroup folder: {subfolder}")
            continue

        # ✅ READ only non-subgroup folders; a bad file must not abort the run
        try:
            df = pd.read_csv(csv_path, dtype=str)
            n_concepts = df.shape[0]
            print(f"   📄 {arm_name:10s} ← {subfolder}/includedConcepts.csv ({n_concepts} rows)")
            df.columns = [c.strip() for c in df.columns]
            df["Arm"] = arm_name
            arm_frames.append(df)
            arm_names.append(arm_name)
            file_info.append((arm_name, n_concepts))
        except Exception as e:
            print(f"   ⚠️ Error reading {csv_path}: {e}")

    if not arm_frames:
        print(f"⚠️ No includedConcepts.csv found in {disease_folder}")
        return None

    all_concepts = pd.concat(arm_frames, ignore_index=True)
    concept_id_col = [c for c in all_concepts.columns if "conceptid" in c.lower()]
    if not concept_id_col:
        raise ValueError(f"No Concept ID column found in {disease_folder}")
    concept_col = concept_id_col[0]

    # Count rows per (concept, arm) and spread the arms into columns.
    # Group sizes need no value column, so inputs that lack a "conceptSetId"
    # column no longer fail with a KeyError.
    matrix = (
        all_concepts
        .groupby([concept_col, "Arm"])
        .size()
        .unstack("Arm", fill_value=0)
        .reset_index()
    )

    # Concept metadata: first row seen for each concept ID, minus arm-specific columns
    drop_cols = [c for c in ["Arm", "Name", "conceptSetName"] if c in all_concepts.columns]
    concept_meta = (
        all_concepts
        .drop(columns=drop_cols, errors="ignore")
        .drop_duplicates(subset=[concept_col])
    )

    merged = concept_meta.merge(matrix, on=concept_col, how="left")
    arm_cols = [col for col in merged.columns if col in arm_names]
    # Collapse counts to binary presence flags (unmatched rows compare False -> 0)
    merged[arm_cols] = (merged[arm_cols] > 0).astype(int)

    disease_name = os.path.basename(disease_folder)
    merged.insert(0, "Disease", disease_name)

    output_name = f"{disease_name}_arm_matrix.csv"
    merged.to_csv(output_name, index=False)

    print(f"✅ Saved: {output_name} ({merged.shape[0]} concepts, {len(arm_cols)} arms)")
    print(f"   └─ Arms read: {', '.join([f'{a} ({n})' for a, n in file_info])}")

    return merged


# ------------------------------------------------
# Driver: build one matrix for every "C<digits>_..." folder under ROOT_DIR
ROOT_DIR = os.path.join('..', 'ConceptSets')   # adjust if needed

disease_dirs = []
for entry in os.listdir(ROOT_DIR):
    candidate = os.path.join(ROOT_DIR, entry)
    if not os.path.isdir(candidate):
        continue
    if not re.match(r"C\d+_", entry):
        continue
    disease_dirs.append(candidate)

for disease_dir in disease_dirs:
    build_concept_matrix(disease_dir)


In [None]:
# ==========================================================
# Phase A — Build Arm–Concept Matrix per Disease
# ==========================================================
import os
import re
import pandas as pd
from collections import defaultdict

# --------------------------------------------
# Robust ROOT_DIR setup (auto-detect)
# --------------------------------------------
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is undefined (e.g. in a notebook): fall back to the working directory
    SCRIPT_DIR = os.getcwd()

# Look for ConceptSets next to the script directory first, then inside it.
ROOT_DIR = None
for _candidate in (
    os.path.abspath(os.path.join(SCRIPT_DIR, '..', 'ConceptSets')),
    os.path.join(SCRIPT_DIR, 'ConceptSets'),
):
    if os.path.exists(_candidate):
        ROOT_DIR = os.path.abspath(_candidate)
        break
else:
    # Neither location exists; ROOT_DIR stays None
    print("⚠️ ConceptSets folder not found.")

OUT_DIR = os.path.join(SCRIPT_DIR, 'results', 'phaseA_arm_concept_matrix')
os.makedirs(OUT_DIR, exist_ok=True)

# --------------------------------------------
# Restrict to target diseases
# --------------------------------------------
# Disease code -> disease name used in log lines and output file names
TARGET_DISEASES = {
    'C01': 'SLE',
    'C02': 'RheumatoidArthritis',
    'C03': 'DiabetesMacularEdema',
    'C04': 'DeepVeinThrombosis',
    'C06': 'Uveitis',
    'C07': 'SystemicSclerosis',
}

# --------------------------------------------
# Helper functions
# --------------------------------------------
def strip_numeric_prefix(name):
    """Return name without its leading run of digits, '-' and '_' (e.g. '0152-'), trimmed of surrounding whitespace."""
    prefix = re.match(r"^[\d\-\_]+", name)
    remainder = name[prefix.end():] if prefix else name
    return remainder.strip()

def clean_arm_name(folder_name):
    """
    Maps a folder name to a standardized arm label.

    Returns None for subgroup folders (names containing a tag like [S01]).
    Otherwise the first recognised token (AI<n>, H<n>, Human, Clinician,
    Reviewer, Manual, Expert; case-insensitive) is upper-cased, and any
    label starting with "H" is collapsed to "HUMAN". With no recognised
    token, the last whitespace-separated word is returned upper-cased (or
    the whole name upper-cased when it contains no words).
    """
    if re.search(r"\[S\d+\]", folder_name, re.IGNORECASE) is not None:
        return None

    hit = re.search(
        r"(AI\d+|H\d+|Human|Clinician|Reviewer|Manual|Expert)",
        folder_name,
        re.IGNORECASE,
    )
    if hit is None:
        # No known arm token: fall back to the last word of the folder name
        tokens = folder_name.split()
        fallback = tokens[-1] if tokens else folder_name
        return fallback.upper()

    label = hit.group(1).upper()
    # H1, H2, ... and HUMAN all denote the human arm
    return "HUMAN" if label.startswith("H") else label

def load_included_concepts(file_path):
    """
    Loads an includedConcepts.csv with every value kept as a string and
    whitespace trimmed from the column names.

    Any failure is reported on stdout and an empty DataFrame is returned,
    so callers can test ``.empty`` instead of handling exceptions.
    """
    try:
        frame = pd.read_csv(file_path, dtype=str)
        frame.columns = list(map(str.strip, frame.columns))
    except Exception as e:
        print(f"⚠️ Error reading {file_path}: {e}")
        return pd.DataFrame()
    return frame

# --------------------------------------------
# Group folders by disease (flat structure)
# --------------------------------------------
# ROOT_DIR is None when the ConceptSets folder could not be located. In that
# case os.listdir(None) would silently list the working directory and
# os.path.join(None, ...) would raise TypeError, so treat it as "no folders".
if ROOT_DIR is None:
    raw_folders = []
else:
    raw_folders = [f for f in os.listdir(ROOT_DIR) if os.path.isdir(os.path.join(ROOT_DIR, f))]
cleaned_folders = [strip_numeric_prefix(f) for f in raw_folders]

# disease code (e.g. "C01") -> list of raw folder names belonging to it
disease_groups = defaultdict(list)
for raw, clean in zip(raw_folders, cleaned_folders):
    # Folders whose name contains "ONLINE" are excluded
    if re.search(r'ONLINE', clean, re.IGNORECASE):
        continue
    # The disease code is the bracketed [C<digits>] tag in the folder name
    m = re.search(r'\[(C\d+)\]', clean, re.IGNORECASE)
    if m:
        code = m.group(1).upper()
        if code in TARGET_DISEASES:
            disease_groups[code].append(raw)

print(f"\n🧬 Found {len(disease_groups)} target diseases.")
for k, v in sorted(disease_groups.items()):
    print(f"  {k} ({TARGET_DISEASES[k]}): {len(v)} arms")

# --------------------------------------------
# Build concept matrix per disease
# --------------------------------------------
def build_concept_matrix(disease_code, folders):
    """
    Builds and saves the arm–concept presence/absence matrix for one disease.

    Parameters:
        disease_code: key of TARGET_DISEASES (e.g. "C01").
        folders: folder names (relative to ROOT_DIR) belonging to the disease;
            each is expected to hold an ``includedConcepts.csv``.

    Folders without the CSV, subgroup folders (``clean_arm_name`` returns
    None) and files that load as empty are skipped.

    Returns:
        DataFrame with a "Disease" column, concept metadata taken from the
        first occurrence of each concept ID, and one 0/1 column per arm. The
        same table is written to
        ``OUT_DIR/<code>_<name>_arm_matrix.csv``. Returns None when no arm
        could be loaded or no concept-ID column is found.
    """
    disease_name = TARGET_DISEASES[disease_code]
    print(f"\n🔍 Processing {disease_code}_{disease_name}")

    arm_frames = []
    arm_names = []
    file_info = []

    for f in folders:
        fpath = os.path.join(ROOT_DIR, f, "includedConcepts.csv")
        if not os.path.exists(fpath):
            continue

        arm_name = clean_arm_name(f)
        if arm_name is None:
            print(f"   ⏭️ Skipping subgroup folder: {f}")
            continue

        df = load_included_concepts(fpath)
        if df.empty:
            continue

        df["Arm"] = arm_name
        arm_frames.append(df)
        arm_names.append(arm_name)
        file_info.append((arm_name, df.shape[0]))
        print(f"   📄 {arm_name:8s} ← {f}/includedConcepts.csv ({df.shape[0]} rows)")

    if not arm_frames:
        print(f"⚠️ No valid arms for {disease_code}_{disease_name}")
        return None

    all_concepts = pd.concat(arm_frames, ignore_index=True)

    # Find concept ID column (first column matching "concept<optional char>id")
    concept_col_candidates = [c for c in all_concepts.columns if re.search(r"concept.?id", c, re.IGNORECASE)]
    if not concept_col_candidates:
        print(f"⚠️ No Concept ID column found for {disease_code}")
        return None
    concept_col = concept_col_candidates[0]

    # Pivot: rows = conceptId, cols = Arm, cells = number of rows.
    # Group sizes need no value column, so inputs that lack a "conceptSetId"
    # column no longer fail with a KeyError.
    matrix = (
        all_concepts
        .groupby([concept_col, "Arm"])
        .size()
        .unstack("Arm", fill_value=0)
        .reset_index()
    )

    # Merge back with concept metadata (first row seen per concept ID)
    excluded = {"Arm", "conceptSetId", *arm_names}
    meta_cols = [c for c in all_concepts.columns if c not in excluded]
    concept_meta = all_concepts.drop_duplicates(subset=[concept_col])[meta_cols]
    merged = concept_meta.merge(matrix, on=concept_col, how="left")

    arm_cols = [c for c in merged.columns if c in arm_names]
    # Collapse counts to binary presence flags
    merged[arm_cols] = (merged[arm_cols] > 0).astype(int)
    merged.insert(0, "Disease", f"{disease_code}_{disease_name}")

    # Save output
    out_file = os.path.join(OUT_DIR, f"{disease_code}_{disease_name}_arm_matrix.csv")
    merged.to_csv(out_file, index=False)

    print(f"✅ Saved: {out_file} ({merged.shape[0]} concepts, {len(arm_cols)} arms)")
    print(f"   └─ Arms processed: {', '.join([f'{a} ({n})' for a, n in file_info])}")
    return merged

# --------------------------------------------
# Run for all diseases
# --------------------------------------------
# Process diseases in ascending code order (C01, C02, ...)
for disease_code in sorted(disease_groups):
    folders = disease_groups[disease_code]
    build_concept_matrix(disease_code, folders)


In [None]:
# Preview the saved C04 (DVT) arm matrix; all columns are read back as strings.
# Relies on OUT_DIR from the Phase A matrix cell.
df = pd.read_csv(os.path.join(OUT_DIR, 'C04_DeepVeinThrombosis_arm_matrix.csv'), dtype=str)
df.head()

In [None]:
# Preview the saved C06 (Uveitis) arm matrix; all columns are read back as strings.
# Relies on OUT_DIR from the Phase A matrix cell.
df = pd.read_csv(os.path.join(OUT_DIR,'C06_Uveitis_arm_matrix.csv'),dtype=str)
df.head()

### Phase A — Team Blinding & Randomization (FRD 6.1.1–6.1.2)
- This cell prepares **blinded arm matrices** for unbiased analysis by replacing original workflow labels with randomized labels like `SetA`, `SetB`, etc.  
- It ensures traceability by also saving a **mapping key** linking original arm names to blinded labels.  
- Inputs are `*_arm_matrix.csv` files; outputs include:
  - Blinded matrices (`*_BLINDED_MATRIX.csv`) for statistical evaluation.  
  - Mapping keys (`*_TEAM_MAPPING_KEY.csv`) for post-analysis reconciliation.  


In [None]:
# ==========================================================
# 🧩 TEAM BLINDING & RANDOMIZATION SCRIPT (Refined)
# ==========================================================
# This script takes each *_arm_matrix.csv file,
# randomizes arm labels (e.g., SetA, SetB, ...),
# and creates a blinded dataset for unbiased analysis.
# It also generates a mapping key for traceability.
# ==========================================================

import os
import re
import glob
import random
import string
import pandas as pd

# --------------------------------------------
# 🧭 Path Configuration
# --------------------------------------------
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is undefined (e.g. in a notebook): use the working directory
    SCRIPT_DIR = os.getcwd()

# Arm matrices are read from INPUT_DIR; blinded files are written to OUTPUT_DIR
_RESULTS_ROOT = os.path.join(SCRIPT_DIR, "results")
INPUT_DIR = os.path.join(_RESULTS_ROOT, "phaseA_arm_concept_matrix")
OUTPUT_DIR = os.path.join(_RESULTS_ROOT, "phaseA_blinded")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Fixed seed so the label shuffling below is reproducible across runs
random.seed(42)

# --------------------------------------------
# 🔹 Function: Get all disease arm-matrix CSV files
# --------------------------------------------
def get_disease_files(input_dir=INPUT_DIR, pattern="*_arm_matrix.csv"):
    """
    Returns the paths in ``input_dir`` whose file name matches ``pattern``
    (glob syntax), in the order produced by ``glob.glob``.
    """
    return glob.glob(os.path.join(input_dir, pattern))

# --------------------------------------------
# 🔹 Function: Generate randomized Set labels
# --------------------------------------------
def generate_set_labels(n):
    """
    Generates n blinded labels: SetA … SetZ, then SetAA, SetAB, … .

    The first 26 labels are unchanged from the single-letter scheme; beyond
    that the suffix grows like spreadsheet column names, so more than 26 arms
    no longer raise IndexError. Returns an empty list when n <= 0.
    """
    letters = string.ascii_uppercase
    base = len(letters)
    labels = []
    for i in range(n):
        suffix = ""
        k = i
        # Bijective base-26 encoding: 0 -> A, 25 -> Z, 26 -> AA, 27 -> AB, ...
        while True:
            k, rem = divmod(k, base)
            suffix = letters[rem] + suffix
            if k == 0:
                break
            k -= 1
        labels.append(f"Set{suffix}")
    return labels

# --------------------------------------------
# 🔹 Function: Blind arm names randomly
# --------------------------------------------
def blind_arms_randomized(input_csv, output_csv, mapping_csv):
    """
    Replaces the arm column names of one arm matrix with shuffled blinded
    labels and writes two files.

    Parameters:
        input_csv: path of the *_arm_matrix.csv to blind.
        output_csv: destination of the matrix with renamed arm columns.
        mapping_csv: destination of the key table with columns
            "Original_Arm" and "Blinded_Label".

    Arm columns are detected heuristically: any column not in the known
    metadata list whose values are all "0" or "1". If none is found a
    warning is printed and nothing is written. Label order comes from the
    module-level ``random`` state.
    """
    matrix = pd.read_csv(input_csv, dtype=str).fillna("")

    # ----------------------------------------
    # Columns that are known concept metadata, never arm flags
    # ----------------------------------------
    metadata_cols = {
        "Disease", "conceptId", "conceptName", "conceptCode",
        "conceptSetId", "domainId", "vocabularyId",
        "standardConcept", "validStartDate", "validEndDate",
        "concept_set_name", "conceptSetName"
    }
    binary_values = {"0", "1"}

    # Remaining columns count as arms when they hold only 0/1 flags
    arm_cols = []
    for col in matrix.columns:
        if col in metadata_cols:
            continue
        observed = set(matrix[col].dropna().unique())
        if observed.issubset(binary_values):
            arm_cols.append(col)

    if not arm_cols:
        print(f"⚠️ No valid arm columns found in: {input_csv}")
        return

    # ----------------------------------------
    # Shuffle the labels, then pair them with the arms in column order
    # ----------------------------------------
    set_labels = generate_set_labels(len(arm_cols))
    random.shuffle(set_labels)
    randomized_mapping = {arm: label for arm, label in zip(arm_cols, set_labels)}

    # ----------------------------------------
    # Write the blinded matrix and its mapping key
    # ----------------------------------------
    matrix.rename(columns=randomized_mapping).to_csv(output_csv, index=False)

    key_table = pd.DataFrame({
        "Original_Arm": list(randomized_mapping.keys()),
        "Blinded_Label": list(randomized_mapping.values()),
    })
    key_table.to_csv(mapping_csv, index=False)

    print(f"✅ Blinded matrix saved: {os.path.basename(output_csv)}")
    print(f"🔑 Mapping key saved: {os.path.basename(mapping_csv)}")
    print(f"   └─ {len(arm_cols)} arms blinded → {', '.join(set_labels)}\n")

# --------------------------------------------
# 🔹 MAIN LOOP: Process all disease matrices
# --------------------------------------------
all_matrices = get_disease_files()

if all_matrices:
    print(f"\n🧩 Found {len(all_matrices)} disease arm matrix files in {INPUT_DIR}\n")

    # Sorted so files (and therefore the random label draws) are processed in a fixed order
    for input_csv in sorted(all_matrices):
        matrix_file = os.path.basename(input_csv)

        # Disease tag = file name without extension and "_arm_matrix" suffix
        stem, _ext = os.path.splitext(matrix_file)
        disease = re.sub(r"_arm_matrix$", "", stem, flags=re.IGNORECASE)

        # Output filenames
        blinded_output = os.path.join(OUTPUT_DIR, f"{disease}_BLINDED_MATRIX.csv")
        mapping_output = os.path.join(OUTPUT_DIR, f"{disease}_TEAM_MAPPING_KEY.csv")

        print(f"📂 Processing: {matrix_file}")
        blind_arms_randomized(input_csv, blinded_output, mapping_output)
        print("-" * 70)
else:
    print("⚠️ No *_arm_matrix.csv files found in:", INPUT_DIR)

# Printed in both cases, including when no input files were found
print("\n✅ All eligible disease matrices have been blinded.")
print(f"📁 Blinded outputs saved in: {OUTPUT_DIR}")


In [None]:
# Preview the blinded C07 matrix. OUTPUT_DIR must still hold the value set by
# the blinding cell (results/phaseA_blinded); later cells reassign it.
df = pd.read_csv(os.path.join(OUTPUT_DIR,'C07_SystemicSclerosis_BLINDED_MATRIX.csv'),dtype=str)
df.head()

In [None]:
# Preview the C07 mapping key (Original_Arm -> Blinded_Label). OUTPUT_DIR must
# still hold the value set by the blinding cell (results/phaseA_blinded).
df = pd.read_csv(os.path.join(OUTPUT_DIR,'C07_SystemicSclerosis_TEAM_MAPPING_KEY.csv'),dtype=str)
df.head()

### Phase 1 — Add Explicit Index to Blinded Matrices (FRD 6.1.1–6.1.4)
- This cell processes all pre-generated blinded arm matrices (`*_BLINDED_MATRIX.csv`) by adding a **numeric `original_index` column** to uniquely track each concept.  
- This ensures consistency when shuffling, merging, or analyzing concepts across workflows while preserving linkage to the original data.  
- Updated files are saved as `*_FULL_WITH_INDEX.csv`, maintaining the full matrix structure for downstream **Phase A pre-adjudication analyses** and reproducibility.  


In [None]:
# ===============================================================
# 🧩 ADD EXPLICIT INDEX TO BLINDED MATRICES (Refined)
# ===============================================================
# Purpose:
#   1️⃣ Load all blinded arm matrices (*_BLINDED_MATRIX.csv)
#   2️⃣ Add a numeric "original_index" column (1-based)
#   3️⃣ Save as *_FULL_WITH_INDEX.csv inside the same folder
#
# This ensures each concept retains a stable reference index
# for downstream tracking and adjudication processes.
# ===============================================================

import os
import glob
import pandas as pd

# --------------------------------------------
# 🧭 Path Setup (relative to script)
# --------------------------------------------
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is undefined (e.g. in a notebook): use the working directory
    SCRIPT_DIR = os.getcwd()

BLINDED_DIR = os.path.join(SCRIPT_DIR, "results", "phaseA_blinded")

# This cell only reads from and writes to BLINDED_DIR, so it must already exist
blinded_dir_missing = not os.path.exists(BLINDED_DIR)
if blinded_dir_missing:
    raise FileNotFoundError(f"⚠️ Blinded directory not found: {BLINDED_DIR}")

# --------------------------------------------
# 🧮 Step 1: Load ConceptRecordCounts.csv (optional)
# --------------------------------------------
concept_count_file = os.path.join(SCRIPT_DIR, "ConceptRecordCounts.csv")
if not os.path.exists(concept_count_file):
    record_count_df = None
    print("⚠️ ConceptRecordCounts.csv not found — proceeding without it.")
else:
    record_count_df = pd.read_csv(concept_count_file, dtype=str)
    print(f"📘 Loaded ConceptRecordCounts.csv ({len(record_count_df):,} rows)")

# --------------------------------------------
# 📂 Step 2: Find all blinded matrix files
# --------------------------------------------
# Sorted so the matrices are always processed in the same order
blinded_files = sorted(glob.glob(os.path.join(BLINDED_DIR, "*_BLINDED_MATRIX.csv")))

if not blinded_files:
    print("⚠️ No blinded matrix files found in:", BLINDED_DIR)
    # `exit()` is an interactive helper injected by the `site` module and is
    # not meant for programs; raise SystemExit directly, as the other cells do.
    raise SystemExit

print(f"\n🧩 Found {len(blinded_files)} blinded matrix files to process:\n")
for f in blinded_files:
    print(f"   → {os.path.basename(f)}")

# --------------------------------------------
# 🧱 Step 3: Add index and save updated copies
# --------------------------------------------
for file_path in blinded_files:
    base_name = os.path.basename(file_path)
    print(f"\n📂 Processing: {base_name}")

    # A file that cannot be parsed is reported and skipped, not fatal
    try:
        df = pd.read_csv(file_path, dtype=str)
    except Exception as e:
        print(f"❌ Failed to read {base_name}: {e}")
        continue

    if df.empty:
        print(f"⚠️ Skipping {base_name}: file is empty.")
        continue

    # Row position in the blinded file (1-based) becomes the first column
    row_count = len(df)
    df.insert(0, "original_index", range(1, row_count + 1))

    # Same folder, with the suffix swapped to mark the indexed copy
    indexed_name = base_name.replace("_BLINDED_MATRIX.csv", "_FULL_WITH_INDEX.csv")
    output_file = os.path.join(BLINDED_DIR, indexed_name)

    # original_index is a real column now, so the pandas index is not written
    df.to_csv(output_file, index=False)

    print(f"✅ Saved indexed file: {os.path.basename(output_file)}")
    print(f"   Rows: {row_count:,} | Columns: {len(df.columns)}")

print("\n🎯 All blinded matrices updated with explicit 'original_index' column.")
print(f"📁 Output directory: {BLINDED_DIR}")


### Phase 1 — Randomize & Anonymize Blinded Matrices (FRD 5.1–5.2)
- This cell processes blinded concept matrices to **anonymize and shuffle** them before adjudication, ensuring unbiased review.  
- For each `_BLINDED_MATRIX.csv` file, it:
  1. Adds an explicit original row index for traceability.  
  2. Generates a **unique random 4- to 6-digit key** per row (the key length grows with the number of rows).  
  3. Randomly shuffles rows and tracks which positions changed.  
- The output files (`*_SHUFFLED.csv`) contain anonymized, traceable matrices ready for blinded adjudication, preserving both integrity and reproducibility.  


In [None]:
# ===============================================================
# 🎲 RANDOMIZE & ANONYMIZE BLINDED MATRICES (Refined)
# ===============================================================
# Purpose:
#   1️⃣ Add explicit numeric index ("original_index")
#   2️⃣ Generate unique random keys (4-, 5- or 6-digit, depending on row count)
#   3️⃣ Shuffle rows for anonymization
#   4️⃣ Validate uniqueness and save as *_SHUFFLED.csv
#
# Output:
#   results/phaseA_shuffled/*.csv — anonymized versions
# ===============================================================

import os
import numpy as np
import pandas as pd
import glob

# ---------------------------------------------------------------
# 🧭 PATH SETUP (relative to script)
# ---------------------------------------------------------------
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is undefined (e.g. in a notebook): use the working directory
    SCRIPT_DIR = os.getcwd()

_RESULTS_ROOT = os.path.join(SCRIPT_DIR, "results")
INPUT_DIR = os.path.join(_RESULTS_ROOT, "phaseA_blinded")
OUTPUT_DIR = os.path.join(_RESULTS_ROOT, "phaseA_shuffled")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------------------------------------------------------------
# 🔍 STEP 1: Identify input blinded matrices
# ---------------------------------------------------------------
_blinded_pattern = os.path.join(INPUT_DIR, "*_BLINDED_MATRIX.csv")
blinded_files = sorted(glob.glob(_blinded_pattern))

# Nothing to shuffle: stop the cell here
if not blinded_files:
    print(f"⚠️ No blinded matrix files found in: {INPUT_DIR}")
    raise SystemExit

print(f"🧩 Found {len(blinded_files)} blinded files to process in {INPUT_DIR}\n")

# ---------------------------------------------------------------
# 🎯 STEP 2: Process each file
# ---------------------------------------------------------------
for file_path in blinded_files:
    file_name = os.path.basename(file_path)
    print(f"\n📂 Processing file: {file_name}")

    # A file that cannot be parsed is reported and skipped, not fatal
    try:
        df = pd.read_csv(file_path, dtype=str)
    except Exception as e:
        print(f"❌ Failed to read {file_name}: {e}")
        continue

    if df.empty:
        print(f"⚠️ Skipping {file_name}: file is empty.")
        continue

    # Ensure original_index exists or create it (1-based row position)
    if "original_index" not in df.columns:
        df.insert(0, "original_index", range(1, len(df) + 1))

    # -----------------------------------------------------------
    # 🔑 Generate random unique keys
    # -----------------------------------------------------------
    n = len(df)
    np.random.seed(42)  # Reproducible randomization (re-seeded for every file)

    # Dynamically scale key length.
    # np.arange excludes its stop value, so the pools hold 8,999 / 89,999 /
    # 899,999 keys. The thresholds match those capacities: previously
    # n == 9,000 and n == 90,000 selected a pool one key too small and failed
    # with "Dataset too large". Pools are unchanged, so keys generated for
    # any n that worked before are identical.
    if n > 89999:
        print(f"⚠️ {file_name}: {n:,} rows — using 6-digit keys.")
        all_possible = np.arange(100000, 999999)
    elif n > 8999:
        print(f"⚠️ {file_name}: {n:,} rows — using 5-digit keys.")
        all_possible = np.arange(10000, 99999)
    else:
        all_possible = np.arange(1000, 9999)

    if n > len(all_possible):
        raise ValueError(f"❌ Dataset too large ({n:,} rows) for unique key generation.")

    # Sampling without replacement guarantees distinct keys
    random_keys = np.random.choice(all_possible, size=n, replace=False)
    df.insert(1, "key", random_keys.astype(str))

    # -----------------------------------------------------------
    # 🔀 Shuffle rows for anonymization
    # -----------------------------------------------------------
    shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
    # True where a row ended up at a different 1-based position than before
    shuffled["index_changed"] = shuffled["original_index"].astype(int) != (shuffled.index + 1)
    changed_count = shuffled["index_changed"].sum()

    print(f"🔄 Shuffled {changed_count:,} / {n:,} rows.")

    # -----------------------------------------------------------
    # ✅ Validate key uniqueness
    # -----------------------------------------------------------
    if shuffled["key"].nunique() != n:
        dup_keys = shuffled[shuffled["key"].duplicated(keep=False)]
        print(f"❌ Duplicate keys found ({len(dup_keys)} rows):")
        print(dup_keys.head())
        raise ValueError(f"Duplicate keys detected in {file_name}")

    print(f"✅ All {n:,} keys are unique.")

    # -----------------------------------------------------------
    # 💾 Save anonymized shuffled version
    # -----------------------------------------------------------
    out_name = file_name.replace("_BLINDED_MATRIX.csv", "_SHUFFLED.csv")
    out_path = os.path.join(OUTPUT_DIR, out_name)
    shuffled.to_csv(out_path, index=False)

    print(f"💾 Saved shuffled file: {out_path}")
    print(f"   Rows: {len(shuffled):,} | Columns: {len(shuffled.columns)}")

# ---------------------------------------------------------------
# 🧾 Summary
# ---------------------------------------------------------------
print("\n🎯 All blinded matrices have been randomized and anonymized.")
print(f"📁 Output folder: {OUTPUT_DIR}")


In [None]:
# Preview the shuffled C07 matrix. OUTPUT_DIR must still hold the value set by
# the shuffle cell (results/phaseA_shuffled); later cells reassign it.
df = pd.read_csv(os.path.join(OUTPUT_DIR,'C07_SystemicSclerosis_SHUFFLED.csv'),dtype=str)
df.head()

In [None]:
# List the column names of whichever DataFrame was last assigned to `df`
df.columns

### Gold Standard Generation — Blinded Matrix Processing (FRD 5.1, 6.2)
- This cell processes the blinded, shuffled arm matrices to derive **two gold standard datasets per disease**:  
  1. **Gold Standard 1 (GS1)**: Concepts with full agreement across all arms.  
  2. **Gold Standard 2 (GS2)**: Concepts with partial agreement (some but not all arms).  
- GS2 is annotated with record counts (if available) and priority flags (`high`/`low`); both outputs are saved as CSV files for downstream review, adjudication, and comparison with AI and Human workflows.


In [None]:
# ===============================================================
# 🧬 GOLD STANDARD GENERATION — SCHEMA-PRESERVING VERSION
# ===============================================================
# Purpose:
#   1️⃣ Load each *_SHUFFLED.csv file (blinded + randomized)
#   2️⃣ Identify consensus across all Set columns (SetA, SetB, etc.)
#   3️⃣ Generate:
#        - Gold Standard 1 → full agreement (all Sets = 1)
#        - Gold Standard 2 → partial agreement (1–max-1 Sets = 1)
#   4️⃣ Retain *all* original columns
#   5️⃣ Merge record counts and priority metadata at the end
# ===============================================================

import os
import re
import glob
import pandas as pd

# --------------------------------------------
# 🧭 PATH CONFIGURATION
# --------------------------------------------
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is undefined (e.g. in a notebook): use the working directory
    SCRIPT_DIR = os.getcwd()

_RESULTS_ROOT = os.path.join(SCRIPT_DIR, "results")
INPUT_DIR = os.path.join(_RESULTS_ROOT, "phaseA_shuffled")
OUTPUT_DIR = os.path.join(_RESULTS_ROOT, "phaseA_gold")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --------------------------------------------
# 📘 LOAD RECORD COUNT REFERENCE (if exists)
# --------------------------------------------
record_count_path = os.path.join(SCRIPT_DIR, "ConceptRecordCounts.csv")
if not os.path.exists(record_count_path):
    record_count_df = None
    print("⚠️ ConceptRecordCounts.csv not found — record counts will be skipped.")
else:
    record_count_df = pd.read_csv(record_count_path, dtype=str)
    # Everything is read as text; record_count is made numeric (unparseable -> NaN)
    record_count_df = record_count_df.assign(
        record_count=pd.to_numeric(record_count_df["record_count"], errors="coerce")
    )
    print(f"📘 Loaded ConceptRecordCounts.csv ({len(record_count_df):,} rows)")

# ===============================================================
# 🧩 FUNCTION: Process One Shuffled Matrix
# ===============================================================
def process_blinded_matrix(filepath, record_count_df):
    """
    Processes one *_SHUFFLED.csv file to derive:
      - Gold Standard 1 (all Sets = 1)
      - Gold Standard 2 (between 1 and max-1 Sets = 1)
    Retains all original columns and appends derived metadata.

    Parameters:
        filepath: path of the shuffled, blinded matrix.
        record_count_df: DataFrame with "conceptId" and "record_count"
            columns, or None to skip record counts.

    Returns:
        (df_gs1, df_gs2); (None, None) when the file has no Set columns.
        Both frames are also written to OUTPUT_DIR as
        ``<disease>_Gold_Standard_1.csv`` / ``_2.csv``. GS1 gains
        "sum_sets"; GS2 gains "sum_sets", "record_count" and "priority"
        ("high" when at most two Sets include the concept, else "low").
    """

    # Missing values are filled only in the Set columns (below). A blanket
    # fillna("0") would also write a literal "0" into empty metadata cells
    # (names, codes, dates) and carry it into the gold-standard files.
    df = pd.read_csv(filepath, dtype=str)
    disease_name = os.path.splitext(os.path.basename(filepath))[0].replace("_SHUFFLED", "")

    print(f"\n📂 Processing: {disease_name}")
    print(f"   → File: {os.path.basename(filepath)} | Rows: {len(df):,}")

    # -----------------------------------------------------------
    # 🔍 Identify Set columns dynamically
    # -----------------------------------------------------------
    set_cols = [c for c in df.columns if re.match(r"^Set[A-Z]+$", c)]
    if not set_cols:
        print(f"⚠️ No Set columns found in {disease_name}. Skipping.")
        return None, None

    # Convert to numeric (binary 0/1); blank or unparseable flags count as 0
    df[set_cols] = df[set_cols].apply(pd.to_numeric, errors="coerce").fillna(0).astype(int)

    max_sum = len(set_cols)
    df["sum_sets"] = df[set_cols].sum(axis=1)

    # -----------------------------------------------------------
    # 🧩 Gold Standard 1 — Full Agreement (all Sets = 1)
    # -----------------------------------------------------------
    df_gs1 = df[df["sum_sets"] == max_sum].copy()

    # -----------------------------------------------------------
    # 🧩 Gold Standard 2 — Partial Agreement (1 to max-1 Sets)
    # -----------------------------------------------------------
    df_gs2 = df[df["sum_sets"].between(1, max_sum - 1)].copy()
    df_gs2["priority"] = df_gs2["sum_sets"].apply(lambda x: "high" if x <= 2 else "low")

    # -----------------------------------------------------------
    # 🔗 Merge record counts (if available)
    # -----------------------------------------------------------
    if record_count_df is not None and "conceptId" in df.columns:
        # One count row per concept: repeated conceptIds in the reference
        # table would otherwise duplicate GS2 rows in the left merge.
        counts = record_count_df[["conceptId", "record_count"]].drop_duplicates(subset="conceptId")
        df_gs2 = df_gs2.merge(counts, on="conceptId", how="left")

    # -----------------------------------------------------------
    # 🧱 Preserve all original columns
    # -----------------------------------------------------------
    original_cols = list(df.columns)
    extra_cols_gs1 = ["sum_sets"]
    extra_cols_gs2 = ["sum_sets", "record_count", "priority"]

    # Original columns first (without duplicates), derived columns last
    gs1_cols = [c for c in original_cols if c not in extra_cols_gs1] + extra_cols_gs1
    gs2_cols = [c for c in original_cols if c not in extra_cols_gs2] + extra_cols_gs2

    # Add any missing columns (e.g. record_count when no counts were merged)
    for col in gs1_cols:
        if col not in df_gs1.columns:
            df_gs1[col] = ""
    for col in gs2_cols:
        if col not in df_gs2.columns:
            df_gs2[col] = ""

    df_gs1 = df_gs1.reindex(columns=gs1_cols)
    df_gs2 = df_gs2.reindex(columns=gs2_cols)

    # -----------------------------------------------------------
    # 💾 Save outputs
    # -----------------------------------------------------------
    gs1_out = os.path.join(OUTPUT_DIR, f"{disease_name}_Gold_Standard_1.csv")
    gs2_out = os.path.join(OUTPUT_DIR, f"{disease_name}_Gold_Standard_2.csv")

    df_gs1.to_csv(gs1_out, index=False)
    df_gs2.to_csv(gs2_out, index=False)

    # -----------------------------------------------------------
    # 📊 Summary
    # -----------------------------------------------------------
    print(f"✅ Gold Standards saved for {disease_name}")
    print(f"   ├─ GS1 (Full Agreement): {len(df_gs1):,} rows")
    print(f"   └─ GS2 (Partial Agreement): {len(df_gs2):,} rows")
    print(f"   Columns preserved: {len(original_cols)} original + derived")
    print("------------------------------------------------------------")

    return df_gs1, df_gs2


# ===============================================================
# 🚀 MAIN EXECUTION
# ===============================================================
# Sorted so diseases are always processed in the same order
shuffled_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*_SHUFFLED.csv")))

if shuffled_files:
    print(f"\n🧩 Found {len(shuffled_files)} shuffled files to process in {INPUT_DIR}")

    for file in shuffled_files:
        process_blinded_matrix(file, record_count_df)
else:
    print(f"⚠️ No shuffled files found in {INPUT_DIR}")

# Printed in both cases, including when no input files were found
print("\n🎯 Gold Standard generation complete.")
print(f"📁 Outputs saved in: {OUTPUT_DIR}")


In [None]:
# Preview Gold Standard 2 for C03. OUTPUT_DIR must still hold the value set by
# the gold-standard cell (results/phaseA_gold); later cells reassign it.
df = pd.read_csv(os.path.join(OUTPUT_DIR,'C03_DiabetesMacularEdema_Gold_Standard_2.csv'),dtype=str)
df.head()

### Adjudication File Preparation — Clean Gold Standard 2 (FRD 5 & 6.2)
- This cell processes all `*_Gold_Standard_2.csv` files to generate **minimal, human-readable adjudication datasets**.  
- It removes internal identifiers, blinded team labels, metadata columns, and adds blank columns (`keepConceptSet`, `comment`) for adjudicators to record decisions.  
- Cleaned files are saved in the `results/phaseA_adjudication/` folder as `<disease>_for_adjudication.csv`, ready for blinded human review and TGS finalization.  


In [None]:
# ===============================================================
# 🧾 ADJUDICATION FILE PREPARATION — CLEANING GOLD STANDARD 2 (Refined)
# ===============================================================
# Purpose:
#   1️⃣ Load each *_Gold_Standard_2.csv file
#   2️⃣ Remove blinded/internal columns
#   3️⃣ Sort back to original order (if available)
#   4️⃣ Add blank adjudication columns
#   5️⃣ Save as *_for_adjudication.csv in output folder
#
# Result:
#   A clean, minimal dataset ready for human adjudication.
# ===============================================================

import os
import re
import pandas as pd
import glob

# ---------------------------------------------------------------
# 🧭 PATH CONFIGURATION
# ---------------------------------------------------------------
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is undefined in interactive/notebook sessions; fall back to
    # the current working directory.
    SCRIPT_DIR = os.getcwd()

INPUT_DIR = os.path.join(SCRIPT_DIR, "results", "phaseA_gold")
OUTPUT_DIR = os.path.join(SCRIPT_DIR, "results", "phaseA_adjudication")
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"\n📁 Input directory : {INPUT_DIR}")
print(f"📁 Output directory: {OUTPUT_DIR}")

# ---------------------------------------------------------------
# 🧹 STEP 1: Define columns to remove before adjudication
# ---------------------------------------------------------------
base_remove_cols = {
    "original_index", "index_changed", "sum_sets",
    "record_count", "priority"
}

# Blank columns the adjudicators fill in; always placed last in the output.
adjudication_cols = ["keepConceptSet", "comment"]

# Column names of the form "Set" + uppercase letters (e.g. SetA, SetAB) are
# treated as blinded arm indicators. Compiled once, outside the file loop.
set_col_pattern = re.compile(r"^Set[A-Z]+$")

# ---------------------------------------------------------------
# 🧩 STEP 2: Locate all Gold Standard 2 files
# ---------------------------------------------------------------
gold2_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*_Gold_Standard_2.csv")))

if not gold2_files:
    print("⚠️ No Gold Standard 2 files found in:", INPUT_DIR)
    # Nothing to do: stop this script/cell here.
    raise SystemExit

print(f"🧩 Found {len(gold2_files)} Gold Standard 2 files to process.\n")

# ---------------------------------------------------------------
# 🚀 STEP 3: Process each file
# ---------------------------------------------------------------
for file_path in gold2_files:
    file_name = os.path.basename(file_path)
    print(f"\n📂 Cleaning file: {file_name}")

    # Best-effort: an unreadable file is reported and skipped so the
    # remaining files are still processed.
    try:
        df = pd.read_csv(file_path, dtype=str)
    except Exception as e:
        print(f"❌ Failed to read {file_name}: {e}")
        continue

    if df.empty:
        print(f"⚠️ Skipping {file_name}: file is empty.")
        continue

    # -----------------------------------------------------------
    # 🧭 Sort back to original order if tracked
    # -----------------------------------------------------------
    if "original_index" in df.columns:
        # Everything was read as str, so convert before sorting; values that
        # cannot be parsed become NaN and sort last. The stable sort keeps
        # the existing relative order of ties.
        df["original_index"] = pd.to_numeric(df["original_index"], errors="coerce")
        df = df.sort_values(by="original_index", kind="stable").reset_index(drop=True)
        print("↩️ Sorted by 'original_index' to restore original order.")
    else:
        print("⚠️ No 'original_index' column found — order preserved as-is.")

    # -----------------------------------------------------------
    # 🧩 Identify Set columns dynamically (blinded arms)
    # -----------------------------------------------------------
    set_cols = [c for c in df.columns if set_col_pattern.match(c)]
    remove_cols = base_remove_cols.union(set_cols)

    # Drop internal/blinded columns. Only columns that are actually present
    # are dropped, so the reported count reflects what was really removed
    # rather than the size of the removal wish-list.
    dropped_cols = [c for c in df.columns if c in remove_cols]
    df_clean = df.drop(columns=dropped_cols).reset_index(drop=True)
    print(f"🧼 Dropped {len(dropped_cols)} internal/blinded columns.")

    # -----------------------------------------------------------
    # ✨ Add adjudication columns (if not present)
    # -----------------------------------------------------------
    for col in adjudication_cols:
        if col not in df_clean.columns:
            df_clean[col] = ""

    # Ensure the adjudication columns appear at the end, in a fixed order.
    ordered_cols = [c for c in df_clean.columns if c not in adjudication_cols] + adjudication_cols
    df_clean = df_clean[ordered_cols]

    # -----------------------------------------------------------
    # 💾 Save cleaned file for adjudication
    # -----------------------------------------------------------
    disease_name = file_name.replace("_Gold_Standard_2.csv", "")
    adjud_out = os.path.join(OUTPUT_DIR, f"{disease_name}_for_adjudication.csv")
    df_clean.to_csv(adjud_out, index=False)

    # -----------------------------------------------------------
    # 📊 Summary report
    # -----------------------------------------------------------
    print(f"✅ Saved adjudication file: {os.path.basename(adjud_out)}")
    print(f"   Rows: {df_clean.shape[0]:,} | Columns: {df_clean.shape[1]}")
    print(f"   → Preview columns: {', '.join(df_clean.columns[:6])} ...")
    print("------------------------------------------------------------")

print("\n🎯 All Gold Standard 2 files cleaned and ready for adjudication.")
print(f"📁 Output folder: {OUTPUT_DIR}")


In [None]:
# Spot-check: load one cleaned adjudication file from OUTPUT_DIR (every
# column read as a string) and preview its first rows.
df = pd.read_csv(os.path.join(OUTPUT_DIR,'C04_DeepVeinThrombosis_for_adjudication.csv'),dtype=str)
df.head()

### Adjudication — Summary of Concepts Submitted for Review (FRD 5.1, 6.2)
- This cell counts the **concept sets submitted for adjudication** for each disease before gold standard resolution.  
- For each disease, it compares the number of concepts in AI/Human submissions with the counts in **Gold Standard files** (`Gold_Standard_1.csv` and `Gold_Standard_2.csv`) to quantify concepts requiring adjudication.  
- The output **`results/phaseA_adjudication_summary.csv`** provides a table with:
  - Total concepts per disease (Gold Standard 1 + Gold Standard 2), the Gold Standard 1 and Gold Standard 2 counts, the number of concepts going for adjudication, and the adjudication percentage.  
  - Supports evaluation of disagreement magnitude and guides Phase B post-adjudication analyses.  


In [None]:
# ===============================================================
# 📊 SUMMARY — CONCEPT SETS SENT FOR ADJUDICATION (Refined)
# ===============================================================
# Purpose:
#   1️⃣ Reads all *_for_adjudication.csv files
#   2️⃣ Looks up corresponding Gold Standard 1 & 2 files
#   3️⃣ Computes totals and adjudication proportions
#   4️⃣ Saves and prints summary table
# ===============================================================

import os
import pandas as pd
import glob

# ---------------------------------------------------------------
# 🧭 PATH CONFIGURATION
# ---------------------------------------------------------------
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is undefined in interactive/notebook sessions; fall back to
    # the current working directory.
    SCRIPT_DIR = os.getcwd()

ADJUD_DIR = os.path.join(SCRIPT_DIR, "results", "phaseA_adjudication")
GOLD_DIR = os.path.join(SCRIPT_DIR, "results", "phaseA_gold")
SUMMARY_PATH = os.path.join(SCRIPT_DIR, "results", "phaseA_adjudication_summary.csv")

# Column layout of the summary table. Declared up front so the table can be
# built (and sorted) even when no file could be summarised.
SUMMARY_COLUMNS = [
    "Disease",
    "Total_Concepts",
    "Gold_Standard_1_Count",
    "Gold_Standard_2_Count",
    "Concepts_for_Adjudication",
    "Adjudication_%",
]


def _count_csv_rows(csv_path):
    """Return the number of data rows in *csv_path*, or 0 if the file is missing."""
    if not os.path.exists(csv_path):
        return 0
    return len(pd.read_csv(csv_path, dtype=str))


# ---------------------------------------------------------------
# 🔍 Locate all adjudication files
# ---------------------------------------------------------------
adjudication_files = sorted(glob.glob(os.path.join(ADJUD_DIR, "*_for_adjudication.csv")))

if not adjudication_files:
    print(f"⚠️ No adjudication files found in {ADJUD_DIR}")
    # Nothing to summarise: stop this script/cell here.
    raise SystemExit

# print(f"🧩 Found {len(adjudication_files)} adjudication files to summarize.\n")

summary_data = []

# ---------------------------------------------------------------
# 🧩 Process each adjudication file
# ---------------------------------------------------------------
for file_path in adjudication_files:
    disease_name = os.path.basename(file_path).replace("_for_adjudication.csv", "")
    # print(f"📂 {disease_name}")

    # Load adjudication file; an unreadable file is reported and skipped so
    # the remaining diseases are still summarised.
    try:
        df_adj = pd.read_csv(file_path, dtype=str)
        adjud_count = len(df_adj)
    except Exception as e:
        print(f"❌ Error reading {file_path}: {e}")
        continue

    # Corresponding GS1 / GS2 paths share the disease prefix of the
    # adjudication file; a missing file counts as zero rows.
    gs1_path = os.path.join(GOLD_DIR, f"{disease_name}_Gold_Standard_1.csv")
    gs2_path = os.path.join(GOLD_DIR, f"{disease_name}_Gold_Standard_2.csv")

    gs1_count = _count_csv_rows(gs1_path)
    gs2_count = _count_csv_rows(gs2_path)

    total_concepts = gs1_count + gs2_count

    # Compute adjudication % (guarding against division by zero when neither
    # Gold Standard file has rows).
    adjud_percent = (adjud_count / total_concepts * 100) if total_concepts > 0 else 0

    summary_data.append({
        "Disease": disease_name,
        "Total_Concepts": total_concepts,
        "Gold_Standard_1_Count": gs1_count,
        "Gold_Standard_2_Count": gs2_count,
        "Concepts_for_Adjudication": adjud_count,
        "Adjudication_%": round(adjud_percent, 2)
    })

# ---------------------------------------------------------------
# 📊 Create summary DataFrame
# ---------------------------------------------------------------
# Explicit columns keep sort_values from raising KeyError when every
# adjudication file failed to load and summary_data is empty.
summary_df = pd.DataFrame(summary_data, columns=SUMMARY_COLUMNS).sort_values(
    by="Concepts_for_Adjudication", ascending=False
)

# Save to CSV
os.makedirs(os.path.dirname(SUMMARY_PATH), exist_ok=True)
summary_df.to_csv(SUMMARY_PATH, index=False)

# ---------------------------------------------------------------
# 🖨️ Display nicely
# ---------------------------------------------------------------
# print("\n✅ Adjudication Summary Table\n")
# print(summary_df.to_string(index=False))
# # print(f"\n💾 Saved summary table → {SUMMARY_PATH}")


In [None]:
# Display the adjudication summary table as the cell's output.
summary_df

In [None]:
# df = pd.read_csv('Adjudication/C03_DiabetesMacularEdema_for_adjudication.csv',dtype=str)
# df.head()