# Evaluation for GC dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from math import comb
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


# Path to the Excel workbook holding the GC evaluation data
excel_file_path = "mapped_gc.xlsx"

# Load the workbook with pandas' default settings. Nothing is skipped or
# filtered at this point: no rows are dropped and no column subset is selected.
df = pd.read_excel(excel_file_path)

# Print the first rows as a quick sanity check of the loaded columns
print(df.head())


In [None]:
# Prediction column -> readable model name.
# The insertion order of this mapping is the order in which models are evaluated.
model_name_map = dict(
    agent_CUI='GenOMA',
    phetag_CUI='PhenoTagger',
    BERT_CUI='PhenoBERT',
    meta_CUI='MetaMap',
    ctakes_CUI='cTAKES',
    GPT5_CUI='GPT-5',
)

# Model (prediction) columns, taken from the mapping keys in the same order
models = list(model_name_map)


In [None]:
# Per-model outcome counts, kept separately for rows whose gold standard has a
# term ("With_Term_*") and rows whose gold standard is empty ("No_Term_*").
results = {
    'Model': [],
    'With_Term_Correct': [],
    'With_Term_Wrong': [],     # Predicted but mismatched
    'With_Term_NoPred': [],    # No prediction made
    'With_Term_Total': [],
    'No_Term_Correct': [],
    'No_Term_Incorrect': [],   # Should be empty but predicted something
    'No_Term_Total': []
}

# The gold-standard split is identical for every model, so it is computed once
# here instead of re-filtering the whole frame on each loop iteration.
with_term = df[df['true_CUI'].notna() & (df['true_CUI'] != '')]
no_term   = df[df['true_CUI'].isna() | (df['true_CUI'] == '')]
with_term_total = len(with_term)
no_term_total   = len(no_term)

for model in models:
    # Fall back to the raw column name when no readable name is registered
    model_name = model_name_map.get(model, model)

    # Cases where the gold has a term
    with_term_correct = (with_term[model] == with_term['true_CUI']).sum()
    with_term_nopred  = (with_term[model].isna() | (with_term[model] == '')).sum()
    # Everything that is neither correct nor missing was predicted but mismatched
    with_term_wrong   = with_term_total - with_term_correct - with_term_nopred

    # Cases where the gold has no term: an empty prediction is the correct outcome
    no_term_correct   = (no_term[model].isna() | (no_term[model] == '')).sum()
    no_term_incorrect = no_term_total - no_term_correct  # Should be empty but output produced

    # Save results
    results['Model'].append(model_name)
    results['With_Term_Correct'].append(with_term_correct)
    results['With_Term_Wrong'].append(with_term_wrong)
    results['With_Term_NoPred'].append(with_term_nopred)
    results['With_Term_Total'].append(with_term_total)
    results['No_Term_Correct'].append(no_term_correct)
    results['No_Term_Incorrect'].append(no_term_incorrect)
    results['No_Term_Total'].append(no_term_total)

# One row per model, columns in the key order of `results`
summary_df = pd.DataFrame(results)
print(summary_df)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import warnings

# ---- 1) Aggregate error counts (total minus correct) for both gold groups ----
summary_df['With_Term_Incorrect'] = summary_df['With_Term_Total'].sub(summary_df['With_Term_Correct'])
summary_df['No_Term_Incorrect'] = summary_df['No_Term_Total'].sub(summary_df['No_Term_Correct'])

# Which optional columns does the summary already provide?
have_wrong = 'With_Term_Wrong' in summary_df.columns
have_nopred = 'With_Term_NoPred' in summary_df.columns
have_predicted = 'With_Term_Predicted' in summary_df.columns

# Only derive the Wrong / NoPred split when at least one of the two is missing
if not (have_wrong and have_nopred):
    if have_predicted:
        # With a "number of predictions" column:
        # Wrong = Predicted - Correct; NoPred = Total - Predicted
        summary_df['With_Term_Wrong'] = (
            summary_df['With_Term_Predicted'] - summary_df['With_Term_Correct']
        ).clip(lower=0)
        summary_df['With_Term_NoPred'] = (
            summary_df['With_Term_Total'] - summary_df['With_Term_Predicted']
        ).clip(lower=0)
    elif have_nopred:
        # Only NoPred is known: the remaining errors are wrong predictions
        summary_df['With_Term_Wrong'] = (
            summary_df['With_Term_Incorrect'] - summary_df['With_Term_NoPred']
        ).clip(lower=0)
    elif have_wrong:
        # Only Wrong is known: the remaining errors are missing predictions
        summary_df['With_Term_NoPred'] = (
            summary_df['With_Term_Incorrect'] - summary_df['With_Term_Wrong']
        ).clip(lower=0)
    else:
        # Nothing to split on: warn and count every error as a wrong prediction
        warnings.warn(
            "无法从 summary_df 推导 With_Term_Wrong / With_Term_NoPred，"
            "暂把所有错误视为 Wrong；建议提供 With_Term_Predicted 或 With_Term_NoPred。"
        )
        summary_df['With_Term_Wrong'] = summary_df['With_Term_Incorrect'].copy()
        summary_df['With_Term_NoPred'] = 0

# ---- 2) Backfill so that Correct + Wrong + NoPred == With_Term_Total ---------
nopred_clipped = summary_df['With_Term_NoPred'].clip(lower=0)
summary_df['With_Term_NoPred'] = nopred_clipped
summary_df['With_Term_Wrong'] = (
    summary_df['With_Term_Total'] - summary_df['With_Term_Correct'] - nopred_clipped
).clip(lower=0)

# (Optional) Assertion check: the three parts must add up to the total
decomposed_total = (
    summary_df['With_Term_Correct']
    + summary_df['With_Term_Wrong']
    + summary_df['With_Term_NoPred']
)
assert (
    decomposed_total == summary_df['With_Term_Total']
).all(), "Decomposition does not sum to With_Term_Total."

# ---- 3) Plot the stacked bars ------------------------------------------------
bar_width = 0.45
x = np.arange(len(summary_df['Model']))  # one x position per model row
gap = 0  # no horizontal offset: a single stack per model is drawn in this figure

fig, ax = plt.subplots(figsize=(10, 8))

# Colors of the three segments of the "gold has term" stack
color_with_correct   = '#66c2a5'
color_with_wrong     = '#fc8d62'
color_with_nopred    = '#fef0d9'
# Colors for a "gold has no term" stack; defined but not used in this figure
color_no_correct     = '#ccece6'
color_no_incorrect   = '#fddbc7'

# Stack for rows where the gold has a term: Correct at the bottom, then Wrong,
# then NoPred on top.
# NOTE: the label= values only become visible through a legend, and no
# ax.legend() call follows in this cell.
ax.bar(x - gap, summary_df['With_Term_Correct'],  bar_width,
       label='Correct (with term)', color=color_with_correct)
ax.bar(x - gap, summary_df['With_Term_Wrong'],    bar_width,
       bottom=summary_df['With_Term_Correct'],
       label='Type I error', color=color_with_wrong)
ax.bar(x - gap, summary_df['With_Term_NoPred'],   bar_width,
       bottom=summary_df['With_Term_Correct'] + summary_df['With_Term_Wrong'],
       label='Type II error', color=color_with_nopred)


# Axes and Titles
ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Number of Samples', fontsize=12)
ax.set_title('Outcome Decomposition of Model Predictions (GC)', fontsize=16)
ax.set_xticks(x)
ax.set_xticklabels(summary_df['Model'], fontsize=12)
ax.tick_params(axis='y', labelsize=12)

# y-axis range: 12% headroom above the tallest total.
# No_Term_Total takes part in the maximum even though only the with-term stack
# is drawn above.
max_height_with = summary_df['With_Term_Total'].max()
max_height_no   = summary_df['No_Term_Total'].max()
max_height = max(max_height_with, max_height_no)
ax.set_ylim(0, max_height * 1.12)

# ---- 4) Integer percentage labels (largest-remainder method, so a stack totals 100%) ----
# Segments whose share of the stack is below this fraction get their label
# placed above the segment instead of centred inside it.
label_min_frac = 0.03


def int_percentages(parts, total):
    """Convert counts into integer percentages using the largest-remainder method.

    Each share ``part / total * 100`` is floored; the percentage points still
    missing to reach 100 are then handed out one by one to the parts with the
    largest fractional remainders (ties go to the earlier part).

    Args:
        parts: Sequence of counts, one per segment.
        total: Denominator shared by all parts.

    Returns:
        A list of ints with one entry per part. When ``total <= 0`` every
        entry is 0.
    """
    if total <= 0:
        # One zero per part, whatever the number of parts is
        return [0 for _ in parts]

    shares = np.array(parts, dtype=float) / float(total) * 100.0
    floors = np.floor(shares)
    remain = int(100 - floors.sum())
    if remain > 0:
        # A stable sort keeps the tie-break deterministic
        order = np.argsort(-(shares - floors), kind='stable')
        floors[order[:remain]] += 1
    return floors.astype(int).tolist()

# Write an integer-percentage label on every non-empty segment of each stack
for i in range(len(summary_df)):
    with_total = float(summary_df['With_Term_Total'][i])
    if with_total <= 0:
        # Nothing was drawn for this model, so there is nothing to label
        continue

    # Segment heights from bottom to top: Correct, Wrong, NoPred
    heights = [
        float(summary_df[col][i])
        for col in ('With_Term_Correct', 'With_Term_Wrong', 'With_Term_NoPred')
    ]
    # Each segment starts where the segments below it end
    bottoms = [0.0, heights[0], heights[0] + heights[1]]

    # Integer percentages from int_percentages()
    int_labels = [f"{p}%" for p in int_percentages(heights, with_total)]

    x_left = x[i] - gap
    for b, h, lab in zip(bottoms, heights, int_labels):
        if h <= 0:
            continue
        if h / with_total >= label_min_frac:
            # Large enough segment: centre the label inside it
            ax.text(x_left, b + h / 2.0, lab, ha='center', va='center', fontsize=12)
        else:
            # Thin segment: put the label just above its top edge
            ax.text(x_left, b + h + 1, lab, ha='center', va='bottom', fontsize=12)

plt.tight_layout()
plt.subplots_adjust(bottom=0.25)
plt.savefig("5-1.pdf", format="pdf", bbox_inches="tight")
plt.show()


In [None]:
# 2. Model-to-column mapping: readable model name -> prediction column.
# The two tuples are paired position by position.
_model_display_names = ('GenOMA', 'PhenoTagger', 'PhenoBERT', 'MetaMap', 'cTAKES', 'GPT-5')
_model_pred_columns = ('agent_CUI', 'phetag_CUI', 'BERT_CUI', 'meta_CUI', 'ctakes_CUI', 'GPT5_CUI')
model_columns = dict(zip(_model_display_names, _model_pred_columns))

In [None]:
from matplotlib.colors import LinearSegmentedColormap
import pandas as pd

# Canonical null representation: a missing gold label becomes the sentinel
# string 'None'. Cells that are blank after stripping whitespace are mapped to
# the same sentinel, so they count as "no gold term" here just as empty strings
# do when the per-model summary counts are built.
df['true_CUI'] = (
    df['true_CUI']
    .fillna('None')
    .astype(str)
    .str.strip()
    .replace('', 'None')
)

# Per-model metric dictionaries, keyed by readable model name
results = {}

def safe_div(n, d):
    """Return ``n / d``, or 0.0 when the denominator ``d`` is not positive."""
    if d > 0:
        return n / d
    return 0.0

for model_name, pred_col in model_columns.items():
    if pred_col not in df.columns:
        print(f"Column '{pred_col}' not found, skipping {model_name}")
        continue

    # Normalize predictions in place: NaN and cells that are blank after
    # stripping both become the sentinel 'None'. Without the blank mapping an
    # empty-string prediction would be counted as a (wrong) prediction, i.e. a
    # false positive, although it is "no prediction" everywhere else.
    df[pred_col] = (
        df[pred_col]
        .fillna('None')
        .astype(str)
        .str.strip()
        .replace('', 'None')
    )
    y_true = df['true_CUI']
    y_pred = df[pred_col]

    has_gold = y_true != 'None'
    has_pred = y_pred != 'None'
    is_match = y_pred == y_true

    # TP: correct code when gold exists
    tp = (is_match & has_gold).sum()

    # FP: (1) wrong code when gold exists OR (2) any code when gold is empty
    fp_wrong_when_gold = (has_gold & has_pred & ~is_match).sum()
    fp_pred_when_empty = (~has_gold & has_pred).sum()
    fp = fp_wrong_when_gold + fp_pred_when_empty

    # FN: predicted nothing when gold exists
    fn = (~has_pred & has_gold).sum()

    # TNs (gold empty and nothing predicted) are excluded from the headline
    # metrics by design, so they are not counted here.

    precision = safe_div(tp, tp + fp)
    recall    = safe_div(tp, tp + fn)
    f1        = safe_div(2 * precision * recall, (precision + recall))

    # Mapping accuracy (positives-only): excludes TNs
    mapping_accuracy = safe_div(tp, tp + fp + fn)

    results[model_name] = {
        'tp': int(tp),
        'fp': int(fp),
        'fn': int(fn),
        'precision': round(precision, 4),
        'recall': round(recall, 4),
        'f1': round(f1, 4),
        'mapping_accuracy': round(mapping_accuracy, 4)
    }

# One row per model (the dict of dicts is transposed so models become the index)
results_df = pd.DataFrame(results).T
print(results_df)

In [None]:
def wilson_ci(k: int, n: int, z: float = 1.96):
    """Wilson score interval for a binomial proportion.

    Args:
        k: Number of successes.
        n: Number of trials.
        z: Normal quantile; 1.96 gives a 95% interval.

    Returns:
        ``(p_hat, lo, hi)`` with the bounds clamped to [0, 1].
        When ``n == 0`` all three values are NaN.
    """
    if n == 0:
        nan = float('nan')
        return nan, nan, nan

    p_hat = k / n
    z_sq = z * z
    scale = 1.0 + z_sq / n
    midpoint = (p_hat + z_sq / (2 * n)) / scale
    spread = (z / scale) * math.sqrt(p_hat * (1 - p_hat) / n + z_sq / (4 * n * n))
    return p_hat, max(0.0, midpoint - spread), min(1.0, midpoint + spread)

# One output row per model: counts, point estimates and Wilson intervals for
# accuracy, precision and recall, each with the denominator it is based on.
rows = []
for model, r in results_df.iterrows():
    tp, fp, fn = (int(r[count]) for count in ('tp', 'fp', 'fn'))

    # Denominators (aligned with the metric definitions behind results_df)
    acc_n = tp + fp + fn    # accuracy denominator (positives-only)
    prec_n = tp + fp        # precision denominator (predicted positives)
    rec_n = tp + fn         # recall denominator (actual positives)

    # Wilson CI; the first element of each triple is the point estimate
    acc, acc_lo, acc_hi = wilson_ci(tp, acc_n)
    prec, prec_lo, prec_hi = wilson_ci(tp, prec_n)
    rec, rec_lo, rec_hi = wilson_ci(tp, rec_n)

    row = {"model": model, "tp": tp, "fp": fp, "fn": fn}
    # Point estimates are recomputed from the counts (to avoid accumulated
    # rounding error) instead of reusing the rounded values in results_df
    row.update({
        "accuracy": acc, "accuracy_CI_low": acc_lo, "accuracy_CI_high": acc_hi, "acc_n": acc_n,
    })
    row.update({
        "precision": prec, "precision_CI_low": prec_lo, "precision_CI_high": prec_hi, "prec_n": prec_n,
    })
    row.update({
        "recall": rec, "recall_CI_low": rec_lo, "recall_CI_high": rec_hi, "rec_n": rec_n,
    })
    rows.append(row)

metrics_with_ci = pd.DataFrame(rows).set_index("model")

# Pretty print (keep 3 decimals; change to .round(2) if preferred)
cols_to_round = [
    f"{metric}{suffix}"
    for metric in ("accuracy", "precision", "recall")
    for suffix in ("", "_CI_low", "_CI_high")
]
metrics_with_ci[cols_to_round] = metrics_with_ci[cols_to_round].round(3)

print(metrics_with_ci)

# If you want to merge with original results_df (won't overwrite point estimates)
# merged = results_df.join(metrics_with_ci, how="left", rsuffix="_wilson")
# print(merged)


In [None]:
from matplotlib.ticker import PercentFormatter  # New addition

# Rebuild results_df from the `results` dict, this time with the model name as
# a regular 'Model' column rather than as the index.
results_df = (
    pd.DataFrame(results)
    .T
    .reset_index()
    .rename(columns={'index': 'Model'})
)

# Custom colors: green shades from dark to light (soft)
green_shades = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']

# Manual X coordinate positioning: one bar per row of results_df
bar_width = 0.6
x = np.arange(len(results_df))

fig, ax = plt.subplots(figsize=(6, 5))

# Plot bar chart
bars = ax.bar(x, results_df['mapping_accuracy'], width=bar_width, color=green_shades)

# Add accuracy values (percentage) slightly above each bar
for bar in bars:
    height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width()/2
    ax.text(bar_center, height + 0.02, f'{height:.2%}',
            ha='center', va='bottom', fontsize=12)

# Title and axis labels
ax.set_title('Model Accuracy Comparison (GC)', fontsize=16)
ax.set_xlabel('Model', fontsize=10)
ax.set_ylabel('Accuracy', fontsize=10)

# One tick per model, labelled with the model name
ax.set_xticks(x)
ax.set_xticklabels(results_df['Model'], fontsize=9)

# Y-axis: fractions shown as percentages, with a little headroom above 100%
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))
ax.set_ylim(0, 1.05)
ax.tick_params(axis='y', labelsize=10)

# Remove extra borders
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.tight_layout()
plt.savefig("3-1.pdf", format="pdf", pad_inches=0.0, bbox_inches="tight")
plt.show()

In [None]:
# Models and performance data.
# NOTE(review): these scores are hard-coded rather than read from the computed
# metrics; confirm they still match the latest evaluation run.
models = ['GenOMA', 'PhenoTagger', 'PhenoBERT', 'MetaMap', 'cTAKES', 'GPT-5']
precision = [0.9144, 0.8235, 0.8105, 0.6012, 0.3583, 0.5281]
recall    = [1.0000, 0.8917, 0.7848, 0.8033, 1.0000, 0.9126]
# NOTE: this list reuses the name of sklearn.metrics.f1_score imported at the
# top of the file, so the function is no longer reachable under that name
# after this cell runs. The name is kept because other cells may rely on it.
f1_score  = [0.9553, 0.8563, 0.7974, 0.6877, 0.5276, 0.6690]

# Metric names (one bar group per metric) and their per-model values
metrics = ['Precision', 'Recall', 'F1 Score']
metric_values = [precision, recall, f1_score]

# Axis setup
x = np.arange(len(metrics))  # [0, 1, 2]
bar_width = 0.14

# Professional palette: 6 green shades from dark to light
colors = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']

# Figure style
fig, ax = plt.subplots(figsize=(10, 5.5))
plt.rcParams.update({'font.size': 12})

# Plot bars for each model: model i is shifted by i bar widths inside each group
for i, model in enumerate(models):
    values = [metric[i] for metric in metric_values]
    bars = ax.bar(x + i * bar_width, values, bar_width, label=model, color=colors[i])

    # Labels above bars
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height:.2f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height + 0.01),
                    ha='center', va='bottom',
                    fontsize=10, color='black')

# Axes and title
ax.set_ylabel('Score', fontsize=14)
ax.set_title('Model Evaluation Metric Comparison (GC)', fontsize=16, pad=15)
# Bars of a group are centred at x + i * bar_width for i = 0 .. len(models) - 1,
# so the middle of the group is x + (len(models) - 1) / 2 * bar_width. The
# former fixed offset of 1.5 bar widths only centres a group of four bars.
ax.set_xticks(x + (len(models) - 1) / 2 * bar_width)
ax.set_xticklabels(metrics, fontsize=13)
ax.set_ylim(0, 1.05)
ax.yaxis.grid(True, linestyle='--', alpha=0.6)

plt.tight_layout()
plt.savefig("4-1.pdf", format="pdf", bbox_inches="tight")
plt.show()


In [None]:
# Columns compared by the McNemar test
GOLD_COL = "true_CUI"       # Gold standard
GEN_COL  = "agent_CUI"      # GenOMA
BASE_COL = "phetag_CUI"     # PhenoTagger (strongest baseline)


def _norm_code(x):
    """Normalize one code cell for comparison.

    Missing values (per ``pd.isna``) and the strings "", "none", "nan" and
    "null" (compared case-insensitively after stripping whitespace) become
    the empty string. Every other value is returned stripped and upper-cased.
    """
    if pd.isna(x):
        return ""
    text = str(x).strip()
    return "" if text.lower() in {"", "none", "nan", "null"} else text.upper()

def compute_mcnemar_counts(
    df: pd.DataFrame,
    gold_col=GOLD_COL,
    gen_col=GEN_COL,
    base_col=BASE_COL,
    include_empty_gold: bool = False
):
    """Build the paired 2x2 agreement table for two systems against the gold column.

    All three columns are normalized with ``_norm_code`` before comparison; a
    prediction is correct when its normalized value equals the normalized gold.

    Args:
        df: Frame containing the three columns.
        gold_col: Gold-standard column name.
        gen_col: Column of the first system (GenOMA by default).
        base_col: Column of the second system (the baseline).
        include_empty_gold: When True every row is compared, so an empty
            prediction on an empty gold counts as correct. When False only
            rows with a non-empty gold are counted.

    Returns:
        Dict with "a" (both correct), "b" (first correct, baseline wrong),
        "c" (first wrong, baseline correct), "d" (both wrong) and "N"
        (number of rows compared).
    """
    gold, gen_pred, base_pred = (
        df[col].map(_norm_code) for col in (gold_col, gen_col, base_col)
    )

    # Rows taking part in the comparison
    if include_empty_gold:
        mask = pd.Series(True, index=df.index)
    else:
        mask = (gold != "")

    gen_ok = (gen_pred == gold) & mask
    base_ok = (base_pred == gold) & mask
    gen_bad = ~gen_ok
    base_bad = ~base_ok

    # The "wrong" flags are also true outside the mask, hence the extra & mask
    return {
        "a": int((gen_ok & base_ok & mask).sum()),
        "b": int((gen_ok & base_bad & mask).sum()),
        "c": int((gen_bad & base_ok & mask).sum()),
        "d": int((gen_bad & base_bad & mask).sum()),
        "N": int(mask.sum()),
    }

def mcnemar_exact_p(b: int, c: int) -> float:
    """Two-sided exact (binomial) McNemar p-value from the discordant counts.

    Args:
        b: Pairs where only the first system is correct.
        c: Pairs where only the second system is correct.

    Returns:
        Twice the lower binomial tail at ``min(b, c)`` out of ``b + c`` trials
        with success probability 1/2, capped at 1.0. Returns 1.0 when there
        are no discordant pairs.
    """
    discordant = b + c
    if not discordant:
        return 1.0
    smaller = b if b < c else c
    # Exact integer count of the outcomes in the lower tail
    lower_tail = sum(comb(discordant, i) for i in range(smaller + 1))
    return min(1.0, 2 * (lower_tail / (2 ** discordant)))

# ==== Computation ====
# Set include_empty_gold=True to include rows with empty true_CUI
counts = compute_mcnemar_counts(df, include_empty_gold=False)
a, b, c, d, N = (counts[cell] for cell in ("a", "b", "c", "d", "N"))

# Only the discordant cells b and c enter the exact test
p_exact = mcnemar_exact_p(b, c)

print(
    f"N={N}",
    f"a={a}, b={b}, c={c}, d={d}",
    f"McNemar exact binomial (two-sided) p = {p_exact:.4g}",
    sep="\n",
)

In [None]:
# 1) Assemble the "long table": one (Model, LLM_Score) row per sample and model
#    for the 6 models; rows whose score is NaN are dropped.
# NOTE(review): the title below says DS-PALS, but this cell sits in the GC
# section and reads whichever `df` is currently loaded. 'ctake_LLM_Score' also
# follows the DS-PALS column spelling ('ctake_CUI') rather than the GC one
# ('ctakes_CUI') -- confirm which dataset this cell is meant for.
df_long = pd.concat([
    pd.DataFrame({'Model': 'GenOMA',     'LLM_Score': df['agent_LLM_Score']}),
    pd.DataFrame({'Model': 'PhenoTagger', 'LLM_Score': df['phetag_LLM_Score']}),
    pd.DataFrame({'Model': 'PhenoBERT',   'LLM_Score': df['BERT_LLM_Score']}),
    pd.DataFrame({'Model': 'MetaMap',     'LLM_Score': df['meta_LLM_Score']}),
    pd.DataFrame({'Model': 'cTAKES',      'LLM_Score': df['ctake_LLM_Score']}),
    pd.DataFrame({'Model': 'GPT-5',        'LLM_Score': df['GPT5_LLM_Score']}),
], ignore_index=True).dropna(subset=['LLM_Score'])

# 2) Mean and standard deviation of the score for each model
stats = (df_long
         .groupby('Model', as_index=False)['LLM_Score']
         .agg(mean='mean', std='std'))

# Sort by mean from high to low (delete this line if not needed)
stats = stats.sort_values('mean', ascending=False)

models = stats['Model'].tolist()
means  = stats['mean'].to_numpy()
stds   = stats['std'].fillna(0).to_numpy()   # When there is only one sample, std may be NaN, set to 0

# 3) Plotting
fig, ax = plt.subplots(figsize=(9.5, 5.6))

# Optional: Soft green 6-level gradient (dark→light)
green_shades = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']

# Bars show the mean; error bars (yerr) show one standard deviation
bars = ax.bar(models, means, yerr=stds, capsize=6, linewidth=0, color=green_shades[:len(models)])

# 4) Value labels placed above the bar plus its error bar, with a small offset.
#    The y-limits are read here, before set_ylim() below changes them, so the
#    offset is 1.5% of the axis range as it is at this point.
ymin, ymax = ax.get_ylim()
offset = 0.015 * (ymax - ymin)
for rect, mean, std in zip(bars, means, stds):
    top = rect.get_height() + std + offset
    ax.text(rect.get_x() + rect.get_width()/2, top,
            f'{mean:.3f}',
            ha='center', va='bottom', fontsize=10)

# 5) Axes and Styles
ax.set_ylabel('Mean LLM Similarity Score', fontsize=12)
ax.set_title('Mean LLM Score with Standard Deviation (DS-PALS)', fontsize=14, pad=10)
ax.set_ylim(0, max(means + stds) * 1.15)       # 15% headroom above the tallest bar + error bar
ax.yaxis.grid(True, linestyle='--', alpha=0.6)
ax.set_axisbelow(True)                         # draw the grid behind the bars
ax.tick_params(axis='x', labelrotation=0)      # keep the model names horizontal

# 6) Legend (can be used if required)
# ax.legend(handles=bars, labels=models, title="Model", frameon=False, fontsize=10, title_fontsize=11)

plt.tight_layout()
plt.show()

# Evaluation for DS-PALS dataset

In [None]:
# Path to the Excel workbook holding the DS-PALS evaluation data
excel_file_path = "1result.xlsx"

# Load the workbook with pandas' default settings. Nothing is skipped or
# filtered at this point: no rows are dropped and no column subset is selected.
# This rebinds `df`, replacing the previously loaded frame.
df = pd.read_excel(excel_file_path)

# Print the first rows as a quick sanity check of the loaded columns
print(df.head())


In [None]:
# Prediction column -> friendlier model name.
# The insertion order of this mapping is the order in which models are evaluated.
model_name_map = dict(
    agent_CUI='GenOMA',
    phetag_CUI='PhenoTagger',
    BERT_CUI='PhenoBERT',
    meta_CUI='MetaMap',
    ctake_CUI='cTAKES',
    GPT5_CUI='GPT-5',
)

# Model (prediction) columns, taken from the mapping keys in the same order
models = list(model_name_map)


In [None]:
# Per-model outcome counts, kept separately for rows whose gold standard has a
# term ("With_Term_*") and rows whose gold standard is empty ("No_Term_*").
results = {
    'Model': [],
    'With_Term_Correct': [],
    'With_Term_Wrong': [],     # Predicted but mismatched
    'With_Term_NoPred': [],    # No prediction made
    'With_Term_Total': [],
    'No_Term_Correct': [],
    'No_Term_Incorrect': [],   # Should be empty but predicted something
    'No_Term_Total': []
}

# The gold-standard split is identical for every model, so it is computed once
# here instead of re-filtering the whole frame on each loop iteration.
with_term = df[df['true_CUI'].notna() & (df['true_CUI'] != '')]
no_term   = df[df['true_CUI'].isna() | (df['true_CUI'] == '')]
with_term_total = len(with_term)
no_term_total   = len(no_term)

for model in models:
    # Fall back to the raw column name when no friendly name is registered
    model_name = model_name_map.get(model, model)

    # With-term section
    with_term_correct = (with_term[model] == with_term['true_CUI']).sum()
    with_term_nopred  = (with_term[model].isna() | (with_term[model] == '')).sum()
    # Everything that is neither correct nor missing was predicted but mismatched
    with_term_wrong   = with_term_total - with_term_correct - with_term_nopred

    # No-term section: an empty prediction is the correct outcome
    no_term_correct   = (no_term[model].isna() | (no_term[model] == '')).sum()
    no_term_incorrect = no_term_total - no_term_correct

    # Save results
    results['Model'].append(model_name)
    results['With_Term_Correct'].append(with_term_correct)
    results['With_Term_Wrong'].append(with_term_wrong)
    results['With_Term_NoPred'].append(with_term_nopred)
    results['With_Term_Total'].append(with_term_total)
    results['No_Term_Correct'].append(no_term_correct)
    results['No_Term_Incorrect'].append(no_term_incorrect)
    results['No_Term_Total'].append(no_term_total)

# One row per model, columns in the key order of `results`
summary_df = pd.DataFrame(results)
print(summary_df)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import warnings

# ---- 1) Aggregate error counts (total minus correct) for both gold groups ----
summary_df['With_Term_Incorrect'] = summary_df['With_Term_Total'].sub(summary_df['With_Term_Correct'])
summary_df['No_Term_Incorrect'] = summary_df['No_Term_Total'].sub(summary_df['No_Term_Correct'])

# ---- 2) Split With_Term_Incorrect into Wrong / NoPred ------------------------
# Which optional columns does the summary already provide?
have_wrong = 'With_Term_Wrong' in summary_df.columns
have_nopred = 'With_Term_NoPred' in summary_df.columns
have_predicted = 'With_Term_Predicted' in summary_df.columns

# Only derive the split when at least one of the two columns is missing
if not (have_wrong and have_nopred):
    if have_predicted:
        # Wrong = Predicted - Correct; NoPred = Total - Predicted
        summary_df['With_Term_Wrong'] = (
            summary_df['With_Term_Predicted'] - summary_df['With_Term_Correct']
        ).clip(lower=0)
        summary_df['With_Term_NoPred'] = (
            summary_df['With_Term_Total'] - summary_df['With_Term_Predicted']
        ).clip(lower=0)
    elif have_nopred:
        # Only NoPred is known: the remaining errors are wrong predictions
        summary_df['With_Term_Wrong'] = (
            summary_df['With_Term_Incorrect'] - summary_df['With_Term_NoPred']
        ).clip(lower=0)
    elif have_wrong:
        # Only Wrong is known: the remaining errors are missing predictions
        summary_df['With_Term_NoPred'] = (
            summary_df['With_Term_Incorrect'] - summary_df['With_Term_Wrong']
        ).clip(lower=0)
    else:
        # Nothing to split on: warn and count every error as a wrong prediction
        warnings.warn(
            "Unable to infer With_Term_Wrong / With_Term_NoPred; treating all errors as Wrong. "
            "Consider providing With_Term_Predicted or With_Term_NoPred in the summary."
        )
        summary_df['With_Term_Wrong'] = summary_df['With_Term_Incorrect'].copy()
        summary_df['With_Term_NoPred'] = 0

# ---- 3) Backfill so that Correct + Wrong + NoPred == With_Term_Total ---------
nopred_clipped = summary_df['With_Term_NoPred'].clip(lower=0)
summary_df['With_Term_NoPred'] = nopred_clipped
summary_df['With_Term_Wrong'] = (
    summary_df['With_Term_Total'] - summary_df['With_Term_Correct'] - nopred_clipped
).clip(lower=0)

# (Optional) Assertion to ensure the decomposition holds
decomposed_total = (
    summary_df['With_Term_Correct']
    + summary_df['With_Term_Wrong']
    + summary_df['With_Term_NoPred']
)
assert (
    decomposed_total == summary_df['With_Term_Total']
).all(), "Decomposition does not sum to With_Term_Total."

# Same consistency for the no-term bar: incorrect = total - correct, never negative
no_term_errors = summary_df['No_Term_Total'] - summary_df['No_Term_Correct']
summary_df['No_Term_Incorrect'] = no_term_errors.clip(lower=0)

# ---- 4) Plot ----------------------------------------------------------------
bar_width = 0.35
x = np.arange(len(summary_df['Model']))  # one x position per model row
gap = bar_width * 0.6  # horizontal shift of the two stacks to either side of the tick

fig, ax = plt.subplots(figsize=(10, 8))

# Colors: three segments of the with-term stack, two of the no-term stack
color_with_correct   = '#66c2a5'
color_with_wrong     = '#fc8d62'
color_with_nopred    = '#fef0d9'
color_no_correct     = '#ccece6'
color_no_incorrect   = '#fddbc7'

# Left stack (gold has a term): Correct at the bottom, then Wrong, then NoPred
ax.bar(x - gap, summary_df['With_Term_Correct'],  bar_width,
       label='Correct (with term)', color=color_with_correct)
ax.bar(x - gap, summary_df['With_Term_Wrong'],    bar_width,
       bottom=summary_df['With_Term_Correct'],
       label='Type IA error', color=color_with_wrong)
ax.bar(x - gap, summary_df['With_Term_NoPred'],   bar_width,
       bottom=summary_df['With_Term_Correct'] + summary_df['With_Term_Wrong'],
       label='Type II error', color=color_with_nopred)

# Right stack (gold has no term): Correct at the bottom, Incorrect on top
ax.bar(x + gap, summary_df['No_Term_Correct'],    bar_width,
       label='Correct (no term)', color=color_no_correct)
ax.bar(x + gap, summary_df['No_Term_Incorrect'],  bar_width,
       bottom=summary_df['No_Term_Correct'],
       label='Type IB error', color=color_no_incorrect)

# Legend anchored just outside the right edge of the axes
ax.legend(loc='upper left', bbox_to_anchor=(1.02, 1),
          fancybox=False, shadow=False, ncol=1, fontsize=10)

# Axis labels, title and one tick per model
ax.set_xlabel('Model', fontsize=10)
ax.set_ylabel('Number of Samples', fontsize=10)
ax.set_title('Outcome Decomposition of Model Predictions (DS-PALS)', fontsize=14)
ax.set_xticks(x)
ax.set_xticklabels(summary_df['Model'], fontsize=10)
ax.tick_params(axis='y', labelsize=10)

# y-axis range: 12% headroom above the taller of the two stack totals
max_height_with = summary_df['With_Term_Total'].max()
max_height_no   = summary_df['No_Term_Total'].max()
ax.set_ylim(0, max(max_height_with, max_height_no) * 1.12)

# Segments whose share of the stack is below this fraction get their label
# placed above the segment instead of centred inside it.
label_min_frac = 0.03


def int_percentages(parts, total):
    """Convert counts into integer percentages using the largest-remainder method.

    Each share ``part / total * 100`` is floored; the percentage points still
    missing to reach 100 are then given, one each, to the parts with the
    largest fractional remainders.

    Args:
        parts: Iterable of counts, one per segment.
        total: Denominator shared by all parts.

    Returns:
        A list of ints with one entry per part; all zeros when ``total <= 0``.
    """
    if total <= 0:
        return [0 for _ in parts]

    pct = np.array(parts, dtype=float) / float(total) * 100.0
    whole = np.floor(pct)
    shortfall = int(100 - whole.sum())
    if shortfall > 0:
        # Ascending sort of (floor - share) puts the largest remainders first
        by_remainder = np.argsort(whole - pct)
        whole[by_remainder[:shortfall]] += 1
    return whole.astype(int).tolist()

def _label_stack(ax, x_pos, heights, total, min_frac, fontsize=10):
    """Write integer-percentage labels on the segments of one stacked bar.

    Args:
        ax: Matplotlib axes holding the bars.
        x_pos: Horizontal centre of the stack.
        heights: Segment heights from bottom to top.
        total: Denominator for the percentages; nothing is drawn if it is <= 0.
        min_frac: Segments with at least this share of ``total`` get a centred
            label; smaller ones get the label just above their top edge.
        fontsize: Font size of the labels.
    """
    if total <= 0:
        return
    labels = [f"{p}%" for p in int_percentages(heights, total)]
    bottom = 0.0
    for h, lab in zip(heights, labels):
        b = bottom
        bottom = bottom + h  # the next segment starts where this one ends
        if h <= 0:
            continue
        if h / total >= min_frac:
            ax.text(x_pos, b + h / 2.0, lab, ha='center', va='center', fontsize=fontsize)
        else:
            ax.text(x_pos, b + h + 1, lab, ha='center', va='bottom', fontsize=fontsize)


# Label both stacks of every model; the same placement rule applies to each,
# so it lives in _label_stack instead of being written out twice.
for i in range(len(summary_df)):
    # Left stack: gold has a term (Correct / Wrong / NoPred)
    with_heights = [
        float(summary_df[col][i])
        for col in ('With_Term_Correct', 'With_Term_Wrong', 'With_Term_NoPred')
    ]
    _label_stack(ax, x[i] - gap, with_heights,
                 float(summary_df['With_Term_Total'][i]), label_min_frac)

    # Right stack: gold has no term (Correct / Incorrect)
    no_heights = [
        float(summary_df[col][i])
        for col in ('No_Term_Correct', 'No_Term_Incorrect')
    ]
    _label_stack(ax, x[i] + gap, no_heights,
                 float(summary_df['No_Term_Total'][i]), label_min_frac)

plt.tight_layout()
plt.subplots_adjust(bottom=0.25)
plt.show()


In [None]:
# Define model-to-column mapping
model_columns = {
    'GenOMA': 'agent_CUI',
    'PhenoTagger': 'phetag_CUI',
    'PhenoBERT': 'BERT_CUI',
    'MetaMap': 'meta_CUI',
    'cTAKES': 'ctake_CUI',
    'GPT-5': 'GPT5_CUI',
}

In [None]:
from matplotlib.colors import LinearSegmentedColormap
import pandas as pd

# Normalize empty values: a missing gold label becomes the sentinel string
# 'None'. Cells that are blank after stripping whitespace are mapped to the
# same sentinel, so they count as "no gold term" here just as empty strings do
# when the per-model summary counts are built.
df['true_CUI'] = (
    df['true_CUI']
    .fillna('None')
    .astype(str)
    .str.strip()
    .replace('', 'None')
)

# Per-model metric dictionaries, keyed by readable model name
results = {}

def safe_div(n, d):
    """Divide ``n`` by ``d``; a non-positive denominator yields 0.0 instead."""
    if not d > 0:
        return 0.0
    return n / d

for model_name, pred_col in model_columns.items():
    if pred_col not in df.columns:
        print(f"Column '{pred_col}' not found, skipping {model_name}")
        continue

    # Normalize predictions in place: NaN and cells that are blank after
    # stripping both become the sentinel 'None'. Without the blank mapping an
    # empty-string prediction would be counted as a (wrong) prediction, i.e. a
    # false positive, although it is "no prediction" everywhere else.
    df[pred_col] = (
        df[pred_col]
        .fillna('None')
        .astype(str)
        .str.strip()
        .replace('', 'None')
    )
    y_true = df['true_CUI']
    y_pred = df[pred_col]

    has_gold = y_true != 'None'
    has_pred = y_pred != 'None'
    is_match = y_pred == y_true

    # TP: correct code when gold exists
    tp = (is_match & has_gold).sum()

    # FP: (1) wrong code when gold exists OR (2) any code when gold is empty
    fp_wrong_when_gold = (has_gold & has_pred & ~is_match).sum()
    fp_pred_when_empty = (~has_gold & has_pred).sum()
    fp = fp_wrong_when_gold + fp_pred_when_empty

    # FN: predicted nothing when gold exists
    fn = (~has_pred & has_gold).sum()

    # TNs (gold empty and nothing predicted) are excluded from the headline
    # metrics by design, so they are not counted here.

    precision = safe_div(tp, tp + fp)
    recall    = safe_div(tp, tp + fn)
    f1        = safe_div(2 * precision * recall, (precision + recall))

    # Mapping accuracy (positives-only): excludes TNs
    mapping_accuracy = safe_div(tp, tp + fp + fn)

    results[model_name] = {
        'tp': int(tp),
        'fp': int(fp),
        'fn': int(fn),
        'precision': round(precision, 4),
        'recall': round(recall, 4),
        'f1': round(f1, 4),
        'mapping_accuracy': round(mapping_accuracy, 4)
    }

# One row per model (the dict of dicts is transposed so models become the index)
results_df = pd.DataFrame(results).T
print(results_df)


In [None]:


def wilson_ci(k: int, n: int, z: float = 1.96):
    """
    Wilson score interval for k successes out of n trials.

    Returns the tuple (p_hat, lo, hi), where p_hat = k / n and lo/hi are the
    interval bounds clamped to [0, 1]. The default z=1.96 gives a 95% interval.
    When n == 0 all three values are NaN.
    """
    if n == 0:
        nan = float('nan')
        return nan, nan, nan

    p_hat = k / n
    z2 = z * z
    denom = 1.0 + z2 / n
    center = (p_hat + z2 / (2 * n)) / denom
    spread = p_hat * (1 - p_hat) / n + z2 / (4 * n * n)
    half = (z / denom) * math.sqrt(spread)
    return p_hat, max(0.0, center - half), min(1.0, center + half)

# One record per model: raw counts, point estimates and Wilson bounds.
rows = []
for model, r in results_df.iterrows():
    tp, fp, fn = int(r['tp']), int(r['fp']), int(r['fn'])

    # Denominators: positives-only accuracy, predicted positives, actual positives.
    acc_n, prec_n, rec_n = tp + fp + fn, tp + fp, tp + fn

    # Wilson CI (each call also returns the point estimate tp / n)
    acc, acc_lo, acc_hi = wilson_ci(tp, acc_n)
    prec, prec_lo, prec_hi = wilson_ci(tp, prec_n)
    rec, rec_lo, rec_hi = wilson_ci(tp, rec_n)

    record = {"model": model, "tp": tp, "fp": fp, "fn": fn}
    # Point estimates are recomputed from the counts rather than taken from
    # the already-rounded values stored in results_df.
    record.update({
        "accuracy": acc, "accuracy_CI_low": acc_lo, "accuracy_CI_high": acc_hi, "acc_n": acc_n,
    })
    record.update({
        "precision": prec, "precision_CI_low": prec_lo, "precision_CI_high": prec_hi, "prec_n": prec_n,
    })
    record.update({
        "recall": rec, "recall_CI_low": rec_lo, "recall_CI_high": rec_hi, "rec_n": rec_n,
    })
    rows.append(record)

metrics_with_ci = pd.DataFrame(rows).set_index("model")

# Display rounding: 3 decimals for estimates and bounds (use .round(2) if
# preferred); the count columns are left untouched.
cols_to_round = (
    ["accuracy", "accuracy_CI_low", "accuracy_CI_high"]
    + ["precision", "precision_CI_low", "precision_CI_high"]
    + ["recall", "recall_CI_low", "recall_CI_high"]
)
rounded = metrics_with_ci[cols_to_round].round(3)
metrics_with_ci[cols_to_round] = rounded

print(metrics_with_ci)

# To merge with the original results_df without overwriting its point estimates:
# merged = results_df.join(metrics_with_ci, how="left", rsuffix="_wilson")
# print(merged)


In [None]:
# Rebuild the per-model table from `results`, exposing the model name as a
# regular 'Model' column instead of the index.
results_df = (
    pd.DataFrame(results)
    .T
    .reset_index()
    .rename(columns={'index': 'Model'})
)

# Soft green palette, dark to light.
green_shades = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']

# Explicit bar positions and width.
x = np.arange(len(results_df))
bar_width = 0.6

fig, ax = plt.subplots(figsize=(6, 5))

# One bar per model, height = mapping accuracy.
bars = ax.bar(x, results_df['mapping_accuracy'], width=bar_width, color=green_shades)

# Percentage label slightly above each bar.
for bar in bars:
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    ax.text(label_x, height + 0.02, f'{height:.2%}',
            ha='center', va='bottom', fontsize=12)

# Title and axis labels.
ax.set_title('Model Accuracy Comparison (DS-PALS)', fontsize=16)
ax.set_xlabel('Model', fontsize=10)
ax.set_ylabel('Accuracy', fontsize=10)
ax.set_xticks(x)
ax.set_xticklabels(results_df['Model'], fontsize=9)

# Y axis shown as percentages of 1.0.
# NOTE(review): PercentFormatter (matplotlib.ticker) is not imported in this
# cell — confirm an earlier cell brings it into scope.
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))
ax.set_ylim(0, 1.05)
ax.tick_params(axis='y', labelsize=10)

# Drop the top and right frame lines.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Models and performance data.
# NOTE(review): these scores are typed in by hand rather than read from the
# computed metrics — re-check them whenever the metrics are recomputed.
models = ['GenOMA', 'PhenoTagger', 'PhenoBERT', 'MetaMap', 'cTAKES', 'GPT-5']
precision = [0.9310, 0.8585, 0.8659, 0.3333, 0.6949, 0.5083]
recall    = [0.9818, 0.8750, 0.6762, 0.8367, 0.8817, 0.8971]
# Named f1_scores (not f1_score) so the list does not shadow
# sklearn.metrics.f1_score imported at the top of the notebook.
f1_scores = [0.9558, 0.8667, 0.7594, 0.4767, 0.7773, 0.6489]

# Metric names (one bar group per metric)
metrics = ['Precision', 'Recall', 'F1 Score']
metric_values = [precision, recall, f1_scores]

# Axis setup
x = np.arange(len(metrics))  # [0, 1, 2]
bar_width = 0.14

# Palette: 6 green shades from dark to light, one per model
colors = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']

# Figure style
fig, ax = plt.subplots(figsize=(10, 5.5))
plt.rcParams.update({'font.size': 12})

# Plot bars for each model: model i is shifted i bar-widths to the right
for i, model in enumerate(models):
    values = [metric[i] for metric in metric_values]
    bars = ax.bar(x + i * bar_width, values, bar_width, label=model, color=colors[i])

    # Value labels just above the bars
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height:.2f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height + 0.01),
                    ha='center', va='bottom',
                    fontsize=10, color='black')

# Axes and title
ax.set_ylabel('Score', fontsize=14)
ax.set_title('Model Evaluation Metric Comparison (DS-PALS)', fontsize=16, pad=15)
# Centre each tick under its group: bars sit at offsets 0 .. len(models)-1
# bar-widths, so the middle is (len(models) - 1) / 2 (the previous fixed 1.5
# was only right for 4 models and left the labels off-centre for 6).
ax.set_xticks(x + (len(models) - 1) / 2 * bar_width)
ax.set_xticklabels(metrics, fontsize=13)
ax.set_ylim(0, 1.05)
ax.yaxis.grid(True, linestyle='--', alpha=0.6)

# Legend, placed outside the axes on the right
ax.legend(title="Model", bbox_to_anchor=(1.02, 1), loc='upper left',
          borderaxespad=0, fontsize=11, title_fontsize=12, frameon=False)

plt.tight_layout()
plt.show()


In [None]:
# 1) Long-form table: one row per (model, score); rows without a score are dropped.
score_columns = {
    'GenOMA': 'agent_LLM_Score',
    'PhenoTagger': 'phetag_LLM_Score',
    'PhenoBERT': 'BERT_LLM_Score',
    'MetaMap': 'meta_LLM_Score',
    'cTAKES': 'ctake_LLM_Score',
    'GPT-5': 'GPT5_LLM_Score',
}
frames = [
    pd.DataFrame({'Model': name, 'LLM_Score': df[column]})
    for name, column in score_columns.items()
]
df_long = pd.concat(frames, ignore_index=True).dropna(subset=['LLM_Score'])

# 2) Mean and standard deviation per model, best mean first.
per_model = df_long.groupby('Model', as_index=False)['LLM_Score']
stats = per_model.agg(mean='mean', std='std')
stats = stats.sort_values('mean', ascending=False)

models = stats['Model'].tolist()
means = stats['mean'].to_numpy()
# A missing std is drawn as a zero-length error bar.
stds = stats['std'].fillna(0).to_numpy()

# 3) Bars with error bars.
fig, ax = plt.subplots(figsize=(9.5, 5.6))

# Soft green 6-step gradient (dark -> light), trimmed to the number of models.
green_shades = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']

bars = ax.bar(models, means, yerr=stds, capsize=6, linewidth=0,
              color=green_shades[:len(models)])

# 4) Value labels sit above bar + error bar, lifted by 1.5% of the y-range
#    that the axes have at this point.
ymin, ymax = ax.get_ylim()
offset = 0.015 * (ymax - ymin)
for rect, mean, std in zip(bars, means, stds):
    top = rect.get_height() + std + offset
    center_x = rect.get_x() + rect.get_width() / 2
    ax.text(center_x, top, f'{mean:.3f}',
            ha='center', va='bottom', fontsize=10)

# 5) Axes and styles.
ax.set_title('Mean LLM Score with Standard Deviation (DS-PALS)', fontsize=14, pad=10)
ax.set_ylabel('Mean LLM Similarity Score', fontsize=12)
ax.set_ylim(0, max(means + stds) * 1.15)
ax.yaxis.grid(True, linestyle='--', alpha=0.6)
ax.set_axisbelow(True)
ax.tick_params(axis='x', labelrotation=0)

plt.tight_layout()
plt.show()





In [None]:

# Column roles for the paired comparison: gold standard, GenOMA, and
# PhenoTagger (the strongest baseline).
GOLD_COL, GEN_COL, BASE_COL = "true_CUI", "agent_CUI", "phetag_CUI"

def _norm_code(x):
    """Normalise one code cell.

    Missing values and the placeholders "", "none", "nan", "null" (compared
    case-insensitively after trimming) become ""; any other value is returned
    trimmed and upper-cased.
    """
    if not pd.isna(x):
        text = str(x).strip()
        if text.lower() not in {"", "none", "nan", "null"}:
            return text.upper()
    return ""

def compute_mcnemar_counts(
    df: pd.DataFrame,
    gold_col=GOLD_COL,
    gen_col=GEN_COL,
    base_col=BASE_COL,
    include_empty_gold: bool = False
):
    """Build the 2x2 paired-correctness table for two prediction columns.

    All three columns are normalised with _norm_code; a prediction counts as
    correct when it equals the normalised gold code.

    Args:
        df: table holding the gold and prediction columns.
        gold_col: name of the gold-standard column.
        gen_col: name of the first system's prediction column.
        base_col: name of the second (baseline) system's prediction column.
        include_empty_gold: when True every row is scored (two empty codes
            match); when False only rows with a non-empty gold code are.

    Returns:
        dict with "a" (both correct), "b" (first correct, baseline wrong),
        "c" (first wrong, baseline correct), "d" (both wrong) and "N"
        (number of rows scored).
    """
    gold = df[gold_col].map(_norm_code)
    gen_hit = df[gen_col].map(_norm_code) == gold
    base_hit = df[base_col].map(_norm_code) == gold

    # Rows that take part in the comparison.
    if include_empty_gold:
        keep = pd.Series(True, index=df.index)
    else:
        keep = gold != ""

    def tally(flags):
        # Count flagged rows among the rows being scored.
        return int((flags & keep).sum())

    return {
        "a": tally(gen_hit & base_hit),
        "b": tally(gen_hit & ~base_hit),
        "c": tally(~gen_hit & base_hit),
        "d": tally(~gen_hit & ~base_hit),
        "N": int(keep.sum()),
    }

def mcnemar_exact_p(b: int, c: int) -> float:
    """Two-sided exact (binomial) McNemar p-value from the discordant counts.

    Doubles the lower binomial tail up to min(b, c) out of b + c fair coin
    flips and caps the result at 1.0. Returns 1.0 when b + c == 0.
    """
    n = b + c
    if n == 0:
        return 1.0

    smaller = b if b < c else c
    cumulative = 0
    for i in range(smaller + 1):
        cumulative += comb(n, i)

    tail = cumulative / (2 ** n)
    doubled = 2 * tail
    return doubled if doubled < 1.0 else 1.0

# ==== Computation ====
# Pass include_empty_gold=True to also score rows whose true_CUI is empty.
counts = compute_mcnemar_counts(df, include_empty_gold=False)
a, b, c, d, N = (counts[key] for key in ("a", "b", "c", "d", "N"))
p_exact = mcnemar_exact_p(b, c)

# Report the sample size, the 2x2 cells and the p-value.
print(f"N={N}")
print(f"a={a}, b={b}, c={c}, d={d}")
print(f"McNemar exact binomial (two-sided) p = {p_exact:.4g}")

# Evaluation for XGS dataset

In [None]:
# Path to the XGS results workbook (an Excel file, despite the variable name)
csv_file_path = "2result.xlsx"

# Read the workbook with pandas defaults (no rows skipped, no column selection).
# This rebinds df, so the cells below operate on this file's data.
df = pd.read_excel(csv_file_path)

# Show the first few rows to verify
print(df.head())

In [None]:
# Readable display name for each prediction column.
model_name_map = {
    'agent_CUI': 'GenOMA',
    'phetag_CUI': 'PhenoTagger',
    'BERT_CUI': 'PhenoBERT',
    'meta_CUI': 'MetaMap',
    'ctake_CUI': 'cTAKES',
    'GPT5_CUI': 'GPT-5',
}

# Prediction columns to evaluate, in the order they appear in the mapping.
models = list(model_name_map)


In [None]:
# Column-wise accumulator: one list per output column, one entry per model.
results = {
    column: []
    for column in (
        'Model',
        'With_Term_Correct',
        'With_Term_Wrong',     # Predicted but mismatched
        'With_Term_NoPred',    # No prediction made
        'With_Term_Total',
        'No_Term_Correct',
        'No_Term_Incorrect',   # Should be empty but predicted something
        'No_Term_Total',
    )
}

for model in models:
    model_name = model_name_map.get(model, model)

    # Split the rows by whether the gold standard carries a term.
    gold = df['true_CUI']
    has_gold = gold.notna() & (gold != '')
    with_term = df[has_gold]
    no_term = df[~has_gold]

    preds_with = with_term[model]
    preds_none = no_term[model]

    # Gold has a term: exact match, no prediction, or (the rest) a wrong code.
    hits = (preds_with == with_term['true_CUI']).sum()
    skipped = (preds_with.isna() | (preds_with == '')).sum()

    # Gold has no term: staying empty is the correct outcome.
    silent = (preds_none.isna() | (preds_none == '')).sum()

    outcome = {
        'Model': model_name,
        'With_Term_Correct': hits,
        'With_Term_Wrong': len(with_term) - hits - skipped,
        'With_Term_NoPred': skipped,
        'With_Term_Total': len(with_term),
        'No_Term_Correct': silent,
        'No_Term_Incorrect': len(no_term) - silent,
        'No_Term_Total': len(no_term),
    }
    for column, value in outcome.items():
        results[column].append(value)

summary_df = pd.DataFrame(results)
print(summary_df)


In [None]:
# ---- 1) Derived columns -----------------------------------------------------
summary_df['With_Term_Incorrect'] = summary_df['With_Term_Total'].sub(summary_df['With_Term_Correct'])
summary_df['No_Term_Incorrect'] = summary_df['No_Term_Total'].sub(summary_df['No_Term_Correct'])

has_wrong = 'With_Term_Wrong' in summary_df.columns
has_nopred = 'With_Term_NoPred' in summary_df.columns
has_predicted = 'With_Term_Predicted' in summary_df.columns

# Only when the Wrong/NoPred split is incomplete: rebuild it from whatever
# related column is available.
if not (has_wrong and has_nopred):
    if has_predicted:
        # With a "predicted count" column: Wrong = Predicted - Correct; NoPred = Total - Predicted
        predicted = summary_df['With_Term_Predicted']
        summary_df['With_Term_Wrong'] = (predicted - summary_df['With_Term_Correct']).clip(lower=0)
        summary_df['With_Term_NoPred'] = (summary_df['With_Term_Total'] - predicted).clip(lower=0)
    elif has_nopred:
        summary_df['With_Term_Wrong'] = (
            summary_df['With_Term_Incorrect'] - summary_df['With_Term_NoPred']
        ).clip(lower=0)
    elif has_wrong:
        summary_df['With_Term_NoPred'] = (
            summary_df['With_Term_Incorrect'] - summary_df['With_Term_Wrong']
        ).clip(lower=0)
    else:
        # Nothing to split on: warn and book every error as Wrong.
        warnings.warn(
            "Unable to infer With_Term_Wrong / With_Term_NoPred from summary_df. "
            "Temporarily treat all errors as Wrong; consider providing With_Term_Predicted or With_Term_NoPred."
        )
        summary_df['With_Term_Wrong'] = summary_df['With_Term_Incorrect'].copy()
        summary_df['With_Term_NoPred'] = 0

# ---- 2) Backfill correction: enforce Correct+Wrong+NoPred == With_Term_Total ----
nopred = summary_df['With_Term_NoPred'].clip(lower=0)
summary_df['With_Term_NoPred'] = nopred
summary_df['With_Term_Wrong'] = (
    summary_df['With_Term_Total'] - summary_df['With_Term_Correct'] - nopred
).clip(lower=0)

# (Optional) Assertion check: the three parts must add up to the total.
parts_sum = (
    summary_df['With_Term_Correct']
    + summary_df['With_Term_Wrong']
    + summary_df['With_Term_NoPred']
)
assert (parts_sum == summary_df['With_Term_Total']).all(), "Decomposition does not sum to With_Term_Total."

# ---- 3) painting----------------------------------------------------------------
bar_width = 0.45
x = np.arange(len(summary_df['Model']))
gap = 0

fig, ax = plt.subplots(figsize=(10, 8))

# Segment colours
color_with_correct = '#66c2a5'
color_with_wrong = '#fc8d62'
color_with_nopred = '#fef0d9'
# Colours reserved for a right-hand (no-term) column; not used in this figure
color_no_correct = '#ccece6'
color_no_incorrect = '#fddbc7'

# Left column (gold has term): stack the three outcomes bottom-up, each
# segment starting where the previous ones end.
segments = [
    ('With_Term_Correct', 'Correct (with term)', color_with_correct),
    ('With_Term_Wrong', 'Type I error', color_with_wrong),
    ('With_Term_NoPred', 'Type II error', color_with_nopred),
]
stack_base = None  # None = matplotlib's default baseline for the first segment
for column, legend_label, shade in segments:
    ax.bar(x - gap, summary_df[column], bar_width,
           bottom=stack_base, label=legend_label, color=shade)
    if stack_base is None:
        stack_base = summary_df[column]
    else:
        stack_base = stack_base + summary_df[column]

# Legend to the right of the axes
ax.legend(loc='upper left', bbox_to_anchor=(1.02, 1),
          fancybox=False, shadow=False, ncol=1, fontsize=12)

# Axes and titles
ax.set_title('Outcome Decomposition of Model Predictions (XGS)', fontsize=16)
ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Number of Samples', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels(summary_df['Model'], fontsize=12)
ax.tick_params(axis='y', labelsize=12)

# Y range: tallest total (with-term or no-term) plus 12% headroom
max_height_with = summary_df['With_Term_Total'].max()
max_height_no = summary_df['No_Term_Total'].max()
max_height = max(max_height_with, max_height_no)
ax.set_ylim(0, max_height * 1.12)

# ---- 4) Integer percentage labels (maximum remainder method, ensuring totals 100%) --------------------------
# Segments covering at least this fraction of their bar get a centred label;
# smaller ones are labelled just above the segment.
label_min_frac = 0.03

def int_percentages(parts, total):
    """Return integer percentages of *parts* relative to *total*.

    Uses the largest-remainder method: every share is floored, then the
    points still missing to reach 100 are handed out, one each, to the
    entries with the largest fractional remainders (earlier entries win
    ties). The result therefore sums to exactly 100 whenever the parts add
    up to *total*.

    Args:
        parts: sequence of non-negative counts.
        total: the count that corresponds to 100%.

    Returns:
        A list of ints, one per entry of *parts*; all zeros when total <= 0.
    """
    if total <= 0:
        # One zero per part (previously hard-coded to exactly three entries).
        return [0] * len(parts)

    shares = np.asarray(parts, dtype=float) / float(total) * 100.0
    floored = np.floor(shares)
    remain = int(100 - floored.sum())
    if remain > 0:
        # Stable sort keeps tie-breaking deterministic for any input length.
        order = np.argsort(-(shares - floored), kind="stable")
        floored[order[:remain]] += 1
    return floored.astype(int).tolist()

# Write an integer percentage on every non-empty segment of every bar.
label_columns = ['With_Term_Total', 'With_Term_Correct', 'With_Term_Wrong', 'With_Term_NoPred']
for x_center, counts_row in zip(x, summary_df[label_columns].to_numpy(dtype=float)):
    with_total, with_correct, with_wrong, with_nopred = (float(v) for v in counts_row)

    # Nothing to label when the model has no with-term rows.
    if with_total <= 0:
        continue

    heights = [with_correct, with_wrong, with_nopred]
    # Integer percentages from the largest-remainder helper.
    int_labels = [f"{p}%" for p in int_percentages(heights, with_total)]

    x_left = x_center - gap
    running_bottom = 0.0
    for seg_height, lab in zip(heights, int_labels):
        seg_bottom = running_bottom
        running_bottom += seg_height
        if seg_height <= 0:
            continue

        if seg_height / with_total >= label_min_frac:
            # Large enough: centre the label inside the segment.
            y_text, va = seg_bottom + seg_height / 2.0, 'center'
        else:
            # Too thin: put the label one unit above the segment's top edge.
            y_text, va = seg_bottom + seg_height + 1, 'bottom'
        ax.text(x_left, y_text, lab, ha='center', va=va, fontsize=12)

plt.tight_layout()
plt.subplots_adjust(bottom=0.25)
plt.show()


In [None]:
# Display name -> prediction column, built by pairing the two sequences in order.
model_columns = dict(zip(
    ('GenOMA', 'PhenoTagger', 'PhenoBERT', 'MetaMap', 'cTAKES', 'GPT-5'),
    ('agent_CUI', 'phetag_CUI', 'BERT_CUI', 'meta_CUI', 'ctake_CUI', 'GPT5_CUI'),
))

In [None]:

# Canonicalise the gold column: missing cells become the literal 'None' and
# every value ends up as a whitespace-trimmed string.
df['true_CUI'] = (
    df['true_CUI']
    .fillna('None')
    .astype(str)
    .str.strip()
)

# Per-model metric dictionaries, keyed by display name.
results = dict()

def safe_div(n, d):
    """Divide n by d; a denominator that is not greater than zero yields 0.0."""
    if not d > 0:
        return 0.0
    return n / d

def _normalize_codes(codes):
    """Return *codes* as trimmed strings, with missing or blank cells set to 'None'."""
    cleaned = codes.fillna('None').astype(str).str.strip()
    # A whitespace-only cell strips to ''; without this step it would be
    # scored as a real code (a prediction or a gold term) instead of "nothing".
    return cleaned.mask(cleaned == '', 'None')


# Gold codes, normalised once; 'None' marks rows without a gold term.
y_true = _normalize_codes(df['true_CUI'])

for model_name, pred_col in model_columns.items():
    if pred_col not in df.columns:
        print(f"Column '{pred_col}' not found, skipping {model_name}")
        continue

    # The normalised predictions are stored back into df, as before.
    df[pred_col] = _normalize_codes(df[pred_col])
    y_pred = df[pred_col]

    # TP: correct code when gold exists
    tp = ((y_pred == y_true) & (y_true != 'None')).sum()

    # FP: (1) wrong code when gold exists OR (2) any code when gold is empty
    fp_wrong_when_gold = ((y_true != 'None') & (y_pred != 'None') & (y_pred != y_true)).sum()
    fp_pred_when_empty = ((y_true == 'None') & (y_pred != 'None')).sum()
    fp = fp_wrong_when_gold + fp_pred_when_empty

    # FN: predicted nothing when gold exists
    fn = ((y_pred == 'None') & (y_true != 'None')).sum()

    # TNs (gold empty, prediction empty) are excluded from the headline
    # metrics by design, so they are not counted here.

    precision = safe_div(tp, tp + fp)
    recall    = safe_div(tp, tp + fn)
    f1        = safe_div(2 * precision * recall, (precision + recall))

    # Mapping accuracy (positives-only): excludes TNs
    mapping_accuracy = safe_div(tp, tp + fp + fn)

    results[model_name] = {
        'tp': int(tp),
        'fp': int(fp),
        'fn': int(fn),
        'precision': round(precision, 4),
        'recall': round(recall, 4),
        'f1': round(f1, 4),
        'mapping_accuracy': round(mapping_accuracy, 4)
    }

# One row per model, one column per count/metric.
results_df = pd.DataFrame(results).T
print(results_df)


In [None]:
def wilson_ci(k: int, n: int, z: float = 1.96):
    """
    Wilson score interval for k successes out of n trials.

    Returns (p_hat, lo, hi) with p_hat = k / n and the bounds clamped to
    [0, 1]; z=1.96 corresponds to a 95% interval. For n == 0 the result is
    (nan, nan, nan).
    """
    if n == 0:
        return (float('nan'),) * 3

    p_hat = k / n
    z2 = z * z
    denom = 1.0 + z2 / n
    center = (p_hat + z2 / (2 * n)) / denom
    under_root = p_hat * (1 - p_hat) / n + z2 / (4 * n * n)
    half = (z / denom) * math.sqrt(under_root)

    lower = max(0.0, center - half)
    upper = min(1.0, center + half)
    return p_hat, lower, upper

# One record per model: raw counts, point estimates and Wilson bounds.
rows = []
for model, r in results_df.iterrows():
    tp, fp, fn = int(r['tp']), int(r['fp']), int(r['fn'])

    # Denominators: accuracy (tp+fp+fn), precision (tp+fp), recall (tp+fn).
    acc_n, prec_n, rec_n = tp + fp + fn, tp + fp, tp + fn

    # Wilson CI (each call also returns the point estimate tp / n)
    acc, acc_lo, acc_hi = wilson_ci(tp, acc_n)
    prec, prec_lo, prec_hi = wilson_ci(tp, prec_n)
    rec, rec_lo, rec_hi = wilson_ci(tp, rec_n)

    record = {"model": model, "tp": tp, "fp": fp, "fn": fn}
    # Point estimates are recomputed from the counts rather than taken from
    # the already-rounded values stored in results_df.
    record.update({
        "accuracy": acc, "accuracy_CI_low": acc_lo, "accuracy_CI_high": acc_hi, "acc_n": acc_n,
    })
    record.update({
        "precision": prec, "precision_CI_low": prec_lo, "precision_CI_high": prec_hi, "prec_n": prec_n,
    })
    record.update({
        "recall": rec, "recall_CI_low": rec_lo, "recall_CI_high": rec_hi, "rec_n": rec_n,
    })
    rows.append(record)

metrics_with_ci = pd.DataFrame(rows).set_index("model")

# Display rounding: 3 decimals for estimates and bounds (use .round(2) if
# preferred); the count columns are left untouched.
cols_to_round = (
    ["accuracy", "accuracy_CI_low", "accuracy_CI_high"]
    + ["precision", "precision_CI_low", "precision_CI_high"]
    + ["recall", "recall_CI_low", "recall_CI_high"]
)
rounded = metrics_with_ci[cols_to_round].round(3)
metrics_with_ci[cols_to_round] = rounded

print(metrics_with_ci)


In [None]:
# Rebuild the per-model table from `results`, exposing the model name as a
# regular 'Model' column instead of the index.
results_df = (
    pd.DataFrame(results)
    .T
    .reset_index()
    .rename(columns={'index': 'Model'})
)

# Soft green palette, dark to light.
green_shades = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']

# Explicit bar positions and width.
x = np.arange(len(results_df))
bar_width = 0.6

fig, ax = plt.subplots(figsize=(6, 5))

# One bar per model, height = mapping accuracy.
bars = ax.bar(x, results_df['mapping_accuracy'], width=bar_width, color=green_shades)

# Percentage label slightly above each bar.
for bar in bars:
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    ax.text(label_x, height + 0.02, f'{height:.2%}',
            ha='center', va='bottom', fontsize=12)

# Title and axis labels.
ax.set_title('Model Accuracy Comparison (XGS)', fontsize=16)
ax.set_xlabel('Model', fontsize=10)
ax.set_ylabel('Accuracy', fontsize=10)
ax.set_xticks(x)
ax.set_xticklabels(results_df['Model'], fontsize=9)

# Y axis shown as percentages of 1.0.
# NOTE(review): PercentFormatter (matplotlib.ticker) is not imported in this
# cell — confirm an earlier cell brings it into scope.
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))
ax.set_ylim(0, 1.05)
ax.tick_params(axis='y', labelsize=10)

# Drop the top and right frame lines.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Model and performance data (XGS dataset).
# NOTE(review): these scores are typed in by hand rather than read from the
# computed metrics — re-check them whenever the metrics are recomputed.
models = ['GenOMA', 'PhenoTagger', 'PhenoBERT', 'MetaMap', 'cTAKES', 'GPT-5']
precision = [0.9397, 0.8667, 0.8706, 0.5962, 0.7284, 0.6216]
recall    = [0.9732, 0.7290, 0.6852, 0.8052, 0.6082, 0.8961]
# Named f1_scores (not f1_score) so the list does not shadow
# sklearn.metrics.f1_score imported at the top of the notebook.
f1_scores = [0.9561, 0.7919, 0.7668, 0.6851, 0.6629, 0.7340]

# Metric names (one bar group per metric)
metrics = ['Precision', 'Recall', 'F1 Score']
metric_values = [precision, recall, f1_scores]

# Axis setup
x = np.arange(len(metrics))  # [0, 1, 2]
bar_width = 0.14

# Palette: 6 green shades from dark to light, one per model
colors = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']

# Figure style
fig, ax = plt.subplots(figsize=(10, 5.5))
plt.rcParams.update({'font.size': 12})

# Draw the bars for each model: model i is shifted i bar-widths to the right
for i, model in enumerate(models):
    values = [metric[i] for metric in metric_values]
    bars = ax.bar(x + i * bar_width, values, bar_width, label=model, color=colors[i])

    # Value labels just above the bars
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height:.2f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height + 0.01),
                    ha='center', va='bottom',
                    fontsize=10, color='black')

# Axes and title
ax.set_ylabel('Score', fontsize=14)
ax.set_title('Model Evaluation Metric Comparison (XGS)', fontsize=16, pad=15)
# Centre each tick under its group: bars sit at offsets 0 .. len(models)-1
# bar-widths, so the middle is (len(models) - 1) / 2 (the previous fixed 1.5
# was only right for 4 models and left the labels off-centre for 6).
ax.set_xticks(x + (len(models) - 1) / 2 * bar_width)
ax.set_xticklabels(metrics, fontsize=13)
ax.set_ylim(0, 1.05)
ax.yaxis.grid(True, linestyle='--', alpha=0.6)

# Legend, placed outside the axes on the right
ax.legend(title="Model", bbox_to_anchor=(1.02, 1), loc='upper left',
          borderaxespad=0, fontsize=11, title_fontsize=12, frameon=False)

plt.tight_layout()
plt.show()


In [None]:
# 1) Long-form table: one row per (model, score); rows without a score are dropped.
score_columns = {
    'GenOMA': 'agent_LLM_Score',
    'PhenoTagger': 'phetag_LLM_Score',
    'PhenoBERT': 'BERT_LLM_Score',
    'MetaMap': 'meta_LLM_Score',
    'cTAKES': 'ctake_LLM_Score',
    'GPT-5': 'GPT5_LLM_Score',
}
frames = [
    pd.DataFrame({'Model': name, 'LLM_Score': df[column]})
    for name, column in score_columns.items()
]
df_long = pd.concat(frames, ignore_index=True).dropna(subset=['LLM_Score'])

# 2) Mean and standard deviation per model, best mean first
#    (drop the sort_values line to keep the groupby order).
per_model = df_long.groupby('Model', as_index=False)['LLM_Score']
stats = per_model.agg(mean='mean', std='std')
stats = stats.sort_values('mean', ascending=False)

models = stats['Model'].tolist()
means = stats['mean'].to_numpy()
# A missing std is drawn as a zero-length error bar.
stds = stats['std'].fillna(0).to_numpy()

# 3) Bars with error bars.
fig, ax = plt.subplots(figsize=(9.5, 5.6))

# Green gradient (dark -> light), trimmed to the number of models.
green_shades = ['#00441b', '#006d2c', '#238b45', '#41ab5d', '#74c476', '#c7e9c0']

bars = ax.bar(models, means, yerr=stds, capsize=6, linewidth=0,
              color=green_shades[:len(models)])

# 4) Value labels sit above bar + error bar, lifted by 1.5% of the y-range
#    that the axes have at this point.
ymin, ymax = ax.get_ylim()
offset = 0.015 * (ymax - ymin)
for rect, mean, std in zip(bars, means, stds):
    top = rect.get_height() + std + offset
    center_x = rect.get_x() + rect.get_width() / 2
    ax.text(center_x, top, f'{mean:.3f}',
            ha='center', va='bottom', fontsize=10)

# 5) Axes and styles.
ax.set_title('Mean LLM Score with Standard Deviation (XGS)', fontsize=14, pad=10)
ax.set_ylabel('Mean LLM Similarity Score', fontsize=12)
ax.set_ylim(0, max(means + stds) * 1.15)
ax.yaxis.grid(True, linestyle='--', alpha=0.6)
ax.set_axisbelow(True)
ax.tick_params(axis='x', labelrotation=0)

plt.tight_layout()
plt.show()





In [None]:
# Columns compared in the paired test: the gold code and the two prediction
# columns (agent_CUI vs. phetag_CUI).
GOLD_COL, GEN_COL, BASE_COL = "true_CUI", "agent_CUI", "phetag_CUI"
def _norm_code(x):
    """Normalise one code cell.

    Missing values and the placeholders "", "none", "nan", "null" (compared
    case-insensitively after trimming) become ""; any other value is returned
    trimmed and upper-cased.
    """
    if not pd.isna(x):
        text = str(x).strip()
        if text.lower() not in {"", "none", "nan", "null"}:
            return text.upper()
    return ""

def compute_mcnemar_counts(
    df: pd.DataFrame,
    gold_col=GOLD_COL,
    gen_col=GEN_COL,
    base_col=BASE_COL,
    include_empty_gold: bool = False
):
    """Build the 2x2 paired-correctness table for two prediction columns.

    All three columns are normalised with _norm_code; a prediction counts as
    correct when it equals the normalised gold code.

    Args:
        df: table holding the gold and prediction columns.
        gold_col: name of the gold-standard column.
        gen_col: name of the first system's prediction column.
        base_col: name of the second (baseline) system's prediction column.
        include_empty_gold: when True every row is scored (two empty codes
            match); when False only rows with a non-empty gold code are.

    Returns:
        dict with "a" (both correct), "b" (first correct, baseline wrong),
        "c" (first wrong, baseline correct), "d" (both wrong) and "N"
        (number of rows scored).
    """
    gold = df[gold_col].map(_norm_code)
    gen_hit = df[gen_col].map(_norm_code) == gold
    base_hit = df[base_col].map(_norm_code) == gold

    # Rows that take part in the comparison.
    if include_empty_gold:
        keep = pd.Series(True, index=df.index)
    else:
        keep = gold != ""

    def tally(flags):
        # Count flagged rows among the rows being scored.
        return int((flags & keep).sum())

    return {
        "a": tally(gen_hit & base_hit),
        "b": tally(gen_hit & ~base_hit),
        "c": tally(~gen_hit & base_hit),
        "d": tally(~gen_hit & ~base_hit),
        "N": int(keep.sum()),
    }

def mcnemar_exact_p(b: int, c: int) -> float:
    """Two-sided exact (binomial) McNemar p-value from the discordant counts.

    Doubles the lower binomial tail up to min(b, c) out of b + c fair coin
    flips and caps the result at 1.0. Returns 1.0 when b + c == 0.
    """
    n = b + c
    if n == 0:
        return 1.0

    smaller = b if b < c else c
    cumulative = 0
    for i in range(smaller + 1):
        cumulative += comb(n, i)

    tail = cumulative / (2 ** n)
    doubled = 2 * tail
    return doubled if doubled < 1.0 else 1.0


# Paired comparison on rows that have a gold code
# (pass include_empty_gold=True to score every row).
counts = compute_mcnemar_counts(df, include_empty_gold=False)
a, b, c, d, N = (counts[key] for key in ("a", "b", "c", "d", "N"))
p_exact = mcnemar_exact_p(b, c)

# Report the sample size, the 2x2 cells and the p-value.
print(f"N={N}")
print(f"a={a}, b={b}, c={c}, d={d}")
print(f"McNemar p = {p_exact:.4g}")