In [None]:
import os, sys
import pandas as pd
import numpy as np
from pathlib import Path
import csv
import json
import subprocess
import re

# Display settings: render DataFrames in full instead of abbreviating them.
# Every option below is set to None, which removes the corresponding limit.
for _display_option in (
    "display.max_rows",      # show all rows
    "display.max_columns",   # show all columns
    "display.width",         # don't wrap to fit console width
    "display.max_colwidth",  # don't truncate column contents
):
    pd.set_option(_display_option, None)



In [None]:
# Folder that is scanned recursively for CSV files, and the path of the
# eval_summary.csv file located directly inside it.
BASE_DIR = Path("responses") / "cancer"
SUMMARY_CSV = BASE_DIR.joinpath("eval_summary.csv")

def parse_given_edges_tag(path: Path):
    """
    Read the given-edges percentage encoded in a file name.

    The file stem is searched for the first occurrence of `_gedge<digits>`,
    e.g. `responses_obs200_int3_shuf3_gedge20_anon_gpt-4o-mini`.

    Returns a 3-tuple (has_given_edges, given_edge_frac, given_edge_pct):
      has_given_edges: 1 if the tag is present, otherwise 0
      given_edge_frac: the percentage divided by 100 (float), or None without a tag
      given_edge_pct:  the integer that follows `_gedge`, or None without a tag
    """
    tag = re.search(r"_gedge(\d+)", path.stem)
    if tag is None:
        return 0, None, None

    percent = int(tag.group(1))
    return 1, percent / 100.0, percent

def count_nonempty(colname, rows):
    """Count the rows whose `colname` value is non-blank.

    `rows` is an iterable of dict-like records. A missing key, a falsy value
    (None, "") or a whitespace-only string all count as empty.
    """
    total = 0
    for record in rows:
        text = record.get(colname) or ""
        if text.strip():
            total += 1
    return total

def count_valid_flag(rows):
    """Count the rows whose "valid" entry is flagged as true.

    The value is converted with str() and stripped; only the exact spellings
    "1", "true" and "True" are accepted. A missing key counts as not valid.
    """
    accepted = ("1", "true", "True")
    hits = 0
    for record in rows:
        flag = str(record.get("valid", "")).strip()
        if flag in accepted:
            hits += 1
    return hits

def count_error_raw(rows):
    """Count the rows whose "raw_response" text contains the marker "[ERROR]".

    A missing or falsy "raw_response" is treated as an empty string.
    """
    flagged = [
        record
        for record in rows
        if "[ERROR]" in (record.get("raw_response") or "")
    ]
    return len(flagged)


# Results of the scan below:
#   incomplete_files: list of (path, reason) pairs
#   complete_files:   list of paths
#   file_stats:       path -> dict with n_rows, completed_rows, valid_flag_rows,
#                     error_raw_responses
incomplete_files = []
complete_files = []
file_stats = {}


if not BASE_DIR.exists():
    print(f"Base directory not found: {BASE_DIR.resolve()}")
else:
    for csv_path in sorted(BASE_DIR.rglob("*.csv")):
        # eval_summary.csv lives inside BASE_DIR but is not a response file;
        # without this guard it is always reported as INCOMPLETE
        # ("missing raw_response column").
        if csv_path == SUMMARY_CSV:
            continue

        try:
            with csv_path.open("r", encoding="utf-8", newline="") as f:
                reader = csv.DictReader(f)
                rows = list(reader)
                fieldnames = reader.fieldnames or []
        except Exception as e:
            # Unreadable files are reported and recorded, but do not stop the scan.
            print(f"[ERROR] Failed to read {csv_path}: {e}")
            incomplete_files.append((csv_path, "read_error"))
            continue

        n_rows = len(rows)

        # Detect ENCO vs LLM responses by filename
        is_enco = "ENCO" in csv_path.name

        # Which column (if any) is the "prompt-like" column? (for logging only)
        if "prompt" in fieldnames:
            prompt_col = "prompt"
        elif "prompt_path" in fieldnames:
            prompt_col = "prompt_path"
        else:
            prompt_col = None

        has_raw = "raw_response" in fieldnames

        if is_enco:
            # ENCO files: no prompt/raw_response expected; any non-empty file is OK.
            n_prompt = 0
            n_raw = 0
            n_valid = 0
            n_error = 0

            complete = n_rows > 0
            status = "OK" if complete else "INCOMPLETE"
            reason = f"rows={n_rows}, ENCO_file"
            print(
                f"{csv_path} -> rows={n_rows}, (ENCO: skipping prompt/raw checks) "
                f"valid_flags={n_valid} [{status}]"
            )

        elif not has_raw:
            # LLM response file without a raw_response column: cannot be evaluated.
            n_prompt = count_nonempty(prompt_col, rows) if prompt_col else 0
            n_raw = 0
            n_valid = 0
            n_error = 0

            complete = False
            status = "INCOMPLETE"
            reason = "missing raw_response column"
            print(f"{csv_path} -> rows={n_rows}, prompts={n_prompt}, "
                  f"raw_responses={n_raw}, error_responses={n_error}, "
                  f"valid_flags={n_valid} [{status}] ({reason})")

        else:
            # Normal LLM response file: raw_response drives completeness.
            n_prompt = count_nonempty(prompt_col, rows) if prompt_col else 0
            n_raw = count_nonempty("raw_response", rows)
            n_valid = count_valid_flag(rows) if "valid" in fieldnames else 0
            n_error = count_error_raw(rows)

            # Completeness = every row has some raw_response
            complete = (n_raw == n_rows)
            status = "OK" if complete else "INCOMPLETE"
            reason = (
                f"rows={n_rows}, raw_responses={n_raw}, "
                f"has_raw={has_raw}"
            )

            print(
                f"{csv_path} -> rows={n_rows}, prompts={n_prompt}, "
                f"raw_responses={n_raw}, error_responses={n_error}, "
                f"valid_flags={n_valid} [{status}]"
            )

        # Store stats for later use in the summary CSV
        file_stats[csv_path] = {
            "n_rows": n_rows,
            "completed_rows": n_raw,
            "valid_flag_rows": n_valid,
            "error_raw_responses": n_error,
        }

        if complete:
            complete_files.append(csv_path)
        else:
            # `reason` was set by whichever branch classified the file above
            incomplete_files.append((csv_path, reason))


responses/cancer/responses_obs5000_int200_shuf3_anon_gpt-4o-mini.csv: INCOMPLETE ()
responses/cancer/responses_obs5000_int200_shuf3_anon_gpt-5-mini.csv: INCOMPLETE ()
responses/cancer/responses_obs5000_int200_shuf3_anon_rules_gpt-4o-mini.csv: INCOMPLETE ()
responses/cancer/responses_obs5000_int200_shuf3_anon_rules_gpt-5-mini.csv: INCOMPLETE ()
responses/cancer/responses_obs5000_int200_shuf3_anon_rules_steps_gpt-4o-mini.csv: INCOMPLETE ()
responses/cancer/responses_obs5000_int200_shuf3_anon_rules_steps_gpt-5-mini.csv: INCOMPLETE ()
responses/cancer/responses_obs5000_int200_shuf3_anon_steps_gpt-4o-mini.csv: INCOMPLETE ()
responses/cancer/responses_obs5000_int200_shuf3_anon_steps_gpt-5-mini.csv: INCOMPLETE ()
responses/cancer/responses_obs5000_int200_shuf3_gpt-4o-mini.csv: INCOMPLETE ()
responses/cancer/responses_obs5000_int200_shuf3_gpt-5-mini.csv: INCOMPLETE ()
responses/cancer/responses_obs5000_int200_shuf3_rules_gpt-4o-mini.csv: INCOMPLETE ()
responses/cancer/responses_obs5000_int200_

In [None]:
# Load one response CSV and display it in full for manual inspection.
# NOTE(review): this path starts with "experiments/", whereas the other reads in
# this notebook use "responses/cancer/..." directly — confirm which working
# directory the notebook is meant to run from.
df = pd.read_csv("experiments/responses/cancer/responses_obs5000_int200_shuf3_anon_steps_gpt-5-mini.csv")
df

In [None]:
# Ad-hoc view of the evaluation summary: keep the rows whose file name contains
# "_anon_rules_steps_gpt" (case-insensitive) and list them by descending avg_f1.
# The commented-out lines are alternative filters / column selections kept for
# interactive exploration.
df = pd.read_csv('responses/cancer/eval_summary.csv')
mask = (
    # df['file'].str.contains('steps') &
    # df['file'].str.contains('obs200_int3_shuf3_anon') &
    # df['file'].str.contains('responses') &
    df['file'].str.contains('_anon_rules_steps_gpt', case=False)
    # & ~df['file'].str.contains('anon')
    # & (df['valid'].astype(int) == 0)
)
# df[mask][['file', 'num_rows','error_raw_responses','valid', 'avg_f1', 'avg_shd']].sort_values('avg_f1', ascending=False)
df[mask].sort_values('avg_f1', ascending=False)[["file","num_pred_edges","avg_f1","avg_shd"]]



FileNotFoundError: [Errno 2] No such file or directory: 'responses/cancer/eval_summary.csv'

In [7]:
import pandas as pd
from pathlib import Path
import math

EVAL_SUMMARY = Path("responses/cancer/eval_summary.csv")

df = pd.read_csv(EVAL_SUMMARY)

# Drop rows whose file name contains "4B" or "2024-07-18" (case-insensitive).
df = df[~df['file'].str.contains('4B', case=False) &
        ~df['file'].str.contains('2024-07-18', case=False)]

# Keep rows whose file name mentions "int200" or "obs5000".
# The boolean mask must be passed directly: wrapping it in a list (df[[mask]])
# makes pandas treat it as a list of column labels instead of a row filter.
df = df[df['file'].str.contains('int200') | df['file'].str.contains('obs5000')]

# --- 0. Keep only anon LLM runs + ENCO baseline --------------------
file_str = df["file"].astype(str)
is_anon = file_str.str.contains("_anon", regex=False)
is_enco = file_str.str.contains("ENCO", regex=False)  # non-anon ENCO runs
df = df[is_anon | is_enco].reset_index(drop=True)

from pathlib import Path

def detect_variant(path_str: str) -> str:
    """Classify a file by the prompt variant that appears in its file name.

    Only the final path component is inspected, lower-cased. Markers are
    checked from most to least specific, so "rules_steps" wins over "rules"
    and "steps". Returns "base" when no marker is present.
    """
    filename = Path(path_str).name.lower()
    for marker in ("rules_steps", "rules", "steps"):
        if marker in filename:
            return marker
    return "base"

# Tag every row with the prompt variant derived from its file name.
df["variant"] = df["file"].astype(str).apply(detect_variant)

# Guarantee an integer 'valid' column: missing values become 0, and a missing
# column is created filled with 0.
df["valid"] = df["valid"].fillna(0).astype(int) if "valid" in df.columns else 0


# --- 2. Metric lookup helpers --------------------------------------

def lookup_metrics(setting_tags, model_tag,
                   variant=None,
                   min_valid=1,
                   given_edge_count=None):
    """
    Find (avg_f1, avg_shd) for one table cell in the module-level `df`.

    A row matches when its 'file' value contains every string in
    `setting_tags` and `model_tag` (plain substring matching), and also passes
    the optional filters below.

    setting_tags:     non-empty list of substrings, e.g. ["obs5000_int200"]
    model_tag:        e.g. "gpt-4o-mini" or "ENCO"
    variant:          one of {"base", "rules", "steps", "rules_steps"};
                      None disables the variant filter
    min_valid:        when > 0, require df['valid'] >= min_valid
    given_edge_count: when not None and the column exists, require
                      df['given_edge_count'] == this value

    Returns (None, None) when nothing matches; if several rows match, the
    first one in `df` order is used.
    """
    file_names = df["file"].astype(str)

    # All setting tags must be present in the file name
    selected = file_names.str.contains(setting_tags[0], regex=False)
    for tag in setting_tags[1:]:
        selected = selected & file_names.str.contains(tag, regex=False)

    # The model tag must be present as well
    selected = selected & file_names.str.contains(model_tag, regex=False)

    # Optional filters
    if min_valid > 0:
        selected = selected & (df["valid"] >= min_valid)
    if variant is not None:
        selected = selected & (df["variant"] == variant)
    if given_edge_count is not None and "given_edge_count" in df.columns:
        selected = selected & (df["given_edge_count"] == given_edge_count)

    matches = df[selected]
    if matches.empty:
        return None, None

    first = matches.iloc[0]
    return first["avg_f1"], first["avg_shd"]


def fmt(x, ndigits=2):
    """Format a number with `ndigits` decimals for a table cell.

    Returns "" for None and for float NaN / +-inf, so missing metrics render
    as empty cells.
    """
    if x is None:
        return ""
    if isinstance(x, float) and not math.isfinite(x):
        return ""
    return f"{x:.{ndigits}f}"


# -------------------------------------------------------------------
# 3. Settings (Obs/Inter combinations)
# -------------------------------------------------------------------

# Column groups of the tables: label -> list of substrings that a file name
# must contain to belong to that data setting. Insertion order is the column
# order used when iterating.
SETTINGS = dict(
    O5000_I3=["obs5000_int200"],
    O5000_I0=["obs5000_int0"],
    O0_I3=["obs0_int200"],
)

# -------------------------------------------------------------------
# 4. Extract model tags from filenames (LLMs only; ENCO is special)
# -------------------------------------------------------------------

def extract_model_tag(path_str: str) -> str:
    """Return the text after the last underscore of the file stem.

    Examples:
       '...responses_obs0_int200_shuf3_anon_Qwen3-32B.csv' -> 'Qwen3-32B'
       '...predictions_obs5000_int200_ENCO.csv'             -> 'ENCO'

    A stem without any underscore is returned unchanged.
    """
    _, _, tag = Path(path_str).stem.rpartition("_")
    return tag

# LLM results are the rows whose file name contains "responses" and does not
# contain "ENCO" (ENCO is in predictions files and is handled separately).
mask_llm = (
    ~df["file"].str.contains("ENCO", regex=False)
    & df["file"].str.contains("responses", regex=False)
)
df_models = df.loc[mask_llm].copy()
df_models["model"] = df_models["file"].apply(extract_model_tag)

# Distinct model tags in sorted order; this is the row order inside each section.
all_models = sorted(df_models["model"].unique())
llm_models = all_models  # ENCO handled separately

# Display names for model tags; tags that are not listed are shown as-is.
PRETTY_NAME = {
    "gemini-2.5-flash": "Gemini-2.5-Flash",
}


def pretty_model_name(model_tag: str) -> str:
    """Return the display name for `model_tag`, or the tag itself if unmapped."""
    if model_tag in PRETTY_NAME:
        return PRETTY_NAME[model_tag]
    return model_tag

# -------------------------------------------------------------------
# 5. Variant blocks (sections)
# -------------------------------------------------------------------

# (variant key, LaTeX section header) pairs, in the order the sections appear.
VARIANTS = [
    ("base",        r"\multicolumn{7}{l}{\textbf{\textit{Zero-Shot LLMs}}} \\"),
    ("rules",       r"\multicolumn{7}{l}{\textbf{\textit{Zero-Shot LLMs + Causality Rules}}} \\"),
    ("steps",       r"\multicolumn{7}{l}{\textbf{\textit{Zero-Shot LLMs + CD Steps}}} \\"),
    ("rules_steps", r"\multicolumn{7}{l}{\textbf{\textit{Zero-Shot LLMs + Causality Rules + CD Steps}}} \\"),
]

# 5.a ENCO (pure causal discovery method, non-anon predictions_*.csv) opens the table.
# Each row definition carries: section header (only on the first row of a
# section, otherwise None), display label, model tag, variant and the
# given-edge filter value.
ROWS = [
    {
        "section": r"\multicolumn{7}{l}{\textbf{\textit{Pure Causal Discovery Method}}} \\",
        "label":   "ENCO",
        "model_tag": "ENCO",
        "variant": "base",
        "given_edge_count": None,   # no given-edge prior
    }
]

# 5.b One section per variant, one row per detected LLM model.
for variant_key, section_header in VARIANTS:
    for position, model_tag in enumerate(llm_models):
        ROWS.append({
            "section":   section_header if position == 0 else None,
            "label":     pretty_model_name(model_tag),
            "model_tag": model_tag,
            "variant":   variant_key,
            "given_edge_count": None,   # normal zero-shot / rules / steps runs
        })

# 5.c Zero-Shot LLMs + Given One Edge: models that have at least one LLM
# response row with given_edge_count == 1 (none if the column is absent).
given_models = []
if "given_edge_count" in df.columns:
    mask_given = (
        (df["given_edge_count"] == 1) &
        df["file"].str.contains("responses", regex=False) &
        ~df["file"].str.contains("ENCO", regex=False)
    )
    df_given = df[mask_given].copy()
    given_models = sorted(df_given["file"].apply(extract_model_tag).unique())

for position, model_tag in enumerate(given_models):
    ROWS.append({
        "section": (
            r"\multicolumn{7}{l}{\textbf{\textit{Zero-Shot LLMs + Given One Edge}}} \\"
            if position == 0 else None
        ),
        "label":   pretty_model_name(model_tag),
        "model_tag": model_tag,
        "variant": "base",          # assume these are base + given-edge
        "given_edge_count": 1,      # key for filtering in lookup_metrics
    })

# -------------------------------------------------------------------
# 6. Build LaTeX lines (F1 / SHD table)
# -------------------------------------------------------------------
# Table preamble and two-level header: one (SHD, F1) column pair per data setting.
lines = [
    r"\begin{table*}[ht!]",
    r"\centering",
    r"\setlength{\tabcolsep}{6pt}",
    r"\caption{Causal discovery performance on the \textit{Cancer} causal graph. The variable names are anonymized.}",
    r"\resizebox{\textwidth}{!}{%",
    r"\begin{tabular}{lcccccc}",
    r"\toprule",
    (
        r" & \multicolumn{2}{c}{\textbf{Obs. 5000, Inter. 200}}"
        r" & \multicolumn{2}{c}{\textbf{Obs. 5000, Inter. 0}}"
        r" & \multicolumn{2}{c}{\textbf{Obs. 0, Inter. 200}} \\"
    ),
    r"\cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7}",
    (
        r"\textbf{Method} & \textbf{SHD}~$\downarrow$ & \textbf{F1}~$\uparrow$"
        r" & \textbf{SHD}~$\downarrow$ & \textbf{F1}~$\uparrow$"
        r" & \textbf{SHD}~$\downarrow$ & \textbf{F1}~$\uparrow$ \\"
    ),
    r"\midrule",
]

for rowdef in ROWS:
    # A section header is emitted only for the first row of each section
    section_header = rowdef.get("section")
    if section_header:
        lines.append(section_header)

    label = rowdef["label"]
    model_tag = rowdef["model_tag"]

    # ENCO: don't require valid>=1, and ignore variant filter
    is_enco = (model_tag == "ENCO")

    # cells holds (F1, SHD) per setting, in SETTINGS order
    cells = []
    for setting_subs in SETTINGS.values():
        f1, shd = lookup_metrics(
            setting_subs,
            model_tag=model_tag,
            variant=None if is_enco else rowdef["variant"],
            min_valid=0 if is_enco else 1,
            given_edge_count=rowdef.get("given_edge_count", None),
        )
        cells.extend([fmt(f1), fmt(shd, ndigits=2)])

    # Rows without a single metric are left out of the table
    if not any(cells):
        continue

    # The header lists SHD before F1, so each pair is swapped on output
    ordered = [label, cells[1], cells[0], cells[3], cells[2], cells[5], cells[4]]
    row_tex = " & ".join(ordered) + " \\\\"

    lines.extend([row_tex, r"\addlinespace[0.8ex]"])

lines.extend([
    r"\bottomrule",
    r"\end{tabular}",
    r"}",
    r"\end{table*}",
])

latex_table = "\n".join(lines)
print(latex_table)


# -------------------------------------------------------------------
# 7. Valid-ratio table, same sections (including Given One Edge)
# -------------------------------------------------------------------

def lookup_valid_counts(setting_tags, model_tag,
                        variant=None,
                        min_rows=1,
                        given_edge_count=None):
    """
    Find (valid, num_rows) for one table cell in the module-level `df`.

    A row matches when its 'file' value contains every string in
    `setting_tags` and `model_tag` (plain substring matching). `variant` and
    `given_edge_count` add equality filters when they are not None (the
    latter only if the 'given_edge_count' column exists).

    Returns (None, None) if no row matches, or if the first matching row has
    a falsy 'num_rows' or fewer than `min_rows` rows. Otherwise returns both
    counts as ints, taken from the first matching row.
    """
    file_names = df["file"].astype(str)

    # Setting tags and the model tag must all appear in the file name
    selected = file_names.str.contains(setting_tags[0], regex=False)
    for tag in setting_tags[1:]:
        selected = selected & file_names.str.contains(tag, regex=False)
    selected = selected & file_names.str.contains(model_tag, regex=False)

    # Optional equality filters
    if variant is not None:
        selected = selected & (df["variant"] == variant)
    if given_edge_count is not None and "given_edge_count" in df.columns:
        selected = selected & (df["given_edge_count"] == given_edge_count)

    matches = df[selected]
    if matches.empty:
        return None, None

    first = matches.iloc[0]
    total = first.get("num_rows", 0)
    n_valid = first.get("valid", 0)

    if not total or total < min_rows:
        return None, None

    return int(n_valid), int(total)


# Table preamble and header: one valid/total fraction per data setting.
valid_lines = [
    r"\begin{table*}[ht!]",
    r"\centering",
    r"\setlength{\tabcolsep}{6pt}",
    r"\caption{Valid adjacency extraction ratio for each method and data setting on the \textit{Cancer} graph.}",
    r"\resizebox{\textwidth}{!}{%",
    r"\begin{tabular}{lccc}",
    r"\toprule",
    (
        r" & \textbf{Obs. 5000, Inter. 200}"
        r" & \textbf{Obs. 5000, Inter. 0}"
        r" & \textbf{Obs. 0, Inter. 200} \\"
    ),
    r"\midrule",
]

for rowdef in ROWS:
    # Optional section header - adapted from the 7-column table to 4 columns
    sec = rowdef.get("section")
    if sec:
        valid_lines.append(sec.replace(r"\multicolumn{7}", r"\multicolumn{4}"))

    label = rowdef["label"]
    model_tag = rowdef["model_tag"]

    cells = []
    for setting_subs in SETTINGS.values():
        valid_count, num_rows = lookup_valid_counts(
            setting_subs,
            model_tag=model_tag,
            # ENCO rows are looked up without a variant filter
            variant=None if model_tag == "ENCO" else rowdef["variant"],
            given_edge_count=rowdef.get("given_edge_count", None),
        )
        missing = valid_count is None or num_rows is None
        # Missing entries become empty cells; others render as valid/total
        cells.append("" if missing else rf"$\frac{{{valid_count}}}{{{num_rows}}}$")

    # Rows with no entry in any setting are left out of the table
    if not any(cells):
        continue

    row_tex = " & ".join([label, cells[0], cells[1], cells[2]]) + " \\\\"
    valid_lines.extend([row_tex, r"\addlinespace[0.8ex]"])

valid_lines.extend([
    r"\bottomrule",
    r"\end{tabular}",
    r"}",
    r"\end{table*}",
])

latex_valid_table = "\n".join(valid_lines)
print(latex_valid_table)


FileNotFoundError: [Errno 2] No such file or directory: 'responses/cancer/eval_summary.csv'

In [8]:
from copy import deepcopy
from pathlib import Path
import networkx as nx
from networkx.drawing.nx_pydot import graphviz_layout
# Make the parent directory importable so the project-local `causal_graphs`
# package resolves. Relies on `sys` imported in the notebook's first cell.
sys.path.append("../")
from causal_graphs.graph_real_world import load_graph_file
from causal_graphs.graph_visualization import visualize_graph
import matplotlib.pyplot as plt
# ---------------------------------------------------------------------
# 1. Load the cancer graph
# ---------------------------------------------------------------------
bif_path = Path("../causal_graphs/real_data/small_graphs/cancer.bif")
graph = load_graph_file(str(bif_path))

# ---------------------------------------------------------------------
# 2. Build a NetworkX graph with integer nodes (0..n-1)
#    so we can reuse the same layout but change labels.
# ---------------------------------------------------------------------
G = nx.DiGraph()
n = len(graph.variables)

# Nodes are integers, but we keep two separate label dicts:
labels_orig = {}
labels_anon = {}

for i, v in enumerate(graph.variables):
    G.add_node(i)
    labels_orig[i] = v.name        # original variable name
    labels_anon[i] = f"X{i+1}"     # anonymized name (1-based: X1, X2, ...)

# Add directed edges from the loaded graph.
# Assumes graph.edges converts to a list of (source_index, target_index)
# pairs that index into graph.variables — TODO confirm against load_graph_file.
for (u_idx, v_idx) in graph.edges.tolist():
    G.add_edge(u_idx, v_idx)

# ---------------------------------------------------------------------
# 3. Compute a single layout (Graphviz) and reuse for both panels
# ---------------------------------------------------------------------
pos = graphviz_layout(G, prog="dot")  # requires graphviz + pydot

# ---------------------------------------------------------------------
# 4. Plot side-by-side: anonymized (left, axes[0]) and original (right, axes[1])
#    NOTE(review): earlier comments described the opposite arrangement
#    (original on the left); confirm which panel order is intended.
# ---------------------------------------------------------------------
figsize = max(3, n ** 0.7)  # figure height grows with the node count, minimum 3
fig, axes = plt.subplots(1, 2, figsize=(2 * figsize, figsize))

# Drawing options shared by both panels
draw_kwargs = dict(
    arrows=True,
    node_color="lightgrey",
    edgecolors="black",
    node_size=600,
    arrowstyle="-|>",
    arrowsize=16,
)

# Right panel (axes[1]): original names
ax = axes[1]
nx.draw(G, pos, ax=ax, with_labels=False, **draw_kwargs)
nx.draw_networkx_labels(G, pos, labels=labels_orig, font_weight="bold", ax=ax)
ax.set_title("Cancer graph (original names)")
ax.set_axis_off()
ax.margins(0.2)  # add padding inside the axes
# Left panel (axes[0]): anonymized names
ax = axes[0]
nx.draw(G, pos, ax=ax, with_labels=False, **draw_kwargs)
nx.draw_networkx_labels(G, pos, labels=labels_anon, font_weight="bold", ax=ax)
ax.set_title("Cancer graph (anonymized)")
ax.set_axis_off()
ax.margins(0.2)
plt.tight_layout(pad=1.0)

# NOTE(review): tight_layout is called a second time with default padding;
# one of the two calls is probably redundant.
plt.tight_layout()
# Save to PDF with a transparent background, then close the figure.
fig.savefig("cancer_graph_original_vs_anon.pdf", bbox_inches="tight", transparent=True)
plt.close(fig)

In [None]:
# Load a single response CSV for a manual spot check.
df_check = pd.read_csv("responses/cancer/responses_obs200_int3_shuf3_anon_Qwen3-32B.csv")

In [None]:
# Number of distinct values in the 'prediction' column of the loaded file.
df_check['prediction'].nunique()

10