# Automated-evaluation

# Automated Evaluation Pipeline

In [None]:

import re
from pathlib import Path

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from rouge_score import rouge_scorer


# Project root

def find_project_root(start: Path) -> Path:
    """Locate the project root by searching upward from ``start``.

    A directory qualifies as the root when it contains both an ``inputs``
    and an ``outputs`` entry. At most 20 directories are examined: the
    resolved ``start`` itself followed by its nearest ancestors.

    Args:
        start: Directory where the upward search begins.

    Returns:
        The first qualifying directory, or the resolved ``start`` when none
        of the examined directories qualifies.
    """
    origin = start.resolve()
    # ``parents`` ends at the filesystem root, so the slice caps the climb
    # at 20 checks exactly like a bounded manual walk would.
    for candidate in (origin, *origin.parents)[:20]:
        has_inputs = (candidate / "inputs").exists()
        if has_inputs and (candidate / "outputs").exists():
            return candidate
    return origin

# Project root, discovered by walking up from the current working directory
# until a folder containing both "inputs" and "outputs" is found.
ROOT_DIR = find_project_root(Path.cwd())


# Dataset location
DATASET_DIR = ROOT_DIR / "inputs" / "dataset"
INPUT_BRANCH = "primary"  # sub-folder of DATASET_DIR used as the input set


# Automated-evaluation inputs

AUTO_IN_DIR = DATASET_DIR / INPUT_BRANCH / "automated-evaluation"
# Reference workbook: read with pd.read_excel and expected to hold a title
# column plus element_* columns.
HUMAN_XLSX = AUTO_IN_DIR / "Human.xlsx"
# Folders searched recursively for the generated *.md files of each model.
GPT_DIR    = AUTO_IN_DIR / "gpt-DMPs"
LLAMA_DIR  = AUTO_IN_DIR / "llama-DMPs"


# Automated-evaluation outputs
AUTO_OUT_DIR = ROOT_DIR / "outputs" / "automated-evaluation"
# Side effect at import/cell-run time: the output folder is created if absent.
AUTO_OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR = AUTO_OUT_DIR  # alias used by the filename constants below


# Output filenames used later
# Step 5 outputs: one row per markdown section extracted for each title
GPT_OUT_CSV   = OUT_DIR / "filtered_Gpt.csv"
LLAMA_OUT_CSV = OUT_DIR / "filtered_Llama.csv"
# Step 6 outputs: reference values left-joined with the generated sections
GPT_MERGED   = OUT_DIR / "merged_output_Gpt.csv"
LLAMA_MERGED = OUT_DIR / "merged_output_Llama.csv"
# Step 7 outputs: sub-elements collapsed into elements 1, 4 and 5
GPT_CLEANED   = OUT_DIR / "merged_output_Gpt_cleaned.csv"
LLAMA_CLEANED = OUT_DIR / "merged_output_Llama_cleaned.csv"
# Step 8 outputs (collapsed elements), keyed by model name
# Per-title mean scores
OUT_FOLDER = {
    "Gpt":   OUT_DIR / "dmp_similarity_summary_Gpt.csv",
    "Llama": OUT_DIR / "dmp_similarity_summary_Llama.csv",
}
# Per-row (title x element) scores
OUT_RAW = {
    "Gpt":   OUT_DIR / "element_similarity_raw_Gpt.csv",
    "Llama": OUT_DIR / "element_similarity_raw_Llama.csv",
}
# Per-element mean scores
OUT_ELEMENT = {
    "Gpt":   OUT_DIR / "element_similarity_summary_Gpt.csv",
    "Llama": OUT_DIR / "element_similarity_summary_Llama.csv",
}
# Per-sub-element mean scores, computed in Step 8b from the non-collapsed
# merged files
OUT_SUB_ELEMENT = {
    "Gpt":   OUT_DIR / "sub_element_similarity_summary_Gpt.csv",
    "Llama": OUT_DIR / "sub_element_similarity_summary_Llama.csv",
}

# STEP 2 — Shared Utilities

In [None]:

def safe_text(x) -> str:
    """Convert a cell value to text, mapping missing values to ``""``.

    Args:
        x: Any value, typically a cell read from a DataFrame.

    Returns:
        An empty string for ``None`` and for values ``pd.isna`` reports as
        missing; ``str(x)`` otherwise.
    """
    is_missing = x is None
    if not is_missing:
        try:
            # For array-like input the truth test raises; such values are
            # treated as present and stringified below.
            is_missing = bool(pd.isna(x))
        except Exception:
            is_missing = False
    return "" if is_missing else str(x)


def norm_text(x) -> str:
    """Return a matching key for ``x``.

    The value is converted with ``safe_text``, stripped, lower-cased, and
    every run of whitespace is collapsed to a single space.
    """
    lowered = safe_text(x).strip().lower()
    return re.sub(r"\s+", " ", lowered)


def choose_title_column(df: pd.DataFrame) -> str:
    """Return the name of the column that holds the DMP titles.

    Known spellings are tried in priority order and the first one present
    in ``df`` wins.

    Raises:
        ValueError: If none of the known spellings is a column of ``df``.
    """
    known_names = ("title", "Title", "dmp_title", "DMP Title", "DMP_title")
    found = next((name for name in known_names if name in df.columns), None)
    if found is None:
        raise ValueError(f"Could not find title column. Columns: {list(df.columns)}")
    return found


def sort_element_key(x: str):
    """Build a sort key that orders element labels numerically.

    Labels of the form ``element_<number><optional letter>`` sort by number
    and then letter; anything else sorts after them.

    Returns:
        A ``(number, letter, normalized_label)`` tuple, or
        ``(999, "z", normalized_label)`` for labels that do not match.
    """
    label = str(x).strip().lower()
    parsed = re.match(r"^element_(\d+)([a-z]?)$", label)
    if parsed is not None:
        number, letter = parsed.groups()
        return (int(number), letter or "", label)
    return (999, "z", label)


def rougeL_recall(ref: str, gen: str, scorer: rouge_scorer.RougeScorer) -> float:
    """Return the ROUGE-L recall of ``gen`` against ``ref``.

    Blank inputs are scored without calling the scorer: two blank texts
    count as a perfect match (1.0) and exactly one blank text as no match
    (0.0).

    Args:
        ref: Reference text; ``None`` is treated as empty.
        gen: Generated text; ``None`` is treated as empty.
        scorer: Scorer whose ``score`` result exposes a ``"rougeL"`` entry.
    """
    ref = ref or ""
    gen = gen or ""
    ref_has_text = bool(ref.strip())
    gen_has_text = bool(gen.strip())

    if ref_has_text and gen_has_text:
        return float(scorer.score(ref, gen)["rougeL"].recall)
    return 0.0 if (ref_has_text or gen_has_text) else 1.0

# STEP 3 — Markdown Parsing Helpers

In [None]:
def clean_title_for_match(name: str) -> str:
    """Prepare a title for comparison against markdown file names.

    Removes the characters ``\\ / * ? : " < > |`` and collapses whitespace
    runs to single spaces.
    """
    without_reserved = re.sub(r'[\\/*?:"<>|]', "", str(name).strip())
    return re.sub(r"\s+", " ", without_reserved).strip()


def normalize_md_stem(stem: str) -> str:
    """Normalize a markdown file stem for title matching.

    The stem is stripped and lower-cased, a trailing model tag such as
    ``-gpt-4.1`` or ``_llama3.3`` is removed, and whitespace is collapsed.
    ``None`` or an empty stem yields ``""``.
    """
    cleaned = (stem or "").strip().lower()
    model_suffixes = (
        r"[-_\s]?gpt[-_\s]?[\d\.]+$",
        r"[-_\s]?llama[-_\s]?[\d\.]+$",
    )
    for suffix in model_suffixes:
        cleaned = re.sub(suffix, "", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()


def find_md_by_excel_title(search_dir: Path, excel_title: str) -> Path | None:
    """Find the markdown file under ``search_dir`` that belongs to a title.

    Matching is done on normalized names: the title goes through
    ``clean_title_for_match`` and ``norm_text``, each file stem through
    ``normalize_md_stem``. An exact match is preferred; otherwise the file
    with the shortest stem whose normalized stem contains the title is used.

    Args:
        search_dir: Folder searched recursively for ``*.md`` files.
        excel_title: Title as it appears in the reference workbook.

    Returns:
        The matching path, or ``None`` when nothing matches or the title is
        blank after normalization.
    """
    excel_norm = norm_text(clean_title_for_match(excel_title))
    if not excel_norm:
        # A blank title must not pair with a file whose stem happens to
        # normalize to the empty string.
        return None

    # Directory traversal order is not guaranteed; sorting makes the chosen
    # file reproducible when several stems normalize to the same title.
    md_files = sorted(search_dir.rglob("*.md"), key=str)
    normalized = [(path, normalize_md_stem(path.stem)) for path in md_files]

    for path, stem_norm in normalized:
        if stem_norm == excel_norm:
            return path

    candidates = [path for path, stem_norm in normalized if excel_norm in stem_norm]
    if candidates:
        return min(candidates, key=lambda p: (len(p.stem), str(p)))
    return None


def is_title(line: str) -> bool:
    """Tell whether a markdown line is a section heading.

    A line counts as a heading when, after stripping, it
    - starts with ``#``; or
    - is a numbered bold line such as ``1. **Heading**``; or
    - is a bold ``**Element N: ...**`` line (case-insensitive).

    Blank lines and ``None`` are never headings.
    """
    text = (line or "").strip()
    if not text:
        return False

    numbered_bold = r"^\s*\d+[\.\)]?\s*\*\*.+\*\*\s*:?\s*$"
    bold_element = r"^\s*\*\*\s*element\s*\d+\s*:\s*.+\*\*\s*:?\s*$"
    return (
        text.startswith("#")
        or re.match(numbered_bold, text) is not None
        or re.match(bold_element, text, flags=re.I) is not None
    )


def extract_titles_and_text(md_text: str, content_col: str) -> pd.DataFrame:
    """Split markdown into heading/body pairs.

    ``<think>...</think>`` spans are removed first. Every line accepted by
    ``is_title`` opens a new section, and the lines up to the next heading
    form its body. Text before the first heading is discarded, as are
    sections whose body is blank.

    Args:
        md_text: Raw markdown text.
        content_col: Name of the output column that receives the body text.

    Returns:
        A DataFrame with an ``"Element title"`` column and a ``content_col``
        column, one row per non-empty section (no columns when there are no
        rows).
    """
    without_think = re.sub(
        r"<think>.*?</think>", "", md_text, flags=re.DOTALL | re.IGNORECASE
    )

    # Each entry is (heading line, body lines collected so far).
    sections = []
    for raw_line in without_think.splitlines():
        if is_title(raw_line):
            sections.append((raw_line, []))
        elif sections:
            sections[-1][1].append(raw_line)

    records = []
    for heading, body_lines in sections:
        body = "\n".join(body_lines).strip()
        if body:
            records.append({"Element title": heading.strip(), content_col: body})
    return pd.DataFrame(records)

# STEP 4 — Section Title → Element Mapping

In [None]:

def map_section_title_to_element(section_title) -> str | None:
    if section_title is None:
        return None
    try:
        if pd.isna(section_title):
            return None
    except Exception:
        pass

    t = str(section_title).strip()
    t_plain = re.sub(r"^#+\s*", "", t).strip()
    t_low = t_plain.lower()

    if re.search(r"\belement\s*2\b", t_low):
        return "element_2"
    if re.search(r"\belement\s*3\b", t_low):
        return "element_3"
    if re.search(r"\belement\s*6\b", t_low):
        return "element_6"

    if "types and amount of scientific data" in t_low:
        return "element_1a"
    if "scientific data that will be preserved and shared" in t_low:
        return "element_1b"
    if "metadata, other relevant data, and associated documentation" in t_low:
        return "element_1c"

    if "repository where scientific data and metadata will be archived" in t_low:
        return "element_4a"
    if "how scientific data will be findable and identifiable" in t_low:
        return "element_4b"
    if "when and how long the scientific data will be made available" in t_low:
        return "element_4c"

    if "factors affecting subsequent access, distribution, or reuse" in t_low:
        return "element_5a"
    if "whether access to scientific data will be controlled" in t_low:
        return "element_5b"
    if "protections for privacy, rights, and confidentiality" in t_low:
        return "element_5c"

    return None


def get_element_columns(df: pd.DataFrame) -> list:
    """List the ``element_*`` columns of ``df`` in element order.

    Column names are compared after stripping and lower-casing, and the
    normalized names (not the original spellings) are returned, ordered by
    ``sort_element_key``.

    Raises:
        ValueError: If no column looks like ``element_<number>[letter]``.
    """
    normalized_names = (str(col).strip().lower() for col in df.columns)
    element_names = [
        name for name in normalized_names if re.match(r"^element_\d+[a-z]?$", name)
    ]
    if not element_names:
        raise ValueError("No element_* columns found in Human.xlsx.")

    element_names.sort(key=sort_element_key)
    return element_names


# STEP 5 — Extract NIH Template Sections From Markdown (per title)

In [None]:
# Load the reference workbook and collect the DMP titles that drive Step 5.
human_df = pd.read_excel(HUMAN_XLSX)
title_col = choose_title_column(human_df)
# Rows without a title are skipped; remaining titles are coerced to str.
titles = human_df[title_col].dropna().astype(str).tolist()


def process_model_folder(model_dir: Path, content_col: str, out_csv: Path) -> pd.DataFrame:
    records = []

    for title in titles:
        md_path = find_md_by_excel_title(model_dir, title)

        if md_path is None:
            records.append(
                {
                    "dmp_title": title,
                    "md_path": None,
                    "Element title": None,
                    content_col: None,
                    "status": "missing_md",
                }
            )
            continue

        md_text = md_path.read_text(encoding="utf-8", errors="ignore")
        df_sec = extract_titles_and_text(md_text, content_col=content_col)

        if df_sec.empty:
            records.append(
                {
                    "dmp_title": title,
                    "md_path": str(md_path),
                    "Element title": None,
                    content_col: None,
                    "status": "no_sections_found",
                }
            )
            continue

        for _, r in df_sec.iterrows():
            records.append(
                {
                    "dmp_title": title,
                    "md_path": str(md_path),
                    "Element title": r["Element title"],
                    content_col: r[content_col],
                    "status": "ok",
                }
            )

    out_df = pd.DataFrame(records)
    out_df.to_csv(out_csv, index=False, encoding="utf-8")
    print(f"Saved: {out_csv} | rows={len(out_df)}")
    print(out_df["status"].value_counts(dropna=False))
    return out_df


# Run Step 5 for both models; each call writes its filtered_*.csv file and
# prints the row count and the per-status counts.
process_model_folder(GPT_DIR, "Generated_Gpt_content", GPT_OUT_CSV)
process_model_folder(LLAMA_DIR, "Generated_Llama_content", LLAMA_OUT_CSV)
print("Done: Step 5")

Saved: C:\Users\Nahid\nih-dmp-llm-evaluation-paper-code\outputs\automated-evaluation\filtered_Gpt.csv | rows=312
status
ok    312
Name: count, dtype: int64
Saved: C:\Users\Nahid\nih-dmp-llm-evaluation-paper-code\outputs\automated-evaluation\filtered_Llama.csv | rows=312
status
ok    312
Name: count, dtype: int64
Done: Step 5


# STEP 6 — Merge NIH Reference With Model Output (sub-elements preserved)

In [None]:
# Reload the reference workbook and standardize its header: the title column
# is renamed to "title" and every column name is stripped and lower-cased.
ref = pd.read_excel(HUMAN_XLSX)
title_col = choose_title_column(ref)
ref = ref.rename(columns={title_col: "title"})
ref.columns = [str(c).strip().lower() for c in ref.columns]

# Normalized title used as the join key against the model output.
ref["title_norm"] = ref["title"].apply(norm_text)
element_cols = get_element_columns(ref)

# Reshape wide -> long: one row per (title, element) with the reference text
# in "NIH Value". Missing cells become empty strings before melting.
ref_long = ref[["title", "title_norm"] + element_cols].fillna("").melt(
    id_vars=["title", "title_norm"],
    value_vars=element_cols,
    var_name="Element number",
    value_name="NIH Value",
)
ref_long["Element number"] = ref_long["Element number"].str.lower().str.strip()


def load_model_filtered(path: Path, content_col: str) -> pd.DataFrame:
    """Load a Step 5 CSV and keep only sections that map to an element.

    Args:
        path: CSV containing ``dmp_title``, ``Element title`` and
            ``content_col`` columns.
        content_col: Name of the column holding the generated section text;
            it is renamed to ``"Generated Content"``.

    Returns:
        A DataFrame with columns ``title_norm``, ``Element number``,
        ``Element title`` and ``Generated Content``. Rows whose heading maps
        to no element are dropped.
    """
    output_columns = ["title_norm", "Element number", "Element title", "Generated Content"]

    sections = pd.read_csv(path)
    sections["title_norm"] = sections["dmp_title"].apply(norm_text)
    sections["Element title"] = sections["Element title"].fillna("").astype(str)
    sections["Element number"] = sections["Element title"].apply(map_section_title_to_element)

    # Headings that match no mapping rule carry no element number.
    mapped = sections.dropna(subset=["Element number"]).copy()
    mapped["Element number"] = mapped["Element number"].str.lower().str.strip()

    mapped = mapped.rename(columns={content_col: "Generated Content"})
    return mapped[output_columns]


def merge_and_save(sec_df: pd.DataFrame, out_path: Path):
    """Left-join the reference rows with model sections and write a CSV.

    Every row of the module-level ``ref_long`` table is kept; generated
    content is attached where the normalized title and element number agree.

    Args:
        sec_df: Model sections with ``title_norm``, ``Element number``,
            ``Element title`` and ``Generated Content`` columns.
        out_path: Destination CSV path.
    """
    join_keys = ["title_norm", "Element number"]
    output_columns = ["title", "Element number", "NIH Value", "Element title", "Generated Content"]

    combined = ref_long.merge(sec_df, how="left", on=join_keys)
    combined = combined[output_columns]
    combined.to_csv(out_path, index=False, encoding="utf-8")
    print(f"Saved: {out_path} | rows={len(combined)}")


# Load the Step 5 sections of each model, keeping only rows that map to an
# element.
gpt_sec = load_model_filtered(GPT_OUT_CSV, "Generated_Gpt_content")
llm_sec = load_model_filtered(LLAMA_OUT_CSV, "Generated_Llama_content")

# Join each model's sections onto the reference table and write the result.
merge_and_save(gpt_sec, GPT_MERGED)
merge_and_save(llm_sec, LLAMA_MERGED)
print("Done: Step 6")

Saved: C:\Users\Nahid\nih-dmp-llm-evaluation-paper-code\outputs\automated-evaluation\merged_output_Gpt.csv | rows=312
Saved: C:\Users\Nahid\nih-dmp-llm-evaluation-paper-code\outputs\automated-evaluation\merged_output_Llama.csv | rows=312
Done: Step 6


# STEP 7 — Collapse Sub-elements to Core Elements (1, 4, 5)

In [None]:
# Step 7 reads the Step 6 merged files and writes the collapsed ("cleaned")
# files, one pair per model.
INPUTS = {"Gpt": GPT_MERGED, "Llama": LLAMA_MERGED}
OUTPUTS = {"Gpt": GPT_CLEANED, "Llama": LLAMA_CLEANED}

# Core element -> the sub-elements whose text is concatenated into it.
# The list order is the order in which the texts are joined.
group_map = {
    "element_1": ["element_1a", "element_1b", "element_1c"],
    "element_4": ["element_4a", "element_4b", "element_4c"],
    "element_5": ["element_5a", "element_5b", "element_5c"],
}


def clean_text(x) -> str:
    """Return ``x`` as stripped text with blank-line runs squeezed.

    The value is converted with ``safe_text``; any sequence of blank (or
    whitespace-only) lines is reduced to one empty line.
    """
    stripped = safe_text(x).strip()
    return re.sub(r"\n\s*\n+", "\n\n", stripped)


def combine_group_for_title(df_title: pd.DataFrame, new_element: str, group: list[str]) -> dict:
    """Merge the sub-element rows of one title into a single element row.

    Args:
        df_title: Rows of a single DMP title with ``Element number``,
            ``NIH Value`` and ``Generated Content`` columns.
        new_element: Identifier given to the combined row.
        group: Sub-element identifiers to combine; their order determines
            the order of the concatenated text.

    Returns:
        A dict with ``Element number``, ``NIH Value`` and
        ``Generated Content``. Texts are cleaned with ``clean_text``, empty
        ones are skipped, and the rest are joined by a blank line.
    """
    members = df_title.loc[df_title["Element number"].isin(group)].copy()
    # An ordered categorical makes the sort follow ``group`` order rather
    # than alphabetical order.
    members["Element number"] = pd.Categorical(
        members["Element number"], categories=group, ordered=True
    )
    members = members.sort_values("Element number")

    def joined(column: str) -> str:
        cleaned = (clean_text(value) for value in members[column].tolist())
        return "\n\n".join(piece for piece in cleaned if piece)

    return {
        "Element number": new_element,
        "NIH Value": joined("NIH Value"),
        "Generated Content": joined("Generated Content"),
    }


# Step 7 driver: for each model, collapse the sub-element rows of every title
# into elements 1, 4 and 5 and write the cleaned CSV.
for model_name, in_path in INPUTS.items():
    if not in_path.exists():
        # A missing merged file skips the model instead of aborting the run.
        print(f"Missing input: {in_path}")
        continue

    df = pd.read_csv(in_path)

    required = {"title", "Element number", "NIH Value", "Generated Content"}
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"{in_path} missing columns: {missing}. Found: {list(df.columns)}")

    df["Element number"] = df["Element number"].astype(str).str.strip().str.lower()

    # All sub-element identifiers that get folded into a core element.
    group_flat = [e for grp in group_map.values() for e in grp]
    out_blocks = []

    # dropna=False keeps rows whose title is missing as a group of their own.
    for title, df_title in df.groupby("title", dropna=False):
        df_title = df_title.copy()

        # Rows that are not sub-elements pass through unchanged.
        keep_df = df_title[~df_title["Element number"].isin(group_flat)].copy()
        keep_df = keep_df[["Element number", "NIH Value", "Generated Content"]]

        # One combined row per core element, even when the group is empty.
        merged_rows = []
        for new_element, grp in group_map.items():
            merged_rows.append(combine_group_for_title(df_title, new_element, grp))

        final_title_df = pd.concat([keep_df, pd.DataFrame(merged_rows)], ignore_index=True)

        for col in ["NIH Value", "Generated Content"]:
            final_title_df[col] = final_title_df[col].apply(clean_text)

        # Order rows numerically by element rather than alphabetically.
        final_title_df = final_title_df.sort_values(
            by="Element number", key=lambda s: s.map(sort_element_key)
        ).reset_index(drop=True)

        final_title_df.insert(0, "title", title)
        out_blocks.append(final_title_df)

    # NOTE(review): pd.concat raises ValueError on an empty list, so an input
    # file with no rows would fail here — confirm whether that can occur.
    final_df = pd.concat(out_blocks, ignore_index=True)
    out_path = OUTPUTS[model_name]
    final_df.to_csv(out_path, index=False, encoding="utf-8")

    print(f"Saved: {out_path} | rows={len(final_df)}")
    print("Element counts:", final_df["Element number"].value_counts().to_dict())

print("Done: Step 7")


Saved: C:\Users\Nahid\nih-dmp-llm-evaluation-paper-code\outputs\automated-evaluation\merged_output_Gpt_cleaned.csv | rows=156
Element counts: {'element_1': 26, 'element_2': 26, 'element_3': 26, 'element_4': 26, 'element_5': 26, 'element_6': 26}
Saved: C:\Users\Nahid\nih-dmp-llm-evaluation-paper-code\outputs\automated-evaluation\merged_output_Llama_cleaned.csv | rows=156
Element counts: {'element_1': 26, 'element_2': 26, 'element_3': 26, 'element_4': 26, 'element_5': 26, 'element_6': 26}
Done: Step 7


# STEP 8 — Similarity Scoring (collapsed elements) + Sub-element Summary

In [None]:
# Shared scoring objects used by compute_similarity_tables: a sentence
# embedding model and a ROUGE-L scorer with stemming enabled.
sbert = SentenceTransformer("all-MiniLM-L6-v2")
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)


def compute_similarity_tables(
    in_path: Path,
    model_name: str,
    write_folder_summary: bool,
    folder_summary_path: Path | None,
    raw_path: Path | None,
    element_summary_path: Path | None,
) -> pd.DataFrame:
    """Score generated text against the reference text, row by row.

    Each row of the input CSV receives an SBERT cosine similarity (dot
    product of normalized embeddings from the module-level ``sbert`` model)
    and a ROUGE-L recall (module-level ``rouge`` scorer). Optional summary
    tables are written as CSV.

    Args:
        in_path: CSV with ``title``, ``Element number``, ``NIH Value`` and
            ``Generated Content`` columns.
        model_name: Label stored in the ``Model`` column.
        write_folder_summary: Whether to write the per-title mean table.
        folder_summary_path: Destination of the per-title table; only used
            when ``write_folder_summary`` is true and the path is not ``None``.
        raw_path: Destination of the per-row table, or ``None`` to skip.
        element_summary_path: Destination of the per-element mean table, or
            ``None`` to skip.

    Returns:
        The per-row table with columns ``Model``, ``DMP Title``,
        ``Element number``, ``SBERT_Similarity`` and ``ROUGE_L_Recall``.

    Raises:
        ValueError: If a required column is absent from the input file.
    """
    metric_cols = ["SBERT_Similarity", "ROUGE_L_Recall"]
    table = pd.read_csv(in_path)

    required = {"title", "Element number", "NIH Value", "Generated Content"}
    missing = [c for c in required if c not in table.columns]
    if missing:
        raise ValueError(f"{in_path} missing columns: {missing}. Found: {list(table.columns)}")

    # Missing cells become empty strings so both scorers receive plain text.
    for column in ("title", "Element number", "NIH Value", "Generated Content"):
        table[column] = table[column].apply(safe_text)
    table["Element number"] = table["Element number"].str.strip().str.lower()

    nih_texts = table["NIH Value"].tolist()
    gen_texts = table["Generated Content"].tolist()

    reference_vecs = sbert.encode(nih_texts, convert_to_tensor=True, normalize_embeddings=True)
    generated_vecs = sbert.encode(gen_texts, convert_to_tensor=True, normalize_embeddings=True)
    # With unit-length embeddings the row-wise dot product is the cosine.
    cosine = (reference_vecs * generated_vecs).sum(dim=1).cpu().numpy().astype(float)

    recalls = [
        rougeL_recall(reference, generated, rouge)
        for reference, generated in zip(nih_texts, gen_texts)
    ]

    raw = pd.DataFrame(
        {
            "Model": model_name,
            "DMP Title": table["title"],
            "Element number": table["Element number"],
            "SBERT_Similarity": cosine,
            "ROUGE_L_Recall": recalls,
        }
    )

    def mean_by(keys: list) -> pd.DataFrame:
        return raw.groupby(keys, as_index=False)[metric_cols].mean()

    if raw_path is not None:
        raw.to_csv(raw_path, index=False, encoding="utf-8")

    if write_folder_summary and folder_summary_path is not None:
        per_title = mean_by(["Model", "DMP Title"]).rename(columns={"DMP Title": "Folder"})
        per_title.to_csv(folder_summary_path, index=False, encoding="utf-8")

    if element_summary_path is not None:
        per_element = mean_by(["Model", "Element number"])
        per_element.to_csv(element_summary_path, index=False, encoding="utf-8")

    return raw


# Step 8a: score the collapsed elements (Step 7 cleaned files) and write the
# per-row, per-title and per-element tables for each model.
clean_inputs = {"Gpt": GPT_CLEANED, "Llama": LLAMA_CLEANED}

for model_name, in_path in clean_inputs.items():
    if not in_path.exists():
        # A missing cleaned file skips the model instead of aborting the run.
        print(f"Missing input: {in_path}")
        continue

    # The returned per-row table is not needed here; all three outputs are
    # written by the call itself.
    compute_similarity_tables(
        in_path=in_path,
        model_name=model_name,
        write_folder_summary=True,
        folder_summary_path=OUT_FOLDER[model_name],
        raw_path=OUT_RAW[model_name],
        element_summary_path=OUT_ELEMENT[model_name],
    )

    print(f"Saved Step 8 outputs for {model_name}:")
    print(f"  {OUT_FOLDER[model_name].as_posix()}")
    print(f"  {OUT_RAW[model_name].as_posix()}")
    print(f"  {OUT_ELEMENT[model_name].as_posix()}")

print("Done: Step 8a")


# Step 8b: score the non-collapsed rows (Step 6 merged files) and write only
# the per-sub-element mean table for each model.
sub_inputs = {"Gpt": GPT_MERGED, "Llama": LLAMA_MERGED}

for model_name, in_path in sub_inputs.items():
    if not in_path.exists():
        # A missing merged file skips the model instead of aborting the run.
        print(f"Missing input: {in_path}")
        continue

    # All output paths are None, so nothing is written by this call; only the
    # returned per-row table is used.
    raw_sub = compute_similarity_tables(
        in_path=in_path,
        model_name=model_name,
        write_folder_summary=False,
        folder_summary_path=None,
        raw_path=None,
        element_summary_path=None,
    )

    # Mean score per element label, ordered numerically by element and then
    # by sub-element letter.
    sub_element_summary = (
        raw_sub.groupby(["Model", "Element number"], as_index=False)[["SBERT_Similarity", "ROUGE_L_Recall"]]
        .mean()
        .sort_values(by="Element number", key=lambda s: s.map(sort_element_key))
        .reset_index(drop=True)
    )

    sub_element_summary.to_csv(OUT_SUB_ELEMENT[model_name], index=False, encoding="utf-8")
    print(f"Saved: {OUT_SUB_ELEMENT[model_name].as_posix()}")

print("Done: Step 8b")

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1106.70it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Saved Step 8 outputs for Gpt:
  C:/Users/Nahid/nih-dmp-llm-evaluation-paper-code/outputs/automated-evaluation/dmp_similarity_summary_Gpt.csv
  C:/Users/Nahid/nih-dmp-llm-evaluation-paper-code/outputs/automated-evaluation/element_similarity_raw_Gpt.csv
  C:/Users/Nahid/nih-dmp-llm-evaluation-paper-code/outputs/automated-evaluation/element_similarity_summary_Gpt.csv
Saved Step 8 outputs for Llama:
  C:/Users/Nahid/nih-dmp-llm-evaluation-paper-code/outputs/automated-evaluation/dmp_similarity_summary_Llama.csv
  C:/Users/Nahid/nih-dmp-llm-evaluation-paper-code/outputs/automated-evaluation/element_similarity_raw_Llama.csv
  C:/Users/Nahid/nih-dmp-llm-evaluation-paper-code/outputs/automated-evaluation/element_similarity_summary_Llama.csv
Done: Step 8a
Saved: C:/Users/Nahid/nih-dmp-llm-evaluation-paper-code/outputs/automated-evaluation/sub_element_similarity_summary_Gpt.csv
Saved: C:/Users/Nahid/nih-dmp-llm-evaluation-paper-code/outputs/automated-evaluation/sub_element_similarity_summary_Llam