# Automated-evaluation

In [119]:
import re
from pathlib import Path
import pandas as pd

PRIMARY_DIR = Path("primary")
HUMAN_XLSX  = PRIMARY_DIR / "Human.xlsx"
GPT_DIR     = PRIMARY_DIR / "gpt-DMPs"
LLAMA_DIR   = PRIMARY_DIR / "llama-DMPs"

DERIV_DIR = Path("derivative")
DERIV_DIR.mkdir(parents=True, exist_ok=True)

GPT_OUT_CSV   = DERIV_DIR / "filtered_Gpt.csv"
LLAMA_OUT_CSV = DERIV_DIR / "filtered_Llama.csv"

def norm_text(x) -> str:
    if x is None:
        return ""
    try:
        if pd.isna(x):
            return ""
    except Exception:
        pass
    s = str(x).strip().lower()
    s = re.sub(r"\s+", " ", s)
    return s

def clean_title_for_match(name: str) -> str:
    s = str(name).strip()
    s = re.sub(r'[\\/*?:"<>|]', "", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def normalize_md_stem(stem: str) -> str:
    s = (stem or "").strip().lower()
    s = re.sub(r"[-_\s]?gpt[-_\s]?[\d\.]+$", "", s)
    s = re.sub(r"[-_\s]?llama[-_\s]?[\d\.]+$", "", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def find_md_by_excel_title(search_dir: Path, excel_title: str) -> Path | None:
    excel_norm = norm_text(clean_title_for_match(excel_title))
    md_files = list(search_dir.rglob("*.md"))

    for p in md_files:
        if normalize_md_stem(p.stem) == excel_norm:
            return p

    candidates = [p for p in md_files if excel_norm and excel_norm in normalize_md_stem(p.stem)]
    if candidates:
        candidates = sorted(candidates, key=lambda x: (len(x.stem), str(x)))
        return candidates[0]
    return None

def is_title(line: str) -> bool:
    """
    Title detection for NIH template-style outputs:
    - Markdown headers: #, ##, ...
    - Numbered bold prompts: 1. **...**
    - Bold element headers: **Element 2: ...**
    """
    s = (line or "").strip()
    if not s:
        return False

    if s.startswith("#"):
        return True

    # 1. **Types...**
    if re.match(r"^\s*\d+[\.\)]?\s*\*\*.+\*\*\s*:?\s*$", s):
        return True

    # **Element 2: Related Tools...**
    if re.match(r"^\s*\*\*\s*element\s*\d+\s*:\s*.+\*\*\s*:?\s*$", s, flags=re.I):
        return True

    return False

def extract_titles_and_text(md_text: str, content_col: str) -> pd.DataFrame:
    cleaned = re.sub(r"<think>.*?</think>", "", md_text, flags=re.DOTALL | re.IGNORECASE)
    lines = cleaned.splitlines()

    rows = []
    current_title = None
    buf = []

    def flush():
        nonlocal current_title, buf, rows
        if current_title is None:
            return
        text = "\n".join(buf).strip()
        if text:
            rows.append({"Element title": current_title.strip(), content_col: text})

    for line in lines:
        if is_title(line):
            flush()
            current_title = line
            buf = []
        else:
            buf.append(line)

    flush()
    return pd.DataFrame(rows)

def choose_title_column(df: pd.DataFrame) -> str:
    candidates = ["title", "Title", "dmp_title", "DMP Title", "DMP_title"]
    for c in candidates:
        if c in df.columns:
            return c
    raise ValueError(f"Could not find title column. Columns: {list(df.columns)}")

human_df = pd.read_excel(HUMAN_XLSX)
title_col = choose_title_column(human_df)
titles = human_df[title_col].dropna().astype(str).tolist()

def process_model_folder(model_dir: Path, content_col: str, out_csv: Path) -> pd.DataFrame:
    records = []

    for title in titles:
        md_path = find_md_by_excel_title(model_dir, title)

        if md_path is None:
            records.append({
                "dmp_title": title,
                "md_path": None,
                "Element title": None,
                content_col: None,
                "status": "missing_md"
            })
            continue

        md_text = md_path.read_text(encoding="utf-8", errors="ignore")
        df_sec = extract_titles_and_text(md_text, content_col=content_col)

        if df_sec.empty:
            records.append({
                "dmp_title": title,
                "md_path": str(md_path),
                "Element title": None,
                content_col: None,
                "status": "no_sections_found"
            })
            continue

        for _, r in df_sec.iterrows():
            records.append({
                "dmp_title": title,
                "md_path": str(md_path),
                "Element title": r["Element title"],
                content_col: r[content_col],
                "status": "ok"
            })

    out_df = pd.DataFrame(records)
    out_df.to_csv(out_csv, index=False, encoding="utf-8")
    print(f"Saved: {out_csv} | rows={len(out_df)}")
    print(out_df["status"].value_counts(dropna=False))
    return out_df

process_model_folder(GPT_DIR,   "Generated_Gpt_content",   GPT_OUT_CSV)
process_model_folder(LLAMA_DIR, "Generated_Llama_content", LLAMA_OUT_CSV)

print("Done: Step 5 (UPDATED)")

Saved: derivative\filtered_Gpt.csv | rows=312
status
ok    312
Name: count, dtype: int64
Saved: derivative\filtered_Llama.csv | rows=312
status
ok    312
Name: count, dtype: int64
Done: Step 5 (UPDATED)


In [120]:
import re
from pathlib import Path
import pandas as pd

PRIMARY_DIR = Path("primary")
HUMAN_XLSX  = PRIMARY_DIR / "Human.xlsx"

DERIV_DIR = Path("derivative")
GPT_FILTERED   = DERIV_DIR / "filtered_Gpt.csv"
LLAMA_FILTERED = DERIV_DIR / "filtered_Llama.csv"

GPT_MERGED   = DERIV_DIR / "merged_output_Gpt.csv"
LLAMA_MERGED = DERIV_DIR / "merged_output_Llama.csv"

def norm_text(x) -> str:
    if x is None:
        return ""
    try:
        if pd.isna(x):
            return ""
    except Exception:
        pass
    s = str(x).strip().lower()
    s = re.sub(r"\s+", " ", s)
    return s

def choose_title_column(df: pd.DataFrame) -> str:
    candidates = ["title", "Title", "dmp_title", "DMP Title", "DMP_title"]
    for c in candidates:
        if c in df.columns:
            return c
    raise ValueError(f"Could not find title column. Columns: {list(df.columns)}")

def get_element_columns(df: pd.DataFrame) -> list:
    cols = []
    for c in df.columns:
        cc = str(c).strip().lower()
        if re.match(r"^element_\d+[a-z]?$", cc):
            cols.append(cc)
    if not cols:
        raise ValueError("No element_* columns found in Human.xlsx.")

    def sort_key(c):
        m = re.match(r"^element_(\d+)([a-z]?)$", c)
        return (int(m.group(1)), m.group(2) or "")
    return sorted(cols, key=sort_key)

def map_section_title_to_element(section_title) -> str | None:
    if section_title is None:
        return None
    try:
        if pd.isna(section_title):
            return None
    except Exception:
        pass

    t = str(section_title).strip()
    t_plain = re.sub(r"^#+\s*", "", t).strip()
    t_low = t_plain.lower()

    # Element headers
    if re.search(r"\belement\s*2\b", t_low): return "element_2"
    if re.search(r"\belement\s*3\b", t_low): return "element_3"
    if re.search(r"\belement\s*6\b", t_low): return "element_6"

    # Element 1 sub-questions (1a/1b/1c)
    if "types and amount of scientific data" in t_low: return "element_1a"
    if "scientific data that will be preserved and shared" in t_low: return "element_1b"
    if "metadata, other relevant data, and associated documentation" in t_low: return "element_1c"

    # Element 4 sub-questions (4a/4b/4c)
    if "repository where scientific data and metadata will be archived" in t_low: return "element_4a"
    if "how scientific data will be findable and identifiable" in t_low: return "element_4b"
    if "when and how long the scientific data will be made available" in t_low: return "element_4c"

    # Element 5 sub-questions (5a/5b/5c)
    if "factors affecting subsequent access, distribution, or reuse" in t_low: return "element_5a"
    if "whether access to scientific data will be controlled" in t_low: return "element_5b"
    if "protections for privacy, rights, and confidentiality" in t_low: return "element_5c"

    return None

# NIH reference
ref = pd.read_excel(HUMAN_XLSX)
title_col = choose_title_column(ref)
ref = ref.rename(columns={title_col: "title"})
ref.columns = [str(c).strip().lower() for c in ref.columns]

ref["title_norm"] = ref["title"].apply(norm_text)
element_cols = get_element_columns(ref)

ref_long = ref[["title", "title_norm"] + element_cols].fillna("").melt(
    id_vars=["title", "title_norm"],
    value_vars=element_cols,
    var_name="Element number",
    value_name="NIH Value",
)
ref_long["Element number"] = ref_long["Element number"].str.lower().str.strip()

def load_model_filtered(path: Path, content_col: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    df["title_norm"] = df["dmp_title"].apply(norm_text)
    df["Element title"] = df["Element title"].fillna("").astype(str)

    df["Element number"] = df["Element title"].apply(map_section_title_to_element)
    df = df[df["Element number"].notna()].copy()
    df["Element number"] = df["Element number"].str.lower().str.strip()

    df = df.rename(columns={content_col: "Generated Content"})
    return df[["title_norm", "Element number", "Element title", "Generated Content"]]

gpt_sec = load_model_filtered(GPT_FILTERED, "Generated_Gpt_content")
llm_sec = load_model_filtered(LLAMA_FILTERED, "Generated_Llama_content")

def merge_and_save(sec_df: pd.DataFrame, out_path: Path):
    merged = ref_long.merge(sec_df, on=["title_norm", "Element number"], how="left")
    merged = merged[["title", "Element number", "NIH Value", "Element title", "Generated Content"]]
    merged.to_csv(out_path, index=False, encoding="utf-8")
    print(f"Saved: {out_path} | rows={len(merged)}")

merge_and_save(gpt_sec, GPT_MERGED)
merge_and_save(llm_sec, LLAMA_MERGED)

print("Done: Step 6 (UPDATED)")

Saved: derivative\merged_output_Gpt.csv | rows=312
Saved: derivative\merged_output_Llama.csv | rows=312
Done: Step 6 (UPDATED)


In [122]:
import re
from pathlib import Path
import pandas as pd

DERIV_DIR = Path("derivative")
DERIV_DIR.mkdir(parents=True, exist_ok=True)

INPUTS = {
    "Gpt": DERIV_DIR / "merged_output_Gpt.csv",
    "Llama": DERIV_DIR / "merged_output_Llama.csv",
}
OUTPUTS = {
    "Gpt": DERIV_DIR / "merged_output_Gpt_cleaned.csv",
    "Llama": DERIV_DIR / "merged_output_Llama_cleaned.csv",
}

# Collapse these sub-elements into one per title
group_map = {
    "element_1": ["element_1a", "element_1b", "element_1c"],
    "element_4": ["element_4a", "element_4b", "element_4c"],
    "element_5": ["element_5a", "element_5b", "element_5c"],
}

def clean_text(x) -> str:
    """Trim + normalize blank lines; safe for NaN."""
    if x is None:
        return ""
    try:
        if pd.isna(x):
            return ""
    except Exception:
        pass
    s = str(x).strip()
    s = re.sub(r"\n\s*\n+", "\n\n", s)  # collapse multi-blank lines
    return s

def sort_element_key(x: str):
    """element_1, element_2, ... element_6 ordering."""
    s = str(x).strip().lower()
    m = re.match(r"^element_(\d+)([a-z]?)$", s)
    if not m:
        return (999, "z", s)
    return (int(m.group(1)), m.group(2) or "", s)

def combine_group_for_title(df_title: pd.DataFrame, new_element: str, group: list[str]) -> dict:
    """
    Combine NIH Value and Generated Content for the given group, preserving group order.
    """
    g = df_title[df_title["Element number"].isin(group)].copy()
    g["Element number"] = pd.Categorical(g["Element number"], categories=group, ordered=True)
    g = g.sort_values("Element number")

    nih = "\n\n".join([clean_text(v) for v in g["NIH Value"].tolist() if clean_text(v)])
    gen = "\n\n".join([clean_text(v) for v in g["Generated Content"].tolist() if clean_text(v)])

    return {
        "Element number": new_element,
        "NIH Value": nih,
        "Generated Content": gen,
    }

for model_name, in_path in INPUTS.items():
    if not in_path.exists():
        print(f"⚠️ Missing input: {in_path}")
        continue

    df = pd.read_csv(in_path)

    required = {"title", "Element number", "NIH Value", "Generated Content"}
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"{in_path} missing columns: {missing}. Found: {list(df.columns)}")

    # normalize
    df["Element number"] = df["Element number"].astype(str).str.strip().str.lower()

    group_flat = [e for grp in group_map.values() for e in grp]
    out_blocks = []

    # IMPORTANT: combine within each title
    for title, df_title in df.groupby("title", dropna=False):
        df_title = df_title.copy()

        keep_df = df_title[~df_title["Element number"].isin(group_flat)].copy()
        keep_df = keep_df[["Element number", "NIH Value", "Generated Content"]]

        merged_rows = []
        for new_element, grp in group_map.items():
            merged_rows.append(combine_group_for_title(df_title, new_element, grp))

        final_title_df = pd.concat(
            [keep_df, pd.DataFrame(merged_rows)],
            ignore_index=True
        )

        # clean text
        for col in ["NIH Value", "Generated Content"]:
            final_title_df[col] = final_title_df[col].apply(clean_text)

        # sort + attach title
        final_title_df = final_title_df.sort_values(
            by="Element number",
            key=lambda s: s.map(sort_element_key)
        ).reset_index(drop=True)

        final_title_df.insert(0, "title", title)
        out_blocks.append(final_title_df)

    final_df = pd.concat(out_blocks, ignore_index=True)

    out_path = OUTPUTS[model_name]
    final_df.to_csv(out_path, index=False, encoding="utf-8")

    # quick sanity check
    print(f" Saved: {out_path} | rows={len(final_df)}")
    print("Element counts:", final_df["Element number"].value_counts().to_dict())

print("Done: Step 7")

 Saved: derivative\merged_output_Gpt_cleaned.csv | rows=156
Element counts: {'element_1': 26, 'element_2': 26, 'element_3': 26, 'element_4': 26, 'element_5': 26, 'element_6': 26}
 Saved: derivative\merged_output_Llama_cleaned.csv | rows=156
Element counts: {'element_1': 26, 'element_2': 26, 'element_3': 26, 'element_4': 26, 'element_5': 26, 'element_6': 26}
Done: Step 7


In [124]:
from pathlib import Path
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer

# -----------------------------
# Paths
# -----------------------------
DERIV_DIR = Path("derivative")

INPUTS = {
    "Gpt":   DERIV_DIR / "merged_output_Gpt_cleaned.csv",
    "Llama": DERIV_DIR / "merged_output_Llama_cleaned.csv",
}

OUT_FOLDER = {
    "Gpt":   DERIV_DIR / "folder_similarity_summary_Gpt.csv",
    "Llama": DERIV_DIR / "folder_similarity_summary_Llama.csv",
}
OUT_RAW = {
    "Gpt":   DERIV_DIR / "element_similarity_raw_Gpt.csv",
    "Llama": DERIV_DIR / "element_similarity_raw_Llama.csv",
}
OUT_ELEMENT = {
    "Gpt":   DERIV_DIR / "element_similarity_summary_Gpt.csv",
    "Llama": DERIV_DIR / "element_similarity_summary_Llama.csv",
}

# -----------------------------
# Models
# -----------------------------
sbert = SentenceTransformer("all-MiniLM-L6-v2")
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

# -----------------------------
# Helpers
# -----------------------------
def safe_text(x) -> str:
    if x is None:
        return ""
    try:
        if pd.isna(x):
            return ""
    except Exception:
        pass
    return str(x)

def rougeL_recall(ref: str, gen: str) -> float:
    ref = ref or ""
    gen = gen or ""
    if not ref.strip() and not gen.strip():
        return 1.0
    if not ref.strip() or not gen.strip():
        return 0.0
    return float(rouge.score(ref, gen)["rougeL"].recall)

# -----------------------------
# Main
# -----------------------------
for model_name, in_path in INPUTS.items():
    if not in_path.exists():
        print(f"⚠️ Missing input: {in_path}")
        continue

    df = pd.read_csv(in_path)

    required = {"title", "Element number", "NIH Value", "Generated Content"}
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"{in_path} missing columns: {missing}. Found: {list(df.columns)}")

    # normalize
    df["title"] = df["title"].apply(safe_text)
    df["Element number"] = df["Element number"].apply(safe_text).str.strip().str.lower()
    df["NIH Value"] = df["NIH Value"].apply(safe_text)
    df["Generated Content"] = df["Generated Content"].apply(safe_text)

    # ---- SBERT (batch encode) ----
    nih_texts = df["NIH Value"].tolist()
    gen_texts = df["Generated Content"].tolist()

    # Encode in batches; normalize embeddings for stable cosine similarity
    emb_nih = sbert.encode(nih_texts, convert_to_tensor=True, normalize_embeddings=True)
    emb_gen = sbert.encode(gen_texts, convert_to_tensor=True, normalize_embeddings=True)

    # Cosine similarity per row (dot product works because normalized)
    sbert_sims = (emb_nih * emb_gen).sum(dim=1).cpu().numpy().astype(float)

    # ---- ROUGE-L recall (row-wise) ----
    rouge_recalls = [
        rougeL_recall(r, g) for r, g in zip(nih_texts, gen_texts)
    ]

    raw = pd.DataFrame({
        "Model": model_name,
        "DMP Title": df["title"],
        "Element number": df["Element number"],
        "SBERT_Similarity": sbert_sims,
        "ROUGE_L_Recall": rouge_recalls,
    })

    # Folder (DMP)-level summary
    folder_summary = (
        raw.groupby(["Model", "DMP Title"], as_index=False)[["SBERT_Similarity", "ROUGE_L_Recall"]]
           .mean()
           .rename(columns={"DMP Title": "Folder"})
    )

    # Element-level summary
    element_summary = (
        raw.groupby(["Model", "Element number"], as_index=False)[["SBERT_Similarity", "ROUGE_L_Recall"]]
           .mean()
    )

    # Save
    raw.to_csv(OUT_RAW[model_name], index=False, encoding="utf-8")
    folder_summary.to_csv(OUT_FOLDER[model_name], index=False, encoding="utf-8")
    element_summary.to_csv(OUT_ELEMENT[model_name], index=False, encoding="utf-8")

    print(f" Saved Step 8 outputs for {model_name}:")
    print(f"   - {OUT_FOLDER[model_name].as_posix()}")
    print(f"   - {OUT_RAW[model_name].as_posix()}")
    print(f"   - {OUT_ELEMENT[model_name].as_posix()}")

print("Done: Step 8")

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1562.94it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


 Saved Step 8 outputs for Gpt:
   - derivative/folder_similarity_summary_Gpt.csv
   - derivative/element_similarity_raw_Gpt.csv
   - derivative/element_similarity_summary_Gpt.csv
 Saved Step 8 outputs for Llama:
   - derivative/folder_similarity_summary_Llama.csv
   - derivative/element_similarity_raw_Llama.csv
   - derivative/element_similarity_summary_Llama.csv
Done: Step 8


In [125]:
import pandas as pd
from pathlib import Path

gpt_raw = pd.read_csv(Path("derivative") / "element_similarity_raw_Gpt.csv")
print(gpt_raw[["SBERT_Similarity", "ROUGE_L_Recall"]].describe())
print("Zero Generated cases (approx):", (gpt_raw["ROUGE_L_Recall"] == 0).mean())

       SBERT_Similarity  ROUGE_L_Recall
count        156.000000      156.000000
mean           0.688753        0.266886
std            0.126661        0.109872
min            0.323550        0.080000
25%            0.617297        0.191332
50%            0.700317        0.244146
75%            0.778203        0.325333
max            0.931892        0.675000
Zero Generated cases (approx): 0.0


In [111]:
import re
from pathlib import Path
import pandas as pd

# -----------------------------
# Paths
# -----------------------------
PRIMARY_DIR = Path("primary")
HUMAN_XLSX  = PRIMARY_DIR / "Human.xlsx"
GPT_DIR     = PRIMARY_DIR / "gpt-DMPs"
LLAMA_DIR   = PRIMARY_DIR / "llama-DMPs"

DERIV_DIR = Path("derivative")          # <-- top-level folder
DERIV_DIR.mkdir(parents=True, exist_ok=True)

GPT_OUT_CSV   = DERIV_DIR / "filtered_Gpt.csv"
LLAMA_OUT_CSV = DERIV_DIR / "filtered_Llama.csv"

# -----------------------------
# Helpers
# -----------------------------
def norm_text(x) -> str:
    """Lowercase + collapse whitespace; safe for NaN."""
    if x is None:
        return ""
    try:
        if pd.isna(x):
            return ""
    except Exception:
        pass
    s = str(x).strip().lower()
    s = re.sub(r"\s+", " ", s)
    return s

def clean_title_for_match(name: str) -> str:
    """Normalize Excel titles for filename matching (keeps spaces)."""
    s = str(name).strip()
    s = re.sub(r'[\\/*?:"<>|]', "", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def normalize_md_stem(stem: str) -> str:
    """
    Remove trailing model suffixes:
      "...-gpt-4.1" -> "..."
      "...-llama3.3" -> "..."
    """
    s = (stem or "").strip().lower()
    s = re.sub(r"[-_\s]?gpt[-_\s]?[\d\.]+$", "", s)
    s = re.sub(r"[-_\s]?llama[-_\s]?[\d\.]+$", "", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def find_md_by_excel_title(search_dir: Path, excel_title: str) -> Path | None:
    """Find the .md file whose normalized stem matches the Excel title."""
    excel_norm = norm_text(clean_title_for_match(excel_title))
    md_files = list(search_dir.rglob("*.md"))

    # Exact normalized match
    for p in md_files:
        if normalize_md_stem(p.stem) == excel_norm:
            return p

    # Fallback: contains match
    candidates = [p for p in md_files if excel_norm and excel_norm in normalize_md_stem(p.stem)]
    if candidates:
        candidates = sorted(candidates, key=lambda x: (len(x.stem), str(x)))
        return candidates[0]

    return None

def is_title(line: str) -> bool:
    """
    Detect NIH element headers in markdown.
    Supports:
      - Markdown headers: '#', '##', ...
      - Numbered bold headings: '1. **Data Type**'
    """
    s = (line or "").strip()
    if not s:
        return False
    if s.startswith("#"):
        return True
    return bool(re.match(r"^\s*\d+[\.\)]?\s*\*\*.+\*\*\s*$", s))

def extract_titles_and_text(md_text: str, content_col: str) -> pd.DataFrame:
    """Extract titles and their content; remove <think>...</think>."""
    cleaned = re.sub(r"<think>.*?</think>", "", md_text, flags=re.DOTALL | re.IGNORECASE)
    lines = cleaned.splitlines()

    rows = []
    current_title = None
    buf = []

    def flush():
        nonlocal current_title, buf, rows
        if current_title is None:
            return
        text = "\n".join(buf).strip()
        if text:
            rows.append({"Element title": current_title.strip(), content_col: text})

    for line in lines:
        if is_title(line):
            flush()
            current_title = line
            buf = []
        else:
            buf.append(line)

    flush()
    return pd.DataFrame(rows)

def choose_title_column(df: pd.DataFrame) -> str:
    candidates = ["title", "Title", "dmp_title", "DMP Title", "DMP_title"]
    for c in candidates:
        if c in df.columns:
            return c
    raise ValueError(f"Could not find title column. Columns: {list(df.columns)}")

# -----------------------------
# Load titles from Human.xlsx
# -----------------------------
if not HUMAN_XLSX.exists():
    raise FileNotFoundError(f"Missing: {HUMAN_XLSX}")

human_df = pd.read_excel(HUMAN_XLSX)
title_col = choose_title_column(human_df)
titles = human_df[title_col].dropna().astype(str).tolist()

# -----------------------------
# Process model folder
# -----------------------------
def process_model_folder(model_dir: Path, content_col: str, out_csv: Path) -> pd.DataFrame:
    records = []

    for title in titles:
        md_path = find_md_by_excel_title(model_dir, title)

        if md_path is None:
            records.append({
                "dmp_title": title,
                "md_path": None,
                "Element title": None,
                content_col: None,
                "status": "missing_md"
            })
            continue

        md_text = md_path.read_text(encoding="utf-8", errors="ignore")
        df_sec = extract_titles_and_text(md_text, content_col=content_col)

        if df_sec.empty:
            records.append({
                "dmp_title": title,
                "md_path": str(md_path),
                "Element title": None,
                content_col: None,
                "status": "no_sections_found"
            })
            continue

        for _, r in df_sec.iterrows():
            records.append({
                "dmp_title": title,
                "md_path": str(md_path),
                "Element title": r["Element title"],
                content_col: r[content_col],
                "status": "ok"
            })

    out_df = pd.DataFrame(records)
    out_df.to_csv(out_csv, index=False, encoding="utf-8")
    print(f"Saved: {out_csv} | rows={len(out_df)}")
    print(out_df["status"].value_counts(dropna=False))
    return out_df

# -----------------------------
# Run Step 5
# -----------------------------
process_model_folder(GPT_DIR,   content_col="Generated_Gpt_content",   out_csv=GPT_OUT_CSV)
process_model_folder(LLAMA_DIR, content_col="Generated_Llama_content", out_csv=LLAMA_OUT_CSV)

print("Done: Step 5 extraction completed.")

Saved: derivative\filtered_Gpt.csv | rows=234
status
ok    234
Name: count, dtype: int64
Saved: derivative\filtered_Llama.csv | rows=312
status
ok    312
Name: count, dtype: int64
Done: Step 5 extraction completed.


In [112]:
import re
from pathlib import Path
import pandas as pd

# -----------------------------
# Paths
# -----------------------------
PRIMARY_DIR = Path("primary")
HUMAN_XLSX  = PRIMARY_DIR / "Human.xlsx"

DERIV_DIR = Path("derivative")
DERIV_DIR.mkdir(parents=True, exist_ok=True)

GPT_FILTERED   = DERIV_DIR / "filtered_Gpt.csv"
LLAMA_FILTERED = DERIV_DIR / "filtered_Llama.csv"

GPT_MERGED   = DERIV_DIR / "merged_output_Gpt.csv"
LLAMA_MERGED = DERIV_DIR / "merged_output_Llama.csv"

# -----------------------------
# Helpers
# -----------------------------
def norm_text(x) -> str:
    if x is None:
        return ""
    try:
        if pd.isna(x):
            return ""
    except Exception:
        pass
    s = str(x).strip().lower()
    s = re.sub(r"\s+", " ", s)
    return s

def choose_title_column(df: pd.DataFrame) -> str:
    candidates = ["title", "Title", "dmp_title", "DMP Title", "DMP_title"]
    for c in candidates:
        if c in df.columns:
            return c
    raise ValueError(f"Could not find title column. Columns: {list(df.columns)}")

def get_element_columns(df: pd.DataFrame) -> list:
    cols = []
    for c in df.columns:
        cc = str(c).strip().lower()
        if re.match(r"^element_\d+[a-z]?$", cc):
            cols.append(cc)
    if not cols:
        raise ValueError(
            "No element_* columns found in Human.xlsx. "
            "Expected: element_1a, element_1b, ..., element_6."
        )

    def sort_key(c):
        m = re.match(r"^element_(\d+)([a-z]?)$", c)
        num = int(m.group(1))
        suf = m.group(2) or ""
        return (num, suf)

    return sorted(cols, key=sort_key)

# --- mapping markdown section title -> NIH element key ---
def build_element_map():
    return [
        (re.compile(r"\b1\b.*\bdata\b.*\btype\b", re.I), "element_1a"),
        (re.compile(r"\b1\b.*\bformat\b", re.I), "element_1b"),
        (re.compile(r"\b1\b.*\bmetadata\b|\bdocumentation\b", re.I), "element_1c"),

        (re.compile(r"\b2\b.*\btools\b|\bsoftware\b|\bcode\b", re.I), "element_2"),
        (re.compile(r"\b3\b.*\bstandards\b|\bquality\b|\bcontrol\b", re.I), "element_3"),

        (re.compile(r"\b4\b.*\brepository\b|\bpreserv", re.I), "element_4a"),
        (re.compile(r"\b4\b.*\btimeline\b|\bshare\b|\brelease\b", re.I), "element_4b"),
        (re.compile(r"\b4\b.*\baccess\b|\brestriction\b|\bcontrolled\b", re.I), "element_4c"),

        (re.compile(r"\b5\b.*\bprivacy\b|\bconfidential", re.I), "element_5a"),
        (re.compile(r"\b5\b.*\bsecurity\b|\bencrypt\b|\bstorage\b", re.I), "element_5b"),
        (re.compile(r"\b5\b.*\bconsent\b|\birb\b|\birs\b|\bcompliance\b", re.I), "element_5c"),

        (re.compile(r"\b6\b.*\boversight\b|\bresponsib", re.I), "element_6"),
    ]

ELEMENT_MAP = build_element_map()

def map_section_title_to_element(section_title) -> str | None:
    if section_title is None:
        return None
    try:
        if pd.isna(section_title):
            return None
    except Exception:
        pass

    t = str(section_title)
    t = re.sub(r"^#+\s*", "", t).strip()

    for pattern, element_key in ELEMENT_MAP:
        if pattern.search(t):
            return element_key
    return None

# -----------------------------
# Load NIH reference (Human.xlsx)
# -----------------------------
ref = pd.read_excel(HUMAN_XLSX)
ref.columns = [str(c).strip() for c in ref.columns]

title_col = choose_title_column(ref)
ref = ref.rename(columns={title_col: "title"})
ref.columns = [str(c).strip().lower() for c in ref.columns]

ref["title_norm"] = ref["title"].apply(norm_text)
element_cols = get_element_columns(ref)

ref = ref[["title", "title_norm"] + element_cols].fillna("")

ref_long = ref.melt(
    id_vars=["title", "title_norm"],
    value_vars=element_cols,
    var_name="Element number",
    value_name="NIH Value",
)
ref_long["Element number"] = ref_long["Element number"].str.lower().str.strip()

# -----------------------------
# Load Step 5 CSVs + map section titles to element keys
# -----------------------------
def load_model_filtered(path: Path, content_col: str) -> pd.DataFrame:
    df = pd.read_csv(path)

    needed = {"dmp_title", "Element title", content_col}
    missing = [c for c in needed if c not in df.columns]
    if missing:
        raise ValueError(f"{path} missing columns: {missing}. Found: {list(df.columns)}")

    df["title_norm"] = df["dmp_title"].apply(norm_text)
    df["Element title"] = df["Element title"].fillna("").astype(str)

    df["Element number"] = df["Element title"].apply(map_section_title_to_element)
    df = df[df["Element number"].notna()].copy()
    df["Element number"] = df["Element number"].str.lower().str.strip()

    df = df.rename(columns={content_col: "Generated Content"})
    return df[["dmp_title", "title_norm", "Element number", "Element title", "Generated Content"]]

gpt_sec = load_model_filtered(GPT_FILTERED, content_col="Generated_Gpt_content")
llm_sec = load_model_filtered(LLAMA_FILTERED, content_col="Generated_Llama_content")

# -----------------------------
# Merge + Save
# -----------------------------
def merge_and_save(ref_long: pd.DataFrame, sec_df: pd.DataFrame, out_path: Path):
    merged = ref_long.merge(sec_df, on=["title_norm", "Element number"], how="left")
    merged = merged[["title", "Element number", "NIH Value", "Element title", "Generated Content"]]
    merged.to_csv(out_path, index=False, encoding="utf-8")
    print(f"Saved: {out_path} | rows={len(merged)}")

merge_and_save(ref_long, gpt_sec, GPT_MERGED)
merge_and_save(ref_long, llm_sec, LLAMA_MERGED)

print("Done: Step 6 merge completed.")

Saved: derivative\merged_output_Gpt.csv | rows=338
Saved: derivative\merged_output_Llama.csv | rows=338
Done: Step 6 merge completed.


In [113]:
import re
from pathlib import Path
import pandas as pd

# -----------------------------
# Paths
# -----------------------------
DERIV_DIR = Path("derivative")

INPUTS = {
    "Gpt": DERIV_DIR / "merged_output_Gpt.csv",
    "Llama": DERIV_DIR / "merged_output_Llama.csv",
}

OUTPUTS = {
    "Gpt": DERIV_DIR / "merged_output_Gpt_cleaned.csv",
    "Llama": DERIV_DIR / "merged_output_Llama_cleaned.csv",
}

# -----------------------------
# Groups to collapse
# -----------------------------
group_map = {
    "element_1": ["element_1a", "element_1b", "element_1c"],
    "element_4": ["element_4a", "element_4b", "element_4c"],
    "element_5": ["element_5a", "element_5b", "element_5c"],
}

# -----------------------------
# Helpers
# -----------------------------
def clean_text(text) -> str:
    """Remove multiple blank lines, leading/trailing spaces/newlines; safe for NaN."""
    if text is None:
        return ""
    try:
        if pd.isna(text):
            return ""
    except Exception:
        pass

    s = str(text).strip()
    # Replace 2+ consecutive newlines (with optional whitespace) by a single blank line
    s = re.sub(r"\n\s*\n+", "\n\n", s)
    return s

def sort_element_key(x: str):
    """
    Sort element numbers in a logical order:
      element_1, element_2, element_3, element_4, element_5, element_6
      and if any subelements remain, keep them after their parent.
    """
    s = str(x).strip().lower()
    m = re.match(r"^element_(\d+)([a-z]?)$", s)
    if not m:
        return (999, "z", s)
    num = int(m.group(1))
    suf = m.group(2) or ""
    return (num, suf, s)

def combine_group(df: pd.DataFrame, new_element: str, group: list[str]) -> dict:
    group_df = df[df["Element number"].isin(group)].copy()

    # Join NIH Value and Generated Content in the group order (1a,1b,1c etc.)
    group_df["Element number"] = pd.Categorical(group_df["Element number"], categories=group, ordered=True)
    group_df = group_df.sort_values("Element number")

    merged_value = "\n\n".join([clean_text(x) for x in group_df["NIH Value"].tolist() if clean_text(x)])
    merged_generated = "\n\n".join([clean_text(x) for x in group_df["Generated Content"].tolist() if clean_text(x)])

    return {
        "Element number": new_element,
        "NIH Value": merged_value,
        "Generated Content": merged_generated,
    }

# -----------------------------
# Main
# -----------------------------
for model_name, in_path in INPUTS.items():
    if not in_path.exists():
        print(f" Missing input file: {in_path}")
        continue

    df = pd.read_csv(in_path)

    # Basic validation
    required_cols = {"Element number", "NIH Value", "Generated Content"}
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise ValueError(f"{in_path} missing columns: {missing}. Found: {list(df.columns)}")

    # Normalize element number casing
    df["Element number"] = df["Element number"].astype(str).str.strip().str.lower()

    # Keep non-group rows
    group_flat = [e for group in group_map.values() for e in group]
    remaining_df = df[~df["Element number"].isin(group_flat)].copy()

    # Build collapsed rows
    merged_rows = []
    for new_element, group in group_map.items():
        merged_rows.append(combine_group(df, new_element, group))

    final_df = pd.concat([remaining_df, pd.DataFrame(merged_rows)], ignore_index=True)

    # Clean text columns
    for col in ["NIH Value", "Generated Content"]:
        final_df[col] = final_df[col].apply(clean_text)

    # Sort nicely
    final_df = final_df.sort_values(by="Element number", key=lambda s: s.map(sort_element_key)).reset_index(drop=True)

    out_path = OUTPUTS[model_name]
    final_df.to_csv(out_path, index=False, encoding="utf-8")
    print(f" Saved: {out_path}")

 Saved: derivative\merged_output_Gpt_cleaned.csv
 Saved: derivative\merged_output_Llama_cleaned.csv


In [114]:
import re
from pathlib import Path
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer

# -----------------------------
# Paths
# -----------------------------
DERIV_DIR = Path("derivative")

# Use cleaned merged outputs (recommended)
INPUTS = {
    "Gpt":   DERIV_DIR / "merged_output_Gpt_cleaned.csv",
    "Llama": DERIV_DIR / "merged_output_Llama_cleaned.csv",
}

# Output files (saved in derivative/)
OUT_FOLDER_SUMMARY = {
    "Gpt":   DERIV_DIR / "folder_similarity_summary_Gpt.csv",
    "Llama": DERIV_DIR / "folder_similarity_summary_Llama.csv",
}
OUT_ELEMENT_RAW = {
    "Gpt":   DERIV_DIR / "element_similarity_raw_Gpt.csv",
    "Llama": DERIV_DIR / "element_similarity_raw_Llama.csv",
}
OUT_ELEMENT_SUMMARY = {
    "Gpt":   DERIV_DIR / "element_similarity_summary_Gpt.csv",
    "Llama": DERIV_DIR / "element_similarity_summary_Llama.csv",
}

# -----------------------------
# Models
# -----------------------------
sbert = SentenceTransformer("all-MiniLM-L6-v2")
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

# -----------------------------
# Helpers
# -----------------------------
def safe_text(x) -> str:
    """Convert to string; return '' for NaN/None."""
    if x is None:
        return ""
    try:
        if pd.isna(x):
            return ""
    except Exception:
        pass
    return str(x)

def compute_sbert_similarity(t1: str, t2: str) -> float:
    if not t1.strip() and not t2.strip():
        return 1.0  # both empty => perfect match (you can change to 0.0 if you prefer)
    if not t1.strip() or not t2.strip():
        return 0.0
    emb1 = sbert.encode(t1, convert_to_tensor=True)
    emb2 = sbert.encode(t2, convert_to_tensor=True)
    return float(util.cos_sim(emb1, emb2).item())

def compute_rougeL_recall(t_ref: str, t_gen: str) -> float:
    if not t_ref.strip() and not t_gen.strip():
        return 1.0
    if not t_ref.strip() or not t_gen.strip():
        return 0.0
    scores = rouge.score(t_ref, t_gen)
    return float(scores["rougeL"].recall)

# -----------------------------
# Main loop per model
# -----------------------------
for model_name, in_path in INPUTS.items():
    if not in_path.exists():
        print(f" Missing input: {in_path}")
        continue

    df = pd.read_csv(in_path)

    # Expected columns from Step 6:
    # title, Element number, NIH Value, Element title, Generated Content
    required_cols = ["title", "Element number", "NIH Value", "Generated Content"]
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise ValueError(f"{in_path} missing columns: {missing}. Found: {list(df.columns)}")

    element_rows = []

    # Compute per-row scores
    for _, row in df.iterrows():
        dmp_title = safe_text(row["title"])
        elem = safe_text(row["Element number"]).strip().lower()

        nih_text = safe_text(row["NIH Value"])
        gen_text = safe_text(row["Generated Content"])

        sbert_sim = compute_sbert_similarity(nih_text, gen_text)
        rouge_recall = compute_rougeL_recall(nih_text, gen_text)

        element_rows.append({
            "Model": model_name,
            "DMP Title": dmp_title,
            "Element number": elem,
            "SBERT_Similarity": sbert_sim,
            "ROUGE_L_Recall": rouge_recall,
        })

    element_raw = pd.DataFrame(element_rows)

    # Folder (DMP)-level summary: mean across elements for each DMP
    folder_summary = (
        element_raw
        .groupby(["Model", "DMP Title"], as_index=False)[["SBERT_Similarity", "ROUGE_L_Recall"]]
        .mean()
        .rename(columns={"DMP Title": "Folder"})
    )

    # Element-level summary: mean across DMPs for each element
    element_summary = (
        element_raw
        .groupby(["Model", "Element number"], as_index=False)[["SBERT_Similarity", "ROUGE_L_Recall"]]
        .mean()
    )

    # Save
    element_raw.to_csv(OUT_ELEMENT_RAW[model_name], index=False)
    folder_summary.to_csv(OUT_FOLDER_SUMMARY[model_name], index=False)
    element_summary.to_csv(OUT_ELEMENT_SUMMARY[model_name], index=False)

    print(f" Saved for {model_name}:")
    print(f"   - {OUT_FOLDER_SUMMARY[model_name]}")
    print(f"   - {OUT_ELEMENT_RAW[model_name]}")
    print(f"   - {OUT_ELEMENT_SUMMARY[model_name]}")

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1587.34it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


 Saved for Gpt:
   - derivative\folder_similarity_summary_Gpt.csv
   - derivative\element_similarity_raw_Gpt.csv
   - derivative\element_similarity_summary_Gpt.csv
 Saved for Llama:
   - derivative\folder_similarity_summary_Llama.csv
   - derivative\element_similarity_raw_Llama.csv
   - derivative\element_similarity_summary_Llama.csv
