In [None]:
# ============================================
# STEP 1 — Imports, Config (YAML), and Helpers
# (Notebook-safe outputs under the notebook's working directory, e.g. ./notebook_DMP_RAG/)
# Reuses pipeline FAISS index (read-only)
# ============================================
import os, re, time
from pathlib import Path
from datetime import datetime

import yaml
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import pypandoc  # for Markdown → DOCX

# --- LangChain Core ---
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()

# ---------- Resolve project root (works in notebook or script) ----------
# `__file__` is not defined inside Jupyter, so the NameError branch falls back
# to the parent of the current working directory.
# NOTE(review): this assumes the notebook lives exactly one level below the
# project root — confirm if notebooks are moved.
try:
    PROJECT_ROOT = Path(__file__).resolve().parents[1]  # when running a .py script
except NameError:
    PROJECT_ROOT = Path.cwd().parent                    # when running inside Jupyter

# ---------- Notebook-only output root (keeps experiments separate) ----------
NOTEBOOK_DIR = Path.cwd()  # folder where the notebook is running
NB_OUT_ROOT = NOTEBOOK_DIR / "Output_experiemnt_RAG3_NIH_all"

# Optional: timestamped run folder so outputs never overwrite
# RUN_ID is computed when this cell executes, so re-running the cell starts a
# new run folder (second-level resolution).
RUN_ID = datetime.now().strftime("%Y%m%d_%H%M%S")
RUN_DIR = NB_OUT_ROOT / f"run_{RUN_ID}"

# Per-run sub-folders for the generated Markdown and the converted DOCX files.
NB_MD_DIR   = RUN_DIR / "md"
NB_DOCX_DIR = RUN_DIR / "docx"

# Create the whole output tree up front; existing folders are left untouched.
for p in [NB_OUT_ROOT, RUN_DIR, NB_MD_DIR, NB_DOCX_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# ---------- YAML loader + path resolver ----------
def load_yaml_config(cfg_path: Path) -> dict:
    """Read a YAML configuration file and return its top-level mapping.

    Args:
        cfg_path: Location of the YAML file (anything ``Path`` accepts).

    Returns:
        The parsed document, which must be a ``dict``.

    Raises:
        FileNotFoundError: ``cfg_path`` does not exist.
        ValueError: The document parsed to something other than a dict
            (an empty file parses to ``None`` and lands here too).
    """
    config_file = Path(cfg_path)
    if not config_file.exists():
        raise FileNotFoundError(f"Config YAML not found: {config_file}")

    with config_file.open("r", encoding="utf-8") as handle:
        parsed = yaml.safe_load(handle)

    if isinstance(parsed, dict):
        return parsed
    raise ValueError(f"Config YAML must parse to a dict. Got: {type(parsed)}")


def resolve_from_root(project_root: Path, root_dir_value: str | Path) -> Path:
    """
    YAML root_dir:
      - "." means project root
      - relative paths are relative to project root
      - absolute paths are used as-is
    """
    p = Path(root_dir_value).expanduser()
    if p.is_absolute():
        return p.resolve()
    return (project_root / p).resolve()


def resolve_path(base: Path, rel_or_abs: str | Path | None) -> Path | None:
    """Resolve a path relative to `base` if not absolute. Keep None as None."""
    if rel_or_abs is None:
        return None
    p = Path(rel_or_abs).expanduser()
    if p.is_absolute():
        return p.resolve()
    return (base / p).resolve()


# ---------- Choose your YAML file here ----------
CONFIG_YAML = PROJECT_ROOT / "config" / "config.yaml"
cfg = load_yaml_config(CONFIG_YAML)

# ---------- Root dir from YAML ----------
ROOT_DIR = resolve_from_root(PROJECT_ROOT, cfg["root_dir"])

# ---------- Paths from YAML (READ-ONLY / pipeline assets) ----------
# NOTE: these three keys are indexed directly, so a missing key raises KeyError
# even for the entries described as optional below.
DATA_PDFS  = resolve_path(ROOT_DIR, cfg["paths"]["data_pdfs"])        # optional here, but kept for reference
INDEX_DIR  = resolve_path(ROOT_DIR, cfg["paths"]["index_dir"])        # <-- reuse pipeline index (read-only)
EXCEL_PATH = resolve_path(ROOT_DIR, cfg["paths"]["excel_path"])       # optional depending on your notebook

# Template (read-only)
# The only path with a fallback: used when `paths.template_md` is absent.
TEMPLATE_MD = resolve_path(
    ROOT_DIR,
    cfg["paths"].get("template_md", "data/inputs/dmp-template.md")
)

# ---------- RAG params ----------
# Number of documents the retriever returns per query (passed as `k` in STEP 2).
TOP_K = int(cfg["rag"]["retriever_top_k"])

# ---------- Models ----------
EMBED_MODEL = cfg["models"]["embedding_model"]
LLM_MODEL   = cfg["models"]["llm_name"]

# NOTE(review): bool() treats any non-empty string as True, so a quoted "false"
# in the YAML would become True — assumes the YAML holds real booleans; confirm.
EMBED_DEVICE       = cfg["models"]["embedding_device"]
EMBED_BATCH_SIZE   = int(cfg["models"]["embedding_batch_size"])
NORMALIZE_EMBEDS   = bool(cfg["models"]["normalize_embeddings"])
HF_CACHE_DIR       = resolve_path(ROOT_DIR, cfg["models"]["hf_cache_dir"])
LOCAL_FILES_ONLY   = bool(cfg["models"]["local_files_only"])
ALLOW_DL_IF_MISS   = bool(cfg["models"]["allow_download_if_missing"])

# ---------- Notebook-only outputs (always under the current RUN_DIR) ----------
OUTPUT_MD   = NB_MD_DIR / "generated_dmp.md"
OUTPUT_DOCX = NB_DOCX_DIR / "generated_dmp.docx"


# ---------- Helper functions ----------
def create_folder(folderpath: Path | str) -> None:
    Path(folderpath).mkdir(parents=True, exist_ok=True)

def save_md(folderpath: Path | str, filename: str, text: str) -> Path:
    create_folder(folderpath)
    out_path = Path(folderpath) / filename
    out_path.write_text(text, encoding="utf-8")
    print("Saved:", out_path)
    return out_path

def md_to_docs(md_filepath: Path | str, docx_folderpath: Path | str, docx_filename: str) -> Path:
    """Convert a Markdown file to DOCX via pypandoc and return the DOCX path.

    The destination folder is created when missing.
    """
    create_folder(docx_folderpath)
    destination = Path(docx_folderpath) / docx_filename
    pypandoc.convert_file(
        str(md_filepath),
        "docx",
        outputfile=str(destination),
    )
    print("Converted:", destination)
    return destination

def clean_filename(name: str) -> str:
    """Replace characters Windows forbids in filenames with ``_`` and trim whitespace."""
    forbidden = re.compile(r'[\\/*?:"<>|]')
    replaced = forbidden.sub("_", str(name))
    return replaced.strip()


# ---------- Sanity print ----------
# Echo every resolved path and setting so a wrong config or working directory
# is visible before the long-running steps start.
print("STEP 1 ready (reuses pipeline index, notebook-local outputs)")
print(f"CONFIG_YAML : {CONFIG_YAML}")
print(f"PROJECT_ROOT: {PROJECT_ROOT}")
print(f"ROOT_DIR    : {ROOT_DIR}")
print(f"NOTEBOOK_DIR: {NOTEBOOK_DIR}")
print(f"NB_OUT_ROOT : {NB_OUT_ROOT}")
print(f"RUN_DIR     : {RUN_DIR}")

print(f"INDEX_DIR (pipeline): {INDEX_DIR}")
print(f"OUTPUT_MD (notebook): {OUTPUT_MD}")
print(f"OUTPUT_DOCX(notebook): {OUTPUT_DOCX}")

print(f"DATA_PDFS   : {DATA_PDFS}")
print(f"EXCEL_PATH  : {EXCEL_PATH}")
print(f"TEMPLATE_MD : {TEMPLATE_MD}")

print(f"EMBED_MODEL : {EMBED_MODEL}")
print(f"LLM_MODEL   : {LLM_MODEL}")
print(f"TOP_K       : {TOP_K}")
print(f"EMBED_DEVICE: {EMBED_DEVICE} | BATCH: {EMBED_BATCH_SIZE} | NORMALIZE: {NORMALIZE_EMBEDS}")
print(f"HF_CACHE_DIR: {HF_CACHE_DIR} | local_files_only={LOCAL_FILES_ONLY} | allow_download_if_missing={ALLOW_DL_IF_MISS}")

STEP 1 ready (reuses pipeline index, notebook-local outputs)
CONFIG_YAML : c:\Users\Nahid\dmpchef\config\config.yaml
PROJECT_ROOT: c:\Users\Nahid\dmpchef
ROOT_DIR    : C:\Users\Nahid\dmpchef
NOTEBOOK_DIR: c:\Users\Nahid\dmpchef\notebook_DMP_RAG
NB_OUT_ROOT : c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all
RUN_DIR     : c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260225_140527
INDEX_DIR (pipeline): C:\Users\Nahid\dmpchef\data\vector_db\NIH_all_db
OUTPUT_MD (notebook): c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260225_140527\md\generated_dmp.md
OUTPUT_DOCX(notebook): c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260225_140527\docx\generated_dmp.docx
DATA_PDFS   : C:\Users\Nahid\dmpchef\data\data_ingestion\NIH_all
EXCEL_PATH  : C:\Users\Nahid\dmpchef\data\inputs\inputs.xlsx
TEMPLATE_MD : C:\Users\Nahid\dmpchef\data\inputs\dmp-template.md
EMBED_MODEL : sentence-transformers/all-MiniLM-L6-v2
LLM_MODEL   : llama3.3:latest
TOP_K

In [10]:
# ============================================
# STEP 2 — Load pipeline FAISS index (READ-ONLY) and create retriever
# ============================================
from pathlib import Path
from langchain_community.vectorstores import FAISS

# Prefer new HuggingFace integration if installed; fallback to langchain_community
try:
    from langchain_huggingface import HuggingFaceEmbeddings  # type: ignore
    _EMB_BACKEND = "langchain_huggingface"
except Exception:
    from langchain_community.embeddings import HuggingFaceEmbeddings  # type: ignore
    _EMB_BACKEND = "langchain_community"

import torch
import os

# Ensure HF uses the same cache directory you configured (very important in notebooks)
# This helps avoid "couldn't find them in the cached files" surprises.
# setdefault keeps an HF_HOME that is already present in the environment.
# NOTE(review): this runs after the HuggingFace/torch imports above; libraries
# that read HF_HOME at import time may not see it — confirm. The embedding
# loader below also receives the cache folder explicitly.
if HF_CACHE_DIR is not None:
    os.environ.setdefault("HF_HOME", str(HF_CACHE_DIR))

def _pick_device(requested: str) -> str:
    """Map the configured device name to ``"cuda"`` or ``"cpu"``.

    ``"auto"``, ``"cuda"`` and empty/None values select CUDA when torch
    reports it as available; every other value yields ``"cpu"``.
    Matching ignores case and surrounding whitespace.
    """
    wanted = (requested or "auto").strip().lower()
    if wanted not in {"auto", "cuda"}:
        return "cpu"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"

# Resolve the device once; _make_embeddings and the status print below reuse it.
device = _pick_device(EMBED_DEVICE)

def _make_embeddings(local_only: bool):
    """Instantiate ``HuggingFaceEmbeddings`` from the STEP 1 model settings.

    Args:
        local_only: Forwarded as ``local_files_only`` in ``model_kwargs``.

    Reads the module-level EMBED_MODEL, HF_CACHE_DIR, device,
    EMBED_BATCH_SIZE and NORMALIZE_EMBEDS values.
    """
    cache_dir = None if HF_CACHE_DIR is None else str(HF_CACHE_DIR)
    loader_options = {
        "device": device,
        "local_files_only": bool(local_only),
    }
    encoding_options = {
        "batch_size": int(EMBED_BATCH_SIZE),
        "normalize_embeddings": bool(NORMALIZE_EMBEDS),
    }
    return HuggingFaceEmbeddings(
        model_name=EMBED_MODEL,
        cache_folder=cache_dir,
        model_kwargs=loader_options,
        encode_kwargs=encoding_options,
    )

# 1) Try per YAML: local_files_only
try:
    embeddings = _make_embeddings(local_only=LOCAL_FILES_ONLY)
except Exception as e1:
    # 2) If offline cache miss but allowed to download, retry online
    # NOTE(review): any exception triggers this retry, not only a cache miss,
    # and `e1` is never reported — the printed reason may be misleading.
    if LOCAL_FILES_ONLY and ALLOW_DL_IF_MISS:
        print("Embeddings not found in cache; retrying with download enabled...")
        embeddings = _make_embeddings(local_only=False)
    else:
        raise

# Both files are checked up front so a wrong index_dir fails with a clear
# message rather than inside FAISS.load_local.
index_dir = Path(INDEX_DIR)
faiss_path = index_dir / "index.faiss"
pkl_path   = index_dir / "index.pkl"

if not (faiss_path.exists() and pkl_path.exists()):
    raise FileNotFoundError(
        "FAISS index files not found.\n"
        f"Expected:\n- {faiss_path}\n- {pkl_path}\n"
        "Run your pipeline build_index.py or fix config.paths.index_dir."
    )

print("Loading FAISS index (read-only) from:", index_dir)
# SECURITY: allow_dangerous_deserialization=True lets load_local unpickle
# index.pkl; only point INDEX_DIR at an index produced by a trusted pipeline.
vectorstore = FAISS.load_local(
    str(index_dir),
    embeddings,
    allow_dangerous_deserialization=True
)

# Retriever returning the TOP_K nearest documents per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": int(TOP_K)})

print(
    "Retriever ready",
    f"top_k={TOP_K}",
    f"embed_model={EMBED_MODEL}",
    f"device={device}",
    f"backend={_EMB_BACKEND}",
    sep=" | "
)

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1391.84it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading FAISS index (read-only) from: C:\Users\Nahid\dmpchef\data\vector_db\NIH_all_db
Retriever ready | top_k=6 | embed_model=sentence-transformers/all-MiniLM-L6-v2 | device=cuda | backend=langchain_huggingface


In [11]:
# ============================================
# STEP 3 — Load Excel + Template and Build RAG Chain (few-shot examples are not implemented in this cell)
# ============================================
import re
import pandas as pd
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import Ollama

# ---- Guard: Step 2 must run first ----
# Notebook cells can be executed out of order; fail early with a clear hint.
if "retriever" not in globals():
    raise RuntimeError("`retriever` is not defined. Run STEP 2 (load FAISS index) before STEP 3.")

# --- Load Excel file ---
if EXCEL_PATH is None or not EXCEL_PATH.exists():
    raise FileNotFoundError(f"Excel file not found: {EXCEL_PATH}")

df = pd.read_excel(EXCEL_PATH)
# Normalise headers (trimmed, lower-case) so later lookups such as "title" and
# the "element*" prefix match regardless of how the sheet was typed.
df.columns = df.columns.str.strip().str.lower()
# Empty cells become "" so str(...).strip() checks downstream treat them as missing.
df = df.fillna("")
print(f"Excel loaded successfully: {len(df)} rows")

# --- Load Markdown Template ---
if TEMPLATE_MD is None or not TEMPLATE_MD.exists():
    raise FileNotFoundError(f"Template file not found: {TEMPLATE_MD}")

# The template text is embedded verbatim into every question in STEP 4.
dmp_template_text = TEMPLATE_MD.read_text(encoding="utf-8")
print("DMP Markdown template loaded.")


# ============================================
# Build RAG chain (RAG grounding)
# ============================================
def build_rag_chain(retriever, llm_model=LLM_MODEL, n_few_shot: int = 3):
    """Assemble the retrieval-augmented generation chain.

    The chain sends the incoming question to ``retriever``, renders the
    returned documents as a context block, fills the prompt, calls the
    Ollama model and parses the reply to a plain string.

    Args:
        retriever: Retriever piped into the context formatter.
        llm_model: Ollama model name.
        n_few_shot: Accepted for interface compatibility; it is not used
            anywhere in this function.

    Returns:
        A runnable taking the question string and returning the model text.
    """
    llm = Ollama(model=llm_model)
    to_text = StrOutputParser()

    def format_docs(docs):
        """Render documents as "[Page N] source" headers followed by their text."""

        def render(doc):
            meta = doc.metadata
            page = meta.get("page", meta.get("page_number", ""))
            source = meta.get("source", meta.get("file_path", ""))
            # Integer page numbers are shown 1-based; other values are shown as-is.
            shown_page = page + 1 if isinstance(page, int) else page
            body = (doc.page_content or "").strip()
            return f"[Page {shown_page}] {source}\n{body}"

        return "\n\n".join(render(d) for d in docs) if docs else ""

    prompt_template = """You are an expert biomedical data steward and grant writer.
Create a high-quality NIH Data Management and Sharing Plan (DMSP) based on the retrieved NIH context and the user's query.

---- Context from NIH Repository (grounding) ----
{context}

---- Question ----
{question}

Rules:
- Use NIH context when relevant; do NOT invent policy details.
- If a specific policy detail is not supported by the provided context, write: "Not specified in provided NIH context."
- Follow the NIH template structure and keep section titles unchanged when the template is provided.
"""
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"],
    )

    # The same input string feeds both the retriever and the prompt's question slot.
    chain_inputs = {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    rag_chain = chain_inputs | prompt | llm | to_text

    print(f"RAG chain initialized with model: {llm_model}")
    return rag_chain


# Build the chain once with the STEP 2 retriever; STEP 4 invokes `rag_chain` per row.
rag_chain = build_rag_chain(retriever, n_few_shot=3)
print("RAG chain ready for generation.")

Excel loaded successfully: 26 rows
DMP Markdown template loaded.
RAG chain initialized with model: llama3.3:latest
RAG chain ready for generation.


In [12]:
# ============================================
# STEP 4 — RAG-Based DMP Generation (Notebook-safe, no icons)
# Writes ONLY under: Output_7/run_*/ (from STEP 1)
# Reuses: df, dmp_template_text, rag_chain (from STEP 3)
# ============================================
import re
import pandas as pd
import pypandoc
from tqdm import tqdm
from pathlib import Path

# ---------- Ensure notebook output folders exist ----------
# Repeated from STEP 1 so this cell still works if the folders were removed in between.
for p in [NB_OUT_ROOT, RUN_DIR, NB_MD_DIR, NB_DOCX_DIR]:
    Path(p).mkdir(parents=True, exist_ok=True)

# ---------- Output log (inside the same run folder) ----------
OUTPUT_LOG = RUN_DIR / "rag_generated_dmp_log.csv"

# ---------- Helper functions ----------
def sanitize_filename(name: str, max_len: int = 140) -> str:
    """Return a Windows-safe filename stem derived from ``name``.

    Illegal characters become ``_``, whitespace runs collapse to one space,
    and the result is cut to ``max_len`` characters (trailing whitespace
    removed). An empty result falls back to ``"untitled"``.
    """
    illegal_chars = re.compile(r'[\\/*?:"<>|]')
    whitespace_run = re.compile(r"\s+")

    cleaned = illegal_chars.sub("_", str(name).strip())
    cleaned = whitespace_run.sub(" ", cleaned).strip()
    if len(cleaned) > max_len:
        cleaned = cleaned[:max_len].rstrip()
    return cleaned if cleaned else "untitled"

def save_md(folderpath: Path, filename: str, text: str) -> Path:
    """Write ``text`` as UTF-8 to ``folderpath / filename`` and return the path.

    ``folderpath`` must be a ``Path``; it is created (with parents) if needed.
    """
    folderpath.mkdir(parents=True, exist_ok=True)
    destination = folderpath / filename
    destination.write_text(text, encoding="utf-8")
    print("Saved:", destination)
    return destination

def md_to_docx(md_filepath: Path, docx_folder: Path, docx_filename: str) -> Path:
    """Convert ``md_filepath`` to DOCX with pypandoc and return the DOCX path.

    ``docx_folder`` must be a ``Path``; it is created (with parents) if needed.
    """
    docx_folder.mkdir(parents=True, exist_ok=True)
    destination = docx_folder / docx_filename
    pypandoc.convert_file(
        str(md_filepath),
        "docx",
        outputfile=str(destination),
    )
    print("Converted:", destination)
    return destination

# ---------- Guards ----------
# Cells can run out of order in a notebook; verify the objects from STEP 3 exist.
if "rag_chain" not in globals():
    raise RuntimeError("rag_chain not found. Run STEP 2 (retriever) + STEP 3 (rag_chain) first.")
if "df" not in globals():
    raise RuntimeError("df not found. Run STEP 3 (Excel load) first.")
if "dmp_template_text" not in globals():
    raise RuntimeError("dmp_template_text not found. Run STEP 3 (template load) first.")

# Show where this run writes its files before the long generation loop starts.
print("Notebook output folders:")
print("RUN_DIR    :", RUN_DIR)
print("MD folder  :", NB_MD_DIR)
print("DOCX folder:", NB_DOCX_DIR)
print("LOG CSV    :", OUTPUT_LOG)

# ---------- Main generation ----------
# One log record per processed title. The log is written in the `finally`
# clause below so that an interrupted run (rows are slow to generate) still
# leaves a CSV describing everything produced so far.
records = []

# The element* columns are identical for every row, so resolve them once.
element_cols = [c for c in df.columns if str(c).startswith("element")]

try:
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Generating NIH DMPs"):
        title = str(row.get("title", "")).strip()
        if not title:
            continue  # untitled rows cannot be named on disk

        print("\nGenerating DMP for:", title)

        # Build proposal details from the row (only non-empty element* columns)
        element_texts = []
        for col in element_cols:
            val = str(row.get(col, "")).strip()
            if val:
                element_texts.append(f"{col.upper()}: {val}")
        query_data = "\n".join(element_texts).strip()

        # Build question for rag_chain (retrieval happens inside the chain)
        question = f"""
Create a complete NIH Data Management and Sharing Plan (DMSP) for the project titled: "{title}".

Use the NIH DMSP Markdown template below and DO NOT change section titles.

Project background / proposal details:
{query_data}

NIH DMSP Markdown template:
{dmp_template_text}
""".strip()

        # Filled in step by step so a late failure (e.g. DOCX conversion) keeps
        # the preview and the path of the Markdown file that was already saved.
        record = {
            "Title": title,
            "MD_Path": "",
            "DOCX_Path": "",
            "Question_Preview": question[:1000],
            "Generated_DMP_Preview": "",
            "Error": "",
        }

        try:
            response = rag_chain.invoke(question)
            record["Generated_DMP_Preview"] = response[:1000]

            safe_title = sanitize_filename(title)
            md_filename = f"{safe_title}.md"
            docx_filename = f"{safe_title}.docx"

            md_path = save_md(NB_MD_DIR, md_filename, response)
            record["MD_Path"] = str(md_path)

            docx_path = md_to_docx(md_path, NB_DOCX_DIR, docx_filename)
            record["DOCX_Path"] = str(docx_path)

        except Exception as e:
            # Best effort per row: log the failure and continue with the next title.
            print("Error generating DMP for:", title, "|", str(e))
            record["Error"] = str(e)

        records.append(record)

    print("\nFinished processing all rows.")

finally:
    # ---------- Save log ----------
    # Runs on normal completion and on KeyboardInterrupt / unexpected errors.
    pd.DataFrame(records).to_csv(OUTPUT_LOG, index=False, encoding="utf-8")
    print("CSV log saved to:", OUTPUT_LOG)

Notebook output folders:
RUN_DIR    : c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260225_140527
MD folder  : c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260225_140527\md
DOCX folder: c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260225_140527\docx
LOG CSV    : c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260225_140527\rag_generated_dmp_log.csv


Generating NIH DMPs:   0%|          | 0/26 [00:00<?, ?it/s]


Generating DMP for: Clinical and MRI data from human research participants
Saved: c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260225_140527\md\Clinical and MRI data from human research participants.md


Generating NIH DMPs:   4%|▍         | 1/26 [12:53<5:22:23, 773.76s/it]

Converted: c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260225_140527\docx\Clinical and MRI data from human research participants.docx

Generating DMP for: Genomic data from human research participants


Generating NIH DMPs:   8%|▊         | 2/26 [25:14<5:01:41, 754.23s/it]

Saved: c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260225_140527\md\Genomic data from human research participants.md
Converted: c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260225_140527\docx\Genomic data from human research participants.docx

Generating DMP for: Genomic data from a non-human source


Generating NIH DMPs:  12%|█▏        | 3/26 [34:41<4:16:27, 669.02s/it]

Saved: c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260225_140527\md\Genomic data from a non-human source.md
Converted: c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260225_140527\docx\Genomic data from a non-human source.docx

Generating DMP for: Secondary data analysis


Generating NIH DMPs:  15%|█▌        | 4/26 [45:56<4:06:03, 671.05s/it]

Saved: c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260225_140527\md\Secondary data analysis.md
Converted: c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260225_140527\docx\Secondary data analysis.docx

Generating DMP for: Human clinical and genomics data


Generating NIH DMPs:  19%|█▉        | 5/26 [1:00:05<4:17:25, 735.51s/it]

Saved: c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260225_140527\md\Human clinical and genomics data.md
Converted: c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260225_140527\docx\Human clinical and genomics data.docx

Generating DMP for: Gene expression analysis data from non-human model organism (zebrafish)


Generating NIH DMPs:  19%|█▉        | 5/26 [1:06:39<4:39:56, 799.81s/it]


KeyboardInterrupt: 

In [None]:
# ============================================
# STEP 5 — Full DMP Comparison: Markdown (Generated) vs PDF (Gold, Fuzzy Matching)
# Notebook-safe; writes results ONLY under your current RUN_DIR
# Uses:
#   - NB_MD_DIR (generated .md from STEP 4)
#   - RUN_DIR   (evaluation output folder)
# You set GOLD_DIR manually (PDF gold folder)
# ============================================
import re
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
from difflib import SequenceMatcher

from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer

# ------------------------------------------------------------
# Paths (Notebook-safe)
# ------------------------------------------------------------
# 1) Generated markdown comes from the current run folder (STEP 4)
GENERATED_DIR = Path(NB_MD_DIR)

# 2) Gold PDFs: set your correct folder here
#    Examples you can try (uncomment the one that matches your repo):
# GOLD_DIR = ROOT_DIR / "data" / "inputs" / "gold_dmps"
# GOLD_DIR = ROOT_DIR / "data" / "gold_dmps"
GOLD_DIR = ROOT_DIR / "data" / "inputs" / "gold_dmps"

# 3) Evaluation output: store under this run
EVAL_DIR = Path(RUN_DIR) / "evaluation_results"
EVAL_DIR.mkdir(parents=True, exist_ok=True)

print("Gold PDF folder     :", GOLD_DIR,   "| exists=", GOLD_DIR.exists())
print("Generated MD folder :", GENERATED_DIR, "| exists=", GENERATED_DIR.exists())
print("Evaluation output   :", EVAL_DIR)

# Count inputs up front; a missing folder counts as zero files so the checks
# below produce one clear error instead of a glob failure.
gold_pdf_count = len(list(GOLD_DIR.glob("*.pdf"))) if GOLD_DIR.exists() else 0
gen_md_count   = len(list(GENERATED_DIR.glob("*.md"))) if GENERATED_DIR.exists() else 0
print("Found generated .md files:", gen_md_count)
print("Found gold .pdf files    :", gold_pdf_count)

if gen_md_count == 0:
    raise FileNotFoundError(f"No generated Markdown files found in: {GENERATED_DIR}")
if gold_pdf_count == 0:
    raise FileNotFoundError(f"No gold PDF files found in: {GOLD_DIR}")

# ------------------------------------------------------------
# Models
# ------------------------------------------------------------
print("Loading models...")
# NOTE: the evaluation model name is hard-coded here and is independent of the
# EMBED_MODEL / HF cache settings read from the YAML in STEP 1.
sbert = SentenceTransformer("all-MiniLM-L6-v2")
# `rouge` is read as a module-level global by compare_chunked below.
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
print("Models ready.")

# ------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------
def normalize_name(name: str) -> str:
    """Lower-case ``name`` and reduce it to space-separated ``a-z0-9`` words."""
    lowered = name.lower()
    substitutions = (
        (r"[^a-z0-9\s]", " "),  # punctuation and symbols become spaces
        (r"\s+", " "),          # then whitespace runs collapse to one space
    )
    for pattern, replacement in substitutions:
        lowered = re.sub(pattern, replacement, lowered)
    return lowered.strip()

def clean_text(text: str) -> str:
    """Strip LLM ``<think>`` blocks and basic Markdown markup, then squeeze whitespace.

    The substitutions run in a fixed order: think-blocks, heading markers,
    emphasis asterisks, whitespace runs.
    """
    steps = (
        (r"<think>.*?</think>", "", re.DOTALL),  # special tags some LLMs produce
        (r"#+\s*", "", 0),                       # heading markers
        (r"\*\*|\*", "", 0),                     # bold / italic asterisks
        (r"\s+", " ", 0),                        # collapse whitespace
    )
    cleaned = text
    for pattern, replacement, flags in steps:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()

def extract_text_from_pdf(pdf_path: Path) -> str:
    """Return the cleaned plain text of every page in ``pdf_path``.

    Read errors are printed rather than raised; whatever text was collected
    before the error (possibly none) is still cleaned and returned.
    """
    page_texts = []
    try:
        with fitz.open(pdf_path) as document:
            for page in document:
                page_texts.append(page.get_text("text") + "\n")
    except Exception as e:
        print("Error reading PDF:", pdf_path.name, "|", str(e))
    return clean_text("".join(page_texts))

def chunk_text(text: str, size: int = 300) -> list[str]:
    """Split ``text`` into consecutive chunks of at most ``size`` whitespace-separated words."""
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), size):
        pieces.append(" ".join(tokens[start:start + size]))
    return pieces

def compare_chunked(gold_text: str, gen_text: str, model) -> tuple[float, float]:
    """Score how well the generated text covers the gold text, chunk by chunk.

    Both texts are split with ``chunk_text``. For every gold chunk the best
    (maximum) score over all generated chunks is taken, and those per-chunk
    maxima are averaged.

    Args:
        gold_text: Reference text.
        gen_text: Generated text.
        model: Object with a SentenceTransformer-style ``encode`` method.

    Returns:
        ``(mean best cosine similarity, mean best ROUGE-L recall)``.
        ``(0.0, 0.0)`` when either text contains no words.

    Uses the module-level ``rouge`` scorer and ``util.cos_sim``.
    """
    # A text without words yields no chunks. With no gold chunks np.mean([])
    # would return nan (plus a RuntimeWarning); with no generated chunks every
    # score is 0.0 anyway — so answer directly and skip the encoding work.
    if not gold_text.split() or not gen_text.split():
        return 0.0, 0.0

    gold_chunks = chunk_text(gold_text)
    gen_chunks = chunk_text(gen_text)

    # Embed each side in one batched call instead of one call per gold chunk.
    gold_embs = model.encode(gold_chunks, convert_to_tensor=True)
    gen_embs = model.encode(gen_chunks, convert_to_tensor=True)

    # Rows = gold chunks, columns = generated chunks; keep the best column per row.
    sims = util.cos_sim(gold_embs, gen_embs).cpu().numpy()
    sbert_scores = sims.max(axis=1)

    # ROUGE-L recall with the gold chunk as the reference, best generated chunk per gold chunk.
    rouge_scores = [
        max(rouge.score(gold_chunk, gen_chunk)["rougeL"].recall for gen_chunk in gen_chunks)
        for gold_chunk in gold_chunks
    ]

    return float(np.mean(sbert_scores)), float(np.mean(rouge_scores))

def best_fuzzy_match(target: str, gold_names: list[str], threshold: float = 0.6) -> tuple[str | None, float]:
    best_match, best_score = None, 0.0
    for g in gold_names:
        score = SequenceMatcher(None, target, g).ratio()
        if score > best_score:
            best_match, best_score = g, score
    return (best_match, best_score) if best_score >= threshold else (None, best_score)

# ------------------------------------------------------------
# Collect files
# ------------------------------------------------------------
# Index both sides by normalised file stem.
# NOTE: files whose stems normalise to the same key overwrite each other here.
gold_files = {normalize_name(f.stem): f for f in GOLD_DIR.glob("*.pdf")}
gen_files  = {normalize_name(f.stem): f for f in GENERATED_DIR.glob("*.md")}
print("Indexed generated DMPs:", len(gen_files))
print("Indexed gold PDFs     :", len(gold_files))

# ------------------------------------------------------------
# Compare
# ------------------------------------------------------------
results = []
gold_keys = list(gold_files.keys())

for name, gen_path in tqdm(gen_files.items(), desc="Matching & Comparing DMPs"):
    # Pair each generated file with the most similar gold file name.
    # NOTE(review): unmatched files are skipped without any message, and
    # several generated files may map to the same gold PDF.
    best_match, match_score = best_fuzzy_match(name, gold_keys, threshold=0.6)
    if not best_match:
        continue

    gold_path = gold_files[best_match]

    gold_text = extract_text_from_pdf(gold_path)
    gen_text  = clean_text(gen_path.read_text(encoding="utf-8", errors="ignore"))

    # Nothing to score when either side has no text (e.g. an unreadable PDF).
    if not gold_text or not gen_text:
        continue

    sbert_sim, rouge_l = compare_chunked(gold_text, gen_text, sbert)

    results.append({
        "Generated_File": gen_path.name,
        "Matched_Gold_PDF": gold_path.name,
        "Match_Score": round(match_score, 3),
        "SBERT_Similarity": round(sbert_sim, 4),
        "ROUGE_L_Recall": round(rouge_l, 4),
    })

df_results = pd.DataFrame(results)

out_path = EVAL_DIR / "full_dmp_pdf_comparison_fuzzy.csv"
df_results.to_csv(out_path, index=False, encoding="utf-8")

print("\nResults saved to:", out_path)
print("Total matched DMP pairs:", len(df_results))

Gold PDF folder     : C:\Users\Nahid\dmpchef\data\inputs\gold_dmps | exists= True
Generated MD folder : c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260220_095540\md | exists= True
Evaluation output   : c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260220_095540\evaluation_results
Found generated .md files: 26
Found gold .pdf files    : 26
Loading models...


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1634.20it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Models ready.
Indexed generated DMPs: 26
Indexed gold PDFs     : 26


Matching & Comparing DMPs: 100%|██████████| 26/26 [00:05<00:00,  4.65it/s]


Results saved to: c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260220_095540\evaluation_results\full_dmp_pdf_comparison_fuzzy.csv
Total matched DMP pairs: 26





In [None]:
# ============================================
# STEP 6 — Element-Level Comparison with Gold Standard 
# Compares: generated Markdown sections (NB_MD_DIR) vs gold Excel element fields
# Writes:   RUN_DIR/evaluation_results/element_similarity_exact_titles.csv
# ============================================
import re
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer

# ------------------------------------------------------------
# Paths (Notebook-safe; reuse STEP 1 variables)
# ------------------------------------------------------------
# Generated markdown from current run
GENERATED_DIR = Path(NB_MD_DIR)

# Gold Excel: your reference file (same Excel you used to generate)
# NOTE(review): STEP 4 feeds this file's element* columns into the generation
# prompt, so similarity against the same fields may be inflated — confirm this
# is the intended evaluation design.
GOLD_PATH = Path(EXCEL_PATH)  # from STEP 1

# Evaluation output goes inside this run folder
EVAL_DIR = Path(RUN_DIR) / "evaluation_results"
EVAL_DIR.mkdir(parents=True, exist_ok=True)

print("Gold Excel:", GOLD_PATH, "| exists=", GOLD_PATH.exists())
print("Generated MD folder:", GENERATED_DIR, "| exists=", GENERATED_DIR.exists())
print("Eval output folder:", EVAL_DIR)

if not GOLD_PATH.exists():
    raise FileNotFoundError(f"Gold Excel not found: {GOLD_PATH}")
if not GENERATED_DIR.exists():
    raise FileNotFoundError(f"Generated Markdown folder not found: {GENERATED_DIR}")

# Sorted so files are processed in a stable order.
md_files = sorted(GENERATED_DIR.glob("*.md"))
print("Found generated Markdown files:", len(md_files))
if not md_files:
    raise FileNotFoundError(f"No .md files found in: {GENERATED_DIR}")

# ------------------------------------------------------------
# Load gold reference (Excel)
# ------------------------------------------------------------
df_gold = pd.read_excel(GOLD_PATH)
# Same header normalisation as STEP 3 (trimmed, lower-case).
df_gold.columns = df_gold.columns.str.strip().str.lower()
# Empty cells become "" and every value is cast to str for text comparison.
df_gold = df_gold.fillna("").astype(str)

def normalize_title(name: str) -> str:
    """Return ``name`` as lower-case, space-separated ``a-z0-9`` words.

    The value is converted with ``str()`` first, so non-string cells are accepted.
    """
    normalized = str(name).lower()
    substitutions = (
        (r"[^a-z0-9\s]", " "),  # anything but letters, digits, whitespace
        (r"\s+", " "),          # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()

# The title column is the join key between generated files and gold rows.
if "title" not in df_gold.columns:
    raise KeyError("Gold Excel must contain a 'title' column.")

df_gold["title_norm"] = df_gold["title"].apply(normalize_title)

# Expected gold element columns; only those present in the sheet are kept.
gold_elements = [
    "element_1a","element_1b","element_1c",
    "element_2","element_3",
    "element_4a","element_4b","element_4c",
    "element_5a","element_5b","element_5c",
    "element_6"
]
gold_elements = [c for c in gold_elements if c in df_gold.columns]

print("Loaded gold projects:", len(df_gold))
print("Gold element columns used:", gold_elements)

# ------------------------------------------------------------
# Models
# ------------------------------------------------------------
print("Loading evaluation models...")
# NOTE: model name is hard-coded; it does not follow EMBED_MODEL from the YAML.
sbert = SentenceTransformer("all-MiniLM-L6-v2")
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
print("Models ready.")

# ------------------------------------------------------------
# Markdown parsing helpers
# ------------------------------------------------------------
def is_title(line: str) -> bool:
    """Tell whether ``line`` is a section heading.

    A heading is either a Markdown header (starts with ``#`` after trimming)
    or a line consisting solely of bold text, optionally preceded by a
    number, e.g. ``1. **Data Types**``.
    """
    stripped = line.strip()
    if stripped.startswith("#"):
        return True
    return re.match(r"^\s*\d*\.?\s*\*\*.*\*\*\s*$", stripped) is not None

def strip_llm_noise(text: str) -> str:
    """Remove ``<think>...</think>`` blocks (which may span lines) from ``text``."""
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)

def extract_sections(md_path: Path) -> pd.DataFrame:
    """Split a generated Markdown file into (heading, body) sections.

    The file is read as UTF-8 (undecodable bytes are ignored) and passed
    through ``strip_llm_noise``. Each line accepted by ``is_title`` opens a
    new section; the lines up to the next heading form its body. Text before
    the first heading is discarded, and so is any section whose body has no
    non-blank line.

    Returns a DataFrame with one row per kept section and the columns
    ``"Section Title"`` and ``"Generated Content"`` (both stripped). When no
    section qualifies the DataFrame is empty.
    """
    cleaned = strip_llm_noise(md_path.read_text(encoding="utf-8", errors="ignore"))

    sections = []

    def keep_section(heading, body_lines):
        # Record the section only when it has a heading and non-blank content.
        if heading and any(piece.strip() for piece in body_lines):
            sections.append({
                "Section Title": heading.strip(),
                "Generated Content": "\n".join(body_lines).strip()
            })

    heading, body_lines = None, []
    for line in cleaned.splitlines():
        if not is_title(line):
            body_lines.append(line)
            continue
        # A new heading closes the section collected so far.
        keep_section(heading, body_lines)
        heading, body_lines = line, []

    # The last section has no following heading to close it.
    keep_section(heading, body_lines)

    return pd.DataFrame(sections)

# ------------------------------------------------------------
# Compare (exact normalized title match)
# ------------------------------------------------------------
results = []

# Fixed output schema. Passing it to the DataFrame constructor means an empty
# result set is still saved with a header row instead of a column-less file.
RESULT_COLUMNS = [
    "Gold Project",
    "Gold Element",
    "Generated File",
    "Best Generated Section Title",
    "SBERT_Similarity",
    "ROUGE_L_Recall",
]

# Generated files that contributed no rows, as (file name, reason) pairs.
# They are reported after the loop so the progress bar stays intact.
skipped_files = []

# Pre-index gold rows by normalized title for fast lookup.
# A dict keeps only the last row for a repeated key, so warn when that happens.
gold_by_title = {r["title_norm"]: r for _, r in df_gold.iterrows()}
n_title_collisions = len(df_gold) - len(gold_by_title)
if n_title_collisions:
    print(
        f"WARNING: {n_title_collisions} gold row(s) share a normalized title; "
        "only the last row per title is used."
    )

for md_file in tqdm(md_files, desc="Comparing element-level"):
    gen_title_raw = md_file.stem
    gen_title_norm = normalize_title(gen_title_raw)

    # Pair the file with the gold row whose normalized title is identical.
    gold_row = gold_by_title.get(gen_title_norm)
    if gold_row is None:
        skipped_files.append((md_file.name, "no gold row with a matching normalized title"))
        continue

    gold_title = gold_row["title"]

    # Collect non-empty gold element texts
    gold_texts = {e: str(gold_row.get(e, "")).strip() for e in gold_elements}
    gold_texts = {k: v for k, v in gold_texts.items() if v}

    if not gold_texts:
        skipped_files.append((md_file.name, "gold row has no non-empty element text"))
        continue

    # Extract generated sections
    gen_df = extract_sections(md_file)
    if gen_df.empty:
        skipped_files.append((md_file.name, "no sections extracted from the Markdown"))
        continue

    # Clean sections and drop empties
    gen_df["Generated Content"] = gen_df["Generated Content"].astype(str).str.strip()
    gen_df = gen_df[gen_df["Generated Content"].str.len() > 0]
    if gen_df.empty:
        skipped_files.append((md_file.name, "all extracted sections are empty"))
        continue

    # Embed all generated sections once (speedup)
    section_titles = gen_df["Section Title"].tolist()
    section_texts = gen_df["Generated Content"].tolist()
    section_embs = sbert.encode(section_texts, convert_to_tensor=True)

    for element, gold_text in gold_texts.items():
        emb_gold = sbert.encode(gold_text, convert_to_tensor=True)

        # One cosine similarity per generated section; keep the best-scoring one.
        sims = util.cos_sim(emb_gold, section_embs)[0].cpu().numpy()
        best_idx = int(np.argmax(sims))
        best_sbert = float(sims[best_idx])

        best_section_title = section_titles[best_idx]
        best_section_text = section_texts[best_idx]
        # ROUGE-L recall, scored with the gold text first and the generated section second.
        best_rouge = float(rouge.score(gold_text, best_section_text)["rougeL"].recall)

        results.append({
            "Gold Project": gold_title,
            "Gold Element": element,
            "Generated File": md_file.name,
            "Best Generated Section Title": best_section_title,
            "SBERT_Similarity": round(best_sbert, 4),
            "ROUGE_L_Recall": round(best_rouge, 4),
        })

# Make dropped files visible; silently excluding them would bias the summaries.
if skipped_files:
    print(f"\nSkipped {len(skipped_files)} of {len(md_files)} generated files:")
    for skipped_name, skipped_reason in skipped_files:
        print(f"  - {skipped_name}: {skipped_reason}")

# ------------------------------------------------------------
# Save
# ------------------------------------------------------------
df_results = pd.DataFrame(results, columns=RESULT_COLUMNS)
out_path = EVAL_DIR / "element_similarity_exact_titles.csv"
df_results.to_csv(out_path, index=False, encoding="utf-8")

print("\nElement-level similarity saved to:", out_path)
print("Total element–section best matches:", len(df_results))

Gold Excel: C:\Users\Nahid\dmpchef\data\inputs\inputs.xlsx | exists= True
Generated MD folder: c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260220_095540\md | exists= True
Eval output folder: c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260220_095540\evaluation_results
Found generated Markdown files: 26
Loaded gold projects: 26
Gold element columns used: ['element_1a', 'element_1b', 'element_1c', 'element_2', 'element_3', 'element_4a', 'element_4b', 'element_4c', 'element_5a', 'element_5b', 'element_5c', 'element_6']
Loading evaluation models...


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1855.37it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Models ready.


Comparing element-level: 100%|██████████| 26/26 [00:01<00:00, 16.64it/s]


Element-level similarity saved to: c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260220_095540\evaluation_results\element_similarity_exact_titles.csv
Total element–section best matches: 312





In [None]:
# ============================================
# STEP 7 — Summarize Evaluation Results 
# Reads results from:
#   RUN_DIR/evaluation_results/full_dmp_pdf_comparison_fuzzy.csv
#   RUN_DIR/evaluation_results/element_similarity_exact_titles.csv
# Writes summaries to the same folder.
# ============================================
import pandas as pd
from pathlib import Path

# ------------------------------------------------------------
# Notebook-safe eval directory (from STEP 1)
# ------------------------------------------------------------
EVAL_DIR = Path(RUN_DIR) / "evaluation_results"
EVAL_DIR.mkdir(parents=True, exist_ok=True)
print("EVAL_DIR:", EVAL_DIR)

# ------------------------------------------------------------
# Load CSVs (fail loudly if missing)
# ------------------------------------------------------------
full_path = EVAL_DIR / "full_dmp_pdf_comparison_fuzzy.csv"
elem_path = EVAL_DIR / "element_similarity_exact_titles.csv"

if not full_path.exists():
    raise FileNotFoundError(f"Missing: {full_path} (Run STEP 5 first)")
if not elem_path.exists():
    raise FileNotFoundError(f"Missing: {elem_path} (Run STEP 6 first)")

FULL_EMPTY_MSG = "full_dmp_pdf_comparison_fuzzy.csv is empty (STEP 5 matched 0 pairs)."
ELEM_EMPTY_MSG = "element_similarity_exact_titles.csv is empty (STEP 6 matched 0 pairs)."


def _read_results_csv(csv_path, empty_msg):
    """Read a results CSV, reporting a file without a header row as empty.

    ``pd.read_csv`` raises ``EmptyDataError`` for a file that has no columns
    to parse (for example a zero-byte file), which would bypass the ``.empty``
    checks below. That case is re-raised as ``ValueError(empty_msg)`` so both
    kinds of "no results" end with the same message.
    """
    try:
        return pd.read_csv(csv_path)
    except pd.errors.EmptyDataError as err:
        raise ValueError(empty_msg) from err


df_full = _read_results_csv(full_path, FULL_EMPTY_MSG)
df_elem = _read_results_csv(elem_path, ELEM_EMPTY_MSG)

print("Loaded full-document rows:", len(df_full))
print("Loaded element-level rows:", len(df_elem))

# A header row with no data rows parses fine but still means nothing matched.
if df_full.empty:
    raise ValueError(FULL_EMPTY_MSG)
if df_elem.empty:
    raise ValueError(ELEM_EMPTY_MSG)

# ============================================================
# 1) FULL-DOCUMENT LEVEL SUMMARY (Mean by Generated_File)
# ============================================================
# Group by the generated-file column; fall back to the first column when the
# expected name is absent.
project_col = "Generated_File" if "Generated_File" in df_full.columns else df_full.columns[0]

# Score columns are located by name: the first column whose name contains
# "sbert" / "rouge" (case-insensitive).
sbert_col = next((c for c in df_full.columns if "sbert" in c.lower()), None)
rouge_col = next((c for c in df_full.columns if "rouge" in c.lower()), None)

if not sbert_col or not rouge_col:
    raise ValueError(f"Could not find SBERT/ROUGE columns in: {df_full.columns.tolist()}")

# Per-file means at full precision.
per_file_means = df_full.groupby(project_col)[[sbert_col, rouge_col]].mean()

# Overall mean (across projects), taken from the unrounded per-file means so
# the 4-decimal rounding used for display does not leak into this figure.
overall_full = {
    "SBERT_overall_mean": float(per_file_means[sbert_col].mean()),
    "ROUGE_overall_mean": float(per_file_means[rouge_col].mean()),
}

df_full_summary = per_file_means.reset_index()

# Rounded copies used only for the displayed/saved table.
df_full_summary["SBERT_mean"] = df_full_summary[sbert_col].round(4)
df_full_summary["ROUGE_mean"] = df_full_summary[rouge_col].round(4)

df_full_table = df_full_summary[[project_col, "SBERT_mean", "ROUGE_mean"]].rename(
    columns={project_col: "Generated_File"}
)

print("\nFull-document summary (mean by Generated_File):")
display(df_full_table)

print("\nOverall full-document means:")
print(overall_full)

# ============================================================
# 2) ELEMENT-LEVEL SUMMARY (Mean ± SD)
# ============================================================
# Columns are located by name: the first column whose name contains
# "element" / "sbert" / "rouge" (case-insensitive).
elem_col = next((c for c in df_elem.columns if "element" in c.lower()), None)
if not elem_col:
    raise ValueError(f"Could not find element column in: {df_elem.columns.tolist()}")

sbert_col_e = next((c for c in df_elem.columns if "sbert" in c.lower()), None)
rouge_col_e = next((c for c in df_elem.columns if "rouge" in c.lower()), None)

if not sbert_col_e or not rouge_col_e:
    raise ValueError(f"Could not find SBERT/ROUGE columns in: {df_elem.columns.tolist()}")

# Mean and standard deviation of both scores per element.
df_elem_summary = (
    df_elem.groupby(elem_col)[[sbert_col_e, rouge_col_e]]
    .agg(["mean", "std"])
    .reset_index()
)

# Flatten the (score, statistic) column pairs; the order follows the column
# selection above: SBERT mean/std, then ROUGE mean/std.
stat_cols = ["SBERT_mean", "SBERT_sd", "ROUGE_mean", "ROUGE_sd"]
df_elem_summary.columns = [elem_col, *stat_cols]
df_elem_summary[stat_cols] = df_elem_summary[stat_cols].astype(float)


def _format_mean_sd(mean: float, sd: float) -> str:
    """Render a mean/sd pair as ``"0.84 ± 0.16"`` (two decimals each)."""
    return f"{mean:.2f} ± {sd:.2f}"


# Add formatted strings (mean ± sd). They are built from the full-precision
# statistics, before the 4-decimal rounding below, because rounding twice
# (to 4 and then to 2 decimals) can change the second decimal.
# NOTE: an element with a single row has an undefined sd and renders as "nan".
df_elem_summary["SBERT_mean_sd"] = [
    _format_mean_sd(mean, sd)
    for mean, sd in zip(df_elem_summary["SBERT_mean"], df_elem_summary["SBERT_sd"])
]
df_elem_summary["ROUGE_mean_sd"] = [
    _format_mean_sd(mean, sd)
    for mean, sd in zip(df_elem_summary["ROUGE_mean"], df_elem_summary["ROUGE_sd"])
]

# Round the numeric columns for readability
df_elem_summary[stat_cols] = df_elem_summary[stat_cols].round(4)

df_elem_table = df_elem_summary[[elem_col, "SBERT_mean_sd", "ROUGE_mean_sd"]].rename(
    columns={elem_col: "Element"}
)

print("\nElement-level summary (mean ± sd):")
display(df_elem_table)

# ============================================================
# Save tables
# ============================================================
# Destination files for the two formatted tables, next to the input CSVs.
out_full = EVAL_DIR / "summary_full_table_mean_only.csv"
out_elem = EVAL_DIR / "summary_element_table_mean_sd.csv"

# Write both tables without the index column, then list where they went.
for summary_table, destination in ((df_full_table, out_full), (df_elem_table, out_elem)):
    summary_table.to_csv(destination, index=False, encoding="utf-8")

print("\nSaved formatted tables:")
print(out_full, out_elem, sep="\n")

EVAL_DIR: c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260220_095540\evaluation_results
Loaded full-document rows: 26
Loaded element-level rows: 312

Full-document summary (mean by Generated_File):


Unnamed: 0,Generated_File,SBERT_mean,ROUGE_mean
0,Analysis of social media posts.md,0.8082,0.4339
1,Basic Research from a Non-Human Source Example.md,0.7808,0.4705
2,Clinical Data from Human Research Participants.md,0.6758,0.252
3,Clinical and MRI data from human research part...,0.7154,0.2653
4,Clinical data (human biospecimens).md,0.7622,0.4827
5,Clinical data from human research participants...,0.7615,0.4439
6,Drug discovery including intellectual property.md,0.7997,0.4646
7,Gene expression analysis data from non-human m...,0.814,0.5315
8,Genomic data from a non-human source.md,0.7162,0.2764
9,Genomic data from human research participants.md,0.7241,0.2671



Overall full-document means:
{'SBERT_overall_mean': 0.7444884615384617, 'ROUGE_overall_mean': 0.41528076923076923}

Element-level summary (mean ± sd):


Unnamed: 0,Element,SBERT_mean_sd,ROUGE_mean_sd
0,element_1a,0.84 ± 0.16,0.56 ± 0.37
1,element_1b,0.77 ± 0.15,0.50 ± 0.32
2,element_1c,0.84 ± 0.12,0.63 ± 0.34
3,element_2,0.85 ± 0.11,0.58 ± 0.31
4,element_3,0.83 ± 0.11,0.54 ± 0.32
5,element_4a,0.82 ± 0.10,0.58 ± 0.29
6,element_4b,0.86 ± 0.13,0.63 ± 0.30
7,element_4c,0.88 ± 0.10,0.62 ± 0.28
8,element_5a,0.84 ± 0.15,0.61 ± 0.32
9,element_5b,0.82 ± 0.14,0.57 ± 0.32



Saved formatted tables:
c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260220_095540\evaluation_results\summary_full_table_mean_only.csv
c:\Users\Nahid\dmpchef\notebook_DMP_RAG\Output_7_NIH_all\run_20260220_095540\evaluation_results\summary_element_table_mean_sd.csv
