In [3]:
print ("Start 10/08/2025")

Start 10/08/2025


In [4]:
# ============================================
# STEP 1 — Imports, Config, and Helpers
# ============================================
import os, re, time
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import pypandoc  # for Markdown → DOCX

# --- LangChain Core ---
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

load_dotenv()

# ---------- Paths (works in notebook or script) ----------
try:
    ROOT_DIR = Path(__file__).resolve().parents[1]  # when running a .py script
except NameError:
    ROOT_DIR = Path.cwd().parent                     # when running inside Jupyter

# --- Data folders ---
DATA_PDFS   = ROOT_DIR / "data" / "general_web_ingestion"
INDEX_DIR   = ROOT_DIR / "data" / "faiss_index"
EXCEL_PATH  = ROOT_DIR / "data" / "inputs" / "inputs.xlsx"
TEMPLATE_MD = ROOT_DIR / "data" / "inputs" / "dmp-template.md"

# --- Output folders ---
OUTPUT_MD   = ROOT_DIR / "data" / "outputs" / "markdown"
OUTPUT_DOCX = ROOT_DIR / "data" / "outputs" / "docx"

# --- Models / parameters ---
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL   = "llama3.3"
TOP_K       = 6

# ---------- Helper functions ----------
def create_folder(folderpath):
    Path(folderpath).mkdir(parents=True, exist_ok=True)

def save_md(folderpath, filename, text):
    create_folder(folderpath)
    (Path(folderpath) / filename).write_text(text, encoding="utf-8")
    print("💾 Saved:", Path(folderpath) / filename)

def md_to_docs(md_filepath, docx_folderpath, docx_filename):
    create_folder(docx_folderpath)
    pypandoc.convert_file(
        str(md_filepath), "docx",
        outputfile=str(Path(docx_folderpath) / docx_filename)
    )
    print("📄 Converted:", Path(docx_folderpath) / docx_filename)

def clean_filename(name: str) -> str:
    """Remove illegal characters from filenames (Windows-safe)."""
    return re.sub(r'[\\/*?:"<>|]', "_", str(name)).strip()

# ---------- Ensure required folders exist ----------
for p in [DATA_PDFS, INDEX_DIR, OUTPUT_MD, OUTPUT_DOCX]:
    p.mkdir(parents=True, exist_ok=True)

# ---------- Sanity print ----------
print("✅ STEP 1 ready")
print(f"ROOT_DIR   : {ROOT_DIR}")
print(f"DATA_PDFS  : {DATA_PDFS}")
print(f"INDEX_DIR  : {INDEX_DIR}")
print(f"EXCEL_PATH : {EXCEL_PATH}")
print(f"TEMPLATE_MD: {TEMPLATE_MD}")
print(f"OUTPUT_MD  : {OUTPUT_MD}")
print(f"OUTPUT_DOCX: {OUTPUT_DOCX}")
print(f"EMBED_MODEL: {EMBED_MODEL} | LLM_MODEL: {LLM_MODEL} | TOP_K: {TOP_K}")


✅ STEP 1 ready
ROOT_DIR   : c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline
DATA_PDFS  : c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\general_web_ingestion
INDEX_DIR  : c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\faiss_index
EXCEL_PATH : c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\inputs\inputs.xlsx
TEMPLATE_MD: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\inputs\dmp-template.md
OUTPUT_MD  : c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown
OUTPUT_DOCX: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx
EMBED_MODEL: sentence-transformers/all-MiniLM-L6-v2 | LLM_MODEL: llama3.3 | TOP_K: 6


In [5]:
# =========================================================
# STEP 2 — Load PDFs and TXT Files, Split into Text Chunks
# =========================================================

# --- Imports ---
from pathlib import Path
from tqdm import tqdm
import warnings
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Optional: Silence PDFMiner warnings ---
warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer")

# --------------------------------------------------------
# Function: Load PDFs and TXT files
# --------------------------------------------------------
def load_docs_from_folder(folder: Path):
    """
    Load all PDF and TXT files from a folder into LangChain Document objects.
    Skips unreadable or corrupted files gracefully.
    """
    if not folder.exists():
        raise FileNotFoundError(f"❌ Folder not found: {folder}")

    pdf_files = sorted(folder.glob("*.pdf"))
    txt_files = sorted(folder.glob("*.txt"))
    all_files = pdf_files + txt_files

    if not all_files:
        raise FileNotFoundError(f"⚠️ No PDF or TXT files found in {folder}")

    docs = []
    for fpath in tqdm(all_files, desc=f"📄 Loading files from {folder}"):
        try:
            if fpath.suffix.lower() == ".pdf":
                loader = PyPDFLoader(str(fpath))
            elif fpath.suffix.lower() == ".txt":
                loader = TextLoader(str(fpath), encoding="utf-8")
            else:
                print(f"⏭️ Skipped unsupported file: {fpath.name}")
                continue

            file_docs = loader.load()
            docs.extend(file_docs)

        except Exception as e:
            print(f"❌ Error loading {fpath.name}: {e}")

    print(f"\n✅ Loaded {len(docs)} pages from {len(all_files)} files in '{folder}'.")
    return docs


# --------------------------------------------------------
# Function: Split documents into text chunks
# --------------------------------------------------------
def split_into_chunks(docs, chunk_size=800, chunk_overlap=120):
    """
    Split LangChain Document objects into overlapping chunks
    for embeddings and retrieval-augmented generation (RAG).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = splitter.split_documents(docs)
    print(f"🧩 Created {len(chunks)} chunks from {len(docs)} document pages.")
    return chunks


# --------------------------------------------------------
# Example Run


# Load and split
raw_docs = load_docs_from_folder(DATA_PDFS)
chunks = split_into_chunks(raw_docs)




📄 Loading files from c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\general_web_ingestion:   5%|▌         | 556/10445 [04:35<2:02:09,  1.35it/s]Ignoring wrong pointing object 2 65536 (offset 0)
Ignoring wrong pointing object 36 65536 (offset 0)
Ignoring wrong pointing object 51 65536 (offset 0)
Ignoring wrong pointing object 65 65536 (offset 0)
Ignoring wrong pointing object 68 65536 (offset 0)
Ignoring wrong pointing object 71 65536 (offset 0)
Ignoring wrong pointing object 74 65536 (offset 0)
Ignoring wrong pointing object 77 65536 (offset 0)
Ignoring wrong pointing object 80 65536 (offset 0)
Ignoring wrong pointing object 83 65536 (offset 0)
Ignoring wrong pointing object 86 65536 (offset 0)
Ignoring wrong pointing object 89 65536 (offset 0)
Ignoring wrong pointing object 92 65536 (offset 0)
Ignoring wrong pointing object 95 65536 (offset 0)
Ignoring wrong pointing object 98 65536 (offset 0)
Ignoring wrong pointing object 101 65536 (offset 0)
Ignoring wrong pointing object 107 65536 (o

❌ Error loading cancer-facts-and-figures-2021.pdf: cryptography>=3.1 is required for AES algorithm


📄 Loading files from c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\general_web_ingestion:   8%|▊         | 815/10445 [06:28<27:15,  5.89it/s]  parsing for Object Streams
parsing for Object Streams
📄 Loading files from c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\general_web_ingestion:   8%|▊         | 817/10445 [06:28<23:18,  6.88it/s]parsing for Object Streams
parsing for Object Streams
📄 Loading files from c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\general_web_ingestion:   8%|▊         | 819/10445 [06:28<20:49,  7.70it/s]parsing for Object Streams
parsing for Object Streams
📄 Loading files from c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\general_web_ingestion:   8%|▊         | 821/10445 [06:28<23:47,  6.74it/s]parsing for Object Streams
parsing for Object Streams
📄 Loading files from c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\general_web_ingestion:   8%|▊         | 823/10445 [06:29<20:28,  7.83it/s]parsing for Object Streams
📄 Loading files from c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\

❌ Error loading NIH-CADR-Implementation-Guidebook.pdf: cryptography>=3.1 is required for AES algorithm


📄 Loading files from c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\general_web_ingestion: 100%|██████████| 10445/10445 [18:12<00:00,  9.56it/s]  



✅ Loaded 52733 pages from 10445 files in 'c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\general_web_ingestion'.
🧩 Created 541409 chunks from 52733 document pages.


In [6]:
# ============================================
# STEP 3 — Build or Load FAISS Index
# ============================================
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import time

# --- Initialize embedding model ---
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

def build_or_load_faiss_index(index_dir=INDEX_DIR, chunks=None):
    """
    Builds a new FAISS index from text chunks if none exists,
    otherwise loads the saved one from disk.
    """
    faiss_path = index_dir / "index.faiss"
    pkl_path   = index_dir / "index.pkl"

    # --- If index exists, load it ---
    if faiss_path.exists() and pkl_path.exists():
        print("📦 Existing FAISS index found. Loading from disk...")
        vectorstore = FAISS.load_local(
            str(index_dir),
            embeddings,
            allow_dangerous_deserialization=True
        )
        print("✅ FAISS index loaded successfully.")
        return vectorstore

    # --- Otherwise, build new index ---
    if chunks is None or len(chunks) == 0:
        raise RuntimeError("❌ No chunks provided. Please run Step 2 first to load and split PDFs.")

    print("🧱 Building new FAISS index...")
    start_time = time.time()

    vectorstore = FAISS.from_documents(
        tqdm(chunks, desc="🔢 Embedding text chunks"),
        embeddings
    )

    # --- Save the index ---
    vectorstore.save_local(str(index_dir))
    duration = time.time() - start_time

    print(f"💾 Saved new FAISS index to {index_dir}")
    print(f"⏱️ Build completed in {duration/60:.2f} minutes ({duration:.1f} seconds)")
    return vectorstore


# --- Execute step ---
vectorstore = build_or_load_faiss_index(INDEX_DIR, chunks)
retriever   = vectorstore.as_retriever(search_kwargs={"k": TOP_K})
print(f"✅ Retriever ready (top_k={TOP_K})")


  embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)


🧱 Building new FAISS index...


🔢 Embedding text chunks: 100%|██████████| 541409/541409 [00:00<00:00, 3355087.49it/s]


💾 Saved new FAISS index to c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\faiss_index
⏱️ Build completed in 69.23 minutes (4153.8 seconds)
✅ Retriever ready (top_k=6)


In [12]:
# ============================================
# 🧩 STEP 4 — Load Excel, Template, and Build RAG Chain (Fixed)
# ============================================
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import Ollama
import pandas as pd

# --- Load Excel file ---
if not EXCEL_PATH.exists():
    raise FileNotFoundError(f"❌ Excel file not found: {EXCEL_PATH}")

df = pd.read_excel(EXCEL_PATH)
print(f"✅ Excel loaded successfully: {len(df)} rows")

# --- Load Markdown Template ---
if not TEMPLATE_MD.exists():
    raise FileNotFoundError(f"❌ Template file not found: {TEMPLATE_MD}")

dmp_template_text = TEMPLATE_MD.read_text(encoding="utf-8")
print("✅ DMP Markdown template loaded.")


# --- Build RAG chain ---
def build_rag_chain(retriever, llm_model=LLM_MODEL):
    """
    Build a flexible RAG pipeline that retrieves context
    and generates a context-grounded NIH DMP section.
    """
    llm = Ollama(model=llm_model)

    prompt_template = """You are an expert biomedical data steward and grant writer.
Create a high-quality NIH Data Management and Sharing Plan (DMSP)
based on the retrieved NIH context and the user's query.

----
Context from NIH Repository:
{context}

----
Question:
{question}

Use the context above and follow the NIH template structure. Write fluently and cohesively.
"""
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    parser = StrOutputParser()

    def format_docs(docs):
        """Format retrieved documents into clean text."""
        if not docs:
            return ""
        formatted = []
        for d in docs:
            page = d.metadata.get("page", "")
            title = d.metadata.get("source", "")
            formatted.append(f"[Page {page}] {title}\n{d.page_content.strip()}")
        return "\n\n".join(formatted)

    rag_chain = (
        {
            "context": retriever | format_docs,
            "question": RunnablePassthrough()
        }
        | prompt
        | llm
        | parser
    )

    print(f"🔗 RAG chain initialized with model: {llm_model}")
    return rag_chain


# --- Initialize the RAG chain ---
rag_chain = build_rag_chain(retriever)
print("✅ RAG chain ready for generation.")


✅ Excel loaded successfully: 26 rows
✅ DMP Markdown template loaded.
🔗 RAG chain initialized with model: llama3.3
✅ RAG chain ready for generation.


In [13]:
# ============================================
# 🧩 STEP 5 — RAG-Based DMP Generation Using Titles
# ============================================
import re, pandas as pd, pypandoc
from tqdm import tqdm
from pathlib import Path

# ---------- Paths ----------
EXCEL_PATH = ROOT_DIR / "data" / "inputs" / "inputs.xlsx"
OUTPUT_LOG = ROOT_DIR / "data" / "outputs" / "rag_generated_dmp_log.csv"
OUTPUT_MD.mkdir(parents=True, exist_ok=True)
OUTPUT_DOCX.mkdir(parents=True, exist_ok=True)

# ---------- Load Excel ----------
df = pd.read_excel(EXCEL_PATH)
print(f"✅ Loaded input Excel — {len(df)} rows")

# Normalize column names
df.columns = df.columns.str.strip().str.lower()
df = df.fillna("")

# ---------- Verify template ----------
if not TEMPLATE_MD.exists():
    raise FileNotFoundError(f"❌ Template not found: {TEMPLATE_MD}")
dmp_template_text = TEMPLATE_MD.read_text(encoding="utf-8")
print(f"✅ Loaded NIH DMP Markdown template from: {TEMPLATE_MD}")

# ---------- Helper functions ----------
def sanitize_filename(name: str) -> str:
    """Replace illegal filename characters but preserve readable title."""
    return re.sub(r'[\\/*?:"<>|]', "_", name.strip())

def create_folder(folderpath: Path):
    folderpath.mkdir(parents=True, exist_ok=True)

def save_md(folderpath: Path, filename: str, response: str):
    create_folder(folderpath)
    filepath = folderpath / filename
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(response)
    print(f"💾 Saved: {filepath}")

def md_to_docx(md_filepath: Path, docx_folder: Path, docx_filename: str):
    create_folder(docx_folder)
    docx_path = docx_folder / docx_filename
    pypandoc.convert_file(str(md_filepath), "docx", outputfile=str(docx_path))
    print(f"📄 Converted: {docx_path}")

# ---------- Main Generation ----------
records = []
TOP_K = 6  # retrieved context chunks

for idx, row in tqdm(df.iterrows(), total=len(df), desc="🧠 Generating NIH DMPs"):
    title = str(row["title"]).strip()
    print(f"\n🧩 Generating DMP for: {title}")

    # 1️⃣ Build query from Excel elements
    element_texts = []
    for col in [c for c in df.columns if c.startswith("element")]:
        val = str(row[col]).strip()
        if val:
            element_texts.append(f"{col.upper()}: {val}")
    query_data = "\n".join(element_texts)

    query = (
        f"You are an expert biomedical data steward and grant writer. "
        f"Create a complete NIH Data Management and Sharing Plan (DMSP) for the project titled '{title}'. "
        f"Use retrieved context from the NIH corpus to fill in all template sections accurately.\n\n"
        f"Here is background information from the proposal:\n{query_data}\n"
    )

    # 2️⃣ Retrieve context from FAISS
    try:
        retrieved_docs = retriever.get_relevant_documents(query)
        context_text = "\n\n".join(doc.page_content for doc in retrieved_docs[:TOP_K])
        print(f"🔎 Retrieved {len(retrieved_docs)} context chunks.")
    except Exception as e:
        print(f"⚠️ Retrieval failed for {title}: {e}")
        context_text = ""

    # 3️⃣ Combine context, query, and template
    full_prompt = f"""
You are an expert biomedical data steward and grant writer.
Use the retrieved NIH context and the provided template to generate a complete Data Management and Sharing Plan.

----
Context:
{context_text}

----
Project Query:
{query}

Use the following NIH DMSP Markdown template. Do not alter section titles:
{dmp_template_text}
"""

    # 4️⃣ Run through RAG model
    try:
        response = rag_chain.invoke(full_prompt)

        # 5️⃣ Save using SAME TITLE as in Excel
        safe_title = sanitize_filename(title)
        md_filename = f"{safe_title}.md"
        docx_filename = f"{safe_title}.docx"
        md_path = OUTPUT_MD / md_filename

        save_md(OUTPUT_MD, md_filename, response)
        md_to_docx(md_path, OUTPUT_DOCX, docx_filename)

        # 6️⃣ Log summary
        records.append({
            "Title": title,
            "Query": query,
            "Retrieved_Context": context_text[:1000],
            "Generated_DMP_Preview": response[:1000],
            "Error": ""
        })

    except Exception as e:
        print(f"❌ Error generating DMP for {title}: {e}")
        records.append({
            "Title": title,
            "Query": query,
            "Retrieved_Context": context_text[:1000],
            "Generated_DMP_Preview": "",
            "Error": str(e)
        })

# ---------- Save Log ----------
pd.DataFrame(records).to_csv(OUTPUT_LOG, index=False, encoding="utf-8")
print("\n✅ All NIH DMPs generated successfully — titles preserved exactly as in Excel!")
print(f"📊 CSV log saved to: {OUTPUT_LOG}")


✅ Loaded input Excel — 26 rows
✅ Loaded NIH DMP Markdown template from: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\inputs\dmp-template.md


🧠 Generating NIH DMPs:   0%|          | 0/26 [00:00<?, ?it/s]


🧩 Generating DMP for: Clinical and MRI data from human research participants
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:   4%|▍         | 1/26 [01:42<42:35, 102.20s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Clinical and MRI data from human research participants.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Clinical and MRI data from human research participants.docx

🧩 Generating DMP for: Genomic data from human research participants
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:   8%|▊         | 2/26 [03:11<37:45, 94.41s/it] 

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Genomic data from human research participants.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Genomic data from human research participants.docx

🧩 Generating DMP for: Genomic data from a non-human source
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  12%|█▏        | 3/26 [04:27<33:06, 86.38s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Genomic data from a non-human source.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Genomic data from a non-human source.docx

🧩 Generating DMP for: Secondary data analysis
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  15%|█▌        | 4/26 [05:54<31:42, 86.49s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Secondary data analysis.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Secondary data analysis.docx

🧩 Generating DMP for: Human clinical and genomics data
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  19%|█▉        | 5/26 [07:25<30:50, 88.13s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Human clinical and genomics data.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Human clinical and genomics data.docx

🧩 Generating DMP for: Gene expression analysis data from non-human model organism (zebrafish)
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  23%|██▎       | 6/26 [08:50<29:03, 87.17s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Gene expression analysis data from non-human model organism (zebrafish).md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Gene expression analysis data from non-human model organism (zebrafish).docx

🧩 Generating DMP for: Human survey data
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  27%|██▋       | 7/26 [10:10<26:46, 84.54s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Human survey data.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Human survey data.docx

🧩 Generating DMP for: Clinical Data from Human Research Participants
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  31%|███       | 8/26 [11:36<25:34, 85.23s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Clinical Data from Human Research Participants.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Clinical Data from Human Research Participants.docx

🧩 Generating DMP for: Human genomic data
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  35%|███▍      | 9/26 [13:11<25:01, 88.32s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Human genomic data.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Human genomic data.docx

🧩 Generating DMP for: Technology development
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  38%|███▊      | 10/26 [14:33<23:01, 86.37s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Technology development.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Technology development.docx

🧩 Generating DMP for: Basic Research from a Non-Human Source Example
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  42%|████▏     | 11/26 [16:01<21:41, 86.77s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Basic Research from a Non-Human Source Example.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Basic Research from a Non-Human Source Example.docx

🧩 Generating DMP for: Secondary Data Analysis Example
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  46%|████▌     | 12/26 [17:19<19:38, 84.19s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Secondary Data Analysis Example.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Secondary Data Analysis Example.docx

🧩 Generating DMP for: Survey and Interview Example
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  50%|█████     | 13/26 [18:48<18:31, 85.49s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Survey and Interview Example.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Survey and Interview Example.docx

🧩 Generating DMP for: Human Clinical Trial Data
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  54%|█████▍    | 14/26 [20:14<17:06, 85.57s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Human Clinical Trial Data.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Human Clinical Trial Data.docx

🧩 Generating DMP for: Clinical data from human research participants-NIA
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  58%|█████▊    | 15/26 [21:35<15:28, 84.38s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Clinical data from human research participants-NIA.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Clinical data from human research participants-NIA.docx

🧩 Generating DMP for: Survey, interview, and biological data (tiered access)
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  62%|██████▏   | 16/26 [22:43<13:13, 79.36s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Survey, interview, and biological data (tiered access).md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Survey, interview, and biological data (tiered access).docx

🧩 Generating DMP for: Non-human data (primates)
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  65%|██████▌   | 17/26 [23:44<11:05, 73.93s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Non-human data (primates).md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Non-human data (primates).docx

🧩 Generating DMP for: Secondary data analysis-NIA
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  69%|██████▉   | 18/26 [25:06<10:11, 76.42s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Secondary data analysis-NIA.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Secondary data analysis-NIA.docx

🧩 Generating DMP for: Survey and interview data-NIA
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  73%|███████▎  | 19/26 [26:22<08:53, 76.17s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Survey and interview data-NIA.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Survey and interview data-NIA.docx

🧩 Generating DMP for: Human clinical and genomic data-NIA
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  77%|███████▋  | 20/26 [27:54<08:04, 80.80s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Human clinical and genomic data-NIA.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Human clinical and genomic data-NIA.docx

🧩 Generating DMP for: Non-human data (rodents)-NIA
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  81%|████████  | 21/26 [29:29<07:06, 85.21s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Non-human data (rodents)-NIA.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Non-human data (rodents)-NIA.docx

🧩 Generating DMP for: Clinical data (human biospecimens)
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  85%|████████▍ | 22/26 [30:48<05:33, 83.42s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Clinical data (human biospecimens).md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Clinical data (human biospecimens).docx

🧩 Generating DMP for: Drug discovery including intellectual property
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  88%|████████▊ | 23/26 [32:06<04:05, 81.76s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Drug discovery including intellectual property.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Drug discovery including intellectual property.docx

🧩 Generating DMP for: HeLa Cell Whole Genome Sequence (DNA or RNA)
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  92%|█████████▏| 24/26 [33:33<02:46, 83.14s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\HeLa Cell Whole Genome Sequence (DNA or RNA).md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\HeLa Cell Whole Genome Sequence (DNA or RNA).docx

🧩 Generating DMP for: Secondary Data Analysis on Data from Human Subjects-NIA
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs:  96%|█████████▌| 25/26 [34:59<01:24, 84.02s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Secondary Data Analysis on Data from Human Subjects-NIA.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Secondary Data Analysis on Data from Human Subjects-NIA.docx

🧩 Generating DMP for: Analysis of social media posts
🔎 Retrieved 6 context chunks.


🧠 Generating NIH DMPs: 100%|██████████| 26/26 [36:18<00:00, 83.81s/it]

💾 Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Analysis of social media posts.md
📄 Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Analysis of social media posts.docx

✅ All NIH DMPs generated successfully — titles preserved exactly as in Excel!
📊 CSV log saved to: C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\rag_generated_dmp_log.csv





In [14]:
# ============================================
# 🧩 STEP 7 — Full DMP Comparison: Markdown (Generated) vs PDF (Gold, Fuzzy Matching)
# ============================================
import os, re
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
from difflib import SequenceMatcher
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer

# --------------------------------------------------------
# 🗂️ Define ROOT_DIR manually to your project folder
# --------------------------------------------------------
ROOT_DIR = Path(r"C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline")  # ✅ change if needed

# --- Paths ---
GOLD_DIR      = ROOT_DIR / "data" /"inputs"/ "gold_dmps"      # PDF gold-standard DMPs
GENERATED_DIR = ROOT_DIR / "data" / "outputs" / "markdown"      # Generated DMPs
EVAL_DIR      = ROOT_DIR / "data" / "outputs" / "evaluation_results"
EVAL_DIR.mkdir(parents=True, exist_ok=True)

print(f"📗 Gold PDF folder: {GOLD_DIR}")
print(f"📘 Generated Markdown folder: {GENERATED_DIR}")

# --- Models ---
print("🚀 Loading models...")
sbert = SentenceTransformer("all-MiniLM-L6-v2")
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
print("✅ Models ready.")

# --- Helper functions ---
def normalize_name(name: str) -> str:
    name = name.lower()
    name = re.sub(r"[^a-z0-9\s]", " ", name)
    name = re.sub(r"\s+", " ", name)
    return name.strip()

def clean_text(text: str) -> str:
    """Remove markdown or formatting artifacts."""
    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    text = re.sub(r"#+\s*", "", text)
    text = re.sub(r"\*\*|\*", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract readable text from PDF using PyMuPDF."""
    text = ""
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text("text") + "\n"
    except Exception as e:
        print(f"❌ Error reading {pdf_path.name}: {e}")
    return clean_text(text)

def chunk_text(text, size=300):
    """Split long text into 300-word chunks."""
    words = text.split()
    return [" ".join(words[i:i+size]) for i in range(0, len(words), size)]

def compare_chunked(gold_text, gen_text, model):
    """Chunked SBERT + ROUGE similarity between two long texts."""
    gold_chunks = chunk_text(gold_text)
    gen_chunks = chunk_text(gen_text)

    sbert_scores, rouge_scores = [], []
    for g in gold_chunks:
        emb_g = model.encode(g, convert_to_tensor=True)
        chunk_sims = []
        for gen in gen_chunks:
            emb_gen = model.encode(gen, convert_to_tensor=True)
            chunk_sims.append(util.cos_sim(emb_g, emb_gen).item())
        sbert_scores.append(max(chunk_sims))  # best match per gold chunk

        rouge_chunk_scores = [rouge.score(g, gen)["rougeL"].recall for gen in gen_chunks]
        rouge_scores.append(max(rouge_chunk_scores))

    return np.mean(sbert_scores), np.mean(rouge_scores)

def best_fuzzy_match(target, gold_names, threshold=0.6):
    """Find best matching name among gold files using fuzzy ratio."""
    best_match, best_score = None, 0
    for g in gold_names:
        score = SequenceMatcher(None, target, g).ratio()
        if score > best_score:
            best_match, best_score = g, score
    return (best_match, best_score) if best_score >= threshold else (None, best_score)

# --- Collect gold PDFs and generated MDs ---
gold_files = {normalize_name(f.stem): f for f in GOLD_DIR.glob("*.pdf")}
gen_files  = {normalize_name(f.stem): f for f in GENERATED_DIR.glob("*.md")}
print(f"📊 Found {len(gen_files)} generated DMPs and {len(gold_files)} gold PDFs.")

# --- Compare all matching files ---
results = []
for name, gen_path in tqdm(gen_files.items(), desc="🔎 Matching & Comparing DMPs"):
    best_match, score = best_fuzzy_match(name, list(gold_files.keys()))
    if not best_match:
        print(f"⚠️ No gold match for: {gen_path.name}")
        continue

    gold_path = gold_files[best_match]
    gold_text = extract_text_from_pdf(gold_path)
    gen_text  = clean_text(gen_path.read_text(encoding="utf-8"))

    if not gold_text.strip() or not gen_text.strip():
        print(f"⚠️ Skipping empty file: {name}")
        continue

    sbert_sim, rouge_l = compare_chunked(gold_text, gen_text, sbert)
    results.append({
        "Generated_File": gen_path.name,
        "Matched_Gold_PDF": gold_path.name,
        "Match_Score": round(score, 3),
        "SBERT_Similarity": round(sbert_sim, 4),
        "ROUGE_L_Recall": round(rouge_l, 4),
    })
    print(f"✅ Matched {gen_path.name} ↔ {gold_path.name} (score={score:.2f})")

# --- Save results ---
df_results = pd.DataFrame(results)
out_path = EVAL_DIR / "full_dmp_pdf_comparison_fuzzy.csv"
df_results.to_csv(out_path, index=False)
print(f"\n✅ Markdown–PDF (fuzzy) similarity results saved to: {out_path}")
print(f"🧾 Total matched DMP pairs: {len(df_results)}")


📗 Gold PDF folder: C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\inputs\gold_dmps
📘 Generated Markdown folder: C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown
🚀 Loading models...
✅ Models ready.
📊 Found 26 generated DMPs and 26 gold PDFs.


🔎 Matching & Comparing DMPs:   4%|▍         | 1/26 [00:00<00:08,  2.97it/s]

✅ Matched Analysis of social media posts.md ↔ 26-Analysis of social media posts-NCI.pdf (score=0.90)


🔎 Matching & Comparing DMPs:   8%|▊         | 2/26 [00:00<00:08,  2.87it/s]

✅ Matched Basic Research from a Non-Human Source Example.md ↔ 11-Basic Research from a Non-Human Source Example-NIDDK.pdf (score=0.91)


🔎 Matching & Comparing DMPs:  12%|█▏        | 3/26 [00:01<00:09,  2.45it/s]

✅ Matched Clinical and MRI data from human research participants.md ↔ 1-Clinical andor MRI data from human research participants-NIMH.pdf (score=0.92)


🔎 Matching & Comparing DMPs:  15%|█▌        | 4/26 [00:01<00:09,  2.38it/s]

✅ Matched Clinical data (human biospecimens).md ↔ 22-Clinical data (human biospecimens)-NIA.pdf (score=0.90)


🔎 Matching & Comparing DMPs:  19%|█▉        | 5/26 [00:01<00:08,  2.51it/s]

✅ Matched Clinical data from human research participants-NIA.md ↔ 15-Clinical data from human research participants-NIA.pdf (score=0.97)


🔎 Matching & Comparing DMPs:  23%|██▎       | 6/26 [00:02<00:07,  2.58it/s]

✅ Matched Clinical Data from Human Research Participants.md ↔ 15-Clinical data from human research participants-NIA.pdf (score=0.93)


🔎 Matching & Comparing DMPs:  27%|██▋       | 7/26 [00:02<00:07,  2.61it/s]

✅ Matched Drug discovery including intellectual property.md ↔ 23-Drug discovery including intellectual property-NIA.pdf (score=0.93)


🔎 Matching & Comparing DMPs:  31%|███       | 8/26 [00:03<00:06,  2.70it/s]

✅ Matched Gene expression analysis data from non-human model organism (zebrafish).md ↔ 8-Gene expression analysis data from non-human model organism (zebrafish)-NICHD.pdf (score=0.95)


🔎 Matching & Comparing DMPs:  35%|███▍      | 9/26 [00:03<00:06,  2.51it/s]

✅ Matched Genomic data from a non-human source.md ↔ 3-Genomic data from a non-human source-NIMH.pdf (score=0.91)


🔎 Matching & Comparing DMPs:  38%|███▊      | 10/26 [00:04<00:07,  2.28it/s]

✅ Matched Genomic data from human research participants.md ↔ 2-Genomic data from human research participants-NIMH.pdf (score=0.93)


🔎 Matching & Comparing DMPs:  42%|████▏     | 11/26 [00:04<00:06,  2.40it/s]

✅ Matched HeLa Cell Whole Genome Sequence (DNA or RNA).md ↔ 24-HeLa Cell Whole Genome Sequence (DNA or RNA)-OD, NHGRI.pdf (score=0.88)


🔎 Matching & Comparing DMPs:  46%|████▌     | 12/26 [00:04<00:06,  2.30it/s]

✅ Matched Human clinical and genomic data-NIA.md ↔ 20-Human clinical and genomic data-NIA.pdf (score=0.96)


🔎 Matching & Comparing DMPs:  50%|█████     | 13/26 [00:05<00:06,  2.14it/s]

✅ Matched Human clinical and genomics data.md ↔ 7-Human clinical and genomics data-NICHD.pdf (score=0.89)


🔎 Matching & Comparing DMPs:  54%|█████▍    | 14/26 [00:05<00:05,  2.09it/s]

✅ Matched Human Clinical Trial Data.md ↔ 14-Human Clinical Trial Data-NICHD.pdf (score=0.85)


🔎 Matching & Comparing DMPs:  58%|█████▊    | 15/26 [00:06<00:05,  2.03it/s]

✅ Matched Human genomic data.md ↔ 5-Human genomic data-NHGRI.pdf (score=0.82)


🔎 Matching & Comparing DMPs:  62%|██████▏   | 16/26 [00:06<00:04,  2.16it/s]

✅ Matched Human survey data.md ↔ 9-Human survey data-NICHD.pdf (score=0.81)


🔎 Matching & Comparing DMPs:  65%|██████▌   | 17/26 [00:07<00:03,  2.44it/s]

✅ Matched Non-human data (primates).md ↔ 17-Non-human data (primates)-NIA.pdf (score=0.87)


🔎 Matching & Comparing DMPs:  69%|██████▉   | 18/26 [00:07<00:03,  2.36it/s]

✅ Matched Non-human data (rodents)-NIA.md ↔ 21-Non-human data (rodents)-NIA.pdf (score=0.95)


🔎 Matching & Comparing DMPs:  73%|███████▎  | 19/26 [00:07<00:02,  2.70it/s]

✅ Matched Secondary Data Analysis Example.md ↔ 12-Secondary Data Analysis Example-NIDDK.pdf (score=0.87)


🔎 Matching & Comparing DMPs:  77%|███████▋  | 20/26 [00:08<00:02,  2.57it/s]

✅ Matched Secondary Data Analysis on Data from Human Subjects-NIA.md ↔ 25-Secondary Data Analysis on Data from Human Subjects-NIA.pdf (score=0.97)


🔎 Matching & Comparing DMPs:  81%|████████  | 21/26 [00:08<00:01,  2.63it/s]

✅ Matched Secondary data analysis-NIA.md ↔ 18-Secondary data analysis-NIA.pdf (score=0.95)


🔎 Matching & Comparing DMPs:  85%|████████▍ | 22/26 [00:08<00:01,  2.70it/s]

✅ Matched Secondary data analysis.md ↔ 18-Secondary data analysis-NIA.pdf (score=0.87)


🔎 Matching & Comparing DMPs:  88%|████████▊ | 23/26 [00:09<00:01,  2.83it/s]

✅ Matched Survey and interview data-NIA.md ↔ 19-Survey and interview data-NIA.pdf (score=0.95)


🔎 Matching & Comparing DMPs:  92%|█████████▏| 24/26 [00:09<00:00,  2.38it/s]

✅ Matched Survey and Interview Example.md ↔ 13-Survey and Interview Example-NHGRI.pdf (score=0.86)


🔎 Matching & Comparing DMPs:  96%|█████████▌| 25/26 [00:10<00:00,  2.56it/s]

✅ Matched Survey, interview, and biological data (tiered access).md ↔ 16-Survey, interview, and biological data (tiered access)-NIA.pdf (score=0.93)


🔎 Matching & Comparing DMPs: 100%|██████████| 26/26 [00:10<00:00,  2.46it/s]

✅ Matched Technology development.md ↔ 6-Technology development-NHGRI.pdf (score=0.85)

✅ Markdown–PDF (fuzzy) similarity results saved to: C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\evaluation_results\full_dmp_pdf_comparison_fuzzy.csv
🧾 Total matched DMP pairs: 26





In [15]:
# ============================================
# 🧩 STEP 7 — Element-Level Comparison with NIH Gold Standard (Exact Title Match)
# ============================================
import re
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer

# --- Paths ---
# --- Define ROOT_DIR dynamically (project root) ---
from pathlib import Path

# --------------------------------------------------------
# 🗂️ Define ROOT_DIR manually to your project folder
# --------------------------------------------------------
ROOT_DIR = Path(r"C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline")  # ✅ change if needed

print(f"📂 ROOT_DIR set to: {ROOT_DIR}")
GOLD_PATH      = ROOT_DIR / "data" / "inputs" / "inputs.xlsx"
GENERATED_DIR  = ROOT_DIR / "data" / "outputs" / "markdown"
EVAL_DIR       = ROOT_DIR / "data" / "outputs" / "evaluation_results"
EVAL_DIR.mkdir(parents=True, exist_ok=True)

print(f"📗 Gold Excel: {GOLD_PATH}")
print(f"📘 Generated MD folder: {GENERATED_DIR}")

# --- Load gold reference (Excel) ---
df_gold = pd.read_excel(GOLD_PATH)
df_gold.columns = df_gold.columns.str.strip().str.lower()
df_gold = df_gold.fillna("").astype(str)

def normalize_title(name: str) -> str:
    name = name.lower()
    name = re.sub(r"[^a-z0-9\s]", " ", name)
    name = re.sub(r"\s+", " ", name)
    return name.strip()

df_gold["title_norm"] = df_gold["title"].apply(normalize_title)

gold_elements = [
    "element_1a","element_1b","element_1c",
    "element_2","element_3",
    "element_4a","element_4b","element_4c",
    "element_5a","element_5b","element_5c",
    "element_6"
]
print(f"✅ Loaded {len(df_gold)} gold projects.")

# --- Models ---
print("🚀 Loading evaluation models...")
sbert = SentenceTransformer("all-MiniLM-L6-v2")
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
print("✅ Models ready.")

# --- Markdown parsing helpers ---
def is_title(line: str) -> bool:
    s = line.strip()
    # Accept markdown headers (#, ##, ...) OR numbered bold section titles like "1. **Data Types**"
    return s.startswith("#") or bool(re.match(r"^\s*\d*\.?\s*\*\*.*\*\*\s*$", s))

def extract_sections(md_path: Path) -> pd.DataFrame:
    """
    Extract {Section Title, Generated Content} pairs from a Markdown file.
    Also strips any <think>...</think> blocks if present.
    """
    text = md_path.read_text(encoding="utf-8")
    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)

    lines = text.splitlines()
    entries, current_title, buf = [], None, []

    for ln in lines:
        if is_title(ln):
            if current_title and any(x.strip() for x in buf):
                entries.append({
                    "Section Title": current_title.strip(),
                    "Generated Content": "\n".join(buf).strip()
                })
            current_title, buf = ln, []
        else:
            buf.append(ln)

    if current_title and any(x.strip() for x in buf):
        entries.append({
            "Section Title": current_title.strip(),
            "Generated Content": "\n".join(buf).strip()
        })

    return pd.DataFrame(entries)

# --- Compare (exact title match) ---
results = []
md_files = sorted(GENERATED_DIR.glob("*.md"))
print(f"🔍 Found {len(md_files)} generated Markdown files.")

for md_file in tqdm(md_files, desc="📊 Comparing element-level"):
    # Your MD files are saved with the SAME title (sanitized) — reverse-sanitize to match Excel
    # We’ll normalize both sides and do exact equality on normalized strings
    gen_title_raw = md_file.stem  # e.g., "National Institute of Mental Health (NIMH)"
    gen_title_norm = normalize_title(gen_title_raw)

    gold_row = df_gold[df_gold["title_norm"] == gen_title_norm]
    if gold_row.empty:
        print(f"⚠️ No gold match for file: {md_file.name}")
        continue

    gold_row = gold_row.iloc[0]
    gold_title = gold_row["title"]

    # Gather gold element texts
    gold_texts = {e: gold_row.get(e, "").strip() for e in gold_elements if gold_row.get(e, "").strip()}
    if not gold_texts:
        print(f"⚠️ Empty gold elements for: {gold_title}")
        continue

    # Extract sections from generated MD
    gen_df = extract_sections(md_file)
    if gen_df.empty:
        print(f"⚠️ No sections extracted from: {md_file.name}")
        continue

    # For each gold element, compare to ALL generated sections; keep best match
    for element, gold_text in gold_texts.items():
        best = None
        for _, sec in gen_df.iterrows():
            gen_text = str(sec["Generated Content"]).strip()
            if not gen_text:
                continue

            emb_gold = sbert.encode(gold_text, convert_to_tensor=True)
            emb_gen  = sbert.encode(gen_text,  convert_to_tensor=True)
            sbert_sim = util.cos_sim(emb_gold, emb_gen).item()
            rouge_l   = rouge.score(gold_text, gen_text)["rougeL"].recall

            cand = {
                "Gold Project": gold_title,
                "Gold Element": element,
                "Generated File": md_file.name,
                "Generated Section Title": sec["Section Title"],
                "SBERT_Similarity": round(sbert_sim, 4),
                "ROUGE_L_Recall": round(rouge_l, 4),
            }
            if (best is None) or (sbert_sim > best["SBERT_Similarity"]):
                best = cand

        if best:
            results.append(best)

# --- Save ---
df_results = pd.DataFrame(results)
out_path = EVAL_DIR / "element_similarity_exact_titles.csv"
df_results.to_csv(out_path, index=False, encoding="utf-8")
print(f"\n✅ Element-level similarity saved to: {out_path}")
print(f"🧾 Total element–section best matches: {len(df_results)}")


📂 ROOT_DIR set to: C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline
📗 Gold Excel: C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\inputs\inputs.xlsx
📘 Generated MD folder: C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown
✅ Loaded 26 gold projects.
🚀 Loading evaluation models...
✅ Models ready.
🔍 Found 26 generated Markdown files.


📊 Comparing element-level: 100%|██████████| 26/26 [00:54<00:00,  2.11s/it]


✅ Element-level similarity saved to: C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\evaluation_results\element_similarity_exact_titles.csv
🧾 Total element–section best matches: 312





In [16]:
# ============================================
# 🧮 Step 8: Summarize Evaluation Results (with Generated_File titles)
# ============================================
import pandas as pd
import numpy as np
from pathlib import Path

# --- Auto-detect project root ---
# --------------------------------------------------------
# 🗂️ Define ROOT_DIR manually to your project folder
# --------------------------------------------------------
ROOT_DIR = Path(r"C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline")  # ✅ change if needed

EVAL_DIR = ROOT_DIR / "data" / "outputs" / "evaluation_results"

# --- Load CSVs ---
full_path = EVAL_DIR / "full_dmp_pdf_comparison_fuzzy.csv"
elem_path = EVAL_DIR / "element_similarity_exact_titles.csv"

df_full = pd.read_csv(full_path)
df_elem = pd.read_csv(elem_path)

print(f"✅ Loaded full-document ({len(df_full)} rows)")
print(f"✅ Loaded element-level ({len(df_elem)} rows)\n")

# ============================================================
# 🧩 1️⃣ FULL-DOCUMENT LEVEL SUMMARY (Mean Only, by Generated_File)
# ============================================================

# Prefer "Generated_File" column; fallback to detected one
if "Generated_File" in df_full.columns:
    project_col = "Generated_File"
else:
    project_col = next(
        (c for c in df_full.columns if "title" in c.lower() or "project" in c.lower() or "matched" in c.lower()),
        df_full.columns[0],
    )

# Find numeric columns
numeric_cols = [c for c in df_full.columns if "sbert" in c.lower() or "rouge" in c.lower()]

# Compute mean per file (if multiple rows)
df_full_summary = (
    df_full.groupby(project_col)[numeric_cols]
    .mean()
    .reset_index()
)

# Format to 2 decimals
df_full_summary["SBERT"] = df_full_summary[numeric_cols[0]].apply(lambda x: f"{x:.2f}")
df_full_summary["ROUGE"] = df_full_summary[numeric_cols[1]].apply(lambda x: f"{x:.2f}")

# Reorder columns and rename for clarity
df_full_table = df_full_summary[[project_col, "SBERT", "ROUGE"]].rename(
    columns={project_col: "Generated_File"}
)

print("📊 Full-document summary table (Mean only, by Generated_File):")
display(df_full_table)

# ============================================================
# 🧩 2️⃣ ELEMENT-LEVEL SUMMARY (Mean ± SD)
# ============================================================

elem_col = next(
    (c for c in df_elem.columns if "element" in c.lower()),
    df_elem.columns[0],
)

numeric_cols_elem = [c for c in df_elem.columns if "sbert" in c.lower() or "rouge" in c.lower()]
df_elem_summary = (
    df_elem.groupby(elem_col)[numeric_cols_elem]
    .agg(["mean", "std"])
    .reset_index()
)
flat_cols_elem = [elem_col, "SBERT_Mean", "SBERT_SD", "ROUGE_Mean", "ROUGE_SD"]
df_elem_summary.columns = flat_cols_elem

df_elem_summary["SBERT"] = df_elem_summary.apply(
    lambda r: f"{r['SBERT_Mean']:.2f} ± {r['SBERT_SD']:.2f}", axis=1)
df_elem_summary["ROUGE"] = df_elem_summary.apply(
    lambda r: f"{r['ROUGE_Mean']:.2f} ± {r['ROUGE_SD']:.2f}", axis=1)

df_elem_table = df_elem_summary[[elem_col, "SBERT", "ROUGE"]].rename(
    columns={elem_col: "Element"}
)

print("\n📊 Element-level summary table (Mean ± SD):")
display(df_elem_table)

# ============================================================
# 💾 Save formatted tables
# ============================================================
out_full = EVAL_DIR / "summary_full_table_mean_only.csv"
out_elem = EVAL_DIR / "summary_element_table_mean_sd.csv"

df_full_table.to_csv(out_full, index=False)
df_elem_table.to_csv(out_elem, index=False)

print(f"\n💾 Saved formatted tables →\n• {out_full}\n• {out_elem}")


✅ Loaded full-document (26 rows)
✅ Loaded element-level (312 rows)

📊 Full-document summary table (Mean only, by Generated_File):


Unnamed: 0,Generated_File,SBERT,ROUGE
0,Analysis of social media posts.md,0.76,0.39
1,Basic Research from a Non-Human Source Example.md,0.74,0.5
2,Clinical Data from Human Research Participants.md,0.71,0.23
3,Clinical and MRI data from human research part...,0.73,0.29
4,Clinical data (human biospecimens).md,0.8,0.44
5,Clinical data from human research participants...,0.8,0.38
6,Drug discovery including intellectual property.md,0.81,0.36
7,Gene expression analysis data from non-human m...,0.78,0.41
8,Genomic data from a non-human source.md,0.7,0.31
9,Genomic data from human research participants.md,0.74,0.28



📊 Element-level summary table (Mean ± SD):


Unnamed: 0,Element,SBERT,ROUGE
0,element_1a,0.79 ± 0.13,0.46 ± 0.29
1,element_1b,0.72 ± 0.10,0.41 ± 0.23
2,element_1c,0.76 ± 0.10,0.45 ± 0.29
3,element_2,0.82 ± 0.08,0.54 ± 0.25
4,element_3,0.78 ± 0.14,0.47 ± 0.27
5,element_4a,0.79 ± 0.11,0.59 ± 0.28
6,element_4b,0.84 ± 0.11,0.56 ± 0.25
7,element_4c,0.87 ± 0.08,0.60 ± 0.27
8,element_5a,0.76 ± 0.14,0.47 ± 0.24
9,element_5b,0.79 ± 0.12,0.45 ± 0.25



💾 Saved formatted tables →
• C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\evaluation_results\summary_full_table_mean_only.csv
• C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\evaluation_results\summary_element_table_mean_sd.csv
