In [15]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

2.5.1+cu121
True
NVIDIA GeForce RTX 3090


In [16]:
# ============================================
# STEP 1 ‚Äî Imports, Config, and Helpers  (LanceDB)
# ============================================
import os, re, time
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import pypandoc  # for Markdown ‚Üí DOCX

# --- LangChain Core ---
from langchain_community.vectorstores import LanceDB       # ‚úÖ use LanceDB instead of FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

load_dotenv()

# ---------- Paths (works in notebook or script) ----------
try:
    ROOT_DIR = Path(__file__).resolve().parents[1]  # when running a .py script
except NameError:
    ROOT_DIR = Path.cwd().parent                     # when running inside Jupyter

# --- Data folders ---
DATA_PDFS   = ROOT_DIR / "data" / "pdfs"
INDEX_DIR   = ROOT_DIR / "data" / "lancedb_index"    # ‚úÖ new folder for LanceDB
EXCEL_PATH  = ROOT_DIR / "data" / "inputs" / "inputs.xlsx"
TEMPLATE_MD = ROOT_DIR / "data" / "inputs" / "dmp-template.md"

# --- Output folders ---
OUTPUT_MD   = ROOT_DIR / "data" / "outputs" / "markdown"
OUTPUT_DOCX = ROOT_DIR / "data" / "outputs" / "docx"

# --- Models / parameters ---
LLM_MODEL   = "llama3.3"

# ---------- Helper functions ----------
def create_folder(folderpath):
    Path(folderpath).mkdir(parents=True, exist_ok=True)

def save_md(folderpath, filename, text):
    create_folder(folderpath)
    (Path(folderpath) / filename).write_text(text, encoding="utf-8")
    print("üíæ Saved:", Path(folderpath) / filename)

def md_to_docs(md_filepath, docx_folderpath, docx_filename):
    create_folder(docx_folderpath)
    pypandoc.convert_file(
        str(md_filepath), "docx",
        outputfile=str(Path(docx_folderpath) / docx_filename)
    )
    print("üìÑ Converted:", Path(docx_folderpath) / docx_filename)

def clean_filename(name: str) -> str:
    """Remove illegal characters from filenames (Windows-safe)."""
    return re.sub(r'[\\/*?:"<>|]', "_", str(name)).strip()

# ---------- Ensure required folders exist ----------
for p in [DATA_PDFS, INDEX_DIR, OUTPUT_MD, OUTPUT_DOCX]:
    p.mkdir(parents=True, exist_ok=True)

# ---------- Sanity print ----------
print("‚úÖ STEP 1 ready (LanceDB version)")
print(f"ROOT_DIR   : {ROOT_DIR}")
print(f"DATA_PDFS  : {DATA_PDFS}")
print(f"INDEX_DIR  : {INDEX_DIR}")
print(f"EXCEL_PATH : {EXCEL_PATH}")
print(f"TEMPLATE_MD: {TEMPLATE_MD}")
print(f"OUTPUT_MD  : {OUTPUT_MD}")
print(f"OUTPUT_DOCX: {OUTPUT_DOCX}")


‚úÖ STEP 1 ready (LanceDB version)
ROOT_DIR   : c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline
DATA_PDFS  : c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\pdfs
INDEX_DIR  : c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\lancedb_index
EXCEL_PATH : c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\inputs\inputs.xlsx
TEMPLATE_MD: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\inputs\dmp-template.md
OUTPUT_MD  : c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown
OUTPUT_DOCX: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx


In [17]:
# =========================================================
# STEP 2 ‚Äî Load PDFs and TXT Files, Split into Text Chunks (Cached) ‚Äî LanceDB Version
# =========================================================
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pickle
import warnings
from tqdm import tqdm
from pathlib import Path

# --- Optional: Silence PDFMiner warnings ---
warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer")

# --- Paths based on your system ---
ROOT_DIR = Path(r"c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline")
DATA_PDFS = ROOT_DIR / "data" / "pdfs"
INDEX_DIR = ROOT_DIR / "data" / "lancedb_index"          # ‚úÖ updated for LanceDB
CHUNK_CACHE_PATH = INDEX_DIR / "chunks_cache.pkl"

# --- Ensure folders exist ---
DATA_PDFS.mkdir(parents=True, exist_ok=True)
INDEX_DIR.mkdir(parents=True, exist_ok=True)

# --------------------------------------------------------
# Function: Load PDFs and TXT files
# --------------------------------------------------------
def load_docs_from_folder(folder: Path):
    """Load all PDF and TXT files as LangChain Document objects."""
    if not folder.exists():
        raise FileNotFoundError(f"‚ùå Folder not found: {folder}")

    pdf_files = sorted(folder.glob("*.pdf"))
    txt_files = sorted(folder.glob("*.txt"))
    all_files = pdf_files + txt_files
    if not all_files:
        print(f"‚ö†Ô∏è No PDF or TXT files found in {folder}. Please add some files.")
        return []

    docs = []
    for fpath in tqdm(all_files, desc=f"üìÑ Loading files from {folder}"):
        try:
            if fpath.suffix.lower() == ".pdf":
                loader = PyPDFLoader(str(fpath))
            elif fpath.suffix.lower() == ".txt":
                loader = TextLoader(str(fpath), encoding="utf-8")
            else:
                print(f"‚è≠Ô∏è Skipped unsupported file: {fpath.name}")
                continue
            docs.extend(loader.load())
        except Exception as e:
            print(f"‚ùå Error loading {fpath.name}: {e}")

    print(f"\n‚úÖ Loaded {len(docs)} pages from {len(all_files)} files in '{folder}'.")
    return docs

# --------------------------------------------------------
# Function: Split documents into chunks
# --------------------------------------------------------
def split_into_chunks(docs, chunk_size=800, chunk_overlap=120):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = splitter.split_documents(docs)
    print(f"üß© Created {len(chunks)} chunks from {len(docs)} document pages.")
    return chunks

# --------------------------------------------------------
# Function: Load or Create Cached Chunks
# --------------------------------------------------------
def load_or_create_chunks(folder=DATA_PDFS, cache_path=CHUNK_CACHE_PATH):
    """Load cached chunks if available; otherwise load, split, and cache."""
    if cache_path.exists():
        print(f"‚ö° Loading cached chunks from {cache_path}")
        with open(cache_path, "rb") as f:
            chunks = pickle.load(f)
    else:
        print("üïí No cache found ‚Äî processing documents...")
        raw_docs = load_docs_from_folder(folder)
        if not raw_docs:
            return []  # exit early if folder empty
        chunks = split_into_chunks(raw_docs)
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        with open(cache_path, "wb") as f:
            pickle.dump(chunks, f)
        print(f"üíæ Saved chunks cache ‚Üí {cache_path}")
    return chunks

# --------------------------------------------------------
# Example Run
# --------------------------------------------------------
chunks = load_or_create_chunks()
print(f"‚úÖ STEP 2 ready ‚Äî {len(chunks)} chunks loaded (for LanceDB).")


‚ö° Loading cached chunks from c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\lancedb_index\chunks_cache.pkl
‚úÖ STEP 2 ready ‚Äî 953996 chunks loaded (for LanceDB).


In [18]:
import lancedb
print("LanceDB version:", lancedb.__version__)
db = lancedb.connect("data/index/lancedb_index")
print("‚úÖ Connected successfully:", db.uri)

LanceDB version: 0.25.3
‚úÖ Connected successfully: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\notebook_DMP_RAG\data\index\lancedb_index


In [19]:
# ============================================ 
# STEP 3 ‚Äî Build or Load LanceDB Index (RAG-Version 5 ‚Äî Cross-Encoder Re-Ranking)
# ============================================

from langchain_community.vectorstores import LanceDB
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from sentence_transformers import CrossEncoder
import lancedb
import torch
import time
from tqdm import tqdm
from pathlib import Path
import numpy as np
import pandas as pd

# --- Parameters ---
TOP_K = 8
EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"     # semantic precision
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"     # re-ranking layer
INDEX_DIR = Path("data/index/lancedb_v5_index")

# --- Device setup ---
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üíª Using device: {DEVICE.upper()}")

# --- Auto-adjust batch size by VRAM ---
def auto_batch_size():
    if DEVICE != "cuda":
        return 16
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    if vram_gb >= 22:
        return 128
    elif vram_gb >= 12:
        return 64
    else:
        return 32

BATCH_SIZE = auto_batch_size()
print(f"üß† GPU VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
print(f"‚öôÔ∏è Batch size set to: {BATCH_SIZE}")

# --- Initialize embedding model ---
embeddings = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL,
    model_kwargs={"device": DEVICE},
    encode_kwargs={"batch_size": BATCH_SIZE}
)

# ------------------------------------------------------------
# Function: Build or Load LanceDB Index
# ------------------------------------------------------------
def build_or_load_lancedb_index(index_dir=INDEX_DIR, chunks=None):
    """
    Builds or loads a LanceDB index for RAG-Version 5.
    Uses GPU embeddings and precise re-ranking for improved retrieval quality.
    """
    index_dir.mkdir(parents=True, exist_ok=True)
    db = lancedb.connect(str(index_dir))

    # --- Load existing index ---
    if "documents" in db.table_names():
        print("üì¶ Existing LanceDB index found ‚Äî loading...")
        table = db.open_table("documents")
        vectorstore = LanceDB(connection=db, table=table, embedding=embeddings)
        print("‚úÖ LanceDB index loaded successfully.")
        return vectorstore

    # --- Validate chunks ---
    if not chunks:
        raise RuntimeError("‚ùå No chunks provided. Please run Step 2 first.")

    total = len(chunks)
    print(f"üß± Building new LanceDB index (RAG-V5) on {DEVICE.upper()}...")
    print(f"üìä Total chunks to embed: {total:,}")

    start_time = time.time()

    # --- Create LanceDB vector store ---
    vectorstore = LanceDB.from_documents(
        tqdm(chunks, desc="üî¢ Embedding text chunks", ncols=100),
        embedding=embeddings,
        uri=str(index_dir),
        table_name="documents"
    )

    duration = time.time() - start_time
    print(f"üíæ Index saved to {index_dir}")
    print(f"‚è±Ô∏è Build completed in {duration/60:.2f} min ({duration:.1f} sec)")
    return vectorstore


# ------------------------------------------------------------
# Execute Step 3 ‚Äî Build or Load Index
# ------------------------------------------------------------
vectorstore = build_or_load_lancedb_index(INDEX_DIR, chunks)

# ------------------------------------------------------------
# Step 3.1 ‚Äî Base Retriever (pull more for re-ranking)
# ------------------------------------------------------------
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": TOP_K * 2}
)

# ------------------------------------------------------------
# Step 3.2 ‚Äî Custom Cross-Encoder Re-Ranker
# ------------------------------------------------------------
print("‚öôÔ∏è Initializing custom Cross-Encoder reranker...")
cross_encoder = CrossEncoder(RERANKER_MODEL, device=DEVICE)

def rerank_with_cross_encoder(query, docs, top_k=TOP_K):
    pairs = [(query, d.page_content) for d in docs]
    scores = cross_encoder.predict(pairs)  # numpy array
    ranked = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True)
    top_docs = [d for d, _ in ranked[:top_k]]
    top_scores = [float(s) for _, s in ranked[:top_k]]
    return top_docs, top_scores


print(f"‚úÖ Retriever ready (LanceDB + {DEVICE.upper()} + Cross-Encoder Re-Ranking, top_k={TOP_K})")

# --- Performance Summary ---
torch.cuda.empty_cache()
print("üìà Configuration Summary:")
print(f"   ‚Ä¢ LLM Model: llama3.3 (Ollama)")
print(f"   ‚Ä¢ Embedding Model: {EMBED_MODEL}")
print(f"   ‚Ä¢ Re-Ranker Model: {RERANKER_MODEL}")
print(f"   ‚Ä¢ Vector DB: LanceDB")
print(f"   ‚Ä¢ Top-k Retrieval: {TOP_K}")
print(f"   ‚Ä¢ Batch Size: {BATCH_SIZE}")
print(f"   ‚Ä¢ Device: {DEVICE.upper()}")


üíª Using device: CUDA
üß† GPU VRAM: 25.8 GB
‚öôÔ∏è Batch size set to: 128
üß± Building new LanceDB index (RAG-V5) on CUDA...
üìä Total chunks to embed: 953,996


üî¢ Embedding text chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 953996/953996 [00:00<00:00, 3393697.52it/s]


üíæ Index saved to data\index\lancedb_v5_index
‚è±Ô∏è Build completed in 69.76 min (4185.4 sec)
‚öôÔ∏è Initializing custom Cross-Encoder reranker...
‚úÖ Retriever ready (LanceDB + CUDA + Cross-Encoder Re-Ranking, top_k=8)
üìà Configuration Summary:
   ‚Ä¢ LLM Model: llama3.3 (Ollama)
   ‚Ä¢ Embedding Model: sentence-transformers/all-mpnet-base-v2
   ‚Ä¢ Re-Ranker Model: cross-encoder/ms-marco-MiniLM-L-6-v2
   ‚Ä¢ Vector DB: LanceDB
   ‚Ä¢ Top-k Retrieval: 8
   ‚Ä¢ Batch Size: 128
   ‚Ä¢ Device: CUDA


In [20]:
# ============================================
# üß© STEP 4 ‚Äî Load Excel, Template, and Build RAG Chain (Fixed)
# ============================================
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import Ollama
import pandas as pd

# --- Load Excel file ---
if not EXCEL_PATH.exists():
    raise FileNotFoundError(f"‚ùå Excel file not found: {EXCEL_PATH}")

df = pd.read_excel(EXCEL_PATH)
print(f"‚úÖ Excel loaded successfully: {len(df)} rows")

# --- Load Markdown Template ---
if not TEMPLATE_MD.exists():
    raise FileNotFoundError(f"‚ùå Template file not found: {TEMPLATE_MD}")

dmp_template_text = TEMPLATE_MD.read_text(encoding="utf-8")
print("‚úÖ DMP Markdown template loaded.")


# --- Build RAG chain ---
def build_rag_chain(retriever, llm_model=LLM_MODEL):
    """
    Build a flexible RAG pipeline that retrieves context
    and generates a context-grounded NIH DMP section.
    """
    llm = Ollama(model=llm_model)

    prompt_template = """You are an expert biomedical data steward and grant writer.
Create a high-quality NIH Data Management and Sharing Plan (DMSP)
based on the retrieved NIH context and the user's query.

----
Context from NIH Repository:
{context}

----
Question:
{question}

Use the context above and follow the NIH template structure. Write fluently and cohesively.
"""
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    parser = StrOutputParser()

    def format_docs(docs):
        """Format retrieved documents into clean text."""
        if not docs:
            return ""
        formatted = []
        for d in docs:
            page = d.metadata.get("page", "")
            title = d.metadata.get("source", "")
            formatted.append(f"[Page {page}] {title}\n{d.page_content.strip()}")
        return "\n\n".join(formatted)

    rag_chain = (
        {
            "context": retriever | format_docs,
            "question": RunnablePassthrough()
        }
        | prompt
        | llm
        | parser
    )

    print(f"üîó RAG chain initialized with model: {llm_model}")
    return rag_chain


# --- Initialize the RAG chain ---
rag_chain = build_rag_chain(retriever)
print("‚úÖ RAG chain ready for generation.")


‚úÖ Excel loaded successfully: 26 rows
‚úÖ DMP Markdown template loaded.
üîó RAG chain initialized with model: llama3.3
‚úÖ RAG chain ready for generation.


In [21]:
# ============================================
# üß© STEP 5 ‚Äî RAG-Based DMP Generation Using Titles (LanceDB Version)
# ============================================
import re, pandas as pd, pypandoc
from tqdm import tqdm
from pathlib import Path

# ---------- Paths ----------
EXCEL_PATH  = ROOT_DIR / "data" / "inputs" / "inputs.xlsx"
OUTPUT_LOG  = ROOT_DIR / "data" / "outputs" / "rag_generated_dmp_log.csv"
OUTPUT_MD.mkdir(parents=True, exist_ok=True)
OUTPUT_DOCX.mkdir(parents=True, exist_ok=True)

# ---------- Load Excel ----------
df = pd.read_excel(EXCEL_PATH)
print(f"‚úÖ Loaded input Excel ‚Äî {len(df)} rows")

# Normalize column names
df.columns = df.columns.str.strip().str.lower()
df = df.fillna("")

# ---------- Verify template ----------
if not TEMPLATE_MD.exists():
    raise FileNotFoundError(f"‚ùå Template not found: {TEMPLATE_MD}")
dmp_template_text = TEMPLATE_MD.read_text(encoding="utf-8")
print(f"‚úÖ Loaded NIH DMP Markdown template from: {TEMPLATE_MD}")

# ---------- Helper functions ----------
def sanitize_filename(name: str) -> str:
    """Replace illegal filename characters but preserve readable title."""
    return re.sub(r'[\\/*?:"<>|]', "_", name.strip())

def create_folder(folderpath: Path):
    folderpath.mkdir(parents=True, exist_ok=True)

def save_md(folderpath: Path, filename: str, response: str):
    create_folder(folderpath)
    filepath = folderpath / filename
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(response)
    print(f"üíæ Saved: {filepath}")

def md_to_docx(md_filepath: Path, docx_folder: Path, docx_filename: str):
    create_folder(docx_folder)
    docx_path = docx_folder / docx_filename
    pypandoc.convert_file(str(md_filepath), "docx", outputfile=str(docx_path))
    print(f"üìÑ Converted: {docx_path}")

# ---------- Main Generation ----------
records = []


for idx, row in tqdm(df.iterrows(), total=len(df), desc="üß† Generating NIH DMPs"):
    title = str(row["title"]).strip()
    if not title:
        print("‚ö†Ô∏è Skipping row with empty title.")
        continue

    print(f"\nüß© Generating DMP for: {title}")

    # 1Ô∏è‚É£ Build query from Excel elements
    element_texts = []
    for col in [c for c in df.columns if c.startswith("element")]:
        val = str(row[col]).strip()
        if val:
            element_texts.append(f"{col.upper()}: {val}")
    query_data = "\n".join(element_texts)

    query = (
        f"You are an expert biomedical data steward and grant writer. "
        f"Create a complete NIH Data Management and Sharing Plan (DMSP) "
        f"for the project titled '{title}'. Use retrieved context from the "
        f"NIH corpus stored in LanceDB to fill all template sections accurately.\n\n"
        f"Background information from the proposal:\n{query_data}\n"
    )

    # 2Ô∏è‚É£ Retrieve context from LanceDB
    try:
        retrieved_docs = retriever.get_relevant_documents(query)
        context_text = "\n\n".join(doc.page_content for doc in retrieved_docs[:TOP_K])
        print(f"üîé Retrieved {len(retrieved_docs)} context chunks.")
    except Exception as e:
        print(f"‚ö†Ô∏è Retrieval failed for {title}: {e}")
        context_text = ""

    # 3Ô∏è‚É£ Combine context, query, and template
    full_prompt = f"""
You are an expert biomedical data steward and grant writer.
Use the retrieved NIH context and the provided template to generate a complete Data Management and Sharing Plan.

----
Context:
{context_text}

----
Project Query:
{query}

Use the following NIH DMSP Markdown template. Do not alter section titles:
{dmp_template_text}
"""

    # 4Ô∏è‚É£ Run through RAG model
    try:
        response = rag_chain.invoke(full_prompt)

        # 5Ô∏è‚É£ Save using SAME TITLE as in Excel
        safe_title = sanitize_filename(title)
        md_filename = f"{safe_title}.md"
        docx_filename = f"{safe_title}.docx"
        md_path = OUTPUT_MD / md_filename

        save_md(OUTPUT_MD, md_filename, response)
        md_to_docx(md_path, OUTPUT_DOCX, docx_filename)

        # 6Ô∏è‚É£ Log summary
        records.append({
            "Title": title,
            "Query": query,
            "Retrieved_Context": context_text[:1000],
            "Generated_DMP_Preview": response[:1000],
            "Error": ""
        })

    except Exception as e:
        print(f"‚ùå Error generating DMP for {title}: {e}")
        records.append({
            "Title": title,
            "Query": query,
            "Retrieved_Context": context_text[:1000],
            "Generated_DMP_Preview": "",
            "Error": str(e)
        })

# ---------- Save Log ----------
pd.DataFrame(records).to_csv(OUTPUT_LOG, index=False, encoding="utf-8")
print("\n‚úÖ All NIH DMPs generated successfully ‚Äî titles preserved exactly as in Excel!")
print(f"üìä CSV log saved to: {OUTPUT_LOG}")


‚úÖ Loaded input Excel ‚Äî 26 rows
‚úÖ Loaded NIH DMP Markdown template from: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\inputs\dmp-template.md


üß† Generating NIH DMPs:   0%|          | 0/26 [00:00<?, ?it/s]


üß© Generating DMP for: Clinical and MRI data from human research participants
üîé Retrieved 16 context chunks.
üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Clinical and MRI data from human research participants.md


üß† Generating NIH DMPs:   4%|‚ñç         | 1/26 [02:13<55:29, 133.17s/it]

üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Clinical and MRI data from human research participants.docx

üß© Generating DMP for: Genomic data from human research participants
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:   8%|‚ñä         | 2/26 [03:39<42:21, 105.89s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Genomic data from human research participants.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Genomic data from human research participants.docx

üß© Generating DMP for: Genomic data from a non-human source
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  12%|‚ñà‚ñè        | 3/26 [05:08<37:32, 97.94s/it] 

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Genomic data from a non-human source.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Genomic data from a non-human source.docx

üß© Generating DMP for: Secondary data analysis
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  15%|‚ñà‚ñå        | 4/26 [06:21<32:15, 88.00s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Secondary data analysis.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Secondary data analysis.docx

üß© Generating DMP for: Human clinical and genomics data
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  19%|‚ñà‚ñâ        | 5/26 [07:54<31:27, 89.88s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Human clinical and genomics data.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Human clinical and genomics data.docx

üß© Generating DMP for: Gene expression analysis data from non-human model organism (zebrafish)
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  23%|‚ñà‚ñà‚ñé       | 6/26 [09:12<28:33, 85.70s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Gene expression analysis data from non-human model organism (zebrafish).md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Gene expression analysis data from non-human model organism (zebrafish).docx

üß© Generating DMP for: Human survey data
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  27%|‚ñà‚ñà‚ñã       | 7/26 [10:37<27:10, 85.79s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Human survey data.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Human survey data.docx

üß© Generating DMP for: Clinical Data from Human Research Participants
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  31%|‚ñà‚ñà‚ñà       | 8/26 [12:06<26:00, 86.69s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Clinical Data from Human Research Participants.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Clinical Data from Human Research Participants.docx

üß© Generating DMP for: Human genomic data
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  35%|‚ñà‚ñà‚ñà‚ñç      | 9/26 [13:26<23:59, 84.70s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Human genomic data.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Human genomic data.docx

üß© Generating DMP for: Technology development
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  38%|‚ñà‚ñà‚ñà‚ñä      | 10/26 [14:54<22:49, 85.60s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Technology development.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Technology development.docx

üß© Generating DMP for: Basic Research from a Non-Human Source Example
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 11/26 [16:24<21:43, 86.87s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Basic Research from a Non-Human Source Example.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Basic Research from a Non-Human Source Example.docx

üß© Generating DMP for: Secondary Data Analysis Example
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 12/26 [17:49<20:08, 86.34s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Secondary Data Analysis Example.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Secondary Data Analysis Example.docx

üß© Generating DMP for: Survey and Interview Example
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 13/26 [19:02<17:50, 82.32s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Survey and Interview Example.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Survey and Interview Example.docx

üß© Generating DMP for: Human Clinical Trial Data
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 14/26 [20:35<17:05, 85.47s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Human Clinical Trial Data.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Human Clinical Trial Data.docx

üß© Generating DMP for: Clinical data from human research participants-NIA
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 15/26 [21:56<15:25, 84.11s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Clinical data from human research participants-NIA.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Clinical data from human research participants-NIA.docx

üß© Generating DMP for: Survey, interview, and biological data (tiered access)
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 16/26 [23:16<13:50, 83.06s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Survey, interview, and biological data (tiered access).md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Survey, interview, and biological data (tiered access).docx

üß© Generating DMP for: Non-human data (primates)
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 17/26 [24:34<12:13, 81.47s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Non-human data (primates).md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Non-human data (primates).docx

üß© Generating DMP for: Secondary data analysis-NIA
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 18/26 [25:59<11:00, 82.51s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Secondary data analysis-NIA.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Secondary data analysis-NIA.docx

üß© Generating DMP for: Survey and interview data-NIA
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 19/26 [27:21<09:36, 82.33s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Survey and interview data-NIA.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Survey and interview data-NIA.docx

üß© Generating DMP for: Human clinical and genomic data-NIA
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 20/26 [29:01<08:46, 87.71s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Human clinical and genomic data-NIA.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Human clinical and genomic data-NIA.docx

üß© Generating DMP for: Non-human data (rodents)-NIA
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 21/26 [30:19<07:04, 84.88s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Non-human data (rodents)-NIA.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Non-human data (rodents)-NIA.docx

üß© Generating DMP for: Clinical data (human biospecimens)
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 22/26 [31:50<05:46, 86.55s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Clinical data (human biospecimens).md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Clinical data (human biospecimens).docx

üß© Generating DMP for: Drug discovery including intellectual property
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 23/26 [33:04<04:08, 82.81s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Drug discovery including intellectual property.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Drug discovery including intellectual property.docx

üß© Generating DMP for: HeLa Cell Whole Genome Sequence (DNA or RNA)
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 24/26 [34:24<02:44, 82.11s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\HeLa Cell Whole Genome Sequence (DNA or RNA).md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\HeLa Cell Whole Genome Sequence (DNA or RNA).docx

üß© Generating DMP for: Secondary Data Analysis on Data from Human Subjects-NIA
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 25/26 [36:02<01:26, 86.79s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Secondary Data Analysis on Data from Human Subjects-NIA.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Secondary Data Analysis on Data from Human Subjects-NIA.docx

üß© Generating DMP for: Analysis of social media posts
üîé Retrieved 16 context chunks.


üß† Generating NIH DMPs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26/26 [37:30<00:00, 86.57s/it]

üíæ Saved: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown\Analysis of social media posts.md
üìÑ Converted: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\docx\Analysis of social media posts.docx

‚úÖ All NIH DMPs generated successfully ‚Äî titles preserved exactly as in Excel!
üìä CSV log saved to: c:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\rag_generated_dmp_log.csv





In [22]:
# ============================================
# üß© STEP 7 ‚Äî Full DMP Comparison: Markdown (Generated) vs PDF (Gold, Fuzzy Matching)
# ============================================
import os, re
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
from difflib import SequenceMatcher
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer

# --------------------------------------------------------
# üóÇÔ∏è Define ROOT_DIR manually to your project folder
# --------------------------------------------------------
ROOT_DIR = Path(r"C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline")  # ‚úÖ change if needed

# --- Paths ---
GOLD_DIR      = ROOT_DIR / "data" /"inputs"/ "gold_dmps"      # PDF gold-standard DMPs
GENERATED_DIR = ROOT_DIR / "data" / "outputs" / "markdown"      # Generated DMPs
EVAL_DIR      = ROOT_DIR / "data" / "outputs" / "evaluation_results"
EVAL_DIR.mkdir(parents=True, exist_ok=True)

print(f"üìó Gold PDF folder: {GOLD_DIR}")
print(f"üìò Generated Markdown folder: {GENERATED_DIR}")

# --- Models ---
print("üöÄ Loading models...")
sbert = SentenceTransformer("all-MiniLM-L6-v2")
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
print("‚úÖ Models ready.")

# --- Helper functions ---
def normalize_name(name: str) -> str:
    name = name.lower()
    name = re.sub(r"[^a-z0-9\s]", " ", name)
    name = re.sub(r"\s+", " ", name)
    return name.strip()

def clean_text(text: str) -> str:
    """Remove markdown or formatting artifacts."""
    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    text = re.sub(r"#+\s*", "", text)
    text = re.sub(r"\*\*|\*", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract readable text from PDF using PyMuPDF."""
    text = ""
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text("text") + "\n"
    except Exception as e:
        print(f"‚ùå Error reading {pdf_path.name}: {e}")
    return clean_text(text)

def chunk_text(text, size=300):
    """Split long text into 300-word chunks."""
    words = text.split()
    return [" ".join(words[i:i+size]) for i in range(0, len(words), size)]

def compare_chunked(gold_text, gen_text, model):
    """Chunked SBERT + ROUGE similarity between two long texts."""
    gold_chunks = chunk_text(gold_text)
    gen_chunks = chunk_text(gen_text)

    sbert_scores, rouge_scores = [], []
    for g in gold_chunks:
        emb_g = model.encode(g, convert_to_tensor=True)
        chunk_sims = []
        for gen in gen_chunks:
            emb_gen = model.encode(gen, convert_to_tensor=True)
            chunk_sims.append(util.cos_sim(emb_g, emb_gen).item())
        sbert_scores.append(max(chunk_sims))  # best match per gold chunk

        rouge_chunk_scores = [rouge.score(g, gen)["rougeL"].recall for gen in gen_chunks]
        rouge_scores.append(max(rouge_chunk_scores))

    return np.mean(sbert_scores), np.mean(rouge_scores)

def best_fuzzy_match(target, gold_names, threshold=0.6):
    """Find best matching name among gold files using fuzzy ratio."""
    best_match, best_score = None, 0
    for g in gold_names:
        score = SequenceMatcher(None, target, g).ratio()
        if score > best_score:
            best_match, best_score = g, score
    return (best_match, best_score) if best_score >= threshold else (None, best_score)

# --- Collect gold PDFs and generated MDs ---
gold_files = {normalize_name(f.stem): f for f in GOLD_DIR.glob("*.pdf")}
gen_files  = {normalize_name(f.stem): f for f in GENERATED_DIR.glob("*.md")}
print(f"üìä Found {len(gen_files)} generated DMPs and {len(gold_files)} gold PDFs.")

# --- Compare all matching files ---
results = []
for name, gen_path in tqdm(gen_files.items(), desc="üîé Matching & Comparing DMPs"):
    best_match, score = best_fuzzy_match(name, list(gold_files.keys()))
    if not best_match:
        print(f"‚ö†Ô∏è No gold match for: {gen_path.name}")
        continue

    gold_path = gold_files[best_match]
    gold_text = extract_text_from_pdf(gold_path)
    gen_text  = clean_text(gen_path.read_text(encoding="utf-8"))

    if not gold_text.strip() or not gen_text.strip():
        print(f"‚ö†Ô∏è Skipping empty file: {name}")
        continue

    sbert_sim, rouge_l = compare_chunked(gold_text, gen_text, sbert)
    results.append({
        "Generated_File": gen_path.name,
        "Matched_Gold_PDF": gold_path.name,
        "Match_Score": round(score, 3),
        "SBERT_Similarity": round(sbert_sim, 4),
        "ROUGE_L_Recall": round(rouge_l, 4),
    })
    print(f"‚úÖ Matched {gen_path.name} ‚Üî {gold_path.name} (score={score:.2f})")

# --- Save results ---
df_results = pd.DataFrame(results)
out_path = EVAL_DIR / "full_dmp_pdf_comparison_fuzzy.csv"
df_results.to_csv(out_path, index=False)
print(f"\n‚úÖ Markdown‚ÄìPDF (fuzzy) similarity results saved to: {out_path}")
print(f"üßæ Total matched DMP pairs: {len(df_results)}")


üìó Gold PDF folder: C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\inputs\gold_dmps
üìò Generated Markdown folder: C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown
üöÄ Loading models...
‚úÖ Models ready.
üìä Found 26 generated DMPs and 26 gold PDFs.


üîé Matching & Comparing DMPs:   4%|‚ñç         | 1/26 [00:00<00:10,  2.47it/s]

‚úÖ Matched Analysis of social media posts.md ‚Üî 26-Analysis of social media posts-NCI.pdf (score=0.90)


üîé Matching & Comparing DMPs:   8%|‚ñä         | 2/26 [00:00<00:09,  2.64it/s]

‚úÖ Matched Basic Research from a Non-Human Source Example.md ‚Üî 11-Basic Research from a Non-Human Source Example-NIDDK.pdf (score=0.91)


üîé Matching & Comparing DMPs:  12%|‚ñà‚ñè        | 3/26 [00:01<00:10,  2.13it/s]

‚úÖ Matched Clinical and MRI data from human research participants.md ‚Üî 1-Clinical andor MRI data from human research participants-NIMH.pdf (score=0.92)


üîé Matching & Comparing DMPs:  15%|‚ñà‚ñå        | 4/26 [00:01<00:10,  2.19it/s]

‚úÖ Matched Clinical data (human biospecimens).md ‚Üî 22-Clinical data (human biospecimens)-NIA.pdf (score=0.90)


üîé Matching & Comparing DMPs:  19%|‚ñà‚ñâ        | 5/26 [00:02<00:08,  2.55it/s]

‚úÖ Matched Clinical data from human research participants-NIA.md ‚Üî 15-Clinical data from human research participants-NIA.pdf (score=0.97)


üîé Matching & Comparing DMPs:  23%|‚ñà‚ñà‚ñé       | 6/26 [00:02<00:07,  2.85it/s]

‚úÖ Matched Clinical Data from Human Research Participants.md ‚Üî 15-Clinical data from human research participants-NIA.pdf (score=0.93)


üîé Matching & Comparing DMPs:  27%|‚ñà‚ñà‚ñã       | 7/26 [00:02<00:06,  3.01it/s]

‚úÖ Matched Drug discovery including intellectual property.md ‚Üî 23-Drug discovery including intellectual property-NIA.pdf (score=0.93)


üîé Matching & Comparing DMPs:  31%|‚ñà‚ñà‚ñà       | 8/26 [00:02<00:05,  3.23it/s]

‚úÖ Matched Gene expression analysis data from non-human model organism (zebrafish).md ‚Üî 8-Gene expression analysis data from non-human model organism (zebrafish)-NICHD.pdf (score=0.95)


üîé Matching & Comparing DMPs:  35%|‚ñà‚ñà‚ñà‚ñç      | 9/26 [00:03<00:05,  2.90it/s]

‚úÖ Matched Genomic data from a non-human source.md ‚Üî 3-Genomic data from a non-human source-NIMH.pdf (score=0.91)


üîé Matching & Comparing DMPs:  38%|‚ñà‚ñà‚ñà‚ñä      | 10/26 [00:03<00:05,  2.79it/s]

‚úÖ Matched Genomic data from human research participants.md ‚Üî 2-Genomic data from human research participants-NIMH.pdf (score=0.93)


üîé Matching & Comparing DMPs:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 11/26 [00:03<00:04,  3.01it/s]

‚úÖ Matched HeLa Cell Whole Genome Sequence (DNA or RNA).md ‚Üî 24-HeLa Cell Whole Genome Sequence (DNA or RNA)-OD, NHGRI.pdf (score=0.88)


üîé Matching & Comparing DMPs:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 12/26 [00:04<00:04,  2.85it/s]

‚úÖ Matched Human clinical and genomic data-NIA.md ‚Üî 20-Human clinical and genomic data-NIA.pdf (score=0.96)


üîé Matching & Comparing DMPs:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 13/26 [00:04<00:05,  2.54it/s]

‚úÖ Matched Human clinical and genomics data.md ‚Üî 7-Human clinical and genomics data-NICHD.pdf (score=0.89)


üîé Matching & Comparing DMPs:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 14/26 [00:05<00:05,  2.39it/s]

‚úÖ Matched Human Clinical Trial Data.md ‚Üî 14-Human Clinical Trial Data-NICHD.pdf (score=0.85)


üîé Matching & Comparing DMPs:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 15/26 [00:05<00:04,  2.49it/s]

‚úÖ Matched Human genomic data.md ‚Üî 5-Human genomic data-NHGRI.pdf (score=0.82)


üîé Matching & Comparing DMPs:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 16/26 [00:06<00:03,  2.61it/s]

‚úÖ Matched Human survey data.md ‚Üî 9-Human survey data-NICHD.pdf (score=0.81)


üîé Matching & Comparing DMPs:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 17/26 [00:06<00:03,  2.92it/s]

‚úÖ Matched Non-human data (primates).md ‚Üî 17-Non-human data (primates)-NIA.pdf (score=0.87)


üîé Matching & Comparing DMPs:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 18/26 [00:06<00:02,  3.03it/s]

‚úÖ Matched Non-human data (rodents)-NIA.md ‚Üî 21-Non-human data (rodents)-NIA.pdf (score=0.95)


üîé Matching & Comparing DMPs:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 19/26 [00:06<00:02,  3.35it/s]

‚úÖ Matched Secondary Data Analysis Example.md ‚Üî 12-Secondary Data Analysis Example-NIDDK.pdf (score=0.87)


üîé Matching & Comparing DMPs:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 20/26 [00:07<00:02,  2.89it/s]

‚úÖ Matched Secondary Data Analysis on Data from Human Subjects-NIA.md ‚Üî 25-Secondary Data Analysis on Data from Human Subjects-NIA.pdf (score=0.97)


üîé Matching & Comparing DMPs:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 21/26 [00:07<00:01,  3.02it/s]

‚úÖ Matched Secondary data analysis-NIA.md ‚Üî 18-Secondary data analysis-NIA.pdf (score=0.95)


üîé Matching & Comparing DMPs:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 22/26 [00:07<00:01,  3.35it/s]

‚úÖ Matched Secondary data analysis.md ‚Üî 18-Secondary data analysis-NIA.pdf (score=0.87)


üîé Matching & Comparing DMPs:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 23/26 [00:08<00:00,  3.42it/s]

‚úÖ Matched Survey and interview data-NIA.md ‚Üî 19-Survey and interview data-NIA.pdf (score=0.95)


üîé Matching & Comparing DMPs:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 24/26 [00:08<00:00,  3.20it/s]

‚úÖ Matched Survey and Interview Example.md ‚Üî 13-Survey and Interview Example-NHGRI.pdf (score=0.86)


üîé Matching & Comparing DMPs:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 25/26 [00:08<00:00,  3.32it/s]

‚úÖ Matched Survey, interview, and biological data (tiered access).md ‚Üî 16-Survey, interview, and biological data (tiered access)-NIA.pdf (score=0.93)


üîé Matching & Comparing DMPs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26/26 [00:09<00:00,  2.88it/s]

‚úÖ Matched Technology development.md ‚Üî 6-Technology development-NHGRI.pdf (score=0.85)

‚úÖ Markdown‚ÄìPDF (fuzzy) similarity results saved to: C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\evaluation_results\full_dmp_pdf_comparison_fuzzy.csv
üßæ Total matched DMP pairs: 26





In [23]:
# ============================================
# üß© STEP 7 ‚Äî Element-Level Comparison with NIH Gold Standard (Exact Title Match)
# ============================================
import re
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer

# --- Paths ---
# --- Define ROOT_DIR dynamically (project root) ---
from pathlib import Path

# --------------------------------------------------------
# üóÇÔ∏è Define ROOT_DIR manually to your project folder
# --------------------------------------------------------
ROOT_DIR = Path(r"C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline")  # ‚úÖ change if needed

print(f"üìÇ ROOT_DIR set to: {ROOT_DIR}")
GOLD_PATH      = ROOT_DIR / "data" / "inputs" / "inputs.xlsx"
GENERATED_DIR  = ROOT_DIR / "data" / "outputs" / "markdown"
EVAL_DIR       = ROOT_DIR / "data" / "outputs" / "evaluation_results"
EVAL_DIR.mkdir(parents=True, exist_ok=True)

print(f"üìó Gold Excel: {GOLD_PATH}")
print(f"üìò Generated MD folder: {GENERATED_DIR}")

# --- Load gold reference (Excel) ---
df_gold = pd.read_excel(GOLD_PATH)
df_gold.columns = df_gold.columns.str.strip().str.lower()
df_gold = df_gold.fillna("").astype(str)

def normalize_title(name: str) -> str:
    name = name.lower()
    name = re.sub(r"[^a-z0-9\s]", " ", name)
    name = re.sub(r"\s+", " ", name)
    return name.strip()

df_gold["title_norm"] = df_gold["title"].apply(normalize_title)

gold_elements = [
    "element_1a","element_1b","element_1c",
    "element_2","element_3",
    "element_4a","element_4b","element_4c",
    "element_5a","element_5b","element_5c",
    "element_6"
]
print(f"‚úÖ Loaded {len(df_gold)} gold projects.")

# --- Models ---
print("üöÄ Loading evaluation models...")
sbert = SentenceTransformer("all-MiniLM-L6-v2")
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
print("‚úÖ Models ready.")

# --- Markdown parsing helpers ---
def is_title(line: str) -> bool:
    s = line.strip()
    # Accept markdown headers (#, ##, ...) OR numbered bold section titles like "1. **Data Types**"
    return s.startswith("#") or bool(re.match(r"^\s*\d*\.?\s*\*\*.*\*\*\s*$", s))

def extract_sections(md_path: Path) -> pd.DataFrame:
    """
    Extract {Section Title, Generated Content} pairs from a Markdown file.
    Also strips any <think>...</think> blocks if present.
    """
    text = md_path.read_text(encoding="utf-8")
    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)

    lines = text.splitlines()
    entries, current_title, buf = [], None, []

    for ln in lines:
        if is_title(ln):
            if current_title and any(x.strip() for x in buf):
                entries.append({
                    "Section Title": current_title.strip(),
                    "Generated Content": "\n".join(buf).strip()
                })
            current_title, buf = ln, []
        else:
            buf.append(ln)

    if current_title and any(x.strip() for x in buf):
        entries.append({
            "Section Title": current_title.strip(),
            "Generated Content": "\n".join(buf).strip()
        })

    return pd.DataFrame(entries)

# --- Compare (exact title match) ---
results = []
md_files = sorted(GENERATED_DIR.glob("*.md"))
print(f"üîç Found {len(md_files)} generated Markdown files.")

for md_file in tqdm(md_files, desc="üìä Comparing element-level"):
    # Your MD files are saved with the SAME title (sanitized) ‚Äî reverse-sanitize to match Excel
    # We‚Äôll normalize both sides and do exact equality on normalized strings
    gen_title_raw = md_file.stem  # e.g., "National Institute of Mental Health (NIMH)"
    gen_title_norm = normalize_title(gen_title_raw)

    gold_row = df_gold[df_gold["title_norm"] == gen_title_norm]
    if gold_row.empty:
        print(f"‚ö†Ô∏è No gold match for file: {md_file.name}")
        continue

    gold_row = gold_row.iloc[0]
    gold_title = gold_row["title"]

    # Gather gold element texts
    gold_texts = {e: gold_row.get(e, "").strip() for e in gold_elements if gold_row.get(e, "").strip()}
    if not gold_texts:
        print(f"‚ö†Ô∏è Empty gold elements for: {gold_title}")
        continue

    # Extract sections from generated MD
    gen_df = extract_sections(md_file)
    if gen_df.empty:
        print(f"‚ö†Ô∏è No sections extracted from: {md_file.name}")
        continue

    # For each gold element, compare to ALL generated sections; keep best match
    for element, gold_text in gold_texts.items():
        best = None
        for _, sec in gen_df.iterrows():
            gen_text = str(sec["Generated Content"]).strip()
            if not gen_text:
                continue

            emb_gold = sbert.encode(gold_text, convert_to_tensor=True)
            emb_gen  = sbert.encode(gen_text,  convert_to_tensor=True)
            sbert_sim = util.cos_sim(emb_gold, emb_gen).item()
            rouge_l   = rouge.score(gold_text, gen_text)["rougeL"].recall

            cand = {
                "Gold Project": gold_title,
                "Gold Element": element,
                "Generated File": md_file.name,
                "Generated Section Title": sec["Section Title"],
                "SBERT_Similarity": round(sbert_sim, 4),
                "ROUGE_L_Recall": round(rouge_l, 4),
            }
            if (best is None) or (sbert_sim > best["SBERT_Similarity"]):
                best = cand

        if best:
            results.append(best)

# --- Save ---
df_results = pd.DataFrame(results)
out_path = EVAL_DIR / "element_similarity_exact_titles.csv"
df_results.to_csv(out_path, index=False, encoding="utf-8")
print(f"\n‚úÖ Element-level similarity saved to: {out_path}")
print(f"üßæ Total element‚Äìsection best matches: {len(df_results)}")


üìÇ ROOT_DIR set to: C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline
üìó Gold Excel: C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\inputs\inputs.xlsx
üìò Generated MD folder: C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\markdown
‚úÖ Loaded 26 gold projects.
üöÄ Loading evaluation models...
‚úÖ Models ready.
üîç Found 26 generated Markdown files.


üìä Comparing element-level: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26/26 [00:25<00:00,  1.04it/s]


‚úÖ Element-level similarity saved to: C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\evaluation_results\element_similarity_exact_titles.csv
üßæ Total element‚Äìsection best matches: 312





In [24]:
# ============================================
# üßÆ Step 8: Summarize Evaluation Results (with Generated_File titles)
# ============================================
import pandas as pd
import numpy as np
from pathlib import Path

# --- Auto-detect project root ---
# --------------------------------------------------------
# üóÇÔ∏è Define ROOT_DIR manually to your project folder
# --------------------------------------------------------
ROOT_DIR = Path(r"C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline")  # ‚úÖ change if needed

EVAL_DIR = ROOT_DIR / "data" / "outputs" / "evaluation_results"

# --- Load CSVs ---
full_path = EVAL_DIR / "full_dmp_pdf_comparison_fuzzy.csv"
elem_path = EVAL_DIR / "element_similarity_exact_titles.csv"

df_full = pd.read_csv(full_path)
df_elem = pd.read_csv(elem_path)

print(f"‚úÖ Loaded full-document ({len(df_full)} rows)")
print(f"‚úÖ Loaded element-level ({len(df_elem)} rows)\n")

# ============================================================
# üß© 1Ô∏è‚É£ FULL-DOCUMENT LEVEL SUMMARY (Mean Only, by Generated_File)
# ============================================================

# Prefer "Generated_File" column; fallback to detected one
if "Generated_File" in df_full.columns:
    project_col = "Generated_File"
else:
    project_col = next(
        (c for c in df_full.columns if "title" in c.lower() or "project" in c.lower() or "matched" in c.lower()),
        df_full.columns[0],
    )

# Find numeric columns
numeric_cols = [c for c in df_full.columns if "sbert" in c.lower() or "rouge" in c.lower()]

# Compute mean per file (if multiple rows)
df_full_summary = (
    df_full.groupby(project_col)[numeric_cols]
    .mean()
    .reset_index()
)

# Format to 2 decimals
df_full_summary["SBERT"] = df_full_summary[numeric_cols[0]].apply(lambda x: f"{x:.2f}")
df_full_summary["ROUGE"] = df_full_summary[numeric_cols[1]].apply(lambda x: f"{x:.2f}")

# Reorder columns and rename for clarity
df_full_table = df_full_summary[[project_col, "SBERT", "ROUGE"]].rename(
    columns={project_col: "Generated_File"}
)

print("üìä Full-document summary table (Mean only, by Generated_File):")
display(df_full_table)

# ============================================================
# üß© 2Ô∏è‚É£ ELEMENT-LEVEL SUMMARY (Mean ¬± SD)
# ============================================================

elem_col = next(
    (c for c in df_elem.columns if "element" in c.lower()),
    df_elem.columns[0],
)

numeric_cols_elem = [c for c in df_elem.columns if "sbert" in c.lower() or "rouge" in c.lower()]
df_elem_summary = (
    df_elem.groupby(elem_col)[numeric_cols_elem]
    .agg(["mean", "std"])
    .reset_index()
)
flat_cols_elem = [elem_col, "SBERT_Mean", "SBERT_SD", "ROUGE_Mean", "ROUGE_SD"]
df_elem_summary.columns = flat_cols_elem

df_elem_summary["SBERT"] = df_elem_summary.apply(
    lambda r: f"{r['SBERT_Mean']:.2f} ¬± {r['SBERT_SD']:.2f}", axis=1)
df_elem_summary["ROUGE"] = df_elem_summary.apply(
    lambda r: f"{r['ROUGE_Mean']:.2f} ¬± {r['ROUGE_SD']:.2f}", axis=1)

df_elem_table = df_elem_summary[[elem_col, "SBERT", "ROUGE"]].rename(
    columns={elem_col: "Element"}
)

print("\nüìä Element-level summary table (Mean ¬± SD):")
display(df_elem_table)

# ============================================================
# üíæ Save formatted tables
# ============================================================
out_full = EVAL_DIR / "summary_full_table_mean_only.csv"
out_elem = EVAL_DIR / "summary_element_table_mean_sd.csv"

df_full_table.to_csv(out_full, index=False)
df_elem_table.to_csv(out_elem, index=False)

print(f"\nüíæ Saved formatted tables ‚Üí\n‚Ä¢ {out_full}\n‚Ä¢ {out_elem}")


‚úÖ Loaded full-document (26 rows)
‚úÖ Loaded element-level (312 rows)

üìä Full-document summary table (Mean only, by Generated_File):


Unnamed: 0,Generated_File,SBERT,ROUGE
0,Analysis of social media posts.md,0.76,0.37
1,Basic Research from a Non-Human Source Example.md,0.73,0.46
2,Clinical Data from Human Research Participants.md,0.7,0.23
3,Clinical and MRI data from human research part...,0.74,0.28
4,Clinical data (human biospecimens).md,0.79,0.42
5,Clinical data from human research participants...,0.78,0.44
6,Drug discovery including intellectual property.md,0.8,0.37
7,Gene expression analysis data from non-human m...,0.77,0.35
8,Genomic data from a non-human source.md,0.72,0.33
9,Genomic data from human research participants.md,0.68,0.31



üìä Element-level summary table (Mean ¬± SD):


Unnamed: 0,Element,SBERT,ROUGE
0,element_1a,0.81 ¬± 0.14,0.42 ¬± 0.29
1,element_1b,0.74 ¬± 0.11,0.47 ¬± 0.22
2,element_1c,0.77 ¬± 0.11,0.46 ¬± 0.26
3,element_2,0.81 ¬± 0.09,0.48 ¬± 0.23
4,element_3,0.78 ¬± 0.12,0.49 ¬± 0.25
5,element_4a,0.78 ¬± 0.10,0.54 ¬± 0.22
6,element_4b,0.83 ¬± 0.09,0.54 ¬± 0.22
7,element_4c,0.86 ¬± 0.10,0.59 ¬± 0.25
8,element_5a,0.77 ¬± 0.13,0.45 ¬± 0.27
9,element_5b,0.78 ¬± 0.09,0.48 ¬± 0.20



üíæ Saved formatted tables ‚Üí
‚Ä¢ C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\evaluation_results\summary_full_table_mean_only.csv
‚Ä¢ C:\Users\Nahid\AI_DMP\DMP_RAG_Pipeline\data\outputs\evaluation_results\summary_element_table_mean_sd.csv
