In [10]:
print ("Start 10/08/2025")

Start 10/08/2025


In [11]:
# ============================================
# STEP 1 ‚Äî Imports, Config, and Helpers
# ============================================
import os, re, time
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import pypandoc  # for Markdown ‚Üí DOCX

# --- LangChain Core ---
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

load_dotenv()

# ---------- Paths (works in notebook or script) ----------
try:
    ROOT_DIR = Path(__file__).resolve().parents[1]  # when running a .py script
except NameError:
    ROOT_DIR = Path.cwd().parent                     # when running inside Jupyter

# --- Data folders ---
DATA_PDFS   = ROOT_DIR / "data" / "NIH_95"
INDEX_DIR   = ROOT_DIR / "data" / "faiss_index"
EXCEL_PATH  = ROOT_DIR / "data" / "inputs" / "inputs.xlsx"
TEMPLATE_MD = ROOT_DIR / "data" / "inputs" / "dmp-template.md"

# --- Output folders ---
OUTPUT_MD   = ROOT_DIR / "data" / "outputs1" / "markdown"
OUTPUT_DOCX = ROOT_DIR / "data" / "outputs1" / "docx"

# --- Models / parameters ---
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL   = "llama3.3"
TOP_K       = 6

# ---------- Helper functions ----------
def create_folder(folderpath):
    Path(folderpath).mkdir(parents=True, exist_ok=True)

def save_md(folderpath, filename, text):
    create_folder(folderpath)
    (Path(folderpath) / filename).write_text(text, encoding="utf-8")
    print("üíæ Saved:", Path(folderpath) / filename)

def md_to_docs(md_filepath, docx_folderpath, docx_filename):
    create_folder(docx_folderpath)
    pypandoc.convert_file(
        str(md_filepath), "docx",
        outputfile=str(Path(docx_folderpath) / docx_filename)
    )
    print("üìÑ Converted:", Path(docx_folderpath) / docx_filename)

def clean_filename(name: str) -> str:
    """Remove illegal characters from filenames (Windows-safe)."""
    return re.sub(r'[\\/*?:"<>|]', "_", str(name)).strip()

# ---------- Ensure required folders exist ----------
for p in [DATA_PDFS, INDEX_DIR, OUTPUT_MD, OUTPUT_DOCX]:
    p.mkdir(parents=True, exist_ok=True)

# ---------- Sanity print ----------
print("‚úÖ STEP 1 ready")
print(f"ROOT_DIR   : {ROOT_DIR}")
print(f"DATA_PDFS  : {DATA_PDFS}")
print(f"INDEX_DIR  : {INDEX_DIR}")
print(f"EXCEL_PATH : {EXCEL_PATH}")
print(f"TEMPLATE_MD: {TEMPLATE_MD}")
print(f"OUTPUT_MD  : {OUTPUT_MD}")
print(f"OUTPUT_DOCX: {OUTPUT_DOCX}")
print(f"EMBED_MODEL: {EMBED_MODEL} | LLM_MODEL: {LLM_MODEL} | TOP_K: {TOP_K}")


‚úÖ STEP 1 ready
ROOT_DIR   : c:\Users\Nahid\dmpchef
DATA_PDFS  : c:\Users\Nahid\dmpchef\data\NIH_95
INDEX_DIR  : c:\Users\Nahid\dmpchef\data\faiss_index
EXCEL_PATH : c:\Users\Nahid\dmpchef\data\inputs\inputs.xlsx
TEMPLATE_MD: c:\Users\Nahid\dmpchef\data\inputs\dmp-template.md
OUTPUT_MD  : c:\Users\Nahid\dmpchef\data\outputs1\markdown
OUTPUT_DOCX: c:\Users\Nahid\dmpchef\data\outputs1\docx
EMBED_MODEL: sentence-transformers/all-MiniLM-L6-v2 | LLM_MODEL: llama3.3 | TOP_K: 6


In [12]:
# ============================================
# STEP 2 ‚Äî Load PDFs and Split into Text Chunks
# ============================================
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


def load_pdfs_from_folder(folder: Path):
    """Load all PDF files from a folder into LangChain Document objects."""
    if not folder.exists():
        raise FileNotFoundError(f"‚ùå Folder not found: {folder}")
    pdf_files = sorted(folder.glob("*.pdf"))
    if not pdf_files:
        raise FileNotFoundError(f"‚ö†Ô∏è No PDF files found in {folder}")

    docs = []
    for pdf_path in tqdm(pdf_files, desc="üì• Loading PDFs"):
        try:
            loader = PyPDFLoader(str(pdf_path))
            docs.extend(loader.load())
        except Exception as e:
            print(f"‚ö†Ô∏è Skipped {pdf_path.name}: {e}")

    print(f"‚úÖ Loaded {len(docs)} pages from {len(pdf_files)} PDFs.")
    return docs


def split_into_chunks(docs, chunk_size=800, chunk_overlap=120):
    """Split PDF text into overlapping chunks for embedding/indexing."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = splitter.split_documents(docs)
    print(f"‚úÖ Created {len(chunks)} chunks from {len(docs)} pages.")
    return chunks


# --- Run quick test ---
raw_docs = load_pdfs_from_folder(DATA_PDFS)
chunks = split_into_chunks(raw_docs)


üì• Loading PDFs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 105/105 [00:22<00:00,  4.63it/s]

‚úÖ Loaded 586 pages from 105 PDFs.
‚úÖ Created 2016 chunks from 586 pages.





In [13]:
# ============================================
# STEP 3 ‚Äî Build or Load FAISS Index
# ============================================
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
import time

# --- Initialize embedding model ---
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

def build_or_load_faiss_index(index_dir=INDEX_DIR, chunks=None):
    """
    Builds a new FAISS index from text chunks if none exists,
    otherwise loads the saved one from disk.
    """
    faiss_path = index_dir / "index.faiss"
    pkl_path   = index_dir / "index.pkl"

    # --- If index exists, load it ---
    if faiss_path.exists() and pkl_path.exists():
        print("üì¶ Existing FAISS index found. Loading from disk...")
        vectorstore = FAISS.load_local(
            str(index_dir),
            embeddings,
            allow_dangerous_deserialization=True
        )
        print("‚úÖ FAISS index loaded successfully.")
        return vectorstore

    # --- Otherwise, build new index ---
    if chunks is None or len(chunks) == 0:
        raise RuntimeError("‚ùå No chunks provided. Please run Step 2 first to load and split PDFs.")

    print("üß± Building new FAISS index...")
    start_time = time.time()

    vectorstore = FAISS.from_documents(
        tqdm(chunks, desc="üî¢ Embedding text chunks"),
        embeddings
    )

    # --- Save the index ---
    vectorstore.save_local(str(index_dir))
    duration = time.time() - start_time

    print(f"üíæ Saved new FAISS index to {index_dir}")
    print(f"‚è±Ô∏è Build completed in {duration/60:.2f} minutes ({duration:.1f} seconds)")
    return vectorstore


# --- Execute step ---
vectorstore = build_or_load_faiss_index(INDEX_DIR, chunks)
retriever   = vectorstore.as_retriever(search_kwargs={"k": TOP_K})
print(f"‚úÖ Retriever ready (top_k={TOP_K})")


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:00<00:00, 1607.01it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


üì¶ Existing FAISS index found. Loading from disk...
‚úÖ FAISS index loaded successfully.
‚úÖ Retriever ready (top_k=6)


In [14]:
# ============================================
# üß© STEP 4 ‚Äî Load Excel, Template, and Build RAG Chain (Fixed)
# ============================================
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import Ollama
import pandas as pd

# --- Load Excel file ---
if not EXCEL_PATH.exists():
    raise FileNotFoundError(f"‚ùå Excel file not found: {EXCEL_PATH}")

df = pd.read_excel(EXCEL_PATH)
print(f"‚úÖ Excel loaded successfully: {len(df)} rows")

# --- Load Markdown Template ---
if not TEMPLATE_MD.exists():
    raise FileNotFoundError(f"‚ùå Template file not found: {TEMPLATE_MD}")

dmp_template_text = TEMPLATE_MD.read_text(encoding="utf-8")
print("‚úÖ DMP Markdown template loaded.")


# --- Build RAG chain ---
def build_rag_chain(retriever, llm_model=LLM_MODEL):
    """
    Build a flexible RAG pipeline that retrieves context
    and generates a context-grounded NIH DMP section.
    """
    llm = Ollama(model=llm_model)

    prompt_template = """You are an expert biomedical data steward and grant writer.
Create a high-quality NIH Data Management and Sharing Plan (DMSP)
based on the retrieved NIH context and the user's query.

----
Context from NIH Repository:
{context}

----
Question:
{question}

Use the context above and follow the NIH template structure. Write fluently and cohesively.
"""
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    parser = StrOutputParser()

    def format_docs(docs):
        """Format retrieved documents into clean text."""
        if not docs:
            return ""
        formatted = []
        for d in docs:
            page = d.metadata.get("page", "")
            title = d.metadata.get("source", "")
            formatted.append(f"[Page {page}] {title}\n{d.page_content.strip()}")
        return "\n\n".join(formatted)

    rag_chain = (
        {
            "context": retriever | format_docs,
            "question": RunnablePassthrough()
        }
        | prompt
        | llm
        | parser
    )

    print(f"üîó RAG chain initialized with model: {llm_model}")
    return rag_chain


# --- Initialize the RAG chain ---
rag_chain = build_rag_chain(retriever)
print("‚úÖ RAG chain ready for generation.")


‚úÖ Excel loaded successfully: 26 rows
‚úÖ DMP Markdown template loaded.
üîó RAG chain initialized with model: llama3.3
‚úÖ RAG chain ready for generation.


In [15]:
# ============================================
# üß© STEP 5 ‚Äî RAG-Based DMP Generation Using Titles (CLEAN + ROBUST)
# ============================================
import re
import pandas as pd
import pypandoc
from tqdm import tqdm
from pathlib import Path

# ---------- REQUIRED (assumed already defined earlier in your notebook/script) ----------
# ROOT_DIR: Path
# TEMPLATE_MD: Path
# retriever: LangChain retriever (FAISS retriever, etc.)
# rag_chain: your RAG chain (supports .invoke(prompt) -> str)

# ---------- Paths ----------
EXCEL_PATH = ROOT_DIR / "data" / "inputs" / "inputs.xlsx"

OUTPUT_DIR = ROOT_DIR / "data" / "outputs1"
OUTPUT_MD = OUTPUT_DIR / "md"
OUTPUT_DOCX = OUTPUT_DIR / "docx"
OUTPUT_LOG = OUTPUT_DIR / "rag_generated_dmp_log.csv"

OUTPUT_MD.mkdir(parents=True, exist_ok=True)
OUTPUT_DOCX.mkdir(parents=True, exist_ok=True)

# ---------- Load Excel ----------
df = pd.read_excel(EXCEL_PATH)
print(f"‚úÖ Loaded input Excel ‚Äî {len(df)} rows from: {EXCEL_PATH}")

# Normalize column names
df.columns = df.columns.str.strip().str.lower()
df = df.fillna("")

# ---------- Validate required columns ----------
if "title" not in df.columns:
    raise ValueError("‚ùå Excel must contain a 'title' column (case-insensitive).")

element_cols = [c for c in df.columns if c.startswith("element")]
if not element_cols:
    raise ValueError("‚ùå Excel must contain at least one column starting with 'element' (e.g., element1, element_2).")

# ---------- Verify template ----------
if not TEMPLATE_MD.exists():
    raise FileNotFoundError(f"‚ùå Template not found: {TEMPLATE_MD}")

dmp_template_text = TEMPLATE_MD.read_text(encoding="utf-8")
print(f"‚úÖ Loaded NIH DMP Markdown template from: {TEMPLATE_MD}")

# ---------- Helpers ----------
def sanitize_filename(name: str) -> str:
    """Replace illegal filename characters but preserve readable title."""
    name = (name or "").strip()
    name = re.sub(r"\s+", " ", name)  # collapse whitespace
    return re.sub(r'[\\/*?:"<>|]', "_", name)

def save_text(path: Path, content: str):
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")

def md_to_docx(md_path: Path, docx_path: Path) -> str:
    """
    Convert Markdown to DOCX using Pandoc (via pypandoc).
    Returns empty string on success, error message on failure.
    """
    try:
        docx_path.parent.mkdir(parents=True, exist_ok=True)
        pypandoc.convert_file(str(md_path), "docx", outputfile=str(docx_path))
        return ""
    except Exception as e:
        return str(e)

def retrieve_context(query: str, top_k: int) -> tuple[str, int, str]:
    """
    Returns (context_text, retrieved_count, error_message).
    Uses retriever.invoke(query) if present; otherwise get_relevant_documents(query).
    """
    try:
        if hasattr(retriever, "invoke"):
            docs = retriever.invoke(query)
        else:
            docs = retriever.get_relevant_documents(query)

        docs = docs or []
        context_text = "\n\n".join(getattr(d, "page_content", str(d)) for d in docs[:top_k])
        return context_text, len(docs), ""
    except Exception as e:
        return "", 0, str(e)

# ---------- Main Generation ----------
records = []
TOP_K = 6

for idx, row in tqdm(df.iterrows(), total=len(df), desc="üß† Generating NIH DMPs"):
    raw_title = str(row.get("title", "")).strip()

    # If title is missing, create a fallback
    if not raw_title:
        raw_title = f"Untitled_Project_Row_{idx+1}"

    print(f"\nüß© Generating DMP for: {raw_title}")

    # 1Ô∏è‚É£ Build query from Excel elements
    element_texts = []
    for col in element_cols:
        val = str(row.get(col, "")).strip()
        if val:
            element_texts.append(f"{col.upper()}: {val}")
    query_data = "\n".join(element_texts)

    query = (
        "You are an expert biomedical data steward and grant writer. "
        f"Create a complete NIH Data Management and Sharing Plan (DMSP) for the project titled '{raw_title}'. "
        "Use retrieved context from the NIH corpus to fill in all template sections accurately.\n\n"
        f"Here is background information from the proposal:\n{query_data}\n"
    )

    # 2Ô∏è‚É£ Retrieve context
    context_text, retrieved_n, retrieval_error = retrieve_context(query, TOP_K)
    if retrieval_error:
        print(f"‚ö†Ô∏è Retrieval failed for {raw_title}: {retrieval_error}")
    else:
        print(f"üîé Retrieved {retrieved_n} context chunks (using top {TOP_K}).")

    # 3Ô∏è‚É£ Combine context, query, and template
    full_prompt = f"""
You are an expert biomedical data steward and grant writer.
Use the retrieved NIH context and the provided template to generate a complete Data Management and Sharing Plan.

----
Context:
{context_text}

----
Project Query:
{query}

Use the following NIH DMSP Markdown template. Do not alter section titles:
{dmp_template_text}
""".strip()

    # 4Ô∏è‚É£ Generate with RAG chain
    error_msg = ""
    response_text = ""

    try:
        response = rag_chain.invoke(full_prompt)

        # Normalize response to string (some chains return dict/AIMessage)
        if isinstance(response, str):
            response_text = response
        elif hasattr(response, "content"):
            response_text = response.content
        elif isinstance(response, dict):
            # common keys: "output_text", "text", "result"
            response_text = response.get("output_text") or response.get("text") or response.get("result") or str(response)
        else:
            response_text = str(response)

        # 5Ô∏è‚É£ Save files with title-based filename
        safe_title = sanitize_filename(raw_title)
        md_path = OUTPUT_MD / f"{safe_title}.md"
        docx_path = OUTPUT_DOCX / f"{safe_title}.docx"

        save_text(md_path, response_text)
        print(f"üíæ Saved: {md_path}")

        docx_err = md_to_docx(md_path, docx_path)
        if docx_err:
            print(f"‚ö†Ô∏è DOCX conversion failed (Pandoc?): {docx_err}")
            error_msg = f"DOCX conversion failed: {docx_err}"
        else:
            print(f"üìÑ Converted: {docx_path}")

    except Exception as e:
        error_msg = str(e)
        print(f"‚ùå Error generating DMP for {raw_title}: {error_msg}")

    # 6Ô∏è‚É£ Log summary (always)
    records.append({
        "Title": raw_title,
        "Query": query,
        "Retrieved_Context": (context_text or "")[:1000],
        "Generated_DMP_Preview": (response_text or "")[:1000],
        "Retrieval_Error": retrieval_error,
        "Error": error_msg
    })

# ---------- Save Log ----------
pd.DataFrame(records).to_csv(OUTPUT_LOG, index=False, encoding="utf-8")
print("\n‚úÖ Finished generating NIH DMPs ‚Äî titles preserved from Excel!")
print(f"üìä CSV log saved to: {OUTPUT_LOG}")
print(f"üìÅ MD outputs:   {OUTPUT_MD}")
print(f"üìÅ DOCX outputs: {OUTPUT_DOCX}")


‚úÖ Loaded input Excel ‚Äî 26 rows from: c:\Users\Nahid\dmpchef\data\inputs\inputs.xlsx
‚úÖ Loaded NIH DMP Markdown template from: c:\Users\Nahid\dmpchef\data\inputs\dmp-template.md


üß† Generating NIH DMPs:   0%|          | 0/26 [00:00<?, ?it/s]


üß© Generating DMP for: Clinical and MRI data from human research participants
üîé Retrieved 6 context chunks (using top 6).
üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Clinical and MRI data from human research participants.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Clinical and MRI data from human research participants.docx


üß† Generating NIH DMPs:   4%|‚ñç         | 1/26 [01:19<33:18, 79.94s/it]


üß© Generating DMP for: Genomic data from human research participants
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:   8%|‚ñä         | 2/26 [02:30<29:51, 74.64s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Genomic data from human research participants.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Genomic data from human research participants.docx

üß© Generating DMP for: Genomic data from a non-human source
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  12%|‚ñà‚ñè        | 3/26 [03:41<27:54, 72.80s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Genomic data from a non-human source.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Genomic data from a non-human source.docx

üß© Generating DMP for: Secondary data analysis
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  15%|‚ñà‚ñå        | 4/26 [04:52<26:22, 71.94s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Secondary data analysis.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Secondary data analysis.docx

üß© Generating DMP for: Human clinical and genomics data
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  19%|‚ñà‚ñâ        | 5/26 [06:04<25:15, 72.15s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Human clinical and genomics data.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Human clinical and genomics data.docx

üß© Generating DMP for: Gene expression analysis data from non-human model organism (zebrafish)
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  23%|‚ñà‚ñà‚ñé       | 6/26 [07:25<25:05, 75.27s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Gene expression analysis data from non-human model organism (zebrafish).md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Gene expression analysis data from non-human model organism (zebrafish).docx

üß© Generating DMP for: Human survey data
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  27%|‚ñà‚ñà‚ñã       | 7/26 [08:27<22:24, 70.76s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Human survey data.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Human survey data.docx

üß© Generating DMP for: Clinical Data from Human Research Participants
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  31%|‚ñà‚ñà‚ñà       | 8/26 [09:37<21:09, 70.53s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Clinical Data from Human Research Participants.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Clinical Data from Human Research Participants.docx

üß© Generating DMP for: Human genomic data
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  35%|‚ñà‚ñà‚ñà‚ñç      | 9/26 [10:46<19:51, 70.06s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Human genomic data.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Human genomic data.docx

üß© Generating DMP for: Technology development
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  38%|‚ñà‚ñà‚ñà‚ñä      | 10/26 [12:02<19:08, 71.75s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Technology development.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Technology development.docx

üß© Generating DMP for: Basic Research from a Non-Human Source Example
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 11/26 [13:20<18:25, 73.67s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Basic Research from a Non-Human Source Example.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Basic Research from a Non-Human Source Example.docx

üß© Generating DMP for: Secondary Data Analysis Example
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 12/26 [14:33<17:08, 73.47s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Secondary Data Analysis Example.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Secondary Data Analysis Example.docx

üß© Generating DMP for: Survey and Interview Example
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 13/26 [15:39<15:26, 71.28s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Survey and Interview Example.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Survey and Interview Example.docx

üß© Generating DMP for: Human Clinical Trial Data
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 14/26 [16:44<13:54, 69.52s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Human Clinical Trial Data.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Human Clinical Trial Data.docx

üß© Generating DMP for: Clinical data from human research participants-NIA
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 15/26 [17:53<12:41, 69.27s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Clinical data from human research participants-NIA.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Clinical data from human research participants-NIA.docx

üß© Generating DMP for: Survey, interview, and biological data (tiered access)
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 16/26 [18:55<11:10, 67.05s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Survey, interview, and biological data (tiered access).md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Survey, interview, and biological data (tiered access).docx

üß© Generating DMP for: Non-human data (primates)
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 17/26 [19:57<09:49, 65.44s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Non-human data (primates).md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Non-human data (primates).docx

üß© Generating DMP for: Secondary data analysis-NIA
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 18/26 [21:07<08:55, 66.90s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Secondary data analysis-NIA.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Secondary data analysis-NIA.docx

üß© Generating DMP for: Survey and interview data-NIA
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 19/26 [22:06<07:31, 64.56s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Survey and interview data-NIA.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Survey and interview data-NIA.docx

üß© Generating DMP for: Human clinical and genomic data-NIA
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 20/26 [23:26<06:54, 69.15s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Human clinical and genomic data-NIA.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Human clinical and genomic data-NIA.docx

üß© Generating DMP for: Non-human data (rodents)-NIA
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 21/26 [24:41<05:54, 70.86s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Non-human data (rodents)-NIA.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Non-human data (rodents)-NIA.docx

üß© Generating DMP for: Clinical data (human biospecimens)
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 22/26 [26:01<04:54, 73.61s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Clinical data (human biospecimens).md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Clinical data (human biospecimens).docx

üß© Generating DMP for: Drug discovery including intellectual property
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 23/26 [27:08<03:35, 71.79s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Drug discovery including intellectual property.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Drug discovery including intellectual property.docx

üß© Generating DMP for: HeLa Cell Whole Genome Sequence (DNA or RNA)
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 24/26 [28:11<02:18, 69.08s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\HeLa Cell Whole Genome Sequence (DNA or RNA).md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\HeLa Cell Whole Genome Sequence (DNA or RNA).docx

üß© Generating DMP for: Secondary Data Analysis on Data from Human Subjects-NIA
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 25/26 [29:23<01:09, 69.87s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Secondary Data Analysis on Data from Human Subjects-NIA.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Secondary Data Analysis on Data from Human Subjects-NIA.docx

üß© Generating DMP for: Analysis of social media posts
üîé Retrieved 6 context chunks (using top 6).


üß† Generating NIH DMPs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26/26 [30:27<00:00, 70.30s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\md\Analysis of social media posts.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Analysis of social media posts.docx

‚úÖ Finished generating NIH DMPs ‚Äî titles preserved from Excel!
üìä CSV log saved to: c:\Users\Nahid\dmpchef\data\outputs1\rag_generated_dmp_log.csv
üìÅ MD outputs:   c:\Users\Nahid\dmpchef\data\outputs1\md
üìÅ DOCX outputs: c:\Users\Nahid\dmpchef\data\outputs1\docx





In [18]:
# ============================================
# üß© STEP 6 ‚Äî Full DMP Comparison: Markdown (Generated) vs PDF (Gold, Fuzzy Matching)
#         (ROBUST ROOT DETECTION + outputs1 paths + faster SBERT caching)
# ============================================
import re
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
from difflib import SequenceMatcher
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer

# ---------------------------
# ‚úÖ Robust project root finder
# ---------------------------
def find_project_root(start: Path) -> Path:
    """
    Walk upward from `start` and return the first directory that looks like the project root.
    Heuristics: contains data/ and (README.md or config/).
    """
    cur = start.resolve()
    for _ in range(25):  # safety limit
        data_dir = cur / "data"
        has_data = data_dir.exists() and data_dir.is_dir()
        has_readme = (cur / "README.md").exists()
        has_config = (cur / "config").exists() and (cur / "config").is_dir()
        # Primary: data/ + (README.md or config/)
        if has_data and (has_readme or has_config):
            return cur
        # Secondary: exact folder name match (keep your old behavior too)
        if cur.name.lower() == "dmp-rag":
            return cur
        if cur.parent == cur:
            break
        cur = cur.parent
    return start.resolve()

ROOT_DIR = find_project_root(Path.cwd())
print(f"üìÇ ROOT_DIR set to: {ROOT_DIR}")

# --- Paths (match STEP 5/7 layout) ---
GOLD_DIR      = ROOT_DIR / "data" / "inputs" / "gold_dmps"
GENERATED_DIR = ROOT_DIR / "data" / "outputs1" / "md"
EVAL_DIR      = ROOT_DIR / "data" / "outputs1" / "evaluation_results"
EVAL_DIR.mkdir(parents=True, exist_ok=True)

print(f"üìó Gold PDF folder: {GOLD_DIR}")
print(f"üìò Generated Markdown folder: {GENERATED_DIR}")
print(f"üìô Evaluation output folder: {EVAL_DIR}")

# --- Models ---
print("üöÄ Loading models...")
sbert = SentenceTransformer("all-MiniLM-L6-v2")
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
print("‚úÖ Models ready.")

# --- Helper functions ---
def normalize_name(name: str) -> str:
    name = (name or "").lower()
    name = re.sub(r"[^a-z0-9\s]", " ", name)
    name = re.sub(r"\s+", " ", name).strip()
    return name

def clean_text(text: str) -> str:
    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    text = re.sub(r"```.*?```", "", text, flags=re.DOTALL)
    text = re.sub(r"#+\s*", "", text)
    text = re.sub(r"\*\*|\*", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def extract_text_from_pdf(pdf_path: Path) -> str:
    text = ""
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text("text") + "\n"
    except Exception as e:
        print(f"‚ùå Error reading {pdf_path.name}: {e}")
    return clean_text(text)

def chunk_text(text: str, size_words: int = 300):
    words = text.split()
    if not words:
        return []
    return [" ".join(words[i:i + size_words]) for i in range(0, len(words), size_words)]

def best_fuzzy_match(target: str, gold_names: list[str], threshold: float = 0.6):
    best_match, best_score = None, 0.0
    for g in gold_names:
        score = SequenceMatcher(None, target, g).ratio()
        if score > best_score:
            best_match, best_score = g, score
    return (best_match, best_score) if best_score >= threshold else (None, best_score)

def compare_chunked_cached(gold_text: str, gen_text: str, model: SentenceTransformer):
    gold_chunks = chunk_text(gold_text, size_words=300)
    gen_chunks  = chunk_text(gen_text,  size_words=300)
    if not gold_chunks or not gen_chunks:
        return np.nan, np.nan

    gold_emb = model.encode(gold_chunks, convert_to_tensor=True)
    gen_emb  = model.encode(gen_chunks,  convert_to_tensor=True)

    sbert_scores = []
    for i in range(len(gold_chunks)):
        sims = util.cos_sim(gold_emb[i], gen_emb)[0]
        sbert_scores.append(float(sims.max().item()))

    rouge_scores = []
    for g in gold_chunks:
        best_r = 0.0
        for gen in gen_chunks:
            r = rouge.score(g, gen)["rougeL"].recall
            if r > best_r:
                best_r = r
        rouge_scores.append(best_r)

    return float(np.mean(sbert_scores)), float(np.mean(rouge_scores))

# --- Validate folders with helpful diagnostics ---
if not GOLD_DIR.exists():
    print("\n‚ùå GOLD_DIR not found.")
    print("üîé Debug tips:")
    print(f" - Your current working dir is: {Path.cwd()}")
    print(f" - Detected ROOT_DIR is:       {ROOT_DIR}")
    print(" - Expected gold PDFs under:   data/inputs/gold_dmps/")
    # show likely alternatives for user
    data_inputs = ROOT_DIR / "data" / "inputs"
    if data_inputs.exists():
        print(f"‚úÖ Found data/inputs at: {data_inputs}")
        print("üìÅ Contents of data/inputs/:")
        for p in sorted(data_inputs.iterdir()):
            print("   -", p.name)
    raise FileNotFoundError(f"‚ùå GOLD_DIR not found: {GOLD_DIR}")

if not GENERATED_DIR.exists():
    print("\n‚ùå GENERATED_DIR not found.")
    print("üîé Debug tips:")
    print(f" - Expected generated MDs under: {GENERATED_DIR}")
    raise FileNotFoundError(f"‚ùå GENERATED_DIR not found: {GENERATED_DIR}")

# --- Collect gold PDFs and generated MDs ---
gold_files = {normalize_name(f.stem): f for f in GOLD_DIR.glob("*.pdf")}
gen_files  = {normalize_name(f.stem): f for f in GENERATED_DIR.glob("*.md")}

print(f"üìä Found {len(gen_files)} generated DMPs and {len(gold_files)} gold PDFs.")

# --- Compare all matching files ---
results = []
for name, gen_path in tqdm(gen_files.items(), desc="üîé Matching & Comparing DMPs"):
    best_match, match_score = best_fuzzy_match(name, list(gold_files.keys()), threshold=0.6)
    if not best_match:
        print(f"‚ö†Ô∏è No gold match for: {gen_path.name}")
        continue

    gold_path = gold_files[best_match]
    gold_text = extract_text_from_pdf(gold_path)
    gen_text  = clean_text(gen_path.read_text(encoding="utf-8", errors="ignore"))

    if not gold_text.strip() or not gen_text.strip():
        print(f"‚ö†Ô∏è Skipping empty text pair: {gen_path.name} ‚Üî {gold_path.name}")
        continue

    sbert_sim, rouge_l = compare_chunked_cached(gold_text, gen_text, sbert)

    results.append({
        "Generated_File": gen_path.name,
        "Matched_Gold_PDF": gold_path.name,
        "Match_Score": round(float(match_score), 3),
        "SBERT_Similarity": round(float(sbert_sim), 4) if not np.isnan(sbert_sim) else np.nan,
        "ROUGE_L_Recall": round(float(rouge_l), 4) if not np.isnan(rouge_l) else np.nan,
        "Generated_Path": str(gen_path),
        "Gold_Path": str(gold_path),
    })

    print(f"‚úÖ Matched {gen_path.name} ‚Üî {gold_path.name} (score={match_score:.2f})")

# --- Save results ---
df_results = pd.DataFrame(results)
out_path = EVAL_DIR / "full_dmp_pdf_comparison_fuzzy.csv"
df_results.to_csv(out_path, index=False, encoding="utf-8")

print(f"\n‚úÖ Markdown‚ÄìPDF (fuzzy) similarity results saved to: {out_path}")
print(f"üßæ Total matched DMP pairs: {len(df_results)}")


üìÇ ROOT_DIR set to: C:\Users\Nahid\dmpchef
üìó Gold PDF folder: C:\Users\Nahid\dmpchef\data\inputs\gold_dmps
üìò Generated Markdown folder: C:\Users\Nahid\dmpchef\data\outputs1\md
üìô Evaluation output folder: C:\Users\Nahid\dmpchef\data\outputs1\evaluation_results
üöÄ Loading models...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:00<00:00, 1634.62it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


‚úÖ Models ready.
üìä Found 26 generated DMPs and 26 gold PDFs.


üîé Matching & Comparing DMPs:   4%|‚ñç         | 1/26 [00:00<00:05,  4.50it/s]

‚úÖ Matched Analysis of social media posts.md ‚Üî 26-Analysis of social media posts-NCI.pdf (score=0.90)


üîé Matching & Comparing DMPs:   8%|‚ñä         | 2/26 [00:00<00:05,  4.12it/s]

‚úÖ Matched Basic Research from a Non-Human Source Example.md ‚Üî 11-Basic Research from a Non-Human Source Example-NIDDK.pdf (score=0.91)


üîé Matching & Comparing DMPs:  12%|‚ñà‚ñè        | 3/26 [00:00<00:06,  3.40it/s]

‚úÖ Matched Clinical and MRI data from human research participants.md ‚Üî 1-Clinical andor MRI data from human research participants-NIMH.pdf (score=0.92)


üîé Matching & Comparing DMPs:  15%|‚ñà‚ñå        | 4/26 [00:01<00:06,  3.41it/s]

‚úÖ Matched Clinical data (human biospecimens).md ‚Üî 22-Clinical data (human biospecimens)-NIA.pdf (score=0.90)


üîé Matching & Comparing DMPs:  19%|‚ñà‚ñâ        | 5/26 [00:01<00:05,  3.74it/s]

‚úÖ Matched Clinical data from human research participants-NIA.md ‚Üî 15-Clinical data from human research participants-NIA.pdf (score=0.97)


üîé Matching & Comparing DMPs:  23%|‚ñà‚ñà‚ñé       | 6/26 [00:01<00:05,  3.92it/s]

‚úÖ Matched Clinical Data from Human Research Participants.md ‚Üî 15-Clinical data from human research participants-NIA.pdf (score=0.93)


üîé Matching & Comparing DMPs:  27%|‚ñà‚ñà‚ñã       | 7/26 [00:01<00:04,  3.97it/s]

‚úÖ Matched Drug discovery including intellectual property.md ‚Üî 23-Drug discovery including intellectual property-NIA.pdf (score=0.93)


üîé Matching & Comparing DMPs:  31%|‚ñà‚ñà‚ñà       | 8/26 [00:02<00:04,  3.92it/s]

‚úÖ Matched Gene expression analysis data from non-human model organism (zebrafish).md ‚Üî 8-Gene expression analysis data from non-human model organism (zebrafish)-NICHD.pdf (score=0.95)


üîé Matching & Comparing DMPs:  35%|‚ñà‚ñà‚ñà‚ñç      | 9/26 [00:02<00:04,  3.58it/s]

‚úÖ Matched Genomic data from a non-human source.md ‚Üî 3-Genomic data from a non-human source-NIMH.pdf (score=0.91)


üîé Matching & Comparing DMPs:  38%|‚ñà‚ñà‚ñà‚ñä      | 10/26 [00:02<00:04,  3.48it/s]

‚úÖ Matched Genomic data from human research participants.md ‚Üî 2-Genomic data from human research participants-NIMH.pdf (score=0.93)


üîé Matching & Comparing DMPs:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 11/26 [00:02<00:03,  3.78it/s]

‚úÖ Matched HeLa Cell Whole Genome Sequence (DNA or RNA).md ‚Üî 24-HeLa Cell Whole Genome Sequence (DNA or RNA)-OD, NHGRI.pdf (score=0.88)


üîé Matching & Comparing DMPs:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 12/26 [00:03<00:03,  3.61it/s]

‚úÖ Matched Human clinical and genomic data-NIA.md ‚Üî 20-Human clinical and genomic data-NIA.pdf (score=0.96)


üîé Matching & Comparing DMPs:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 13/26 [00:03<00:03,  3.46it/s]

‚úÖ Matched Human clinical and genomics data.md ‚Üî 7-Human clinical and genomics data-NICHD.pdf (score=0.89)


üîé Matching & Comparing DMPs:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 14/26 [00:03<00:03,  3.42it/s]

‚úÖ Matched Human Clinical Trial Data.md ‚Üî 14-Human Clinical Trial Data-NICHD.pdf (score=0.85)


üîé Matching & Comparing DMPs:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 15/26 [00:04<00:03,  3.40it/s]

‚úÖ Matched Human genomic data.md ‚Üî 5-Human genomic data-NHGRI.pdf (score=0.82)


üîé Matching & Comparing DMPs:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 16/26 [00:04<00:02,  3.58it/s]

‚úÖ Matched Human survey data.md ‚Üî 9-Human survey data-NICHD.pdf (score=0.81)
‚úÖ Matched Non-human data (primates).md ‚Üî 17-Non-human data (primates)-NIA.pdf (score=0.87)


üîé Matching & Comparing DMPs:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 18/26 [00:04<00:02,  3.82it/s]

‚úÖ Matched Non-human data (rodents)-NIA.md ‚Üî 21-Non-human data (rodents)-NIA.pdf (score=0.95)
‚úÖ Matched Secondary Data Analysis Example.md ‚Üî 12-Secondary Data Analysis Example-NIDDK.pdf (score=0.87)


üîé Matching & Comparing DMPs:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 20/26 [00:05<00:01,  4.00it/s]

‚úÖ Matched Secondary Data Analysis on Data from Human Subjects-NIA.md ‚Üî 25-Secondary Data Analysis on Data from Human Subjects-NIA.pdf (score=0.97)


üîé Matching & Comparing DMPs:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 21/26 [00:05<00:01,  4.05it/s]

‚úÖ Matched Secondary data analysis-NIA.md ‚Üî 18-Secondary data analysis-NIA.pdf (score=0.95)


üîé Matching & Comparing DMPs:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 22/26 [00:05<00:00,  4.21it/s]

‚úÖ Matched Secondary data analysis.md ‚Üî 18-Secondary data analysis-NIA.pdf (score=0.87)


üîé Matching & Comparing DMPs:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 23/26 [00:06<00:00,  4.35it/s]

‚úÖ Matched Survey and interview data-NIA.md ‚Üî 19-Survey and interview data-NIA.pdf (score=0.95)


üîé Matching & Comparing DMPs:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 25/26 [00:06<00:00,  4.19it/s]

‚úÖ Matched Survey and Interview Example.md ‚Üî 13-Survey and Interview Example-NHGRI.pdf (score=0.86)
‚úÖ Matched Survey, interview, and biological data (tiered access).md ‚Üî 16-Survey, interview, and biological data (tiered access)-NIA.pdf (score=0.93)


üîé Matching & Comparing DMPs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26/26 [00:06<00:00,  3.83it/s]

‚úÖ Matched Technology development.md ‚Üî 6-Technology development-NHGRI.pdf (score=0.85)

‚úÖ Markdown‚ÄìPDF (fuzzy) similarity results saved to: C:\Users\Nahid\dmpchef\data\outputs1\evaluation_results\full_dmp_pdf_comparison_fuzzy.csv
üßæ Total matched DMP pairs: 26





In [20]:
# ============================================
# üß© STEP 7 ‚Äî Element-Level Comparison with NIH Gold Standard (Exact Title Match)
#        (ROBUST ROOT DETECTION + outputs1 paths + faster embedding cache)
# ============================================
import re
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer

# ---------------------------
# ‚úÖ Robust project root finder
# ---------------------------
def find_project_root(start: Path) -> Path:
    """
    Walk upward from `start` and return the first directory that looks like the project root.
    Heuristics: contains data/ and (README.md or config/).
    Also accepts folder name 'DMP-RAG' if present.
    """
    cur = start.resolve()
    for _ in range(25):
        data_dir = cur / "data"
        has_data = data_dir.exists() and data_dir.is_dir()
        has_readme = (cur / "README.md").exists()
        has_config = (cur / "config").exists() and (cur / "config").is_dir()

        if has_data and (has_readme or has_config):
            return cur
        if cur.name.lower() == "dmp-rag":
            return cur

        if cur.parent == cur:
            break
        cur = cur.parent
    return start.resolve()

ROOT_DIR = find_project_root(Path.cwd())
print(f"üìÇ ROOT_DIR set to: {ROOT_DIR}")
print(f"üìç CWD is: {Path.cwd()}")

# --- Paths (match STEP 5/6/8 layout) ---
GOLD_PATH     = ROOT_DIR / "data" / "inputs" / "inputs.xlsx"
GENERATED_DIR = ROOT_DIR / "data" / "outputs1" / "md"
EVAL_DIR      = ROOT_DIR / "data" / "outputs1" / "evaluation_results"
EVAL_DIR.mkdir(parents=True, exist_ok=True)

print(f"üìó Gold Excel: {GOLD_PATH}")
print(f"üìò Generated MD folder: {GENERATED_DIR}")
print(f"üìô Evaluation folder: {EVAL_DIR}")

# --- Validate paths with helpful diagnostics ---
if not GOLD_PATH.exists():
    print("\n‚ùå GOLD_PATH not found.")
    print("üîé Debug tips:")
    print(" - Make sure your notebook is running inside the DMP-RAG project folder.")
    print(" - Or update the root detection markers (README.md/config/) to match your repo.")
    print(f" - Detected ROOT_DIR: {ROOT_DIR}")
    inputs_dir = ROOT_DIR / "data" / "inputs"
    if inputs_dir.exists():
        print(f"‚úÖ Found data/inputs at: {inputs_dir}")
        print("üìÅ Contents of data/inputs/:")
        for p in sorted(inputs_dir.iterdir()):
            print("   -", p.name)
    raise FileNotFoundError(f"‚ùå Gold Excel not found: {GOLD_PATH}")

if not GENERATED_DIR.exists():
    raise FileNotFoundError(f"‚ùå Generated markdown folder not found: {GENERATED_DIR}")

# --- Load gold reference (Excel) ---
df_gold = pd.read_excel(GOLD_PATH)
df_gold.columns = df_gold.columns.str.strip().str.lower()
df_gold = df_gold.fillna("").astype(str)

def normalize_title(name: str) -> str:
    name = (name or "").lower()
    name = re.sub(r"[^a-z0-9\s]", " ", name)
    name = re.sub(r"\s+", " ", name).strip()
    return name

if "title" not in df_gold.columns:
    raise ValueError("‚ùå Excel must contain a 'title' column (case-insensitive).")

df_gold["title_norm"] = df_gold["title"].apply(normalize_title)
print(f"‚úÖ Loaded {len(df_gold)} gold projects.")

# --- Detect gold element columns automatically ---
element_cols = [c for c in df_gold.columns if c.startswith("element")]
if not element_cols:
    raise ValueError("‚ùå No element columns found in Excel. Expected columns starting with 'element'.")
print(f"üß© Detected {len(element_cols)} element columns: {element_cols}")

# --- Models ---
print("üöÄ Loading evaluation models...")
sbert = SentenceTransformer("all-MiniLM-L6-v2")
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
print("‚úÖ Models ready.")

# --- Markdown parsing helpers ---
def is_title(line: str) -> bool:
    s = line.strip()
    if s.startswith("#"):
        return True
    if re.match(r"^\s*\d+\.?\s*\*\*.*\*\*\s*$", s):
        return True
    if re.match(r"^\s*\*\*.*\*\*\s*$", s):
        return True
    return False

def strip_think_blocks(text: str) -> str:
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)

def extract_sections(md_path: Path) -> pd.DataFrame:
    text = md_path.read_text(encoding="utf-8", errors="ignore")
    text = strip_think_blocks(text)

    lines = text.splitlines()
    entries, current_title, buf = [], None, []

    for ln in lines:
        if is_title(ln):
            if current_title and any(x.strip() for x in buf):
                entries.append({
                    "Section Title": current_title.strip(),
                    "Generated Content": "\n".join(buf).strip()
                })
            current_title, buf = ln, []
        else:
            buf.append(ln)

    if current_title and any(x.strip() for x in buf):
        entries.append({
            "Section Title": current_title.strip(),
            "Generated Content": "\n".join(buf).strip()
        })

    return pd.DataFrame(entries)

def clean_text_minimal(text: str) -> str:
    text = strip_think_blocks(text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# --- Compare (exact title match on normalized) ---
md_files = sorted(GENERATED_DIR.glob("*.md"))
print(f"üîç Found {len(md_files)} generated Markdown files.")

results = []

for md_file in tqdm(md_files, desc="üìä Comparing element-level"):
    gen_title_norm = normalize_title(md_file.stem)

    gold_row_df = df_gold[df_gold["title_norm"] == gen_title_norm]
    if gold_row_df.empty:
        print(f"‚ö†Ô∏è No gold match for file: {md_file.name}")
        continue

    gold_row = gold_row_df.iloc[0]
    gold_title = gold_row["title"]

    # Gold element texts (non-empty only)
    gold_texts = {}
    for c in element_cols:
        val = str(gold_row.get(c, "")).strip()
        if val:
            gold_texts[c] = clean_text_minimal(val)

    if not gold_texts:
        print(f"‚ö†Ô∏è Empty gold elements for: {gold_title}")
        continue

    # Extract generated sections
    gen_df = extract_sections(md_file)
    if gen_df.empty:
        print(f"‚ö†Ô∏è No sections extracted from: {md_file.name}")
        continue

    gen_df["Generated Content"] = gen_df["Generated Content"].astype(str).apply(clean_text_minimal)
    gen_df = gen_df[gen_df["Generated Content"].str.len() > 0].reset_index(drop=True)
    if gen_df.empty:
        print(f"‚ö†Ô∏è All extracted sections empty after cleaning: {md_file.name}")
        continue

    # --- SBERT caching for speed ---
    gen_text_list = gen_df["Generated Content"].tolist()
    gen_emb = sbert.encode(gen_text_list, convert_to_tensor=True)

    for element_name, gold_text in gold_texts.items():
        emb_gold = sbert.encode(gold_text, convert_to_tensor=True)

        sims = util.cos_sim(emb_gold, gen_emb)[0]
        best_idx = int(sims.argmax().item())
        best_sbert = float(sims[best_idx].item())

        best_section_title = str(gen_df.loc[best_idx, "Section Title"]).strip()
        best_gen_text = str(gen_df.loc[best_idx, "Generated Content"]).strip()
        best_rouge = float(rouge.score(gold_text, best_gen_text)["rougeL"].recall)

        results.append({
            "Gold Project": gold_title,
            "Gold Element": element_name,
            "Generated File": md_file.name,
            "Generated Section Title": best_section_title,
            "SBERT_Similarity": round(best_sbert, 4),
            "ROUGE_L_Recall": round(best_rouge, 4),
        })

# --- Save ---
df_results = pd.DataFrame(results)
out_path = EVAL_DIR / "element_similarity_exact_titles.csv"
df_results.to_csv(out_path, index=False, encoding="utf-8")

print(f"\n‚úÖ Element-level similarity saved to: {out_path}")
print(f"üßæ Total element‚Äìsection best matches: {len(df_results)}")


üìÇ ROOT_DIR set to: C:\Users\Nahid\dmpchef
üìç CWD is: c:\Users\Nahid\dmpchef\notebook_DMP_RAG
üìó Gold Excel: C:\Users\Nahid\dmpchef\data\inputs\inputs.xlsx
üìò Generated MD folder: C:\Users\Nahid\dmpchef\data\outputs1\md
üìô Evaluation folder: C:\Users\Nahid\dmpchef\data\outputs1\evaluation_results
‚úÖ Loaded 26 gold projects.
üß© Detected 12 element columns: ['element_1a', 'element_1b', 'element_1c', 'element_2', 'element_3', 'element_4a', 'element_4b', 'element_4c', 'element_5a', 'element_5b', 'element_5c', 'element_6']
üöÄ Loading evaluation models...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:00<00:00, 1633.93it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


‚úÖ Models ready.
üîç Found 26 generated Markdown files.


üìä Comparing element-level: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26/26 [00:04<00:00,  5.84it/s]


‚úÖ Element-level similarity saved to: C:\Users\Nahid\dmpchef\data\outputs1\evaluation_results\element_similarity_exact_titles.csv
üßæ Total element‚Äìsection best matches: 312





In [22]:
# ============================================
# üßÆ STEP 8 ‚Äî Summarize Evaluation Results (ROBUST ROOT DETECTION + outputs1/)
# ============================================
import pandas as pd
import numpy as np
from pathlib import Path

# ---------------------------
# ‚úÖ Robust project root finder
# ---------------------------
def find_project_root(start: Path) -> Path:
    """
    Walk upward from `start` and return the first directory that looks like the project root.
    Heuristics: contains data/ and (README.md or config/).
    Also accepts folder name 'DMP-RAG' if present.
    """
    cur = start.resolve()
    for _ in range(25):
        data_dir = cur / "data"
        has_data = data_dir.exists() and data_dir.is_dir()
        has_readme = (cur / "README.md").exists()
        has_config = (cur / "config").exists() and (cur / "config").is_dir()

        if has_data and (has_readme or has_config):
            return cur
        if cur.name.lower() == "dmp-rag":
            return cur

        if cur.parent == cur:
            break
        cur = cur.parent
    return start.resolve()

ROOT_DIR = find_project_root(Path.cwd())
print(f"üìÇ ROOT_DIR set to: {ROOT_DIR}")
print(f"üìç CWD is: {Path.cwd()}")

# --- Evaluation directory (outputs1) ---
EVAL_DIR = ROOT_DIR / "data" / "outputs1" / "evaluation_results"
print(f"üìô EVAL_DIR: {EVAL_DIR}")

# --- Input CSVs ---
full_path = EVAL_DIR / "full_dmp_pdf_comparison_fuzzy.csv"
elem_path = EVAL_DIR / "element_similarity_exact_titles.csv"

# --- Diagnostics if missing ---
missing = []
if not full_path.exists():
    missing.append(full_path.name)
if not elem_path.exists():
    missing.append(elem_path.name)

if missing:
    print("\n‚ùå Missing required evaluation files:", ", ".join(missing))
    print("üîé Debug tips:")
    print(" - Make sure you ran STEP 6 and STEP 7 successfully first.")
    print(f" - Expected files under: {EVAL_DIR}")

    if EVAL_DIR.exists():
        print("üìÅ Current contents of EVAL_DIR:")
        for p in sorted(EVAL_DIR.iterdir()):
            print("   -", p.name)
    else:
        print("‚ùå EVAL_DIR itself does not exist yet.")

    raise FileNotFoundError(f"‚ùå Missing files in {EVAL_DIR}: {missing}")

# --- Load CSVs ---
df_full = pd.read_csv(full_path)
df_elem = pd.read_csv(elem_path)

print(f"\n‚úÖ Loaded full-document ({len(df_full)} rows) from: {full_path.name}")
print(f"‚úÖ Loaded element-level ({len(df_elem)} rows) from: {elem_path.name}\n")

# ============================================================
# üß© 1Ô∏è‚É£ FULL-DOCUMENT LEVEL SUMMARY (Mean Only, by Generated_File)
# ============================================================

# Identify the project/file column
if "Generated_File" in df_full.columns:
    project_col = "Generated_File"
else:
    project_col = next(
        (c for c in df_full.columns if "generated" in c.lower() and "file" in c.lower()),
        None
    ) or next(
        (c for c in df_full.columns if "file" in c.lower() or "title" in c.lower() or "project" in c.lower()),
        df_full.columns[0]
    )

# Identify numeric similarity columns
sbert_col = next((c for c in df_full.columns if "sbert" in c.lower()), None)
rouge_col = next((c for c in df_full.columns if "rouge" in c.lower()), None)

if sbert_col is None or rouge_col is None:
    raise ValueError(f"‚ùå Could not find SBERT/ROUGE columns in df_full. Columns: {list(df_full.columns)}")

df_full_summary = (
    df_full.groupby(project_col)[[sbert_col, rouge_col]]
    .mean()
    .reset_index()
)

df_full_summary["SBERT"] = df_full_summary[sbert_col].astype(float).apply(lambda x: f"{x:.2f}")
df_full_summary["ROUGE"] = df_full_summary[rouge_col].astype(float).apply(lambda x: f"{x:.2f}")

df_full_table = df_full_summary[[project_col, "SBERT", "ROUGE"]].rename(
    columns={project_col: "Generated_File"}
)

print("üìä Full-document summary table (Mean only, by Generated_File):")
display(df_full_table)

# ============================================================
# üß© 2Ô∏è‚É£ ELEMENT-LEVEL SUMMARY (Mean ¬± SD)
# ============================================================

elem_col = (
    "Gold Element" if "Gold Element" in df_elem.columns
    else next((c for c in df_elem.columns if "element" in c.lower()), df_elem.columns[0])
)

sbert_col_e = next((c for c in df_elem.columns if "sbert" in c.lower()), None)
rouge_col_e = next((c for c in df_elem.columns if "rouge" in c.lower()), None)

if sbert_col_e is None or rouge_col_e is None:
    raise ValueError(f"‚ùå Could not find SBERT/ROUGE columns in df_elem. Columns: {list(df_elem.columns)}")

df_elem_summary = (
    df_elem.groupby(elem_col)[[sbert_col_e, rouge_col_e]]
    .agg(["mean", "std"])
    .reset_index()
)

# Flatten columns
df_elem_summary.columns = [elem_col, "SBERT_Mean", "SBERT_SD", "ROUGE_Mean", "ROUGE_SD"]

df_elem_summary["SBERT"] = df_elem_summary.apply(
    lambda r: f"{r['SBERT_Mean']:.2f} ¬± {r['SBERT_SD']:.2f}", axis=1
)
df_elem_summary["ROUGE"] = df_elem_summary.apply(
    lambda r: f"{r['ROUGE_Mean']:.2f} ¬± {r['ROUGE_SD']:.2f}", axis=1
)

df_elem_table = df_elem_summary[[elem_col, "SBERT", "ROUGE"]].rename(columns={elem_col: "Element"})

print("\nüìä Element-level summary table (Mean ¬± SD):")
display(df_elem_table)

# ============================================================
# üíæ Save formatted tables
# ============================================================
out_full = EVAL_DIR / "summary_full_table_mean_only.csv"
out_elem = EVAL_DIR / "summary_element_table_mean_sd.csv"

df_full_table.to_csv(out_full, index=False)
df_elem_table.to_csv(out_elem, index=False)

print(f"\nüíæ Saved formatted tables ‚Üí\n‚Ä¢ {out_full}\n‚Ä¢ {out_elem}")


üìÇ ROOT_DIR set to: C:\Users\Nahid\dmpchef
üìç CWD is: c:\Users\Nahid\dmpchef\notebook_DMP_RAG
üìô EVAL_DIR: C:\Users\Nahid\dmpchef\data\outputs1\evaluation_results

‚úÖ Loaded full-document (26 rows) from: full_dmp_pdf_comparison_fuzzy.csv
‚úÖ Loaded element-level (312 rows) from: element_similarity_exact_titles.csv

üìä Full-document summary table (Mean only, by Generated_File):


Unnamed: 0,Generated_File,SBERT,ROUGE
0,Analysis of social media posts.md,0.77,0.38
1,Basic Research from a Non-Human Source Example.md,0.83,0.44
2,Clinical Data from Human Research Participants.md,0.71,0.26
3,Clinical and MRI data from human research part...,0.71,0.28
4,Clinical data (human biospecimens).md,0.82,0.4
5,Clinical data from human research participants...,0.76,0.31
6,Drug discovery including intellectual property.md,0.8,0.37
7,Gene expression analysis data from non-human m...,0.74,0.35
8,Genomic data from a non-human source.md,0.71,0.33
9,Genomic data from human research participants.md,0.72,0.28



üìä Element-level summary table (Mean ¬± SD):


Unnamed: 0,Element,SBERT,ROUGE
0,element_1a,0.80 ¬± 0.14,0.48 ¬± 0.29
1,element_1b,0.73 ¬± 0.11,0.43 ¬± 0.25
2,element_1c,0.77 ¬± 0.09,0.49 ¬± 0.28
3,element_2,0.80 ¬± 0.11,0.47 ¬± 0.24
4,element_3,0.78 ¬± 0.13,0.46 ¬± 0.27
5,element_4a,0.79 ¬± 0.12,0.55 ¬± 0.23
6,element_4b,0.79 ¬± 0.10,0.49 ¬± 0.24
7,element_4c,0.83 ¬± 0.08,0.51 ¬± 0.23
8,element_5a,0.76 ¬± 0.13,0.46 ¬± 0.26
9,element_5b,0.76 ¬± 0.09,0.41 ¬± 0.20



üíæ Saved formatted tables ‚Üí
‚Ä¢ C:\Users\Nahid\dmpchef\data\outputs1\evaluation_results\summary_full_table_mean_only.csv
‚Ä¢ C:\Users\Nahid\dmpchef\data\outputs1\evaluation_results\summary_element_table_mean_sd.csv
