In [None]:
# ============================================
# STEP 1 — Imports, Config, and Helpers
# ============================================
import os, re, time
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import pypandoc  # for Markdown ‚Üí DOCX

# --- LangChain Core ---
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

load_dotenv()

# ---------- Paths (works in notebook or script) ----------
# Project root: two levels above this file when run as a script; the parent of the
# working directory when run in a notebook, where __file__ is not defined.
if "__file__" in globals():
    ROOT_DIR = Path(__file__).resolve().parents[1]
else:
    ROOT_DIR = Path.cwd().parent

# --- Input locations ---
# DATA_PDFS is deliberately left undefined in this cell (it used to point at
# ROOT_DIR / "data" / "data_ingestion" / "NIH_all"); Step 2 reads DATA_PDFS.
INDEX_DIR = ROOT_DIR.joinpath("data", "vector_db", "NIH_all_db")
EXCEL_PATH = ROOT_DIR.joinpath("data", "inputs", "inputs.xlsx")
TEMPLATE_MD = ROOT_DIR.joinpath("data", "inputs", "dmp-template.md")

# --- Output locations ---
OUTPUT_MD = ROOT_DIR.joinpath("data", "outputs7", "markdown")
OUTPUT_DOCX = ROOT_DIR.joinpath("data", "outputs7", "docx")

# --- Model names and retrieval depth ---
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL = "llama3.3"
TOP_K = 6

# ---------- Helper functions ----------
def create_folder(folderpath):
    """Make sure ``folderpath`` exists as a directory, creating missing parents."""
    target = Path(folderpath)
    target.mkdir(exist_ok=True, parents=True)

def save_md(folderpath, filename, text):
    """Write ``text`` as UTF-8 to ``folderpath/filename`` and print the saved path."""
    create_folder(folderpath)
    destination = Path(folderpath).joinpath(filename)
    with destination.open("w", encoding="utf-8") as handle:
        handle.write(text)
    print("üíæ Saved:", destination)

def md_to_docs(md_filepath, docx_folderpath, docx_filename):
    """Convert the Markdown file at ``md_filepath`` to DOCX inside ``docx_folderpath``."""
    create_folder(docx_folderpath)
    docx_target = Path(docx_folderpath).joinpath(docx_filename)
    pypandoc.convert_file(str(md_filepath), "docx", outputfile=str(docx_target))
    print("üìÑ Converted:", docx_target)

def clean_filename(name: str) -> str:
    """Return ``name`` as text with Windows-reserved characters swapped for ``_``.

    The result is stripped of leading and trailing whitespace.
    """
    reserved = re.compile(r'[\\/*?:"<>|]')
    return reserved.sub("_", str(name)).strip()



# ---------- Sanity print ----------
# Echo the resolved configuration so a wrong project root or model name is
# visible before any later step runs.
print(" STEP 1 ready")
print(f"ROOT_DIR   : {ROOT_DIR}")
print(f"INDEX_DIR  : {INDEX_DIR}")
print(f"EXCEL_PATH : {EXCEL_PATH}")
print(f"TEMPLATE_MD: {TEMPLATE_MD}")
print(f"OUTPUT_MD  : {OUTPUT_MD}")
print(f"OUTPUT_DOCX: {OUTPUT_DOCX}")
print(f"EMBED_MODEL: {EMBED_MODEL} | LLM_MODEL: {LLM_MODEL} | TOP_K: {TOP_K}")


  from .autonotebook import tqdm as notebook_tqdm


‚úÖ STEP 1 ready
ROOT_DIR   : c:\Users\Nahid\dmpchef
DATA_PDFS  : c:\Users\Nahid\dmpchef\data\NIH_95
INDEX_DIR  : c:\Users\Nahid\dmpchef\data\faiss_index
EXCEL_PATH : c:\Users\Nahid\dmpchef\data\inputs\inputs.xlsx
TEMPLATE_MD: c:\Users\Nahid\dmpchef\data\inputs\dmp-template.md
OUTPUT_MD  : c:\Users\Nahid\dmpchef\data\outputs7\markdown
OUTPUT_DOCX: c:\Users\Nahid\dmpchef\data\outputs7\docx
EMBED_MODEL: sentence-transformers/all-MiniLM-L6-v2 | LLM_MODEL: llama3.3 | TOP_K: 6


In [None]:
# ============================================
# STEP 1 — Imports, Config (YAML), and Helpers
# ============================================
import os, re, time
from pathlib import Path

import yaml
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import pypandoc  # for Markdown ‚Üí DOCX

# --- LangChain Core ---
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

load_dotenv()

# ---------- Resolve project root (works in notebook or script) ----------
# Two levels above this file when run as a script; the parent of the working
# directory when run in a notebook, where __file__ is not defined.
if "__file__" in globals():
    PROJECT_ROOT = Path(__file__).resolve().parents[1]
else:
    PROJECT_ROOT = Path.cwd().parent


# ---------- YAML loader + path resolver ----------
def load_yaml_config(cfg_path: Path) -> dict:
    """Read a YAML file and return its top-level mapping.

    Args:
        cfg_path: Location of the YAML file.

    Returns:
        The parsed document.

    Raises:
        FileNotFoundError: If ``cfg_path`` does not exist.
        ValueError: If the parsed document is not a ``dict``.
    """
    cfg_path = Path(cfg_path)
    if not cfg_path.exists():
        raise FileNotFoundError(f"Config YAML not found: {cfg_path}")

    with cfg_path.open("r", encoding="utf-8") as stream:
        cfg = yaml.safe_load(stream)

    if isinstance(cfg, dict):
        return cfg
    raise ValueError(f"Config YAML must parse to a dict. Got: {type(cfg)}")


def resolve_from_root(project_root: Path, root_dir_value: str | Path) -> Path:
    """
    YAML root_dir:
      - "." means project root
      - relative paths are relative to project root
      - absolute paths are used as-is
    """
    p = Path(root_dir_value).expanduser()
    if p.is_absolute():
        return p.resolve()
    return (project_root / p).resolve()


def resolve_path(base: Path, rel_or_abs: str | Path | None) -> Path | None:
    """Resolve a path relative to `base` if not absolute. Keep None as None."""
    if rel_or_abs is None:
        return None
    p = Path(rel_or_abs).expanduser()
    if p.is_absolute():
        return p.resolve()
    return (base / p).resolve()


# ---------- Choose your YAML file here ----------
# Location of the YAML config, relative to the project root detected above.
CONFIG_YAML = PROJECT_ROOT / "config" / "config.yaml"  # change if your file name differs
cfg = load_yaml_config(CONFIG_YAML)

# ---------- Root dir from YAML ----------
# Every path below is resolved against this directory, not against PROJECT_ROOT.
ROOT_DIR = resolve_from_root(PROJECT_ROOT, cfg["root_dir"])

# ---------- Paths from YAML (direct access) ----------
# Direct indexing means a missing key raises KeyError here rather than later.
DATA_PDFS   = resolve_path(ROOT_DIR, cfg["paths"]["data_pdfs"])        # optional in your pipeline, but required here
INDEX_DIR   = resolve_path(ROOT_DIR, cfg["paths"]["index_dir"])
EXCEL_PATH  = resolve_path(ROOT_DIR, cfg["paths"]["excel_path"])
OUTPUT_MD   = resolve_path(ROOT_DIR, cfg["paths"]["output_md"])
OUTPUT_DOCX = resolve_path(ROOT_DIR, cfg["paths"]["output_docx"])

# If you have template_md in YAML, use it. Otherwise keep your existing default file.
TEMPLATE_MD = resolve_path(
    ROOT_DIR,
    cfg["paths"].get("template_md", "data/inputs/dmp-template.md")
)

# ---------- RAG params from YAML (direct access) ----------
TOP_K = int(cfg["rag"]["retriever_top_k"])

# ---------- Models from YAML (direct access) ----------
EMBED_MODEL = cfg["models"]["embedding_model"]
LLM_MODEL   = cfg["models"]["llm_name"]

# NOTE(review): bool() of any non-empty string is True, so a quoted "false" in the
# YAML would be read as True; these assume the YAML holds real booleans.
# NOTE(review): the Step 3 cell builds HuggingFaceEmbeddings from EMBED_MODEL only,
# so the device / batch / normalize / cache values below are read but not applied there.
EMBED_DEVICE       = cfg["models"]["embedding_device"]
EMBED_BATCH_SIZE   = int(cfg["models"]["embedding_batch_size"])
NORMALIZE_EMBEDS   = bool(cfg["models"]["normalize_embeddings"])
HF_CACHE_DIR       = resolve_path(ROOT_DIR, cfg["models"]["hf_cache_dir"])
LOCAL_FILES_ONLY   = bool(cfg["models"]["local_files_only"])
ALLOW_DL_IF_MISS   = bool(cfg["models"]["allow_download_if_missing"])


# ---------- Helper functions ----------
def create_folder(folderpath: Path | str) -> None:
    Path(folderpath).mkdir(parents=True, exist_ok=True)

def save_md(folderpath: Path | str, filename: str, text: str) -> Path:
    """Write ``text`` as UTF-8 to ``folderpath/filename`` and return the written path."""
    create_folder(folderpath)
    out_path = Path(folderpath).joinpath(filename)
    with out_path.open("w", encoding="utf-8") as handle:
        handle.write(text)
    print("Saved:", out_path)
    return out_path

def md_to_docs(md_filepath: Path | str, docx_folderpath: Path | str, docx_filename: str) -> Path:
    """Convert a Markdown file to DOCX via pypandoc and return the DOCX path."""
    create_folder(docx_folderpath)
    out_path = Path(docx_folderpath).joinpath(docx_filename)
    source, target = str(md_filepath), str(out_path)
    pypandoc.convert_file(source, "docx", outputfile=target)
    print("Converted:", out_path)
    return out_path

def clean_filename(name: str) -> str:
    """Replace characters Windows forbids in filenames with ``_`` and trim whitespace."""
    as_text = str(name)
    replaced = re.sub(r'[\\/*?:"<>|]', "_", as_text)
    return replaced.strip()


# ---------- Sanity print ----------
# Echo every value derived from the YAML so a misconfigured path or model name
# is visible before the later steps run.
print("STEP 1 ready (YAML-driven, direct access)")
print(f"CONFIG_YAML : {CONFIG_YAML}")
print(f"PROJECT_ROOT: {PROJECT_ROOT}")
print(f"ROOT_DIR    : {ROOT_DIR}")
print(f"DATA_PDFS   : {DATA_PDFS}")
print(f"INDEX_DIR   : {INDEX_DIR}")
print(f"EXCEL_PATH  : {EXCEL_PATH}")
print(f"TEMPLATE_MD : {TEMPLATE_MD}")
print(f"OUTPUT_MD   : {OUTPUT_MD}")
print(f"OUTPUT_DOCX : {OUTPUT_DOCX}")
print(f"EMBED_MODEL : {EMBED_MODEL}")
print(f"LLM_MODEL   : {LLM_MODEL}")
print(f"TOP_K       : {TOP_K}")
print(f"EMBED_DEVICE: {EMBED_DEVICE} | BATCH: {EMBED_BATCH_SIZE} | NORMALIZE: {NORMALIZE_EMBEDS}")
print(f"HF_CACHE_DIR: {HF_CACHE_DIR} | local_files_only={LOCAL_FILES_ONLY} | allow_download_if_missing={ALLOW_DL_IF_MISS}")

In [3]:
# ============================================
# STEP 2 — Load PDFs and Split into Text Chunks
# ============================================
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


def load_pdfs_from_folder(folder: Path):
    """Load every ``*.pdf`` directly inside ``folder`` with ``PyPDFLoader``.

    Args:
        folder: Directory to scan (not recursive). A ``str`` is accepted as
            well as a ``Path``.

    Returns:
        list: The concatenated results of ``PyPDFLoader(...).load()`` for each
        PDF that loaded. PDFs that raise while loading are reported and skipped.

    Raises:
        FileNotFoundError: If ``folder`` is not an existing directory, or it
            contains no ``*.pdf`` files.
    """
    # Accept plain strings too; the checks below need Path methods.
    folder = Path(folder)
    # is_dir() rather than exists(): a path that points at a file is not a usable folder.
    if not folder.is_dir():
        raise FileNotFoundError(f"‚ùå Folder not found: {folder}")
    pdf_files = sorted(folder.glob("*.pdf"))
    if not pdf_files:
        raise FileNotFoundError(f"‚ö†Ô∏è No PDF files found in {folder}")

    docs = []
    skipped = 0
    for pdf_path in tqdm(pdf_files, desc="üì• Loading PDFs"):
        try:
            docs.extend(PyPDFLoader(str(pdf_path)).load())
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole ingestion.
            skipped += 1
            print(f"‚ö†Ô∏è Skipped {pdf_path.name}: {e}")

    # Report only the PDFs that actually loaded, so skipped files are not counted.
    print(f"‚úÖ Loaded {len(docs)} pages from {len(pdf_files) - skipped} PDFs.")
    return docs


def split_into_chunks(docs, chunk_size=800, chunk_overlap=120):
    """Cut ``docs`` into overlapping chunks with ``RecursiveCharacterTextSplitter``.

    ``chunk_size`` and ``chunk_overlap`` are forwarded to the splitter unchanged.
    Returns whatever ``split_documents`` produces.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    chunks = RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)
    print(f"‚úÖ Created {len(chunks)} chunks from {len(docs)} pages.")
    return chunks


# --- Run quick test ---
# DATA_PDFS comes from the YAML-driven Step 1 cell; the non-YAML Step 1 cell
# does not define it.
# NOTE(review): this loads and splits every PDF even when Step 3 then finds a
# saved FAISS index and never uses `chunks`.
raw_docs = load_pdfs_from_folder(DATA_PDFS)
chunks = split_into_chunks(raw_docs)


üì• Loading PDFs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 105/105 [00:22<00:00,  4.59it/s]


‚úÖ Loaded 586 pages from 105 PDFs.
‚úÖ Created 2016 chunks from 586 pages.


In [4]:
# Environment check: record which sentence-transformers version is installed.
import sentence_transformers
print("‚úÖ sentence-transformers version:", sentence_transformers.__version__)


‚úÖ sentence-transformers version: 5.2.2


In [5]:
# ============================================
# STEP 3 — Build or Load FAISS Index
# ============================================
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

import time

# --- Initialize embedding model ---
# Module-level object; build_or_load_faiss_index() below reads it as a global.
# NOTE(review): only the model name is passed, so EMBED_DEVICE, EMBED_BATCH_SIZE,
# NORMALIZE_EMBEDS and HF_CACHE_DIR from the YAML config are not applied here — confirm intent.
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

def build_or_load_faiss_index(index_dir=INDEX_DIR, chunks=None):
    """Load the FAISS index saved in ``index_dir``, or build and save a new one.

    Uses the module-level ``embeddings`` object for both loading and building.

    Args:
        index_dir: Folder holding ``index.faiss`` and ``index.pkl``. A ``str``
            is accepted as well as a ``Path``.
        chunks: Documents to embed when no saved index exists. Ignored when an
            index is found on disk.

    Returns:
        The loaded or newly built FAISS vector store.

    Raises:
        RuntimeError: If no saved index exists and ``chunks`` is ``None`` or empty.
    """
    # Accept plain strings too; the "/" joins below need a Path.
    index_dir = Path(index_dir)
    faiss_path = index_dir / "index.faiss"
    pkl_path   = index_dir / "index.pkl"

    # --- If index exists, load it ---
    if faiss_path.exists() and pkl_path.exists():
        print("üì¶ Existing FAISS index found. Loading from disk...")
        # SECURITY: allow_dangerous_deserialization opts in to loading the pickled
        # index.pkl. Only load index folders produced by this project; a pickle
        # from an untrusted source can execute arbitrary code.
        vectorstore = FAISS.load_local(
            str(index_dir),
            embeddings,
            allow_dangerous_deserialization=True
        )
        print("‚úÖ FAISS index loaded successfully.")
        return vectorstore

    # --- Otherwise, build new index ---
    if chunks is None or len(chunks) == 0:
        raise RuntimeError("‚ùå No chunks provided. Please run Step 2 first to load and split PDFs.")

    print("üß± Building new FAISS index...")
    start_time = time.time()

    # Pass the chunks directly. Wrapping them in tqdm only measured how fast
    # from_documents() walks the list to collect texts, not the embedding work,
    # so the bar reached 100% before embedding had started.
    vectorstore = FAISS.from_documents(chunks, embeddings)

    # --- Save the index ---
    vectorstore.save_local(str(index_dir))
    duration = time.time() - start_time

    print(f"üíæ Saved new FAISS index to {index_dir}")
    print(f"‚è±Ô∏è Build completed in {duration/60:.2f} minutes ({duration:.1f} seconds)")
    return vectorstore


# --- Execute step ---
# `chunks` is only consulted when no saved index exists in INDEX_DIR.
vectorstore = build_or_load_faiss_index(INDEX_DIR, chunks)
# The retriever returns the TOP_K nearest chunks for each query.
retriever   = vectorstore.as_retriever(search_kwargs={"k": TOP_K})
print(f"‚úÖ Retriever ready (top_k={TOP_K})")


  embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:00<00:00, 1537.31it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


üì¶ Existing FAISS index found. Loading from disk...
‚úÖ FAISS index loaded successfully.
‚úÖ Retriever ready (top_k=6)


In [11]:
# ============================================
# 🧩 STEP 4 — Load Excel, Template, and Build RAG Chain (Few-shot from Excel)
# ============================================
import re
import pandas as pd
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import Ollama

# --- Load Excel file ---
# Fail early with a clear message instead of a pandas traceback.
if not EXCEL_PATH.exists():
    raise FileNotFoundError(f"‚ùå Excel file not found: {EXCEL_PATH}")

# `df` is read as a module-level global by build_few_shot_block_from_excel() below.
df = pd.read_excel(EXCEL_PATH)
# Normalise headers so later lookups can use lowercase names such as "element_1a".
df.columns = df.columns.str.strip().str.lower()
# Empty cells become "" so str(...).strip() checks treat them as missing content.
df = df.fillna("")
print(f"‚úÖ Excel loaded successfully: {len(df)} rows")

# --- Load Markdown Template ---
if not TEMPLATE_MD.exists():
    raise FileNotFoundError(f"‚ùå Template file not found: {TEMPLATE_MD}")

dmp_template_text = TEMPLATE_MD.read_text(encoding="utf-8")
print("‚úÖ DMP Markdown template loaded.")


# ============================================================
# ✅ Few-shot builder from Excel (Element-aware + relevant)
# ============================================================
def _infer_element_from_question(q: str) -> str:
    m = re.search(r"element\s*([1-6])", q, flags=re.IGNORECASE)
    return m.group(1) if m else "1"

def _row_to_fewshot_example(row: pd.Series, element_num: str) -> str:
    title = str(row.get("title", "")).strip()
    designation = str(row.get("designation", "")).strip()

    if element_num == "1":
        a = str(row.get("element_1a", "")).strip()
        b = str(row.get("element_1b", "")).strip()
        c = str(row.get("element_1c", "")).strip()
        answer = (
            "**Element 1: Data Type**\n\n"
            "1. **Types and amount of scientific data expected to be generated in the project:**\n"
            f"{a}\n\n"
            "2. **Scientific data that will be preserved and shared, and the rationale for doing so:**\n"
            f"{b}\n\n"
            "3. **Metadata, other relevant data, and associated documentation:**\n"
            f"{c}"
        )

    elif element_num == "2":
        answer = "**Element 2: Related Tools, Software and/or Code**\n\n" + str(row.get("element_2", "")).strip()

    elif element_num == "3":
        answer = "**Element 3: Standards**\n\n" + str(row.get("element_3", "")).strip()

    elif element_num == "4":
        a = str(row.get("element_4a", "")).strip()
        b = str(row.get("element_4b", "")).strip()
        c = str(row.get("element_4c", "")).strip()
        answer = (
            "**Element 4: Data Preservation, Access, and Associated Timelines**\n\n"
            "1. **Repository and preservation timeline:**\n"
            f"{a}\n\n"
            "2. **How data will be discoverable/findable:**\n"
            f"{b}\n\n"
            "3. **Access, sharing mechanisms, and timelines:**\n"
            f"{c}"
        )

    elif element_num == "5":
        a = str(row.get("element_5a", "")).strip()
        b = str(row.get("element_5b", "")).strip()
        c = str(row.get("element_5c", "")).strip()
        answer = (
            "**Element 5: Access, Distribution, or Reuse Considerations**\n\n"
            "1. **Factors affecting access/sharing:**\n"
            f"{a}\n\n"
            "2. **Steps for access / distribution:**\n"
            f"{b}\n\n"
            "3. **Privacy / confidentiality protections:**\n"
            f"{c}"
        )

    else:  # element_num == "6"
        answer = "**Element 6: Oversight of Data Management and Sharing**\n\n" + str(row.get("element_6", "")).strip()

    q = f'Write Element {element_num} for a project similar to: "{title}" ({designation}).'
    return f"### Example\nQuestion:\n{q}\n\nAnswer:\n{answer}"


def build_few_shot_block_from_excel(question: str, n_examples: int = 3) -> str:
    """Build the few-shot examples block for ``question`` from the global ``df``.

    The element is inferred from ``question``. Only rows whose columns for that
    element are all non-blank are eligible. Eligible rows are ranked by how
    many words (3+ letters) ``question`` shares with the row's title,
    designation, institute and consentdescription fields.

    Args:
        question: The user question; also used for relevance scoring.
        n_examples: Maximum number of examples to include.

    Returns:
        The examples joined by a Markdown horizontal rule, or ``""`` when a
        required column is missing from ``df`` or no row has content.
    """
    element_num = _infer_element_from_question(question)

    needed_cols = {
        "1": ["element_1a", "element_1b", "element_1c"],
        "2": ["element_2"],
        "3": ["element_3"],
        "4": ["element_4a", "element_4b", "element_4c"],
        "5": ["element_5a", "element_5b", "element_5c"],
        "6": ["element_6"],
    }[element_num]

    # Column missing in the file -> return no few-shot block instead of crashing.
    if any(c not in df.columns for c in needed_cols):
        return ""

    # Keep only rows that have content in every column for that element.
    has_content = df[needed_cols].astype(str).apply(lambda s: s.str.strip().ne("")).all(axis=1)
    # Copy so adding the score column never touches the shared frame.
    df_valid = df[has_content].copy()

    if df_valid.empty:
        return ""

    # Simple relevance score: overlap between question words and row metadata fields.
    q_terms = set(re.findall(r"[a-zA-Z]{3,}", question.lower()))

    def score_row(r):
        hay = f"{r.get('title','')} {r.get('designation','')} {r.get('institute','')} {r.get('consentdescription','')}"
        hay_terms = set(re.findall(r"[a-zA-Z]{3,}", str(hay).lower()))
        return len(q_terms & hay_terms)

    df_valid["__score"] = df_valid.apply(score_row, axis=1)
    # Stable sort: rows with equal scores keep their Excel order, so the chosen
    # examples are the same on every run instead of depending on the sort algorithm.
    df_top = df_valid.sort_values("__score", ascending=False, kind="mergesort").head(n_examples)

    examples = [_row_to_fewshot_example(row, element_num) for _, row in df_top.iterrows()]
    return "\n\n---\n\n".join(examples)


# ============================================
# --- Build RAG chain (Few-shot + RAG grounding)
# ============================================
def build_rag_chain(retriever, llm_model=LLM_MODEL, n_few_shot: int = 3):
    """Assemble the few-shot + retrieval chain that turns a question into text.

    The chain feeds the same input string to three branches: the retriever
    (formatted into a context block), the Excel few-shot builder, and a
    pass-through for the question. The filled prompt goes to an Ollama model
    and the result is parsed to a string.

    Args:
        retriever: Runnable that returns documents for a query string.
        llm_model: Model name handed to ``Ollama``.
        n_few_shot: Number of Excel examples to request per question.

    Returns:
        The composed runnable chain.
    """
    def render_doc(d):
        page = d.metadata.get("page", "")
        title = d.metadata.get("source", "")
        return f"[Page {page}] {title}\n{d.page_content.strip()}"

    def format_docs(docs):
        # No retrieved documents -> empty context block.
        if not docs:
            return ""
        return "\n\n".join(map(render_doc, docs))

    def make_few_shot(q: str) -> str:
        return build_few_shot_block_from_excel(q, n_examples=n_few_shot)

    prompt_template = """You are an expert biomedical data steward and grant writer.
Create a high-quality NIH Data Management and Sharing Plan (DMSP) based on the retrieved NIH context and the user's query.

You MUST follow the formatting and style demonstrated by the few-shot examples.

---- Few-shot examples (from your Excel) ----
{few_shot}

---- Context from NIH Repository (grounding) ----
{context}

---- Question ----
{question}

Rules:
- Use NIH context when relevant; do NOT invent policy details.
- If a specific policy detail is not supported by the provided context, write: "Not specified in provided NIH context."
- Follow the NIH template structure and keep section titles unchanged when the template is provided.
"""

    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["few_shot", "context", "question"],
    )

    # Each branch receives the raw question string passed to invoke().
    prompt_inputs = {
        "context": retriever | format_docs,
        "few_shot": RunnablePassthrough() | make_few_shot,
        "question": RunnablePassthrough(),
    }
    rag_chain = prompt_inputs | prompt | Ollama(model=llm_model) | StrOutputParser()

    print(f"üîó RAG chain initialized with model: {llm_model} | few-shot examples: {n_few_shot}")
    return rag_chain


# --- Initialize the RAG chain ---
# Uses the retriever from Step 3 and the default LLM_MODEL; three few-shot examples per question.
rag_chain = build_rag_chain(retriever, n_few_shot=3)
print("‚úÖ RAG chain ready for generation.")


‚úÖ Excel loaded successfully: 26 rows
‚úÖ DMP Markdown template loaded.
üîó RAG chain initialized with model: llama3.3 | few-shot examples: 3
‚úÖ RAG chain ready for generation.


In [None]:
# ============================================
# 🧩 STEP 5 — RAG-Based DMP Generation Using Titles (UPDATED for Few-shot RAG Chain)
# ============================================
import re, pandas as pd, pypandoc
from tqdm import tqdm
from pathlib import Path

# ---------- Paths ----------
# EXCEL_PATH is the one set in Step 1. Re-deriving it here from ROOT_DIR silently
# replaced a YAML-configured path, so this step could read a different workbook
# than the one Step 4 used for the few-shot examples.
# The log is written next to the markdown/docx folders so it follows the
# configured output location instead of a hardcoded one.
OUTPUT_LOG = OUTPUT_MD.parent / "rag_generated_dmp_log.csv"
OUTPUT_MD.mkdir(parents=True, exist_ok=True)
OUTPUT_DOCX.mkdir(parents=True, exist_ok=True)

# ---------- Load Excel ----------
# Reloaded here so this step works on a fresh copy; headers are normalised to
# lowercase and empty cells become "" exactly as in Step 4.
df = pd.read_excel(EXCEL_PATH)
df.columns = df.columns.str.strip().str.lower()
df = df.fillna("")
print(f"‚úÖ Loaded input Excel ‚Äî {len(df)} rows")

# ---------- Verify template ----------
if not TEMPLATE_MD.exists():
    raise FileNotFoundError(f"‚ùå Template not found: {TEMPLATE_MD}")
dmp_template_text = TEMPLATE_MD.read_text(encoding="utf-8")
print(f"‚úÖ Loaded NIH DMP Markdown template from: {TEMPLATE_MD}")

# ---------- Helper functions ----------
def sanitize_filename(name: str) -> str:
    """Trim ``name`` and replace characters Windows forbids in filenames with ``_``."""
    trimmed = str(name).strip()
    return re.sub(r'[\\/*?:"<>|]', "_", trimmed)

def create_folder(folderpath: Path):
    folderpath.mkdir(parents=True, exist_ok=True)

def save_md(folderpath: Path | str, filename: str, response: str) -> Path:
    """Write ``response`` as UTF-8 to ``folderpath/filename`` and return the path.

    This definition replaces the Step 1 helper of the same name once the cell
    runs, so it keeps that helper's contract: ``str`` folders are accepted and
    the written path is returned.
    """
    folder = Path(folderpath)
    create_folder(folder)
    filepath = folder / filename
    filepath.write_text(response, encoding="utf-8")
    print(f"üíæ Saved: {filepath}")
    return filepath

def md_to_docx(md_filepath: Path | str, docx_folder: Path | str, docx_filename: str) -> Path:
    """Convert a Markdown file to DOCX via pypandoc and return the DOCX path.

    Mirrors the Step 1 ``md_to_docs`` helper: ``str`` paths are accepted and
    the output path is returned so callers can log or reuse it.
    """
    folder = Path(docx_folder)
    create_folder(folder)
    docx_path = folder / docx_filename
    pypandoc.convert_file(str(md_filepath), "docx", outputfile=str(docx_path))
    print(f"üìÑ Converted: {docx_path}")
    return docx_path

# ---------- Main Generation ----------
# One log record per processed row, filled in as the loop runs.
records = []

# The element columns are the same for every row, so find them once.
element_cols = [c for c in df.columns if c.startswith("element")]

try:
    for _, row in tqdm(df.iterrows(), total=len(df), desc="üß† Generating NIH DMPs"):
        title = str(row.get("title", "")).strip()
        if not title:
            # Rows without a title cannot be named or queried; skip them.
            continue

        print(f"\nüß© Generating DMP for: {title}")

        # 1) Build proposal background from the row's non-empty element columns
        element_texts = []
        for col in element_cols:
            val = str(row.get(col, "")).strip()
            if val:
                element_texts.append(f"{col.upper()}: {val}")
        query_data = "\n".join(element_texts)

        # 2) Build question that rag_chain expects (rag_chain does retrieval + few-shot)
        question = f"""
Create a complete NIH Data Management and Sharing Plan (DMSP) for the project titled: "{title}".

Use the NIH DMSP Markdown template below and DO NOT change section titles.

Project background / proposal details:
{query_data}

NIH DMSP Markdown template:
{dmp_template_text}
""".strip()

        # 3) Generate, save Markdown, convert to DOCX
        try:
            response = rag_chain.invoke(question)

            safe_title = sanitize_filename(title)
            md_filename = f"{safe_title}.md"
            docx_filename = f"{safe_title}.docx"
            md_path = OUTPUT_MD / md_filename

            save_md(OUTPUT_MD, md_filename, response)
            md_to_docx(md_path, OUTPUT_DOCX, docx_filename)

            records.append({
                "Title": title,
                "Question_Preview": question[:1000],
                "Generated_DMP_Preview": response[:1000],
                "Error": ""
            })

        except Exception as e:
            # Best effort: a failure on one title is logged and the run continues.
            print(f"‚ùå Error generating DMP for {title}: {e}")
            records.append({
                "Title": title,
                "Question_Preview": question[:1000],
                "Generated_DMP_Preview": "",
                "Error": str(e)
            })

    print("\n‚úÖ Finished processing all rows.")
finally:
    # ---------- Save Log ----------
    # Written in `finally` so that interrupting a long run (each title takes
    # about a minute) still leaves a log of the rows already processed.
    pd.DataFrame(records).to_csv(OUTPUT_LOG, index=False, encoding="utf-8")
    print(f"üìä CSV log saved to: {OUTPUT_LOG}")


‚úÖ Loaded input Excel ‚Äî 26 rows
‚úÖ Loaded NIH DMP Markdown template from: c:\Users\Nahid\dmpchef\data\inputs\dmp-template.md


üß† Generating NIH DMPs:   0%|          | 0/26 [00:00<?, ?it/s]


üß© Generating DMP for: Clinical and MRI data from human research participants
üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Clinical and MRI data from human research participants.md


üß† Generating NIH DMPs:   4%|‚ñç         | 1/26 [01:16<31:46, 76.27s/it]

üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Clinical and MRI data from human research participants.docx

üß© Generating DMP for: Genomic data from human research participants


üß† Generating NIH DMPs:   8%|‚ñä         | 2/26 [02:11<25:29, 63.73s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Genomic data from human research participants.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Genomic data from human research participants.docx

üß© Generating DMP for: Genomic data from a non-human source


üß† Generating NIH DMPs:  12%|‚ñà‚ñè        | 3/26 [03:04<22:35, 58.92s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Genomic data from a non-human source.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Genomic data from a non-human source.docx

üß© Generating DMP for: Secondary data analysis


üß† Generating NIH DMPs:  15%|‚ñà‚ñå        | 4/26 [04:05<21:57, 59.88s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Secondary data analysis.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Secondary data analysis.docx

üß© Generating DMP for: Human clinical and genomics data


üß† Generating NIH DMPs:  19%|‚ñà‚ñâ        | 5/26 [05:07<21:09, 60.44s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Human clinical and genomics data.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Human clinical and genomics data.docx

üß© Generating DMP for: Gene expression analysis data from non-human model organism (zebrafish)


üß† Generating NIH DMPs:  23%|‚ñà‚ñà‚ñé       | 6/26 [06:08<20:15, 60.79s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Gene expression analysis data from non-human model organism (zebrafish).md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Gene expression analysis data from non-human model organism (zebrafish).docx

üß© Generating DMP for: Human survey data


üß† Generating NIH DMPs:  27%|‚ñà‚ñà‚ñã       | 7/26 [07:02<18:32, 58.53s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Human survey data.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Human survey data.docx

üß© Generating DMP for: Clinical Data from Human Research Participants


üß† Generating NIH DMPs:  31%|‚ñà‚ñà‚ñà       | 8/26 [08:03<17:48, 59.38s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Clinical Data from Human Research Participants.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Clinical Data from Human Research Participants.docx

üß© Generating DMP for: Human genomic data


üß† Generating NIH DMPs:  35%|‚ñà‚ñà‚ñà‚ñç      | 9/26 [09:09<17:22, 61.33s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Human genomic data.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Human genomic data.docx

üß© Generating DMP for: Technology development


üß† Generating NIH DMPs:  38%|‚ñà‚ñà‚ñà‚ñä      | 10/26 [10:20<17:08, 64.28s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Technology development.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Technology development.docx

üß© Generating DMP for: Basic Research from a Non-Human Source Example


üß† Generating NIH DMPs:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 11/26 [11:18<15:34, 62.30s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Basic Research from a Non-Human Source Example.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Basic Research from a Non-Human Source Example.docx

üß© Generating DMP for: Secondary Data Analysis Example


üß† Generating NIH DMPs:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 12/26 [12:15<14:10, 60.78s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Secondary Data Analysis Example.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Secondary Data Analysis Example.docx

üß© Generating DMP for: Survey and Interview Example


üß† Generating NIH DMPs:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 13/26 [13:08<12:41, 58.58s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Survey and Interview Example.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Survey and Interview Example.docx

üß© Generating DMP for: Human Clinical Trial Data


üß† Generating NIH DMPs:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 14/26 [14:05<11:36, 58.04s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Human Clinical Trial Data.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Human Clinical Trial Data.docx

üß© Generating DMP for: Clinical data from human research participants-NIA


üß† Generating NIH DMPs:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 15/26 [15:04<10:41, 58.34s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Clinical data from human research participants-NIA.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Clinical data from human research participants-NIA.docx

üß© Generating DMP for: Survey, interview, and biological data (tiered access)


üß† Generating NIH DMPs:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 16/26 [15:59<09:34, 57.40s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Survey, interview, and biological data (tiered access).md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Survey, interview, and biological data (tiered access).docx

üß© Generating DMP for: Non-human data (primates)


üß† Generating NIH DMPs:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 17/26 [16:50<08:17, 55.33s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Non-human data (primates).md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Non-human data (primates).docx

üß© Generating DMP for: Secondary data analysis-NIA


üß† Generating NIH DMPs:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 18/26 [17:51<07:36, 57.06s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Secondary data analysis-NIA.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Secondary data analysis-NIA.docx

üß© Generating DMP for: Survey and interview data-NIA


üß† Generating NIH DMPs:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 19/26 [18:42<06:26, 55.21s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Survey and interview data-NIA.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Survey and interview data-NIA.docx

üß© Generating DMP for: Human clinical and genomic data-NIA


üß† Generating NIH DMPs:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 20/26 [20:02<06:15, 62.53s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Human clinical and genomic data-NIA.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Human clinical and genomic data-NIA.docx

üß© Generating DMP for: Non-human data (rodents)-NIA


üß† Generating NIH DMPs:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 21/26 [21:14<05:27, 65.44s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Non-human data (rodents)-NIA.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Non-human data (rodents)-NIA.docx

üß© Generating DMP for: Clinical data (human biospecimens)


üß† Generating NIH DMPs:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 22/26 [22:15<04:16, 64.13s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Clinical data (human biospecimens).md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Clinical data (human biospecimens).docx

üß© Generating DMP for: Drug discovery including intellectual property


üß† Generating NIH DMPs:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 23/26 [23:15<03:08, 62.89s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Drug discovery including intellectual property.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Drug discovery including intellectual property.docx

üß© Generating DMP for: HeLa Cell Whole Genome Sequence (DNA or RNA)


üß† Generating NIH DMPs:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 24/26 [24:20<02:07, 63.67s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\HeLa Cell Whole Genome Sequence (DNA or RNA).md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\HeLa Cell Whole Genome Sequence (DNA or RNA).docx

üß© Generating DMP for: Secondary Data Analysis on Data from Human Subjects-NIA


üß† Generating NIH DMPs:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 25/26 [25:20<01:02, 62.34s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Secondary Data Analysis on Data from Human Subjects-NIA.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Secondary Data Analysis on Data from Human Subjects-NIA.docx

üß© Generating DMP for: Analysis of social media posts


üß† Generating NIH DMPs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26/26 [26:17<00:00, 60.66s/it]

üíæ Saved: c:\Users\Nahid\dmpchef\data\outputs1\markdown\Analysis of social media posts.md
üìÑ Converted: c:\Users\Nahid\dmpchef\data\outputs1\docx\Analysis of social media posts.docx

‚úÖ Finished processing all rows.
üìä CSV log saved to: c:\Users\Nahid\dmpchef\data\outputs\rag_generated_dmp_log.csv





In [None]:
# ============================================
# 🧩 STEP 6 — Full DMP Comparison: Markdown (Generated) vs PDF (Gold, Fuzzy Matching)
# ============================================
import re
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
from difflib import SequenceMatcher
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer

# ------------------------------------------------------------
# ✅ Robust ROOT_DIR discovery (walks up from the current working directory)
# ------------------------------------------------------------
def find_project_root(start: Path) -> Path:
    """
    Locate the project root by climbing from ``start`` toward the filesystem root.

    A directory qualifies when it contains a 'data' entry together with at
    least one of 'config' or 'dmpchef'. At most 20 directories (``start``
    itself included) are inspected; when none qualifies, the resolved
    ``start`` is returned unchanged.
    """
    origin = start.resolve()
    # The resolved start followed by each of its ancestors, capped at 20 levels.
    lineage = [origin, *origin.parents][:20]
    for folder in lineage:
        has_data = (folder / "data").exists()
        if has_data and any((folder / marker).exists() for marker in ("config", "dmpchef")):
            return folder
    return origin

# Resolve the project root starting from the notebook's current working directory.
ROOT_DIR = find_project_root(Path.cwd())
print(f"üìÇ ROOT_DIR set to: {ROOT_DIR}")

# ------------------------------------------------------------
# Candidate folders, listed in order of preference
# ------------------------------------------------------------
# Gold-standard PDFs: data/inputs/gold_dmps is tried first, then data/gold_dmps.
gold_candidates = [
    ROOT_DIR / "data" / "inputs" / "gold_dmps",
    ROOT_DIR / "data" / "gold_dmps",
]

# Generated Markdown DMPs.
# NOTE(review): both entries resolve to data/outputs7/markdown, so no fallback
# folder is actually tried — confirm whether a second location was intended.
generated_candidates = [
    ROOT_DIR / "data" / "outputs7" / "markdown",
    ROOT_DIR / "data" / "outputs7"  / "markdown",
]

def pick_folder(candidates, pattern):
    """
    Choose the most useful folder from ``candidates``.

    Preference order:
      1. the first folder that exists and contains a file matching ``pattern``;
      2. otherwise the first folder that exists (even if empty, for debugging);
      3. otherwise the first candidate, whether or not it exists.
    """
    present = []
    for folder in candidates:
        if not folder.exists():
            continue
        if any(folder.glob(pattern)):
            return folder
        present.append(folder)
    return present[0] if present else candidates[0]

# Resolve the input folders from the candidate lists defined above.
GOLD_DIR = pick_folder(gold_candidates, "*.pdf")
GENERATED_DIR = pick_folder(generated_candidates, "*.md")

# Evaluation CSVs are written under data/outputs7/evaluation_results (created if absent).
EVAL_DIR = ROOT_DIR / "data" / "outputs7" / "evaluation_results"
EVAL_DIR.mkdir(parents=True, exist_ok=True)

print(f"üìó Gold PDF folder     : {GOLD_DIR}   (exists={GOLD_DIR.exists()})")
print(f"üìò Generated MD folder : {GENERATED_DIR} (exists={GENERATED_DIR.exists()})")

# Debug counts (0 when the folder itself is missing)
gold_pdf_count = len(list(GOLD_DIR.glob("*.pdf"))) if GOLD_DIR.exists() else 0
gen_md_count   = len(list(GENERATED_DIR.glob("*.md"))) if GENERATED_DIR.exists() else 0
print(f"üìä Found {gen_md_count} generated .md files")
print(f"üìä Found {gold_pdf_count} gold .pdf files")

# Stop early: the comparison below needs at least one file on each side.
if gen_md_count == 0:
    raise FileNotFoundError(f"‚ùå No generated Markdown files found in: {GENERATED_DIR}")
if gold_pdf_count == 0:
    raise FileNotFoundError(f"‚ùå No gold PDF files found in: {GOLD_DIR}")

# ------------------------------------------------------------
# Models
# ------------------------------------------------------------
# sbert: sentence-embedding model used for cosine similarity.
# rouge: ROUGE-L scorer with stemming enabled.
print("üöÄ Loading models...")
sbert = SentenceTransformer("all-MiniLM-L6-v2")
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
print("‚úÖ Models ready.")

# ------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------
def normalize_name(name: str) -> str:
    """
    Reduce a file stem to a comparison key.

    Lower-cases the input, turns every character that is not a-z, 0-9 or
    whitespace into a space, collapses whitespace runs to one space and trims
    both ends.
    """
    lowered = name.lower()
    spaced = re.sub(r"[^a-z0-9\s]", " ", lowered)
    return re.sub(r"\s+", " ", spaced).strip()

def clean_text(text: str) -> str:
    """
    Strip markup from text before similarity scoring.

    Applied in order: drop <think>...</think> blocks (spanning newlines),
    drop runs of '#' with their trailing whitespace, drop all asterisks,
    collapse whitespace runs to a single space, then trim both ends.
    """
    substitutions = (
        (r"<think>.*?</think>", "", re.DOTALL),
        (r"#+\s*", "", 0),
        (r"\*\*|\*", "", 0),
        (r"\s+", " ", 0),
    )
    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()

def extract_text_from_pdf(pdf_path: Path) -> str:
    """
    Read the plain text of every page of a PDF and return it cleaned.

    Failures are reported on stdout rather than raised; whatever text was
    collected before the failure (possibly none) is still cleaned and returned.
    """
    page_texts = []
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page_texts.append(page.get_text("text") + "\n")
    except Exception as e:
        print(f"‚ùå Error reading {pdf_path.name}: {e}")
    return clean_text("".join(page_texts))

def chunk_text(text, size=300):
    """
    Split ``text`` on whitespace and regroup the words into consecutive
    chunks of at most ``size`` words, each joined by single spaces.

    Returns an empty list when ``text`` contains no words.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), size):
        pieces.append(" ".join(tokens[start:start + size]))
    return pieces

def compare_chunked(gold_text, gen_text, model):
    """
    Score how well the generated text covers the gold text, chunk by chunk.

    Both texts are split with ``chunk_text``. Each gold chunk is compared with
    every generated chunk and keeps its best cosine similarity (embeddings
    from ``model``) and its best ROUGE-L recall (module-level ``rouge``
    scorer, gold chunk passed as the first argument). The per-gold-chunk
    maxima are then averaged.

    Args:
        gold_text: Reference text.
        gen_text: Generated text to evaluate.
        model: Object exposing ``encode(text, convert_to_tensor=True)``.

    Returns:
        ``(mean_best_cosine, mean_best_rouge_l_recall)`` as floats, or
        ``(nan, nan)`` when either text yields no chunks.
    """
    gold_chunks = chunk_text(gold_text)
    gen_chunks = chunk_text(gen_text)

    # With nothing to compare the scores are undefined; return NaN instead of
    # letting max() raise on an empty sequence.
    if not gold_chunks or not gen_chunks:
        return float("nan"), float("nan")

    # Generated-chunk embeddings do not depend on the gold chunk, so compute
    # them once instead of once per gold chunk.
    gen_embeddings = [model.encode(chunk, convert_to_tensor=True) for chunk in gen_chunks]

    sbert_scores, rouge_scores = [], []
    for gold_chunk in gold_chunks:
        emb_gold = model.encode(gold_chunk, convert_to_tensor=True)
        sbert_scores.append(
            max(util.cos_sim(emb_gold, emb_gen).item() for emb_gen in gen_embeddings)
        )
        rouge_scores.append(
            max(rouge.score(gold_chunk, gen_chunk)["rougeL"].recall for gen_chunk in gen_chunks)
        )

    return float(np.mean(sbert_scores)), float(np.mean(rouge_scores))

def best_fuzzy_match(target, gold_names, threshold=0.6):
    """
    Find the entry of ``gold_names`` most similar to ``target``.

    Similarity is ``difflib.SequenceMatcher(...).ratio()``. On ties the
    earliest candidate wins, and a candidate only counts when its score is
    strictly positive.

    Returns:
        ``(name, score)`` when the best score reaches ``threshold``,
        otherwise ``(None, best_score)``; ``best_score`` is 0 when nothing
        scored above zero.
    """
    scored = [
        (SequenceMatcher(None, target, candidate).ratio(), candidate)
        for candidate in gold_names
    ]

    top_score, top_name = 0, None
    if scored:
        # max() returns the first maximal item, so the earliest candidate wins ties.
        leader_score, leader_name = max(scored, key=lambda pair: pair[0])
        if leader_score > 0:
            top_score, top_name = leader_score, leader_name

    if top_score >= threshold:
        return top_name, top_score
    return None, top_score

# ------------------------------------------------------------
# Collect files
# ------------------------------------------------------------
# Index both corpora by normalized file stem so the names can be fuzzy-matched.
# NOTE: stems that normalize to the same key overwrite each other in these dicts.
gold_files = {normalize_name(f.stem): f for f in GOLD_DIR.glob("*.pdf")}
gen_files  = {normalize_name(f.stem): f for f in GENERATED_DIR.glob("*.md")}
print(f"üìä Indexed {len(gen_files)} generated DMPs and {len(gold_files)} gold PDFs.")

# ------------------------------------------------------------
# Compare
# ------------------------------------------------------------
# Fixed output schema: the CSV keeps its header row even when nothing matched,
# so the summary step can still detect its columns.
RESULT_COLUMNS = [
    "Generated_File",
    "Matched_Gold_PDF",
    "Match_Score",
    "SBERT_Similarity",
    "ROUGE_L_Recall",
]

gold_names = list(gold_files)  # loop-invariant; built once
results = []
skipped = []  # (generated file name, reason) for every file left out of the results

for name, gen_path in tqdm(gen_files.items(), desc="üîé Matching & Comparing DMPs"):
    best_match, score = best_fuzzy_match(name, gold_names)
    if not best_match:
        skipped.append((gen_path.name, f"no gold PDF reached the match threshold (best score {score:.3f})"))
        continue

    gold_path = gold_files[best_match]
    gold_text = extract_text_from_pdf(gold_path)
    gen_text  = clean_text(gen_path.read_text(encoding="utf-8"))

    if not gold_text.strip() or not gen_text.strip():
        skipped.append((gen_path.name, f"empty text after cleaning (gold: {gold_path.name})"))
        continue

    sbert_sim, rouge_l = compare_chunked(gold_text, gen_text, sbert)
    results.append({
        "Generated_File": gen_path.name,
        "Matched_Gold_PDF": gold_path.name,
        "Match_Score": round(score, 3),
        "SBERT_Similarity": round(sbert_sim, 4),
        "ROUGE_L_Recall": round(rouge_l, 4),
    })

df_results = pd.DataFrame(results, columns=RESULT_COLUMNS)
out_path = EVAL_DIR / "full_dmp_pdf_comparison_fuzzy.csv"
df_results.to_csv(out_path, index=False)
print(f"\n‚úÖ Results saved to: {out_path}")
print(f"üßæ Total matched DMP pairs: {len(df_results)}")

# Report what was left out so gaps in evaluation coverage are visible.
if skipped:
    print(f"Skipped {len(skipped)} generated file(s):")
    for skipped_name, reason in skipped:
        print(f"  - {skipped_name}: {reason}")


üìÇ ROOT_DIR set to: C:\Users\Nahid\dmpchef
üìó Gold PDF folder     : C:\Users\Nahid\dmpchef\data\inputs\gold_dmps   (exists=True)
üìò Generated MD folder : C:\Users\Nahid\dmpchef\data\outputs1\markdown (exists=True)
üìä Found 26 generated .md files
üìä Found 26 gold .pdf files
üöÄ Loading models...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:00<00:00, 1551.42it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


‚úÖ Models ready.
üìä Indexed 26 generated DMPs and 26 gold PDFs.


üîé Matching & Comparing DMPs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26/26 [00:09<00:00,  2.61it/s]


‚úÖ Results saved to: C:\Users\Nahid\dmpchef\data\outputs1\evaluation_results\full_dmp_pdf_comparison_fuzzy.csv
üßæ Total matched DMP pairs: 26





In [None]:
# ============================================
# 🧩 STEP 7 — Element-Level Comparison with NIH Gold Standard (robust ROOT_DIR discovery + folder lookup)
# ============================================
import re
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer

# ------------------------------------------------------------
# ✅ Robust ROOT_DIR discovery (does not depend on the current folder's own name)
# ------------------------------------------------------------
def find_project_root(start: Path) -> Path:
    """
    Search ``start`` and its ancestors for the project root.

    The root is the nearest directory holding a 'data' entry plus at least
    one of 'config' or 'dmpchef'. The search covers at most 20 directories,
    beginning with the resolved ``start``; if none matches, the resolved
    ``start`` is returned.
    """
    resolved = start.resolve()
    markers = ("config", "dmpchef")
    for depth, folder in enumerate((resolved, *resolved.parents)):
        if depth >= 20:
            break
        if (folder / "data").exists() and any((folder / m).exists() for m in markers):
            return folder
    return resolved

# Resolve the project root starting from the notebook's current working directory.
ROOT_DIR = find_project_root(Path.cwd())
print(f"üìÇ ROOT_DIR set to: {ROOT_DIR}")

# ------------------------------------------------------------
# Input/output locations
# ------------------------------------------------------------
# Gold reference: one Excel workbook under data/inputs.
GOLD_PATH = ROOT_DIR / "data" / "inputs" / "inputs.xlsx"

# Candidate folders for the generated Markdown, in order of preference.
# NOTE(review): both entries resolve to data/outputs7/markdown, so no fallback
# folder is actually tried — confirm whether a second location was intended.
generated_candidates = [
    ROOT_DIR / "data" / "outputs7" / "markdown",
    ROOT_DIR / "data" / "outputs7"  / "markdown",
]

# Candidate folders for evaluation CSVs (same duplication as above).
eval_candidates = [
    ROOT_DIR / "data" / "outputs7" / "evaluation_results",
    ROOT_DIR / "data" / "outputs7"  / "evaluation_results",
]

def pick_existing_folder(candidates):
    """
    Return the first path in ``candidates`` that exists on disk.

    When none exists, the first candidate is returned as-is so the caller
    can create it or report it as missing.
    """
    found = next((folder for folder in candidates if folder.exists()), None)
    return candidates[0] if found is None else found

# Resolve working folders; the evaluation folder is created if it does not exist yet.
GENERATED_DIR = pick_existing_folder(generated_candidates)
EVAL_DIR = pick_existing_folder(eval_candidates)
EVAL_DIR.mkdir(parents=True, exist_ok=True)

print(f"üìó Gold Excel: {GOLD_PATH} (exists={GOLD_PATH.exists()})")
print(f"üìò Generated MD folder: {GENERATED_DIR} (exists={GENERATED_DIR.exists()})")
print(f"üìÅ Eval output folder : {EVAL_DIR}")

# Stop early when either input is missing.
if not GOLD_PATH.exists():
    raise FileNotFoundError(f"‚ùå Gold Excel not found: {GOLD_PATH}")
if not GENERATED_DIR.exists():
    raise FileNotFoundError(f"‚ùå Generated Markdown folder not found: {GENERATED_DIR}")

# ------------------------------------------------------------
# Load gold reference (Excel)
# ------------------------------------------------------------
df_gold = pd.read_excel(GOLD_PATH)
# Column names are trimmed and lower-cased so later lookups use one spelling.
df_gold.columns = df_gold.columns.str.strip().str.lower()
# Missing cells become empty strings and every value is converted to str.
df_gold = df_gold.fillna("").astype(str)

def normalize_title(name: str) -> str:
    """
    Build a comparison key from a project title.

    The value is converted to ``str`` and lower-cased; every character other
    than a-z, 0-9 or whitespace becomes a space; whitespace runs collapse to
    one space; the result is trimmed.
    """
    lowered = str(name).lower()
    spaced = re.sub(r"[^a-z0-9\s]", " ", lowered)
    return re.sub(r"\s+", " ", spaced).strip()

# The title column drives the matching below; fail with a readable message
# instead of an opaque KeyError when the workbook does not have one.
if "title" not in df_gold.columns:
    raise KeyError(f"Gold Excel has no 'title' column; found columns: {df_gold.columns.tolist()}")

df_gold["title_norm"] = df_gold["title"].apply(normalize_title)

# Element columns expected in the gold workbook.
expected_elements = [
    "element_1a","element_1b","element_1c",
    "element_2","element_3",
    "element_4a","element_4b","element_4c",
    "element_5a","element_5b","element_5c",
    "element_6"
]

# Keep only element columns that actually exist in the file (avoid KeyErrors),
# and say which ones are absent so they are not dropped silently.
gold_elements = [c for c in expected_elements if c in df_gold.columns]
missing_elements = [c for c in expected_elements if c not in df_gold.columns]

print(f"‚úÖ Loaded {len(df_gold)} gold projects.")
print(f"‚úÖ Gold element columns used: {gold_elements}")
if missing_elements:
    print(f"Gold element columns missing from the workbook (skipped): {missing_elements}")

# ------------------------------------------------------------
# Models
# ------------------------------------------------------------
# sbert: sentence-embedding model used for cosine similarity.
# rouge: ROUGE-L scorer with stemming enabled.
print("üöÄ Loading evaluation models...")
sbert = SentenceTransformer("all-MiniLM-L6-v2")
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
print("‚úÖ Models ready.")

# ------------------------------------------------------------
# Markdown parsing helpers
# ------------------------------------------------------------
def is_title(line: str) -> bool:
    """
    Tell whether a Markdown line should start a new section.

    True for lines that, once stripped, begin with '#' (Markdown headers) or
    consist solely of a bold span, optionally preceded by digits and a dot,
    such as "1. **Data Types**".
    """
    stripped = line.strip()
    if stripped.startswith("#"):
        return True
    return re.match(r"^\s*\d*\.?\s*\*\*.*\*\*\s*$", stripped) is not None

def extract_sections(md_path: Path) -> pd.DataFrame:
    """
    Split a Markdown file into titled sections.

    <think>...</think> blocks are removed first. Every line accepted by
    ``is_title`` opens a new section; the lines up to the next title form its
    content. Text before the first title and sections with only blank content
    are discarded.

    Returns:
        DataFrame with one row per kept section and the columns
        "Section Title" and "Generated Content" (no columns when nothing
        was kept).
    """
    raw = md_path.read_text(encoding="utf-8", errors="ignore")
    body = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL)

    records = []

    def _store(title, content_lines):
        # Keep a section only when it has a title and some non-blank content.
        if title and any(part.strip() for part in content_lines):
            records.append({
                "Section Title": title.strip(),
                "Generated Content": "\n".join(content_lines).strip()
            })

    heading, pending = None, []
    for line in body.splitlines():
        if not is_title(line):
            pending.append(line)
            continue
        _store(heading, pending)
        heading, pending = line, []

    # Flush the final section, which has no following title to trigger it.
    _store(heading, pending)

    return pd.DataFrame(records)

# ------------------------------------------------------------
# Compare (exact title match on normalized strings)
# ------------------------------------------------------------
md_files = sorted(GENERATED_DIR.glob("*.md"))
print(f"üîç Found {len(md_files)} generated Markdown files in: {GENERATED_DIR}")

if len(md_files) == 0:
    raise FileNotFoundError(f"‚ùå No .md files found in {GENERATED_DIR}")

# Fixed output schema: the CSV keeps its header row even when nothing matched,
# so the summary step can still detect its columns.
RESULT_COLUMNS = [
    "Gold Project",
    "Gold Element",
    "Generated File",
    "Generated Section Title",
    "SBERT_Similarity",
    "ROUGE_L_Recall",
]

results = []
skipped = []  # (generated file name, reason) for every file left out of the results

for md_file in tqdm(md_files, desc="üìä Comparing element-level"):
    gen_title_raw  = md_file.stem
    gen_title_norm = normalize_title(gen_title_raw)

    # Exact match on normalized titles (no fuzzy matching in this step).
    gold_row = df_gold[df_gold["title_norm"] == gen_title_norm]
    if gold_row.empty:
        skipped.append((md_file.name, "no gold project with the same normalized title"))
        continue

    # When several gold rows share a normalized title, the first one is used.
    gold_row = gold_row.iloc[0]
    gold_title = gold_row["title"]

    # Gather the non-empty gold element texts.
    gold_texts = {}
    for e in gold_elements:
        txt = str(gold_row.get(e, "")).strip()
        if txt:
            gold_texts[e] = txt

    if not gold_texts:
        skipped.append((md_file.name, "gold project has no non-empty element text"))
        continue

    # Extract sections from the generated Markdown.
    gen_df = extract_sections(md_file)
    if gen_df.empty:
        skipped.append((md_file.name, "no titled sections found in the generated Markdown"))
        continue

    # Embed every generated section once; the embedding does not depend on the
    # gold element, so it must not be recomputed for each of them.
    sections = []
    for _, sec in gen_df.iterrows():
        gen_text = str(sec["Generated Content"]).strip()
        if not gen_text:
            continue
        sections.append((
            sec["Section Title"],
            gen_text,
            sbert.encode(gen_text, convert_to_tensor=True),
        ))

    # For each gold element keep the generated section with the highest SBERT
    # similarity (the earliest section wins ties).
    for element, gold_text in gold_texts.items():
        emb_gold = sbert.encode(gold_text, convert_to_tensor=True)

        best_section = None
        best_sbert = -1.0
        for section_title, gen_text, emb_gen in sections:
            sbert_sim = float(util.cos_sim(emb_gold, emb_gen).item())
            if sbert_sim > best_sbert:
                best_sbert = sbert_sim
                best_section = (section_title, gen_text)

        if best_section is None:
            continue

        # ROUGE-L is only reported for the winning section, so score just that one.
        section_title, gen_text = best_section
        rouge_l = float(rouge.score(gold_text, gen_text)["rougeL"].recall)

        results.append({
            "Gold Project": gold_title,
            "Gold Element": element,
            "Generated File": md_file.name,
            "Generated Section Title": section_title,
            "SBERT_Similarity": round(best_sbert, 4),
            "ROUGE_L_Recall": round(rouge_l, 4),
        })

# ------------------------------------------------------------
# Save
# ------------------------------------------------------------
df_results = pd.DataFrame(results, columns=RESULT_COLUMNS)
out_path = EVAL_DIR / "element_similarity_exact_titles.csv"
df_results.to_csv(out_path, index=False, encoding="utf-8")

print(f"\n‚úÖ Element-level similarity saved to: {out_path}")
print(f"üßæ Total element‚Äìsection best matches: {len(df_results)}")

# Report what was left out so gaps in evaluation coverage are visible.
if skipped:
    print(f"Skipped {len(skipped)} generated file(s):")
    for skipped_name, reason in skipped:
        print(f"  - {skipped_name}: {reason}")


üìÇ ROOT_DIR set to: C:\Users\Nahid\dmpchef
üìó Gold Excel: C:\Users\Nahid\dmpchef\data\inputs\inputs.xlsx (exists=True)
üìò Generated MD folder: C:\Users\Nahid\dmpchef\data\outputs1\markdown (exists=True)
üìÅ Eval output folder : C:\Users\Nahid\dmpchef\data\outputs1\evaluation_results
‚úÖ Loaded 26 gold projects.
‚úÖ Gold element columns used: ['element_1a', 'element_1b', 'element_1c', 'element_2', 'element_3', 'element_4a', 'element_4b', 'element_4c', 'element_5a', 'element_5b', 'element_5c', 'element_6']
üöÄ Loading evaluation models...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:00<00:00, 1560.06it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


‚úÖ Models ready.
üîç Found 26 generated Markdown files in: C:\Users\Nahid\dmpchef\data\outputs1\markdown


üìä Comparing element-level: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26/26 [00:31<00:00,  1.20s/it]


‚úÖ Element-level similarity saved to: C:\Users\Nahid\dmpchef\data\outputs1\evaluation_results\element_similarity_exact_titles.csv
üßæ Total element‚Äìsection best matches: 312





In [None]:
# ============================================
# 🧮 STEP 8 — Summarize Evaluation Results (robust ROOT_DIR discovery + evaluation-folder lookup under outputs7)
# ============================================
import pandas as pd
from pathlib import Path

# ------------------------------------------------------------
# ✅ Robust ROOT_DIR discovery (does not depend on the current folder's own name)
# ------------------------------------------------------------
def find_project_root(start: Path) -> Path:
    """
    Climb from ``start`` toward the filesystem root looking for the project root.

    A directory is accepted when it contains 'data' and at least one of
    'config' or 'dmpchef'. No more than 20 directories (the resolved
    ``start`` included) are examined; without a match the resolved ``start``
    is returned.
    """
    origin = start.resolve()
    for folder in [origin, *origin.parents][:20]:
        if not (folder / "data").exists():
            continue
        if (folder / "config").exists() or (folder / "dmpchef").exists():
            return folder
    return origin

# Resolve the project root starting from the notebook's current working directory.
ROOT_DIR = find_project_root(Path.cwd())
print(f"üìÇ ROOT_DIR set to: {ROOT_DIR}")

# ------------------------------------------------------------
# Evaluation folder candidates, in order of preference
# ------------------------------------------------------------
# NOTE(review): both entries resolve to data/outputs7/evaluation_results, so no
# fallback folder is actually tried — confirm whether a second location was intended.
eval_candidates = [
    ROOT_DIR / "data" / "outputs7" / "evaluation_results",
    ROOT_DIR / "data" / "outputs7"  / "evaluation_results",
]

def pick_eval_dir(cands):
    """
    Choose the evaluation folder from ``cands``.

    Preference order:
      1. the first folder that exists and contains at least one ``*.csv``;
      2. otherwise the first folder that exists;
      3. otherwise the first candidate, whether or not it exists.
    """
    present = []
    for folder in cands:
        if not folder.exists():
            continue
        if any(folder.glob("*.csv")):
            return folder
        present.append(folder)
    return present[0] if present else cands[0]

# Resolve the evaluation folder (created if absent so the paths below are valid).
EVAL_DIR = pick_eval_dir(eval_candidates)
EVAL_DIR.mkdir(parents=True, exist_ok=True)
print(f"üìÅ EVAL_DIR: {EVAL_DIR}")

# ------------------------------------------------------------
# Load CSVs (fail loudly if missing)
# ------------------------------------------------------------
# File names must match the ones written by Step 6 and Step 7.
full_path = EVAL_DIR / "full_dmp_pdf_comparison_fuzzy.csv"
elem_path = EVAL_DIR / "element_similarity_exact_titles.csv"

if not full_path.exists():
    raise FileNotFoundError(f"‚ùå Missing: {full_path} (Run Step 6 first)")
if not elem_path.exists():
    raise FileNotFoundError(f"‚ùå Missing: {elem_path} (Run Step 7 first)")

df_full = pd.read_csv(full_path)
df_elem = pd.read_csv(elem_path)

print(f"‚úÖ Loaded full-document: {len(df_full)} rows")
print(f"‚úÖ Loaded element-level: {len(df_elem)} rows\n")

# The summaries below need at least one row in each table.
if df_full.empty:
    raise ValueError("‚ùå full_dmp_pdf_comparison_fuzzy.csv is empty (Step 6 matched 0 pairs).")
if df_elem.empty:
    raise ValueError("‚ùå element_similarity_exact_titles.csv is empty (Step 7 matched 0 pairs).")

# ============================================================
# 1) FULL-DOCUMENT LEVEL SUMMARY (Mean only by Generated_File)
# ============================================================
# Grouping key: the generated file name, or the first column when that name is absent.
if "Generated_File" in df_full.columns:
    project_col = "Generated_File"
else:
    project_col = df_full.columns[0]

# Metric columns are located by a case-insensitive substring match.
sbert_col = next((c for c in df_full.columns if "sbert" in c.lower()), None)
rouge_col = next((c for c in df_full.columns if "rouge" in c.lower()), None)

if not (sbert_col and rouge_col):
    raise ValueError(f"‚ùå Could not find SBERT/ROUGE columns in: {df_full.columns.tolist()}")

# Mean of each metric per generated file.
df_full_summary = df_full.groupby(project_col)[[sbert_col, rouge_col]].mean().reset_index()

# Two-decimal string versions of the means for the presentation table.
df_full_summary = df_full_summary.assign(
    SBERT=df_full_summary[sbert_col].map("{:.2f}".format),
    ROUGE=df_full_summary[rouge_col].map("{:.2f}".format),
)

df_full_table = df_full_summary.loc[:, [project_col, "SBERT", "ROUGE"]].rename(
    columns={project_col: "Generated_File"}
)

print("üìä Full-document summary table (Mean only, by Generated_File):")
display(df_full_table)

# ============================================================
# 2) ELEMENT-LEVEL SUMMARY (Mean ¬± SD)
# ============================================================
# Locate the grouping and metric columns by case-insensitive substring match.
elem_col = next((c for c in df_elem.columns if "element" in c.lower()), None)
if not elem_col:
    raise ValueError(f"‚ùå Could not find element column in: {df_elem.columns.tolist()}")

sbert_col_e = next((c for c in df_elem.columns if "sbert" in c.lower()), None)
rouge_col_e = next((c for c in df_elem.columns if "rouge" in c.lower()), None)

if not (sbert_col_e and rouge_col_e):
    raise ValueError(f"‚ùå Could not find SBERT/ROUGE columns in: {df_elem.columns.tolist()}")

# Mean and standard deviation of each metric per element; named aggregation
# yields the flat column names directly.
df_elem_summary = (
    df_elem.groupby(elem_col)
    .agg(
        SBERT_Mean=(sbert_col_e, "mean"),
        SBERT_SD=(sbert_col_e, "std"),
        ROUGE_Mean=(rouge_col_e, "mean"),
        ROUGE_SD=(rouge_col_e, "std"),
    )
    .reset_index()
)

# Presentation strings: mean and SD, each with two decimals.
df_elem_summary["SBERT"] = (
    df_elem_summary["SBERT_Mean"].map("{:.2f}".format)
    + " ¬± "
    + df_elem_summary["SBERT_SD"].map("{:.2f}".format)
)
df_elem_summary["ROUGE"] = (
    df_elem_summary["ROUGE_Mean"].map("{:.2f}".format)
    + " ¬± "
    + df_elem_summary["ROUGE_SD"].map("{:.2f}".format)
)

df_elem_table = df_elem_summary.loc[:, [elem_col, "SBERT", "ROUGE"]].rename(columns={elem_col: "Element"})

print("\nüìä Element-level summary table (Mean ¬± SD):")
display(df_elem_table)

# ============================================================
# Save formatted tables
# ============================================================
# Destination files for the two formatted summary tables.
out_full = EVAL_DIR / "summary_full_table_mean_only.csv"
out_elem = EVAL_DIR / "summary_element_table_mean_sd.csv"

# Written without the index so the CSVs contain only the table columns.
df_full_table.to_csv(out_full, index=False)
df_elem_table.to_csv(out_elem, index=False)

print(f"\nüíæ Saved formatted tables ‚Üí\n‚Ä¢ {out_full}\n‚Ä¢ {out_elem}")


üìÇ ROOT_DIR set to: C:\Users\Nahid\dmpchef
üìÅ EVAL_DIR: C:\Users\Nahid\dmpchef\data\outputs1\evaluation_results
‚úÖ Loaded full-document: 26 rows
‚úÖ Loaded element-level: 312 rows

üìä Full-document summary table (Mean only, by Generated_File):


Unnamed: 0,Generated_File,SBERT,ROUGE
0,Analysis of social media posts.md,0.76,0.37
1,Basic Research from a Non-Human Source Example.md,0.77,0.35
2,Clinical Data from Human Research Participants.md,0.71,0.25
3,Clinical and MRI data from human research part...,0.73,0.3
4,Clinical data (human biospecimens).md,0.76,0.46
5,Clinical data from human research participants...,0.82,0.42
6,Drug discovery including intellectual property.md,0.78,0.43
7,Gene expression analysis data from non-human m...,0.76,0.48
8,Genomic data from a non-human source.md,0.71,0.29
9,Genomic data from human research participants.md,0.72,0.3



üìä Element-level summary table (Mean ¬± SD):


Unnamed: 0,Element,SBERT,ROUGE
0,element_1a,0.84 ¬± 0.16,0.58 ¬± 0.39
1,element_1b,0.81 ¬± 0.14,0.62 ¬± 0.34
2,element_1c,0.85 ¬± 0.13,0.65 ¬± 0.38
3,element_2,0.86 ¬± 0.13,0.59 ¬± 0.35
4,element_3,0.83 ¬± 0.13,0.58 ¬± 0.29
5,element_4a,0.86 ¬± 0.08,0.62 ¬± 0.29
6,element_4b,0.89 ¬± 0.11,0.67 ¬± 0.33
7,element_4c,0.90 ¬± 0.08,0.68 ¬± 0.29
8,element_5a,0.80 ¬± 0.18,0.60 ¬± 0.36
9,element_5b,0.85 ¬± 0.13,0.62 ¬± 0.29



üíæ Saved formatted tables ‚Üí
‚Ä¢ C:\Users\Nahid\dmpchef\data\outputs1\evaluation_results\summary_full_table_mean_only.csv
‚Ä¢ C:\Users\Nahid\dmpchef\data\outputs1\evaluation_results\summary_element_table_mean_sd.csv
