In [7]:
print ("Start 10/08/2025")

Start 10/08/2025


In [31]:
# ============================================
# 📦 Core Imports
# ============================================
import os
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv

# ============================================
# 🧠 LLM & Embeddings
# ============================================
import ollama
from langchain_community.embeddings import OllamaEmbeddings

# ============================================
# 📄 Document Processing
# ============================================
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# ============================================
# 📑 File Conversion
# ============================================
import pypandoc

# ============================================
# 🔐 Environment Setup
# ============================================
load_dotenv()

# ============================================
# ⚙️ User Configurable Parameters
# ============================================
DATA_DIR = Path("data")                    # Folder where your PDFs are located
INDEX_DIR = Path("data/faiss_index")       # Folder to save FAISS index
EMBED_MODEL = "nomic-embed-text"           # Embedding model for RAG
CHUNK_SIZE = 800                           # Token/text length per chunk
CHUNK_OVERLAP = 120                        # Overlap between chunks
# ============================================

In [32]:
# ============================================
# 📘 Function: Load PDFs
# ============================================
def load_pdfs_from_folder(folder: Path) -> list[Document]:
    """
    Load all PDFs from the specified folder and return a list of Document objects.
    Each PDF is loaded page by page.
    """
    docs = []
    folder = folder.resolve()

    if not folder.exists():
        raise FileNotFoundError(f"❌ Data folder not found: {folder}")

    pdf_files = sorted(folder.glob("*.pdf"))
    if not pdf_files:
        print(f"⚠️ No PDF files found in {folder}")
        return docs

    print(f"📚 Found {len(pdf_files)} PDF(s) in {folder}")
    for f in pdf_files:
        print(f"🔹 Loading: {f.name}")
        loader = PyPDFLoader(str(f))
        file_docs = loader.load()
        docs.extend(file_docs)

    print(f"✅ Loaded total {len(docs)} pages from all PDFs.\n")
    return docs


In [33]:
# ============================================
# 🧩 Function: Split into Chunks
# ============================================
def split_into_chunks(docs, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=["\n\n", "\n", " ", ""],
    )
    chunks = splitter.split_documents(docs)
    print(f"✅ Created {len(chunks)} text chunks from {len(docs)} pages.")
    print("🔹 Example chunk:\n", chunks[0].page_content[:300], "...\n")
    return chunks


In [34]:

# ============================================
# 🧮 Function: Build & Save FAISS Index
# ============================================
from langchain_community.vectorstores import FAISS

def build_faiss_index(chunks, embed_model=EMBED_MODEL, index_dir=INDEX_DIR):
    """
    Build FAISS vector index from document chunks and save locally.
    """
    index_dir.mkdir(parents=True, exist_ok=True)

    print(f"🚀 Initializing embedding model: {embed_model}")
    embeddings = OllamaEmbeddings(model=embed_model)

    print("⚙️ Building FAISS index ...")
    vectorstore = FAISS.from_documents(chunks, embeddings)

    vectorstore.save_local(str(index_dir))
    print(f"✅ FAISS vector store built and saved to: {index_dir.resolve()}")
    return vectorstore


In [35]:
# ============================================
# 🧭 Pipeline Execution
# ============================================
if __name__ == "__main__":
    raw_docs = load_pdfs_from_folder(DATA_DIR)
    if not raw_docs:
        print("❌ No documents loaded — please add PDFs to the 'data' folder.")
    else:
        chunks = split_into_chunks(raw_docs)
        vectorstore = build_faiss_index(chunks)
        print("🎯 Stage 1 complete: documents indexed and ready for retrieval.")

📚 Found 26 PDF(s) in C:\Users\Nahid\DMP-RAG\notebook\data
🔹 Loading: 1-Clinical andor MRI data from human research participants-NIMH.pdf
🔹 Loading: 10-Clinical Data from Human Research Participants-NIDDK.pdf
🔹 Loading: 11-Basic Research from a Non-Human Source Example-NIDDK.pdf
🔹 Loading: 12-Secondary Data Analysis Example-NIDDK.pdf
🔹 Loading: 13-Survey and Interview Example-NHGRI.pdf
🔹 Loading: 14-Human Clinical Trial Data-NICHD.pdf
🔹 Loading: 15-Clinical data from human research participants-NIA.pdf
🔹 Loading: 16-Survey, interview, and biological data (tiered access)-NIA.pdf
🔹 Loading: 17-Non-human data (primates)-NIA.pdf
🔹 Loading: 18-Secondary data analysis-NIA.pdf
🔹 Loading: 19-Survey and interview data-NIA.pdf
🔹 Loading: 2-Genomic data from human research participants-NIMH.pdf
🔹 Loading: 20-Human clinical and genomic data-NIA.pdf
🔹 Loading: 21-Non-human data (rodents)-NIA.pdf
🔹 Loading: 22-Clinical data (human biospecimens)-NIA.pdf
🔹 Loading: 23-Drug discovery including intellect

In [43]:
# ============================================
# 🧩 Load FAISS Index + Create Retriever
# ============================================
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings

# ---- Paths and Parameters ----
INDEX_DIR   = Path("data/faiss_index")   # same folder where Stage 1 stored your index
EMBED_MODEL = "nomic-embed-text"         # must match the model used to build the index
TOP_K       = 3                          # how many relevant chunks to retrieve for each query

def load_retriever(index_dir=INDEX_DIR, embed_model=EMBED_MODEL, k=TOP_K):
    """
    Loads the FAISS index built earlier and creates a retriever object
    for Retrieval-Augmented Generation (RAG).
    """
    print("🚀 Loading FAISS index from:", index_dir.resolve())

    # Initialize embedding model
    embeddings = OllamaEmbeddings(model=embed_model)

    # Load FAISS vector store from local folder
    vectorstore = FAISS.load_local(
        str(index_dir),
        embeddings,
        allow_dangerous_deserialization=True
    )

    # Create retriever for semantic search
    retriever = vectorstore.as_retriever(search_kwargs={"k": k})
    print(f"✅ Retriever ready (top_k={k}) using embedding: {embed_model}\n")

    return retriever


# ---- Quick Test ----
retriever = load_retriever()

# Optional sanity-check: try one retrieval
test_query = "How should NIH Data Management Plans handle genomic data and consent?"
docs = retriever.get_relevant_documents(test_query)
print(f"🔎 Retrieved {len(docs)} relevant chunks.")
if docs:
    print("🧩 Sample chunk preview:\n")
    print(docs[0].page_content[:400], "...\n")


🚀 Loading FAISS index from: C:\Users\Nahid\DMP-RAG\notebook\data\faiss_index
✅ Retriever ready (top_k=3) using embedding: nomic-embed-text



  docs = retriever.get_relevant_documents(test_query)


🔎 Retrieved 3 relevant chunks.
🧩 Sample chunk preview:

DATA MANAGEMENT AND SHARING PLAN 
An example from an application proposing to collect clinical and MRI data from human subjects. 
If any of the proposed research in the application involves the generation of scientific data, this application is subject to the NIH Policy 
for Data Management and Sharing and requires submission of a Data Management and Sharing Plan. If the proposed research in the 
 ...



In [None]:
# ============================================
# 📦 Core Imports
# ============================================
import os
from pathlib import Path
import pandas as pd
import pypandoc
from dotenv import load_dotenv

# ============================================
# 🧠 LangChain & Ollama Components
# ============================================
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# ============================================
# ⚙️ Configuration
# ============================================
load_dotenv()

INDEX_DIR   = Path("data/faiss_index")
EXCEL_PATH  = Path("inputs/inputs.xlsx")
TEMPLATE_PATH = Path("inputs/dmp-template.md")

OUTPUT_MD   = Path("outputs/markdown")
OUTPUT_DOCX = Path("outputs/docx")

EMBED_MODEL = "nomic-embed-text"
LLM_MODEL   = "llama4"
TOP_K       = 3

# ============================================
# 📁 Function3: Create a Directory if it Doesn't Exist
# ============================================
def create_folder(folderpath):
    """Creates a folder at the specified path if it doesn't already exist."""
    if not os.path.exists(folderpath):
        os.makedirs(folderpath)

# ============================================
# 📄 Function4: Save Markdown Files
# ============================================
def save_md(folderpath, filename, response):
    """Saves a given response as a Markdown (.md) file."""
    create_folder(folderpath)
    filepath = os.path.join(folderpath, filename)
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(response)
    print("💾 Saved:", filepath)

# ============================================
# 📑 Function5: Convert Markdown to DOCX
# ============================================
def md_to_docs(md_filepath, docx_folderpath, docx_filename):
    """Converts a Markdown file to a Word (.docx) document using pypandoc."""
    create_folder(docx_folderpath)
    docx_filepath = os.path.join(docx_folderpath, docx_filename)
    pypandoc.convert_file(md_filepath, "docx", outputfile=docx_filepath)
    print("📄 Converted:", docx_filepath)

# ============================================
# 📊 Step 1: Load Excel Inputs and DMP Template
# ============================================
if EXCEL_PATH.exists():
    df = pd.read_excel(EXCEL_PATH)
    print("✅ Excel file loaded successfully!")
    print(f"📄 Loaded {len(df)} rows from {EXCEL_PATH}")
else:
    raise FileNotFoundError(f"❌ The Excel file '{EXCEL_PATH}' was not found.")

if TEMPLATE_PATH.exists():
    with open(TEMPLATE_PATH, "r", encoding="utf-8") as file:
        dmp_template_text = file.read()
    print("✅ DMP template loaded successfully!")
else:
    raise FileNotFoundError(f"❌ The template file '{TEMPLATE_PATH}' was not found.")

# ============================================
# 🧩 Step 2: Load FAISS Index + Create Retriever
# ============================================
def load_retriever(index_dir=INDEX_DIR, embed_model=EMBED_MODEL, k=TOP_K):
    """Loads the FAISS index and creates a retriever object."""
    print("🚀 Loading FAISS index from:", index_dir.resolve())
    embeddings = OllamaEmbeddings(model=embed_model)
    vectorstore = FAISS.load_local(
        str(index_dir), embeddings, allow_dangerous_deserialization=True
    )
    retriever = vectorstore.as_retriever(search_kwargs={"k": k})
    print(f"✅ Retriever ready (top_k={k}) using embedding: {embed_model}\n")
    return retriever

retriever = load_retriever()

# ============================================
# 🧠 Step 3: Build the RAG Chain
# ============================================
def build_rag_chain(retriever, llm_model=LLM_MODEL):
    """Builds a Retrieval-Augmented Generation (RAG) chain."""
    llm = Ollama(model=llm_model)
    prompt_template = """
    You are an expert biomedical data steward and grant writer.
    Your goal is to create a high-quality NIH Data Management and Sharing Plan (DMSP)
    based on the question and the retrieved NIH context.

    Answer the question based on the context provided below.
    If the context does not contain sufficient information, respond with:
    "I do not have enough information about this."

    ----
    Context:
    {context}

    Question:
    {question}
    """
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )
    parser = StrOutputParser()

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | parser
    )

    print(f"🔗 RAG chain initialized using model: {llm_model}\n")
    return rag_chain

rag_chain = build_rag_chain(retriever)

# ============================================
# 🧩 Step 4: Generate Queries and Run RAG
# ============================================
def generate_query(row):
    """Builds a DMP-specific query from Excel row data."""
    query_initiate = (
        "You are an expert biomedical grant writer and data steward. "
        "Create a Data Management and Sharing Plan (DMSP) for a grant proposal being submitted to the NIH."
    )
    query_funding_agency = f"Specifically targeting the {row['institute']}."
    query_element1a = f"Here are the details about the data to be collected:\n{row['element_1A']}\n"

    if "yes" in str(row.get("isHumanStudy", "")).lower():
        query_consent_type = (
            f"This proposal includes a study involving human participants. "
            f"{row.get('consentDescription', '')}."
        )
    else:
        query_consent_type = ""

    # ✅ FIXED: use the loaded Markdown template text, not a filename
    query_template = (
        "Provide the result using exactly this markdown format template of the DMSP provided by the NIH "
        "without changing it (keep all the titles and sections as is):\n"
        + dmp_template_text
    )

    return " ".join([
        query_initiate,
        query_funding_agency,
        query_element1a,
        query_consent_type,
        query_template
    ])

# ============================================
# 🚀 Step 5: Generate and Save DMPs
# ============================================
for idx, row in df.iterrows():
    print(f"\n🧩 Generating DMP for: {row['institute']}")
    query = generate_query(row)

    # --- Run through RAG pipeline ---
    response = rag_chain.invoke(query)

    # --- Save Markdown ---
    filename_md = f"DMP_{row['institute'].replace(' ', '_').replace('(', '').replace(')', '')}.md"
    md_path = OUTPUT_MD / filename_md
    save_md(str(OUTPUT_MD), filename_md, response)

    # --- Convert Markdown → DOCX ---
    filename_docx = filename_md.replace(".md", ".docx")
    md_to_docs(str(md_path), str(OUTPUT_DOCX), filename_docx)

print("\n🎯 All NIH DMPs generated, saved as Markdown, and converted to DOCX!")


✅ Excel file loaded successfully!
📄 Loaded 26 rows from inputs\inputs.xlsx
✅ DMP template loaded successfully!
🚀 Loading FAISS index from: C:\Users\Nahid\DMP-RAG\notebook\data\faiss_index
✅ Retriever ready (top_k=3) using embedding: nomic-embed-text

🔗 RAG chain initialized using model: llama4


🧩 Generating DMP for: National Institute of Mental Health (NIMH)
💾 Saved: outputs\markdown\DMP_National_Institute_of_Mental_Health_NIMH.md
📄 Converted: outputs\docx\DMP_National_Institute_of_Mental_Health_NIMH.docx

🧩 Generating DMP for: National Institute of Mental Health (NIMH)
💾 Saved: outputs\markdown\DMP_National_Institute_of_Mental_Health_NIMH.md
📄 Converted: outputs\docx\DMP_National_Institute_of_Mental_Health_NIMH.docx

🧩 Generating DMP for: National Institute of Mental Health (NIMH)
💾 Saved: outputs\markdown\DMP_National_Institute_of_Mental_Health_NIMH.md
📄 Converted: outputs\docx\DMP_National_Institute_of_Mental_Health_NIMH.docx

🧩 Generating DMP for: National Institute of Mental Heal