# Using LLMs to Generate NIH Data Management Plans (DMPs)

## Step 1: Import Required Libraries


In [None]:
import ollama  # Local LLMs like Llama3 
import openai  # GPT-4 API
import pypandoc  # Convert Markdown to DOCX
import pandas as pd  # Handle structured data
import os  # File handling
from dotenv import load_dotenv  # Load API keys securely
# Must run before the os.getenv lookup below so that values supplied through
# python-dotenv (presumably a local .env file — confirm location) are visible.
load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is not defined; nothing here
# checks for that, so a missing key only surfaces when OpenAI is first called.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Set API keys
openai.api_key = OPENAI_API_KEY



## Step 2: Define Functions

**Function 1**  Query LLaMA 3.3 (Ollama - Local Models)
This function is responsible for sending prompts to a locally hosted LLaMA 3 model using the Ollama interface and retrieving responses.

**Function 2**  Query OpenAI
This function is responsible for sending prompts to GPT models through the OpenAI API and retrieving responses.

**Function 3** Create a Directory if it Doesn't Exist
Ensures that a specified directory exists. If it does not, the function creates it.

**Function 4** Save Markdown Files
Saves the generated text content into a `.md` (Markdown) file format for documentation or further processing.

**Function 5**  Convert Markdown to DOCX
Takes markdown-formatted content and converts it into a `.docx` Word document, enabling easy viewing and sharing.


In [5]:

#Function1:Query Llama 3 & DeepSeek (Ollama - Local Models)
def ask_llm_ollama(model, query):
    """
    Send a single-turn prompt to a local model through Ollama.

    Args:
        model (str): Ollama model name; the model must already be installed locally.
        query (str): Prompt text, sent as one "user" message.

    Returns:
        str: The text content of the model's reply message.
    """
    user_message = {"role": "user", "content": query}
    reply = ollama.chat(model=model, messages=[user_message])
    answer = reply['message']['content']
    return answer

In [None]:
#Function 2: Querying the LLM via OpenAI API
import openai

def ask_llm_openai(model, query, temperature=0.7):
    """
    Queries an LLM model via OpenAI API and returns the response.

    Args:
        model (str): Name of the OpenAI model (e.g., "gpt-4-turbo").
        query (str): Query string to send to the model, as one "user" message.
        temperature (float): Sampling temperature forwarded to the API.
            Defaults to 0.7, the value previously hard-coded here; pass a
            lower value (e.g. 0) when more repeatable outputs are needed.

    Returns:
        str: Content of the first choice returned by the model.
    """
    # A fresh client is built on every call with no explicit arguments, so
    # credentials come from the library's defaults (presumably the
    # OPENAI_API_KEY environment variable — confirm against the SDK docs).
    client = openai.OpenAI()

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": query}],
        temperature=temperature,
    )

    # Only the first completion is used; no `n` is requested above.
    return response.choices[0].message.content

In [7]:
#Function3 : Create a Directory if it Doesn't Exist
def create_folder(folderpath):
    """
    Creates a folder at the specified path if it doesn't already exist.

    Missing parent folders are created as well. Calling this for a folder
    that already exists is a no-op.

    Args:
        folderpath (str): The path of the folder to create.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    # exist_ok=True replaces the separate "exists?" check: it cannot race with
    # another process creating the folder between the check and the creation,
    # and it reports a path that is occupied by a regular file instead of
    # silently skipping it.
    os.makedirs(folderpath, exist_ok=True)

In [None]:
#Function4: Save Markdown Files
def save_md(folderpath, filename, response):
    """
    Write model output to a Markdown (.md) file and report where it went.

    The target folder is created first if needed, the text is written as
    UTF-8 (overwriting any existing file), and "<path> saved" is printed.

    Args:
        folderpath (str): Folder that will contain the file.
        filename (str): File name, including the extension.
        response (str): Text to write.
    """
    create_folder(folderpath)
    target = os.path.join(folderpath, filename)
    with open(target, mode="w", encoding="utf-8") as handle:
        handle.write(response)
    print(f"{target} saved")


In [None]:
#Function5 : Convert Markdown to DOCX
def md_to_docs(md_filepath, docx_folderpath, docx_filename):
    """
    Render a Markdown file as a Word (.docx) document via pypandoc.

    The destination folder is created first if needed, and "<path> saved"
    is printed after the conversion.

    Args:
        md_filepath (str): Path of the Markdown file to convert.
        docx_folderpath (str): Folder that will receive the DOCX file.
        docx_filename (str): File name of the DOCX file.
    """
    create_folder(docx_folderpath)
    target = os.path.join(docx_folderpath, docx_filename)
    pypandoc.convert_file(md_filepath, 'docx', outputfile=target)
    print(f"{target} saved")


## Step 3: Load Input Files

In this step, the script defines paths for input files and loads them into memory:

- **Excel File**: Contains the structured DMP data to be processed.
- **Markdown Template**: Serves as the base template for generating customized DMP content.


In [11]:

# Define file paths
import os
import pandas as pd
excel_path = "inputs/inputs.xlsx"
template_path = "inputs/dmp-template.md"

# Excel data: stop immediately if the file is absent, otherwise load it.
if not os.path.exists(excel_path):
    raise FileNotFoundError(f" Error: The file '{excel_path}' was not found.")
df = pd.read_excel(excel_path)
print(" Excel file loaded successfully!")

# Markdown template: same pattern, read as UTF-8 text into memory.
if not os.path.exists(template_path):
    raise FileNotFoundError(f" Error: The file '{template_path}' was not found.")
with open(template_path, mode='r', encoding='utf-8') as template_file:
    dmp_template_text = template_file.read()
print(" DMP template loaded successfully!")


 Excel file loaded successfully!
 DMP template loaded successfully!


## Step 4: Generate Model Outputs for NIH DMPs Using GPT and LLaMA

This step generates Data Management and Sharing Plans (DMSPs) using both OpenAI (GPT-4.1) and Ollama (LLaMA3.3) in a **single run**. Each model receives a customized prompt generated from structured metadata.

###  Inputs

- `df`: DataFrame containing DMP sample metadata (e.g., title, institute, human study flag, element_1A).
- `dmp_template_text`: The official NIH markdown-formatted DMP template.
- Model lists:
  - `openai_models = ["gpt-4.1"]`
  - `ollama_models = ["llama3.3"]`

### Query Generation

- A detailed, context-aware query is created for each DMP using:
  - Targeted funding agency
  - Project-specific data collection (element 1A)
  - Human participant considerations (if applicable)
  - Fixed NIH-compliant markdown structure

The final query is stored in the `Generated Query` column and saved to `outputs/with_generated_queries.csv`.

---

### Prompting (Single Run)

- A clean folder structure is created:
  - `outputs/Gpt_result/approach1_outputs_Gpt/`
  - `outputs/Llama_result/approach1_outputs_Llama/`

- Each DMP is processed using the generated query:
  - **Input query saved as `.docx` for traceability**
  - **Model output saved as `.md` (markdown)**
  - Markdown files are converted to `.docx` for review.

---

### Output Artifacts

For each DMP:
- Input query saved as: `DMP_Title_query.docx`
- Model outputs saved as:
  - `openai-gpt-4.1.md` / `.docx`
  - `ollama-llama3.3.md` / `.docx`

This setup keeps the generation process simple, repeatable, and fully traceable, supporting downstream evaluation and comparison.

In [None]:

# List of LLM models to be used (Ollama & OpenAI)
import ollama
import pandas as pd
import os
import pandas as pd
import re
from docx import Document
# Model names passed to the Ollama and OpenAI query functions, respectively.
ollama_models = ["llama3.3"]
openai_models = ["gpt-4.1"]


# Opening instruction of every prompt for the NIH Data Management and Sharing
# Plan (DMSP). It ends with ", " because the funding-agency sentence built in
# generate_query is appended right after it.
# NOTE(review): generate_query joins its parts with " ", so this trailing
# space produces a double space in the final prompt — confirm this is acceptable.
query_initiate = "You are an expert biomedical grant writer and data steward. Create a Data Management and Sharing Plan (DMSP) for a grant proposal being submitted to the National Institutes of Health (NIH), "

# Closing instruction of every prompt: asks for output in the NIH template's
# exact markdown structure, followed by the template text loaded earlier.
# NOTE(review): the template is concatenated directly after the colon with no
# space or newline, so its first line shares a line with the instruction —
# confirm this is intended.
query_template = "Provide the result using exactly this markdown format template of the DMSP provided by the NIH without changing it (keep all the titles and sections as is) to give me a well-formatted markdown output that follows the NIH-required format:" + dmp_template_text

# Function to generate query from a row
def generate_query(row):
    """
    Builds the full LLM prompt for one DMP record.

    The prompt is the module-level `query_initiate`, a funding-agency
    sentence, the element 1A details, an optional human-participants
    sentence, and the module-level `query_template`, joined by single spaces.

    Missing cells (None/NaN) are left out of the prompt instead of being
    rendered as the literal text "nan". Rows with all fields populated
    produce exactly the same prompt as before.

    Args:
        row: One record of the input table (e.g. a pandas Series) providing
            'institute', 'element_1A', 'isHumanStudy' and 'consentDescription'.

    Returns:
        str: The assembled prompt.
    """
    def cell_text(value):
        # Assumes scalar cell values; pd.isna flags None and NaN.
        return "" if pd.isna(value) else str(value)

    institute = cell_text(row['institute'])
    query_funding_agency = (
        f"Specifically targeting the {institute}." if institute else ""
    )

    details = cell_text(row['element_1A'])
    query_Element1_A = (
        f"Here are the details about the data to be collected:\n{details}\n"
        if details
        else ""
    )

    # Any value containing "yes" (case-insensitive) marks a human study.
    query_consent_type = ""
    if "yes" in str(row['isHumanStudy']).lower():
        query_consent_type = (
            "This proposal includes a study that will involve human participants."
        )
        consent = cell_text(row['consentDescription'])
        if consent:
            query_consent_type += f" {consent}."

    # Empty parts are kept in the list on purpose so the spacing of prompts
    # for fully populated rows stays byte-identical to earlier runs.
    return " ".join([
        query_initiate,
        query_funding_agency,
        query_Element1_A,
        query_consent_type,
        query_template,
    ])


# Apply the function to each row and store in a new column
df["Generated Query"] = df.apply(generate_query, axis=1)

# The "outputs" folder is not created anywhere before this point (the result
# folders are only made by the generation step further down), and to_csv does
# not create missing parent directories, so create it here.
os.makedirs("outputs", exist_ok=True)
df.to_csv("outputs/with_generated_queries.csv", index=False)


In [None]:
import os
import re
import pandas as pd
from docx import Document

# Utility
def clean_filename(name: str, fallback: str = "untitled_dmp") -> str:
    """
    Turns a DMP title into a string that is safe to use as a file or folder name.

    Leading/trailing whitespace is stripped, and path separators, characters
    reserved on Windows (\\ / * ? : " < > |) and control characters (such as
    an embedded line break) are each replaced by "_".

    Args:
        name: The title to sanitise. Non-string values are converted with str().
        fallback: Name returned when the title is missing (None or NaN) or
            empty after stripping. Defaults to "untitled_dmp".

    Returns:
        str: The sanitised name, or `fallback` if nothing usable remains.
    """
    # A missing value would otherwise become the folder name "None"/"nan".
    # NaN is the only float that is not equal to itself.
    if name is None or (isinstance(name, float) and name != name):
        return fallback

    cleaned = re.sub(r'[\\/*?:"<>|\x00-\x1f]', "_", str(name).strip())

    # An empty name would make every such DMP write into the same base folder.
    return cleaned or fallback


def save_query_doc(folderpath: str, name: str, query: str) -> None:
    """
    Store the prompt sent to the models as a Word document.

    The document holds a level-1 heading "Generated DMP Query" followed by
    the prompt as a single paragraph, and is written to
    "<folderpath>/<name>_query.docx". The folder is not created here.

    Args:
        folderpath: Folder that receives the document.
        name: Base name of the file, without the "_query.docx" suffix.
        query: Prompt text to record.
    """
    target_path = os.path.join(folderpath, f"{name}_query.docx")

    document = Document()
    document.add_heading("Generated DMP Query", level=1)
    document.add_paragraph(query)
    document.save(target_path)


def process_models(
    models,
    query_func,
    model_prefix: str,
    base_folder: str,
    query: str,
    dmp_name_clean: str
) -> None:
    """
    Runs one prompt through every model of a family and stores the results.

    Creates "<base_folder>/<dmp_name_clean>/", saves the prompt there as a
    .docx, then for each model saves the response as
    "<model_prefix>-<model>.md" and converts that file to .docx.

    Args:
        models: Iterable of model names.
        query_func: Callable taking (model, query) and returning the
            response text (e.g. ask_llm_openai or ask_llm_ollama).
        model_prefix: Provider label used as the file-name prefix.
        base_folder: Root output folder for this model family.
        query: The prompt to send.
        dmp_name_clean: File-system-safe DMP name, used as the subfolder
            name and in the prompt document's file name.
    """
    folderpath = os.path.join(base_folder, dmp_name_clean)
    os.makedirs(folderpath, exist_ok=True)

    # Save the input query once per DMP (per model family folder)
    save_query_doc(folderpath, dmp_name_clean, query)

    for model in models:
        response = query_func(model, query)

        # ":" (model tags) is not allowed in Windows file names, and a path
        # separator in a model name (e.g. a namespaced model) would point the
        # output at a subfolder that does not exist.
        modelname = model.replace(":", "-").replace("/", "-").replace("\\", "-")

        # Build both names from one stem. Deriving the .docx name with
        # str.replace(".md", ".docx") would also rewrite any ".md" that
        # happens to occur inside the model name.
        stem = f"{model_prefix}-{modelname}"
        filename_md = f"{stem}.md"
        filename_docx = f"{stem}.docx"

        save_md(folderpath, filename_md, response)
        md_to_docs(
            os.path.join(folderpath, filename_md),
            folderpath,
            filename_docx
        )


# Single Run Generation

print(" Single run started...")

# One output root per model family (single run).
output_base_openai = "outputs/Gpt_result/approach1_outputs_Gpt"
output_base_ollama = "outputs/Llama_result/approach1_outputs_Llama"
for base_dir in (output_base_openai, output_base_ollama):
    os.makedirs(base_dir, exist_ok=True)

# (models, query function, file-name prefix, output root), in the order the
# families are run for each DMP: OpenAI first, then Ollama.
model_families = (
    (openai_models, ask_llm_openai, "openai", output_base_openai),
    (ollama_models, ask_llm_ollama, "ollama", output_base_ollama),
)

# Every DMP row is sent to every model family with its pre-generated query.
for _, row in df.iterrows():
    query = row["Generated Query"]
    dmp_name_clean = clean_filename(row.get("title", "untitled_dmp"))

    for family_models, family_query_func, family_prefix, base_dir in model_families:
        process_models(
            family_models,
            family_query_func,
            family_prefix,
            base_dir,
            query,
            dmp_name_clean,
        )

print(" Single run completed. Results are saved.")

 Single run started...
outputs/Gpt_result/approach1_outputs_Gpt\Clinical and MRI data from human research participants\openai-gpt-4.1.md saved
outputs/Gpt_result/approach1_outputs_Gpt\Clinical and MRI data from human research participants\openai-gpt-4.1.docx saved
outputs/Llama_result/approach1_outputs_Llama\Clinical and MRI data from human research participants\ollama-llama3.3.md saved
outputs/Llama_result/approach1_outputs_Llama\Clinical and MRI data from human research participants\ollama-llama3.3.docx saved
outputs/Gpt_result/approach1_outputs_Gpt\Genomic data from human research participants\openai-gpt-4.1.md saved
outputs/Gpt_result/approach1_outputs_Gpt\Genomic data from human research participants\openai-gpt-4.1.docx saved
outputs/Llama_result/approach1_outputs_Llama\Genomic data from human research participants\ollama-llama3.3.md saved
outputs/Llama_result/approach1_outputs_Llama\Genomic data from human research participants\ollama-llama3.3.docx saved
outputs/Gpt_result/approa