In [2]:
# Sanity check: print the file path of the dmpchef package this kernel imports.
import dmpchef
print(dmpchef.__file__)

c:\Users\Nahid\dmpchef\dmpchef\__init__.py


In [3]:
# ============================================================
# DMP Chef — Step-by-step Notebook Demo (No-RAG + RAG)
# Goal: Show that dmpchef can be imported and run anywhere
# ============================================================

# ----------------------------
# STEP 1 — Confirm Notebook Working Directory (cwd)
# ----------------------------
import os
from pathlib import Path

# Report the kernel's working directory and confirm it exists on disk.
print("STEP 1) Notebook current working directory (cwd):")
print(f"  cwd: {os.getcwd()}")
print(f"  cwd exists: {Path(os.getcwd()).exists()}")

STEP 1) Notebook current working directory (cwd):
  cwd: c:\Users\Nahid\dmpchef
  cwd exists: True


In [4]:
# ----------------------------
# STEP 2 — Auto-detect the Repo Root (no os.chdir needed)
#   Check the cwd and then each parent directory, nearest first, for
#   setup.py (the repo root marker). The first directory that has it wins.
#   If none has it, fall back to the cwd and print a warning instead of
#   silently ending up at the filesystem root.
# ----------------------------
for repo_root in (Path.cwd().resolve(), *Path.cwd().resolve().parents):
    if (repo_root / "setup.py").exists():
        break
else:
    # for/else: this branch runs only when the loop finished without `break`,
    # i.e. no candidate directory contained setup.py.
    repo_root = Path.cwd().resolve()
    print("\nWARNING: setup.py not found in the cwd or any parent directory; "
          "using the cwd as repo_root.")

print("\nSTEP 2) Detected repo root:")
print("  repo_root:", repo_root)


STEP 2) Detected repo root:
  repo_root: C:\Users\Nahid\dmpchef


In [5]:
# ----------------------------
# STEP 3 — Verify dmpchef is importable and show where it loads from
#   This proves it can be imported by notebooks / backends (e.g., DMPTool)
# ----------------------------
import dmpchef

# Show the resolved location of the package and the names it exports;
# an empty list is shown when the package defines no __all__.
print("\nSTEP 3) Verify package import + exports:")
print(f"  dmpchef module file: {Path(dmpchef.__file__).resolve()}")
print(f"  exported symbols (__all__): {getattr(dmpchef, '__all__', [])}")



STEP 3) Verify package import + exports:
  dmpchef module file: C:\Users\Nahid\dmpchef\dmpchef\__init__.py
  exported symbols (__all__): ['generate', 'draft', 'prepare_nih_corpus']


In [6]:
# ----------------------------
# STEP 4 — (One-time) Prepare NIH corpus for RAG (heavy step)
#   This runs your NIH ingestion script and populates data/NIH_95
#   Skip this step if you only want No-RAG.
# ----------------------------
from dmpchef import prepare_nih_corpus

print("\nSTEP 4) Prepare NIH corpus for RAG (one-time setup):")
prep_info = prepare_nih_corpus()

# Echo the three locations reported back by the preparation step.
print(f"  data_root: {prep_info['data_root']}")
print(f"  export_pdf_folder: {prep_info['export_pdf_folder']}")
print(f"  json_links: {prep_info['json_links']}")




STEP 4) Prepare NIH corpus for RAG (one-time setup):
✅ Loaded links from: C:\Users\Nahid\dmpchef\data\web_links.json (sources=1)
♻️ Loaded previous session hashes:
   - dmptool.org: 0 known hashes

✅ data_root: C:\Users\Nahid\dmpchef\data
✅ Session Folder Created: C:\Users\Nahid\dmpchef\data\data_ingestion\2026_02_07_NIH_ingestion_20260207_043838

♻️ Copy-forward from previous session: 2026_02_07_NIH_ingestion_20260207_043043
✅ Copy-forward complete.
🧹 Cleaning up old sessions — keeping last 2:
   ✅ keep: 2026_02_07_NIH_ingestion_20260207_043838
   ✅ keep: 2026_02_07_NIH_ingestion_20260207_043043
🗑️ Removed old session: C:\Users\Nahid\dmpchef\data\data_ingestion\2026_02_07_NIH_ingestion_20260207_041952
🚀 Starting crawl.
🌐 Crawling DMPTool: https://dmptool.org/public_plans?search=&facet%5Bfunder_ids%5D%5B%5D=123&sort_by=featured


Get LATEST chromedriver version for google-chrome
Get LATEST chromedriver version for google-chrome
Driver [C:\Users\Nahid\.wdm\drivers\chromedriver\win64\143.0.7499.192\chromedriver-win32/chromedriver.exe] found in cache


🔎 Current URL: https://dmptool.org/public_plans?search=&facet%5Bfunder_ids%5D%5B%5D=123&sort_by=featured
✅ Detected filtered plan count: 106


DMPTool NIH PDFs:  99%|█████████▉| 105/106 [00:43<00:00,  2.41pdf/s]


✅ Manifest written: C:\Users\Nahid\dmpchef\data\data_ingestion\2026_02_07_NIH_ingestion_20260207_043838\dmptool.org\manifest_dmptool_org.json
✅ DMPTool crawl completed — downloaded=105 already_have=0
✅ Exported 0 NEW PDFs to: C:\Users\Nahid\dmpchef\data\NIH_95
♻️ Skipped 105 PDFs already in destination (by hash)
🏁 All crawls complete.
  data_root: C:\Users\Nahid\dmpchef\data
  export_pdf_folder: C:\Users\Nahid\dmpchef\data\NIH_95
  json_links: C:\Users\Nahid\dmpchef\data\web_links.json


In [7]:
# ----------------------------
# STEP 5 — Run No-RAG and show clean output paths
# ----------------------------
from dmpchef import generate

# Location of the input JSON, built from the detected repo root
input_path = repo_root.joinpath("data", "inputs", "input.json")
print("\nSTEP 5) Input file used for both runs:")
print(f"  input.json: {input_path}")


def show_summary(step_name: str, out: dict) -> None:
    """Print a readable summary of one ``generate`` run.

    Args:
        step_name: Heading printed at the top of the summary.
        out: Result mapping returned by ``generate``. Keys that are missing
            print as ``None``.

    The PDF line shows "(not generated)" when ``out`` has no PDF path or when
    the path it names is not a file on disk. The result mapping can carry a
    PDF path even for a run made with ``export_pdf=False`` (the recorded
    No-RAG output of this notebook shows one), so the presence of a path is
    not proof that a PDF was written. A PDF left on disk by an earlier run
    with the same name is still reported.
    """
    import os  # local import so the function does not depend on another cell

    pdf_path = out.get("pdf")
    # Only report the PDF when it actually exists as a file.
    pdf_shown = pdf_path if pdf_path and os.path.isfile(pdf_path) else "(not generated)"

    print("\n" + "=" * 78)
    print(step_name)
    print("-" * 78)
    print("Funding agency :", out.get("funding_agency"))
    print("RAG enabled    :", out.get("use_rag"))
    print("Run ID         :", out.get("run_stem"))
    print("-" * 78)
    print("Markdown       :", out.get("markdown"))
    print("DOCX           :", out.get("docx"))
    print("DMPTool JSON   :", out.get("dmptool_json"))
    print("PDF            :", pdf_shown)
    print("=" * 78)


# Call generate() on input_path with use_rag=False and export_pdf=False.
# The result is kept in outputs_no_rag (the final cell displays it), then summarized.
print("\nSTEP 5A) Run No-RAG:")
outputs_no_rag = generate(input_path, export_pdf=False, use_rag=False)
show_summary("NO-RAG RESULT", outputs_no_rag)

{"timestamp": "2026-02-07T12:39:28.131178Z", "level": "info", "event": "\u2705 Config loaded successfully"}
{"llm": "llama3.3", "embed": "sentence-transformers/all-MiniLM-L6-v2", "hf_cache_dir": "data/cache/hf", "local_files_only": true, "timestamp": "2026-02-07T12:39:28.134177Z", "level": "info", "event": "ModelLoader initialized"}
  self.llm = Ollama(model=self.llm_name)
{"llm": "llama3.3", "rag_default": true, "timestamp": "2026-02-07T12:39:28.135709Z", "level": "info", "event": "\u2705 DMPPipeline initialized"}
{"funding_agency": "NIH", "timestamp": "2026-02-07T12:39:28.135709Z", "level": "info", "event": "\ud83c\udff7\ufe0f Funding agency selected"}
{"use_rag_input": false, "rag_default": true, "use_rag_final": false, "timestamp": "2026-02-07T12:39:28.136712Z", "level": "info", "event": "\ud83e\uddfe RAG decision"}
{"llm": "llama3.3", "timestamp": "2026-02-07T12:39:28.137712Z", "level": "info", "event": "\ud83d\udd17 No-RAG chain built successfully"}



STEP 5) Input file used for both runs:
  input.json: C:\Users\Nahid\dmpchef\data\inputs\input.json

STEP 5A) Run No-RAG:


{"title": "NIH Data Management and Sharing Plan: Clinical and MRI Data from Human Research Participants", "funding_agency": "NIH", "use_rag": false, "md": "data\\outputs\\markdown\\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__norag__llama3.3.md", "docx": "data\\outputs\\docx\\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__norag__llama3.3.docx", "dmptool_json": "data\\outputs\\json\\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__norag__llama3.3.dmptool.json", "timestamp": "2026-02-07T12:40:39.231909Z", "level": "info", "event": "\u2705 DMP generated successfully"}



NO-RAG RESULT
------------------------------------------------------------------------------
Funding agency : NIH
RAG enabled    : False
Run ID         : NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__norag__llama3.3
------------------------------------------------------------------------------
Markdown       : C:\Users\Nahid\dmpchef\data\outputs\markdown\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__norag__llama3.3.md
DOCX           : C:\Users\Nahid\dmpchef\data\outputs\docx\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__norag__llama3.3.docx
DMPTool JSON   : C:\Users\Nahid\dmpchef\data\outputs\json\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__norag__llama3.3.dmptool.json
PDF            : C:\Users\Nahid\dmpchef\data\outputs\pdf\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human

In [8]:
# ----------------------------
# STEP 6 — Run RAG and show clean output paths
#   Requires Step 4 (NIH corpus prepared) + your index present (if needed)
#   Same input_path as the No-RAG run, this time with use_rag=True and
#   export_pdf=True; the result is kept in outputs_rag for the final cell.
# ----------------------------
print("\nSTEP 6) Run RAG:")
outputs_rag = generate(input_path, export_pdf=True, use_rag=True)
show_summary("RAG RESULT", outputs_rag)

{"timestamp": "2026-02-07T12:40:39.303985Z", "level": "info", "event": "\u2705 Config loaded successfully"}
{"llm": "llama3.3", "embed": "sentence-transformers/all-MiniLM-L6-v2", "hf_cache_dir": "data/cache/hf", "local_files_only": true, "timestamp": "2026-02-07T12:40:39.305988Z", "level": "info", "event": "ModelLoader initialized"}
{"llm": "llama3.3", "rag_default": true, "timestamp": "2026-02-07T12:40:39.305988Z", "level": "info", "event": "\u2705 DMPPipeline initialized"}
{"funding_agency": "NIH", "timestamp": "2026-02-07T12:40:39.306984Z", "level": "info", "event": "\ud83c\udff7\ufe0f Funding agency selected"}
{"use_rag_input": true, "rag_default": true, "use_rag_final": true, "timestamp": "2026-02-07T12:40:39.306984Z", "level": "info", "event": "\ud83e\uddfe RAG decision"}
  emb = HuggingFaceEmbeddings(
Use pytorch device_name: cpu
Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2



STEP 6) Run RAG:


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1643.85it/s, Materializing param=pooler.dense.weight]
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED: can be ignored when loading from different task/architecture; not ok if you expect identical arch.
{"model": "sentence-transformers/all-MiniLM-L6-v2", "cache_dir": "C:\\Users\\Nahid\\dmpchef\\data\\cache\\hf", "local_files_only": true, "timestamp": "2026-02-07T12:40:39.465640Z", "level": "info", "event": "Embeddings loaded successfully"}
{"path": "data\\index\\index.faiss", "timestamp": "2026-02-07T12:40:39.466643Z", "level": "info", "event": "\ud83d\udce6 Loading existing FAISS index"}
Loading faiss with AVX512 support.
Could not load library with AVX512 support due to:
ModuleNotFoundError("No m


RAG RESULT
------------------------------------------------------------------------------
Funding agency : NIH
RAG enabled    : True
Run ID         : NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__rag__k10__llama3.3
------------------------------------------------------------------------------
Markdown       : C:\Users\Nahid\dmpchef\data\outputs\markdown\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__rag__k10__llama3.3.md
DOCX           : C:\Users\Nahid\dmpchef\data\outputs\docx\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__rag__k10__llama3.3.docx
DMPTool JSON   : C:\Users\Nahid\dmpchef\data\outputs\json\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__rag__k10__llama3.3.dmptool.json
PDF            : C:\Users\Nahid\dmpchef\data\outputs\pdf\NIH Data Management and Sharing Plan_ Clinical and MRI Data fr




In [9]:
# ----------------------------
# STEP 7 — Return both outputs (so notebook displays them too)
#   The bare tuple is the last expression of the cell, so the notebook
#   renders both result mappings as the cell's output.
# ----------------------------
outputs_no_rag, outputs_rag

({'markdown': 'C:\\Users\\Nahid\\dmpchef\\data\\outputs\\markdown\\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__norag__llama3.3.md',
  'docx': 'C:\\Users\\Nahid\\dmpchef\\data\\outputs\\docx\\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__norag__llama3.3.docx',
  'dmptool_json': 'C:\\Users\\Nahid\\dmpchef\\data\\outputs\\json\\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__norag__llama3.3.dmptool.json',
  'pdf': 'C:\\Users\\Nahid\\dmpchef\\data\\outputs\\pdf\\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__norag__llama3.3.pdf',
  'funding_agency': 'NIH',
  'use_rag': 'False',
  'run_stem': 'NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__norag__llama3.3',
  'repo_root': 'C:\\Users\\Nahid\\dmpchef'},
 {'markdown': 'C:\\Users\\Nahid\\dmpchef\\data\\outputs\