In [1]:
import json
import re
from pathlib import Path
from typing import Optional, Dict, Any

# Make the project packages (`src`, `utils`) importable when this notebook is run directly.
# NOTE(review): this treats the kernel's current working directory as the repository
# root — confirm before launching the notebook from any other location.
import sys
REPO_ROOT = Path.cwd()  # working directory at execution time, used as the repo root
if str(REPO_ROOT) not in sys.path:
    # Prepend (rather than append) so the project imports below resolve against this checkout first.
    sys.path.insert(0, str(REPO_ROOT))

from src.core_pipeline import DMPPipeline
from utils.dmptool_json import build_dmptool_json
from utils.nih_docx_writer import build_nih_docx_from_template


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def safe_filename(title: str) -> str:
    """Convert a title into a string that is safe to use as a file name.

    Surrounding whitespace is trimmed, then every character matched by the
    pattern below (path separators, wildcards, colon, double quote, angle
    brackets and pipe) is replaced with an underscore.

    Args:
        title: Raw title text; ``None`` or an empty string is treated as "".

    Returns:
        The sanitized, trimmed title (possibly an empty string).
    """
    trimmed = (title or "").strip()
    sanitized = re.sub(r'[\\/*?:"<>|]', "_", trimmed)
    return sanitized.strip()

def ensure_dir(p: Path) -> None:
    """Create directory ``p`` (and any missing parents) if it does not exist yet.

    Calling this on a directory that already exists is a no-op.

    Args:
        p: Directory path to create.
    """
    p.mkdir(exist_ok=True, parents=True)

def cleanup_title_json(out_json_dir: Path, file_stem: str) -> None:
    """Delete stale JSON files for one run, keeping only ``<file_stem>.dmptool.json``.

    Every file in ``out_json_dir`` whose name starts with ``file_stem`` and ends
    with ``.json`` is removed, except the dmptool export itself. Deletion is
    best-effort: files that cannot be removed are skipped silently.

    Args:
        out_json_dir: Directory that holds the JSON outputs.
        file_stem: File-name prefix of the run. It is matched literally, so
            glob metacharacters in it (``[``, ``]``, ``*``, ``?``) are escaped.
            An empty stem is ignored, because it would otherwise match every
            JSON file in the directory.
    """
    # Local import keeps this helper self-contained (glob is not imported at file level).
    from glob import escape as _glob_escape

    if not file_stem:
        return

    keep_name = f"{file_stem}.dmptool.json"
    pattern = f"{_glob_escape(file_stem)}*.json"

    # Materialize the matches first so we never delete while the directory is being scanned.
    for candidate in list(out_json_dir.glob(pattern)):
        if candidate.name == keep_name:
            continue
        try:
            candidate.unlink()
        except OSError:
            # Best-effort cleanup: a locked or already-removed file must not abort the run.
            continue

def _to_bool(v, default: Optional[bool] = None) -> Optional[bool]:
    if v is None:
        return default
    if isinstance(v, bool):
        return v
    if isinstance(v, (int, float)):
        return bool(v)
    if isinstance(v, str):
        s = v.strip().lower()
        if s in {"true", "1", "yes", "y", "on"}:
            return True
        if s in {"false", "0", "no", "n", "off"}:
            return False
    return default


In [3]:
def draft(
    input_json_path: str,
    out_root: str = "data/outputs",
    config_path: str = "config/config.yaml",
    nih_template_path: str = "data/inputs/nih-dms-plan-template.docx",
    use_rag: Optional[bool] = None,                # optional override (otherwise uses input.json / YAML default)
    funding_agency: Optional[str] = None,          # optional override (otherwise uses input.json)
    export_pdf: bool = True,                       # keep demo simple: try PDF, but don't fail if not supported
) -> Dict[str, str]:
    """Generate a data-management plan from a request JSON and write its artifacts.

    The request file supplies ``title`` (required), ``inputs`` (optional mapping),
    ``funding_agency`` (optional, defaults to ``"NIH"``) and ``use_rag`` (optional).
    The function runs ``DMPPipeline.generate_dmp`` and then writes, under
    ``out_root``: the Markdown text, a dmptool JSON export, a DOCX built from the
    NIH template and, optionally, a PDF converted from that DOCX.

    Args:
        input_json_path: Path to the request JSON file.
        out_root: Root output directory; ``json``, ``markdown``, ``docx`` and
            ``pdf`` sub-directories are created beneath it.
        config_path: Pipeline configuration file passed to ``DMPPipeline``.
        nih_template_path: DOCX template used to build the NIH document.
        use_rag: Explicit RAG switch. When ``None``, the request's ``use_rag``
            value is used if present; otherwise ``None`` is passed to the pipeline.
        funding_agency: Explicit funder override; falls back to the request's
            ``funding_agency`` (or ``"NIH"``). Upper-cased before use.
        export_pdf: When true, attempt a best-effort DOCX-to-PDF conversion with
            ``docx2pdf``; any failure is printed and otherwise ignored.

    Returns:
        A dict of strings with keys ``funding_agency``, ``use_rag``, ``run_stem``,
        ``markdown``, ``docx``, ``pdf`` and ``dmptool_json``. ``pdf`` is an empty
        string unless this call actually produced the PDF file.

    Raises:
        FileNotFoundError: If the request JSON or the NIH template is missing.
        ValueError: If the request is not a JSON object or has no non-empty title.
    """
    in_path = Path(input_json_path).expanduser().resolve()
    if not in_path.exists():
        raise FileNotFoundError(f"Input JSON not found: {in_path}")

    req = json.loads(in_path.read_text(encoding="utf-8"))
    if not isinstance(req, dict):
        # Without this guard a list/scalar document fails later with an opaque AttributeError.
        raise ValueError("Input JSON must be a JSON object at the top level.")

    title = (req.get("title") or "").strip()
    inputs: Dict[str, Any] = req.get("inputs") or {}

    # Use request JSON unless overridden
    req_funder = (req.get("funding_agency") or "NIH").strip().upper()
    funding_agency = (funding_agency or req_funder).strip().upper()

    if not title:
        raise ValueError("Input JSON must include a non-empty 'title'.")

    # Fail fast: verify the template BEFORE the expensive pipeline run, so a missing
    # file does not waste a generation or leave partially written outputs behind.
    template_path = Path(nih_template_path).expanduser().resolve()
    if not template_path.exists():
        raise FileNotFoundError(f"NIH template DOCX not found: {template_path}")

    # Decide RAG usage (override > JSON > None => pipeline uses YAML default)
    if use_rag is None and "use_rag" in req:
        use_rag = _to_bool(req.get("use_rag"), default=None)

    # Output dirs (kept in a separate local so the `out_root` parameter keeps its str type)
    out_root_path = Path(out_root).expanduser().resolve()
    out_json = out_root_path / "json"
    out_md = out_root_path / "markdown"
    out_docx = out_root_path / "docx"
    out_pdf = out_root_path / "pdf"

    for out_dir in (out_json, out_md, out_docx, out_pdf):
        ensure_dir(out_dir)

    # Run pipeline (per the original author's note it returns markdown and also
    # writes md/docx/json internally)
    pipeline = DMPPipeline(config_path=config_path, force_rebuild_index=False)

    md_text = pipeline.generate_dmp(
        title,
        inputs,
        use_rag=use_rag,
        funding_agency=funding_agency,
    )

    # Prefer the stem chosen by the pipeline; fall back to the sanitized title.
    run_stem = pipeline.last_run_stem or safe_filename(title)

    # Save Markdown explicitly so the returned path is guaranteed to exist.
    md_path = out_md / f"{run_stem}.md"
    md_path.write_text(md_text, encoding="utf-8")

    # Save ONLY dmptool JSON (same approach as main.py)
    dmptool_payload = build_dmptool_json(
        template_title="NIH Data Management and Sharing Plan",
        project_title=title,
        form_inputs=inputs,
        generated_markdown=md_text,
        provenance="dmpchef",
    )
    dmptool_json_path = out_json / f"{run_stem}.dmptool.json"
    # Remove other JSON files sharing this stem before writing the one we keep.
    cleanup_title_json(out_json, run_stem)
    dmptool_json_path.write_text(json.dumps(dmptool_payload, indent=2), encoding="utf-8")

    # Build NIH template DOCX (explicit for demo)
    docx_path = out_docx / f"{run_stem}.docx"
    build_nih_docx_from_template(
        template_docx_path=str(template_path),
        output_docx_path=str(docx_path),
        project_title=title,
        generated_markdown=md_text,
    )

    # Optional PDF (best-effort; docx2pdf works best on Windows with Word installed).
    # `pdf_path` stays None unless the conversion demonstrably produced a file, so the
    # result never advertises a PDF that this call did not create.
    pdf_path: Optional[Path] = None
    if export_pdf:
        candidate_pdf = out_pdf / f"{run_stem}.pdf"
        try:
            # Imported lazily: docx2pdf is an optional dependency.
            from docx2pdf import convert as docx2pdf_convert
            docx2pdf_convert(str(docx_path), str(candidate_pdf))
            if not candidate_pdf.exists():
                raise FileNotFoundError(f"docx2pdf did not create {candidate_pdf}")
            pdf_path = candidate_pdf
        except Exception as e:
            print("⚠️ PDF conversion skipped (docx2pdf not supported in this environment).")
            print("   Reason:", str(e))

    return {
        "funding_agency": funding_agency,
        "use_rag": str(use_rag),
        "run_stem": run_stem,
        "markdown": str(md_path),
        "docx": str(docx_path),
        "pdf": str(pdf_path) if pdf_path else "",
        "dmptool_json": str(dmptool_json_path),
    }


In [4]:
# Run the drafting workflow on the sample request; the trailing expression
# displays the returned mapping of artifact paths as the cell output.
result_paths = draft(
    input_json_path="data/inputs/input.json",
    export_pdf=True,
)
result_paths


{"timestamp": "2026-02-07T01:35:15.695554Z", "level": "info", "event": "\u2705 Config loaded successfully"}
{"llm": "llama3.3", "embed": "sentence-transformers/all-MiniLM-L6-v2", "hf_cache_dir": "data/cache/hf", "local_files_only": false, "timestamp": "2026-02-07T01:35:15.697059Z", "level": "info", "event": "ModelLoader initialized"}
  self.llm = Ollama(model=self.llm_name)
{"llm": "llama3.3", "rag_default": true, "timestamp": "2026-02-07T01:35:15.698059Z", "level": "info", "event": "\u2705 DMPPipeline initialized"}
{"funding_agency": "NIH", "timestamp": "2026-02-07T01:35:15.698059Z", "level": "info", "event": "\ud83c\udff7\ufe0f Funding agency selected"}
{"use_rag_input": true, "rag_default": true, "use_rag_final": true, "timestamp": "2026-02-07T01:35:15.700059Z", "level": "info", "event": "\ud83e\uddfe RAG decision"}
  emb = HuggingFaceEmbeddings(
Use pytorch device_name: cpu
Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
HTTP Request: HEAD https://huggin

{'funding_agency': 'NIH',
 'use_rag': 'True',
 'run_stem': 'NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__rag__k10__llama3.3',
 'markdown': 'C:\\Users\\Nahid\\dmpchef\\notebook_DMP_RAG\\data\\outputs\\markdown\\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__rag__k10__llama3.3.md',
 'docx': 'C:\\Users\\Nahid\\dmpchef\\notebook_DMP_RAG\\data\\outputs\\docx\\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__rag__k10__llama3.3.docx',
 'pdf': 'C:\\Users\\Nahid\\dmpchef\\notebook_DMP_RAG\\data\\outputs\\pdf\\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__rag__k10__llama3.3.pdf',
 'dmptool_json': 'C:\\Users\\Nahid\\dmpchef\\notebook_DMP_RAG\\data\\outputs\\json\\NIH Data Management and Sharing Plan_ Clinical and MRI Data from Human Research Participants__rag__k10__llama3.3.dmptool.json'}