In [2]:
pip install llama-index llama-index-graph-stores-neo4j llama-index-vector-stores-pinecone llama-index-llms-openai

Collecting llama-index
  Downloading llama_index-0.14.13-py3-none-any.whl.metadata (13 kB)
Collecting llama-index-graph-stores-neo4j
  Downloading llama_index_graph_stores_neo4j-0.5.1-py3-none-any.whl.metadata (406 bytes)
Collecting llama-index-vector-stores-pinecone
  Using cached llama_index_vector_stores_pinecone-0.7.1-py3-none-any.whl.metadata (424 bytes)
Collecting llama-index-llms-openai
  Downloading llama_index_llms_openai-0.6.16-py3-none-any.whl.metadata (3.0 kB)
Collecting llama-index-cli<0.6,>=0.5.0 (from llama-index)
  Using cached llama_index_cli-0.5.3-py3-none-any.whl.metadata (1.4 kB)
Collecting llama-index-core<0.15.0,>=0.14.13 (from llama-index)
  Downloading llama_index_core-0.14.13-py3-none-any.whl.metadata (2.5 kB)
Collecting llama-index-embeddings-openai<0.6,>=0.5.0 (from llama-index)
  Using cached llama_index_embeddings_openai-0.5.1-py3-none-any.whl.metadata (400 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama-index)
  Using cached ll

In [3]:
from __future__ import annotations

import os
import re
import time
from pathlib import Path
from typing import List

from dotenv import load_dotenv

from llama_index.core import KnowledgeGraphIndex, Settings, SimpleDirectoryReader, StorageContext
from llama_index.core.node_parser import SentenceSplitter
from llama_index.graph_stores.neo4j import Neo4jGraphStore
from llama_index.llms.openai import OpenAI

from pinecone import Pinecone

In [4]:
# Notebook is in /notebooks
NOTEBOOKS_DIR = Path(".").resolve()

# Load from environment (.env one level above notebooks)
load_dotenv(Path("../.env"), override=True)

NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE", "neo4j")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX = os.getenv("PINECONE_INDEX", "diabetes-medical-knowledge")

missing = [
    name
    for name, value in {
        "NEO4J_URI": NEO4J_URI,
        "NEO4J_USERNAME": NEO4J_USERNAME,
        "NEO4J_PASSWORD": NEO4J_PASSWORD,
        "OPENAI_API_KEY": OPENAI_API_KEY,
        "PINECONE_API_KEY": PINECONE_API_KEY,
    }.items()
    if not value
]
if missing:
    raise ValueError(f"Missing env vars: {', '.join(missing)}")

TARGET_DRUGS = [
    "metformin",
    "insulin",
    "glipizide",
    "jardiance",
    "ozempic",
    "lantus",
]

CLINICAL_DIR = NOTEBOOKS_DIR / "clinical_safety_docs"
DIET_DIR = NOTEBOOKS_DIR / "dietician_docs"
INTERACTION_DIR = NOTEBOOKS_DIR / "drug_interaction_docs"

In [7]:
def load_md_docs(directory: Path, tag: str) -> list:
    files = sorted(directory.glob("*.md"))
    if not files:
        print(f"No files found in {directory}")
        return []

    def _metadata(path: str) -> dict:
        path_obj = Path(path)
        return {
            "source": path_obj.name,
            "doc_path": str(path_obj),
            "doc_tag": tag,
        }

    return SimpleDirectoryReader(
        input_files=[str(p) for p in files],
        file_metadata=_metadata,
    ).load_data()


clinical_docs = load_md_docs(CLINICAL_DIR, "clinical_safety")
interaction_docs = load_md_docs(INTERACTION_DIR, "drug_interactions")
diet_docs = load_md_docs(DIET_DIR, "dietician_docs")

print(
    f"Loaded clinical={len(clinical_docs)}, interactions={len(interaction_docs)}, diet={len(diet_docs)}"
)

Loaded clinical=1, interactions=6, diet=1


In [8]:
kg_docs = interaction_docs
print(f"KG docs from drug_interaction_docs: {len(kg_docs)}")

Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0)
Settings.chunk_size = 512

neo4j_store = Neo4jGraphStore(
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    url=NEO4J_URI,
    database=NEO4J_DATABASE,
)
storage_context = StorageContext.from_defaults(graph_store=neo4j_store)

kg_index = KnowledgeGraphIndex.from_documents(
    kg_docs,
    storage_context=storage_context,
    max_triplets_per_chunk=2,
)

print("Knowledge graph built and stored in Neo4j.")

KG docs from drug_interaction_docs: 6


2026-02-03 16:00:45,707 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-02-03 16:00:47,147 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-02-03 16:00:48,760 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-02-03 16:00:50,214 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-02-03 16:00:51,880 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-02-03 16:00:53,690 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-02-03 16:00:55,984 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-02-03 16:00:57,438 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-02-03 16:00:58,798 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "

Knowledge graph built and stored in Neo4j.


In [12]:
splitter = SentenceSplitter(chunk_size=800, chunk_overlap=120)


def build_records(docs: list, namespace: str) -> List[dict]:
    nodes = splitter.get_nodes_from_documents(docs)
    records: List[dict] = []

    for i, node in enumerate(nodes):
        content = node.get_content().strip()
        if not content:
            continue
        meta = node.metadata or {}
        source = meta.get("source", "unknown")
        citation = f"{source}#chunk-{i}"

        records.append(
            {
                "_id": f"{namespace}_{i}",
                "text": content,  # must match Pinecone field_map text=content
                "source": source,
                "doc_path": meta.get("doc_path", ""),
                "doc_tag": meta.get("doc_tag", ""),
                "citation": citation,
            }
        )

    return records


clinical_records = build_records(clinical_docs + interaction_docs, "clinical_safety")
diet_records = build_records(diet_docs, "dietician_docs")

print(f"clinical_records={len(clinical_records)}, diet_records={len(diet_records)}")

clinical_records=61, diet_records=13


In [13]:
pc = Pinecone(api_key=PINECONE_API_KEY)

if not pc.has_index(PINECONE_INDEX):
    raise ValueError(
        "Pinecone index not found. Create it first with field_map text=content "
        "and an integrated embedding model (e.g., llama-text-embed-v2)."
    )

index = pc.Index(PINECONE_INDEX)


def batch_upsert(index, namespace: str, records: List[dict], batch_size: int = 96) -> None:
    for i in range(0, len(records), batch_size):
        batch = records[i : i + batch_size]
        if batch:
            index.upsert_records(namespace, batch)
            time.sleep(0.1)

    # Required wait for indexing before any search
    time.sleep(10)


batch_upsert(index, "clinical_safety", clinical_records)
batch_upsert(index, "dietician_docs", diet_records)

print("Upsert complete. Vectors are ready for RAG with citations.")

Upsert complete. Vectors are ready for RAG with citations.


In [14]:
stats = index.describe_index_stats()
print(stats)

if stats.get("namespaces"):
    print("Namespaces:", ", ".join(stats["namespaces"].keys()))

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'clinical_safety': {'vector_count': 61},
                'dietician_docs': {'vector_count': 13}},
 'total_vector_count': 74,
 'vector_type': 'dense'}
Namespaces: clinical_safety, dietician_docs


In [17]:
def search_with_citations(index, namespace: str, query: str, top_k: int = 5):
    print(f"\n[RAG] namespace={namespace} query='{query}' top_k={top_k}")
    results = index.search(
        namespace=namespace,
        query={
            "top_k": top_k * 2,
            "inputs": {"text": query},
        },
        rerank={
            "model": "bge-reranker-v2-m3",
            "top_n": top_k,
            "rank_fields": ["text"],
        },
    )

    hits = results.result.hits
    print(f"[RAG] hits={len(hits)}")
    for i, hit in enumerate(hits, start=1):
        fields = hit.fields
        citation = fields.get("citation") or fields.get("source", "unknown")
        snippet = fields.get("text", "")[:180].replace("\n", " ")
        score = hit["_score"]
        print(f"  {i}. score={score:.4f} cite={citation} | {snippet}...")

    return hits


# RAG test queries (adjust as needed)
_ = search_with_citations(
    index,
    "clinical_safety",
    "What are key interactions or precautions for metformin?",
    top_k=3,
)
_ = search_with_citations(
    index,
    "dietician_docs",
    "Which hawker foods are good for diabetes?",
    top_k=3,
)


# Knowledge graph query test
kg_engine = kg_index.as_query_engine(include_text=True, response_mode="tree_summarize")
kg_response = kg_engine.query("List interactions between insulin and food or other drugs.")
print("\n[KG] Response:")
print(kg_response)


[RAG] namespace=clinical_safety query='What are key interactions or precautions for metformin?' top_k=3
[RAG] hits=3
  1. score=0.9011 cite=metformin_interactions.md#chunk-49 | # Metformin Drug Interactions  ## Drug Overview - **Generic Name:** Metformin - **Drug Class:** Biguanide - **Primary Use:** Type 2 Diabetes Mellitus (first-line therapy) - **Mecha...
  2. score=0.7607 cite=metformin_interactions.md#chunk-54 | Liver Disease (Major) - **Conditions:** Hepatic impairment - **Risk:** Impaired lactate clearance - **Management:** Avoid in patients with clinical/laboratory evidence of hepatic d...
  3. score=0.7558 cite=metformin_interactions.md#chunk-53 | Vancomycin, Trimethoprim, Digoxin, Procainamide, Quinidine, Morphine, Amiloride, Triamterene - **Interaction Type:** Pharmacokinetic competition - **Mechanism:** Compete for renal ...

[RAG] namespace=dietician_docs query='Which hawker foods are good for diabetes?' top_k=3
[RAG] hits=3
  1. score=0.2451 cite=hawker_food_nutrition.md

2026-02-03 16:13:40,860 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2026-02-03 16:13:43,173 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



[KG] Response:
The interactions involving insulin include the requirement for dose adjustments and an increase in hypoglycemia risk.
