# RAG Agent Workbench (Llama Stack)

This notebook exercises the agent portion of your RAG workflow against an already-running Llama Stack server.

It mirrors the logic in `agent/app.py`:
- Create `LlamaStackClient`
- Select an LLM (preferring provider `vllm-inference`)
- Resolve the vector DB (defaults to `confluence`; if that ID is not registered, the first available vector DB is used instead)
- Create an Agent with `builtin::rag/knowledge_search`
- Open a session and ask a question (non-streaming, followed by an optional streaming cell)

Set `LLAMA_BASE_URL` and (optionally) `VECTOR_DB_ID` in the environment, or rely on the defaults used below.


In [None]:
%pip install --quiet "llama-stack-client==0.2.23"
import os



In [None]:
# Connection settings: each value is read from the environment and falls back
# to the in-cluster default when the variable is unset.
# Trailing slashes are stripped from the base URL.
LLAMA_BASE_URL = os.environ.get(
    "LLAMA_BASE_URL",
    "http://lsd-llama-milvus-inline-service.default.svc.cluster.local:8321",
).rstrip("/")
VECTOR_DB_ID = os.environ.get("VECTOR_DB_ID", "confluence")

# Echo the effective settings so the notebook output records what was used.
print("LLAMA_BASE_URL:", LLAMA_BASE_URL)
print("VECTOR_DB_ID:", VECTOR_DB_ID)



In [None]:
from llama_stack_client import Agent, LlamaStackClient

# One client, pointed at the configured base URL, shared by all later cells.
client = LlamaStackClient(
    base_url=LLAMA_BASE_URL,
)
# Bare trailing expression: the notebook renders the client's repr as output.
client



In [None]:
# Select an LLM (prefer vllm-inference)
#
# Only models whose model_type is "llm" are considered. Among those, the first
# one served by provider "vllm-inference" wins; otherwise the first LLM in the
# listing is used.
models = list(client.models.list())
llm_models = [m for m in models if m.model_type == "llm"]
llm = next(
    (m for m in llm_models if getattr(m, "provider_id", None) == "vllm-inference"),
    llm_models[0] if llm_models else None,
)

# Raise explicitly rather than assert: asserts are stripped when Python runs
# with -O, which would let a missing model surface later as a confusing
# AttributeError on `llm.identifier`.
if llm is None:
    raise RuntimeError("No LLM models available on Llama Stack")

model_id = llm.identifier
print("Using model:", model_id)



In [None]:
# Resolve vector DB id (prefer the requested one)
#
# Resolution order:
#   1. a registered vector DB whose identifier equals VECTOR_DB_ID,
#   2. otherwise the first registered vector DB,
#   3. otherwise (nothing registered) VECTOR_DB_ID unchanged.
vdbs = list(client.vector_dbs.list())
requested = [v for v in vdbs if getattr(v, "identifier", None) == VECTOR_DB_ID]
if requested:
    vector_db_id = requested[0].identifier
elif vdbs:
    vector_db_id = vdbs[0].identifier
else:
    vector_db_id = VECTOR_DB_ID
print(f"Vector DB: requested={VECTOR_DB_ID} -> using={vector_db_id}")



In [None]:
# Create an Agent bound to the vector DB via builtin RAG tool
instructions = "You are a helpful assistant. Use the RAG tool when appropriate and cite source_url(s)."

# Tool spec: the builtin knowledge-search tool, restricted to the vector DB
# resolved in the previous cell.
rag_tool = {
    "name": "builtin::rag/knowledge_search",
    "args": {"vector_db_ids": [vector_db_id]},
}

rag_agent = Agent(client, model=model_id, instructions=instructions, tools=[rag_tool])
print("Agent ready.")



In [None]:
import uuid

# Random session name ("s" + 32 hex chars) so re-running the cell always
# starts a new session.
session_name = "s" + uuid.uuid4().hex
session_id = rag_agent.create_session(session_name=session_name)
# Bare trailing expression: the notebook shows the new session id.
session_id



In [None]:
# Ask a question (non-streaming)
question = "Summarise the resolution for when Disk full on /var. Use our Confluence docs."
result = rag_agent.create_turn(
    messages=[{"role": "user", "content": question}],
    session_id=session_id,
    stream=False,
)

# Pull the answer text out of the result, tolerating several response shapes:
#   - a plain dict with "message" / "content" / "text",
#   - an object carrying the reply under `output_message.content`
#     (the shape the client documents for a completed turn),
#   - an object exposing `message` or `content` directly.
# Every probe uses a None default, so an unexpected shape simply falls through
# to printing the raw result below instead of raising.
answer = None
if isinstance(result, dict):
    answer = result.get("message") or result.get("content") or result.get("text")
if answer is None:
    output_message = getattr(result, "output_message", None)
    answer = getattr(output_message, "content", None)
if answer is None:
    answer = getattr(result, "message", None)
if answer is None:
    answer = getattr(result, "content", None)

# Fall back to the raw result so the cell always shows something useful.
print(answer or result)



In [None]:
# Optional: streaming example
from llama_stack_client import AgentEventLogger

# Re-ask the same question in the same session, this time as a stream.
user_messages = [{"role": "user", "content": question}]
stream = rag_agent.create_turn(messages=user_messages, session_id=session_id, stream=True)

# Pass the stream through the event logger and print each event it yields.
event_logger = AgentEventLogger()
for event in event_logger.log(stream):
    event.print()

