In [3]:
from llama_stack_client import LlamaStackClient

In [4]:
# Identifier of the model this notebook intends to use for inference.
model = "ollama/llama3.2:3b"

# Connect to the Llama Stack server on localhost and show the models it reports.
client = LlamaStackClient(base_url="http://localhost:8321")
models = client.models.list()
print(models)


INFO:httpx:HTTP Request: GET http://localhost:8321/v1/models "HTTP/1.1 200 OK"


[Model(identifier='accounts/fireworks/models/llama-guard-3-11b-vision', metadata={}, api_model_type='llm', provider_id='fireworks', type='model', provider_resource_id='accounts/fireworks/models/llama-guard-3-11b-vision', model_type='llm'), Model(identifier='accounts/fireworks/models/llama-guard-3-8b', metadata={}, api_model_type='llm', provider_id='fireworks', type='model', provider_resource_id='accounts/fireworks/models/llama-guard-3-8b', model_type='llm'), Model(identifier='accounts/fireworks/models/llama-v3p1-405b-instruct', metadata={}, api_model_type='llm', provider_id='fireworks', type='model', provider_resource_id='accounts/fireworks/models/llama-v3p1-405b-instruct', model_type='llm'), Model(identifier='accounts/fireworks/models/llama-v3p1-70b-instruct', metadata={}, api_model_type='llm', provider_id='fireworks', type='model', provider_resource_id='accounts/fireworks/models/llama-v3p1-70b-instruct', model_type='llm'), Model(identifier='accounts/fireworks/models/llama-v3p1-8b-ins

In [5]:
from llama_stack_client import LlamaStackClient

# Register a vector database on the stack. This reuses the `client` created
# earlier in the notebook instead of opening a second, identical connection.
vector_db = client.vector_dbs.register(
    vector_db_id="toy_faiss_db",          # your chosen identifier
    provider_id="faiss",                  # built-in in-memory FAISS
    embedding_model="all-MiniLM-L6-v2",   # or pick another from client.models.list()
)

print("Created vector DB:", vector_db)



INFO:httpx:HTTP Request: POST http://localhost:8321/v1/vector-dbs "HTTP/1.1 200 OK"


Created vector DB: VectorDBRegisterResponse(embedding_dimension=384, embedding_model='all-MiniLM-L6-v2', identifier='toy_faiss_db', provider_id='faiss', type='vector_db', provider_resource_id='toy_faiss_db', owner={'principal': '', 'attributes': {}})


In [6]:
from llama_stack_client.types import Document

# One short plain-text fact per dog, keyed by the document id it is stored under.
dog_facts = {
    "dog1": "Bella breed is a cavalier king.",
    "dog2": "Dora breed is a pug.",
}

docs = [
    Document(
        document_id=doc_id,
        content=fact,
        mime_type="text/plain",
        metadata={},
    )
    for doc_id, fact in dog_facts.items()
]

# Send the documents to the RAG tool so they are chunked and stored in the
# "toy_faiss_db" vector database.
client.tool_runtime.rag_tool.insert(
    documents=docs,
    vector_db_id="toy_faiss_db",
    chunk_size_in_tokens=128,
)

print("Documents ingested.")


INFO:httpx:HTTP Request: POST http://localhost:8321/v1/tool-runtime/rag-tool/insert "HTTP/1.1 200 OK"


Documents ingested.


In [7]:
# Print the id of every provider the stack reports, one per line.
providers = client.providers.list()
for provider in providers:
    print(provider.provider_id)


INFO:httpx:HTTP Request: GET http://localhost:8321/v1/providers "HTTP/1.1 200 OK"


openai
fireworks
together
ollama
anthropic
gemini
groq
sambanova
vllm
sentence-transformers
faiss
meta-reference-files
llama-guard
meta-reference
meta-reference
meta-reference
huggingface
localfs
basic
llm-as-judge
braintrust
brave-search
tavily-search
rag-runtime
model-context-protocol


In [8]:
# Register a RAG toolgroup named "rag-dogs" that is bound to the toy vector DB.
# "rag-runtime" is one of the provider ids reported by client.providers.list().
rag_dogs_args = {"vector_db_ids": ["toy_faiss_db"]}

client.toolgroups.register(
    toolgroup_id="rag-dogs",
    provider_id="rag-runtime",
    args=rag_dogs_args,
)


INFO:httpx:HTTP Request: POST http://localhost:8321/v1/toolgroups "HTTP/1.1 200 OK"


In [9]:
from llama_stack_client import LlamaStackClient
from llama_stack_client import Agent 

# Create an agent that is allowed to use the "rag-dogs" toolgroup.
# No agent_id is passed; the agent is created through the client.
agent = Agent(
    client,
    model=model,  # reuse the model identifier defined near the top of the notebook
    instructions="Always use retrieval tool to fetch info about dog breeds before answering.",
    tools=["rag-dogs"],
)


INFO:httpx:HTTP Request: POST http://localhost:8321/v1/agents "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8321/v1/tools?toolgroup_id=rag-dogs "HTTP/1.1 200 OK"


In [45]:
# List the toolgroups registered on the stack; the bare expression is rendered as the cell output.
client.toolgroups.list()


INFO:httpx:HTTP Request: GET http://localhost:8321/v1/toolgroups "HTTP/1.1 200 OK"


[ToolGroup(identifier='builtin::rag', provider_id='rag-runtime', type='tool_group', args=None, mcp_endpoint=None, provider_resource_id='builtin::rag'),
 ToolGroup(identifier='builtin::websearch', provider_id='tavily-search', type='tool_group', args=None, mcp_endpoint=None, provider_resource_id='builtin::websearch'),
 ToolGroup(identifier='rag-dogs', provider_id='rag-runtime', type='tool_group', args={'vector_db_ids': ['toy_faiss_db']}, mcp_endpoint=None, provider_resource_id='rag-dogs')]

In [10]:
# Ask about Bella in a new session, pointing the turn at the "rag-dogs"
# toolgroup and its vector DB.
bella_toolgroup = {
    "toolgroup_id": "rag-dogs",
    "name": "rag-dogs",  # flagged as required by the original author
    "args": {"vector_db_ids": ["toy_faiss_db"]},
}
bella_messages = [{"role": "user", "content": "Which breed is Bella?"}]

session_id = agent.create_session(session_name="dog-chat")
response = agent.create_turn(
    session_id=session_id,
    messages=bella_messages,
    toolgroups=[bella_toolgroup],
    stream=False,
)
print("Assistant:", response.output_message.content)
 

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/agents/7aa4e87d-a3c9-4414-81e4-b7a34272d253/session "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:8321/v1/agents/7aa4e87d-a3c9-4414-81e4-b7a34272d253/session/1b78cefa-d17a-4f41-8715-d52f1e420d23/turn "HTTP/1.1 200 OK"


Assistant: Bella is a Cavalier King Charles Spaniel.


In [11]:
# Ask about Dora. A new session is created (again named "dog-chat"), so
# `session_id` and `response` are rebound to this second conversation.
dora_question = {"role": "user", "content": "Which breed is Dora?"}
dora_toolgroup = {
    "toolgroup_id": "rag-dogs",
    "name": "rag-dogs",  # flagged as required by the original author
    "args": {"vector_db_ids": ["toy_faiss_db"]},
}

session_id = agent.create_session(session_name="dog-chat")
response = agent.create_turn(
    session_id=session_id,
    messages=[dora_question],
    toolgroups=[dora_toolgroup],
    stream=False,
)
print("Assistant:", response.output_message.content)

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/agents/7aa4e87d-a3c9-4414-81e4-b7a34272d253/session "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:8321/v1/agents/7aa4e87d-a3c9-4414-81e4-b7a34272d253/session/38d32432-2d3a-430a-967f-8a88027e51da/turn "HTTP/1.1 200 OK"


Assistant: The Dora breed is a pug.


In [12]:
# Walk the steps of the most recent turn and print every tool call the model made.
for step in response.steps:
    # Not every step carries a model response; fall back to None when it is missing.
    model_response = getattr(step, "api_model_response", None)
    # `tool_calls` may be missing or None when the model answered without
    # calling a tool; treat both as "no calls" instead of iterating over None.
    for tool_call in getattr(model_response, "tool_calls", None) or []:
        print("🔧 Tool:", tool_call.tool_name)
        print("📥 Arguments:", tool_call.arguments)
        print("🪪 Call ID:", tool_call.call_id)


🔧 Tool: knowledge_search
📥 Arguments: {'query': 'Dora breed'}
🪪 Call ID: 2d62270f-5040-41ed-bd92-dddc38859b97


In [13]:
# Show only the toolgroups served by the "rag-runtime" provider, with their args.
toolgroups = client.toolgroups.list()

for toolgroup in toolgroups:
    if toolgroup.provider_id != "rag-runtime":
        continue
    print(f"🧠 RAG Toolgroup: {toolgroup.identifier}")
    print(f"   Args: {toolgroup.args}")


INFO:httpx:HTTP Request: GET http://localhost:8321/v1/toolgroups "HTTP/1.1 200 OK"


🧠 RAG Toolgroup: builtin::rag
   Args: None
🧠 RAG Toolgroup: rag-dogs
   Args: {'vector_db_ids': ['toy_faiss_db']}


In [35]:
from llama_stack_client.types.tool_group import McpEndpoint

# Register a toolgroup backed by an MCP server reachable over SSE at
# localhost:8421. That server has to be started separately.
client.toolgroups.register(
    toolgroup_id="mcp::my1",
    provider_id="model-context-protocol",
    mcp_endpoint=McpEndpoint(uri="http://localhost:8421/sse"),
)


INFO:httpx:HTTP Request: POST http://localhost:8321/v1/toolgroups "HTTP/1.1 200 OK"


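Start the MCP server in a separate terminal so that the endpoint on port 8421 used by the `mcp::my1` toolgroup is reachable:

uv run uvicorn mcp_server:app --host 0.0.0.0 --port 8421 --reload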

In [36]:
# Create a second agent whose only tool source is the "mcp::my1" toolgroup.
# NOTE(review): in the recorded run this cell ended in APITimeoutError after
# retrying /v1/tools — presumably the MCP server on port 8421 was not
# reachable; confirm it is running before executing this cell.
# NOTE(review): the instructions mention RAG although the tool is an MCP
# toolgroup — confirm this wording is intended.
agentMCP = Agent(
    client,
    model=model,  # reuse the model identifier defined near the top of the notebook
    instructions="Always retrieve info using RAG before answering.",
    tools=["mcp::my1"],
)


INFO:httpx:HTTP Request: POST http://localhost:8321/v1/agents "HTTP/1.1 200 OK"
INFO:llama_stack_client._base_client:Retrying request to /v1/tools in 0.433570 seconds
INFO:llama_stack_client._base_client:Retrying request to /v1/tools in 0.954515 seconds


APITimeoutError: Request timed out.