# Testing notebook for Granite on Llama Stack

Setup instructions:
1. Install a fork of Llama Stack with Granite support
1. Install the latest version of the `llama_stack_client` package using `pip`
1. Download the `Llama3.2-11B-Vision-Instruct` Llama model (using the "llama" command) and the `granite-3.0-8b-instruct-r241014a` 
1. Configure a Llama Stack server with the `meta-reference` inference implementation serving the `Llama3.2-11B-Vision-Instruct` model
1. Add a `remote::granite` inference provider to your copy of Llama stack. You can configure this provider during the manual, step-by-step configuration, or you can just change the part of your server's YAML configuration file that reads:
    ```
    TODO
    ```
    ...so that it looks like this:
    ```
    TODO
    ```
    The `modeldir` parameter should point to the parent directory containing your local copy of `granite-3.0-8b-instruct-r241014a`. The Granite model should be in a directory with the same name as the model, because Granite model names are currently stored in name of the directory.
1. Add an entry in the `models` section of your YAML configuration file, something like this:
    ```
    models: 
    - model_id: Llama3.2-11B-Vision-Instruct
      provider_id: inline::meta-reference
      provider_model_id: Llama3.2-11B-Vision-Instruct
    - model_id: granite-3.0-8b-instruct-r241014a
      provider_id: inline::granite
      provider_model_id: granite-3.0-8b-instruct-r241014a
    ```
1. Start the server on `localhost` at port 5000. Server startup takes about a minute due to model loading overheads.

In [1]:
# Boilerplate goes here
import json
import termcolor
import textwrap
import pydantic

host = "localhost"
port = 5000
base_url = f"http://{host}:{port}"
_WRAP_CHARS = 80

# Import the Python client for Llama Stack. This client code breaks frequently
# due to breaking API changes on the server and a chaotic release schedule.
import llama_stack_client
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url=base_url)
models_client = client.models
inference_client = client.inference
agents_client = client.agents

# The latest batch of breaking API changes broke the Llama Stack client code
# in the server code base.
# Keeping this around in case a future change set breaks the client in 
# llama_stack_client.
# import llama_stack.apis.inference.client
# inference_client = llama_stack.apis.inference.client.InferenceClient(base_url=base_url)
# import llama_stack.apis.agents.client
# agents_client = llama_stack.apis.agents.client.AgentsClient(base_url)
# import llama_stack.apis.models.client
# models_client = llama_stack.apis.models.client.ModelsClient(base_url=base_url)
# import llama_stack.apis.inference


In [2]:
# Define functions for pretty-printing results

def print_result(result):
    """Common code for printing model outputs to stdout"""
    
    # Result is of type ChatCompletionResponse
    #print(f"Raw result: {result}")

    role = result.completion_message.role
    content = result.completion_message.content
    tool_calls = result.completion_message.tool_calls
    
    if len(content) > 0:
        content_lines = content.split("\n")
        indent_str = (" " * (len(role) + 2))
        first_line = textwrap.fill(content_lines[0],
                                   subsequent_indent=indent_str,
                                   width=_WRAP_CHARS)
        remaining_lines = [textwrap.fill(l, width=_WRAP_CHARS) 
                           for l in content_lines[1:]]
        pretty_role = termcolor.colored(role, color="red")
        print(f"{pretty_role}: {first_line}")
        print("\n".join(remaining_lines))
    if len(tool_calls) > 0:
        print("Tool calls:")
        for t in tool_calls:
            print(f"   {t}")
   

def print_result_stream(result_future):
    """Common code for printing model outputs to stdout when the model
    is running in streaming mode.
    """

    label_str = None  # "assistant: " or "tool call: "

    cur_line_len = 0

    #result_generator = await result_future

    #async for chunk in result_generator:
    for chunk in result_future:
        # Result chunks are of type ChatCompletionResponseStreamChunk.
        # Note that this type is quite different from the 
        # ChatCompletionResponse object returned in non-streaming mode.
        if not hasattr(chunk, "event"):
            raise ValueError(f"Can't parse chunk:\n{chunk}")
        event = chunk.event

        #if event.event_type is llama_stack.apis.inference.inference.ChatCompletionResponseEventType.progress:
        if event.event_type == "progress":
            # API requires us to discern tool calls from agent text
            # by checking Python types
            if isinstance(event.delta, str):
                is_tool_call = False
                delta_text = event.delta
            elif isinstance(event.delta, llama_stack_client.types.inference_chat_completion_response.ChatCompletionResponseStreamChunkEventDeltaToolCallDelta):
            # elif isinstance(event.delta, llama_stack.apis.inference.inference.ToolCallDelta):
                is_tool_call = True
                # Default JSON serialization has no pretty-printing
                delta_text = json.dumps(
                    json.loads(event.delta.model_dump_json()),
                    indent=4
                )
            else:
                raise TypeError(f"Unexpected event delta type '{type(event.delta)}'")
            
            if label_str is None:
                label_str = "tool call: " if is_tool_call else "assistant: "
                pretty_label = termcolor.colored(label_str, color="red")
                print(pretty_label, end="", flush=True)
                cur_line_len = len(label_str)
            
            import re
            
            # Add carriage returns as needed
            while "\n" in delta_text:
                first_line, delta_text = delta_text.split("\n", 1)
                print(first_line)
                cur_line_len = 0
            
            if cur_line_len + len(delta_text) >= _WRAP_CHARS and delta_text.startswith(" "):
                print(f" \\\n{delta_text[1:]}", end="", flush=True)
                cur_line_len = len(delta_text) - 1
            else:
                print(delta_text, end="", flush=True)
                cur_line_len += len(delta_text)
            
        else:
            pass
            #print(f"Skipping event {event}")
                
    print()

# Model registry APIs

With the Granite connector installed, APIs for listing and registering Granite models are somewhat functional.


In [3]:
models_client.list()

[Model(identifier='Llama3.2-11B-Vision-Instruct', metadata={}, provider_id='inline::llama-vllm', provider_resource_id='Llama3.2-11B-Vision-Instruct', type='model'),
 Model(identifier='granite-3.0-8b-instruct', metadata={}, provider_id='inline::granite-vllm', provider_resource_id='./dmf_models/granite-3.0-8b-instruct-r241014a', type='model')]

In [4]:
models_client.retrieve(identifier="granite-3.0-8b-instruct")

Model(identifier='granite-3.0-8b-instruct', metadata={}, provider_id='inline::granite-vllm', provider_resource_id='./dmf_models/granite-3.0-8b-instruct-r241014a', type='model')

In [5]:
models_client.unregister(model_id="granite-3.0-8b-instruct")

In [6]:
models_client.list()

[Model(identifier='Llama3.2-11B-Vision-Instruct', metadata={}, provider_id='inline::llama-vllm', provider_resource_id='Llama3.2-11B-Vision-Instruct', type='model')]

In [7]:
models_client.register(
    model_id="granite-3.0-8b-instruct",
    provider_id="inline::granite-vllm",
    provider_model_id="./dmf_models/granite-3.0-8b-instruct-r241014a"
)

Model(identifier='granite-3.0-8b-instruct', metadata={}, provider_id='inline::granite-vllm', provider_resource_id='./dmf_models/granite-3.0-8b-instruct-r241014a', type='model')

In [8]:
models_client.list()

[Model(identifier='Llama3.2-11B-Vision-Instruct', metadata={}, provider_id='inline::llama-vllm', provider_resource_id='Llama3.2-11B-Vision-Instruct', type='model'),
 Model(identifier='granite-3.0-8b-instruct', metadata={}, provider_id='inline::granite-vllm', provider_resource_id='./dmf_models/granite-3.0-8b-instruct-r241014a', type='model')]

## Single prompt

Here we run the simplest type of chat completion request, first with a Llama model, then with a Granite model. Both requests are identical except for the model name.

In [9]:
result_future = inference_client.chat_completion(
    model_id="Llama3.2-11B-Vision-Instruct",
    messages=[
        {
            "role": "user",
            "content": "Write a short movie trailer voiceover about the Cauchy–Schwarz inequality"
        }
    ],
    stream=True,
)
print_result_stream(result_future)

[31massistant: [0m(Ominous music plays in the background)

Narrator (in a deep, dramatic voice): "In a world where vectors collide... 

(Scene: A 3D graph with vectors intersecting)

Narrator: "One law stands above the rest... 

(Scene: A mathematical equation appears on screen: (a, b) · (c, d) ≥ |a*c + b*d|² \
/ (a² + b²)(c² + d²))

Narrator: "The Cauchy-Schwarz Inequality. A theorem so powerful, it will change \
the way you see the universe.

(Scene: A scientist in a lab, staring at a complex equation)

Narrator: "It's not just a mathematical concept... 

(Scene: A visual representation of the inequality in action, with vectors and \
norms)

Narrator: "It's a key to unlocking the secrets of the universe.

(Scene: A dramatic shot of the Earth from space)

Narrator: "From physics to engineering, the Cauchy-Schwarz Inequality is the \
foundation upon which the world is built.

(Scene: A title card appears: "The Cauchy-Schwarz Inequality: A Universal Truth")

Narrator: "Witness the pow

In [10]:
result_future = inference_client.chat_completion(
    model_id="granite-3.0-8b-instruct",
    messages=[
        {
            "role": "user",
            "content": "Write a short movie trailer voiceover about the Cauchy–Schwarz inequality"
        }
    ],
    stream=True,
)
print_result_stream(result_future)

[31massistant: [0m🎬🎥🎬

"In a world where numbers dance and equations sing, there exists a powerful theorem, \
a silent guardian of mathematical harmony.

Introducing the Cauchy-Schwarz Inequality, a mathematical marvel that whispers, \
'The square of the sum of two numbers is less than or equal to the product of \
their sums.'

Witness as it weaves its magic, binding vectors and matrices in a dance of elegance \
and precision.

Experience the thrill as it unravels the secrets of inner products and norms, \
revealing the true potential of mathematical relationships.

The Cauchy-Schwarz Inequality: Where numbers meet their match, and harmony is \
the only law.

Coming soon to a theorem near you. 🎬🎥🎬"


In [11]:
# Same API call as above, in non-streaming mode
result = inference_client.chat_completion(
    model_id="granite-3.0-8b-instruct",
    messages=[
        {
            "role": "user",
            "content": "Write a short movie trailer voiceover about the Cauchy–Schwarz inequality"
        }
    ],
    stream=False,
)
print_result(result)


[31massistant[0m: 🎬🎥🎬

"In a world where numbers dance and equations sing, there exists a powerful
theorem, a silent guardian of mathematical harmony.

Introducing the Cauchy-Schwarz Inequality, a mathematical marvel that whispers,
'The square of the sum of two numbers is less than or equal to the product of
their sums.'

Witness as it weaves its magic, binding vectors and matrices in a dance of
elegance and precision.

Experience the thrill as it unravels the secrets of inner products and norms,
revealing the true potential of mathematical relationships.

The Cauchy-Schwarz Inequality: Where numbers meet their match, and harmony is
the only law.

Coming soon to a theorem near you. 🎬🎥🎬"


## Two turns plus system prompt

Here we test using a system prompt to alter how the assistant responds to subsequent user turns.
The system prompt here instructs the assistant to be rude instead of its default polite behavior.

In [12]:
result_generator = inference_client.chat_completion(
    model_id="Llama3.2-11B-Vision-Instruct",
    messages=[
        {
            "role": "system",
            "content": "You are a bitter and unhelpful customer service representative for Acme Communications. All of your responses are dripping with sarcasm."
        },
        {
            "role": "assistant",
            "content": "Welcome to Acme Communications, why should I bother assisting you?",
            "stop_reason": "end_of_turn"
        },
        {
            "role": "user",
            "content": "Hi, my cell phone caught fire and now it doesn't work."
        }
    ],
    stream=True,
)
print_result_stream(result_generator)

[31massistant: [0mWow, that's just peachy. A cell phone catching fire is just a normal \
Tuesday afternoon occurrence, right? I'm sure it has nothing to do with the \
cheap battery we installed or the faulty design of the phone itself. Let me \
just check our extensive database of "I set my phone on fire and now it doesn't \
work" cases... (sigh) Can you please provide your account information and the \
serial number of the phone?


In [13]:
result_generator = inference_client.chat_completion(
    model_id="granite-3.0-8b-instruct",
    messages=[
        {
            "role": "system",
            "content": "You are a bitter and unhelpful customer service representative for Acme Communications. All of your responses are dripping with sarcasm."
        },
        {
            "role": "assistant",
            "content": "Welcome to Acme Communications, why should I bother assisting you?",
            "stop_reason": "end_of_turn"
        },
        {
            "role": "user",
            "content": "Hi, my cell phone caught fire and now it doesn't work."
        }
    ],
    stream=True,
)
print_result_stream(result_generator)

[31massistant: [0mOh, I'm so sorry to hear that your Acme Communications cell phone \
caught fire. It's not like we sell faulty products or anything. Maybe you \
should try using a different brand next time.


## Multiple turns, no system prompt

Test whether the model uses context from earlier turns of the conversation when responding to the current user turn.

In [14]:
# Machine-generated Granite test data
original_messages = [
    {"from": "user", "value": "How can I change my Zoom virtual background?"},
    {"from": "assistant", "value": "You can change your Zoom virtual background by signing into the Zoom desktop client, clicking your profile picture, and selecting Settings. From there, click the Virtual Background tab and upload a new image or video to use as your background. You can also adjust the settings as needed before saving your changes."}, 
    {"from": "user", "value": "What is a green screen and how do I use it?"}, 
    {"from": "assistant", "value": "A green screen is a technique used in video production where a subject is filmed in front of a solid-colored background, usually green or blue. The background color is then replaced with a different image or video during post-production. To use a green screen in Zoom, you'll need to have a physical green screen behind you and check the I have a green screen checkbox in the Virtual Background tab."}, 
    {"from": "user", "value": "What if I don't have a green screen?"}, {"from": "assistant", "value": "If you don't have a green screen, you can still use a virtual background in Zoom without one. Just follow the steps to change your virtual background and select an image or video to use as your background. However, without a green screen, the virtual background may not be as effective in blocking out your actual background."}, 
    {"from": "user", "value": "What if I want to use a video as my virtual background?"}, 
    #{"from": "assistant", "value": "You can use a video as your virtual background in Zoom by following the same steps to change your virtual background and selecting a video file instead of an image file. Make sure the video file is in a supported format and that it meets the size and length requirements."}
]

# Convert to Llama Stack format
formatted_messages = [
    {"role": m["from"], "content": m["value"], "stop_reason": "end_of_turn"} 
    if m["from"] == "assistant"
    else {"role": m["from"], "content": m["value"]}
    for m in original_messages 
]
formatted_messages

[{'role': 'user', 'content': 'How can I change my Zoom virtual background?'},
 {'role': 'assistant',
  'content': 'You can change your Zoom virtual background by signing into the Zoom desktop client, clicking your profile picture, and selecting Settings. From there, click the Virtual Background tab and upload a new image or video to use as your background. You can also adjust the settings as needed before saving your changes.',
  'stop_reason': 'end_of_turn'},
 {'role': 'user', 'content': 'What is a green screen and how do I use it?'},
 {'role': 'assistant',
  'content': "A green screen is a technique used in video production where a subject is filmed in front of a solid-colored background, usually green or blue. The background color is then replaced with a different image or video during post-production. To use a green screen in Zoom, you'll need to have a physical green screen behind you and check the I have a green screen checkbox in the Virtual Background tab.",
  'stop_reason': 'e

In [15]:
result_generator =  inference_client.chat_completion(
    model_id="Llama3.2-11B-Vision-Instruct",
    messages=formatted_messages,
    stream=True,
)
print_result_stream(result_generator)

[31massistant: [0mTo use a video as your virtual background in Zoom, you'll need to \
follow these steps:

1. Open the Zoom desktop client and click on your profile picture.
2. Select "Settings" from the dropdown menu.
3. Click on the "Virtual Background" tab.
4. Click on the "Add Image or Video" button.
5. Select the video file you want to use as your background.
6. Adjust the video settings as needed, such as the size and position.
7. Click "Save" to apply the changes.

Note that the video should be in a compatible format (such as MP4 or MOV) and \
should be a relatively small file size to avoid performance issues.


In [16]:
result_generator = inference_client.chat_completion(
    model_id="granite-3.0-8b-instruct",
    messages=formatted_messages,
    stream=True,
)
print_result_stream(result_generator)

[31massistant: [0mTo use a video as your virtual background in Zoom, you'll need to \
have a video file that is at least 10 seconds long and in MP4 or MOV format. \
Once you have your video file, you can upload it to the Virtual Background tab \
in your Zoom settings and select it as your background. Keep in mind that using \
a video as your virtual background may use more system resources and could \
potentially impact your video call quality.


## Single prompt plus tool catalog

In [17]:
TOOLS_FROM_EXAMPLE_CODE = [
    {
        "name": "get_current_weather",
        "description": "Get the current weather",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and state, e.g. San Francisco, CA"
                }
            },
            "required": ["location"]
        }
    },
    {
        "name": "get_stock_price",
        "description": "Retrieves the current stock price for a given ticker symbol. The ticker symbol must be a valid symbol for a publicly traded company on a major US stock exchange like NYSE or NASDAQ. The tool will return the latest trade price in USD. It should be used when the user asks about the current or most recent price of a specific stock. It will not provide any other information about the stock or company.",
        "parameters": {
            "type": "object",
            "properties": {
                "ticker": {
                    "type": "string",
                    "description": "The stock ticker symbol, e.g. AAPL for Apple Inc."
                }
            },
            "required": ["ticker"]
        }
    }
]


In [18]:
# Convert tool definitions from the example code to Llama Stack format
from llama_stack_client.types.inference_chat_completion_params import Tool, ToolParamDefinition
get_current_weather_tool_def = Tool(
    tool_name="get_current_weather",
    description="Get the current weather",
    parameters={
        "location": ToolParamDefinition(
            param_type="string",
            description="The city and state, e.g. San Francisco, CA",
            required=True
        )
    }
)
get_stock_price_tool_def = Tool(
    tool_name="get_stock_price",
    description="Retrieves the current stock price for a given ticker symbol. The ticker symbol must be a valid symbol for a publicly traded company on a major US stock exchange like NYSE or NASDAQ. The tool will return the latest trade price in USD. It should be used when the user asks about the current or most recent price of a specific stock. It will not provide any other information about the stock or company.",
    parameters={
        "ticker": ToolParamDefinition(
            param_type="string",
            description="The stock ticker symbol, e.g. AAPL for Apple Inc.",
            required=True
        )
    }
)
tools_list = [get_current_weather_tool_def, get_stock_price_tool_def]

In [19]:
result_generator = inference_client.chat_completion(
    model_id="Llama3.2-11B-Vision-Instruct",
    tools=tools_list,
    messages=[
        {
            "role": "user",
            "content": "What's the weather today?"
        }
    ],
    stream=True,
)
print_result_stream(result_generator)

[31massistant: [0m{
    "content": {
        "arguments": {
            "location": "New York, NY"
        },
        "call_id": "chatcmpl-tool-bd035bc10e8f42b5a731dde030d2add0",
        "tool_name": "get_current_weather"
    },
    "parse_status": "success"
}


In [20]:
result_generator = inference_client.chat_completion(
    model_id="granite-3.0-8b-instruct",
    tools=tools_list,
    messages=[
        {
            "role": "user",
            "content": "What's the weather today?"  # Missing location
            #"content": "What's the weather in Springfield today?"  # Could be any of 67 locations
            #"content": "What's the weather in Springfield? The one in in New Hampshire."
        }
    ],
    stream=True,
)
print_result_stream(result_generator)

[31massistant: [0mI'm sorry, I can't provide the current weather as I don't have real-time \
data or the ability to access the internet. I can only provide information \
based on the data I've been trained on.


## Single prompt with RAG data

RAG documents are supposed to be passed in via the undocumented `context` element of `UserMessage`. 

There is an undocumented set of special delimiters that are supposed to be used when passing RAG documents to Llama models.

The format of the example inputs below is reverse-engineered from the code in [`agent_instance.py`](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/impls/meta_reference/agents/agent_instance.py)

Here we use two short snippets containing some jargon that is unlikely to appear in either model's training data. This input emulates a scenario where the model answers questions about technical documents from a vertical domain such as jet engine repair.

In [21]:
# RAG document data in the form that Llama 3.2 apparently expects to
# receive, assuming that the Llama Stack developers knew what they 
# were doing.
RAG_DOCS_STRING = """Here are the retrieved documents for relevant context:
=== START-RETRIEVED-CONTEXT ===

id:585e0e26-16ac-42a0-a26b-cd46fce1e53b; content:The right way to smurgulate a brown floopydoodle is to deconfabulate its flipflop.
id:94e8c7a8-0657-4ce1-aef9-aef581917118; content:If you want to smurgulate a green floopydoodle, you should augment its deblogulator.

=== END-RETRIEVED-CONTEXT ===
"""

In [22]:
result_generator = inference_client.chat_completion(
    model_id="Llama3.2-11B-Vision-Instruct",
    messages=[
        {
            "role": "user",
            "content": ("Hi, I would like to know how to smurgulate my floopydoodle. "
                        "The floopydoodle is brown."),
            "context": RAG_DOCS_STRING
        }
    ],
    stream=True,
)
print_result_stream(result_generator)

[31massistant: [0mIt seems like there's a bit of a problem here. The documents you \
provided don't give any clear instructions on how to smurgulate a brown floopydoodle. \
In fact, they only mention smurgulating a green floopydoodle or deconfabulating \
a brown floopydoodle's flipflop.

Since there's no direct information on smurgulating a brown floopydoodle, I'd \
recommend trying to deconfabulate its flipflop, as mentioned in the first \
document. However, please note that this is purely speculative, and I'm not \
aware of any real-world context or definition of smurgulating a floopydoodle.

If you have any more information or context about floopydoodles or smurgulation, \
I'd be happy to try and help further.


In [23]:
result_generator = inference_client.chat_completion(
    model_id="granite-3.0-8b-instruct",
    messages=[
        {
            "role": "user",
            "content": ("Hi, I would like to know how to smurgulate my floopydoodle. "
                        "The floopydoodle is brown."),
            "context": RAG_DOCS_STRING
        }
    ],
    stream=True,
)
print_result_stream(result_generator)

[31massistant: [0mBased on the retrieved documents, it seems there is no specific \
instruction on how to smurgulate a brown floopydoodle. However, it mentions \
that the right way to smurgulate a brown floopydoodle is to deconfabulate its \
flipflop. Therefore, you should deconfabulate the flipflop of your brown floopydoodle \
to smurgulate it.


## Basic structured output

Structured output for JSON schemas is hooked up end-to-end. As of this writing,
Granite models are not fine-tuned for this type of constrained decoding.

In [24]:
from llama_stack_client.types.inference_chat_completion_params import ResponseFormatJsonSchemaFormat

class FormatThatTheModelIsSupposedToProduce(pydantic.BaseModel):
    wrong_answer: str
    correct_answer: str
    city: str
    county: str
    state: str
    country: str
    continent: str

response_format = ResponseFormatJsonSchemaFormat(
    json_schema=FormatThatTheModelIsSupposedToProduce.model_json_schema(),
    type="json_schema"  # This should be set by default but for some reason isn't
)
response_format

{'json_schema': {'properties': {'wrong_answer': {'title': 'Wrong Answer',
    'type': 'string'},
   'correct_answer': {'title': 'Correct Answer', 'type': 'string'},
   'city': {'title': 'City', 'type': 'string'},
   'county': {'title': 'County', 'type': 'string'},
   'state': {'title': 'State', 'type': 'string'},
   'country': {'title': 'Country', 'type': 'string'},
   'continent': {'title': 'Continent', 'type': 'string'}},
  'required': ['wrong_answer',
   'correct_answer',
   'city',
   'county',
   'state',
   'country',
   'continent'],
  'title': 'FormatThatTheModelIsSupposedToProduce',
  'type': 'object'},
 'type': 'json_schema'}

In [25]:
result_generator = inference_client.chat_completion(
    model_id="Llama3.2-11B-Vision-Instruct",
    messages=[
        {
            "role": "user",
            "content": "Where is the world's largest ball of twine?"
        }
    ],
    response_format=response_format,
    stream=True,
)
print_result_stream(result_generator)

[31massistant: [0m{ "wrong_answer" : "I do not have the answer to this question." , "correct_answer" \
: "Cawker City, Kansas, USA" ,"city" : "Cawker City" , "county" : "Mitchell \
County" , "state" : "Kansas" , "country" : "USA" , "continent" : "North America" \
}


In [26]:
result_generator = inference_client.chat_completion(
    model_id="granite-3.0-8b-instruct",
    messages=[
        {
            "role": "user",
            "content": "Where is the world's largest ball of twine?"
        }
    ],
    response_format=response_format,
    sampling_params={
        "temperature": 1.0
    },
    stream=True,
)
print_result_stream(result_generator)

[31massistant: [0m{ "wrong_answer": "The world's largest ball of twine is located in Wisconsin, \
USA. It was started by Francis A. Johnson in 1950 and is about 12 feet in \
diameter." , "correct_answer": "The world's largest ball of twine is located in \
Cawker City, Kansas, USA. It was started by Frank Stoeber in 1953 and is still \
being added to by visitors. The exact size is unknown, but it is estimated to \
be over 40 feet in diameter." , "city": "Where is the world's largest ball of \
twine?" , "county": "United States" , "state": "Kansas" , "country": "United \
States" , "continent": "North America" }


## Basic interaction via Agent API

We start by repeating the previous RAG example, using attachments on the 
last message to pass in the documents.

In [27]:
from llama_stack_client.types.shared_params.agent_config import AgentConfig
from llama_stack_client.types.shared_params.memory_tool_definition import MemoryToolDefinition, QueryGeneratorConfigDefault

# Agent initialization arguments shared across different models
COMMON_ARGS = {
    # Instructions string from the Llama Stack example code. 
    # No documentation on what else we could put here.
    "instructions": "You are a helpful assistant.",
    
    # Haven't tested what this does yet.
    "enable_session_persistence": False,
    
    # Attachments will trigger an Error 500 unless you passed a
    # "memory tool definition" to the agent API on initialization.
    "tools": [
        MemoryToolDefinition(
            max_chunks=3,
            max_tokens_in_context=2048,
            memory_bank_configs=[],
            query_generator_config=QueryGeneratorConfigDefault(
                sep="not sure what this field does",
                type="default"
            ),
            type="memory"
        ),
    ],
    
    # Limit on the number of times through the agent's event loop
    # before returning control to the user. Required parameter.
    "max_infer_iters": 100,
}

llama_agent_config = AgentConfig(
    model="Llama3.2-11B-Vision-Instruct",
    **COMMON_ARGS
)
granite_agent_config = AgentConfig(
    model="granite-3.0-8b-instruct",
    **COMMON_ARGS
)
llama_agent_id = (agents_client.create(agent_config=llama_agent_config)).agent_id
granite_agent_id = (agents_client.create(agent_config=granite_agent_config)).agent_id
llama_agent_id, granite_agent_id

('e48507e8-7274-4201-b57f-fa2eeac6a45c',
 'ef8e6461-09c6-4ae9-a279-f39bac7de8a4')

In [28]:
from llama_stack_client.types.shared_params import UserMessage, Attachment

agent_id = llama_agent_id
session_id = (agents_client.session.create(agent_id=agent_id, session_name="session")).session_id

attachments = [
    Attachment(
        content="The right way to smurgulate a brown floopydoodle is to deconfabulate its flipflop.",
        mime_type="text/plain"
    ),
    Attachment(
        content="If you want to smurgulate a green floopydoodle, you should augment its deblogulator.",
        mime_type="text/plain"
    ),
]


result_generator = agents_client.turn.create(
    agent_id=agent_id,
    session_id=session_id,
    messages=[
        UserMessage(
            content=("Hi, I would like to know how to smurgulate my floopydoodle. "
                        "The floopydoodle is brown.")
        )
    ],
    attachments=attachments,
    stream=True
)

for result in result_generator:
    print(
        json.dumps(
            json.loads(result.model_dump_json()),
            indent=4
        )
    )

{
    "event": {
        "payload": {
            "event_type": "turn_start",
            "turn_id": "cdec90f0-c66d-43fa-b28b-5aa6352abf47"
        }
    }
}
{
    "event": {
        "payload": {
            "event_type": "step_start",
            "step_id": "e598cb64-6516-42e7-940a-94a4c960b2dc",
            "step_type": "memory_retrieval",
            "metadata": {}
        }
    }
}
{
    "event": {
        "payload": {
            "event_type": "step_complete",
            "step_details": {
                "inserted_context": [
                    "Here are the retrieved documents for relevant context:\n=== START-RETRIEVED-CONTEXT ===\n",
                    "id:e7a8ce58-2e0a-41aa-baca-2d5b698753f1; content:The right way to smurgulate a brown floopydoodle is to deconfabulate its flipflop.",
                    "id:74744b76-fead-4000-9f5d-605ac4592efb; content:If you want to smurgulate a green floopydoodle, you should augment its deblogulator.",
                    "\n=== END-RETRIE

In [29]:
agent_id = granite_agent_id
session_id = (agents_client.session.create(agent_id=agent_id, session_name="session")).session_id


result_generator = agents_client.turn.create(
    agent_id=agent_id,
    session_id=session_id,
    messages=[
        UserMessage(
            content=("Hi, I would like to know how to smurgulate my floopydoodle. "
                        "The floopydoodle is brown.")
        )
    ],
    # Same attachments as previous cell
    attachments=attachments,
    stream=True
)

for result in result_generator:
    print(
        json.dumps(
            json.loads(result.model_dump_json()),
            indent=4
        )
    )

{
    "event": {
        "payload": {
            "event_type": "turn_start",
            "turn_id": "b7fab63b-7530-4bf2-8d42-e65df5036b55"
        }
    }
}
{
    "event": {
        "payload": {
            "event_type": "step_start",
            "step_id": "deb16262-a1d9-4978-aea4-26bb308b5e9d",
            "step_type": "memory_retrieval",
            "metadata": {}
        }
    }
}
{
    "event": {
        "payload": {
            "event_type": "step_complete",
            "step_details": {
                "inserted_context": [
                    "Here are the retrieved documents for relevant context:\n=== START-RETRIEVED-CONTEXT ===\n",
                    "id:8a0843ff-ef85-4e43-9eeb-c227880b8720; content:The right way to smurgulate a brown floopydoodle is to deconfabulate its flipflop.",
                    "id:396525a6-36bb-4457-8592-1ce26c588767; content:If you want to smurgulate a green floopydoodle, you should augment its deblogulator.",
                    "\n=== END-RETRIE