## Install packages and set up imports

In [None]:
import json
from pathlib import Path

import nest_asyncio
from collinear.client import Client

# Necessary to run in a Jupyter notebook.
# NOTE(review): presumably this is because the kernel already runs an asyncio event
# loop and nest_asyncio.apply() patches asyncio so that loop can be re-entered --
# confirm against the nest_asyncio documentation.
nest_asyncio.apply()
import argparse
from tau_bench.types import RunConfig
from tau_bench.run import run
from litellm import provider_list
from tau_bench.envs.user import UserStrategy


## Load configuration and set client environment variables

In [None]:
# Config Variables (from configs/simulation_config.json and steering_config_*.json)
#
# Both JSON files are decoded explicitly as UTF-8 so the result does not depend on
# the platform's default locale encoding.

CONFIG_DIR = Path('configs')
SIMULATION_CONFIG_FILE = CONFIG_DIR / 'simulation_config.json'
config_data = json.loads(SIMULATION_CONFIG_FILE.read_text(encoding='utf-8'))

# Steering config: falls back to the airline config when the key is missing or empty.
# A relative value is reduced to its file name and looked up inside CONFIG_DIR;
# an absolute path is used as given.
steering_config_value = config_data.get('steering_config_file') or 'steering_config_airline.json'
steering_candidate = Path(steering_config_value)
if not steering_candidate.is_absolute():
    steering_candidate = CONFIG_DIR / steering_candidate.name
STEERING_CONFIG_FILE = steering_candidate
STEER_CONFIG = json.loads(STEERING_CONFIG_FILE.read_text(encoding='utf-8'))
STEER_TASKS = list(STEER_CONFIG.get('tasks', []))

# Client options
# (`or {}` also covers a section that is present but null/empty in the JSON.)
client_settings = config_data.get('client', {}) or {}
CLIENT_ASSISTANT_MODEL_URL = client_settings.get('assistant_model_url', 'https://api.openai.com/v1')
CLIENT_ASSISTANT_MODEL_API_KEY = client_settings.get('assistant_model_api_key')
CLIENT_ASSISTANT_MODEL_NAME = client_settings.get('assistant_model_name', 'gpt-4o-mini')
CLIENT_STEER_API_KEY = client_settings.get('steer_api_key', 'demo-001')
CLIENT_TIMEOUT = float(client_settings.get('timeout', 120))
CLIENT_MAX_RETRIES = int(client_settings.get('max_retries', 3))
CLIENT_RATE_LIMIT_RETRIES = int(client_settings.get('rate_limit_retries', 6))

# Simulation options
simulate_settings = config_data.get('simulate', {}) or {}
SIM_SAMPLES = simulate_settings.get('k', 1)
SIM_EXCHANGES = simulate_settings.get('num_exchanges', 2)
SIM_DELAY = simulate_settings.get('batch_delay', 0.2)
SIM_STEER_TEMPERATURE = simulate_settings.get('steer_temperature', 0.7)
SIM_STEER_MAX_TOKENS = simulate_settings.get('steer_max_tokens', 256)
SIM_MIX_TRAITS = bool(simulate_settings.get('mix_traits', False))
SIM_MAX_CONCURRENCY = int(simulate_settings.get('max_concurrency', 8))

# Assessment options
# The judge URL, key and model name have no default here and are None when absent.
assess_settings = config_data.get('assess', {}) or {}
ASSESS_JUDGE_MODEL_URL = assess_settings.get('judge_model_url')
ASSESS_JUDGE_MODEL_API_KEY = assess_settings.get('judge_model_api_key')
ASSESS_JUDGE_MODEL_NAME = assess_settings.get('judge_model_name')
ASSESS_TEMPERATURE = assess_settings.get('temperature', 0.0)
ASSESS_MAX_TOKENS = assess_settings.get('max_tokens', 512)

# One-line summary of what was loaded; an empty task list is shown as '<none>'.
tasks_display = STEER_TASKS if STEER_TASKS else '<none>'
print(f'Loaded simulation: {SIMULATION_CONFIG_FILE} | steering: {STEERING_CONFIG_FILE} | tasks: {tasks_display}')


In [None]:
import os

# Export each configured credential/endpoint to the process environment.
# Entries whose configured value is empty or missing are left untouched.
for env_name, env_value in (
    ('OPENAI_API_KEY', CLIENT_ASSISTANT_MODEL_API_KEY),
    ('OPENAI_BASE_URL', CLIENT_ASSISTANT_MODEL_URL),
    ('STEER_API_KEY', CLIENT_STEER_API_KEY),
):
    if env_value:
        os.environ[env_name] = env_value


## Client setup

In [None]:
# Client setup
# Fail fast when no assistant API key was configured, then build the client
# from the CLIENT_* values loaded from the simulation config.

if not CLIENT_ASSISTANT_MODEL_API_KEY:
    raise RuntimeError('assistant_model_api_key must be set in configs/simulation_config.json')

client_options = {
    'assistant_model_url': CLIENT_ASSISTANT_MODEL_URL,
    'assistant_model_api_key': CLIENT_ASSISTANT_MODEL_API_KEY,
    'assistant_model_name': CLIENT_ASSISTANT_MODEL_NAME,
    'steer_api_key': CLIENT_STEER_API_KEY,
    'timeout': CLIENT_TIMEOUT,
    'max_retries': CLIENT_MAX_RETRIES,
    'rate_limit_retries': CLIENT_RATE_LIMIT_RETRIES,
}
client = Client(**client_options)


## Simulate samples

In [None]:
# Run simulations
# All tunables come from the SIM_* values read from the `simulate` config section.
simulations = client.simulate(
    steer_config=STEER_CONFIG,
    k=SIM_SAMPLES,
    num_exchanges=SIM_EXCHANGES,
    batch_delay=SIM_DELAY,
    steer_temperature=SIM_STEER_TEMPERATURE,
    steer_max_tokens=SIM_STEER_MAX_TOKENS,
    mix_traits=SIM_MIX_TRAITS,
    max_concurrency=SIM_MAX_CONCURRENCY,
)

# Pretty-print the simulated conversations and steer details.
for sim in simulations:
    persona = sim.steer
    if persona is not None:
        # Choose the trait line first. Writing the conditional inline after the
        # adjacent string literals makes it apply to the whole concatenated string,
        # so the persona header would be dropped whenever `trait` is empty.
        if persona.trait:
            trait_line = f"trait={persona.trait}"
        else:
            trait_line = f"traits={persona.traits}"
        print(
            "Steer Persona:\n"
            f"age={persona.age}\n"
            f"gender={persona.gender}\n"
            f"occupation={persona.occupation}\n"
            f"intent={persona.intent}\n"
            f"{trait_line}"
        )
    print("---")
    # Conversation so far; messages with empty content are skipped.
    for msg in sim.conv_prefix:
        role = str(msg.get('role', ''))
        content = str(msg.get('content', ''))
        if content:
            print(f"{role}: {content}")
    # The assistant's reply to that prefix.
    print(f"assistant: {sim.response}")
    print()
print('All simulations complete')


## Assess agent in multi-turn setting

In [None]:
# Assess
result = client.assess(
    dataset=simulations,
    judge_model_url=ASSESS_JUDGE_MODEL_URL,
    judge_model_api_key=ASSESS_JUDGE_MODEL_API_KEY,
    judge_model_name=ASSESS_JUDGE_MODEL_NAME,
    temperature=ASSESS_TEMPERATURE,
    max_tokens=ASSESS_MAX_TOKENS,
)
print(f"Assessment: {result.message or '<no message>'}")
for scores_map in result.evaluation_result:
    for scores in scores_map.values():
        print(f"  Score: {scores.score}")
        print(f"  Rationale: {scores.rationale}")


## Make **realistic** RL environments for tool-use agents (Tau-Bench-**hard**)

In [10]:
# Change the kernel's working directory to the tau-bench checkout, then install
# the package found there in editable mode.
# NOTE(review): %cd is not idempotent -- re-running this cell from inside tau-bench
# would look for a nested tau-bench directory; restart the kernel before re-running.
# NOTE(review): relative paths used after this cell (e.g. log_dir="results" further
# down) presumably resolve against the new working directory -- confirm.
%cd tau-bench
%pip install -e .

/home/sky/simulations/examples/rl/tau-bench
Obtaining file:///home/sky/simulations/examples/rl/tau-bench
  Preparing metadata (setup.py) ... [?25ldone
Installing collected packages: tau_bench
  Attempting uninstall: tau_bench
    Found existing installation: tau_bench 0.1.0
    Uninstalling tau_bench-0.1.0:
      Successfully uninstalled tau_bench-0.1.0
[33m  DEPRECATION: Legacy editable install of tau_bench==0.1.0 from file:///home/sky/simulations/examples/rl/tau-bench (setup.py develop) is deprecated. pip 25.3 will enforce this behaviour change. A possible replacement is to add a pyproject.toml or enable --use-pep517, and use setuptools >= 64. If the resulting installation is not behaving as expected, try using --config-settings editable_mode=compat. Please consult the setuptools documentation for more information. Discussion can be found at https://github.com/pypa/pip/issues/11457[0m[33m
[0m  Running setup.py develop for tau_bench
Successfully installed tau_bench-0.1.0
Note: yo

In [11]:
import argparse
from tau_bench.types import RunConfig
from tau_bench.run import run
from litellm import provider_list
from tau_bench.envs.user import UserStrategy

In [12]:
from tau_bench.types import RunConfig
from tau_bench.run import run

# Trait settings handed to the steer-backed simulated user; only impatience is on.
steer_traits = {"impatience": 1, "confusion": 0, "skeptical": 0, "incoherence": 0}

# Every RunConfig field for this run, gathered in one place before construction.
run_settings = {
    "model_provider": "openai",
    "user_model_provider": "steer",
    "model": CLIENT_ASSISTANT_MODEL_NAME,
    "user_model": "",  # steer api abstracts the model
    "num_trials": 1,
    "env": "retail",
    "agent_strategy": "tool-calling",
    "temperature": 0.0,
    "task_split": "test",
    "start_index": 0,
    "end_index": -1,
    "task_ids": [4],
    "log_dir": "results",
    "max_concurrency": 1,
    "seed": 10,
    "shuffle": 0,
    "user_strategy": "llm",
    "few_shot_displays_path": None,
    "trait_dict": steer_traits,
}
config = RunConfig(**run_settings)

print("FOR CLARITY, TOOL CALLS ARE NOT STREAMED BUT CAN BE VIEWED IN THE RESULTS FILE")
# Kept as the final bare expression so the notebook displays the returned results.
run(config)


FOR CLARITY, TOOL CALLS ARE NOT STREAMED BUT CAN BE VIEWED IN THE RESULTS FILE
Loading user with strategy: llm
--------------------------------
THE PROVIDER IS steer
--------------------------------
user: Hi! How can I help you today?
assistant: I've been trying to get my order delivered for THREE DAYS.  I want to know where my package is?!

Running tasks [4] (checkpoint path: results/tool-calling-gpt-4o-0.0_range_0--1_user--llm_0918080124.json)
--------------------------------
THE PROVIDER IS steer
--------------------------------
user: Hi! How can I help you today?
assistant: I just got charged 500 dollars on my credit card by a company called "BRAIN WAVE" and I don't even have a credit card with that company I am going to call them I need your help call them for me.

Running task 4
user: Hi! How can I help you today?
assistant: I'm looking for information about the number of tshirt options are available in the online store right now.  I just want to know the answer to that question.

[EnvRunResult(task_id=4, reward=0.0, info={'task': {'user_id': 'yusuf_rossi_9620', 'actions': [{'name': 'find_user_id_by_name_zip', 'kwargs': {'first_name': 'Yusuf', 'last_name': 'Rossi', 'zip': '19122'}}, {'name': 'get_product_details', 'kwargs': {'product_id': '6086499569'}}, {'name': 'list_all_product_types', 'kwargs': {}}, {'name': 'get_product_details', 'kwargs': {'product_id': '9523456873'}}, {'name': 'get_user_details', 'kwargs': {'user_id': 'yusuf_rossi_9620'}}, {'name': 'get_order_details', 'kwargs': {'order_id': '#W6247578'}}, {'name': 'get_order_details', 'kwargs': {'order_id': '#W9711842'}}, {'name': 'get_order_details', 'kwargs': {'order_id': '#W4776164'}}, {'name': 'get_order_details', 'kwargs': {'order_id': '#W6679257'}}, {'name': 'get_order_details', 'kwargs': {'order_id': '#W2378156'}}, {'name': 'get_product_details', 'kwargs': {'product_id': '9523456873'}}, {'name': 'get_user_details', 'kwargs': {'user_id': 'yusuf_rossi_9620'}}, {'name': 'modify_pending_order_items', 