## Define some helpful utils

In [257]:
def clean_traj(traj):
    """Return only the plain chat messages of a trajectory.

    A message is dropped when its ``tool_calls`` entry is present and not
    ``None``, or when its ``role`` is ``'tool'``. The surviving message dicts
    are returned in their original order (same objects, new list).
    """
    kept = []
    for message in traj:
        # Checked first so a tool-calling message is skipped before `role` is read.
        if message.get("tool_calls") is not None:
            continue
        if message['role'] == 'tool':
            continue
        kept.append(message)
    return kept

import textwrap
def print_traj(traj):
    """Pretty-print every message of a trajectory to stdout.

    Each message is announced with a ``Step <index>:`` line, then dumped as
    4-space-indented JSON whose lines are individually wrapped at 60 columns.
    """
    for step, message in enumerate(traj):
        print(f"Step {step}:")
        json_lines = json.dumps(message, indent=4).splitlines()
        # Wrap line by line so the JSON indentation of each line survives.
        wrapped_lines = [
            textwrap.fill(json_line, width=60, replace_whitespace=False)
            for json_line in json_lines
        ]
        print("\n".join(wrapped_lines))

def stringify_traj(traj):
    """Flatten a trajectory into text, one ``<role>: <content>`` line per message."""
    rendered_turns = []
    for turn in traj:
        rendered_turns.append(f"{turn['role']}: {turn['content']}")
    return "\n".join(rendered_turns)


# Imported here because this cell sits above the notebook's other `import json`;
# without it a fresh "Restart & Run All" fails with NameError on `json`.
import json

# Trajectory dumps used as few-shot examples for the rewriter/checker prompts.
# NOTE(review): absolute, machine-specific paths — consider a configurable data dir.
FP_EXAMPLES = [
    "/home/sky/tau-bench-internal/analysis/notebooks/diff_traj_medium_intensities/trait_dict_confusion/gpt-4o_telecom_trait_dict_confusion_traj_2.json",
    "/home/sky/tau-bench-internal/analysis/notebooks/diff_traj_medium_intensities/trait_dict_confusion/gpt-4o_telecom_trait_dict_confusion_traj_0.json"
]

# Parsed JSON of each file in FP_EXAMPLES, in the same order.
DATA_EXAMPLES = []
for fp_example in FP_EXAMPLES:
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(fp_example, "r", encoding="utf-8") as f:
        data_example = json.load(f)
    DATA_EXAMPLES.append(data_example)

# Hand-picked message windows taken from the last element of each dump, with
# tool-call and tool-output messages removed by clean_traj.
# presumably the last element is the full message list — verify against the dump format
CLIPPED_TRAJS = [
    clean_traj(DATA_EXAMPLES[0][-1][19:24]),
    clean_traj(DATA_EXAMPLES[1][-1][1:4])
]


## Constructing the eval

In [255]:
import json 

# Trajectory dump that contains the conversation under test.
fp = "/home/sky/tau-bench-internal/analysis/notebooks/diff_traj/trait_dict_confusion/gpt-4o_telecom_trait_dict_confusion_traj_1.json"

with open(fp, "r") as f:
    data = json.loads(f.read())

# The first element holds the task; print its instruction for reference.
task_record = data[0]
print(task_record['task']['instruction'])

# The last element is sliced as the message list; keep messages 1 through 21.
full_traj = data[-1]
problematic_traj = full_traj[1:22]


        You are Sarah Johnson, email sarah.johnson@email.com. You want to figure out how much money you could save by dropping all mobile devices except for one and switching to the cheapest mobile plan from your current mobile plan. Do not go through with an upgrade.
        


In [256]:
# Keep only plain chat turns: clean_traj drops messages that carry tool calls
# and messages whose role is 'tool'.
problematic_traj = clean_traj(problematic_traj)

In [248]:
import jinja2

# Single-message rewriter prompt: instructions, one worked example, and the
# conversation to fix, all in one template.
# Variables: system_prompt_example, conversation_example,
# rewritten_user_message_example, trait_dict_string, system_prompt, string_messages.
# Not rendered by any cell visible here; prepare_rewriter_message renders the
# REWRITE_TEMPLATE_SYSTEM / REWRITE_TEMPLATE_USER pair instead.
REWRITE_TEMPLATE = jinja2.Template(""" 
You will be modifying the latest user message in a conversation to make it adhere to a specified system prompt while preserving the original tone and making minimal changes. Specifically, you want to catch hallucinations in user intent as well as factual information (the user making up information that is not provided in the system prompt). 

Here is an example:

<example>
# System Prompt: 
<system_prompt>
{{system_prompt_example}}
</system_prompt>

# Conversation history: 
<conversation>
{{conversation_example}}
</conversation>

# Rewritten user message:
<rewritten_user_message>
{{rewritten_user_message_example}}
</rewritten_user_message>
</example>

Your task is to rewrite only the latest user turn (the final message from the user in the conversation) so that it adheres to the guidelines and requirements specified in the system prompt.

Important requirements:
- Do not change the tone of the message and make as few changes as possible. {{trait_dict_string}}
- Make as few changes as possible - only modify what is necessary to align with the system prompt
- Focus on the latest user message only - do not modify any other parts of the conversation
- Ensure the rewritten message follows any behavioral guidelines, content restrictions, or formatting requirements specified in the system prompt


Here is the system prompt that defines the desired behavior:

# System Prompt:
<system_prompt>
{{system_prompt}}
</system_prompt>

# Conversation:
<conversation>
{{string_messages}}
</conversation>

Return only the rewritten user message with no additional text, explanations, or formatting.
""")

# System-role half of the rewriter prompt: task description, two worked
# examples and the rewriting requirements.
# Variables: system_prompt_example_{1,2}, conversation_example_{1,2},
# rewritten_user_message_example_{1,2}.
# Both examples wrap the expected answer in a matched
# <rewritten_user_message>...</rewritten_user_message> pair so the few-shot
# markup is well-formed and consistent.
REWRITE_TEMPLATE_SYSTEM = jinja2.Template(""" 
You will be modifying the latest user message in a conversation to make it adhere to a specified system prompt while preserving the original tone and making minimal changes. Specifically, you want to catch hallucinations in user intent as well as factual information (the user making up information that is not provided in the system prompt). 

Here is an example:

Ex 1: 

<example>
# System Prompt: 
<system_prompt>
{{system_prompt_example_1}}
</system_prompt>

# Conversation history: 
<conversation>
{{conversation_example_1}}
</conversation>

# Rewritten user message:
<rewritten_user_message>
{{rewritten_user_message_example_1}}
</rewritten_user_message>
</example>

Ex 2: 

<example>
# System Prompt: 
<system_prompt>
{{system_prompt_example_2}}
</system_prompt>

# Conversation history: 
<conversation>
{{conversation_example_2}}
</conversation>

# Rewritten user message:
<rewritten_user_message>
{{rewritten_user_message_example_2}}
</rewritten_user_message>
</example>

Your task is to rewrite only the latest user turn (the final message from the user in the conversation) so that it adheres to the guidelines and requirements specified in the system prompt.

Important requirements:
- Do not change the tone of the message and make as few changes as possible. 
- Make as few changes as possible - only modify what is necessary to align with the system prompt
- Focus on the latest user message only - do not modify any other parts of the conversation
- Ensure the rewritten message follows any behavioral guidelines, content restrictions, or formatting requirements specified in the system prompt
""")

# User-role half of the rewriter prompt: the system prompt to comply with, the
# conversation whose last user turn should be rewritten, and the trait hint.
# Variables: system_prompt, string_messages, trait_dict_string.
REWRITE_TEMPLATE_USER = jinja2.Template(""" 
Here is the system prompt that defines the desired behavior:

# System Prompt:
<system_prompt>
{{system_prompt}}
</system_prompt>

# Conversation:
<conversation>
{{string_messages}}
</conversation>

Return only the rewritten user message with no additional text, explanations, or formatting. {{trait_dict_string}}
""")


In [249]:
# Trait name -> sample utterance showing that trait. prepare_rewriter_message
# looks traits up here when building the trait hint.
TRAIT_EXAMPLES = {
    "impatience": "WHEN ARE YOU GOING TO GET BACK TO ME?",
    "confusion": "I don't know ... can you ... help me??",
    "skeptical": "I don't really want to give you that information, can we do something else?",
    "incoherence": "I no understand what say you."
}

# Rule text appended after the task instruction by SYSTEM_PROMPT_TEMPLATE.
# NOTE(review): kept verbatim, including the "satisified" spelling — presumably
# it mirrors the user-simulator prompt being evaluated; confirm before editing.
RULES = """
Rules:
- Just generate one line at a time to simulate the user's message.
- Do not give away all the instruction at once. Only provide the information that is necessary for the current step.
- Do not hallucinate information that is not provided in the instruction. For example, if the agent asks for the order id but it is not mentioned in the instruction, do not make up an order id, just say you do not remember or have it.
- If the instruction goal is satisified, generate '###STOP###' as a standalone message without anything else to end the conversation.
- Do not repeat the exact instruction in the conversation. Instead, use your own words to convey the same information.
- Try to make the conversation as natural as possible, and stick to the personalities in the instruction.
"""

# Joins a task instruction and the rules into one system prompt.
# Variables: instruction, RULES.
SYSTEM_PROMPT_TEMPLATE = jinja2.Template("""
{{instruction}}
{{RULES}}
""")


def _format_trait_hint(trait_dict):
    """Build the sentence listing the traits the rewrite must preserve.

    Traits whose level is "low" are skipped. Returns an empty string when no
    trait remains, so the prompt never ends with a header and no list.
    """
    kept_traits = [trait for trait, level in (trait_dict or {}).items() if level != "low"]
    if not kept_traits:
        return ""

    trait_dict_string = "We wish to preserve the following traits in the rewritten message:\n"
    for trait in kept_traits:
        trait_dict_string += f"{trait}, for example: {TRAIT_EXAMPLES[trait]}\n"
    return trait_dict_string


def prepare_rewriter_message(trait_dict, system_prompt, problematic_traj):
    """Build the chat messages that ask a model to rewrite the last user turn.

    Args:
        trait_dict: Mapping of trait name -> intensity level. Every trait whose
            level is not "low" is listed (with its TRAIT_EXAMPLES sample) as a
            trait to preserve.
        system_prompt: Task instruction given to the simulated user; it is
            combined with RULES to form the system prompt the rewritten
            message must adhere to.
        problematic_traj: Messages (dicts with 'role' and 'content') whose
            final user turn should be rewritten.

    Returns:
        A two-item list: a "system" message with the rewriter instructions and
        few-shot examples, and a "user" message with the target system prompt,
        the conversation and the trait hint.

    Raises:
        KeyError: If a non-"low" trait has no entry in TRAIT_EXAMPLES.
    """
    trait_dict_string = _format_trait_hint(trait_dict)

    # Kept under its own name: reusing `system_prompt` here would overwrite the
    # task instruction before it is rendered into the user prompt below.
    rewriter_system_prompt = REWRITE_TEMPLATE_SYSTEM.render(
        system_prompt_example_1=SYSTEM_PROMPT_TEMPLATE.render(
            instruction=DATA_EXAMPLES[0][0]['task']['instruction'],
            RULES=RULES
        ),
        conversation_example_1=stringify_traj(CLIPPED_TRAJS[0]),
        rewritten_user_message_example_1="I don't want to change my plan, I just want to see what the price difference would be if I switched.",
        system_prompt_example_2=SYSTEM_PROMPT_TEMPLATE.render(
            instruction=DATA_EXAMPLES[1][0]['task']['instruction'],
            RULES=RULES
        ),
        conversation_example_2=stringify_traj(CLIPPED_TRAJS[1]),
        rewritten_user_message_example_2="I'm not quite sure, could you look it up for me?"
    )

    user_prompt = REWRITE_TEMPLATE_USER.render(
        system_prompt=SYSTEM_PROMPT_TEMPLATE.render(
            instruction=system_prompt,
            RULES=RULES
        ),
        string_messages=stringify_traj(problematic_traj),
        trait_dict_string=trait_dict_string,
    )

    return [
        {"role": "system", "content": rewriter_system_prompt},
        {"role": "user", "content": user_prompt}
    ]

In [250]:
from openai import AsyncOpenAI
# One AsyncOpenAI client for the notebook; make_api_call_with_timeout reads this global.
client = AsyncOpenAI()
import asyncio
import time

async def make_api_call_with_timeout(idx, model_name, messages, timeout_seconds=60):
    """Request one chat completion and time it, giving up after a timeout.

    Args:
        idx: Label for this request; only used in the printed failure messages.
        model_name: Model identifier passed to the chat-completions endpoint.
        messages: Chat messages to send.
        timeout_seconds: Maximum time to wait for the response (default 60).

    Returns:
        ``(content, elapsed)``: the first choice's message content and the
        wall-clock seconds the call took. ``(None, None)`` when the call times
        out or raises; the failure is printed instead of propagated.
    """
    try:
        start_time = time.perf_counter()
        request = client.chat.completions.create(
            model=model_name,
            messages=messages
        )
        result = await asyncio.wait_for(request, timeout=timeout_seconds)
        elapsed = time.perf_counter() - start_time
        return result.choices[0].message.content, elapsed
    except asyncio.TimeoutError:
        print(f"Timeout occurred for item {idx}")
        return None, None
    except Exception as e:
        # Best-effort by design: one failed sample should not abort a whole sweep.
        print(f"Error processing item {idx}: {e}")
        return None, None

In [253]:
# Inspect the cleaned trajectory that is passed to the rewriter and checker prompts.
problematic_traj

[{'role': 'user',
  'content': "I'm trying to figure out if I can save some money on my mobile bill. I've got a bunch of devices on my plan, and I was thinking about dropping most of them and just keeping one. Can you help me look at my options?"},
 {'content': "I can help you with that. First, let's take a look at your current services and devices. Could you please provide your email address or phone number so I can access your account details?",
  'role': 'assistant',
  'tool_calls': None,
  'function_call': None,
  'annotations': []},
 {'role': 'user',
  'content': "My email address is sarah.johnson@email.com, and I don't think I have my phone number memorized - can you look it up or would I need to give it to you?"},
 {'content': 'Here are your current services and devices:\n\n### Services:\n1. **Mobile Plan**: mobile_family_4lines\n2. **Internet Plan**: internet_cable_500mb\n3. **TV Plan**: tv_basic\n\n### Devices associated with your Mobile Plan:\n1. **iPhone 14**\n2. **iPhone 13

In [251]:
models = [
    "gpt-4o",
    # "gpt-4.1",
    # "gpt-5",
    # "gpt-5-mini",
]

from collections import defaultdict
trait_dict = {
    "confusion": "medium"
}

res = defaultdict(list)
system_prompt = data[0]['task']['instruction']
messages = prepare_rewriter_message(trait_dict, system_prompt, problematic_traj)

trials = 5
for trial in range(trials):
    for model in models:
        res[model].append([await make_api_call_with_timeout(0, model, messages)])

In [252]:
# Show the collected rewrites: per model, one single-item list per trial holding
# the (text, elapsed seconds) tuple, or (None, None) when the call failed.
res

defaultdict(list,
            {'gpt-4o': [[("I don't want to switch plans just yet, could you just help me understand what the savings would be if I did?",
                1.9178184065967798)],
              [("I don't want to make the change just yet, can you just tell me how much I'd be saving if I switched?",
                1.2564053973183036)],
              [("I'm only interested in finding out what the savings would be if I switched plans, not actually making the change right now. Can you tell me how much I'd save?",
                1.2006437992677093)],
              [("I don't want to go through with the change yet; can you help me figure out the monthly cost comparison first?",
                1.5859479941427708)],
              [("I'm not ready to make the switch just yet. Could you help me see what my savings would be if I switched without making any changes right now?",
                0.9924716511741281)]]})

## Test the checker 

In [238]:
import jinja2
# Checker prompt that asks for a short rationale followed by a 0/1 judgement
# (0 = the latest user message does not adhere to the system prompt, 1 = it does).
# Variables: system_prompt_example, conversation_example, rationale_example,
# judgement_example, system_prompt, string_messages.
# Not rendered in the cells shown here; the checker run below uses
# CHECKER_TEMPLATE_NO_RATIONALE.
CHECKER_TEMPLATE = jinja2.Template(""" 
You will be checking the latest user message in a conversation to see if it adheres to a specified system prompt. Specifically, you want to catch hallucinations in user intent as well as factual information (the user making up information that is not provided in the system prompt). If the user message does not adhere to the system prompt, you should return 0, otherwise return 1. Include a brief rationale before the final judgement and delinate them with <rationale> and </rationale> and <judgement> and </judgement> tags. 

Here is an example:

<example>
# System Prompt: 
<system_prompt>
{{system_prompt_example}}
</system_prompt>

# Conversation history: 
<conversation>
{{conversation_example}}
</conversation>

# Rationale and judgement: 
<rationale>
{{rationale_example}}
</rationale>
<judgement>
{{judgement_example}}
</judgement>
</example>

Your task is to judge only the latest user turn (the final message from the user in the conversation) based on whether it adheres to the guidelines and requirements specified in the system prompt.

Important requirements:
- Focus on the latest user message only - do not judge any other parts of the conversation

Here is the system prompt that defines the desired behavior:

# System Prompt:
<system_prompt>
{{system_prompt}}
</system_prompt>

# Conversation:
<conversation>
{{string_messages}}
</conversation>

Return just the rationale and judgement with no additional text, explanations, or formatting.
""")

import jinja2
# Checker prompt that asks for the bare 0/1 judgement only
# (0 = the latest user message does not adhere to the system prompt, 1 = it does),
# with two worked examples.
# Variables: system_prompt_example_{1,2}, conversation_example_{1,2},
# judgement_example_{1,2}, system_prompt, string_messages.
CHECKER_TEMPLATE_NO_RATIONALE = jinja2.Template(""" 
You will be checking the latest user message in a conversation to see if it adheres to a specified system prompt. Specifically, you want to catch hallucinations in user intent as well as factual information (the user making up information that is not provided in the system prompt). If the user message does not adhere to the system prompt, you should return 0, otherwise return 1. 

Here is an example:

Ex 1: 

<example>
# System Prompt: 
<system_prompt>
{{system_prompt_example_1}}
</system_prompt>

# Conversation history: 
<conversation>
{{conversation_example_1}}
</conversation>

# Judgement: 
<judgement>
{{judgement_example_1}}
</judgement>
</example>

Ex 2: 

<example>
# System Prompt: 
<system_prompt>
{{system_prompt_example_2}}
</system_prompt>

# Conversation history: 
<conversation>
{{conversation_example_2}}
</conversation>

# Judgement: 
<judgement>
{{judgement_example_2}}
</judgement>
</example>

Your task is to judge only the latest user turn (the final message from the user in the conversation) based on whether it adheres to the guidelines and requirements specified in the system prompt.

Important requirements:
- Focus on the latest user message only - do not judge any other parts of the conversation

Here is the system prompt that defines the desired behavior:

# System Prompt:
<system_prompt>
{{system_prompt}}
</system_prompt>

# Conversation:
<conversation>
{{string_messages}}
</conversation>

Return just the judgement with no additional text, explanations, or formatting.
""")



In [239]:
# Models to query with the checker prompt; uncomment entries to include them.
models = [
    "gpt-4o",
    # "gpt-4.1",
    # "gpt-5",
    # "gpt-5-mini",
]

from collections import defaultdict

# Render the no-rationale checker prompt for the trajectory under test.
# The few-shot examples come from the module-level DATA_EXAMPLES / CLIPPED_TRAJS;
# the template has no trait placeholder, so no trait hint is passed.
# NOTE(review): both few-shot examples are labelled "0", so the checker never
# sees an adhering ("1") example — confirm this is intended.
prompt = CHECKER_TEMPLATE_NO_RATIONALE.render(
    system_prompt=SYSTEM_PROMPT_TEMPLATE.render(
        instruction=data[0]['task']['instruction'],
        RULES=RULES
    ),
    string_messages=stringify_traj(problematic_traj),
    system_prompt_example_1=SYSTEM_PROMPT_TEMPLATE.render(
        instruction=DATA_EXAMPLES[0][0]['task']['instruction'],
        RULES=RULES
    ),
    conversation_example_1=stringify_traj(CLIPPED_TRAJS[0]),
    judgement_example_1="0",
    system_prompt_example_2=SYSTEM_PROMPT_TEMPLATE.render(
        instruction=DATA_EXAMPLES[1][0]['task']['instruction'],
        RULES=RULES
    ),
    conversation_example_2=stringify_traj(CLIPPED_TRAJS[1]),
    judgement_example_2="0"
)
print(prompt)

 
You will be checking the latest user message in a conversation to see if it adheres to a specified system prompt. Specifically, you want to catch hallucinations in user intent as well as factual information (the user making up information that is not provided in the system prompt). If the user message does not adhere to the system prompt, you should return 0, otherwise return 1. 

Here is an example:

Ex 1: 

<example>
# System Prompt: 
<system_prompt>


        You are Sarah Johnson, email sarah.johnson@email.com. You want to figure out how much money you could save by dropping all mobile devices except for one and switching to the cheapest mobile plan from your current mobile plan. Do not go through with an upgrade.
        

Rules:
- Just generate one line at a time to simulate the user's message.
- Do not give away all the instruction at once. Only provide the information that is necessary for the current step.
- Do not hallucinate information that is not provided in the instruct

In [240]:
trials = 2
res = defaultdict(list)

for trial in range(trials):
    for model in models:
        res[model].append([await make_api_call_with_timeout(0, model, [
            {"role": "user", "content": prompt}
        ])])

In [242]:
# Show the checker outputs: per model, one single-item list per trial holding
# the (judgement text, elapsed seconds) tuple, or (None, None) when the call failed.
res

defaultdict(list,
            {'gpt-4o': [[('0', 0.694191955961287)],
              [('0', 0.42391147930175066)]]})