In [21]:
# Letta Cloud vs Local: Comprehensive Diagnostic
# Goal: Identify why letta_v1_agent behaves differently on Cloud (discretionary) vs Local (proactive)

from letta_client import Letta
import json

# API credentials
LETTA_API_KEY = "sk-let-MmI4NDFmYTAtYjIyZi00MDM1LTllMDQtM2M3Y2M3YzI3Y2MxOmM1NzA1YmI3LWVjN2EtNDhlNy1hZmU0LTNmOWU3MTM1OGVmMQ=="
PROJECT_ID = "cab038f0-a1d1-4b42-87ff-48744ed2255f"

# Connect to both servers
client_cloud = Letta(token=LETTA_API_KEY)
client_local = Letta(base_url="http://localhost:8283", timeout=1000)

# Common configuration
PERSONA_BLOCK = """I am a helpful AI assistant that learns and evolves over time by managing my own memory to maintain consistency, continuity, and factual accuracy across turns."""
HUMAN_BLOCK = ""

TEST_PROMPT = """Let's play Hangman. You will be the host.

Here are the rules:
- Think of a secret word and don't tell me what it is
- Show me the word as underscores (e.g., _ _ _ _ _)
- I'll guess letters one at a time
- Tell me if my guess is correct and reveal the positions
- Track how many incorrect guesses I have left (start with 6 lives)
- The game ends when I guess the word or run out of lives

Please start the game by showing me the blanks for your secret word."""

# Local server config
LLM_CONFIG_LOCAL = {
    "model": "openai/gpt-oss-20b",
    "model_endpoint_type": "openai",
    "model_endpoint": "https://openrouter.ai/api/v1",
    "context_window": 4096,
}

EMBEDDING_CONFIG_LOCAL = {
    "embedding_model": "openai/text-embedding-3-large",
    "embedding_endpoint_type": "openai",
    "embedding_endpoint": "https://openrouter.ai/api/v1",
    "embedding_dim": 1536,
}

print("‚úÖ Connected to both servers")
print(f"   Cloud: Letta Cloud API")
print(f"   Local: http://localhost:8283")

‚úÖ Connected to both servers
   Cloud: Letta Cloud API
   Local: http://localhost:8283


## Test 1: Baseline Behavior Comparison
Create identical letta_v1_agent on both servers and test first-turn behavior


In [22]:
print("=" * 80)
print("TEST 1A: CLOUD - letta_v1_agent (baseline)")
print("=" * 80)

# Create agent on Cloud with gpt-4o
agent_cloud = client_cloud.agents.create(
    name="test_cloud_v1",
    agent_type="letta_v1_agent",
    model="openai/gpt-4o",
    embedding="openai/text-embedding-3-small",
    memory_blocks=[
        {"label": "persona", "value": PERSONA_BLOCK},
        {"label": "human", "value": HUMAN_BLOCK},
    ],
)

print(f"\n‚úÖ Cloud agent created: {agent_cloud.id}")
print(f"   Agent type: {agent_cloud.agent_type}")
print(f"   Model: {agent_cloud.llm_config.model if hasattr(agent_cloud, 'llm_config') else 'N/A'}")
print(f"   Tool rules: {agent_cloud.tool_rules}")
print(f"   Tools: {[t.name if hasattr(t, 'name') else t for t in agent_cloud.tools]}")

# Send Hangman prompt
print("\nüì§ Sending Hangman prompt...\n")
response_cloud = client_cloud.agents.messages.create(
    agent_id=agent_cloud.id,
    messages=[{"role": "user", "content": TEST_PROMPT}]
)

# Analyze response
tool_calls_cloud = []
final_response_cloud = ""

print("=== Response Flow ===")
for msg in response_cloud.messages:
    print(f"  [{msg.message_type}]", end="")
    if msg.message_type == "tool_call_message":
        print(f" -> {msg.tool_call.name}")
        tool_calls_cloud.append(msg.tool_call.name)
    elif msg.message_type == "assistant_message":
        print(f" -> Response sent")
        final_response_cloud = msg.content
    else:
        print()

print(f"\nüìä Summary:")
print(f"   Tool calls: {tool_calls_cloud if tool_calls_cloud else 'NONE'}")
print(f"   Behavior: {'PROACTIVE' if tool_calls_cloud else 'DISCRETIONARY ‚úÖ'}")
print(f"   Final response: {final_response_cloud[:100]}...") if len(final_response_cloud) > 100 else print(f"   Final response: {final_response_cloud}")


TEST 1A: CLOUD - letta_v1_agent (baseline)
httpx - INFO - HTTP Request: POST https://api.letta.com/v1/agents/ "HTTP/1.1 201 Created"

‚úÖ Cloud agent created: agent-c2888c3a-ea36-4af2-9167-341d3c4cfe4c
   Agent type: letta_v1_agent
   Model: gpt-4o
   Tool rules: []
   Tools: ['conversation_search', 'memory_insert', 'memory_replace']

üì§ Sending Hangman prompt...

httpx - INFO - HTTP Request: POST https://api.letta.com/v1/agents/agent-c2888c3a-ea36-4af2-9167-341d3c4cfe4c/messages "HTTP/1.1 200 OK"
=== Response Flow ===
  [assistant_message] -> Response sent

üìä Summary:
   Tool calls: NONE
   Behavior: DISCRETIONARY ‚úÖ
   Final response: Great! Let's play Hangman. I've thought of a secret word for you. Here it is represented as undersco...


In [23]:
print("=" * 80)
print("TEST 1B: LOCAL - letta_v1_agent (baseline)")
print("=" * 80)

# Create agent on Local with gpt-oss-20b
agent_local = client_local.agents.create(
    name="test_local_v1",
    agent_type="letta_v1_agent",
    llm_config=LLM_CONFIG_LOCAL,
    embedding_config=EMBEDDING_CONFIG_LOCAL,
    memory_blocks=[
        {"label": "persona", "value": PERSONA_BLOCK},
        {"label": "human", "value": HUMAN_BLOCK},
    ],
)

print(f"\n‚úÖ Local agent created: {agent_local.id}")
print(f"   Agent type: {agent_local.agent_type}")
print(f"   Tool rules: {agent_local.tool_rules}")
print(f"   Tools: {[t.name if hasattr(t, 'name') else t for t in agent_local.tools]}")

# Send Hangman prompt
print("\nüì§ Sending Hangman prompt...\n")
response_local = client_local.agents.messages.create(
    agent_id=agent_local.id,
    messages=[{"role": "user", "content": TEST_PROMPT}]
)

# Analyze response
tool_calls_local = []
final_response_local = ""

print("=== Response Flow ===")
for msg in response_local.messages:
    print(f"  [{msg.message_type}]", end="")
    if msg.message_type == "tool_call_message":
        print(f" -> {msg.tool_call.name}")
        tool_calls_local.append(msg.tool_call.name)
    elif msg.message_type == "assistant_message":
        print(f" -> Response sent")
        final_response_local = msg.content
    else:
        print()

print(f"\nüìä Summary:")
print(f"   Tool calls: {tool_calls_local if tool_calls_local else 'NONE'}")
print(f"   Behavior: {'PROACTIVE ‚ö†Ô∏è' if tool_calls_local else 'DISCRETIONARY'}")
print(f"   Final response: {final_response_local[:100]}...") if len(final_response_local) > 100 else print(f"   Final response: {final_response_local}")


TEST 1B: LOCAL - letta_v1_agent (baseline)
httpx - INFO - HTTP Request: POST http://localhost:8283/v1/agents/ "HTTP/1.1 200 OK"



‚úÖ Local agent created: agent-9c100d1b-c95c-4f36-8122-771aa554b461
   Agent type: letta_v1_agent
   Tool rules: []
   Tools: ['conversation_search', 'memory_insert', 'memory_replace']

üì§ Sending Hangman prompt...

httpx - INFO - HTTP Request: POST http://localhost:8283/v1/agents/agent-9c100d1b-c95c-4f36-8122-771aa554b461/messages "HTTP/1.1 200 OK"
=== Response Flow ===
  [tool_call_message] -> memory_insert
  [tool_return_message]
  [tool_call_message] -> memory_insert
  [tool_return_message]
  [tool_call_message] -> memory_insert
  [tool_return_message]
  [assistant_message] -> Response sent

üìä Summary:
   Tool calls: ['memory_insert', 'memory_insert', 'memory_insert']
   Behavior: PROACTIVE ‚ö†Ô∏è
   Final response: Here we go!

```
_ _ _ _ _ _
```

Good luck! üéâ  (You have 6 lives remaining.)


In [24]:
print("=" * 80)
print("TEST 1C: COMPARISON")
print("=" * 80)

print(f"\n{'Aspect':<30} {'Cloud':<30} {'Local':<30}")
print("-" * 90)
print(f"{'Tool calls on turn 1':<30} {str(tool_calls_cloud or 'NONE'):<30} {str(tool_calls_local or 'NONE'):<30}")
print(f"{'Behavior':<30} {'DISCRETIONARY' if not tool_calls_cloud else 'PROACTIVE':<30} {'DISCRETIONARY' if not tool_calls_local else 'PROACTIVE':<30}")
print(f"{'Tool rules count':<30} {len(agent_cloud.tool_rules):<30} {len(agent_local.tool_rules):<30}")
print(f"{'Tools available':<30} {len(agent_cloud.tools):<30} {len(agent_local.tools):<30}")

print("\n‚ö†Ô∏è ISSUE CONFIRMED" if (not tool_calls_cloud and tool_calls_local) else "\n‚úÖ Both behave the same")


TEST 1C: COMPARISON

Aspect                         Cloud                          Local                         
------------------------------------------------------------------------------------------
Tool calls on turn 1           NONE                           ['memory_insert', 'memory_insert', 'memory_insert']
Behavior                       DISCRETIONARY                  PROACTIVE                     
Tool rules count               0                              0                             
Tools available                3                              3                             

‚ö†Ô∏è ISSUE CONFIRMED


## Test 2: System Prompt Deep Dive
Compare the complete system prompts character by character


In [25]:
print("=" * 80)
print("TEST 2: SYSTEM PROMPT COMPARISON")
print("=" * 80)

# Retrieve full agents
agent_cloud_full = client_cloud.agents.retrieve(agent_cloud.id)
agent_local_full = client_local.agents.retrieve(agent_local.id)

system_cloud = agent_cloud_full.system if hasattr(agent_cloud_full, 'system') else 'N/A'
system_local = agent_local_full.system if hasattr(agent_local_full, 'system') else 'N/A'

print(f"\nCloud system prompt length: {len(system_cloud) if system_cloud != 'N/A' else 'N/A'} chars")
print(f"Local system prompt length: {len(system_local) if system_local != 'N/A' else 'N/A'} chars")
print(f"Difference: {abs(len(system_cloud) - len(system_local)) if system_cloud != 'N/A' and system_local != 'N/A' else 'N/A'} chars")

if system_cloud != 'N/A' and system_local != 'N/A':
    if system_cloud == system_local:
        print("\n‚úÖ System prompts are IDENTICAL")
    else:
        print("\n‚ö†Ô∏è System prompts DIFFER")
        print("\nFirst 500 chars of each:")
        print("\n--- CLOUD ---")
        print(system_cloud[:500])
        print("\n--- LOCAL ---")
        print(system_local[:500])
else:
    print("\n‚ö†Ô∏è Could not retrieve system prompts")


TEST 2: SYSTEM PROMPT COMPARISON
httpx - INFO - HTTP Request: GET https://api.letta.com/v1/agents/agent-c2888c3a-ea36-4af2-9167-341d3c4cfe4c "HTTP/1.1 200 OK"
httpx - INFO - HTTP Request: GET http://localhost:8283/v1/agents/agent-9c100d1b-c95c-4f36-8122-771aa554b461 "HTTP/1.1 200 OK"

Cloud system prompt length: 1707 chars
Local system prompt length: 1707 chars
Difference: 0 chars

‚úÖ System prompts are IDENTICAL


In [6]:
print("=" * 80)
print("TEST 2B: KEY PHRASE ANALYSIS")
print("=" * 80)

# Check for key phrases that might influence behavior
key_phrases = [
    "must call",
    "required to",
    "should call",
    "always",
    "optional",
    "discretionary",
    "when necessary",
    "only when",
    "memory_insert",
    "send_message",
    "before responding",
    "exit_loop",
    "continue_loop",
]

print(f"\n{'Phrase':<25} {'Cloud':<10} {'Local':<10} {'Status'}")
print("-" * 60)

differences = []
for phrase in key_phrases:
    cloud_has = phrase.lower() in system_cloud.lower() if system_cloud != 'N/A' else False
    local_has = phrase.lower() in system_local.lower() if system_local != 'N/A' else False
    
    status = "‚úÖ Same" if cloud_has == local_has else "‚ö†Ô∏è DIFF"
    if cloud_has != local_has:
        differences.append((phrase, cloud_has, local_has))
    
    print(f"{phrase:<25} {str(cloud_has):<10} {str(local_has):<10} {status}")

if differences:
    print(f"\n‚ö†Ô∏è Found {len(differences)} phrase differences")
else:
    print("\n‚úÖ All key phrases match (or both absent)")


TEST 2B: KEY PHRASE ANALYSIS

Phrase                    Cloud      Local      Status
------------------------------------------------------------
must call                 False      False      ‚úÖ Same
required to               False      False      ‚úÖ Same
should call               False      False      ‚úÖ Same
always                    False      False      ‚úÖ Same
optional                  False      False      ‚úÖ Same
discretionary             False      False      ‚úÖ Same
when necessary            False      False      ‚úÖ Same
only when                 False      False      ‚úÖ Same
memory_insert             False      False      ‚úÖ Same
send_message              False      False      ‚úÖ Same
before responding         False      False      ‚úÖ Same
exit_loop                 False      False      ‚úÖ Same
continue_loop             False      False      ‚úÖ Same

‚úÖ All key phrases match (or both absent)


## Test 3: Tool Rules Hypothesis
Test if adding explicit exit rule fixes local behavior


In [7]:
print("=" * 80)
print("TEST 3: LOCAL with EXPLICIT EXIT RULE (send_message)")
print("=" * 80)

# Try creating local agent with explicit tool rules that include exit
try:
    agent_local_fixed = client_local.agents.create(
        name="test_local_fixed",
        agent_type="letta_v1_agent",
        llm_config=LLM_CONFIG_LOCAL,
        embedding_config=EMBEDDING_CONFIG_LOCAL,
        memory_blocks=[
            {"label": "persona", "value": PERSONA_BLOCK},
            {"label": "human", "value": HUMAN_BLOCK},
        ],
        tool_rules=[
            {"tool_name": "send_message", "type": "exit_loop"},  # Add exit rule
            {"tool_name": "memory_insert", "type": "continue_loop"},
            {"tool_name": "memory_replace", "type": "continue_loop"},
            {"tool_name": "conversation_search", "type": "continue_loop"},
        ]
    )
    
    print(f"\n‚úÖ Agent with exit rule created: {agent_local_fixed.id}")
    print(f"   Tool rules: {agent_local_fixed.tool_rules}")
    
    # Test with same prompt
    print("\nüì§ Sending Hangman prompt...\n")
    response_fixed = client_local.agents.messages.create(
        agent_id=agent_local_fixed.id,
        messages=[{"role": "user", "content": TEST_PROMPT}]
    )
    
    tool_calls_fixed = []
    print("=== Response Flow ===")
    for msg in response_fixed.messages:
        print(f"  [{msg.message_type}]", end="")
        if msg.message_type == "tool_call_message":
            print(f" -> {msg.tool_call.name}")
            tool_calls_fixed.append(msg.tool_call.name)
        elif msg.message_type == "assistant_message":
            print(f" -> Response sent")
        else:
            print()
    
    print(f"\nüìä Summary:")
    print(f"   Tool calls: {tool_calls_fixed if tool_calls_fixed else 'NONE'}")
    print(f"   Behavior: {'DISCRETIONARY ‚úÖ' if not tool_calls_fixed else 'PROACTIVE (still!)'}")
    
    if not tool_calls_fixed:
        print("\nüéâ SUCCESS! Exit rule fixed the issue!")
    else:
        print("\n‚ö†Ô∏è Exit rule didn't fix it - issue is deeper")

except Exception as e:
    print(f"\n‚ùå Failed to create agent with custom tool rules: {e}")
    print("   This might mean the API doesn't support custom tool_rules parameter")


TEST 3: LOCAL with EXPLICIT EXIT RULE (send_message)

‚úÖ Agent with exit rule created: agent-3828f499-f86e-407a-b983-83df0b3fc109
   Tool rules: [TerminalToolRule(tool_name='send_message', type='exit_loop', prompt_template=None), ContinueToolRule(tool_name='memory_insert', type='continue_loop', prompt_template=None), ContinueToolRule(tool_name='memory_replace', type='continue_loop', prompt_template=None), ContinueToolRule(tool_name='conversation_search', type='continue_loop', prompt_template=None), ContinueToolRule(tool_name='memory_replace', type='continue_loop', prompt_template=None), ContinueToolRule(tool_name='conversation_search', type='continue_loop', prompt_template=None), ContinueToolRule(tool_name='memory_insert', type='continue_loop', prompt_template=None)]

üì§ Sending Hangman prompt...


‚ùå Failed to create agent with custom tool rules: headers: {'date': 'Tue, 23 Dec 2025 11:56:04 GMT', 'server': 'uvicorn', 'content-length': '257', 'content-type': 'application/json'}, st

## Test 4: Tool Rules Deep Inspection
Examine the exact tool rules structure


In [8]:
print("=" * 80)
print("TEST 4: DETAILED TOOL RULES COMPARISON")
print("=" * 80)

print("\n=== CLOUD Agent Tool Rules ===")
for i, rule in enumerate(agent_cloud_full.tool_rules, 1):
    print(f"{i}. Tool: {rule.tool_name}")
    print(f"   Type: {rule.type}")
    print(f"   Prompt template: {rule.prompt_template if hasattr(rule, 'prompt_template') else 'N/A'}")

print("\n=== LOCAL Agent Tool Rules ===")
for i, rule in enumerate(agent_local_full.tool_rules, 1):
    print(f"{i}. Tool: {rule.tool_name}")
    print(f"   Type: {rule.type}")
    print(f"   Prompt template: {rule.prompt_template if hasattr(rule, 'prompt_template') else 'N/A'}")

# Check for exit_loop rules
cloud_exit_rules = [r for r in agent_cloud_full.tool_rules if r.type == 'exit_loop']
local_exit_rules = [r for r in agent_local_full.tool_rules if r.type == 'exit_loop']

print("\n=== EXIT RULES ANALYSIS ===")
print(f"Cloud exit rules: {[r.tool_name for r in cloud_exit_rules] if cloud_exit_rules else 'NONE'}")
print(f"Local exit rules: {[r.tool_name for r in local_exit_rules] if local_exit_rules else 'NONE'}")

if cloud_exit_rules and not local_exit_rules:
    print("\nüéØ KEY FINDING: Cloud has exit rules, Local doesn't!")
    print("   This is likely the root cause of the behavior difference")
elif not cloud_exit_rules and not local_exit_rules:
    print("\n‚ö†Ô∏è Neither has exit rules - issue must be elsewhere")
else:
    print("\n‚úÖ Both have exit rules (or both don't)")


TEST 4: DETAILED TOOL RULES COMPARISON

=== CLOUD Agent Tool Rules ===

=== LOCAL Agent Tool Rules ===
1. Tool: memory_replace
   Type: continue_loop
   Prompt template: None
2. Tool: conversation_search
   Type: continue_loop
   Prompt template: None
3. Tool: memory_insert
   Type: continue_loop
   Prompt template: None

=== EXIT RULES ANALYSIS ===
Cloud exit rules: NONE
Local exit rules: NONE

‚ö†Ô∏è Neither has exit rules - issue must be elsewhere


## Test 5: Simple Non-Memory Task
Test with a simple prompt that clearly doesn't need memory


In [9]:
print("=" * 80)
print("TEST 5: SIMPLE PROMPT TEST (no memory needed)")
print("=" * 80)

SIMPLE_PROMPT = "What is 2 + 2?"

# Test Cloud
print("\n--- CLOUD ---")
response_cloud_simple = client_cloud.agents.messages.create(
    agent_id=agent_cloud.id,
    messages=[{"role": "user", "content": SIMPLE_PROMPT}]
)

tool_calls_cloud_simple = [msg.tool_call.name for msg in response_cloud_simple.messages 
                           if msg.message_type == "tool_call_message"]
print(f"Tool calls: {tool_calls_cloud_simple if tool_calls_cloud_simple else 'NONE'}")

# Test Local
print("\n--- LOCAL ---")
response_local_simple = client_local.agents.messages.create(
    agent_id=agent_local.id,
    messages=[{"role": "user", "content": SIMPLE_PROMPT}]
)

tool_calls_local_simple = [msg.tool_call.name for msg in response_local_simple.messages 
                           if msg.message_type == "tool_call_message"]
print(f"Tool calls: {tool_calls_local_simple if tool_calls_local_simple else 'NONE'}")

print("\nüéØ Analysis:")
if not tool_calls_cloud_simple and tool_calls_local_simple:
    print("   Cloud is discretionary even for simple prompts ‚úÖ")
    print("   Local still calls tools unnecessarily ‚ö†Ô∏è")
elif tool_calls_cloud_simple and tool_calls_local_simple:
    print("   Both call tools even for simple prompts")
    print("   ‚Üí Both are overly proactive")
else:
    print("   Both behave appropriately for simple prompts ‚úÖ")


TEST 5: SIMPLE PROMPT TEST (no memory needed)

--- CLOUD ---
Tool calls: NONE

--- LOCAL ---
Tool calls: NONE

üéØ Analysis:
   Both behave appropriately for simple prompts ‚úÖ


## FINAL SUMMARY & DIAGNOSIS


In [10]:
print("=" * 80)
print("DIAGNOSTIC SUMMARY")
print("=" * 80)

print("\nüìã Test Results:")
print(f"\n  Test 1 - Baseline Behavior:")
print(f"    Cloud (gpt-4o):     {'DISCRETIONARY ‚úÖ' if not tool_calls_cloud else 'PROACTIVE ‚ö†Ô∏è'}")
print(f"    Local (gpt-oss):    {'DISCRETIONARY ‚úÖ' if not tool_calls_local else 'PROACTIVE ‚ö†Ô∏è'}")

print(f"\n  Test 2 - System Prompts:")
if system_cloud != 'N/A' and system_local != 'N/A':
    print(f"    Match: {'YES ‚úÖ' if system_cloud == system_local else 'NO ‚ö†Ô∏è'}")
    print(f"    Length diff: {abs(len(system_cloud) - len(system_local))} chars")
else:
    print(f"    Unable to compare")

print(f"\n  Test 3 - Exit Rule Fix:")
try:
    if 'agent_local_fixed' in locals():
        print(f"    Attempted: YES")
        print(f"    Fixed behavior: {'YES ‚úÖ' if not tool_calls_fixed else 'NO ‚ö†Ô∏è'}")
    else:
        print(f"    Not tested or failed to create")
except:
    print(f"    Not tested or failed to create")

print(f"\n  Test 4 - Tool Rules:")
print(f"    Cloud exit rules: {len(cloud_exit_rules)}")
print(f"    Local exit rules: {len(local_exit_rules)}")

print("\n" + "=" * 80)
print("üéØ ROOT CAUSE HYPOTHESIS:")
print("=" * 80)

if len(cloud_exit_rules) > len(local_exit_rules):
    print("\n‚úÖ CONFIRMED: Missing exit_loop rule on local server")
    print("   - Cloud has exit rules that allow skipping tools")
    print("   - Local forces tool usage by lacking exit path")
    print("\nüí° SOLUTION:")
    print("   Option A: Add custom tool_rules with exit_loop to agent creation")
    print("   Option B: Upgrade local Letta server to match Cloud behavior")
    print("   Option C: Modify system prompt to emphasize memory is optional")
elif system_cloud != system_local and system_cloud != 'N/A' and system_local != 'N/A':
    print("\n‚úÖ CONFIRMED: Different system prompts")
    print("   - Cloud and Local use different prompts for letta_v1_agent")
    print("   - Prompt differences lead to behavior differences")
    print("\nüí° SOLUTION:")
    print("   Extract Cloud's exact system prompt and inject into local agents")
else:
    print("\n‚ö†Ô∏è INCONCLUSIVE: Multiple factors may be at play")
    print("   - Model differences (gpt-4o vs gpt-oss-20b)")
    print("   - Server implementation differences")
    print("   - Combination of the above")
    print("\nüí° NEXT STEPS:")
    print("   1. Export/compare full message histories")
    print("   2. Check Letta server version differences")
    print("   3. Test with modified system prompts")

print("\n" + "=" * 80)


DIAGNOSTIC SUMMARY

üìã Test Results:

  Test 1 - Baseline Behavior:
    Cloud (gpt-4o):     DISCRETIONARY ‚úÖ
    Local (gpt-oss):    PROACTIVE ‚ö†Ô∏è

  Test 2 - System Prompts:
    Match: YES ‚úÖ
    Length diff: 0 chars

  Test 3 - Exit Rule Fix:
    Attempted: YES
    Not tested or failed to create

  Test 4 - Tool Rules:
    Cloud exit rules: 0
    Local exit rules: 0

üéØ ROOT CAUSE HYPOTHESIS:

‚ö†Ô∏è INCONCLUSIVE: Multiple factors may be at play
   - Model differences (gpt-4o vs gpt-oss-20b)
   - Server implementation differences
   - Combination of the above

üí° NEXT STEPS:
   1. Export/compare full message histories
   2. Check Letta server version differences
   3. Test with modified system prompts



## Cleanup


In [11]:
print("üóëÔ∏è Cleaning up test agents...\n")

agents_to_delete = [
    (client_cloud, agent_cloud, "Cloud baseline"),
    (client_local, agent_local, "Local baseline"),
]

# Add optional agents if they exist
if 'agent_local_fixed' in locals():
    agents_to_delete.append((client_local, agent_local_fixed, "Local fixed"))

for client, agent, name in agents_to_delete:
    try:
        client.agents.delete(agent.id)
        print(f"‚úÖ Deleted {name}: {agent.id[:16]}...")
    except Exception as e:
        print(f"‚ö†Ô∏è Failed to delete {name}: {e}")

print("\n‚úÖ Cleanup complete")


üóëÔ∏è Cleaning up test agents...

‚úÖ Deleted Cloud baseline: agent-50f90394-a...
‚úÖ Deleted Local baseline: agent-64273794-3...
‚úÖ Deleted Local fixed: agent-3828f499-f...

‚úÖ Cleanup complete


## Test 6: Version Analysis
Check Letta versions and test empty tool_rules fix


In [12]:
print("=" * 80)
print("TEST 6A: VERSION ANALYSIS")
print("=" * 80)

# Check versions
import letta
print(f"\nLocal Letta client version: {letta.__version__}")
print(f"\nNOTE: Letta Cloud is likely running v0.16.x (latest)")
print(f"      Local is running v{letta.__version__}")

# Key finding explanation
print("\nüîç KEY FINDING from Test 1:")
print("   Cloud tool_rules: []  (EMPTY - full discretion)")
print("   Local tool_rules: [continue_loop, continue_loop, continue_loop]")
print("\n   This difference causes the behavior divergence!")
print("   - Empty rules = agent chooses when to use tools")
print("   - continue_loop rules = agent nudged to use tools for complex tasks")


TEST 6A: VERSION ANALYSIS

Local Letta client version: 0.12.1

NOTE: Letta Cloud is likely running v0.16.x (latest)
      Local is running v0.12.1

üîç KEY FINDING from Test 1:
   Cloud tool_rules: []  (EMPTY - full discretion)
   Local tool_rules: [continue_loop, continue_loop, continue_loop]

   This difference causes the behavior divergence!
   - Empty rules = agent chooses when to use tools
   - continue_loop rules = agent nudged to use tools for complex tasks


In [13]:
print("=" * 80)
print("TEST 6B: FIX TEST - Local with Explicit Empty Tool Rules")
print("=" * 80)

# Try creating agent with EXPLICIT empty tool rules
try:
    agent_local_empty = client_local.agents.create(
        name="test_local_empty_rules",
        agent_type="letta_v1_agent",
        llm_config=LLM_CONFIG_LOCAL,
        embedding_config=EMBEDDING_CONFIG_LOCAL,
        memory_blocks=[
            {"label": "persona", "value": PERSONA_BLOCK},
            {"label": "human", "value": HUMAN_BLOCK},
        ],
        tool_rules=[],  # EXPLICITLY EMPTY - like Cloud
    )
    
    print(f"\n‚úÖ Agent with empty tool_rules created: {agent_local_empty.id}")
    print(f"   Tool rules: {agent_local_empty.tool_rules}")
    print(f"   Tools: {[t.name if hasattr(t, 'name') else t for t in agent_local_empty.tools]}")
    
    # Test with Hangman prompt
    print("\nüì§ Sending Hangman prompt...\n")
    response_empty = client_local.agents.messages.create(
        agent_id=agent_local_empty.id,
        messages=[{"role": "user", "content": TEST_PROMPT}]
    )
    
    tool_calls_empty = []
    print("=== Response Flow ===")
    for msg in response_empty.messages:
        print(f"  [{msg.message_type}]", end="")
        if msg.message_type == "tool_call_message":
            print(f" -> {msg.tool_call.name}")
            tool_calls_empty.append(msg.tool_call.name)
        elif msg.message_type == "assistant_message":
            print(f" -> Response sent")
        else:
            print()
    
    print(f"\nüìä Summary:")
    print(f"   Tool calls: {tool_calls_empty if tool_calls_empty else 'NONE'}")
    print(f"   Behavior: {'DISCRETIONARY ‚úÖ' if not tool_calls_empty else 'PROACTIVE ‚ö†Ô∏è'}")
    
    if not tool_calls_empty:
        print("\nüéâ SUCCESS! Empty tool_rules fixed the issue!")
        print("   Local server now matches Cloud behavior!")
    else:
        print("\n‚ö†Ô∏è Still calling tools - may need to upgrade Letta version")

except Exception as e:
    print(f"\n‚ùå Failed to create agent with empty tool_rules: {e}")
    print("   The API might not support tool_rules=[] in v0.12.1")


TEST 6B: FIX TEST - Local with Explicit Empty Tool Rules
httpx - INFO - HTTP Request: POST http://localhost:8283/v1/agents/ "HTTP/1.1 200 OK"

‚úÖ Agent with empty tool_rules created: agent-0f91a8ef-a704-4ad7-a111-f3c4d4ce1dbd
   Tool rules: [ContinueToolRule(tool_name='memory_replace', type='continue_loop', prompt_template=None), ContinueToolRule(tool_name='conversation_search', type='continue_loop', prompt_template=None), ContinueToolRule(tool_name='memory_insert', type='continue_loop', prompt_template=None)]
   Tools: ['memory_replace', 'memory_insert', 'conversation_search']

üì§ Sending Hangman prompt...

httpx - INFO - HTTP Request: POST http://localhost:8283/v1/agents/agent-0f91a8ef-a704-4ad7-a111-f3c4d4ce1dbd/messages "HTTP/1.1 200 OK"
=== Response Flow ===
  [tool_call_message] -> memory_insert
  [tool_return_message]
  [tool_call_message] -> memory_replace
  [tool_return_message]
  [tool_call_message] -> memory_replace
  [tool_return_message]
  [tool_call_message] -> convers

## Test 7: Comparison Table
Final side-by-side comparison of all configurations tested


In [14]:
print("=" * 80)
print("TEST 7: COMPREHENSIVE COMPARISON TABLE")
print("=" * 80)

print(f"\n{'Configuration':<30} {'Tool Rules':<35} {'Behavior':<20}")
print("-" * 85)

# Cloud baseline
print(f"{'Cloud (gpt-4o)':<30} {'[] (empty)':<35} {'DISCRETIONARY ‚úÖ':<20}")

# Local baseline
print(f"{'Local baseline (gpt-oss)':<30} {'[continue_loop √ó 3]':<35} {'PROACTIVE ‚ö†Ô∏è':<20}")

# Local with empty rules (if tested)
if 'agent_local_empty' in locals():
    behavior_empty = 'DISCRETIONARY ‚úÖ' if not tool_calls_empty else 'PROACTIVE ‚ö†Ô∏è'
    print(f"{'Local + empty rules':<30} {'[] (empty - explicit)':<35} {behavior_empty:<20}")
else:
    print(f"{'Local + empty rules':<30} {'Not tested':<35} {'N/A':<20}")

print("\n" + "=" * 85)
print("üìä CONCLUSION:")
print("=" * 85)

if 'agent_local_empty' in locals() and not tool_calls_empty:
    print("\n‚úÖ SOLUTION CONFIRMED: tool_rules=[] enables discretionary behavior")
    print("\n   To fix your experiments:")
    print("   1. Modify letta_agent.py line 118 to add: tool_rules=[]")
    print("   2. This will make local behavior match Cloud (discretionary)")
    print("   3. No need to upgrade Letta (though v0.16.1 is available)")
elif 'agent_local_empty' in locals() and tool_calls_empty:
    print("\n‚ö†Ô∏è Empty tool_rules didn't fix it - version upgrade likely needed")
    print("\n   Recommended action:")
    print("   1. Upgrade local Letta: pip install --upgrade letta")
    print("   2. Restart Letta server with new version")
    print("   3. Retest with tool_rules=[]")
else:
    print("\n‚ö†Ô∏è Could not test empty tool_rules fix")
    print("\n   Next steps:")
    print("   1. Check if API supports tool_rules parameter")
    print("   2. Consider upgrading Letta to v0.16.1")

print("\n" + "=" * 85)


TEST 7: COMPREHENSIVE COMPARISON TABLE

Configuration                  Tool Rules                          Behavior            
-------------------------------------------------------------------------------------
Cloud (gpt-4o)                 [] (empty)                          DISCRETIONARY ‚úÖ     
Local baseline (gpt-oss)       [continue_loop √ó 3]                 PROACTIVE ‚ö†Ô∏è        
Local + empty rules            [] (empty - explicit)               PROACTIVE ‚ö†Ô∏è        

üìä CONCLUSION:

‚ö†Ô∏è Empty tool_rules didn't fix it - version upgrade likely needed

   Recommended action:
   1. Upgrade local Letta: pip install --upgrade letta
   2. Restart Letta server with new version
   3. Retest with tool_rules=[]



## FINAL RECOMMENDATIONS


In [16]:
print("=" * 80)
print("FINAL RECOMMENDATIONS")
print("=" * 80)

import letta
local_version = letta.__version__

print(f"\nüìå IDENTIFIED ROOT CAUSE:")
print(f"   Local Letta v{local_version} defaults letta_v1_agent to tool_rules=[continue_loop √ó 3]")
print(f"   Cloud Letta (likely v0.16.x) defaults letta_v1_agent to tool_rules=[]")
print(f"   The continue_loop rules nudge the model to use tools for complex tasks")

print(f"\nüí° SOLUTION OPTIONS:")
print(f"\n   Option A: Quick Fix (No Upgrade)")
print(f"   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ")
print(f"   Modify src/hangman/agents/letta_agent.py:")
print(f"   Line 118-137, add tool_rules=[] parameter:")
print(f"")
print(f"   self.letta_agent = self.letta_client.agents.create(")
print(f"       name=f'agent_{{self.session_id}}',")
print(f"       agent_type='letta_v1_agent',")
print(f"       llm_config={{...}},")
print(f"       embedding_config={{...}},")
print(f"       memory_blocks=[...],")
print(f"       tool_rules=[],  # ADD THIS LINE - enables discretionary behavior")
print(f"   )")
print(f"")
print(f"   Also update line 353 in the reset() method with the same change.")

print(f"\n   Option B: Upgrade Letta (Recommended for Long-Term)")
print(f"   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ")
print(f"   1. Backup your data: cp -r ~/.letta ~/.letta.backup")
print(f"   2. Upgrade: pip install --upgrade letta")
print(f"   3. Restart server: letta server --host 127.0.0.1 --port 8283")
print(f"   4. Test: newer versions should default to empty tool_rules")
print(f"")
print(f"   Current: v{local_version}")
print(f"   Latest:  v0.16.1")
print(f"   Gap:     {0.16 - float(local_version[2:])} major versions behind")

print(f"\n   Option C: Hybrid Approach")
print(f"   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ")
print(f"   1. Apply Option A immediately (quick fix)")
print(f"   2. Plan Option B upgrade when convenient")
print(f"   3. Test thoroughly after upgrade")

print(f"\nüéØ RECOMMENDED: Start with Option A")
print(f"   - Zero risk, no dependencies to break")
print(f"   - Immediate fix for your experiments")
print(f"   - Can upgrade later if needed")

print("\n" + "=" * 80)


FINAL RECOMMENDATIONS

üìå IDENTIFIED ROOT CAUSE:
   Local Letta v0.12.1 defaults letta_v1_agent to tool_rules=[continue_loop √ó 3]
   Cloud Letta (likely v0.16.x) defaults letta_v1_agent to tool_rules=[]
   The continue_loop rules nudge the model to use tools for complex tasks

üí° SOLUTION OPTIONS:

   Option A: Quick Fix (No Upgrade)
   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
   Modify src/hangman/agents/letta_agent.py:
   Line 118-137, add tool_rules=[] parameter:

   self.letta_agent = self.letta_client.agents.create(
       name=f'agent_{self.session_id}',
       agent_type='letta_v1_agent',
       llm_config={...},
       embedding_config={...},
       memory_blocks=[...],
       tool_rules=[],  # ADD THIS LINE - enables discretionary behavior
   )

   Also update line 353 in the reset() method with the same change.

   Option B: Upgrade Letta (Recommended for Long-Term)
   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î

## üéâ TEST 8: Validation with Letta v0.16.1 (THE FIX!)

After upgrading the local server to Letta v0.16.1 with PostgreSQL, test that `tool_rules=[]` is now respected.


In [18]:
print("=" * 80)
print("TEST 8: Letta v0.16.1 - Validation of tool_rules=[] Fix")
print("=" * 80)

# Connect to new v0.16.1 server
client_v16 = Letta(base_url="http://localhost:8283", timeout=1000)

# Check server version
try:
    # Create agent with EMPTY tool_rules
    agent_v16 = client_v16.agents.create(
        name="test_v16_discretionary",
        memory_blocks=[
            {"label": "persona", "value": PERSONA_BLOCK},
            {"label": "human", "value": HUMAN_BLOCK},
        ],
        tool_rules=[],  # THE KEY FIX - should now be respected!
        model="openai/gpt-4o-mini",
        embedding="openai/text-embedding-3-small"
    )
    
    print(f"\n‚úÖ Agent created on Letta v0.16.1: {agent_v16.id}")
    print(f"   Tool rules: {agent_v16.tool_rules}")
    print(f"   Tools: {[t.name if hasattr(t, 'name') else str(t) for t in agent_v16.tools]}")
    
    # Check if tool_rules is actually empty
    if not agent_v16.tool_rules or len(agent_v16.tool_rules) == 0:
        print("\nüéâ SUCCESS! tool_rules=[] is now RESPECTED!")
        print("   ‚Üí Local server behavior now matches Cloud!")
    else:
        print("\n‚ö†Ô∏è tool_rules still populated - unexpected behavior")
    
    # Send test message to verify discretionary behavior
    print("\nüì§ Sending Hangman prompt to verify behavior...\n")
    response_v16 = client_v16.agents.messages.create(
        agent_id=agent_v16.id,
        messages=[{"role": "user", "content": TEST_PROMPT}]
    )
    
    tool_calls_v16 = []
    final_response_v16 = ""
    
    print("=== Response Flow ===")
    for msg in response_v16.messages:
        print(f"  [{msg.message_type}]", end="")
        if msg.message_type == "tool_call_message":
            tool_name = msg.tool_call.name if hasattr(msg, 'tool_call') else "unknown"
            print(f" -> {tool_name}")
            tool_calls_v16.append(tool_name)
        elif msg.message_type == "assistant_message":
            print(f" -> Response sent")
            final_response_v16 = msg.content if hasattr(msg, 'content') else ""
        else:
            print()
    
    print(f"\nüìä Summary:")
    print(f"   Tool calls on turn 1: {tool_calls_v16 if tool_calls_v16 else 'NONE'}")
    behavior_v16 = "DISCRETIONARY ‚úÖ" if not tool_calls_v16 else "PROACTIVE"
    print(f"   Behavior: {behavior_v16}")
    
    # Cleanup
    client_v16.agents.delete(agent_v16.id)
    print(f"\n‚úÖ Agent deleted")

except Exception as e:
    print(f"\n‚ùå Error: {e}")
    print("   Make sure the Letta v0.16.1 server is running:")
    print("   ./start_letta_server.sh")


TEST 8: Letta v0.16.1 - Validation of tool_rules=[] Fix
httpx - INFO - HTTP Request: POST http://localhost:8283/v1/agents/ "HTTP/1.1 200 OK"

‚úÖ Agent created on Letta v0.16.1: agent-21ae51ce-bfd4-4d90-be5f-154a85e4def6
   Tool rules: []
   Tools: ['conversation_search', 'memory_insert', 'memory_replace']

üéâ SUCCESS! tool_rules=[] is now RESPECTED!
   ‚Üí Local server behavior now matches Cloud!

üì§ Sending Hangman prompt to verify behavior...

httpx - INFO - HTTP Request: POST http://localhost:8283/v1/agents/agent-21ae51ce-bfd4-4d90-be5f-154a85e4def6/messages "HTTP/1.1 200 OK"
=== Response Flow ===
  [assistant_message] -> Response sent

üìä Summary:
   Tool calls on turn 1: NONE
   Behavior: DISCRETIONARY ‚úÖ
httpx - INFO - HTTP Request: DELETE http://localhost:8283/v1/agents/agent-21ae51ce-bfd4-4d90-be5f-154a85e4def6 "HTTP/1.1 200 OK"

‚úÖ Agent deleted


In [19]:
print("=" * 80)
print("FINAL COMPARISON: Cloud vs Local v0.12.1 vs Local v0.16.1")
print("=" * 80)

print(f"\n{'Server':<25} {'Version':<12} {'tool_rules=[]':<20} {'Behavior':<20}")
print("-" * 77)
print(f"{'Letta Cloud':<25} {'latest':<12} {'‚úÖ Respected':<20} {'DISCRETIONARY':<20}")
print(f"{'Local (old)':<25} {'v0.12.1':<12} {'‚ùå Ignored':<20} {'PROACTIVE':<20}")
print(f"{'Local (new)':<25} {'v0.16.1':<12} {'‚úÖ Respected':<20} {'DISCRETIONARY':<20}")

print("\n" + "=" * 77)
print("‚úÖ SOLUTION VALIDATED")
print("=" * 77)
print("""
The fix is confirmed:

1. Upgrade local Letta server to v0.16.1 (with PostgreSQL + pgvector)
2. Pass tool_rules=[] when creating agents
3. Local behavior now matches Letta Cloud (discretionary memory)

To apply this fix to your experiments:
- Modify src/hangman/agents/letta_agent.py
- Add tool_rules=[] to the agents.create() call
""")


FINAL COMPARISON: Cloud vs Local v0.12.1 vs Local v0.16.1

Server                    Version      tool_rules=[]        Behavior            
-----------------------------------------------------------------------------
Letta Cloud               latest       ‚úÖ Respected          DISCRETIONARY       
Local (old)               v0.12.1      ‚ùå Ignored            PROACTIVE           
Local (new)               v0.16.1      ‚úÖ Respected          DISCRETIONARY       

‚úÖ SOLUTION VALIDATED

The fix is confirmed:

1. Upgrade local Letta server to v0.16.1 (with PostgreSQL + pgvector)
2. Pass tool_rules=[] when creating agents
3. Local behavior now matches Letta Cloud (discretionary memory)

To apply this fix to your experiments:
- Modify src/hangman/agents/letta_agent.py
- Add tool_rules=[] to the agents.create() call



## TEST 9: Direct Cloud vs Local v0.16.1 Comparison

Side-by-side test with identical configuration to confirm matching behavior.


In [20]:
print("=" * 80)
print("TEST 9: Cloud vs Local v0.16.1 - Direct Comparison")
print("=" * 80)

# Use same model on both for fair comparison
MODEL = "openai/gpt-4o-mini"
EMBEDDING = "openai/text-embedding-3-small"

results = {}

# --- CLOUD TEST ---
print("\n" + "-" * 40)
print("CLOUD (Letta Cloud API)")
print("-" * 40)

agent_cloud_test = client_cloud.agents.create(
    name="test_cloud_comparison",
    memory_blocks=[
        {"label": "persona", "value": PERSONA_BLOCK},
        {"label": "human", "value": HUMAN_BLOCK},
    ],
    tool_rules=[],
    model=MODEL,
    embedding=EMBEDDING
)

print(f"Agent: {agent_cloud_test.id}")
print(f"Tool rules: {agent_cloud_test.tool_rules}")

response_cloud_test = client_cloud.agents.messages.create(
    agent_id=agent_cloud_test.id,
    messages=[{"role": "user", "content": TEST_PROMPT}]
)

cloud_tools = []
cloud_response = ""
for msg in response_cloud_test.messages:
    if msg.message_type == "tool_call_message":
        cloud_tools.append(msg.tool_call.name)
    elif msg.message_type == "assistant_message":
        cloud_response = msg.content[:200] if hasattr(msg, 'content') else ""

results['cloud'] = {
    'tool_calls': cloud_tools,
    'response_preview': cloud_response
}
print(f"Tool calls: {cloud_tools if cloud_tools else 'NONE'}")
print(f"Response: {cloud_response[:100]}...")

# Cleanup
client_cloud.agents.delete(agent_cloud_test.id)

# --- LOCAL v0.16.1 TEST ---
print("\n" + "-" * 40)
print("LOCAL (Letta v0.16.1)")
print("-" * 40)

client_local_v16 = Letta(base_url="http://localhost:8283", timeout=1000)

agent_local_test = client_local_v16.agents.create(
    name="test_local_comparison",
    memory_blocks=[
        {"label": "persona", "value": PERSONA_BLOCK},
        {"label": "human", "value": HUMAN_BLOCK},
    ],
    tool_rules=[],
    model=MODEL,
    embedding=EMBEDDING
)

print(f"Agent: {agent_local_test.id}")
print(f"Tool rules: {agent_local_test.tool_rules}")

response_local_test = client_local_v16.agents.messages.create(
    agent_id=agent_local_test.id,
    messages=[{"role": "user", "content": TEST_PROMPT}]
)

local_tools = []
local_response = ""
for msg in response_local_test.messages:
    if msg.message_type == "tool_call_message":
        local_tools.append(msg.tool_call.name)
    elif msg.message_type == "assistant_message":
        local_response = msg.content[:200] if hasattr(msg, 'content') else ""

results['local'] = {
    'tool_calls': local_tools,
    'response_preview': local_response
}
print(f"Tool calls: {local_tools if local_tools else 'NONE'}")
print(f"Response: {local_response[:100]}...")

# Cleanup
client_local_v16.agents.delete(agent_local_test.id)

# --- COMPARISON ---
print("\n" + "=" * 80)
print("COMPARISON RESULTS")
print("=" * 80)

print(f"\n{'Metric':<25} {'Cloud':<25} {'Local v0.16.1':<25}")
print("-" * 75)
print(f"{'Tool calls on turn 1':<25} {str(results['cloud']['tool_calls'] or 'NONE'):<25} {str(results['local']['tool_calls'] or 'NONE'):<25}")
print(f"{'Memory tools used':<25} {'NO' if not results['cloud']['tool_calls'] else 'YES':<25} {'NO' if not results['local']['tool_calls'] else 'YES':<25}")

cloud_behavior = "DISCRETIONARY" if not results['cloud']['tool_calls'] else "PROACTIVE"
local_behavior = "DISCRETIONARY" if not results['local']['tool_calls'] else "PROACTIVE"
print(f"{'Behavior':<25} {cloud_behavior:<25} {local_behavior:<25}")

# Final verdict
if cloud_behavior == local_behavior and not results['cloud']['tool_calls'] and not results['local']['tool_calls']:
    print("\n" + "=" * 75)
    print("üéâ BEHAVIORS MATCH! Both Cloud and Local v0.16.1 are DISCRETIONARY")
    print("=" * 75)
    print("""
    ‚úÖ The fix is confirmed working:
    - Both servers respect tool_rules=[]
    - Neither calls memory tools on the first turn of Hangman
    - Local server now mirrors Cloud API behavior exactly
    """)
else:
    print("\n‚ö†Ô∏è Behaviors differ - further investigation needed")


TEST 9: Cloud vs Local v0.16.1 - Direct Comparison

----------------------------------------
CLOUD (Letta Cloud API)
----------------------------------------
httpx - INFO - HTTP Request: POST https://api.letta.com/v1/agents/ "HTTP/1.1 201 Created"
Agent: agent-112c8b73-ffb8-4e31-9945-4222e9160608
Tool rules: []
httpx - INFO - HTTP Request: POST https://api.letta.com/v1/agents/agent-112c8b73-ffb8-4e31-9945-4222e9160608/messages "HTTP/1.1 200 OK"
Tool calls: NONE
Response: Great! Let's play Hangman. I've thought of a secret word, and here it is represented by underscores:...
httpx - INFO - HTTP Request: DELETE https://api.letta.com/v1/agents/agent-112c8b73-ffb8-4e31-9945-4222e9160608 "HTTP/1.1 200 OK"

----------------------------------------
LOCAL (Letta v0.16.1)
----------------------------------------
httpx - INFO - HTTP Request: POST http://localhost:8283/v1/agents/ "HTTP/1.1 200 OK"
Agent: agent-fae90bf4-7f1e-41cc-abc6-b189087cdd29
Tool rules: []
httpx - INFO - HTTP Request: POST ht

## TEST 10: Statistical Comparison (N Trials)

Run multiple trials on both Cloud and Local to get a statistical comparison of memory tool usage on first turn.


In [26]:
import time

print("=" * 80)
print("TEST 10: Statistical Memory Usage Comparison (N Trials)")
print("=" * 80)

# Configuration
N_TRIALS = 10  # Number of trials per server
MODEL = "openai/gpt-4o-mini"
EMBEDDING = "openai/text-embedding-3-small"

# Results storage
cloud_results = {"memory_used": 0, "no_memory": 0, "errors": 0, "tool_calls": []}
local_results = {"memory_used": 0, "no_memory": 0, "errors": 0, "tool_calls": []}

print(f"\nüî¨ Running {N_TRIALS} trials on each server...")
print(f"   Model: {MODEL}")
print(f"   Test: Hangman prompt (first turn behavior)")
print()

# Connect to servers
client_cloud_test = Letta(token=LETTA_API_KEY)
client_local_test = Letta(base_url="http://localhost:8283", timeout=1000)

# --- CLOUD TRIALS ---
print("-" * 40)
print(f"CLOUD TRIALS (0/{N_TRIALS})", end="", flush=True)
print("\r", end="")

for i in range(N_TRIALS):
    try:
        # Create agent
        agent = client_cloud_test.agents.create(
            name=f"test_cloud_trial_{i}_{int(time.time())}",
            memory_blocks=[
                {"label": "persona", "value": PERSONA_BLOCK},
                {"label": "human", "value": HUMAN_BLOCK},
            ],
            tool_rules=[],
            model=MODEL,
            embedding=EMBEDDING
        )
        
        # Send prompt
        response = client_cloud_test.agents.messages.create(
            agent_id=agent.id,
            messages=[{"role": "user", "content": TEST_PROMPT}]
        )
        
        # Check for tool calls
        tools_used = []
        for msg in response.messages:
            if msg.message_type == "tool_call_message":
                tools_used.append(msg.tool_call.name)
        
        if tools_used:
            cloud_results["memory_used"] += 1
            cloud_results["tool_calls"].append(tools_used)
        else:
            cloud_results["no_memory"] += 1
        
        # Cleanup
        client_cloud_test.agents.delete(agent.id)
        
        print(f"\rCLOUD TRIALS ({i+1}/{N_TRIALS}) - {'üîß' if tools_used else '‚úÖ'}", end="", flush=True)
        
    except Exception as e:
        cloud_results["errors"] += 1
        print(f"\rCLOUD TRIALS ({i+1}/{N_TRIALS}) - ‚ùå Error: {str(e)[:50]}", end="", flush=True)
    
    time.sleep(0.5)  # Rate limiting

print()

# --- LOCAL TRIALS ---
print("-" * 40)
print(f"LOCAL TRIALS (0/{N_TRIALS})", end="", flush=True)
print("\r", end="")

for i in range(N_TRIALS):
    try:
        # Create agent
        agent = client_local_test.agents.create(
            name=f"test_local_trial_{i}_{int(time.time())}",
            memory_blocks=[
                {"label": "persona", "value": PERSONA_BLOCK},
                {"label": "human", "value": HUMAN_BLOCK},
            ],
            tool_rules=[],
            model=MODEL,
            embedding=EMBEDDING
        )
        
        # Send prompt
        response = client_local_test.agents.messages.create(
            agent_id=agent.id,
            messages=[{"role": "user", "content": TEST_PROMPT}]
        )
        
        # Check for tool calls
        tools_used = []
        for msg in response.messages:
            if msg.message_type == "tool_call_message":
                tools_used.append(msg.tool_call.name)
        
        if tools_used:
            local_results["memory_used"] += 1
            local_results["tool_calls"].append(tools_used)
        else:
            local_results["no_memory"] += 1
        
        # Cleanup
        client_local_test.agents.delete(agent.id)
        
        print(f"\rLOCAL TRIALS ({i+1}/{N_TRIALS}) - {'üîß' if tools_used else '‚úÖ'}", end="", flush=True)
        
    except Exception as e:
        local_results["errors"] += 1
        print(f"\rLOCAL TRIALS ({i+1}/{N_TRIALS}) - ‚ùå Error: {str(e)[:50]}", end="", flush=True)
    
    time.sleep(0.5)  # Rate limiting

print()

# --- RESULTS ---
print("\n" + "=" * 80)
print("STATISTICAL RESULTS")
print("=" * 80)

cloud_total = cloud_results["memory_used"] + cloud_results["no_memory"]
local_total = local_results["memory_used"] + local_results["no_memory"]

cloud_pct = (cloud_results["memory_used"] / cloud_total * 100) if cloud_total > 0 else 0
local_pct = (local_results["memory_used"] / local_total * 100) if local_total > 0 else 0

print(f"\n{'Metric':<30} {'Cloud':<20} {'Local v0.16.1':<20}")
print("-" * 70)
print(f"{'Total Trials':<30} {N_TRIALS:<20} {N_TRIALS:<20}")
print(f"{'Successful Trials':<30} {cloud_total:<20} {local_total:<20}")
print(f"{'Errors':<30} {cloud_results['errors']:<20} {local_results['errors']:<20}")
print("-" * 70)
print(f"{'Memory Tools Used':<30} {cloud_results['memory_used']:<20} {local_results['memory_used']:<20}")
print(f"{'No Memory Tools':<30} {cloud_results['no_memory']:<20} {local_results['no_memory']:<20}")
print("-" * 70)
print(f"{'% Memory on Turn 1':<30} {cloud_pct:.1f}%{'':<17} {local_pct:.1f}%")

# Visual comparison
print("\n" + "=" * 70)
print("VISUAL COMPARISON")
print("=" * 70)

bar_width = 40
cloud_bar = int(cloud_pct / 100 * bar_width)
local_bar = int(local_pct / 100 * bar_width)

print(f"\nCloud:  [{'‚ñà' * cloud_bar}{'‚ñë' * (bar_width - cloud_bar)}] {cloud_pct:.1f}%")
print(f"Local:  [{'‚ñà' * local_bar}{'‚ñë' * (bar_width - local_bar)}] {local_pct:.1f}%")

# Verdict
print("\n" + "=" * 70)
if abs(cloud_pct - local_pct) < 10:
    print("üéâ BEHAVIORS MATCH! Both show similar memory tool usage patterns.")
    print(f"   Difference: {abs(cloud_pct - local_pct):.1f}%")
elif cloud_pct < local_pct:
    print("‚ö†Ô∏è LOCAL is more PROACTIVE than Cloud")
    print(f"   Local uses memory {local_pct - cloud_pct:.1f}% more often")
else:
    print("‚ö†Ô∏è CLOUD is more PROACTIVE than Local")
    print(f"   Cloud uses memory {cloud_pct - local_pct:.1f}% more often")
print("=" * 70)


TEST 10: Statistical Memory Usage Comparison (N Trials)

üî¨ Running 10 trials on each server...
   Model: openai/gpt-4o-mini
   Test: Hangman prompt (first turn behavior)

----------------------------------------
httpx - INFO - HTTP Request: POST https://api.letta.com/v1/agents/ "HTTP/1.1 201 Created"
httpx - INFO - HTTP Request: POST https://api.letta.com/v1/agents/agent-a65106e9-8404-4d82-8362-be0885df1161/messages "HTTP/1.1 200 OK"
httpx - INFO - HTTP Request: DELETE https://api.letta.com/v1/agents/agent-a65106e9-8404-4d82-8362-be0885df1161 "HTTP/1.1 200 OK"
CLOUD TRIALS (1/10) - ‚úÖhttpx - INFO - HTTP Request: POST https://api.letta.com/v1/agents/ "HTTP/1.1 201 Created"
httpx - INFO - HTTP Request: POST https://api.letta.com/v1/agents/agent-64320326-3c12-4bb0-8923-6bff6bc39e14/messages "HTTP/1.1 200 OK"
httpx - INFO - HTTP Request: DELETE https://api.letta.com/v1/agents/agent-64320326-3c12-4bb0-8923-6bff6bc39e14 "HTTP/1.1 200 OK"
CLOUD TRIALS (2/10) - ‚úÖhttpx - INFO - HTTP Reque

In [27]:
# Show detailed tool call breakdown if any occurred
print("=" * 80)
print("DETAILED TOOL CALL BREAKDOWN")
print("=" * 80)

if cloud_results["tool_calls"]:
    print(f"\nüìä Cloud Tool Calls ({len(cloud_results['tool_calls'])} trials with tools):")
    for i, calls in enumerate(cloud_results["tool_calls"], 1):
        print(f"   Trial {i}: {', '.join(calls)}")
else:
    print("\n‚úÖ Cloud: No memory tools called in any trial")

if local_results["tool_calls"]:
    print(f"\nüìä Local Tool Calls ({len(local_results['tool_calls'])} trials with tools):")
    for i, calls in enumerate(local_results["tool_calls"], 1):
        print(f"   Trial {i}: {', '.join(calls)}")
else:
    print("\n‚úÖ Local: No memory tools called in any trial")

# Summary
print("\n" + "=" * 80)
print("CONCLUSION")
print("=" * 80)
print(f"""
With tool_rules=[] on both servers:

- Cloud API: {cloud_results['no_memory']}/{cloud_total} trials DISCRETIONARY ({100-cloud_pct:.0f}%)
- Local v0.16.1: {local_results['no_memory']}/{local_total} trials DISCRETIONARY ({100-local_pct:.0f}%)

{'‚úÖ Both servers now behave identically!' if abs(cloud_pct - local_pct) < 10 else '‚ö†Ô∏è Behavior differs - investigate further'}
""")


DETAILED TOOL CALL BREAKDOWN

‚úÖ Cloud: No memory tools called in any trial

üìä Local Tool Calls (1 trials with tools):
   Trial 1: memory_insert

CONCLUSION

With tool_rules=[] on both servers:

- Cloud API: 10/10 trials DISCRETIONARY (100%)
- Local v0.16.1: 9/10 trials DISCRETIONARY (90%)

‚ö†Ô∏è Behavior differs - investigate further



## TEST 11: GPT-OSS-20B Comparison (Same Config as Experiments)

Test with the exact same model configuration used in SCT experiments to get an apples-to-apples comparison.


In [28]:
import time

print("=" * 80)
print("TEST 11: GPT-OSS-20B via OpenRouter (Same as Experiments)")
print("=" * 80)

# Configuration - EXACTLY matching letta_agent.py and letta_config_gptoss_20b.yaml
N_TRIALS_OSS = 10

LLM_CONFIG_OSS = {
    "model": "openai/gpt-oss-20b",
    "model_endpoint_type": "openai",
    "model_endpoint": "https://openrouter.ai/api/v1",
    "context_window": 16384,
}

EMBEDDING_CONFIG_OSS = {
    "embedding_model": "openai/text-embedding-3-large",
    "embedding_endpoint_type": "openai",
    "embedding_endpoint": "https://openrouter.ai/api/v1",
    "embedding_dim": 1536,
}

# Results storage
oss_results = {"memory_used": 0, "no_memory": 0, "errors": 0, "tool_calls": []}

print(f"\nüî¨ Running {N_TRIALS_OSS} trials with GPT-OSS-20B...")
print(f"   Model: {LLM_CONFIG_OSS['model']}")
print(f"   Endpoint: {LLM_CONFIG_OSS['model_endpoint']}")
print(f"   tool_rules: []")
print()

# Connect to local server
client_oss = Letta(base_url="http://localhost:8283", timeout=1000)

# --- GPT-OSS-20B TRIALS ---
print("-" * 40)
print(f"GPT-OSS-20B TRIALS (0/{N_TRIALS_OSS})", end="", flush=True)
print("\r", end="")

for i in range(N_TRIALS_OSS):
    try:
        # Create agent with EXACT same config as letta_agent.py
        agent = client_oss.agents.create(
            name=f"test_oss_trial_{i}_{int(time.time())}",
            agent_type="letta_v1_agent",
            llm_config=LLM_CONFIG_OSS,
            embedding_config=EMBEDDING_CONFIG_OSS,
            tool_rules=[],  # THE KEY FIX
            memory_blocks=[
                {"label": "human", "value": HUMAN_BLOCK},
                {"label": "persona", "value": PERSONA_BLOCK},
            ],
        )
        
        # Send Hangman prompt (same as TEST_PROMPT)
        response = client_oss.agents.messages.create(
            agent_id=agent.id,
            messages=[{"role": "user", "content": TEST_PROMPT}]
        )
        
        # Check for tool calls
        tools_used = []
        for msg in response.messages:
            if msg.message_type == "tool_call_message":
                tools_used.append(msg.tool_call.name)
        
        if tools_used:
            oss_results["memory_used"] += 1
            oss_results["tool_calls"].append(tools_used)
        else:
            oss_results["no_memory"] += 1
        
        # Cleanup
        client_oss.agents.delete(agent.id)
        
        print(f"\rGPT-OSS-20B TRIALS ({i+1}/{N_TRIALS_OSS}) - {'üîß' if tools_used else '‚úÖ'}", end="", flush=True)
        
    except Exception as e:
        oss_results["errors"] += 1
        print(f"\rGPT-OSS-20B TRIALS ({i+1}/{N_TRIALS_OSS}) - ‚ùå Error: {str(e)[:50]}", end="", flush=True)
    
    time.sleep(1)  # Rate limiting for OpenRouter

print()

# --- RESULTS ---
print("\n" + "=" * 80)
print("GPT-OSS-20B RESULTS (Local Server)")
print("=" * 80)

oss_total = oss_results["memory_used"] + oss_results["no_memory"]
oss_pct = (oss_results["memory_used"] / oss_total * 100) if oss_total > 0 else 0

print(f"\n{'Metric':<30} {'Value':<20}")
print("-" * 50)
print(f"{'Total Trials':<30} {N_TRIALS_OSS:<20}")
print(f"{'Successful Trials':<30} {oss_total:<20}")
print(f"{'Errors':<30} {oss_results['errors']:<20}")
print("-" * 50)
print(f"{'Memory Tools Used (Turn 1)':<30} {oss_results['memory_used']:<20}")
print(f"{'No Memory Tools (Turn 1)':<30} {oss_results['no_memory']:<20}")
print("-" * 50)
print(f"{'% Memory on Turn 1':<30} {oss_pct:.1f}%")

# Visual
bar_width = 40
oss_bar = int(oss_pct / 100 * bar_width)
print(f"\nMemory Usage: [{'‚ñà' * oss_bar}{'‚ñë' * (bar_width - oss_bar)}] {oss_pct:.1f}%")

# Show tool calls
if oss_results["tool_calls"]:
    print(f"\nüìä Tool Calls Details ({len(oss_results['tool_calls'])} trials):")
    for i, calls in enumerate(oss_results["tool_calls"], 1):
        print(f"   Trial {i}: {', '.join(calls)}")
else:
    print("\n‚úÖ No memory tools called in any trial!")

print("\n" + "=" * 80)


TEST 11: GPT-OSS-20B via OpenRouter (Same as Experiments)

üî¨ Running 10 trials with GPT-OSS-20B...
   Model: openai/gpt-oss-20b
   Endpoint: https://openrouter.ai/api/v1
   tool_rules: []

----------------------------------------
httpx - INFO - HTTP Request: POST http://localhost:8283/v1/agents/ "HTTP/1.1 200 OK"
httpx - INFO - HTTP Request: POST http://localhost:8283/v1/agents/agent-ebd2abad-b500-433f-91f4-c369640e78c6/messages "HTTP/1.1 200 OK"
httpx - INFO - HTTP Request: DELETE http://localhost:8283/v1/agents/agent-ebd2abad-b500-433f-91f4-c369640e78c6 "HTTP/1.1 200 OK"
GPT-OSS-20B TRIALS (1/10) - üîßhttpx - INFO - HTTP Request: POST http://localhost:8283/v1/agents/ "HTTP/1.1 200 OK"
httpx - INFO - HTTP Request: POST http://localhost:8283/v1/agents/agent-bd57383a-f6ef-4ed9-902f-ffaa64a0082f/messages "HTTP/1.1 200 OK"
httpx - INFO - HTTP Request: DELETE http://localhost:8283/v1/agents/agent-bd57383a-f6ef-4ed9-902f-ffaa64a0082f "HTTP/1.1 200 OK"
GPT-OSS-20B TRIALS (2/10) - üîßhtt

In [36]:
# Final comparison across all configurations
print("=" * 80)
print("FINAL COMPARISON: All Models & Servers")
print("=" * 80)

print(f"{'Configuration':<35} {'Model':<25} {'Memory Turn 1':<15} {'%':<10}")
print("-" * 85)
cloud_str = str(cloud_results['memory_used']) + '/' + str(cloud_total)
local_str = str(local_results['memory_used']) + '/' + str(local_total)
oss_str = str(oss_results['memory_used']) + '/' + str(oss_total)
print(f"{'Cloud API':<35} {'gpt-4o-mini':<25} {cloud_str:<15} {cloud_pct:.1f}%")
print(f"{'Local v0.16.1':<35} {'gpt-4o-mini':<25} {local_str:<15} {local_pct:.1f}%")
print(f"{'Local v0.16.1 (Experiment Config)':<35} {'gpt-oss-20b':<25} {oss_str:<15} {oss_pct:.1f}%")
print("-" * 85)

print("üìä ANALYSIS:")
print(f"   ‚Ä¢ Cloud vs Local (same model):    {abs(cloud_pct - local_pct):.1f}% difference")
print(f"   ‚Ä¢ gpt-4o-mini vs gpt-oss-20b:     {abs(local_pct - oss_pct):.1f}% difference")

print(" " + "=" * 80)
print("CONCLUSIONS")
print("=" * 80)

# Determine model impact
if oss_pct > local_pct + 20:
    print("""
üîç KEY FINDING: GPT-OSS-20B is MORE PROACTIVE than GPT-4o-mini

   This explains why:
   - Test 10 (gpt-4o-mini): ~10% memory usage on turn 1
   - SCT Experiments (gpt-oss-20b): Higher memory usage
   
   The model behavior is inherent to GPT-OSS-20B, not a server bug.
""")
elif oss_pct < 20:
    print("""
‚úÖ GPT-OSS-20B behaves similarly to GPT-4o-mini with tool_rules=[]

   Both models show DISCRETIONARY behavior on turn 1.
   The 83% "saved secret word" in experiments likely happens on LATER turns,
   not the first turn.
""")
else:
    print(f"""
üìä GPT-OSS-20B shows {oss_pct:.0f}% proactive behavior on turn 1

   Compare to:
   - Cloud (gpt-4o-mini): {cloud_pct:.0f}%
   - Local (gpt-4o-mini): {local_pct:.0f}%
""")

print("=" * 80)

FINAL COMPARISON: All Models & Servers
Configuration                       Model                     Memory Turn 1   %         
-------------------------------------------------------------------------------------
Cloud API                           gpt-4o-mini               0/10            0.0%
Local v0.16.1                       gpt-4o-mini               1/10            10.0%
Local v0.16.1 (Experiment Config)   gpt-oss-20b               6/9             66.7%
-------------------------------------------------------------------------------------
üìä ANALYSIS:
   ‚Ä¢ Cloud vs Local (same model):    10.0% difference
   ‚Ä¢ gpt-4o-mini vs gpt-oss-20b:     56.7% difference
CONCLUSIONS

üîç KEY FINDING: GPT-OSS-20B is MORE PROACTIVE than GPT-4o-mini

   This explains why:
   - Test 10 (gpt-4o-mini): ~10% memory usage on turn 1
   - SCT Experiments (gpt-oss-20b): Higher memory usage

   The model behavior is inherent to GPT-OSS-20B, not a server bug.



## Cleanup (Updated)


In [17]:
print("üóëÔ∏è Cleaning up test agents...\n")

agents_to_delete = [
    (client_cloud, agent_cloud, "Cloud baseline"),
    (client_local, agent_local, "Local baseline"),
]

# Add optional agents if they exist
if 'agent_local_fixed' in locals():
    agents_to_delete.append((client_local, agent_local_fixed, "Local with exit rule"))
if 'agent_local_empty' in locals():
    agents_to_delete.append((client_local, agent_local_empty, "Local with empty rules"))

for client, agent, name in agents_to_delete:
    try:
        client.agents.delete(agent.id)
        print(f"‚úÖ Deleted {name}: {agent.id[:16]}...")
    except Exception as e:
        print(f"‚ö†Ô∏è Failed to delete {name}: {e}")

print("\n‚úÖ Cleanup complete")
print("\n" + "=" * 80)
print("END OF DIAGNOSTIC")
print("=" * 80)


üóëÔ∏è Cleaning up test agents...

httpx - INFO - HTTP Request: DELETE https://api.letta.com/v1/agents/agent-50f90394-ab6b-49d5-9e6f-325c6dc67e89 "HTTP/1.1 404 Not Found"
‚ö†Ô∏è Failed to delete Cloud baseline: headers: {'date': 'Tue, 23 Dec 2025 13:20:17 GMT', 'content-type': 'application/json; charset=utf-8', 'transfer-encoding': 'chunked', 'connection': 'keep-alive', 'x-powered-by': 'Express', 'access-control-allow-origin': '*', 'etag': 'W/"1d-hJZ9YMiqAqfvFbRrXLLoXrVWA1M"', 'cf-cache-status': 'DYNAMIC', 'content-encoding': 'br', 'server': 'cloudflare', 'cf-ray': '9b2827cd9928a298-YUL', 'alt-svc': 'h3=":443"; ma=86400'}, status_code: 404, body: {'message': 'Agent not found'}
httpx - INFO - HTTP Request: DELETE http://localhost:8283/v1/agents/agent-64273794-3e99-4a62-aa8d-4eda8d2dccb2 "HTTP/1.1 404 Not Found"
‚ö†Ô∏è Failed to delete Local baseline: headers: {'date': 'Tue, 23 Dec 2025 13:20:17 GMT', 'server': 'uvicorn', 'content-length': '376', 'content-type': 'application/json'}, sta