chughtapan · chughtapan · Oct 27, 2025 · Oct 27, 2025 · Oct 27, 2025 · Oct 27, 2025
diff --git a/tests/benchmarks/appworld/mcp_server.py b/tests/benchmarks/appworld/mcp_server.py
@@ -230,7 +230,7 @@ async def call_tool(name: str, arguments: dict[str, Any]) -> Any:
             # Save databases on task completion
             if api_name == "complete_task" or name == "supervisor__complete_task":
                 Path(db_paths.output_db_path).mkdir(parents=True, exist_ok=True)
-                collections.model_collection.save(db_home_path=db_paths.output_db_path)
+                collections.model_collection.save(db_home_path=db_paths.output_db_path, save_model_hashes=True)
 
             return format_tool_response(response)
         except Exception as e:

diff --git a/tests/benchmarks/appworld/prompts.py b/tests/benchmarks/appworld/prompts.py
@@ -13,13 +13,12 @@
 EXPERIMENTS_PATH = Path(appworld_experiments.__file__).parent
 
 
-def load_system_instruction(task: Task, max_steps: int = 40) -> str:
+def load_system_instruction(task: Task) -> str:
     """
     Load and render system instruction from AppWorld's template with demo examples.
 
     Args:
         task: AppWorld Task object
-        max_steps: Maximum number of turns allowed
 
     Returns:
         Rendered system instruction with supervisor info, rules, and demos
@@ -40,7 +39,6 @@ def load_system_instruction(task: Task, max_steps: int = 40) -> str:
         template_content,
         main_user=task.supervisor,
         app_descriptions=app_descriptions_yaml,
-        max_steps=max_steps,
     )
 
     # Load demo messages and format them

diff --git a/tests/benchmarks/appworld/system_instruction.txt b/tests/benchmarks/appworld/system_instruction.txt
@@ -5,7 +5,7 @@ My name is: {{ main_user.first_name }} {{ main_user.last_name }}. My personal em
 
 You will be given a task instruction and a list of functions in the standard format. The functions correspond to APIs from various apps you have access to. The function name has two parts, the app name and API name separated by "__", e.g., spotify__login is the login API for the Spotify app.
 
-You will complete the task completely autonomously through multi-turn interaction with the execution environment. In each turn, you will make one or more function calls, and the environment will return its outputs. This will continue either until you call `complete_task` API from the Supervisor app, or until a maximum of {max_steps} turns are reached.
+You will complete the task completely autonomously through multi-turn interaction with the execution environment. In each turn, you will make one or more function calls, and the environment will return its outputs. This will continue until you call the `complete_task` API from the Supervisor app.
 
 Here are brief app-wise descriptions.
 
@@ -21,7 +21,7 @@ A. General instructions:
 - Never leave placeholders; don't output things like "your_username". Always fill in the real value by retrieving it via APIs (e.g., Supervisor app for credentials).
 - When I omit details, choose any valid value. For example, if I ask you to buy something but don't specify which payment card to use, you may pick any one of my available cards.
 - Avoid collateral damage. Only perform what I explicitly ask for. Example: if I ask you to buy something, do not delete emails, return the order, or perform unrelated account operations.
-- You only have {max_steps} turns. Avoid unnecessary requests. You can batch unlimited function calls in a single turn - always group them to save steps.
+- Avoid unnecessary requests.
 
 B. App-specific instructions:
 

diff --git a/tests/benchmarks/appworld/test_appworld.py b/tests/benchmarks/appworld/test_appworld.py
@@ -50,7 +50,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
 
 
 @pytest.mark.asyncio
-@pytest.mark.timeout(300)
+@pytest.mark.timeout(900)
 async def test_appworld(
     task_id: str,
     model: str,

diff --git a/uv.lock b/uv.lock