eval-protocol · xzrderek · Nov 27, 2025 · Nov 27, 2025 · Nov 27, 2025
diff --git a/eval_protocol/benchmarks/test_frozen_lake.py b/eval_protocol/benchmarks/test_frozen_lake.py
@@ -39,7 +39,11 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
     input_dataset=["tests/pytest/data/frozen_lake_dataset.jsonl"],
     dataset_adapter=frozen_lake_to_evaluation_row,
     completion_params=[
-        {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
+        {
+            "temperature": 0.0,
+            "max_tokens": 4096,
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+        }
     ],
     rollout_processor=MCPGymRolloutProcessor(),
     passed_threshold=0.66,

diff --git a/tests/chinook/pydantic/test_pydantic_chinook.py b/tests/chinook/pydantic/test_pydantic_chinook.py
@@ -32,7 +32,7 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent:
     "completion_params",
     [
         {
-            "model": "accounts/fireworks/models/kimi-k2-instruct",
+            "model": "accounts/fireworks/models/kimi-k2-instruct-0905",
             "provider": "fireworks",
         },
         {
@@ -82,7 +82,7 @@ async def test_simple_query(row: EvaluationRow) -> EvaluationRow:
         )
     else:
         model = OpenAIChatModel(
-            "accounts/fireworks/models/kimi-k2-instruct",
+            "accounts/fireworks/models/kimi-k2-instruct-0905",
             provider="fireworks",
         )
 

diff --git a/tests/pytest/test_apps_coding.py b/tests/pytest/test_apps_coding.py
@@ -27,7 +27,11 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
     input_dataset=["tests/pytest/data/apps_sample_dataset.jsonl"],
     dataset_adapter=apps_dataset_to_evaluation_row,
     completion_params=[
-        {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
+        {
+            "temperature": 0.0,
+            "max_tokens": 4096,
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+        }
     ],
     passed_threshold=0.33,
     rollout_processor=SingleTurnRolloutProcessor(),

diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py
@@ -29,7 +29,11 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
     input_dataset=["tests/pytest/data/basic_coding_dataset.jsonl"],
     dataset_adapter=coding_dataset_to_evaluation_row,
     completion_params=[
-        {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
+        {
+            "temperature": 0.0,
+            "max_tokens": 4096,
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+        }
     ],
     passed_threshold=0.8,
     rollout_processor=SingleTurnRolloutProcessor(),

diff --git a/tests/pytest/test_frozen_lake.py b/tests/pytest/test_frozen_lake.py
@@ -39,7 +39,11 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
     input_dataset=["tests/pytest/data/frozen_lake_dataset.jsonl"],
     dataset_adapter=frozen_lake_to_evaluation_row,
     completion_params=[
-        {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
+        {
+            "temperature": 0.0,
+            "max_tokens": 4096,
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+        }
     ],
     rollout_processor=MCPGymRolloutProcessor(),
     passed_threshold=0.66,

diff --git a/tests/pytest/test_hallucination.py b/tests/pytest/test_hallucination.py
@@ -16,7 +16,7 @@
 from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test
 
 # Configure the judge model for LiteLLM
-JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"
+JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"
 
 
 def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
@@ -35,7 +35,11 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
     input_dataset=["tests/pytest/data/halueval_sample_dataset.jsonl"],
     dataset_adapter=hallucination_dataset_adapter,
     completion_params=[
-        {"temperature": 0.0, "max_tokens": 512, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
+        {
+            "temperature": 0.0,
+            "max_tokens": 512,
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
+        }
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     passed_threshold=0.33,

diff --git a/tests/pytest/test_openenv_browsergym_basic.py b/tests/pytest/test_openenv_browsergym_basic.py
@@ -65,7 +65,7 @@ def test_openenv_browsergym_basic():
         "model": os.getenv(
             "OPENENV_TEST_MODEL",
             # Default to a Fireworks public model id used elsewhere in tests; requires FIREWORKS_API_KEY
-            "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
+            "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
         ),
         "temperature": 0.0,
         "max_tokens": 16,

diff --git a/tests/pytest/test_openenv_browsergym_eval.py b/tests/pytest/test_openenv_browsergym_eval.py
@@ -233,7 +233,7 @@ def action_parser(response_text: str):
         {
             "temperature": 0.0,
             "max_tokens": 512,
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
         }
     ],
     # Keep concurrency and steps low for a quick health-check

diff --git a/tests/pytest/test_openenv_echo_hub.py b/tests/pytest/test_openenv_echo_hub.py
@@ -76,7 +76,7 @@ def action_parser(response_text: str):
             "temperature": 0.0,
             "max_tokens": 16,
             # Any working model with your API key; match other tests' default
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
         }
     ],
     num_runs=1,

diff --git a/tests/pytest/test_openenv_textarena_docker.py b/tests/pytest/test_openenv_textarena_docker.py
@@ -94,7 +94,7 @@ def action_parser(response_text: str):
             "temperature": 0.7,
             "max_tokens": 32,
             # Any working model with your API key
-            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
+            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
         }
     ],
     num_runs=1,

diff --git a/tests/pytest/test_pytest_default_agent_rollout_processor.py b/tests/pytest/test_pytest_default_agent_rollout_processor.py
@@ -19,7 +19,7 @@
         ]
     ],
     rollout_processor=AgentRolloutProcessor(),
-    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
+    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}],
     mode="all",
 )
 def test_pytest_default_agent_rollout_processor(rows: List[EvaluationRow]) -> List[EvaluationRow]:

diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py
@@ -16,7 +16,7 @@ class ResponseFormat(BaseModel):
 @evaluation_test(
     input_dataset=["tests/pytest/datasets/gmail_inbox.jsonl"],
     rollout_processor=AgentRolloutProcessor(),
-    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
+    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}],
     mode="pointwise",
     mcp_config_path="tests/pytest/mcp_configurations/klavis_strata_mcp.json",
 )

diff --git a/tests/pytest/test_pytest_mcp_url.py b/tests/pytest/test_pytest_mcp_url.py
@@ -21,7 +21,7 @@
         ]
     ],
     rollout_processor=AgentRolloutProcessor(),
-    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
+    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}],
     mode="pointwise",
     mcp_config_path="tests/pytest/mcp_configurations/docs_mcp_config.json",
 )