diff --git a/eval_protocol/benchmarks/test_frozen_lake.py b/eval_protocol/benchmarks/test_frozen_lake.py index ac5c998a..f0b7ef55 100644 --- a/eval_protocol/benchmarks/test_frozen_lake.py +++ b/eval_protocol/benchmarks/test_frozen_lake.py @@ -39,7 +39,11 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation input_dataset=["tests/pytest/data/frozen_lake_dataset.jsonl"], dataset_adapter=frozen_lake_to_evaluation_row, completion_params=[ - {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"} + { + "temperature": 0.0, + "max_tokens": 4096, + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", + } ], rollout_processor=MCPGymRolloutProcessor(), passed_threshold=0.66, diff --git a/tests/chinook/pydantic/test_pydantic_chinook.py b/tests/chinook/pydantic/test_pydantic_chinook.py index 62c427eb..c8f515f5 100644 --- a/tests/chinook/pydantic/test_pydantic_chinook.py +++ b/tests/chinook/pydantic/test_pydantic_chinook.py @@ -32,7 +32,7 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent: "completion_params", [ { - "model": "accounts/fireworks/models/kimi-k2-instruct", + "model": "accounts/fireworks/models/kimi-k2-instruct-0905", "provider": "fireworks", }, { @@ -82,7 +82,7 @@ async def test_simple_query(row: EvaluationRow) -> EvaluationRow: ) else: model = OpenAIChatModel( - "accounts/fireworks/models/kimi-k2-instruct", + "accounts/fireworks/models/kimi-k2-instruct-0905", provider="fireworks", ) diff --git a/tests/pytest/test_apps_coding.py b/tests/pytest/test_apps_coding.py index ef195791..d7157ff1 100644 --- a/tests/pytest/test_apps_coding.py +++ b/tests/pytest/test_apps_coding.py @@ -27,7 +27,11 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio input_dataset=["tests/pytest/data/apps_sample_dataset.jsonl"], dataset_adapter=apps_dataset_to_evaluation_row, completion_params=[ - {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"} + { + "temperature": 0.0, + "max_tokens": 4096, + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", + } ], passed_threshold=0.33, rollout_processor=SingleTurnRolloutProcessor(), diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py index 4945d378..3b3ce560 100644 --- a/tests/pytest/test_basic_coding.py +++ b/tests/pytest/test_basic_coding.py @@ -29,7 +29,11 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat input_dataset=["tests/pytest/data/basic_coding_dataset.jsonl"], dataset_adapter=coding_dataset_to_evaluation_row, completion_params=[ - {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"} + { + "temperature": 0.0, + "max_tokens": 4096, + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", + } ], passed_threshold=0.8, rollout_processor=SingleTurnRolloutProcessor(), diff --git a/tests/pytest/test_frozen_lake.py b/tests/pytest/test_frozen_lake.py index 24e32b56..2a053425 100644 --- a/tests/pytest/test_frozen_lake.py +++ b/tests/pytest/test_frozen_lake.py @@ -39,7 +39,11 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation input_dataset=["tests/pytest/data/frozen_lake_dataset.jsonl"], dataset_adapter=frozen_lake_to_evaluation_row, completion_params=[ - {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"} + { + "temperature": 0.0, + "max_tokens": 4096, + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", + } ], rollout_processor=MCPGymRolloutProcessor(), passed_threshold=0.66, diff --git a/tests/pytest/test_hallucination.py b/tests/pytest/test_hallucination.py index a2e27e6a..0003a88e 100644 --- a/tests/pytest/test_hallucination.py +++ b/tests/pytest/test_hallucination.py @@ -16,7 +16,7 @@ from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test # Configure the judge model for LiteLLM -JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct" +JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905" def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[EvaluationRow]: @@ -35,7 +35,11 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation input_dataset=["tests/pytest/data/halueval_sample_dataset.jsonl"], dataset_adapter=hallucination_dataset_adapter, completion_params=[ - {"temperature": 0.0, "max_tokens": 512, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"} + { + "temperature": 0.0, + "max_tokens": 512, + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", + } ], rollout_processor=SingleTurnRolloutProcessor(), passed_threshold=0.33, diff --git a/tests/pytest/test_openenv_browsergym_basic.py b/tests/pytest/test_openenv_browsergym_basic.py index f87a663b..0f52a7ad 100644 --- a/tests/pytest/test_openenv_browsergym_basic.py +++ b/tests/pytest/test_openenv_browsergym_basic.py @@ -65,7 +65,7 @@ def test_openenv_browsergym_basic(): "model": os.getenv( "OPENENV_TEST_MODEL", # Default to a Fireworks public model id used elsewhere in tests; requires FIREWORKS_API_KEY - "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct", + "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", ), "temperature": 0.0, "max_tokens": 16, diff --git a/tests/pytest/test_openenv_browsergym_eval.py b/tests/pytest/test_openenv_browsergym_eval.py index 505336ae..251608fa 100644 --- a/tests/pytest/test_openenv_browsergym_eval.py +++ b/tests/pytest/test_openenv_browsergym_eval.py @@ -233,7 +233,7 @@ def action_parser(response_text: str): { "temperature": 0.0, "max_tokens": 512, - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct", + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", } ], # Keep concurrency and steps low for a quick health-check diff --git a/tests/pytest/test_openenv_echo_hub.py b/tests/pytest/test_openenv_echo_hub.py index a80e2ef9..3b8bed01 100644 --- a/tests/pytest/test_openenv_echo_hub.py +++ b/tests/pytest/test_openenv_echo_hub.py @@ -76,7 +76,7 @@ def action_parser(response_text: str): "temperature": 0.0, "max_tokens": 16, # Any working model with your API key; match other tests' default - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct", + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", } ], num_runs=1, diff --git a/tests/pytest/test_openenv_textarena_docker.py b/tests/pytest/test_openenv_textarena_docker.py index b752cb91..6e9da0c5 100644 --- a/tests/pytest/test_openenv_textarena_docker.py +++ b/tests/pytest/test_openenv_textarena_docker.py @@ -94,7 +94,7 @@ def action_parser(response_text: str): "temperature": 0.7, "max_tokens": 32, # Any working model with your API key - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct", + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", } ], num_runs=1, diff --git a/tests/pytest/test_pytest_default_agent_rollout_processor.py b/tests/pytest/test_pytest_default_agent_rollout_processor.py index d057108b..9ed2b3dc 100644 --- a/tests/pytest/test_pytest_default_agent_rollout_processor.py +++ b/tests/pytest/test_pytest_default_agent_rollout_processor.py @@ -19,7 +19,7 @@ ] ], rollout_processor=AgentRolloutProcessor(), - completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}], mode="all", ) def test_pytest_default_agent_rollout_processor(rows: List[EvaluationRow]) -> List[EvaluationRow]: diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py index 00f48c9c..67ff6de4 100644 --- a/tests/pytest/test_pytest_klavis_mcp.py +++ b/tests/pytest/test_pytest_klavis_mcp.py @@ -16,7 +16,7 @@ class ResponseFormat(BaseModel): @evaluation_test( input_dataset=["tests/pytest/datasets/gmail_inbox.jsonl"], rollout_processor=AgentRolloutProcessor(), - completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}], mode="pointwise", mcp_config_path="tests/pytest/mcp_configurations/klavis_strata_mcp.json", ) diff --git a/tests/pytest/test_pytest_mcp_url.py b/tests/pytest/test_pytest_mcp_url.py index c8063492..c0d35b87 100644 --- a/tests/pytest/test_pytest_mcp_url.py +++ b/tests/pytest/test_pytest_mcp_url.py @@ -21,7 +21,7 @@ ] ], rollout_processor=AgentRolloutProcessor(), - completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}], mode="pointwise", mcp_config_path="tests/pytest/mcp_configurations/docs_mcp_config.json", )