Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion eval_protocol/benchmarks/test_frozen_lake.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,11 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
input_dataset=["tests/pytest/data/frozen_lake_dataset.jsonl"],
dataset_adapter=frozen_lake_to_evaluation_row,
completion_params=[
{"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
{
"temperature": 0.0,
"max_tokens": 4096,
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
}
],
rollout_processor=MCPGymRolloutProcessor(),
passed_threshold=0.66,
Expand Down
4 changes: 2 additions & 2 deletions tests/chinook/pydantic/test_pydantic_chinook.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent:
"completion_params",
[
{
"model": "accounts/fireworks/models/kimi-k2-instruct",
"model": "accounts/fireworks/models/kimi-k2-instruct-0905",
"provider": "fireworks",
},
{
Expand Down Expand Up @@ -82,7 +82,7 @@ async def test_simple_query(row: EvaluationRow) -> EvaluationRow:
)
else:
model = OpenAIChatModel(
"accounts/fireworks/models/kimi-k2-instruct",
"accounts/fireworks/models/kimi-k2-instruct-0905",
provider="fireworks",
)

Expand Down
6 changes: 5 additions & 1 deletion tests/pytest/test_apps_coding.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
input_dataset=["tests/pytest/data/apps_sample_dataset.jsonl"],
dataset_adapter=apps_dataset_to_evaluation_row,
completion_params=[
{"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
{
"temperature": 0.0,
"max_tokens": 4096,
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
}
],
passed_threshold=0.33,
rollout_processor=SingleTurnRolloutProcessor(),
Expand Down
6 changes: 5 additions & 1 deletion tests/pytest/test_basic_coding.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,11 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
input_dataset=["tests/pytest/data/basic_coding_dataset.jsonl"],
dataset_adapter=coding_dataset_to_evaluation_row,
completion_params=[
{"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
{
"temperature": 0.0,
"max_tokens": 4096,
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
}
],
passed_threshold=0.8,
rollout_processor=SingleTurnRolloutProcessor(),
Expand Down
6 changes: 5 additions & 1 deletion tests/pytest/test_frozen_lake.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,11 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
input_dataset=["tests/pytest/data/frozen_lake_dataset.jsonl"],
dataset_adapter=frozen_lake_to_evaluation_row,
completion_params=[
{"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
{
"temperature": 0.0,
"max_tokens": 4096,
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
}
],
rollout_processor=MCPGymRolloutProcessor(),
passed_threshold=0.66,
Expand Down
8 changes: 6 additions & 2 deletions tests/pytest/test_hallucination.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test

# Configure the judge model for LiteLLM
JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"
JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"


def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
Expand All @@ -35,7 +35,11 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
input_dataset=["tests/pytest/data/halueval_sample_dataset.jsonl"],
dataset_adapter=hallucination_dataset_adapter,
completion_params=[
{"temperature": 0.0, "max_tokens": 512, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
{
"temperature": 0.0,
"max_tokens": 512,
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
}
],
rollout_processor=SingleTurnRolloutProcessor(),
passed_threshold=0.33,
Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/test_openenv_browsergym_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_openenv_browsergym_basic():
"model": os.getenv(
"OPENENV_TEST_MODEL",
# Default to a Fireworks public model id used elsewhere in tests; requires FIREWORKS_API_KEY
"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
),
"temperature": 0.0,
"max_tokens": 16,
Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/test_openenv_browsergym_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def action_parser(response_text: str):
{
"temperature": 0.0,
"max_tokens": 512,
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
}
],
# Keep concurrency and steps low for a quick health-check
Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/test_openenv_echo_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def action_parser(response_text: str):
"temperature": 0.0,
"max_tokens": 16,
# Any working model with your API key; match other tests' default
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
}
],
num_runs=1,
Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/test_openenv_textarena_docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def action_parser(response_text: str):
"temperature": 0.7,
"max_tokens": 32,
# Any working model with your API key
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
}
],
num_runs=1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
]
],
rollout_processor=AgentRolloutProcessor(),
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}],
mode="all",
)
def test_pytest_default_agent_rollout_processor(rows: List[EvaluationRow]) -> List[EvaluationRow]:
Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/test_pytest_klavis_mcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class ResponseFormat(BaseModel):
@evaluation_test(
input_dataset=["tests/pytest/datasets/gmail_inbox.jsonl"],
rollout_processor=AgentRolloutProcessor(),
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}],
mode="pointwise",
mcp_config_path="tests/pytest/mcp_configurations/klavis_strata_mcp.json",
)
Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/test_pytest_mcp_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
]
],
rollout_processor=AgentRolloutProcessor(),
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"}],
mode="pointwise",
mcp_config_path="tests/pytest/mcp_configurations/docs_mcp_config.json",
)
Expand Down
Loading