Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions tests/pytest/data/json_schema.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n \"user\": {\n \"name\": \"string\",\n \"age\": \"number\",\n \"isActive\": \"boolean\"\n },\n \"products\": [\n {\n \"id\": \"string\",\n \"price\": \"number\"\n }\n ]\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n \"user\": {\n \"name\": \"John Doe\",\n \"age\": 30,\n \"isActive\": true\n },\n \"products\": [\n {\n \"id\": \"prod-1\",\n \"price\": 29.99\n }\n ]\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 1.0, "reason": "Perfect schema match", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "exact_match", "description": "JSON content that exactly matches the expected schema", "expected_schema": {"type": "object", "properties": {"user": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}, "isActive": {"type": "boolean"}}}, "products": {"type": "array", "items": {"type": "object", "properties": {"id": {"type": "string"}, "price": {"type": "number"}}}}}}}}}
{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n \"user\": {\n \"name\": \"string\",\n \"age\": \"number\",\n \"isActive\": \"boolean\",\n \"address\": {\n \"street\": \"string\",\n \"city\": \"string\"\n }\n }\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n \"user\": {\n \"name\": \"John Doe\",\n \"age\": 30,\n \"email\": \"john@example.com\"\n }\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.5, "reason": "Partial schema match with missing and extra properties", "metrics": {"schema_similarity": {"score": 0.5, "reason": "Schema similarity: 0.50", "is_score_valid": false}}}, "input_metadata": {"dataset_info": {"test_case": "partial_match", "description": "JSON content with missing and extra properties compared to expected schema", "expected_schema": {"type": "object", "properties": {"user": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}, "isActive": {"type": "boolean"}, "address": {"type": "object", "properties": {"street": {"type": "string"}, "city": {"type": "string"}}}}}}}}}}
{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n \"name\": \"string\",\n \"age\": \"number\"\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n \"name\": \"John Doe\",\n \"age\": 30\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 1.0, "reason": "Perfect schema match for simple object", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "extract_from_message", "description": "JSON extracted from message content with code block formatting", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}}}}}}
{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n \"name\": \"string\",\n \"age\": \"number\",\n \"tags\": [\"string\"]\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n \"name\": \"John Doe\",\n \"age\": \"30\",\n \"tags\": {\n \"tag1\": \"value1\"\n }\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.33, "reason": "Type mismatches in schema validation", "metrics": {"schema_similarity": {"score": 0.33, "reason": "Schema similarity: 0.33", "is_score_valid": false}}}, "input_metadata": {"dataset_info": {"test_case": "mismatched_types", "description": "JSON content with type mismatches (string instead of number, object instead of array)", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}, "tags": {"type": "array", "items": {"type": "string"}}}}}}}
{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n \"name\": \"string\",\n \"age\": \"number\"\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n \"name\": \"John Doe\",\n \"age\": 30\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 1.0, "reason": "Perfect schema match using JSON string input", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "json_string_input", "description": "JSON schema validation using JSON string instead of object", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}}}}}}
{"messages": [{"role": "user", "content": "Give me an empty object in the following JSON format:\n```json\n{}\n```"}, {"role": "assistant", "content": "Here's an empty object:\n```json\n{}\n```"}], "ground_truth": null, "evaluation_result": {"score": 1.0, "reason": "Perfect match for empty object schema", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "empty_properties", "description": "JSON schema validation with empty object properties", "expected_schema": {"type": "object", "properties": {}}}}}
{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n \"name\": \"string\",\n \"age\": \"number\"\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n \"name\": \"John Doe\",\n \"age\": \n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.0, "reason": "Invalid JSON content", "metrics": {"error": {"score": 0.0, "reason": "Invalid JSON content: Here's the user information:\n```json\n{\n \"name\": \"John Doe\",\n \"age\": \n}\n```", "is_score_valid": false}}}, "input_metadata": {"dataset_info": {"test_case": "invalid_json", "description": "JSON schema validation with invalid JSON syntax", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}}}}}}
{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n \"name\": \"string\",\n \"age\": \"number\"\n}\n```\n\nPlease provide a user object with name and age."}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n \"name\": \"John Doe\",\n \"age\": 30\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.925, "reason": "Combined schema and LLM evaluation", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00\nMatching properties (2):\n - age: number\n - name: string", "is_score_valid": true}, "llm_judge": {"score": 0.85, "reason": "This is a good JSON structure that matches the expected schema. The data is well-formatted and contains the required fields with appropriate types.", "is_score_valid": false}, "schema_score": {"score": 1.0, "reason": "Schema validation score: 1.00", "is_score_valid": true}, "llm_score": {"score": 0.85, "reason": "LLM judge score: 0.85", "is_score_valid": false}, "weights": {"score": 0.0, "reason": "Weights used - Schema: 0.7, LLM: 0.3", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "llm_judge_integration", "description": "Combined schema validation and LLM judgment evaluation", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}}}, "expected_behavior": "Provide a user object with name and age"}}}
{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n \"name\": \"string\",\n \"age\": \"number\"\n}\n```\n\nPlease provide a user object with name and age."}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n \"name\": \"John Doe\",\n \"age\": 30\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.72, "reason": "Combined evaluation with custom weights", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00\nMatching properties (2):\n - age: number\n - name: string", "is_score_valid": true}, "llm_judge": {"score": 0.6, "reason": "This JSON is acceptable but has some issues. The structure is correct but could be more comprehensive.", "is_score_valid": false}, "schema_score": {"score": 1.0, "reason": "Schema validation score: 1.00", "is_score_valid": true}, "llm_score": {"score": 0.6, "reason": "LLM judge score: 0.6", "is_score_valid": false}, "weights": {"score": 0.0, "reason": "Weights used - Schema: 0.3, LLM: 0.7", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "custom_weights", "description": "Combined evaluation with custom weights (schema=0.3, llm=0.7)", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}}}, "expected_behavior": "Provide a user object with name and age"}}}
{"messages": [{"role": "user", "content": "Create a complex nested object in the following JSON format:\n```json\n{\n \"company\": {\n \"name\": \"string\",\n \"employees\": [\n {\n \"id\": \"string\",\n \"name\": \"string\",\n \"department\": {\n \"name\": \"string\",\n \"location\": \"string\",\n \"code\": \"string\"\n },\n \"salary\": \"number\",\n \"skills\": [\"string\"]\n }\n ],\n \"metadata\": {\n \"founded\": \"number\",\n \"isPublic\": \"boolean\",\n \"tags\": [\"string\"]\n }\n }\n}\n```"}, {"role": "assistant", "content": "Here's a complex nested object:\n```json\n{\n \"company\": {\n \"name\": \"Tech Corp\",\n \"employees\": [\n {\n \"id\": \"emp-001\",\n \"name\": \"Alice Johnson\",\n \"department\": {\n \"name\": \"Engineering\",\n \"location\": \"Building A\",\n \"extra_field\": \"some value\"\n },\n \"skills\": [\"Python\", \"JavaScript\", \"SQL\"]\n },\n {\n \"id\": \"emp-002\",\n \"name\": \"Bob Smith\",\n \"department\": {\n \"name\": \"Marketing\",\n \"location\": \"Building B\"\n },\n \"skills\": [\"Content Writing\", \"SEO\"]\n }\n ],\n \"metadata\": {\n \"founded\": 2010,\n \"isPublic\": false,\n \"tags\": [\"technology\", \"startup\"]\n }\n }\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.8, "reason": "Complex nested schema with some mismatches", "metrics": {"schema_similarity": {"score": 0.8, "reason": "Schema similarity: 0.80", "is_score_valid": false}}}, "input_metadata": {"dataset_info": {"test_case": "complex_nested", "description": "Complex nested JSON object with arrays and objects", "expected_schema": {"type": "object", "properties": {"company": {"type": "object", "properties": {"name": {"type": "string"}, "employees": {"type": "array", "items": {"type": "object", "properties": {"id": {"type": "string"}, "name": {"type": "string"}, "department": {"type": "object", "properties": {"name": {"type": "string"}, "location": {"type": "string"}, "code": {"type": "string"}}}, "salary": {"type": "number"}, "skills": {"type": "array", "items": {"type": "string"}}}}}}}, "metadata": {"type": "object", "properties": {"founded": {"type": "number"}, "isPublic": {"type": "boolean"}, "tags": {"type": "array", "items": {"type": "string"}}}}}}}}}
{"messages": [{"role": "user", "content": "Provide an array of simple objects in the following JSON format:\n```json\n[\n {\n \"id\": \"number\",\n \"name\": \"string\",\n \"active\": \"boolean\"\n }\n]\n```"}, {"role": "assistant", "content": "Here's an array of simple objects:\n```json\n[\n {\n \"id\": 1,\n \"name\": \"Item One\",\n \"active\": true\n },\n {\n \"id\": 2,\n \"name\": \"Item Two\",\n \"active\": false\n },\n {\n \"id\": 3,\n \"name\": \"Item Three\",\n \"active\": true\n }\n]\n```"}], "ground_truth": null, "evaluation_result": {"score": 1.0, "reason": "Perfect array schema match", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "array_of_objects", "description": "JSON array containing simple objects with consistent structure", "expected_schema": {"type": "array", "items": {"type": "object", "properties": {"id": {"type": "number"}, "name": {"type": "string"}, "active": {"type": "boolean"}}}}}}}
{"messages": [{"role": "user", "content": "Give me a mixed data structure in the following JSON format:\n```json\n{\n \"string_field\": \"string\",\n \"number_field\": \"number\",\n \"boolean_field\": \"boolean\",\n \"null_field\": \"null\",\n \"array_field\": [\"number\"],\n \"object_field\": {\n \"nested_string\": \"string\",\n \"nested_number\": \"number\"\n },\n \"date_field\": \"string\",\n \"enum_field\": \"string\"\n}\n```"}, {"role": "assistant", "content": "Here's a mixed data structure:\n```json\n{\n \"string_field\": \"Hello World\",\n \"number_field\": 42,\n \"boolean_field\": true,\n \"null_field\": null,\n \"array_field\": [1, 2, 3, 4, 5],\n \"object_field\": {\n \"nested_string\": \"Nested Value\",\n \"nested_number\": 123.45\n },\n \"extra_field\": \"unexpected field\"\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.75, "reason": "Mixed data types with some schema mismatches", "metrics": {"schema_similarity": {"score": 0.75, "reason": "Schema similarity: 0.75", "is_score_valid": false}}}, "input_metadata": {"dataset_info": {"test_case": "mixed_data_types", "description": "JSON object with various data types including strings, numbers, booleans, null, arrays, and objects", "expected_schema": {"type": "object", "properties": {"string_field": {"type": "string"}, "number_field": {"type": "number"}, "boolean_field": {"type": "boolean"}, "null_field": {"type": "null"}, "array_field": {"type": "array", "items": {"type": "number"}}, "object_field": {"type": "object", "properties": {"nested_string": {"type": "string"}, "nested_number": {"type": "number"}}}, "date_field": {"type": "string"}, "enum_field": {"type": "string"}}}}}}
37 changes: 37 additions & 0 deletions tests/pytest/test_pytest_json_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import json
from typing import Any, Dict, List
from eval_protocol.models import EvaluationRow
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
from eval_protocol.rewards.json_schema import json_schema_reward


def json_schema_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
"""
Convert a json schema row to an evaluation row.
"""
dataset: List[EvaluationRow] = []
for row in rows:
dataset.append(
EvaluationRow(
messages=row["messages"][:1],
ground_truth=row["ground_truth"],
input_metadata=row["input_metadata"],
)
)
return dataset


@evaluation_test(
input_dataset=["tests/pytest/data/json_schema.jsonl"],
model=["accounts/fireworks/models/kimi-k2-instruct"],
mode="pointwise",
rollout_processor=default_single_turn_rollout_processor,
dataset_adapter=json_schema_to_evaluation_row,
)
async def test_pytest_function_calling(row: EvaluationRow) -> EvaluationRow:
"""Run pointwise evaluation on sample dataset using pytest interface."""
expected_schema = row.input_metadata.dataset_info["expected_schema"]
result = json_schema_reward(row.messages, expected_schema=expected_schema)
row.evaluation_result = result
print(row.evaluation_result)
return row
Loading