eval-protocol · dphuang2 · Aug 4, 2025 · Aug 4, 2025
diff --git a/tests/pytest/data/json_schema.jsonl b/tests/pytest/data/json_schema.jsonl
@@ -0,0 +1,12 @@
+{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n  \"user\": {\n    \"name\": \"string\",\n    \"age\": \"number\",\n    \"isActive\": \"boolean\"\n  },\n  \"products\": [\n    {\n      \"id\": \"string\",\n      \"price\": \"number\"\n    }\n  ]\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n  \"user\": {\n    \"name\": \"John Doe\",\n    \"age\": 30,\n    \"isActive\": true\n  },\n  \"products\": [\n    {\n      \"id\": \"prod-1\",\n      \"price\": 29.99\n    }\n  ]\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 1.0, "reason": "Perfect schema match", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "exact_match", "description": "JSON content that exactly matches the expected schema", "expected_schema": {"type": "object", "properties": {"user": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}, "isActive": {"type": "boolean"}}}, "products": {"type": "array", "items": {"type": "object", "properties": {"id": {"type": "string"}, "price": {"type": "number"}}}}}}}}}
+{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n  \"user\": {\n    \"name\": \"string\",\n    \"age\": \"number\",\n    \"isActive\": \"boolean\",\n    \"address\": {\n      \"street\": \"string\",\n      \"city\": \"string\"\n    }\n  }\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n  \"user\": {\n    \"name\": \"John Doe\",\n    \"age\": 30,\n    \"email\": \"john@example.com\"\n  }\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.5, "reason": "Partial schema match with missing and extra properties", "metrics": {"schema_similarity": {"score": 0.5, "reason": "Schema similarity: 0.50", "is_score_valid": false}}}, "input_metadata": {"dataset_info": {"test_case": "partial_match", "description": "JSON content with missing and extra properties compared to expected schema", "expected_schema": {"type": "object", "properties": {"user": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}, "isActive": {"type": "boolean"}, "address": {"type": "object", "properties": {"street": {"type": "string"}, "city": {"type": "string"}}}}}}}}}}
+{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n  \"name\": \"string\",\n  \"age\": \"number\"\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n  \"name\": \"John Doe\",\n  \"age\": 30\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 1.0, "reason": "Perfect schema match for simple object", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "extract_from_message", "description": "JSON extracted from message content with code block formatting", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}}}}}}
+{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n  \"name\": \"string\",\n  \"age\": \"number\",\n  \"tags\": [\"string\"]\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n  \"name\": \"John Doe\",\n  \"age\": \"30\",\n  \"tags\": {\n    \"tag1\": \"value1\"\n  }\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.33, "reason": "Type mismatches in schema validation", "metrics": {"schema_similarity": {"score": 0.33, "reason": "Schema similarity: 0.33", "is_score_valid": false}}}, "input_metadata": {"dataset_info": {"test_case": "mismatched_types", "description": "JSON content with type mismatches (string instead of number, object instead of array)", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}, "tags": {"type": "array", "items": {"type": "string"}}}}}}}
+{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n  \"name\": \"string\",\n  \"age\": \"number\"\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n  \"name\": \"John Doe\",\n  \"age\": 30\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 1.0, "reason": "Perfect schema match using JSON string input", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "json_string_input", "description": "JSON schema validation using JSON string instead of object", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}}}}}}
+{"messages": [{"role": "user", "content": "Give me an empty object in the following JSON format:\n```json\n{}\n```"}, {"role": "assistant", "content": "Here's an empty object:\n```json\n{}\n```"}], "ground_truth": null, "evaluation_result": {"score": 1.0, "reason": "Perfect match for empty object schema", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "empty_properties", "description": "JSON schema validation with empty object properties", "expected_schema": {"type": "object", "properties": {}}}}}
+{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n  \"name\": \"string\",\n  \"age\": \"number\"\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n  \"name\": \"John Doe\",\n  \"age\": \n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.0, "reason": "Invalid JSON content", "metrics": {"error": {"score": 0.0, "reason": "Invalid JSON content: Here's the user information:\n```json\n{\n  \"name\": \"John Doe\",\n  \"age\": \n}\n```", "is_score_valid": false}}}, "input_metadata": {"dataset_info": {"test_case": "invalid_json", "description": "JSON schema validation with invalid JSON syntax", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}}}}}}
+{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n  \"name\": \"string\",\n  \"age\": \"number\"\n}\n```\n\nPlease provide a user object with name and age."}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n  \"name\": \"John Doe\",\n  \"age\": 30\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.925, "reason": "Combined schema and LLM evaluation", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00\nMatching properties (2):\n  - age: number\n  - name: string", "is_score_valid": true}, "llm_judge": {"score": 0.85, "reason": "This is a good JSON structure that matches the expected schema. The data is well-formatted and contains the required fields with appropriate types.", "is_score_valid": false}, "schema_score": {"score": 1.0, "reason": "Schema validation score: 1.00", "is_score_valid": true}, "llm_score": {"score": 0.85, "reason": "LLM judge score: 0.85", "is_score_valid": false}, "weights": {"score": 0.0, "reason": "Weights used - Schema: 0.7, LLM: 0.3", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "llm_judge_integration", "description": "Combined schema validation and LLM judgment evaluation", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}}}, "expected_behavior": "Provide a user object with name and age"}}}
+{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n  \"name\": \"string\",\n  \"age\": \"number\"\n}\n```\n\nPlease provide a user object with name and age."}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n  \"name\": \"John Doe\",\n  \"age\": 30\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.72, "reason": "Combined evaluation with custom weights", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00\nMatching properties (2):\n  - age: number\n  - name: string", "is_score_valid": true}, "llm_judge": {"score": 0.6, "reason": "This JSON is acceptable but has some issues. The structure is correct but could be more comprehensive.", "is_score_valid": false}, "schema_score": {"score": 1.0, "reason": "Schema validation score: 1.00", "is_score_valid": true}, "llm_score": {"score": 0.6, "reason": "LLM judge score: 0.6", "is_score_valid": false}, "weights": {"score": 0.0, "reason": "Weights used - Schema: 0.3, LLM: 0.7", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "custom_weights", "description": "Combined evaluation with custom weights (schema=0.3, llm=0.7)", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}}}, "expected_behavior": "Provide a user object with name and age"}}}
+{"messages": [{"role": "user", "content": "Create a complex nested object in the following JSON format:\n```json\n{\n  \"company\": {\n    \"name\": \"string\",\n    \"employees\": [\n      {\n        \"id\": \"string\",\n        \"name\": \"string\",\n        \"department\": {\n          \"name\": \"string\",\n          \"location\": \"string\",\n          \"code\": \"string\"\n        },\n        \"salary\": \"number\",\n        \"skills\": [\"string\"]\n      }\n    ],\n    \"metadata\": {\n      \"founded\": \"number\",\n      \"isPublic\": \"boolean\",\n      \"tags\": [\"string\"]\n    }\n  }\n}\n```"}, {"role": "assistant", "content": "Here's a complex nested object:\n```json\n{\n  \"company\": {\n    \"name\": \"Tech Corp\",\n    \"employees\": [\n      {\n        \"id\": \"emp-001\",\n        \"name\": \"Alice Johnson\",\n        \"department\": {\n          \"name\": \"Engineering\",\n          \"location\": \"Building A\",\n          \"extra_field\": \"some value\"\n        },\n        \"skills\": [\"Python\", \"JavaScript\", \"SQL\"]\n      },\n      {\n        \"id\": \"emp-002\",\n        \"name\": \"Bob Smith\",\n        \"department\": {\n          \"name\": \"Marketing\",\n          \"location\": \"Building B\"\n        },\n        \"skills\": [\"Content Writing\", \"SEO\"]\n      }\n    ],\n    \"metadata\": {\n      \"founded\": 2010,\n      \"isPublic\": false,\n      \"tags\": [\"technology\", \"startup\"]\n    }\n  }\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.8, "reason": "Complex nested schema with some mismatches", "metrics": {"schema_similarity": {"score": 0.8, "reason": "Schema similarity: 0.80", "is_score_valid": false}}}, "input_metadata": {"dataset_info": {"test_case": "complex_nested", "description": "Complex nested JSON object with arrays and objects", "expected_schema": {"type": "object", "properties": {"company": {"type": "object", "properties": {"name": {"type": "string"}, "employees": {"type": "array", "items": {"type": "object", "properties": {"id": {"type": "string"}, "name": {"type": "string"}, "department": {"type": "object", "properties": {"name": {"type": "string"}, "location": {"type": "string"}, "code": {"type": "string"}}}, "salary": {"type": "number"}, "skills": {"type": "array", "items": {"type": "string"}}}}}}}, "metadata": {"type": "object", "properties": {"founded": {"type": "number"}, "isPublic": {"type": "boolean"}, "tags": {"type": "array", "items": {"type": "string"}}}}}}}}}
+{"messages": [{"role": "user", "content": "Provide an array of simple objects in the following JSON format:\n```json\n[\n  {\n    \"id\": \"number\",\n    \"name\": \"string\",\n    \"active\": \"boolean\"\n  }\n]\n```"}, {"role": "assistant", "content": "Here's an array of simple objects:\n```json\n[\n  {\n    \"id\": 1,\n    \"name\": \"Item One\",\n    \"active\": true\n  },\n  {\n    \"id\": 2,\n    \"name\": \"Item Two\",\n    \"active\": false\n  },\n  {\n    \"id\": 3,\n    \"name\": \"Item Three\",\n    \"active\": true\n  }\n]\n```"}], "ground_truth": null, "evaluation_result": {"score": 1.0, "reason": "Perfect array schema match", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "array_of_objects", "description": "JSON array containing simple objects with consistent structure", "expected_schema": {"type": "array", "items": {"type": "object", "properties": {"id": {"type": "number"}, "name": {"type": "string"}, "active": {"type": "boolean"}}}}}}}
+{"messages": [{"role": "user", "content": "Give me a mixed data structure in the following JSON format:\n```json\n{\n  \"string_field\": \"string\",\n  \"number_field\": \"number\",\n  \"boolean_field\": \"boolean\",\n  \"null_field\": \"null\",\n  \"array_field\": [\"number\"],\n  \"object_field\": {\n    \"nested_string\": \"string\",\n    \"nested_number\": \"number\"\n  },\n  \"date_field\": \"string\",\n  \"enum_field\": \"string\"\n}\n```"}, {"role": "assistant", "content": "Here's a mixed data structure:\n```json\n{\n  \"string_field\": \"Hello World\",\n  \"number_field\": 42,\n  \"boolean_field\": true,\n  \"null_field\": null,\n  \"array_field\": [1, 2, 3, 4, 5],\n  \"object_field\": {\n    \"nested_string\": \"Nested Value\",\n    \"nested_number\": 123.45\n  },\n  \"extra_field\": \"unexpected field\"\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.75, "reason": "Mixed data types with some schema mismatches", "metrics": {"schema_similarity": {"score": 0.75, "reason": "Schema similarity: 0.75", "is_score_valid": false}}}, "input_metadata": {"dataset_info": {"test_case": "mixed_data_types", "description": "JSON object with various data types including strings, numbers, booleans, null, arrays, and objects", "expected_schema": {"type": "object", "properties": {"string_field": {"type": "string"}, "number_field": {"type": "number"}, "boolean_field": {"type": "boolean"}, "null_field": {"type": "null"}, "array_field": {"type": "array", "items": {"type": "number"}}, "object_field": {"type": "object", "properties": {"nested_string": {"type": "string"}, "nested_number": {"type": "number"}}}, "date_field": {"type": "string"}, "enum_field": {"type": "string"}}}}}}
diff --git a/tests/pytest/test_pytest_json_schema.py b/tests/pytest/test_pytest_json_schema.py
@@ -0,0 +1,37 @@
+import json
+from typing import Any, Dict, List
+from eval_protocol.models import EvaluationRow
+from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
+from eval_protocol.rewards.json_schema import json_schema_reward
+
+
+def json_schema_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
+    """
+    Convert a json schema row to an evaluation row.
+    """
+    dataset: List[EvaluationRow] = []
+    for row in rows:
+        dataset.append(
+            EvaluationRow(
+                messages=row["messages"][:1],
+                ground_truth=row["ground_truth"],
+                input_metadata=row["input_metadata"],
+            )
+        )
+    return dataset
+
+
+@evaluation_test(
+    input_dataset=["tests/pytest/data/json_schema.jsonl"],
+    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    mode="pointwise",
+    rollout_processor=default_single_turn_rollout_processor,
+    dataset_adapter=json_schema_to_evaluation_row,
+)
+async def test_pytest_function_calling(row: EvaluationRow) -> EvaluationRow:
+    """Run pointwise evaluation on sample dataset using pytest interface."""
+    expected_schema = row.input_metadata.dataset_info["expected_schema"]
+    result = json_schema_reward(row.messages, expected_schema=expected_schema)
+    row.evaluation_result = result
+    print(row.evaluation_result)
+    return row