From 19d491a968674994a58b79e8cd8a4b658a5adba5 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 17 Sep 2025 01:45:13 -0700 Subject: [PATCH 1/5] Various small refactors --- eval_protocol/adapters/__init__.py | 14 ++- eval_protocol/adapters/base.py | 21 ++++ eval_protocol/adapters/bigquery.py | 3 +- eval_protocol/adapters/braintrust.py | 46 ++++++++- eval_protocol/adapters/huggingface.py | 3 +- eval_protocol/adapters/langfuse.py | 3 +- eval_protocol/adapters/langsmith.py | 3 +- eval_protocol/adapters/openai_responses.py | 3 +- eval_protocol/quickstart/llm_judge.py | 61 +++--------- .../quickstart/llm_judge_braintrust.py | 95 ++----------------- .../quickstart/llm_judge_langfuse.py | 50 ++++++++++ 11 files changed, 153 insertions(+), 149 deletions(-) create mode 100644 eval_protocol/adapters/base.py create mode 100644 eval_protocol/quickstart/llm_judge_langfuse.py diff --git a/eval_protocol/adapters/__init__.py b/eval_protocol/adapters/__init__.py index 57757901..5944c131 100644 --- a/eval_protocol/adapters/__init__.py +++ b/eval_protocol/adapters/__init__.py @@ -4,6 +4,7 @@ and converting them to EvaluationRow format for use in evaluation pipelines. Available adapters: +- BaseAdapter: Abstract base class for all adapters - LangfuseAdapter: Pull data from Langfuse deployments - HuggingFaceAdapter: Load datasets from HuggingFace Hub - BigQueryAdapter: Query data from Google BigQuery @@ -11,13 +12,18 @@ - TRL integration (legacy) """ +# Always available +from .base import BaseAdapter + +__all__ = ["BaseAdapter"] + # Conditional imports based on available dependencies try: from .langfuse import LangfuseAdapter, create_langfuse_adapter - __all__ = ["LangfuseAdapter", "create_langfuse_adapter"] + __all__.extend(["LangfuseAdapter", "create_langfuse_adapter"]) except ImportError: - __all__ = [] + pass try: from .huggingface import ( @@ -55,9 +61,9 @@ # Legacy adapters (always available) try: - from .braintrust import reward_fn_to_scorer, scorer_to_reward_fn + from .braintrust import BraintrustAdapter, create_braintrust_adapter, reward_fn_to_scorer, scorer_to_reward_fn - __all__.extend(["scorer_to_reward_fn", "reward_fn_to_scorer"]) + __all__.extend(["BraintrustAdapter", "create_braintrust_adapter", "scorer_to_reward_fn", "reward_fn_to_scorer"]) except ImportError: pass diff --git a/eval_protocol/adapters/base.py b/eval_protocol/adapters/base.py new file mode 100644 index 00000000..def6d85c --- /dev/null +++ b/eval_protocol/adapters/base.py @@ -0,0 +1,21 @@ +""" +Base adapter interface for Eval Protocol. +""" + +from abc import ABC, abstractmethod +from typing import List + +from eval_protocol.models import EvaluationRow + + +class BaseAdapter(ABC): + """Abstract base class for all Eval Protocol adapters.""" + + @abstractmethod + def get_evaluation_rows(self, *args, **kwargs) -> List[EvaluationRow]: + """Get evaluation rows from the data source.""" + pass + + def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None: + """Push evaluation scores back to the data source for tracking and analysis.""" + pass diff --git a/eval_protocol/adapters/bigquery.py b/eval_protocol/adapters/bigquery.py index 9831e748..9446b5a8 100644 --- a/eval_protocol/adapters/bigquery.py +++ b/eval_protocol/adapters/bigquery.py @@ -10,6 +10,7 @@ from typing import Any, Callable, Dict, Iterator, List, Optional, TypeAlias from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message +from .base import BaseAdapter logger = logging.getLogger(__name__) @@ -42,7 +43,7 @@ TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]] -class BigQueryAdapter: +class BigQueryAdapter(BaseAdapter): """Adapter to query data from Google BigQuery and convert to EvaluationRow format. This adapter connects to Google BigQuery, executes SQL queries, and applies diff --git a/eval_protocol/adapters/braintrust.py b/eval_protocol/adapters/braintrust.py index 979d4d52..665748b9 100644 --- a/eval_protocol/adapters/braintrust.py +++ b/eval_protocol/adapters/braintrust.py @@ -14,6 +14,7 @@ import requests from eval_protocol.models import EvaluationRow, InputMetadata, Message +from .base import BaseAdapter from .utils import extract_messages_from_data # Keep backward compatibility @@ -128,7 +129,7 @@ def extract_messages_from_trace(trace: Dict[str, Any], include_tool_calls: bool return messages -class BraintrustAdapter: +class BraintrustAdapter(BaseAdapter): """Adapter to pull data from Braintrust and convert to EvaluationRow format. This adapter can pull both chat conversations and tool calling traces from @@ -223,6 +224,49 @@ def get_evaluation_rows( logger.info("Successfully processed %d BTQL results into %d evaluation rows", len(all_traces), len(eval_rows)) return eval_rows + def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None: + """Push evaluation scores back to Braintrust traces for tracking and analysis. + + Creates score entries in Braintrust for each unique trace_id found in the evaluation + rows' session data. This allows you to see evaluation results directly in the + Braintrust UI alongside the original traces. + + Args: + rows: List of EvaluationRow objects with session_data containing trace IDs + model_name: Name of the model (used as the score name in Braintrust) + mean_score: The calculated mean score to push to Braintrust + + Note: + Silently handles errors if rows lack session data + """ + try: + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + + feedback_items = [] + for trace_id in set( + row.input_metadata.session_data["braintrust_trace_id"] + for row in rows + if row.evaluation_result and row.input_metadata and row.input_metadata.session_data + ): + if trace_id: + feedback_items.append({"id": trace_id, "scores": {model_name: mean_score}}) + + if feedback_items: + payload = {"feedback": feedback_items} + + response = requests.post( + f"{self.api_url}/v1/project_logs/{self.project_id}/feedback", + headers=headers, + json=payload, + ) + response.raise_for_status() + + except Exception as e: + logger.warning("Failed to push scores to Braintrust: %s", e) + def create_braintrust_adapter( api_key: Optional[str] = None, diff --git a/eval_protocol/adapters/huggingface.py b/eval_protocol/adapters/huggingface.py index 7f8b6902..1f740ba2 100644 --- a/eval_protocol/adapters/huggingface.py +++ b/eval_protocol/adapters/huggingface.py @@ -8,6 +8,7 @@ from typing import Any, Callable, Dict, Iterator, List, Optional from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message +from .base import BaseAdapter logger = logging.getLogger(__name__) @@ -23,7 +24,7 @@ TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]] -class HuggingFaceAdapter: +class HuggingFaceAdapter(BaseAdapter): """Generic adapter to load HuggingFace datasets with custom transformations. This adapter loads datasets from HuggingFace Hub and applies a user-provided diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py index 115448dd..040f02d9 100644 --- a/eval_protocol/adapters/langfuse.py +++ b/eval_protocol/adapters/langfuse.py @@ -12,6 +12,7 @@ from typing import Any, Dict, List, Optional, Protocol from eval_protocol.models import EvaluationRow, InputMetadata, Message +from .base import BaseAdapter from .utils import extract_messages_from_data logger = logging.getLogger(__name__) @@ -188,7 +189,7 @@ def get_final_generation_in_span(trace: TraceWithFullDetails, span_name: str) -> return generations[-1] -class LangfuseAdapter: +class LangfuseAdapter(BaseAdapter): """Adapter to pull data from Langfuse and convert to EvaluationRow format. This adapter can pull both chat conversations and tool calling traces from diff --git a/eval_protocol/adapters/langsmith.py b/eval_protocol/adapters/langsmith.py index 1d29b66a..fc1daf71 100644 --- a/eval_protocol/adapters/langsmith.py +++ b/eval_protocol/adapters/langsmith.py @@ -13,6 +13,7 @@ from typing import Any, Dict, List, Optional, Iterable from eval_protocol.models import EvaluationRow, InputMetadata, Message +from .base import BaseAdapter logger = logging.getLogger(__name__) @@ -24,7 +25,7 @@ LANGSMITH_AVAILABLE = False -class LangSmithAdapter: +class LangSmithAdapter(BaseAdapter): """Adapter to pull data from LangSmith and convert to EvaluationRow format. By default, fetches root runs from a project and maps inputs/outputs into diff --git a/eval_protocol/adapters/openai_responses.py b/eval_protocol/adapters/openai_responses.py index 8380ce06..1d4c03e5 100644 --- a/eval_protocol/adapters/openai_responses.py +++ b/eval_protocol/adapters/openai_responses.py @@ -21,6 +21,7 @@ from openai.types.responses.tool import Tool from eval_protocol.models import EvaluationRow, InputMetadata, Message +from .base import BaseAdapter logger = logging.getLogger(__name__) @@ -28,7 +29,7 @@ from openai import OpenAI -class OpenAIResponsesAdapter: +class OpenAIResponsesAdapter(BaseAdapter): """Adapter to pull data from OpenAI Responses API and convert to EvaluationRow format. This adapter can pull both chat conversations and tool calling traces from diff --git a/eval_protocol/quickstart/llm_judge.py b/eval_protocol/quickstart/llm_judge.py index 7cd03647..30f4d86e 100644 --- a/eval_protocol/quickstart/llm_judge.py +++ b/eval_protocol/quickstart/llm_judge.py @@ -2,65 +2,23 @@ Default LLM judge for Eval Protocol. Inspired by Arena-Hard-Auto. """ -from collections.abc import Awaitable, Callable -import os -from datetime import datetime -from typing import List, Dict, Any, Optional -from typing_extensions import cast from tqdm import tqdm +from typing import Optional -import pytest - -from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult -from eval_protocol.pytest import evaluation_test -from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor +from eval_protocol.models import EvaluationRow +from eval_protocol.adapters.base import BaseAdapter from eval_protocol.quickstart.utils import ( - split_multi_turn_rows, JUDGE_CONFIGS, calculate_bootstrap_scores, run_judgment_async, ) import asyncio from openai import AsyncOpenAI -from eval_protocol.adapters.langfuse import create_langfuse_adapter - -adapter = create_langfuse_adapter() - - -@pytest.mark.asyncio -@evaluation_test( - input_rows=[ - adapter.get_evaluation_rows( - to_timestamp=datetime(2025, 9, 12, 0, 11, 18), - limit=711, - sample_size=50, - sleep_between_gets=3.0, - max_retries=5, - ) - ], - completion_params=[ - {"model": "gpt-4.1"}, - { - "max_tokens": 131000, - "extra_body": {"reasoning_effort": "medium"}, - "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", - }, - { - "max_tokens": 131000, - "extra_body": {"reasoning_effort": "low"}, - "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b", - }, - ], - rollout_processor=SingleTurnRolloutProcessor(), - preprocess_fn=split_multi_turn_rows, - max_concurrent_rollouts=64, - mode="all", -) -async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]: - return await aha_judge(rows) -async def aha_judge(rows: list[EvaluationRow], judge_name: str = "gemini-2.5-pro") -> list[EvaluationRow]: +async def aha_judge( + rows: list[EvaluationRow], judge_name: str = "gemini-2.5-pro", adapter: Optional[BaseAdapter] = None +) -> list[EvaluationRow]: """ LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons. @@ -73,6 +31,8 @@ async def aha_judge(rows: list[EvaluationRow], judge_name: str = "gemini-2.5-pro Args: rows: List of EvaluationRow objects with messages, ground_truth, and tools + judge_name: Name of the judge configuration to use + adapter: Optional adapter to push scores back to (if provided) Returns: Same rows with updated evaluation_result containing scores and judgments @@ -133,7 +93,8 @@ async def run_judgment(row): if row.evaluation_result: row.evaluation_result.score = mean_score - # Optional, push scores back to Langfuse. Note that one score per model will be pushed back onto same trace. - adapter.push_scores(rows, model_name, mean_score) + # Push scores back to adapter if provided. Note that one score per model will be pushed back onto same trace. + if adapter: + adapter.push_scores(rows, model_name, mean_score) return rows diff --git a/eval_protocol/quickstart/llm_judge_braintrust.py b/eval_protocol/quickstart/llm_judge_braintrust.py index a1902cf7..71b51b0a 100644 --- a/eval_protocol/quickstart/llm_judge_braintrust.py +++ b/eval_protocol/quickstart/llm_judge_braintrust.py @@ -1,26 +1,17 @@ """ -Default LLM judge for Eval Protocol using Braintrust. Inspired by Arena-Hard-Auto. +Example for using Braintrust with the aha judge. """ import os -from datetime import datetime -from typing import List, Dict, Any, Optional -from tqdm import tqdm import pytest -from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult +from eval_protocol.models import EvaluationRow from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor -from eval_protocol.quickstart.utils import ( - split_multi_turn_rows, - JUDGE_CONFIGS, - calculate_bootstrap_scores, - run_judgment_async, -) -import asyncio -from openai import AsyncOpenAI +from eval_protocol.quickstart.utils import split_multi_turn_rows from eval_protocol.adapters.braintrust import create_braintrust_adapter +from eval_protocol.quickstart import aha_judge adapter = create_braintrust_adapter() @@ -33,7 +24,7 @@ select: * from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces filter: is_root = true -limit: 5 +limit: 10 """ ) ], @@ -56,78 +47,4 @@ mode="all", ) async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]: - """ - LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons. - - Compares model responses against ground truth using an LLM judge. For each row: - 1. Extracts the question from messages[:-1] - 2. Compares messages[-1] (new model response) vs ground_truth (baseline response) - 3. Runs two judgment rounds (A vs B, B vs A) to reduce position bias - 4. Calculates bootstrap scores across all comparisons - 5. Updates evaluation_result with final scores and confidence intervals - - Args: - rows: List of EvaluationRow objects with messages, ground_truth, and tools - - Returns: - Same rows with updated evaluation_result containing scores and judgments - """ - - judge_name = "gemini-2.5-pro" # Edit to which judge you'd like to use. Configs are in utils.py. - - if not rows: - print("❌ No evaluation rows provided") - return rows - - print(f"🔄 Processing {len(rows)} evaluation rows for LLM judging...") - - model_name = rows[0].input_metadata.completion_params.get("model", "unknown_model") - - judgments = [] - max_concurrency = JUDGE_CONFIGS[judge_name]["max_concurrency"] - - judge_config = JUDGE_CONFIGS[judge_name] - - async with AsyncOpenAI( - api_key=judge_config.get("api_key"), base_url=judge_config.get("base_url") - ) as shared_client: - semaphore = asyncio.Semaphore(max_concurrency) - - async def run_judgment(row): - async with semaphore: - return await run_judgment_async(row, model_name, judge_name, shared_client) - - tasks = [run_judgment(row) for row in rows] - - for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Generating judgments"): - result = await coro - if result and result["games"][0] and result["games"][1]: - judgments.append(result) - - if not judgments: - print("❌ No valid judgments generated") - return rows - - print(f"✅ Generated {len(judgments)} valid judgments") - - # Calculate bootstrap scores - result = calculate_bootstrap_scores(judgments) - if not result: - print("❌ No valid scores extracted") - return rows - - mean_score, lower_score, upper_score = result - - # Print leaderboard - print("\n##### LLM Judge Results (90th percentile CI) #####") - - clean_model_name = model_name.split("/")[-1] # Clean model name - - print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})") - print("original: 50.0% (CI: 50.0% - 50.0%)") - - for row in rows: - if row.evaluation_result: - row.evaluation_result.score = mean_score - - return rows + return await aha_judge(rows) diff --git a/eval_protocol/quickstart/llm_judge_langfuse.py b/eval_protocol/quickstart/llm_judge_langfuse.py new file mode 100644 index 00000000..08f72c67 --- /dev/null +++ b/eval_protocol/quickstart/llm_judge_langfuse.py @@ -0,0 +1,50 @@ +""" +Example for using Langfuse with the aha judge. +""" + +from datetime import datetime + +import pytest + +from eval_protocol.models import EvaluationRow +from eval_protocol.pytest import evaluation_test +from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor +from eval_protocol.quickstart.utils import split_multi_turn_rows + +from eval_protocol.adapters.langfuse import create_langfuse_adapter +from eval_protocol.quickstart import aha_judge + +adapter = create_langfuse_adapter() + + +@pytest.mark.asyncio +@evaluation_test( + input_rows=[ + adapter.get_evaluation_rows( + to_timestamp=datetime(2025, 9, 12, 0, 11, 18), + limit=711, + sample_size=50, + sleep_between_gets=3.0, + max_retries=5, + ) + ], + completion_params=[ + {"model": "gpt-4.1"}, + { + "max_tokens": 131000, + "extra_body": {"reasoning_effort": "medium"}, + "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", + }, + { + "max_tokens": 131000, + "extra_body": {"reasoning_effort": "low"}, + "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b", + }, + ], + rollout_processor=SingleTurnRolloutProcessor(), + preprocess_fn=split_multi_turn_rows, + max_concurrent_rollouts=64, + mode="all", +) +async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]: + return await aha_judge(rows) From 3dc99358b35fc3971a78d8047de2113692f766f7 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 17 Sep 2025 10:14:14 -0700 Subject: [PATCH 2/5] change name --- eval_protocol/adapters/base.py | 4 ++-- eval_protocol/adapters/braintrust.py | 4 ++-- eval_protocol/adapters/langfuse.py | 4 ++-- eval_protocol/quickstart/llm_judge.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/eval_protocol/adapters/base.py b/eval_protocol/adapters/base.py index def6d85c..6009b8e1 100644 --- a/eval_protocol/adapters/base.py +++ b/eval_protocol/adapters/base.py @@ -16,6 +16,6 @@ def get_evaluation_rows(self, *args, **kwargs) -> List[EvaluationRow]: """Get evaluation rows from the data source.""" pass - def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None: - """Push evaluation scores back to the data source for tracking and analysis.""" + def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None: + """Upload evaluation scores back to the data source for tracking and analysis.""" pass diff --git a/eval_protocol/adapters/braintrust.py b/eval_protocol/adapters/braintrust.py index 665748b9..d419052b 100644 --- a/eval_protocol/adapters/braintrust.py +++ b/eval_protocol/adapters/braintrust.py @@ -224,8 +224,8 @@ def get_evaluation_rows( logger.info("Successfully processed %d BTQL results into %d evaluation rows", len(all_traces), len(eval_rows)) return eval_rows - def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None: - """Push evaluation scores back to Braintrust traces for tracking and analysis. + def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None: + """Upload evaluation scores back to Braintrust traces for tracking and analysis. Creates score entries in Braintrust for each unique trace_id found in the evaluation rows' session data. This allows you to see evaluation results directly in the diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py index 040f02d9..6d057372 100644 --- a/eval_protocol/adapters/langfuse.py +++ b/eval_protocol/adapters/langfuse.py @@ -434,8 +434,8 @@ def get_evaluation_rows_by_ids( continue return eval_rows - def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None: - """Push evaluation scores back to Langfuse traces for tracking and analysis. + def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None: + """Upload evaluation scores back to Langfuse traces for tracking and analysis. Creates a score entry in Langfuse for each unique trace_id found in the evaluation rows' session data. This allows you to see evaluation results directly in the diff --git a/eval_protocol/quickstart/llm_judge.py b/eval_protocol/quickstart/llm_judge.py index 30f4d86e..42255a49 100644 --- a/eval_protocol/quickstart/llm_judge.py +++ b/eval_protocol/quickstart/llm_judge.py @@ -95,6 +95,6 @@ async def run_judgment(row): # Push scores back to adapter if provided. Note that one score per model will be pushed back onto same trace. if adapter: - adapter.push_scores(rows, model_name, mean_score) + adapter.upload_scores(rows, model_name, mean_score) return rows From f62f0ad503bd235b97509dea70e52950354e6ba9 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 17 Sep 2025 10:18:10 -0700 Subject: [PATCH 3/5] remove old braintrust integration --- docs/integrations/braintrust_integration.mdx | 49 --------------- eval_protocol/__init__.py | 4 -- eval_protocol/adapters/__init__.py | 4 +- eval_protocol/adapters/braintrust.py | 5 +- eval_protocol/integrations/__init__.py | 3 - eval_protocol/integrations/braintrust.py | 54 ---------------- examples/braintrust_example/README.md | 24 ------- .../conf/simple_braintrust_eval.yaml | 63 ------------------- examples/braintrust_example/main.py | 20 ------ examples/braintrust_integration.py | 26 -------- tests/test_braintrust_adapter.py | 34 ---------- tests/test_braintrust_example.py | 49 --------------- tests/test_eval_protocol_import.py | 8 --- tests/test_reward_protocol_import.py | 8 --- 14 files changed, 3 insertions(+), 348 deletions(-) delete mode 100644 docs/integrations/braintrust_integration.mdx delete mode 100644 eval_protocol/integrations/braintrust.py delete mode 100644 examples/braintrust_example/README.md delete mode 100644 examples/braintrust_example/conf/simple_braintrust_eval.yaml delete mode 100644 examples/braintrust_example/main.py delete mode 100644 examples/braintrust_integration.py delete mode 100644 tests/test_braintrust_adapter.py delete mode 100644 tests/test_braintrust_example.py diff --git a/docs/integrations/braintrust_integration.mdx b/docs/integrations/braintrust_integration.mdx deleted file mode 100644 index 43e4b90c..00000000 --- a/docs/integrations/braintrust_integration.mdx +++ /dev/null @@ -1,49 +0,0 @@ -# Integrating with Braintrust - -This guide explains how to bridge Eval Protocol with [Braintrust](https://braintrust.dev/). You can log Eval Protocol evaluations to Braintrust or reuse Braintrust scorers as Eval Protocol reward functions. - -## Installation - -Install the Braintrust SDK in your environment: - -```bash -pip install braintrust -``` - -## Using a Braintrust scorer in Eval Protocol - -Convert a Braintrust-style scorer to an Eval Protocol reward function using `scorer_to_reward_fn`: - -```python -from braintrust import Eval -from eval_protocol.integrations.braintrust import scorer_to_reward_fn - - -def equality_scorer(input: str, output: str, expected: str) -> float: - return 1.0 if output == expected else 0.0 - -reward_fn = scorer_to_reward_fn(equality_scorer) - - -def hi_bot_task(name: str) -> str: - return "Hi " + name - - -Eval( - "Eval Protocol Braintrust Example", - data=lambda: [ - {"input": "Foo", "expected": "Hi Foo"}, - {"input": "Bar", "expected": "Hello Bar"}, - ], - task=hi_bot_task, - scores=[reward_fn], -) -``` - -Run the script with your Braintrust API key: - -```bash -BRAINTRUST_API_KEY= braintrust eval examples/braintrust_integration.py -``` - -This will create an experiment in Braintrust where you can inspect the scores, outputs and metadata. diff --git a/eval_protocol/__init__.py b/eval_protocol/__init__.py index 5939896f..f39369b2 100644 --- a/eval_protocol/__init__.py +++ b/eval_protocol/__init__.py @@ -10,8 +10,6 @@ import warnings -from eval_protocol.adapters.braintrust import reward_fn_to_scorer, scorer_to_reward_fn - from .auth import get_fireworks_account_id, get_fireworks_api_key from .common_utils import load_jsonl from .config import RewardKitConfig, get_config, load_config @@ -49,8 +47,6 @@ "EvaluateResult", "reward_function", "RewardFunction", - "scorer_to_reward_fn", - "reward_fn_to_scorer", # Authentication "get_fireworks_api_key", "get_fireworks_account_id", diff --git a/eval_protocol/adapters/__init__.py b/eval_protocol/adapters/__init__.py index 5944c131..fef59a3a 100644 --- a/eval_protocol/adapters/__init__.py +++ b/eval_protocol/adapters/__init__.py @@ -61,9 +61,9 @@ # Legacy adapters (always available) try: - from .braintrust import BraintrustAdapter, create_braintrust_adapter, reward_fn_to_scorer, scorer_to_reward_fn + from .braintrust import BraintrustAdapter, create_braintrust_adapter - __all__.extend(["BraintrustAdapter", "create_braintrust_adapter", "scorer_to_reward_fn", "reward_fn_to_scorer"]) + __all__.extend(["BraintrustAdapter", "create_braintrust_adapter"]) except ImportError: pass diff --git a/eval_protocol/adapters/braintrust.py b/eval_protocol/adapters/braintrust.py index d419052b..2007f322 100644 --- a/eval_protocol/adapters/braintrust.py +++ b/eval_protocol/adapters/braintrust.py @@ -17,9 +17,6 @@ from .base import BaseAdapter from .utils import extract_messages_from_data -# Keep backward compatibility -from ..integrations.braintrust import reward_fn_to_scorer, scorer_to_reward_fn - logger = logging.getLogger(__name__) @@ -281,4 +278,4 @@ def create_braintrust_adapter( ) -__all__ = ["scorer_to_reward_fn", "reward_fn_to_scorer", "BraintrustAdapter", "create_braintrust_adapter"] +__all__ = ["BraintrustAdapter", "create_braintrust_adapter"] diff --git a/eval_protocol/integrations/__init__.py b/eval_protocol/integrations/__init__.py index 0a037738..f85283cf 100644 --- a/eval_protocol/integrations/__init__.py +++ b/eval_protocol/integrations/__init__.py @@ -1,12 +1,9 @@ """Integration helpers for Eval Protocol.""" -from .braintrust import reward_fn_to_scorer, scorer_to_reward_fn from .openeval import adapt from .trl import create_trl_adapter __all__ = [ "adapt", - "scorer_to_reward_fn", - "reward_fn_to_scorer", "create_trl_adapter", ] diff --git a/eval_protocol/integrations/braintrust.py b/eval_protocol/integrations/braintrust.py deleted file mode 100644 index 14080bcb..00000000 --- a/eval_protocol/integrations/braintrust.py +++ /dev/null @@ -1,54 +0,0 @@ -"""Adapters for integrating Eval Protocol with Braintrust scoring functions.""" - -from typing import Any, Callable, List, Optional, cast - -from eval_protocol.models import EvaluateResult, Message -from eval_protocol.typed_interface import reward_function - -# Type alias for Braintrust scoring functions -BraintrustScorer = Callable[[Any, Any, Any], float] - - -def scorer_to_reward_fn( - scorer: BraintrustScorer, - *, - messages_to_input: Optional[Callable[[List[Message]], Any]] = None, - ground_truth_to_expected: Optional[Callable[[List[Message]], Any]] = None, -) -> Callable[[List[Message], Optional[List[Message]]], EvaluateResult]: - """Wrap a Braintrust scorer as an Eval Protocol reward function.""" - - def reward_fn_core( - messages: List[Message], ground_truth: Optional[List[Message]] = None, **kwargs: Any - ) -> EvaluateResult: - input_val = messages_to_input(messages) if messages_to_input else messages[0].content - output_val = messages[-1].content - expected_val = None - if ground_truth: - expected_val = ( - ground_truth_to_expected(ground_truth) if ground_truth_to_expected else ground_truth[-1].content - ) - score = scorer(input_val, output_val, expected_val) - return EvaluateResult(score=float(score)) - - # Wrap with reward_function decorator while preserving precise callable type for type checker - wrapped = reward_function(reward_fn_core) - return cast(Callable[[List[Message], Optional[List[Message]]], EvaluateResult], wrapped) - - -def reward_fn_to_scorer( - reward_fn: Callable[[List[Message], Optional[List[Message]]], EvaluateResult], -) -> BraintrustScorer: - """Create a Braintrust-compatible scorer from an Eval Protocol reward function.""" - - def scorer(input_val: Any, output: Any, expected: Any) -> float: - messages = [ - Message(role="user", content=str(input_val)), - Message(role="assistant", content=str(output)), - ] - ground_truth = None - if expected is not None: - ground_truth = [Message(role="assistant", content=str(expected))] - result = reward_fn(messages, ground_truth) - return float(result.score) - - return scorer diff --git a/examples/braintrust_example/README.md b/examples/braintrust_example/README.md deleted file mode 100644 index b93dbfe3..00000000 --- a/examples/braintrust_example/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# Braintrust Example - -A minimal example showing how to evaluate a Braintrust-style scorer end to end with Eval Protocol and the Fireworks API. - -## Quick Start - -```bash -python -m eval_protocol.cli run --config-name simple_braintrust_eval -``` - -## Files - -- `main.py` - Equality scorer wrapped as an Eval Protocol reward function. -- `conf/simple_braintrust_eval.yaml` - Configuration using the `accounts/fireworks/models/qwen3-235b-a22b` model and the GSM8K dataset. -- `README.md` - This file. - -## Data - -This example reuses the **GSM8K** dataset from the math example. - - -## Output - -Results are saved to `outputs/braintrust_eval//eval_results.jsonl`. diff --git a/examples/braintrust_example/conf/simple_braintrust_eval.yaml b/examples/braintrust_example/conf/simple_braintrust_eval.yaml deleted file mode 100644 index c88d26c4..00000000 --- a/examples/braintrust_example/conf/simple_braintrust_eval.yaml +++ /dev/null @@ -1,63 +0,0 @@ -# Simplified Braintrust evaluation configuration - -defaults: - - _self_ - - override hydra/job_logging: default - - override hydra/hydra_logging: default - -hydra: - run: - dir: ./outputs/braintrust_eval/${now:%Y-%m-%d}/${now:%H-%M-%S} - sweep: - dir: ./multirun/braintrust_eval/${now:%Y-%m-%d}/${now:%H-%M-%S} - subdir: ${hydra.job.num} - -# Dataset loading from HuggingFace GSM8K (reuse math example dataset) -dataset: - _target_: eval_protocol.datasets.loader.load_and_process_dataset - source_type: "huggingface" - path_or_name: "openai/gsm8k" - config_name: "main" - split: "test" - max_samples: 5 - column_mapping: - user_query: question - ground_truth_for_eval: answer - hf_extra_load_params: {} - -# Simple system prompt -system_prompt: "Solve the math problem and return the same text as the ground truth." - -# Generation configuration using Fireworks -generation: - enabled: true - _target_: eval_protocol.generation.generate_responses - model_name: "accounts/fireworks/models/qwen3-235b-a22b" - batch_size: 1 - max_new_tokens: 50 - temperature: 0.0 - cache: - enabled: true - api_params: - rate_limit_qps: 1.0 - max_retries: 3 - max_concurrent_requests: 5 - -# Reward function -reward: - function_path: "main.evaluate" - -# Evaluation parameters -evaluation_params: - limit_samples: 2 - -# Output files -output: - results_file: "eval_results.jsonl" - preview_pairs_file: "preview_samples.jsonl" - -logging_params: - batch_log_interval: 10 - -seed: 42 -verbose: true diff --git a/examples/braintrust_example/main.py b/examples/braintrust_example/main.py deleted file mode 100644 index d7ccf6e5..00000000 --- a/examples/braintrust_example/main.py +++ /dev/null @@ -1,20 +0,0 @@ -"""Braintrust scorer wrapped for Eval Protocol.""" - -from eval_protocol.adapters.braintrust import scorer_to_reward_fn -from eval_protocol.typed_interface import reward_function - - -def equality_scorer(input: str, output: str, expected: str) -> float: - """Return ``1.0`` if ``output`` exactly matches ``expected``.""" - - return 1.0 if output.strip() == expected.strip() else 0.0 - - -_reward_fn = scorer_to_reward_fn(equality_scorer) - - -@reward_function -def evaluate(messages, ground_truth=None, **kwargs): - """Eval Protocol evaluate function calling the Braintrust scorer.""" - - return _reward_fn(messages=messages, ground_truth=ground_truth) diff --git a/examples/braintrust_integration.py b/examples/braintrust_integration.py deleted file mode 100644 index be78aab2..00000000 --- a/examples/braintrust_integration.py +++ /dev/null @@ -1,26 +0,0 @@ -from braintrust import Eval - -from eval_protocol.adapters.braintrust import scorer_to_reward_fn - - -def equality_scorer(input: str, output: str, expected: str) -> float: - return 1.0 if output == expected else 0.0 - - -reward_fn = scorer_to_reward_fn(equality_scorer) - - -def hi_bot_task(name: str) -> str: - """Simple placeholder task that echoes the user's name.""" - return "Hi " + name - - -Eval( - "Eval Protocol Braintrust Example", - data=lambda: [ - {"input": "Foo", "expected": "Hi Foo"}, - {"input": "Bar", "expected": "Hello Bar"}, - ], - task=hi_bot_task, - scores=[reward_fn], -) diff --git a/tests/test_braintrust_adapter.py b/tests/test_braintrust_adapter.py deleted file mode 100644 index 0cff0be9..00000000 --- a/tests/test_braintrust_adapter.py +++ /dev/null @@ -1,34 +0,0 @@ -import pytest - -from eval_protocol.adapters.braintrust import reward_fn_to_scorer, scorer_to_reward_fn -from eval_protocol.models import EvaluateResult, Message -from eval_protocol.typed_interface import reward_function - - -def simple_scorer(input, output, expected): - return 1.0 if output == expected else 0.0 - - -def test_scorer_to_reward_fn(): - reward_fn = scorer_to_reward_fn(simple_scorer) - messages = [ - Message(role="user", content="hi"), - Message(role="assistant", content="hi"), - ] - ground_truth = [Message(role="assistant", content="hi")] - result = reward_fn(messages=messages, ground_truth=ground_truth) - assert isinstance(result, EvaluateResult) - assert result.score == 1.0 - - -@reward_function -def my_reward(messages, ground_truth=None, **kwargs): - expected = ground_truth[-1].content if ground_truth else "" - score = 1.0 if messages[-1].content == expected else 0.0 - return EvaluateResult(score=score) - - -def test_reward_fn_to_scorer(): - scorer = reward_fn_to_scorer(my_reward) - score = scorer("foo", "bar", "bar") - assert score == 1.0 diff --git a/tests/test_braintrust_example.py b/tests/test_braintrust_example.py deleted file mode 100644 index b383b22b..00000000 --- a/tests/test_braintrust_example.py +++ /dev/null @@ -1,49 +0,0 @@ -import importlib.util -import os - -import pytest - -from eval_protocol.models import Message - - -def load_module_from_path(name, path): - spec = importlib.util.spec_from_file_location(name, path) - if spec is None or spec.loader is None: - raise ImportError(f"Could not load module {name} from {path}") - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module - - -def get_example_module(): - example_path = os.path.join( - os.path.dirname(os.path.dirname(__file__)), - "examples", - "braintrust_example", - "main.py", - ) - return load_module_from_path("braintrust_example_main_test", example_path) - - -def test_evaluate_match(): - module = get_example_module() - messages = [ - Message(role="user", content="hi"), - Message(role="assistant", content="hello"), - ] - ground_truth = [Message(role="assistant", content="hello")] - result = module.evaluate(messages=messages, ground_truth=ground_truth) - assert result.score == 1.0 - assert result.is_score_valid is True - - -def test_evaluate_mismatch(): - module = get_example_module() - messages = [ - Message(role="user", content="hi"), - Message(role="assistant", content="goodbye"), - ] - ground_truth = [Message(role="assistant", content="hello")] - result = module.evaluate(messages=messages, ground_truth=ground_truth) - assert result.score == 0.0 - assert result.is_score_valid is True diff --git a/tests/test_eval_protocol_import.py b/tests/test_eval_protocol_import.py index c16b3927..4777be1e 100644 --- a/tests/test_eval_protocol_import.py +++ b/tests/test_eval_protocol_import.py @@ -262,14 +262,6 @@ def test_message_creation(self): assert msg.role == "user" assert msg.content == "Test message" - def test_adapter_functions(self): - """Test that adapter functions work through eval_protocol.""" - from eval_protocol import reward_fn_to_scorer, scorer_to_reward_fn - - # These should be callable - assert callable(reward_fn_to_scorer) - assert callable(scorer_to_reward_fn) - def test_utility_functions(self): """Test that utility functions work through eval_protocol.""" from eval_protocol import create_llm_resource, load_jsonl diff --git a/tests/test_reward_protocol_import.py b/tests/test_reward_protocol_import.py index d643c483..806525b1 100644 --- a/tests/test_reward_protocol_import.py +++ b/tests/test_reward_protocol_import.py @@ -261,14 +261,6 @@ def test_message_creation(self): assert msg.role == "user" assert msg.content == "Test message" - def test_adapter_functions(self): - """Test that adapter functions work through eval_protocol.""" - from eval_protocol import reward_fn_to_scorer, scorer_to_reward_fn - - # These should be callable - assert callable(reward_fn_to_scorer) - assert callable(scorer_to_reward_fn) - def test_utility_functions(self): """Test that utility functions work through eval_protocol.""" from eval_protocol import create_llm_resource, load_jsonl From 92552eebb0cc37ec348bd55abf7434df0020cbb9 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 17 Sep 2025 10:21:22 -0700 Subject: [PATCH 4/5] comments --- eval_protocol/adapters/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eval_protocol/adapters/__init__.py b/eval_protocol/adapters/__init__.py index fef59a3a..dd906568 100644 --- a/eval_protocol/adapters/__init__.py +++ b/eval_protocol/adapters/__init__.py @@ -8,7 +8,6 @@ - LangfuseAdapter: Pull data from Langfuse deployments - HuggingFaceAdapter: Load datasets from HuggingFace Hub - BigQueryAdapter: Query data from Google BigQuery -- Braintrust integration (legacy) - TRL integration (legacy) """ @@ -59,7 +58,6 @@ except ImportError: pass -# Legacy adapters (always available) try: from .braintrust import BraintrustAdapter, create_braintrust_adapter @@ -67,6 +65,8 @@ except ImportError: pass +# Legacy adapters (always available) + try: from .trl import create_trl_adapter From 5cf7f26d034db566ed96cf66315ee51aebce3aad Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 17 Sep 2025 10:27:48 -0700 Subject: [PATCH 5/5] remove comment --- eval_protocol/quickstart/llm_judge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eval_protocol/quickstart/llm_judge.py b/eval_protocol/quickstart/llm_judge.py index 42255a49..7c793795 100644 --- a/eval_protocol/quickstart/llm_judge.py +++ b/eval_protocol/quickstart/llm_judge.py @@ -93,7 +93,7 @@ async def run_judgment(row): if row.evaluation_result: row.evaluation_result.score = mean_score - # Push scores back to adapter if provided. Note that one score per model will be pushed back onto same trace. + # Push scores back to adapter if provided if adapter: adapter.upload_scores(rows, model_name, mean_score)