From 19d491a968674994a58b79e8cd8a4b658a5adba5 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 17 Sep 2025 01:45:13 -0700
Subject: [PATCH 1/5] Various small refactors

---
 eval_protocol/adapters/__init__.py            | 14 ++-
 eval_protocol/adapters/base.py                | 21 ++++
 eval_protocol/adapters/bigquery.py            |  3 +-
 eval_protocol/adapters/braintrust.py          | 46 ++++++++-
 eval_protocol/adapters/huggingface.py         |  3 +-
 eval_protocol/adapters/langfuse.py            |  3 +-
 eval_protocol/adapters/langsmith.py           |  3 +-
 eval_protocol/adapters/openai_responses.py    |  3 +-
 eval_protocol/quickstart/llm_judge.py         | 61 +++---------
 .../quickstart/llm_judge_braintrust.py        | 95 ++-----------------
 .../quickstart/llm_judge_langfuse.py          | 50 ++++++++++
 11 files changed, 153 insertions(+), 149 deletions(-)
 create mode 100644 eval_protocol/adapters/base.py
 create mode 100644 eval_protocol/quickstart/llm_judge_langfuse.py

diff --git a/eval_protocol/adapters/__init__.py b/eval_protocol/adapters/__init__.py
index 57757901..5944c131 100644
--- a/eval_protocol/adapters/__init__.py
+++ b/eval_protocol/adapters/__init__.py
@@ -4,6 +4,7 @@
 and converting them to EvaluationRow format for use in evaluation pipelines.
 
 Available adapters:
+- BaseAdapter: Abstract base class for all adapters
 - LangfuseAdapter: Pull data from Langfuse deployments
 - HuggingFaceAdapter: Load datasets from HuggingFace Hub
 - BigQueryAdapter: Query data from Google BigQuery
@@ -11,13 +12,18 @@
 - TRL integration (legacy)
 """
 
+# Always available
+from .base import BaseAdapter
+
+__all__ = ["BaseAdapter"]
+
 # Conditional imports based on available dependencies
 try:
     from .langfuse import LangfuseAdapter, create_langfuse_adapter
 
-    __all__ = ["LangfuseAdapter", "create_langfuse_adapter"]
+    __all__.extend(["LangfuseAdapter", "create_langfuse_adapter"])
 except ImportError:
-    __all__ = []
+    pass
 
 try:
     from .huggingface import (
@@ -55,9 +61,9 @@
 
 # Legacy adapters (always available)
 try:
-    from .braintrust import reward_fn_to_scorer, scorer_to_reward_fn
+    from .braintrust import BraintrustAdapter, create_braintrust_adapter, reward_fn_to_scorer, scorer_to_reward_fn
 
-    __all__.extend(["scorer_to_reward_fn", "reward_fn_to_scorer"])
+    __all__.extend(["BraintrustAdapter", "create_braintrust_adapter", "scorer_to_reward_fn", "reward_fn_to_scorer"])
 except ImportError:
     pass
 
diff --git a/eval_protocol/adapters/base.py b/eval_protocol/adapters/base.py
new file mode 100644
index 00000000..def6d85c
--- /dev/null
+++ b/eval_protocol/adapters/base.py
@@ -0,0 +1,21 @@
+"""
+Base adapter interface for Eval Protocol.
+"""
+
+from abc import ABC, abstractmethod
+from typing import List
+
+from eval_protocol.models import EvaluationRow
+
+
+class BaseAdapter(ABC):
+    """Abstract base class for all Eval Protocol adapters."""
+
+    @abstractmethod
+    def get_evaluation_rows(self, *args, **kwargs) -> List[EvaluationRow]:
+        """Get evaluation rows from the data source."""
+        pass
+
+    def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
+        """Push evaluation scores back to the data source for tracking and analysis."""
+        pass
diff --git a/eval_protocol/adapters/bigquery.py b/eval_protocol/adapters/bigquery.py
index 9831e748..9446b5a8 100644
--- a/eval_protocol/adapters/bigquery.py
+++ b/eval_protocol/adapters/bigquery.py
@@ -10,6 +10,7 @@
 from typing import Any, Callable, Dict, Iterator, List, Optional, TypeAlias
 
 from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message
+from .base import BaseAdapter
 
 logger = logging.getLogger(__name__)
 
@@ -42,7 +43,7 @@
 TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]
 
 
-class BigQueryAdapter:
+class BigQueryAdapter(BaseAdapter):
     """Adapter to query data from Google BigQuery and convert to EvaluationRow format.
 
     This adapter connects to Google BigQuery, executes SQL queries, and applies
diff --git a/eval_protocol/adapters/braintrust.py b/eval_protocol/adapters/braintrust.py
index 979d4d52..665748b9 100644
--- a/eval_protocol/adapters/braintrust.py
+++ b/eval_protocol/adapters/braintrust.py
@@ -14,6 +14,7 @@
 import requests
 
 from eval_protocol.models import EvaluationRow, InputMetadata, Message
+from .base import BaseAdapter
 from .utils import extract_messages_from_data
 
 # Keep backward compatibility
@@ -128,7 +129,7 @@ def extract_messages_from_trace(trace: Dict[str, Any], include_tool_calls: bool
     return messages
 
 
-class BraintrustAdapter:
+class BraintrustAdapter(BaseAdapter):
     """Adapter to pull data from Braintrust and convert to EvaluationRow format.
 
     This adapter can pull both chat conversations and tool calling traces from
@@ -223,6 +224,49 @@ def get_evaluation_rows(
         logger.info("Successfully processed %d BTQL results into %d evaluation rows", len(all_traces), len(eval_rows))
         return eval_rows
 
+    def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
+        """Push evaluation scores back to Braintrust traces for tracking and analysis.
+
+        Creates score entries in Braintrust for each unique trace_id found in the evaluation
+        rows' session data. This allows you to see evaluation results directly in the
+        Braintrust UI alongside the original traces.
+
+        Args:
+            rows: List of EvaluationRow objects with session_data containing trace IDs
+            model_name: Name of the model (used as the score name in Braintrust)
+            mean_score: The calculated mean score to push to Braintrust
+
+        Note:
+            Best-effort: rows without a trace ID are skipped and request failures are logged, not raised
+        """
+        try:
+            headers = {
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json",
+            }
+
+            feedback_items = []
+            for trace_id in set(
+                row.input_metadata.session_data.get("braintrust_trace_id")  # a row without the key must not abort the rest
+                for row in rows
+                if row.evaluation_result and row.input_metadata and row.input_metadata.session_data
+            ):
+                if trace_id:
+                    feedback_items.append({"id": trace_id, "scores": {model_name: mean_score}})
+
+            if feedback_items:
+                payload = {"feedback": feedback_items}
+                response = requests.post(
+                    f"{self.api_url}/v1/project_logs/{self.project_id}/feedback",
+                    headers=headers,
+                    json=payload,
+                    timeout=30,  # seconds; a stalled upload must not hang the evaluation run
+                )
+                response.raise_for_status()
+
+        except Exception as e:
+            logger.warning("Failed to push scores to Braintrust: %s", e)
+
 
 def create_braintrust_adapter(
     api_key: Optional[str] = None,
diff --git a/eval_protocol/adapters/huggingface.py b/eval_protocol/adapters/huggingface.py
index 7f8b6902..1f740ba2 100644
--- a/eval_protocol/adapters/huggingface.py
+++ b/eval_protocol/adapters/huggingface.py
@@ -8,6 +8,7 @@
 from typing import Any, Callable, Dict, Iterator, List, Optional
 
 from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message
+from .base import BaseAdapter
 
 logger = logging.getLogger(__name__)
 
@@ -23,7 +24,7 @@
 TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]
 
 
-class HuggingFaceAdapter:
+class HuggingFaceAdapter(BaseAdapter):
     """Generic adapter to load HuggingFace datasets with custom transformations.
 
     This adapter loads datasets from HuggingFace Hub and applies a user-provided
diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py
index 115448dd..040f02d9 100644
--- a/eval_protocol/adapters/langfuse.py
+++ b/eval_protocol/adapters/langfuse.py
@@ -12,6 +12,7 @@
 from typing import Any, Dict, List, Optional, Protocol
 
 from eval_protocol.models import EvaluationRow, InputMetadata, Message
+from .base import BaseAdapter
 from .utils import extract_messages_from_data
 
 logger = logging.getLogger(__name__)
@@ -188,7 +189,7 @@ def get_final_generation_in_span(trace: TraceWithFullDetails, span_name: str) ->
     return generations[-1]
 
 
-class LangfuseAdapter:
+class LangfuseAdapter(BaseAdapter):
     """Adapter to pull data from Langfuse and convert to EvaluationRow format.
 
     This adapter can pull both chat conversations and tool calling traces from
diff --git a/eval_protocol/adapters/langsmith.py b/eval_protocol/adapters/langsmith.py
index 1d29b66a..fc1daf71 100644
--- a/eval_protocol/adapters/langsmith.py
+++ b/eval_protocol/adapters/langsmith.py
@@ -13,6 +13,7 @@
 from typing import Any, Dict, List, Optional, Iterable
 
 from eval_protocol.models import EvaluationRow, InputMetadata, Message
+from .base import BaseAdapter
 
 logger = logging.getLogger(__name__)
 
@@ -24,7 +25,7 @@
     LANGSMITH_AVAILABLE = False
 
 
-class LangSmithAdapter:
+class LangSmithAdapter(BaseAdapter):
     """Adapter to pull data from LangSmith and convert to EvaluationRow format.
 
     By default, fetches root runs from a project and maps inputs/outputs into
diff --git a/eval_protocol/adapters/openai_responses.py b/eval_protocol/adapters/openai_responses.py
index 8380ce06..1d4c03e5 100644
--- a/eval_protocol/adapters/openai_responses.py
+++ b/eval_protocol/adapters/openai_responses.py
@@ -21,6 +21,7 @@
 from openai.types.responses.tool import Tool
 
 from eval_protocol.models import EvaluationRow, InputMetadata, Message
+from .base import BaseAdapter
 
 logger = logging.getLogger(__name__)
 
@@ -28,7 +29,7 @@
 from openai import OpenAI
 
 
-class OpenAIResponsesAdapter:
+class OpenAIResponsesAdapter(BaseAdapter):
     """Adapter to pull data from OpenAI Responses API and convert to EvaluationRow format.
 
     This adapter can pull both chat conversations and tool calling traces from
diff --git a/eval_protocol/quickstart/llm_judge.py b/eval_protocol/quickstart/llm_judge.py
index 7cd03647..30f4d86e 100644
--- a/eval_protocol/quickstart/llm_judge.py
+++ b/eval_protocol/quickstart/llm_judge.py
@@ -2,65 +2,23 @@
 Default LLM judge for Eval Protocol. Inspired by Arena-Hard-Auto.
 """
 
-from collections.abc import Awaitable, Callable
-import os
-from datetime import datetime
-from typing import List, Dict, Any, Optional
-from typing_extensions import cast
 from tqdm import tqdm
+from typing import Optional
 
-import pytest
-
-from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult
-from eval_protocol.pytest import evaluation_test
-from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
+from eval_protocol.models import EvaluationRow
+from eval_protocol.adapters.base import BaseAdapter
 from eval_protocol.quickstart.utils import (
-    split_multi_turn_rows,
     JUDGE_CONFIGS,
     calculate_bootstrap_scores,
     run_judgment_async,
 )
 import asyncio
 from openai import AsyncOpenAI
-from eval_protocol.adapters.langfuse import create_langfuse_adapter
-
-adapter = create_langfuse_adapter()
-
-
-@pytest.mark.asyncio
-@evaluation_test(
-    input_rows=[
-        adapter.get_evaluation_rows(
-            to_timestamp=datetime(2025, 9, 12, 0, 11, 18),
-            limit=711,
-            sample_size=50,
-            sleep_between_gets=3.0,
-            max_retries=5,
-        )
-    ],
-    completion_params=[
-        {"model": "gpt-4.1"},
-        {
-            "max_tokens": 131000,
-            "extra_body": {"reasoning_effort": "medium"},
-            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
-        },
-        {
-            "max_tokens": 131000,
-            "extra_body": {"reasoning_effort": "low"},
-            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
-        },
-    ],
-    rollout_processor=SingleTurnRolloutProcessor(),
-    preprocess_fn=split_multi_turn_rows,
-    max_concurrent_rollouts=64,
-    mode="all",
-)
-async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
-    return await aha_judge(rows)
 
 
-async def aha_judge(rows: list[EvaluationRow], judge_name: str = "gemini-2.5-pro") -> list[EvaluationRow]:
+async def aha_judge(
+    rows: list[EvaluationRow], judge_name: str = "gemini-2.5-pro", adapter: Optional[BaseAdapter] = None
+) -> list[EvaluationRow]:
     """
     LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons.
 
@@ -73,6 +31,8 @@ async def aha_judge(rows: list[EvaluationRow], judge_name: str = "gemini-2.5-pro
 
     Args:
         rows: List of EvaluationRow objects with messages, ground_truth, and tools
+        judge_name: Name of the judge configuration to use
+        adapter: Optional adapter to push scores back to (if provided)
 
     Returns:
         Same rows with updated evaluation_result containing scores and judgments
@@ -133,7 +93,8 @@ async def run_judgment(row):
         if row.evaluation_result:
             row.evaluation_result.score = mean_score
 
-    # Optional, push scores back to Langfuse. Note that one score per model will be pushed back onto same trace.
-    adapter.push_scores(rows, model_name, mean_score)
+    # Push scores back to adapter if provided. Note that one score per model will be pushed back onto same trace.
+    if adapter:
+        adapter.push_scores(rows, model_name, mean_score)
 
     return rows
diff --git a/eval_protocol/quickstart/llm_judge_braintrust.py b/eval_protocol/quickstart/llm_judge_braintrust.py
index a1902cf7..71b51b0a 100644
--- a/eval_protocol/quickstart/llm_judge_braintrust.py
+++ b/eval_protocol/quickstart/llm_judge_braintrust.py
@@ -1,26 +1,17 @@
 """
-Default LLM judge for Eval Protocol using Braintrust. Inspired by Arena-Hard-Auto.
+Example for using Braintrust with the aha judge.
 """
 
 import os
-from datetime import datetime
-from typing import List, Dict, Any, Optional
-from tqdm import tqdm
 
 import pytest
 
-from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult
+from eval_protocol.models import EvaluationRow
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
-from eval_protocol.quickstart.utils import (
-    split_multi_turn_rows,
-    JUDGE_CONFIGS,
-    calculate_bootstrap_scores,
-    run_judgment_async,
-)
-import asyncio
-from openai import AsyncOpenAI
+from eval_protocol.quickstart.utils import split_multi_turn_rows
 from eval_protocol.adapters.braintrust import create_braintrust_adapter
+from eval_protocol.quickstart import aha_judge
 
 adapter = create_braintrust_adapter()
 
@@ -33,7 +24,7 @@
 select: *
 from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces
 filter: is_root = true
-limit: 5
+limit: 10
 """
         )
     ],
@@ -56,78 +47,4 @@
     mode="all",
 )
 async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
-    """
-    LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons.
-
-    Compares model responses against ground truth using an LLM judge. For each row:
-    1. Extracts the question from messages[:-1]
-    2. Compares messages[-1] (new model response) vs ground_truth (baseline response)
-    3. Runs two judgment rounds (A vs B, B vs A) to reduce position bias
-    4. Calculates bootstrap scores across all comparisons
-    5. Updates evaluation_result with final scores and confidence intervals
-
-    Args:
-        rows: List of EvaluationRow objects with messages, ground_truth, and tools
-
-    Returns:
-        Same rows with updated evaluation_result containing scores and judgments
-    """
-
-    judge_name = "gemini-2.5-pro"  # Edit to which judge you'd like to use. Configs are in utils.py.
-
-    if not rows:
-        print("❌ No evaluation rows provided")
-        return rows
-
-    print(f"🔄 Processing {len(rows)} evaluation rows for LLM judging...")
-
-    model_name = rows[0].input_metadata.completion_params.get("model", "unknown_model")
-
-    judgments = []
-    max_concurrency = JUDGE_CONFIGS[judge_name]["max_concurrency"]
-
-    judge_config = JUDGE_CONFIGS[judge_name]
-
-    async with AsyncOpenAI(
-        api_key=judge_config.get("api_key"), base_url=judge_config.get("base_url")
-    ) as shared_client:
-        semaphore = asyncio.Semaphore(max_concurrency)
-
-        async def run_judgment(row):
-            async with semaphore:
-                return await run_judgment_async(row, model_name, judge_name, shared_client)
-
-        tasks = [run_judgment(row) for row in rows]
-
-        for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Generating judgments"):
-            result = await coro
-            if result and result["games"][0] and result["games"][1]:
-                judgments.append(result)
-
-    if not judgments:
-        print("❌ No valid judgments generated")
-        return rows
-
-    print(f"✅ Generated {len(judgments)} valid judgments")
-
-    # Calculate bootstrap scores
-    result = calculate_bootstrap_scores(judgments)
-    if not result:
-        print("❌ No valid scores extracted")
-        return rows
-
-    mean_score, lower_score, upper_score = result
-
-    # Print leaderboard
-    print("\n##### LLM Judge Results (90th percentile CI) #####")
-
-    clean_model_name = model_name.split("/")[-1]  # Clean model name
-
-    print(f"{clean_model_name}: {mean_score:.1%} (CI: {lower_score:.1%} - {upper_score:.1%})")
-    print("original: 50.0% (CI: 50.0% - 50.0%)")
-
-    for row in rows:
-        if row.evaluation_result:
-            row.evaluation_result.score = mean_score
-
-    return rows
+    return await aha_judge(rows)
diff --git a/eval_protocol/quickstart/llm_judge_langfuse.py b/eval_protocol/quickstart/llm_judge_langfuse.py
new file mode 100644
index 00000000..08f72c67
--- /dev/null
+++ b/eval_protocol/quickstart/llm_judge_langfuse.py
@@ -0,0 +1,50 @@
+"""
+Example for using Langfuse with the aha judge.
+"""
+
+from datetime import datetime
+
+import pytest
+
+from eval_protocol.models import EvaluationRow
+from eval_protocol.pytest import evaluation_test
+from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
+from eval_protocol.quickstart.utils import split_multi_turn_rows
+
+from eval_protocol.adapters.langfuse import create_langfuse_adapter
+from eval_protocol.quickstart import aha_judge
+
+adapter = create_langfuse_adapter()
+
+
+@pytest.mark.asyncio
+@evaluation_test(
+    input_rows=[
+        adapter.get_evaluation_rows(
+            to_timestamp=datetime(2025, 9, 12, 0, 11, 18),
+            limit=711,
+            sample_size=50,
+            sleep_between_gets=3.0,
+            max_retries=5,
+        )
+    ],
+    completion_params=[
+        {"model": "gpt-4.1"},
+        {
+            "max_tokens": 131000,
+            "extra_body": {"reasoning_effort": "medium"},
+            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
+        },
+        {
+            "max_tokens": 131000,
+            "extra_body": {"reasoning_effort": "low"},
+            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
+        },
+    ],
+    rollout_processor=SingleTurnRolloutProcessor(),
+    preprocess_fn=split_multi_turn_rows,
+    max_concurrent_rollouts=64,
+    mode="all",
+)
+async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
+    return await aha_judge(rows)

From 3dc99358b35fc3971a78d8047de2113692f766f7 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 17 Sep 2025 10:14:14 -0700
Subject: [PATCH 2/5] Rename adapter push_scores to upload_scores

---
 eval_protocol/adapters/base.py        | 4 ++--
 eval_protocol/adapters/braintrust.py  | 4 ++--
 eval_protocol/adapters/langfuse.py    | 4 ++--
 eval_protocol/quickstart/llm_judge.py | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/eval_protocol/adapters/base.py b/eval_protocol/adapters/base.py
index def6d85c..6009b8e1 100644
--- a/eval_protocol/adapters/base.py
+++ b/eval_protocol/adapters/base.py
@@ -16,6 +16,6 @@ def get_evaluation_rows(self, *args, **kwargs) -> List[EvaluationRow]:
         """Get evaluation rows from the data source."""
         pass
 
-    def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
-        """Push evaluation scores back to the data source for tracking and analysis."""
+    def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
+        """Upload evaluation scores back to the data source for tracking and analysis."""
         pass
diff --git a/eval_protocol/adapters/braintrust.py b/eval_protocol/adapters/braintrust.py
index 665748b9..d419052b 100644
--- a/eval_protocol/adapters/braintrust.py
+++ b/eval_protocol/adapters/braintrust.py
@@ -224,8 +224,8 @@ def get_evaluation_rows(
         logger.info("Successfully processed %d BTQL results into %d evaluation rows", len(all_traces), len(eval_rows))
         return eval_rows
 
-    def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
-        """Push evaluation scores back to Braintrust traces for tracking and analysis.
+    def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
+        """Upload evaluation scores back to Braintrust traces for tracking and analysis.
 
         Creates score entries in Braintrust for each unique trace_id found in the evaluation
         rows' session data. This allows you to see evaluation results directly in the
diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py
index 040f02d9..6d057372 100644
--- a/eval_protocol/adapters/langfuse.py
+++ b/eval_protocol/adapters/langfuse.py
@@ -434,8 +434,8 @@ def get_evaluation_rows_by_ids(
                 continue
         return eval_rows
 
-    def push_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
-        """Push evaluation scores back to Langfuse traces for tracking and analysis.
+    def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
+        """Upload evaluation scores back to Langfuse traces for tracking and analysis.
 
         Creates a score entry in Langfuse for each unique trace_id found in the evaluation
         rows' session data. This allows you to see evaluation results directly in the
diff --git a/eval_protocol/quickstart/llm_judge.py b/eval_protocol/quickstart/llm_judge.py
index 30f4d86e..42255a49 100644
--- a/eval_protocol/quickstart/llm_judge.py
+++ b/eval_protocol/quickstart/llm_judge.py
@@ -95,6 +95,6 @@ async def run_judgment(row):
 
     # Push scores back to adapter if provided. Note that one score per model will be pushed back onto same trace.
     if adapter:
-        adapter.push_scores(rows, model_name, mean_score)
+        adapter.upload_scores(rows, model_name, mean_score)
 
     return rows

From f62f0ad503bd235b97509dea70e52950354e6ba9 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 17 Sep 2025 10:18:10 -0700
Subject: [PATCH 3/5] remove old braintrust integration

---
 docs/integrations/braintrust_integration.mdx  | 49 ---------------
 eval_protocol/__init__.py                     |  4 --
 eval_protocol/adapters/__init__.py            |  4 +-
 eval_protocol/adapters/braintrust.py          |  5 +-
 eval_protocol/integrations/__init__.py        |  3 -
 eval_protocol/integrations/braintrust.py      | 54 ----------------
 examples/braintrust_example/README.md         | 24 -------
 .../conf/simple_braintrust_eval.yaml          | 63 -------------------
 examples/braintrust_example/main.py           | 20 ------
 examples/braintrust_integration.py            | 26 --------
 tests/test_braintrust_adapter.py              | 34 ----------
 tests/test_braintrust_example.py              | 49 ---------------
 tests/test_eval_protocol_import.py            |  8 ---
 tests/test_reward_protocol_import.py          |  8 ---
 14 files changed, 3 insertions(+), 348 deletions(-)
 delete mode 100644 docs/integrations/braintrust_integration.mdx
 delete mode 100644 eval_protocol/integrations/braintrust.py
 delete mode 100644 examples/braintrust_example/README.md
 delete mode 100644 examples/braintrust_example/conf/simple_braintrust_eval.yaml
 delete mode 100644 examples/braintrust_example/main.py
 delete mode 100644 examples/braintrust_integration.py
 delete mode 100644 tests/test_braintrust_adapter.py
 delete mode 100644 tests/test_braintrust_example.py

diff --git a/docs/integrations/braintrust_integration.mdx b/docs/integrations/braintrust_integration.mdx
deleted file mode 100644
index 43e4b90c..00000000
--- a/docs/integrations/braintrust_integration.mdx
+++ /dev/null
@@ -1,49 +0,0 @@
-# Integrating with Braintrust
-
-This guide explains how to bridge Eval Protocol with [Braintrust](https://braintrust.dev/). You can log Eval Protocol evaluations to Braintrust or reuse Braintrust scorers as Eval Protocol reward functions.
-
-## Installation
-
-Install the Braintrust SDK in your environment:
-
-```bash
-pip install braintrust
-```
-
-## Using a Braintrust scorer in Eval Protocol
-
-Convert a Braintrust-style scorer to an Eval Protocol reward function using `scorer_to_reward_fn`:
-
-```python
-from braintrust import Eval
-from eval_protocol.integrations.braintrust import scorer_to_reward_fn
-
-
-def equality_scorer(input: str, output: str, expected: str) -> float:
-    return 1.0 if output == expected else 0.0
-
-reward_fn = scorer_to_reward_fn(equality_scorer)
-
-
-def hi_bot_task(name: str) -> str:
-    return "Hi " + name
-
-
-Eval(
-    "Eval Protocol Braintrust Example",
-    data=lambda: [
-        {"input": "Foo", "expected": "Hi Foo"},
-        {"input": "Bar", "expected": "Hello Bar"},
-    ],
-    task=hi_bot_task,
-    scores=[reward_fn],
-)
-```
-
-Run the script with your Braintrust API key:
-
-```bash
-BRAINTRUST_API_KEY=<your key> braintrust eval examples/braintrust_integration.py
-```
-
-This will create an experiment in Braintrust where you can inspect the scores, outputs and metadata.
diff --git a/eval_protocol/__init__.py b/eval_protocol/__init__.py
index 5939896f..f39369b2 100644
--- a/eval_protocol/__init__.py
+++ b/eval_protocol/__init__.py
@@ -10,8 +10,6 @@
 
 import warnings
 
-from eval_protocol.adapters.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
-
 from .auth import get_fireworks_account_id, get_fireworks_api_key
 from .common_utils import load_jsonl
 from .config import RewardKitConfig, get_config, load_config
@@ -49,8 +47,6 @@
     "EvaluateResult",
     "reward_function",
     "RewardFunction",
-    "scorer_to_reward_fn",
-    "reward_fn_to_scorer",
     # Authentication
     "get_fireworks_api_key",
     "get_fireworks_account_id",
diff --git a/eval_protocol/adapters/__init__.py b/eval_protocol/adapters/__init__.py
index 5944c131..fef59a3a 100644
--- a/eval_protocol/adapters/__init__.py
+++ b/eval_protocol/adapters/__init__.py
@@ -61,9 +61,9 @@
 
 # Legacy adapters (always available)
 try:
-    from .braintrust import BraintrustAdapter, create_braintrust_adapter, reward_fn_to_scorer, scorer_to_reward_fn
+    from .braintrust import BraintrustAdapter, create_braintrust_adapter
 
-    __all__.extend(["BraintrustAdapter", "create_braintrust_adapter", "scorer_to_reward_fn", "reward_fn_to_scorer"])
+    __all__.extend(["BraintrustAdapter", "create_braintrust_adapter"])
 except ImportError:
     pass
 
diff --git a/eval_protocol/adapters/braintrust.py b/eval_protocol/adapters/braintrust.py
index d419052b..2007f322 100644
--- a/eval_protocol/adapters/braintrust.py
+++ b/eval_protocol/adapters/braintrust.py
@@ -17,9 +17,6 @@
 from .base import BaseAdapter
 from .utils import extract_messages_from_data
 
-# Keep backward compatibility
-from ..integrations.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
-
 
 logger = logging.getLogger(__name__)
 
@@ -281,4 +278,4 @@ def create_braintrust_adapter(
     )
 
 
-__all__ = ["scorer_to_reward_fn", "reward_fn_to_scorer", "BraintrustAdapter", "create_braintrust_adapter"]
+__all__ = ["BraintrustAdapter", "create_braintrust_adapter"]
diff --git a/eval_protocol/integrations/__init__.py b/eval_protocol/integrations/__init__.py
index 0a037738..f85283cf 100644
--- a/eval_protocol/integrations/__init__.py
+++ b/eval_protocol/integrations/__init__.py
@@ -1,12 +1,9 @@
 """Integration helpers for Eval Protocol."""
 
-from .braintrust import reward_fn_to_scorer, scorer_to_reward_fn
 from .openeval import adapt
 from .trl import create_trl_adapter
 
 __all__ = [
     "adapt",
-    "scorer_to_reward_fn",
-    "reward_fn_to_scorer",
     "create_trl_adapter",
 ]
diff --git a/eval_protocol/integrations/braintrust.py b/eval_protocol/integrations/braintrust.py
deleted file mode 100644
index 14080bcb..00000000
--- a/eval_protocol/integrations/braintrust.py
+++ /dev/null
@@ -1,54 +0,0 @@
-"""Adapters for integrating Eval Protocol with Braintrust scoring functions."""
-
-from typing import Any, Callable, List, Optional, cast
-
-from eval_protocol.models import EvaluateResult, Message
-from eval_protocol.typed_interface import reward_function
-
-# Type alias for Braintrust scoring functions
-BraintrustScorer = Callable[[Any, Any, Any], float]
-
-
-def scorer_to_reward_fn(
-    scorer: BraintrustScorer,
-    *,
-    messages_to_input: Optional[Callable[[List[Message]], Any]] = None,
-    ground_truth_to_expected: Optional[Callable[[List[Message]], Any]] = None,
-) -> Callable[[List[Message], Optional[List[Message]]], EvaluateResult]:
-    """Wrap a Braintrust scorer as an Eval Protocol reward function."""
-
-    def reward_fn_core(
-        messages: List[Message], ground_truth: Optional[List[Message]] = None, **kwargs: Any
-    ) -> EvaluateResult:
-        input_val = messages_to_input(messages) if messages_to_input else messages[0].content
-        output_val = messages[-1].content
-        expected_val = None
-        if ground_truth:
-            expected_val = (
-                ground_truth_to_expected(ground_truth) if ground_truth_to_expected else ground_truth[-1].content
-            )
-        score = scorer(input_val, output_val, expected_val)
-        return EvaluateResult(score=float(score))
-
-    # Wrap with reward_function decorator while preserving precise callable type for type checker
-    wrapped = reward_function(reward_fn_core)
-    return cast(Callable[[List[Message], Optional[List[Message]]], EvaluateResult], wrapped)
-
-
-def reward_fn_to_scorer(
-    reward_fn: Callable[[List[Message], Optional[List[Message]]], EvaluateResult],
-) -> BraintrustScorer:
-    """Create a Braintrust-compatible scorer from an Eval Protocol reward function."""
-
-    def scorer(input_val: Any, output: Any, expected: Any) -> float:
-        messages = [
-            Message(role="user", content=str(input_val)),
-            Message(role="assistant", content=str(output)),
-        ]
-        ground_truth = None
-        if expected is not None:
-            ground_truth = [Message(role="assistant", content=str(expected))]
-        result = reward_fn(messages, ground_truth)
-        return float(result.score)
-
-    return scorer
diff --git a/examples/braintrust_example/README.md b/examples/braintrust_example/README.md
deleted file mode 100644
index b93dbfe3..00000000
--- a/examples/braintrust_example/README.md
+++ /dev/null
@@ -1,24 +0,0 @@
-# Braintrust Example
-
-A minimal example showing how to evaluate a Braintrust-style scorer end to end with Eval Protocol and the Fireworks API.
-
-## Quick Start
-
-```bash
-python -m eval_protocol.cli run --config-name simple_braintrust_eval
-```
-
-## Files
-
-- `main.py` - Equality scorer wrapped as an Eval Protocol reward function.
-- `conf/simple_braintrust_eval.yaml` - Configuration using the `accounts/fireworks/models/qwen3-235b-a22b` model and the GSM8K dataset.
-- `README.md` - This file.
-
-## Data
-
-This example reuses the **GSM8K** dataset from the math example.
-
-
-## Output
-
-Results are saved to `outputs/braintrust_eval/<timestamp>/eval_results.jsonl`.
diff --git a/examples/braintrust_example/conf/simple_braintrust_eval.yaml b/examples/braintrust_example/conf/simple_braintrust_eval.yaml
deleted file mode 100644
index c88d26c4..00000000
--- a/examples/braintrust_example/conf/simple_braintrust_eval.yaml
+++ /dev/null
@@ -1,63 +0,0 @@
-# Simplified Braintrust evaluation configuration
-
-defaults:
-  - _self_
-  - override hydra/job_logging: default
-  - override hydra/hydra_logging: default
-
-hydra:
-  run:
-    dir: ./outputs/braintrust_eval/${now:%Y-%m-%d}/${now:%H-%M-%S}
-  sweep:
-    dir: ./multirun/braintrust_eval/${now:%Y-%m-%d}/${now:%H-%M-%S}
-    subdir: ${hydra.job.num}
-
-# Dataset loading from HuggingFace GSM8K (reuse math example dataset)
-dataset:
-  _target_: eval_protocol.datasets.loader.load_and_process_dataset
-  source_type: "huggingface"
-  path_or_name: "openai/gsm8k"
-  config_name: "main"
-  split: "test"
-  max_samples: 5
-  column_mapping:
-    user_query: question
-    ground_truth_for_eval: answer
-  hf_extra_load_params: {}
-
-# Simple system prompt
-system_prompt: "Solve the math problem and return the same text as the ground truth."
-
-# Generation configuration using Fireworks
-generation:
-  enabled: true
-  _target_: eval_protocol.generation.generate_responses
-  model_name: "accounts/fireworks/models/qwen3-235b-a22b"
-  batch_size: 1
-  max_new_tokens: 50
-  temperature: 0.0
-  cache:
-    enabled: true
-  api_params:
-    rate_limit_qps: 1.0
-    max_retries: 3
-    max_concurrent_requests: 5
-
-# Reward function
-reward:
-  function_path: "main.evaluate"
-
-# Evaluation parameters
-evaluation_params:
-  limit_samples: 2
-
-# Output files
-output:
-  results_file: "eval_results.jsonl"
-  preview_pairs_file: "preview_samples.jsonl"
-
-logging_params:
-  batch_log_interval: 10
-
-seed: 42
-verbose: true
diff --git a/examples/braintrust_example/main.py b/examples/braintrust_example/main.py
deleted file mode 100644
index d7ccf6e5..00000000
--- a/examples/braintrust_example/main.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""Braintrust scorer wrapped for Eval Protocol."""
-
-from eval_protocol.adapters.braintrust import scorer_to_reward_fn
-from eval_protocol.typed_interface import reward_function
-
-
-def equality_scorer(input: str, output: str, expected: str) -> float:
-    """Return ``1.0`` if ``output`` exactly matches ``expected``."""
-
-    return 1.0 if output.strip() == expected.strip() else 0.0
-
-
-_reward_fn = scorer_to_reward_fn(equality_scorer)
-
-
-@reward_function
-def evaluate(messages, ground_truth=None, **kwargs):
-    """Eval Protocol evaluate function calling the Braintrust scorer."""
-
-    return _reward_fn(messages=messages, ground_truth=ground_truth)
diff --git a/examples/braintrust_integration.py b/examples/braintrust_integration.py
deleted file mode 100644
index be78aab2..00000000
--- a/examples/braintrust_integration.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from braintrust import Eval
-
-from eval_protocol.adapters.braintrust import scorer_to_reward_fn
-
-
-def equality_scorer(input: str, output: str, expected: str) -> float:
-    return 1.0 if output == expected else 0.0
-
-
-reward_fn = scorer_to_reward_fn(equality_scorer)
-
-
-def hi_bot_task(name: str) -> str:
-    """Simple placeholder task that echoes the user's name."""
-    return "Hi " + name
-
-
-Eval(
-    "Eval Protocol Braintrust Example",
-    data=lambda: [
-        {"input": "Foo", "expected": "Hi Foo"},
-        {"input": "Bar", "expected": "Hello Bar"},
-    ],
-    task=hi_bot_task,
-    scores=[reward_fn],
-)
diff --git a/tests/test_braintrust_adapter.py b/tests/test_braintrust_adapter.py
deleted file mode 100644
index 0cff0be9..00000000
--- a/tests/test_braintrust_adapter.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import pytest
-
-from eval_protocol.adapters.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
-from eval_protocol.models import EvaluateResult, Message
-from eval_protocol.typed_interface import reward_function
-
-
-def simple_scorer(input, output, expected):
-    return 1.0 if output == expected else 0.0
-
-
-def test_scorer_to_reward_fn():
-    reward_fn = scorer_to_reward_fn(simple_scorer)
-    messages = [
-        Message(role="user", content="hi"),
-        Message(role="assistant", content="hi"),
-    ]
-    ground_truth = [Message(role="assistant", content="hi")]
-    result = reward_fn(messages=messages, ground_truth=ground_truth)
-    assert isinstance(result, EvaluateResult)
-    assert result.score == 1.0
-
-
-@reward_function
-def my_reward(messages, ground_truth=None, **kwargs):
-    expected = ground_truth[-1].content if ground_truth else ""
-    score = 1.0 if messages[-1].content == expected else 0.0
-    return EvaluateResult(score=score)
-
-
-def test_reward_fn_to_scorer():
-    scorer = reward_fn_to_scorer(my_reward)
-    score = scorer("foo", "bar", "bar")
-    assert score == 1.0
diff --git a/tests/test_braintrust_example.py b/tests/test_braintrust_example.py
deleted file mode 100644
index b383b22b..00000000
--- a/tests/test_braintrust_example.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import importlib.util
-import os
-
-import pytest
-
-from eval_protocol.models import Message
-
-
-def load_module_from_path(name, path):
-    spec = importlib.util.spec_from_file_location(name, path)
-    if spec is None or spec.loader is None:
-        raise ImportError(f"Could not load module {name} from {path}")
-    module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(module)
-    return module
-
-
-def get_example_module():
-    example_path = os.path.join(
-        os.path.dirname(os.path.dirname(__file__)),
-        "examples",
-        "braintrust_example",
-        "main.py",
-    )
-    return load_module_from_path("braintrust_example_main_test", example_path)
-
-
-def test_evaluate_match():
-    module = get_example_module()
-    messages = [
-        Message(role="user", content="hi"),
-        Message(role="assistant", content="hello"),
-    ]
-    ground_truth = [Message(role="assistant", content="hello")]
-    result = module.evaluate(messages=messages, ground_truth=ground_truth)
-    assert result.score == 1.0
-    assert result.is_score_valid is True
-
-
-def test_evaluate_mismatch():
-    module = get_example_module()
-    messages = [
-        Message(role="user", content="hi"),
-        Message(role="assistant", content="goodbye"),
-    ]
-    ground_truth = [Message(role="assistant", content="hello")]
-    result = module.evaluate(messages=messages, ground_truth=ground_truth)
-    assert result.score == 0.0
-    assert result.is_score_valid is True
diff --git a/tests/test_eval_protocol_import.py b/tests/test_eval_protocol_import.py
index c16b3927..4777be1e 100644
--- a/tests/test_eval_protocol_import.py
+++ b/tests/test_eval_protocol_import.py
@@ -262,14 +262,6 @@ def test_message_creation(self):
         assert msg.role == "user"
         assert msg.content == "Test message"
 
-    def test_adapter_functions(self):
-        """Test that adapter functions work through eval_protocol."""
-        from eval_protocol import reward_fn_to_scorer, scorer_to_reward_fn
-
-        # These should be callable
-        assert callable(reward_fn_to_scorer)
-        assert callable(scorer_to_reward_fn)
-
     def test_utility_functions(self):
         """Test that utility functions work through eval_protocol."""
         from eval_protocol import create_llm_resource, load_jsonl
diff --git a/tests/test_reward_protocol_import.py b/tests/test_reward_protocol_import.py
index d643c483..806525b1 100644
--- a/tests/test_reward_protocol_import.py
+++ b/tests/test_reward_protocol_import.py
@@ -261,14 +261,6 @@ def test_message_creation(self):
         assert msg.role == "user"
         assert msg.content == "Test message"
 
-    def test_adapter_functions(self):
-        """Test that adapter functions work through eval_protocol."""
-        from eval_protocol import reward_fn_to_scorer, scorer_to_reward_fn
-
-        # These should be callable
-        assert callable(reward_fn_to_scorer)
-        assert callable(scorer_to_reward_fn)
-
     def test_utility_functions(self):
         """Test that utility functions work through eval_protocol."""
         from eval_protocol import create_llm_resource, load_jsonl

From 92552eebb0cc37ec348bd55abf7434df0020cbb9 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 17 Sep 2025 10:21:22 -0700
Subject: [PATCH 4/5] comments

---
 eval_protocol/adapters/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/eval_protocol/adapters/__init__.py b/eval_protocol/adapters/__init__.py
index fef59a3a..dd906568 100644
--- a/eval_protocol/adapters/__init__.py
+++ b/eval_protocol/adapters/__init__.py
@@ -8,7 +8,6 @@
 - LangfuseAdapter: Pull data from Langfuse deployments
 - HuggingFaceAdapter: Load datasets from HuggingFace Hub
 - BigQueryAdapter: Query data from Google BigQuery
-- Braintrust integration (legacy)
 - TRL integration (legacy)
 """
 
@@ -59,7 +58,6 @@
 except ImportError:
     pass
 
-# Legacy adapters (always available)
 try:
     from .braintrust import BraintrustAdapter, create_braintrust_adapter
 
@@ -67,6 +65,8 @@
 except ImportError:
     pass
 
+# Legacy adapters (always available)
+
 try:
     from .trl import create_trl_adapter
 

From 5cf7f26d034db566ed96cf66315ee51aebce3aad Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 17 Sep 2025 10:27:48 -0700
Subject: [PATCH 5/5] remove comment

---
 eval_protocol/quickstart/llm_judge.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eval_protocol/quickstart/llm_judge.py b/eval_protocol/quickstart/llm_judge.py
index 42255a49..7c793795 100644
--- a/eval_protocol/quickstart/llm_judge.py
+++ b/eval_protocol/quickstart/llm_judge.py
@@ -93,7 +93,7 @@ async def run_judgment(row):
         if row.evaluation_result:
             row.evaluation_result.score = mean_score
 
-    # Push scores back to adapter if provided. Note that one score per model will be pushed back onto same trace.
+    # Push scores back to adapter if provided
     if adapter:
         adapter.upload_scores(rows, model_name, mean_score)