In [None]:
#| hide

%load_ext autoreload
%autoreload 2

# Evaluate

**TODO: GIT LFS on json of logs -- migrate**

> Here, we evaluate different LLM agents on our benchmark

- skip_showdoc: true
- skip_exec: true

In [None]:
#| default_exp experiments

In [None]:
#| hide

import agents
from claimdb.configuration import *

## Google's MCP Toolbox

Google makes our life easy because they built a great tool for managing
database connections and agentic **tools**. If you want to learn more
you can read the
[blog](https://cloud.google.com/blog/products/ai-machine-learning/mcp-toolbox-for-databases-now-supports-model-context-protocol).

Here, all we care about is setting up the database connections properly
and defining a few tools! We will only work with the `tools.yaml` file.

### Define SQLite connections

We will first find the unique database names

In [None]:
config.project_root

PosixPath('/Users/michaeltheologitis/Code/claimdb')

In [None]:
import json

In [None]:
# Distinct `db_id` values found in `train_dev_filtered.jsonl` (one JSON object per line).
db_names = set()

with open(config.bird_dir / 'train_dev_filtered.jsonl') as f:
    db_names = {json.loads(record)['db_id'] for record in f}

In [None]:
next(iter(db_names))

'formula_1'

In [None]:
len(db_names)

80

We want to create (see [SQLite config](https://googleapis.github.io/genai-toolbox/resources/sources/sqlite/)):

```yaml
sources:
  superhero:
    kind: sqlite
    database: /path/to/superhero.sqlite
  ...
```

In [None]:
# Build the `sources:` section of tools.yaml: one SQLite source per database,
# emitted in sorted order so the generated text is deterministic.
lines = ["sources:"]
for name in sorted(db_names):
    # Build the path outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    sqlite_path = config.bird_databases_dir / name / f"{name}.sqlite"
    lines.append(f"  {name}:")
    lines.append("    kind: sqlite")
    lines.append(f"    database: {sqlite_path}")

yaml_connections_str = "\n".join(lines)

In [None]:
print(yaml_connections_str[:550])

### Define Tools

Now, we want to create ([Tools](https://googleapis.github.io/genai-toolbox/resources/tools/)):

```yaml
tools:
  superhero_execute_sql:
    description: Executes SQL queries on the SQLite Superhero database. The queries must be SQLite-compatible.
    kind: sqlite-execute-sql
    source: superhero
  ...
```

Notice here that we use Google's primitive tool (`sqlite-execute-sql`) to
execute SQL queries on the database (this is the code that is already implemented by them). Then, we *bind* the `superhero_execute_sql` tool to the `superhero` database source we defined above.

In [None]:
from claimdb.preprocess_bird import *

In [None]:
# Build the `tools:` section of tools.yaml: one `<name>_execute_sql` entry per
# database, each bound to the source of the same name.
lines = ["tools:"]
for name in sorted(db_names):
    lines.extend([
        f"  {name}_execute_sql:",
        f"    description: Executes SQL queries on the SQLite {convert_db_name(name)} database. The queries must be SQLite-compatible.",
        f"    source: {name}",
        "    kind: sqlite-execute-sql",
    ])

yaml_tools_str = "\n".join(lines)

In [None]:
print(yaml_tools_str[:1000])

tools:
  address_execute_sql:
    description: Executes SQL queries on the SQLite Address database. The queries must be SQLite-compatible.
    source: address
    kind: sqlite-execute-sql
  airline_execute_sql:
    description: Executes SQL queries on the SQLite Airline database. The queries must be SQLite-compatible.
    source: airline
    kind: sqlite-execute-sql
  app_store_execute_sql:
    description: Executes SQL queries on the SQLite App Store database. The queries must be SQLite-compatible.
    source: app_store
    kind: sqlite-execute-sql
  authors_execute_sql:
    description: Executes SQL queries on the SQLite Authors database. The queries must be SQLite-compatible.
    source: authors
    kind: sqlite-execute-sql
  beer_factory_execute_sql:
    description: Executes SQL queries on the SQLite Beer Factory database. The queries must be SQLite-compatible.
    source: beer_factory
    kind: sqlite-execute-sql
  bike_share_1_execute_sql:
    description: Executes SQL queries o

### Create the YAML tools file

**Finally, combine the two strings:**

In [None]:
yml_str = yaml_connections_str + "\n" + yaml_tools_str

In [None]:
with open(config.project_root / "tools.yaml", "w") as f:
    f.write(yml_str)

Now you can run 

```bash 
toolbox --ui
```

and check that all is ok!

### Test tools

**We need to open a connection to the toolbox server first (client)**

In [None]:
from toolbox_core import ToolboxSyncClient

In [None]:
client = ToolboxSyncClient("http://127.0.0.1:5000")

Now we can start loading the registered tools of `tools.yaml` and use them to query the databases.

In [None]:
db_id = "california_schools"

In [None]:
california_tool = client.load_tool(f"{db_id}_execute_sql")

In [None]:
list_tables_query = "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;"

In [None]:
california_tool(list_tables_query)

In [None]:
db_id = "financial"

In [None]:
db_id = "financial"

tool = client.load_tool(f"{db_id}_execute_sql")

tool(list_tables_query)

In [None]:
db_id = "mental_health_survey"

tool = client.load_tool(f"{db_id}_execute_sql")

tool(list_tables_query)

**Close the connection**

In [None]:
client.close()

## Prompts, I/O, Client

**OpenAI's agents handle structured outputs well with Pydantic types**: the field descriptions are passed along correctly. That is why the prompts do not explain the specific output fields any further; those explanations live in the Pydantic model (the `description` of each field on the `BaseModel`).

In [None]:
#| export
from toolbox_core import ToolboxSyncClient
from claimdb.configuration import *

In [None]:
#| export
from pydantic import BaseModel, Field
from typing import Literal
from agents import Runner, Agent, function_tool
import json

In [None]:
#| export
from claimdb.transformation import claim_collection_json_to_parsed

In [None]:
#| export
toolbox_client = ToolboxSyncClient("http://127.0.0.1:5000")

Unclosed client session
client_session: <aiohttp.client.ClientSession object>


In [None]:
db_id = "california_schools"
california_tool = toolbox_client.load_tool(f"{db_id}_execute_sql")

RuntimeError: API request failed with status 403 (Forbidden). Server response: 

In [None]:
#| hide
#| export
def description(pydantic_model):
    "Print every field of a Pydantic model followed by its `description`"
    fields = pydantic_model.model_fields
    for field_name in fields:
        # The explicit "\n" plus print's own newline leaves a blank line between fields.
        print(f"{field_name}: {fields[field_name].description}\n")

### I/O Schema

In [None]:
#| hide
#| export

# Structured output of the fact-checking agent (passed as `output_type` to `Agent`).
# Its JSON schema is also embedded verbatim in FACT_CHECKER_PROMPT.
# NOTE(review): documented with `#` comments rather than a class docstring on purpose;
# pydantic presumably copies a class docstring into the generated JSON schema, which
# would change the prompt text. Confirm before adding one.
class ClaimVerdict(BaseModel):
    # One of the three labels the prompt defines; required field.
    verdict: Literal["ENTAILED", "CONTRADICTED", "NOT ENOUGH INFO"] = Field(
        ...,
        description="Whether the claim is supported, contradicted, or undecidable from the database."
    )
    # Short free-text explanation accompanying the verdict; required field.
    justification: str = Field(
        ...,
        description="Brief justification (1-2 sentences) of the verdict."
    )

In [None]:
description(ClaimVerdict)

verdict: Whether the claim is supported, contradicted, or undecidable from the database.

justification: Brief justification (1-2 sentences) of the verdict.



### Prompt

In [None]:
#| export

# Task description shared by every prompt variant: role, the three labels,
# and the instruction to inspect the schema first. It contains no placeholders.
BASE_PROMPT = f"""
You are a fact-checking assistant operating over structured data. You will be given a natural-language claim and optional external information. You will have access to a SQLite database and may execute arbitrary SQL queries over it using specialized tools.

Your task is to determine whether the claim is "ENTAILED", "CONTRADICTED", or "NOT ENOUGH INFO" based on evidence you obtain from the database. The labels are defined as follows:

- ENTAILED: The claim is supported by the database.
- CONTRADICTED: The claim is refuted by the database.
- NOT ENOUGH INFO: The database does not provide sufficient evidence to decide.

Use the available tools to query the database and gather evidence before making a decision. Do not ask the user for clarification or additional information.

You should always start by querying the database for the schema (tables and columns).
"""

# BASE_PROMPT plus the output contract: the JSON schema of ClaimVerdict,
# pretty-printed with a 2-space indent, is interpolated when this cell runs.
FACT_CHECKER_PROMPT = BASE_PROMPT + f"""
Your answer should be in JSON format, adhering to the following schema:
{json.dumps(ClaimVerdict.model_json_schema(), indent=2)}
"""


In [None]:
#| export
# Few-shot output examples, one per verdict label. They are plain (non-f) strings,
# so the literal braces need no escaping. Appended to FACT_CHECKER_PROMPT to form
# FACT_CHECKER_PROMPT_3SHOT.
EX1 = """
Output Example 1:
{
    "verdict": "ENTAILED",
    "justification": "The database shows that the population of France is 67 million, which supports the claim."
}
"""

EX2 = """
Output Example 2:
{
    "verdict": "CONTRADICTED",
    "justification": "The database indicates that the capital of Germany is Berlin, contradicting the claim."
}
"""

EX3 = """
Output Example 3:
{
    "verdict": "NOT ENOUGH INFO",
    "justification": "The database does not contain any information about the population of Sacramento."
}
"""

In [None]:
#| export
FACT_CHECKER_PROMPT_3SHOT = FACT_CHECKER_PROMPT + "\n" + EX1 + EX2 + EX3

In [None]:
print(FACT_CHECKER_PROMPT_3SHOT)


You are a fact-checking assistant operating over structured data. You will be given a natural-language claim and optional external information. You will have access to a SQLite database and may execute arbitrary SQL queries over it using specialized tools.

Your task is to determine whether the claim is "ENTAILED", "CONTRADICTED", or "NOT ENOUGH INFO" based on evidence you obtain from the database. The labels are defined as follows:

- ENTAILED: The claim is supported by the database.
- CONTRADICTED: The claim is refuted by the database.
- NOT ENOUGH INFO: The database does not provide sufficient evidence to decide.

Use the available tools to query the database and gather evidence before making a decision. Do not ask the user for clarification or additional information.

You should always start by querying the database for the schema (tables and columns).

Your answer should be in JSON format, adhering to the following schema:
{
  "properties": {
    "verdict": {
      "description":

## OpenAI Agents

### Single Example Test

In [None]:
model = "gpt-5-nano"
model = "gpt-5-mini"
model_dir = config.experiments_dir_pub / model
model_dir.mkdir(parents=True, exist_ok=True)

In [None]:
with open(config.final_benchmark_dir / 'test-public.jsonl', "r") as f:
    all_claims = [json.loads(line) for line in f]

claim = all_claims[2]

In [None]:
claim

In [None]:
tool = toolbox_client.load_tool(f"{claim['db_name']}_execute_sql")

In [None]:
fact_checker_agent = Agent(
    name="Fact-Checker",
    instructions=FACT_CHECKER_PROMPT_3SHOT,
    model=model,
    tools=[function_tool(tool)],
    output_type=ClaimVerdict,
)
    

In [None]:
inp = f"Claim: {claim['claim']}\nExtra Information: {claim['extra_info']}"
#inp = f"Do you see what tools and metadata of tools you have?"

print(inp)

In [None]:
result = await Runner.run(
    fact_checker_agent, 
    inp, 
    max_turns=20
)

In [None]:
result.final_output

In [None]:
claim['label']

In [None]:
result.to_input_list()[3]

### Agent's RunResult to dict

#### Example to understand RunResult

`to_input_list()` gives the complete pipeline of everything that happened during the run, as JSON and text.

In [None]:
lst = result.to_input_list()

In [None]:
result.to_input_list()[-1]

Usage (see [here](https://github.com/openai/openai-agents-python/blob/f903ad0ac44e1c5c959301bd3c8721fbd4cd4e5b/examples/basic/usage_tracking.py#L41))

#### Function Definition

In [None]:
#| export

def run_result_to_dict(result, ollama=False) -> dict:
    """Convert an Agent's RunResult (or the exception of a failed run) to a dictionary.

    Args:
        result: Either a run result object or the exception captured for that run
            (e.g. by `asyncio.gather(..., return_exceptions=True)`).
        ollama: When True, the agent's `model` attribute is treated as an object
            and its own `.model` attribute is stored as the model name.

    Returns:
        For an exception: a record with empty `verdict`, `justification`,
        `model_name` and `model_settings`, empty `usage` / `to_input_list` lists,
        and the exception text under `error`.
        Otherwise: `verdict`, `justification`, `final_output`, `model_name`,
        `model_settings`, the per-request `usage` list and the full `to_input_list`.
    """
    # BaseException rather than Exception: a cancelled run is reported as
    # asyncio.CancelledError, which does not derive from Exception and would
    # otherwise fall through to the attribute accesses below.
    if isinstance(result, BaseException):
        return {
            'verdict': "",
            'error': str(result),
            'justification': "",
            'model_name': "",
            'model_settings': "",
            'usage': [],
            'to_input_list': []
        }

    info_dict = {}

    # 1. Final output
    info_dict['verdict'] = result.final_output.verdict
    info_dict['justification'] = result.final_output.justification
    info_dict['final_output'] = str(result.final_output)

    # 2. Model Settings
    info_dict['model_name'] = result._last_agent.model
    # With `ollama=True` the stored model is an object; keep only its name.
    if ollama: info_dict['model_name'] = info_dict['model_name'].model
    info_dict['model_settings'] = result._last_agent.model_settings.to_json_dict()

    # 3. All Requests Costs (the total is the sum)
    usage = []
    for request_usage in result.context_wrapper.usage.request_usage_entries:
        cached_input_tokens = request_usage.input_tokens_details.cached_tokens
        # `input_tokens` includes the cached ones; split them apart.
        regular_input_tokens = request_usage.input_tokens - cached_input_tokens
        output_tokens = request_usage.output_tokens

        usage.append(
            {
                "regular_input_tokens": regular_input_tokens,
                "cached_input_tokens": cached_input_tokens,
                "output_tokens": output_tokens,
            }
        )
    info_dict['usage'] = usage

    # 4. The complete Agentic Pipeline
    info_dict['to_input_list'] = result.to_input_list()

    return info_dict

### Run OpenAI models on All Claims

Here, simply change **model** name and run this subsection of the notebook again and again!

In [None]:
#| export
import asyncio
import random

In [None]:
#| export
# Index every example of `train_dev_filtered.jsonl` (one JSON object per line)
# by its `bird_id`; a later line with the same id overwrites an earlier one.
bird_id_to_example_dict = dict()

with open(config.bird_dir / 'train_dev_filtered.jsonl', 'r') as f:
    for line in f:
        parsed = json.loads(line)
        bird_id = parsed['bird_id']
        bird_id_to_example_dict[bird_id] = parsed
    
# Displays the number of indexed examples when run as a notebook cell.
len(bird_id_to_example_dict)

In [None]:
#| export
db_names = set(v['db_id'] for v in bird_id_to_example_dict.values())

In [None]:
#| export
# Load one `<db_name>_execute_sql` tool per database through `toolbox_client`, once, up front.
# `_tool_cache` holds the tools exactly as returned by `load_tool`;
# `tool_cache` holds the same tools wrapped with `function_tool`, which is what
# `return_coroutines` hands to `Agent(tools=[...])`.
_tool_cache = dict()
tool_cache = dict()

for db_name in db_names:
    tool = toolbox_client.load_tool(f"{db_name}_execute_sql")
    _tool_cache[db_name] = tool
    tool_cache[db_name] = function_tool(tool)

In [None]:
#| export
def return_coroutines(test_claims, model):
    """Prepare one not-yet-awaited agent run per claim.

    For every claim a fresh "Fact-Checker" agent is created, bound to the cached
    SQL tool of the claim's database (`claim['db_name']`), and `Runner.run` is
    called on the claim text plus its extra information without awaiting it.

    Returns:
        A `(runs, claim_ids)` pair of equally long lists in the order of
        `test_claims`, so results can be zipped back to their claim ids.
    """
    pending_runs, ids = [], []

    for claim in test_claims:
        agent = Agent(
            name="Fact-Checker",
            instructions=FACT_CHECKER_PROMPT_3SHOT,
            model=model,
            tools=[tool_cache[claim['db_name']]],
            output_type=ClaimVerdict,
        )
        prompt = f"Claim: {claim['claim']}\nExtra Information: {claim['extra_info']}"

        pending_runs.append(Runner.run(agent, prompt, max_turns=20))
        ids.append(claim['claim_id'])

    return pending_runs, ids

In [None]:
#| export

# TODO: tool returns exception without messing up the dependencies on 07b.

# Model under test: the last uncommented assignment wins.
#model = "gpt-5-mini"
#model = "gpt-4.1-nano"
model = "gpt-5-nano"  # immediately overridden by the next assignment
model = "gpt-4o-mini"

# Number of claims run concurrently per batch in `run_tests`.
batch_size = 100

# One JSONL results file per model; touch() makes sure it exists before it is
# opened for reading further down.
results_path = config.experiments_dir_pub / f"{model}.jsonl"
results_path.touch()

In [None]:
#| export
import asyncio

In [None]:
#| export
# Claim ids that already have a line in the results file (written by earlier runs).
with open(results_path, 'r') as f:
    already_tested = [json.loads(line)['claim_id'] for line in f]

# Set copy for constant-time membership checks; testing against the list made the
# filter below quadratic in the number of claims. `already_tested` itself stays a list.
# Assumes claim ids are hashable JSON scalars (strings or numbers).
_already_tested_ids = set(already_tested)

# Keep only the claims that have no recorded result yet, in file order.
benchmark = []
with open(config.final_benchmark_dir / 'test-public.jsonl') as f:
    for line in f:
        parsed_claim = json.loads(line)
        if parsed_claim['claim_id'] in _already_tested_ids: continue
        benchmark.append(parsed_claim)

In [None]:
len(benchmark)

In [None]:
#| export
async def run_tests():
    """Run the fact-checking agent over every remaining benchmark claim, batch by batch.

    Claims are taken from the module-level `benchmark` list in slices of
    `batch_size`. Each slice is run concurrently, and one JSON line per claim
    (`claim_id` merged with `run_result_to_dict(...)`) is appended to `results_path`.
    """
    for i in range(0, len(benchmark), batch_size):
        test_claims = benchmark[i:i+batch_size]

        cors, claim_ids = return_coroutines(test_claims, model)

        # return_exceptions=True: a failed run comes back as its exception instead
        # of aborting the whole batch; run_result_to_dict turns it into an error record.
        results = await asyncio.gather(*cors, return_exceptions=True)

        # Open the results file once per batch and close it deterministically.
        # The previous `results_path.open('a').write(...)` never closed its handle.
        with results_path.open('a') as f:
            for claim_id, res in zip(claim_ids, results):
                results_dict = {'claim_id': claim_id} | run_result_to_dict(res)
                f.write(json.dumps(results_dict) + '\n')

In [None]:
await run_tests()

In [None]:
#| export 
# IN_NOTEBOOK comes from nbdev (presumably True when running inside a notebook)
# and gates the script entry point. Fall back to False only when the import
# fails; a bare `except:` would also swallow KeyboardInterrupt and SystemExit.
try:
    from nbdev.imports import IN_NOTEBOOK
except ImportError:
    IN_NOTEBOOK = False

In [None]:
#| export
# Script entry point: runs only when the exported module is executed directly
# and IN_NOTEBOOK is false. Reports the remaining work and the model, then
# drives the batches.
if not IN_NOTEBOOK and __name__ == "__main__":
    print(f"#Exps Left: {len(benchmark)}", model, sep="\n")
    asyncio.run(run_tests())

In [None]:
x = "{\n  \"properties\": {\n    \"verdict\": {\n      \"description\": \"Whether the claim is supported, contradicted, or undecideable from the database.\",\n      \"enum\": [\n        \"ENTAILED\",\n        \"CONTRADICTED\",\n        \"NOT ENOUGH INFO\"\n      ],\n      \"title\": \"Verdict\",\n      \"type\": \"string\"\n    },\n    \"justification\": {\n      \"description\": \"Brief justification (1-2 sentences) of the verdict.\",\n      \"title\": \"Justification\",\n      \"type\": \"string\"\n    }\n  },\n  \"required\": [\n    \"verdict\",\n    \"justification\"\n  ],\n  \"title\": \"ClaimVerdict\",\n  \"type\": \"object\"\n}\n\n{\"verdict\": \"CONTRADICTED\", \"justification\": \"The average build up play speed for Heart of Midlothian is 59.6, not 72.0 as claimed.\"}"

In [None]:
print(x)

In [None]:
toolbox_client.close()

## End

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()