In [None]:
#| hide

%load_ext autoreload
%autoreload 2

# OpenAI & LiteLLM

> Here, we evaluate different LLM agents on our benchmark. We use proprietary models from Google (Gemini) and Anthropic, accessed through LiteLLM.

- skip_showdoc: true
- skip_exec: true

In [None]:
#| default_exp experiments_litellm

In [None]:
#| export
#| hide

import json
from claimdb.configuration import *
from claimdb.experiments import *
import asyncio
import random
import json
from agents import function_tool, Agent, Runner

## LiteLLM

### Extract JSON from text

Open source models are not good at following structured output instructions. Therefore, we need to extract the JSON part from the text output of the model.

In [None]:
#| export
import re

In [None]:
#| export
def extract_json(text: str) -> str:
    """Return the first JSON object embedded in `text`, as a string.

    Model answers often wrap the JSON in Markdown fences or surround it with
    prose. Every `{` in `text` is tried as a starting point and the first one
    from which a complete JSON value can be decoded is returned, so nested
    objects and `}` characters inside string values are handled correctly.

    If nothing decodes, the shortest `{...}` span is returned unchanged so
    that the downstream validator reports the actual parsing problem.

    Args:
        text: Raw text produced by the model.

    Returns:
        The substring of `text` holding the JSON object.

    Raises:
        ValueError: If `text` contains no `{...}` span at all.
    """
    decoder = json.JSONDecoder()
    for brace in re.finditer(r'\{', text):
        start = brace.start()
        try:
            # raw_decode tolerates trailing text and reports where the value ends.
            _, length = decoder.raw_decode(text[start:])
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        return text[start:start + length]

    # Nothing decodable: fall back to the shortest brace-delimited span.
    match = re.search(r'\{[\s\S]*?\}', text)
    if match is None:
        raise ValueError("No JSON object found in text.")
    return match.group()

In [None]:
test = '```json\n{\n  "verdict": "ENTAILED",\n  "justification": "The database query returned 236 users who created their accounts after 2013 and have more than 10 views, which matches the claim."\n}\n```'

In [None]:
extract_json(test)

In [None]:
test = "```json\n{\n  \"verdict\": \"NOT ENOUGH INFO\",\n  \"justification\": \"The database schema for the 'loan' table does not contain information regarding the type of collateral or security for the loans, making it impossible to determine if a loan was secured by a first-lien mortgage on the owner's primary residence.\"\n}\n```"

In [None]:
extract_json(test)

In [None]:
ClaimVerdict.model_validate_json(extract_json(test))

### Single Example Test

In [None]:
# Candidate models for the single-example test. Each assignment overwrites the
# previous one, so only the last line takes effect when the cell runs top to bottom.
model = "litellm/gemini/gemini-3-flash-preview"
model = "litellm/gemini/gemini-2.5-flash-lite"
model = "litellm/anthropic/claude-haiku-4-5"  # 1/5$
model = "litellm/anthropic/claude-3-haiku-20240307" 
model = "litellm/gemini/gemini-2.5-flash"

In [None]:
# Parse the public test split: one JSON-encoded claim per line.
with open(config.final_benchmark_dir / 'test-public.jsonl', "r") as f:
    all_claims = list(map(json.loads, f))

# Pick a single claim to experiment with.
claim = all_claims[5]

In [None]:
# Switch to the first claim whose database name contains "toxico".
# If no claim matches, the loop runs to completion and `claim` is left bound
# to the last element of `all_claims`.
for claim in all_claims:
    if "toxico" in claim['db_name']:
        break

In [None]:
claim

In [None]:
# Load the tool named "<db_name>_execute_sql" for the selected claim's database.
tool = toolbox_client.load_tool(f"{claim['db_name']}_execute_sql")

In [None]:
# Agent for the single-example test. Structured output (`output_type`) is left
# disabled, so the answer comes back as free text and the verdict JSON is
# extracted from it afterwards with `extract_json`.
fact_checker_agent = Agent(
    name="Fact-Checker",
    instructions=FACT_CHECKER_PROMPT_3SHOT,
    model=model,
    tools=[function_tool(tool)],
    #output_type=ClaimVerdict
)
    

In [None]:
# Prompt handed to the agent: the claim text followed by its extra information.
inp = f"Claim: {claim['claim']}\nExtra Information: {claim['extra_info']}"
#inp = f"Do you see what tools and metadata of tools you have?"

# Show the exact prompt before running the agent.
print(inp)

In [None]:
# Run the agent on the single prompt. Top-level `await` works here because the
# cell executes inside the notebook's event loop.
# NOTE(review): `max_turns=20` presumably caps the agent loop iterations - confirm against the SDK.
result = await Runner.run(
    fact_checker_agent, 
    inp, 
    max_turns=20
)

In [None]:
result.final_output

In [None]:
extract_json(result.final_output)

In [None]:
# Validate the JSON extracted from the agent's actual answer (rather than the
# canned `test` string used in the extraction examples).
ClaimVerdict.model_validate_json(extract_json(result.final_output))

### Function Definition

In [None]:
#| export

def run_result_to_dict_litellm(result, ollama=False) -> dict:
    """Flatten an agent run, or the exception it raised, into a plain dictionary.

    Args:
        result: Either the run result returned by the agent runner or the
            `Exception` instance collected in its place.
        ollama: When true, the agent's `model` attribute is treated as a
            wrapper object and its own `.model` attribute is recorded instead.

    Returns:
        A dictionary with the verdict, justification, raw final output, model
        name and settings, per-request token usage and the full input list.
        For an exception the record has empty fields plus `error`; when the
        final output cannot be parsed, `error` is "JSON Extraction Error." and
        verdict/justification stay empty; on success the `error` key is removed.
    """
    # A failed run is represented by the exception object itself.
    if isinstance(result, Exception):
        return {
            'verdict': "",
            'error': str(result),
            'justification': "",
            'model_name': "",
            'model_settings': "",
            'usage': [],
            'to_input_list': [],
        }

    # Placeholders first so the key order of the record stays fixed; they are
    # filled in (and `error` dropped) once the output has been parsed.
    record = {
        'verdict': "",
        'error': "",
        'justification': "",
        'final_output': result.final_output,
    }

    # Model identity and settings of the agent that produced the final output.
    last_agent = result._last_agent
    record['model_name'] = last_agent.model.model if ollama else last_agent.model
    record['model_settings'] = last_agent.model_settings.to_json_dict()

    # Token usage per request; the run total is the sum over these entries.
    record['usage'] = [
        {
            "regular_input_tokens": entry.input_tokens - entry.input_tokens_details.cached_tokens,
            "cached_input_tokens": entry.input_tokens_details.cached_tokens,
            "output_tokens": entry.output_tokens,
        }
        for entry in result.context_wrapper.usage.request_usage_entries
    ]

    # Complete agentic trace.
    record['to_input_list'] = result.to_input_list()

    # Pull the JSON out of the free-text answer and validate it.
    try:
        parsed = ClaimVerdict.model_validate_json(extract_json(result.final_output))
    except Exception:
        record['error'] = "JSON Extraction Error."
        return record

    record.pop('error')
    record['verdict'] = parsed.verdict
    record['justification'] = parsed.justification
    return record

### Run models on All Claims

Here, simply change the **model** name and re-run this subsection of the notebook once per model.

In [None]:
#| export
import asyncio
import random

In [None]:
#| export
# Index the records of `train_dev_filtered.jsonl` by their `bird_id`.
bird_id_to_example_dict = dict()

with open(config.bird_dir / 'train_dev_filtered.jsonl', 'r') as f:
    # One JSON object per line; a later record with the same id overwrites an earlier one.
    for line in f:
        parsed = json.loads(line)
        bird_id = parsed['bird_id']
        bird_id_to_example_dict[bird_id] = parsed
    
# Number of indexed examples (displayed when run as a notebook cell).
len(bird_id_to_example_dict)

In [None]:
#| export
def return_coroutines_litellm(test_claims, model):
    """Create one pending agent run per claim.

    Args:
        test_claims: Iterable of claim dictionaries; each must provide the
            keys `db_name`, `claim`, `extra_info` and `claim_id`.
        model: Model identifier handed to every agent.

    Returns:
        A pair `(runs, claim_ids)` of equally long lists: the objects returned
        by `Runner.run` (not awaited here) and the matching claim ids, in the
        order of `test_claims`.
    """
    def _start_run(claim):
        # Each claim gets its own agent, bound to the tool cached for its database.
        db_tool = tool_cache[claim['db_name']]
        checker = Agent(
            name="Fact-Checker",
            instructions=FACT_CHECKER_PROMPT_3SHOT,
            model=model,
            tools=[db_tool],
        )
        prompt = f"Claim: {claim['claim']}\nExtra Information: {claim['extra_info']}"
        return Runner.run(checker, prompt, max_turns=20)

    pending_runs, ordered_ids = [], []
    for claim in test_claims:
        pending_runs.append(_start_run(claim))
        ordered_ids.append(claim['claim_id'])

    return pending_runs, ordered_ids

In [None]:
#| export
# Model to evaluate: keep exactly one assignment uncommented.
#model = "litellm/gemini/gemini-2.5-flash-lite"
#model = "litellm/anthropic/claude-3-5-haiku-20241022"  # done!
model = "litellm/anthropic/claude-haiku-4-5"
#model = "litellm/anthropic/claude-3-haiku-20240307"  # done!
#model = "litellm/gemini/gemini-3-flash-preview"  # done!
#model = "litellm/gemini/gemini-2.5-flash" # done!

# Number of claims taken from `benchmark` per batch in `run_tests_litellm`.
batch_size = 1
# Seconds to pause after each batch in `run_tests_litellm`.
sleep_time = 15

# Results go to one JSONL file named after the last path segment of the model id.
filename = model.split("/")[-1]
results_path = config.experiments_dir_pub / f"{filename}.jsonl"
# Make sure the file exists so it can be opened for reading before any result is written.
results_path.touch()

In [None]:
#| export
import asyncio

In [None]:
#| export
# Claim ids already present in the results file; skipping them lets an
# interrupted run pick up where it stopped. Blank lines are ignored.
with open(results_path, 'r') as f:
    already_tested = [json.loads(line)['claim_id'] for line in f if line.strip()]

# Set copy of the ids so each membership test below is O(1) instead of a list scan.
_already_tested_ids = set(already_tested)

# Remaining claims of the public test split, in file order.
benchmark = []
with open(config.final_benchmark_dir / 'test-public.jsonl') as f:
    for line in f:
        if not line.strip(): continue
        parsed_claim = json.loads(line)
        if parsed_claim['claim_id'] in _already_tested_ids: continue
        benchmark.append(parsed_claim)

In [None]:
len(benchmark)

In [None]:
#| export
import time

In [None]:
#| export
async def run_tests_litellm():
    """Run the fact-checking agent over every pending claim and append the results to disk.

    Walks the module-level `benchmark` in slices of `batch_size` claims, runs
    each slice concurrently, appends one JSON line per claim to
    `results_path`, and then pauses for `sleep_time` seconds.

    Exceptions raised by individual runs are collected instead of propagated
    (`return_exceptions=True`) and handed to `run_result_to_dict_litellm`,
    which records them in the `error` field.
    """
    for i in range(0, len(benchmark), batch_size):
        test_claims = benchmark[i:i+batch_size]

        cors, claim_ids = return_coroutines_litellm(test_claims, model)

        results = await asyncio.gather(*cors, return_exceptions=True)

        # Persist the batch before pausing so an interruption during the pause
        # cannot lose finished work; the context manager closes the handle.
        with results_path.open('a') as out_file:
            for claim_id, res in zip(claim_ids, results):
                results_dict = {'claim_id': claim_id} | run_result_to_dict_litellm(res)
                out_file.write(json.dumps(results_dict) + '\n')

        # Pause between batches without blocking the event loop.
        await asyncio.sleep(sleep_time)

In [None]:
# Notebook-only driver: show which model is being evaluated, then run the whole
# benchmark (top-level `await` relies on the notebook's running event loop).
print(model)
print()
await run_tests_litellm()

In [None]:
#| export 
# Use nbdev's notebook flag when available; otherwise assume a plain script/module.
try:
    from nbdev.imports import IN_NOTEBOOK
except Exception:
    # Best-effort fallback. `Exception` (rather than a bare `except`) keeps
    # KeyboardInterrupt and SystemExit from being swallowed.
    IN_NOTEBOOK = False

In [None]:
#| export
# Script entry point for the exported module: runs only when executed directly
# and `IN_NOTEBOOK` is false.
if __name__ == "__main__" and not IN_NOTEBOOK:
    # Report how many claims are still pending and which model will be evaluated.
    print(f"#Bench Exps: {len(benchmark)}")
    print(model)
    print()
    # No event loop is running in a script, so start one for the benchmark run.
    asyncio.run(run_tests_litellm())

### No Final Output & JSON Extraction Error (with JSON output)

1. `"final_output": ""` -> This should never really happen; however, since we cannot rule out a `LiteLLM` failure, we delete these cases from the results and run them again.

2. `"error": "JSON Extraction Error."` but the output contains valid JSON-like content -> Here, we can try to extract the JSON ourselves and if successful, we can keep the case. If not, we delete and re-run.

## End

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()