In [1]:
from __future__ import annotations

import asyncio
import nest_asyncio

import csv
import time
import dotenv
from typing import List

import pandas as pd
from pydantic import BaseModel

from llm import create_agent, MODEL_DICT

In [2]:
# Load environment variables from a .env file, if one is found
# (presumably the API credentials the `llm` helpers need -- confirm).
dotenv.load_dotenv()

# to run async in jupyter notebook
# nest_asyncio patches asyncio so that asyncio.run() can be used while the
# notebook's own event loop is already running.
nest_asyncio.apply()

In [3]:
def load_headlines(path: str = "test_headlines.csv") -> pd.DataFrame:
    """Read the headline CSV and keep only the columns the benchmark needs.

    Args:
        path: Location of a CSV file that has at least an ``id`` and a
            ``title`` column. Any other columns are dropped.

    Returns:
        A DataFrame with exactly the columns ``["id", "title"]``, in that
        order, one row per CSV record.

    Raises:
        KeyError: If ``id`` or ``title`` is missing from the file.
    """
    required = ["id", "title"]
    df = pd.read_csv(path)

    # Fail with a message that names the missing columns and the file,
    # instead of the generic pandas "not in index" error.
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{path}: missing required column(s) {missing}")

    return df[required]

# Load the headlines once at notebook scope; the bare expression on the last
# line makes the notebook render the frame as this cell's output.
input_df = load_headlines()
input_df


Unnamed: 0,id,title
0,0,GitHub will be folded into Microsoft proper as...
1,10,"With new in-house models, Microsoft lays the g..."
2,16,Google improves Gemini AI image editing with “...
3,20,Google warns that mass data theft hitting Sale...
4,23,AI Wants More Data. More Chips. More Real Esta...
...,...,...
244,758,"Reframing Jensen’s Law: ‘Buy more, make more’ ..."
245,759,Zeta Global (ZETA) Target Raised by Goldman as...
246,763,Luis Enrique names his squad to face Toulouse
247,773,CorelDRAW Graphics Suite 2025 v26.2.0.170


In [4]:
def load_ground_truth(path: str = "headline_classifier_ground_truth.csv") -> dict[str, bool]:
    """Load the labelled headlines into a ``{title: is_ai}`` lookup.

    The CSV must have an ``input`` column (headline text) and an ``output``
    column (label). Both values are stripped of surrounding whitespace; the
    label is ``True`` only when the stripped text is exactly ``"True"``.
    If the same title appears more than once, the last row wins.

    Args:
        path: Location of the ground-truth CSV file.

    Returns:
        Mapping from stripped headline text to its expected boolean label.

    Raises:
        KeyError: If a row lacks the ``input`` or ``output`` column.
    """
    truth: dict[str, bool] = {}
    # Headlines contain non-ASCII punctuation (curly quotes etc.), so decode
    # explicitly rather than relying on the platform default encoding;
    # "utf-8-sig" also accepts files that start with a BOM. newline="" is what
    # the csv module requires so quoted fields with embedded newlines parse
    # correctly.
    with open(path, newline="", encoding="utf-8-sig") as f:
        for row in csv.DictReader(f):
            truth[row["input"].strip()] = row["output"].strip() == "True"
    return truth

# NOTE(review): despite the `_df` suffix, load_ground_truth() returns a plain
# dict mapping title -> bool, not a DataFrame.
truth_df = load_ground_truth()
truth_df


{'GitHub will be folded into Microsoft proper as CEO steps down': False,
 'With new in-house models, Microsoft lays the groundwork for independence from OpenAI': True,
 'Google improves Gemini AI image editing with “nano banana” model': True,
 'Google warns that mass data theft hitting Salesloft AI agent has grown bigger': True,
 'AI Wants More Data. More Chips. More Real Estate. More Power. More Water. More Everything': True,
 'Opinion: China Just Got a Big Leg Up in the AI Race': True,
 'Jack Ma-Backed Ant’s Profit Dives 60% After AI, Global Expansion': True,
 'Dell Falls After Reporting Tighter Profit Margins on Servers': False,
 'AI Billionaire Lucy Guo Pushes Into Crowded Social Media FieldScale AI co-founder’s new venture, Passes, confronts established rivals and lawsuits': True,
 'Alibaba Shows Progress in China AI Push, Lifting Shares': True,
 'Vercel Triples Valuation to $9 Billion With Accel Investment': False,
 'Bain Is Said to Draw Chinese Bidders for $4 Billion Data Center

In [5]:
# One classification verdict for a single headline.
# (Documented with comments rather than a docstring on purpose: a docstring on
# a pydantic model is emitted as the schema "description".)
class FilterResult(BaseModel):
    id: int  # identifier of the headline this verdict belongs to
    value: bool  # the boolean label for that headline


# Structured-output container: the list of per-headline verdicts for one
# request. run_benchmark passes this class as `output_type` and refers to the
# field by name ("results_list").
class FilterResultList(BaseModel):
    results_list: List[FilterResult]  # one FilterResult per headline
    

In [6]:
def extract_usage_anthropic(raw) -> tuple[int, int]:
    """Pull the (input, output) token counts off a response object.

    Reads ``raw.usage.input_tokens`` and ``raw.usage.output_tokens``. If any
    of those attributes is absent, the counts are reported as ``(0, 0)``
    rather than raising.
    """
    try:
        usage = raw.usage
        prompt_tokens = usage.input_tokens
        completion_tokens = usage.output_tokens
    except AttributeError:
        # Best effort: a response without usage data simply counts as zero.
        return 0, 0
    return prompt_tokens, completion_tokens
        

In [7]:
# Benchmark configuration. Exactly one MODEL_ID assignment is active; the
# commented-out lines are alternative ids kept for quick switching.
# MODEL_ID = "claude-sonnet-4-5-20250929"
MODEL_ID = "claude-opus-4-6"
# MODEL_ID = "claude-haiku-4-5-20251001"
# Model descriptor looked up by id; run_benchmark reads .display_name and
# .model_id from it.
MODEL = MODEL_DICT[MODEL_ID]
# Function applied to every raw LLM response to obtain (input, output) tokens.
USAGE_FN = extract_usage_anthropic
# NOTE(review): passed straight through to create_agent; what -1 means is
# defined in the `llm` module, not here -- confirm.
REASONING_EFFORT = -1
# Passed as chunk_size to agent.filter_dataframe.
CHUNK_SIZE = 25

# System prompt: fixes the output contract (one element per input id, boolean
# value, same order, schema-only JSON). The trailing backslash after the
# opening quotes suppresses the leading newline.
SYSTEM_PROMPT = """\
You are a content-classification assistant that labels news headlines as AI-related or not.
You will receive a list of JSON objects with fields "id" and "title"
Return **only** a JSON object that satisfies the provided schema.
For each headline provided, you MUST return one element with the same id, and a boolean value; do not skip any items.
Return elements in the same order they were provided.
No markdown, no markdown fences, no extra keys, no comments."""
# User prompt: the labelling criteria followed by an {input_text} placeholder.
# NOTE(review): the placeholder is presumably filled in by the agent with the
# headlines of each chunk -- confirm in the `llm` module.
USER_PROMPT = """\
Classify every headline below.

AI-related if the title mentions (explicitly or implicitly):
- Core AI technologies: machine learning, neural / deep / transformer networks
- AI Applications: computer vision, NLP, robotics, autonomous driving, generative media
- AI hardware, GPU chip supply, AI data centers and infrastructure
- Companies or labs known for AI: OpenAI, DeepMind, Anthropic, xAI, NVIDIA, etc.
- AI models & products: ChatGPT, Gemini, Claude, Sora, Midjourney, DeepSeek, etc.
- New AI products and AI integration into existing products/services
- AI policy / ethics / safety / regulation / analysis
- Research results related to AI
- AI industry figures (Sam Altman, Demis Hassabis, etc.)
- AI market and business developments, funding rounds, partnerships centered on AI
- Any other news with a significant AI component

Non-AI examples: crypto, ordinary software, non-AI gadgets and medical devices, and anything else.
Input:
{input_text}"""


In [8]:
def _score_predictions(pairs, truth):
    """Tally (title, predicted) pairs against the ground-truth mapping.

    Args:
        pairs: Iterable of ``(title, predicted)`` tuples; ``title`` is the
            stripped headline text and ``predicted`` a bool.
        truth: Mapping of headline text to the expected boolean label.

    Returns:
        A ``(counts, errors, unmatched)`` tuple where ``counts`` is a dict
        with keys ``tp``/``fp``/``fn``/``tn``, ``errors`` is a list of
        ``(title[:80], detail)`` for misclassified headlines, and
        ``unmatched`` is a list of ``(title, predicted)`` for titles that do
        not appear in ``truth`` (those are excluded from the counts).
    """
    counts = {"tp": 0, "fp": 0, "fn": 0, "tn": 0}
    errors = []
    unmatched = []
    for title, predicted in pairs:
        expected = truth.get(title)
        if expected is None:
            unmatched.append((title, predicted))
            continue
        if predicted:
            counts["tp" if expected else "fp"] += 1
        else:
            counts["fn" if expected else "tn"] += 1
        if predicted != expected:
            errors.append((title[:80], f"predicted={predicted}, expected={expected}"))
    return counts, errors, unmatched


async def run_benchmark():
    """Classify every headline with the configured model and print a report.

    Loads the headlines and ground truth, builds an agent from the
    module-level configuration (MODEL, prompts, REASONING_EFFORT, CHUNK_SIZE),
    runs ``agent.filter_dataframe`` and prints accuracy, token usage, wall
    time, a confusion matrix and the list of misclassified headlines.
    Headlines whose title is not in the ground truth are reported and left
    out of the statistics. Returns ``None``; all results go to stdout.
    """
    df = load_headlines()
    truth = load_ground_truth()

    model = MODEL
    agent = create_agent(
        model=model,
        system_prompt=SYSTEM_PROMPT,
        user_prompt=USER_PROMPT,
        output_type=FilterResultList,
        reasoning_effort=REASONING_EFFORT,
    )

    # Patch _call_llm to track token usage: every raw response is passed
    # through USAGE_FN and the counts are accumulated across calls.
    total_input_tokens = 0
    total_output_tokens = 0
    original_call = agent._call_llm

    async def tracked_call(system, user, output_schema):
        nonlocal total_input_tokens, total_output_tokens
        raw = await original_call(system, user, output_schema)
        inp, out = USAGE_FN(raw)
        total_input_tokens += inp
        total_output_tokens += out
        return raw

    agent._call_llm = tracked_call

    print(f"Model:            {model.display_name} ({model.model_id})")
    print(f"Reasoning effort: {REASONING_EFFORT}")
    print(f"Headlines:        {len(df)}")
    print(f"Chunk size:       {CHUNK_SIZE}")
    print()

    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments during the run.
    start = time.perf_counter()
    is_ai = await agent.filter_dataframe(
        df=df,
        chunk_size=CHUNK_SIZE,
        value_field="value",
        item_list_field="results_list",
        item_id_field="id",
    )
    elapsed = time.perf_counter() - start

    # Pair each headline with its prediction by POSITION. `.iloc` is
    # positional, so it must not be fed the frame's index labels (they only
    # coincide with positions for a default RangeIndex).
    pairs = [
        (title.strip(), bool(is_ai.iloc[pos]))
        for pos, title in enumerate(df["title"])
    ]
    counts, errors, unmatched = _score_predictions(pairs, truth)

    for title, predicted in unmatched:
        print(f"  Not present in ground truth: predicted={predicted}, title={title[:80]}")

    tp, fp, fn, tn = counts["tp"], counts["fp"], counts["fn"], counts["tn"]
    total = tp + fp + fn + tn
    correct = tp + tn
    accuracy = correct / total if total else 0

    print("--- Results ---")
    print(f"Accuracy:      {accuracy:.1%} ({correct}/{total})")
    print(f"Input tokens:  {total_input_tokens:,}")
    print(f"Output tokens: {total_output_tokens:,}")
    print(f"Total tokens:  {total_input_tokens + total_output_tokens:,}")
    print(f"Wall time:     {elapsed:.1f}s")
    print()

    # Confusion matrix in % terms
    print(f"--- Confusion Matrix (% of {total}) ---")
    if total:
        print("                  Predicted AI   Predicted Not-AI")
        print(f"  Actual AI       {tp/total:>10.1%}       {fn/total:>10.1%}")
        print(f"  Actual Not-AI   {fp/total:>10.1%}       {tn/total:>10.1%}")
    else:
        # Nothing was scored, so percentages are undefined; avoid dividing by zero.
        print("  (no headlines matched the ground truth)")
    print()

    if errors:
        print(f"--- Misclassifications ({len(errors)}) ---")
        for title, detail in errors:
            print(f"  {title}  [{detail}]")



In [9]:
# Drive the benchmark coroutine to completion from the notebook cell.
# NOTE(review): the recorded output of this cell shows
# "RuntimeError: cannot enter context ... is already entered" raised in an
# asyncio callback. IPython supports top-level `await run_benchmark()`, which
# may avoid that -- confirm before changing.
asyncio.run(run_benchmark())
    

Exception in callback Task.__step()
handle: <Handle Task.__step()>
Traceback (most recent call last):
  File "/opt/anaconda3/envs/py/lib/python3.11/asyncio/events.py", line 84, in _run
    self._context.run(self._callback, *self._args)
RuntimeError: cannot enter context: <_contextvars.Context object at 0x105c61440> is already entered


Model:            Claude Sonnet 4.5 (claude-opus-4-6)
Reasoning effort: -1
Headlines:        249
Chunk size:       25

--- Results ---
Accuracy:      100.0% (249/249)
Input tokens:  20,416
Output tokens: 3,099
Total tokens:  23,515
Wall time:     4.4s

--- Confusion Matrix (% of 249) ---
                  Predicted AI   Predicted Not-AI
  Actual AI            54.2%             0.0%
  Actual Not-AI         0.0%            45.8%

