In [1]:
#| hide

%load_ext autoreload
%autoreload 2

# Visualize Private Test

> Here, we see experiment results!

- skip_showdoc: true
- skip_exec: true

In [2]:
#| default_exp visualize_private

In [None]:
#| hide

import json
from claimdb.configuration import *
from claimdb.transformation import *

In [4]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.patches import Patch
import seaborn as sns

# Global matplotlib styling for every figure produced by this notebook:
# larger fonts for labels/ticks/legend, bold titles, and regular-weight math text.
plt.rcParams.update({
    'font.size': 14.0,
    'axes.labelsize': 22,
    'axes.titlesize': 22,
    'xtick.labelsize': 20,
    'ytick.labelsize': 20,
    'legend.fontsize': 18,
    'axes.titleweight': 'bold',
    'mathtext.default': 'regular',
})

## Re-run Bad Cases

Here, we re-run cases where we suspect the `LiteLLM` framework failed as an intermediary. Also, since we use custom logic for structured-output extraction (we don't use the official APIs), we need to be extra careful and re-run cases where outside factors, not the LLM, clearly produced the error (this is rare but can happen).

In [None]:
#| export
#| hide

import json
from claimdb.configuration import config

In [13]:
# Model whose private-split log file is inspected and rewritten by the cells below.
model = 'ministral-3:3b'

# Path of that model's `.jsonl` log inside the private experiments directory.
fix_path = config.experiments_dir_priv / f"{model}.jsonl"

In [18]:
# Substrings searched for in the *raw* JSONL line (not only the parsed `error`
# field); a hit means the entry is set aside for a re-run.
RERUN_LINE_MARKERS = (
    "(SQLITE_BUSY)",
    '"An error occurred while running the tool. Please try again. Error: "',
)

# Values of the `error` field that must match exactly.
RERUN_EXACT_ERRORS = (
    "JSON Extraction Error.",
    "Connection error.",
)

# Fragments that only need to occur somewhere inside the `error` field.
RERUN_ERROR_FRAGMENTS = (
    "status_code: ",
    "Exceeded maximum retries (20) for output",
    "The next tool call(s) would exceed the tool_calls_limit",
    "Max turns (20) exceeded",
    "Error code: ",
    "exceeded max retries count of 10",
)


def needs_rerun(line, parsed):
    """Return True when a log entry matches one of the re-run patterns.

    Args:
        line: The raw JSONL line the entry was parsed from.
        parsed: The entry decoded with `json.loads(line)`.

    Returns:
        True if the raw line contains one of RERUN_LINE_MARKERS, or if the entry
        has an `error` field that equals one of RERUN_EXACT_ERRORS or contains
        one of RERUN_ERROR_FRAGMENTS; False otherwise.
    """
    if any(marker in line for marker in RERUN_LINE_MARKERS):
        return True
    if "error" not in parsed:
        return False
    error = parsed['error']
    return error in RERUN_EXACT_ERRORS or any(
        fragment in error for fragment in RERUN_ERROR_FRAGMENTS
    )


correct_experiments = []  # entries that are kept
errs = []                 # entries set aside for a re-run
total = 0                 # number of lines read, duplicates included

# A set gives O(1) duplicate detection (the list used previously was O(n) per line).
seen = set()

with open(fix_path, 'r') as f:
    for line in f:
        total += 1
        parsed = json.loads(line)
        claim_id = parsed['claim_id']

        # Only the first occurrence of a claim is considered; later ones are dropped.
        if claim_id in seen:
            print("Duplicate:", claim_id)
            continue
        seen.add(claim_id)

        #if 'all_messages' in parsed: del parsed['all_messages']

        if needs_rerun(line, parsed):
            errs.append(parsed)
        else:
            correct_experiments.append(parsed)

survived = len(correct_experiments)

In [19]:
# Report the entries actually set aside in `errs`. `total - survived` would also
# count duplicate lines, which are dropped rather than collected in `errs`.
print(f"Total: {total}, Survived: {survived}, Will Re-Run: {len(errs)}")

Total: 1000, Survived: 992, Will Re-Run: 8


In [20]:
import random 

# Shuffle the surviving entries in place before they are written back to disk.
# No seed is set, so the resulting order differs from run to run.
random.shuffle(correct_experiments)

In [21]:
# Overwrite the log file so it contains only the surviving entries,
# serialised as one JSON object per line.
with open(fix_path, 'w') as f:
    f.writelines(json.dumps(entry) + '\n' for entry in correct_experiments)

## Results

### High-Level Overview

In [None]:
#| export
# Public test split and the claim ids it contains.
with open(config.final_benchmark_dir / 'test-public.jsonl', "r") as f:
    pub_test = list(map(json.loads, f))
pub_ids = [claim['claim_id'] for claim in pub_test]

# Training split.
with open(config.final_benchmark_dir / 'train.jsonl', "r") as f:
    train = list(map(json.loads, f))

# this will fail -- not available publicly
with open(config.final_benchmark_dir / 'test-private-with-labels.jsonl', "r") as f:
    priv_test = list(map(json.loads, f))
priv_ids = [claim['claim_id'] for claim in priv_test]

# All claims from the three splits, indexed by claim id
# (if an id occurs more than once, the last occurrence wins).
all_claims = [*pub_test, *train, *priv_test]
claim_map = dict((claim['claim_id'], claim) for claim in all_claims)

#### Load

In [131]:
#| export

def _count_tool_calls(log):
    """Count the tool calls recorded in one experiment log entry.

    Returns None when the entry carries an `error` key. Otherwise the count is
    taken from `to_input_list` (entries with `model_settings`, i.e. OpenAI's SDK)
    or from the `tool-call` parts of response messages in `all_messages`
    (entries without `model_settings`, i.e. Pydantic's Agents).
    """
    if "error" in log:
        return None

    if "model_settings" in log:
        return sum(
            1
            for item in log['to_input_list']
            if item.get('type', None) == 'function_call'
        )

    return sum(
        1
        for message in log.get('all_messages', [])
        if message.get('kind') == 'response'
        for part in message.get('parts', [])
        if part.get('part_kind') == 'tool-call'
    )


def _sum_usage_dicts(usage_logs):
    """Add up every value of every usage dict in `usage_logs`."""
    return sum(sum(usage_dict.values()) for usage_dict in usage_logs)


def _count_tokens(log, model_name):
    """Total the token usage recorded in one experiment log entry.

    Raises:
        KeyError: for `gpt-5-nano` entries that have no `usage` field.
    """
    # NOTE: gpt-5-nano on private does not have "model_settings" but has "usage",
    # so its top-level usage list always takes precedence.
    if model_name == 'gpt-5-nano':
        return _sum_usage_dicts(log['usage'])

    if "model_settings" in log:  # Means we are in OpenAI's SDK
        return _sum_usage_dicts(log['usage']) if 'usage' in log else 0

    # Means we are in Pydantic's Agents: usage is stored per message and may
    # contain non-integer values, which are ignored.
    return sum(
        value
        for msg in log.get('all_messages', [])
        if 'usage' in msg
        for value in msg['usage'].values()
        if isinstance(value, int)
    )


def load_df(model_name, split):
    """Load one model's experiment log for a split into a DataFrame.

    Each claim id of the split contributes exactly one row. The first log line
    for a claim is used; later lines for the same claim and lines for claims
    outside the split are ignored. Claims of the split with no log line get a
    row whose verdict is 'MISSING'.

    Args:
        model_name: Model identifier; the log is read from `<model_name>.jsonl`.
        split: Either 'private' or 'public'.

    Returns:
        A DataFrame with the columns claim_id, verdict, ground_truth, category,
        db_name, tool_calls, tokens and correct, or None when the log file does
        not exist.

    Raises:
        ValueError: if `split` is neither 'private' nor 'public'.
    """
    if split == 'private':
        path = config.experiments_dir_priv / f"{model_name}.jsonl"
        ids = priv_ids
    elif split == 'public':
        path = config.experiments_dir_pub / f"{model_name}.jsonl"
        ids = pub_ids
    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError(f"Unknown split {split!r}; expected 'private' or 'public'.")

    if not os.path.exists(path):
        return None

    # Claim ids of the split that have not been matched by a log line yet.
    # A set keeps the membership test O(1) without touching the shared id list.
    remaining = set(ids)
    predicted_results = []

    with open(path, "r") as f:
        for line in f:
            log = json.loads(line)
            claim_id = log['claim_id']
            if claim_id not in remaining:
                continue
            remaining.discard(claim_id)

            claim = claim_map[claim_id]
            predicted_results.append({
                'claim_id': claim_id,
                'verdict': log['verdict'],
                'ground_truth': claim['label'],
                'category': claim.get('category', None),
                'db_name': claim['db_name'],
                'tool_calls': _count_tool_calls(log),
                'tokens': _count_tokens(log, model_name),
            })

    # Claims without any log line are reported as MISSING, in split order.
    for missing_id in ids:
        if missing_id not in remaining:
            continue
        claim = claim_map[missing_id]
        predicted_results.append({
            'claim_id': missing_id,
            'verdict': 'MISSING',
            'ground_truth': claim['label'],
            'category': claim.get('category', None),
            'db_name': claim['db_name'],
            'tool_calls': None,
            'tokens': 0,
        })

    df = pd.DataFrame(predicted_results)
    df['correct'] = df['verdict'] == df['ground_truth']

    return df

### Analyze

In [76]:
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    confusion_matrix
)

def find_statistics(df):
    """Compute classification metrics for one model's results frame.

    Args:
        df: DataFrame with the columns `ground_truth`, `verdict` and `tokens`.

    Returns:
        A tuple `(acc, metrics, macro_f1, cm, total_tokens)`: overall accuracy,
        a dict mapping each of the three labels to its precision / recall / f1 /
        support, the unweighted mean of the three per-label F1 scores, the
        confusion matrix restricted to those labels, and the sum of `tokens`.
    """
    class_names = ["ENTAILED", "CONTRADICTED", "NOT ENOUGH INFO"]
    y_true = df["ground_truth"]
    y_pred = df["verdict"]

    # Per-class scores; classes that are never predicted score 0 instead of warning.
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true,
        y_pred,
        labels=class_names,
        zero_division=0
    )

    metrics = {}
    for name, p, r, f, s in zip(class_names, precision, recall, f1, support):
        metrics[name] = {
            "precision": p,
            "recall": r,
            "f1": f,
            "support": s,
        }

    return (
        accuracy_score(y_true, y_pred),
        metrics,
        f1.mean(),  # macro-F1 over the three labels
        confusion_matrix(y_true, y_pred, labels=class_names),
        df['tokens'].sum(),
    )

### Latex Prints

In [None]:
def format_millions(n: int | float) -> str:
    """Format as whole millions (rounded, no decimals).

    Examples:
      9_300_000   -> '9M'
      11_300_000  -> '11M'
      12_800_000  -> '13M'
      300_000     -> '0M'
    """
    m = int(round(float(n) / 1_000_000))
    return f"{m}M"


In [134]:
def print_latex_tab(model, acc, metrics, macro_f1, total_tokens, priv=False):
    """Print one LaTeX table row for a model.

    Args:
        model: Model identifier, typeset with `\\texttt`.
        acc: Overall accuracy.
        metrics: Per-label dict with `precision`, `recall` and `f1` for the
            labels ENTAILED, CONTRADICTED and NOT ENOUGH INFO.
        macro_f1: Macro-averaged F1.
        total_tokens: Token total, printed in whole millions.
        priv: When true, only macro-F1, accuracy and tokens are printed;
            otherwise the per-label precision, recall and F1 columns come first.
    """
    ent, contr, nei = (
        metrics[label] for label in ('ENTAILED', 'CONTRADICTED', 'NOT ENOUGH INFO')
    )

    pieces = [f"\\texttt{{{model}}}\n"]

    if not priv:
        # Precision and recall are greyed out; F1 is printed plainly.
        pieces.append(
            f"& \\gray{{ {ent['precision']:.3f} }} & \\gray{{ {contr['precision']:.3f} }} & \\gray{{ {nei['precision']:.3f} }}\n"
        )
        pieces.append(
            f"& \\gray{{ {ent['recall']:.3f} }} & \\gray{{ {contr['recall']:.3f} }} & \\gray{{ {nei['recall']:.3f} }}\n"
        )
        pieces.append(
            f"& {ent['f1']:.3f} & {contr['f1']:.3f} & {nei['f1']:.3f}\n"
        )

    # Summary columns shared by both layouts.
    pieces.append(
        f"& {macro_f1:.3f} & {acc:.3f} & {format_millions(total_tokens)} \\\\"
    )

    print("".join(pieces))

### Load Up

In [135]:
# Model identifiers to evaluate. Each one is passed to `load_df`, which reads the
# log file `<identifier>.jsonl`; `info_dict` below is keyed by the same identifiers.
models = [
    'gpt-4o-mini',
    'gpt-4.1-nano',
    'gpt-5-nano',
    'gpt-5-mini',
    'gpt-oss:20b',
    'gemini-2.5-flash',
    'gemini-3-flash-preview',
    'claude-3-haiku-20240307',
    'claude-3-5-haiku-20241022',
    'claude-haiku-4-5',
    'qwen3:1.7b',
    'qwen3:4b',
    'qwen3:8b',
    'qwen3:14b',
    'qwen3:32b',
    'qwen3-coder:30b',
    'ministral-3:3b',
    'ministral-3:8b',
    'ministral-3:14b',
    'mistral-nemo:12b',
    'mistral-small:22b',
    'magistral:24b',
    'devstral:24b',
    'devstral-small-2:24b',
    'nemotron-3-nano:30b',
    'llama3.1:8b',
    'llama3.2:3b',
    'cogito:14b',
    'cogito:32b',
    'qwq:32b',
]

# Per-model metadata, keyed by the identifiers used in `models`.
#   name      - model name without the size tag
#   params    - a number, or None for some models
#               (presumably parameter count in billions -- TODO confirm)
#   creator   - organisation that released the model
#   date_pub  - 'YYYY-MM-DD' string (presumably the date of the public-split run -- TODO confirm)
#   date_priv - 'YYYY-MM-DD' string (presumably the date of the private-split run -- TODO confirm)
info_dict = {
    'gpt-4o-mini': {'name': 'gpt-4o-mini', 'params': None, 'creator': 'OpenAI', 'date_pub': '2025-12-31', 'date_priv': '2026-01-15'},
    'gpt-4.1-nano': {'name': 'gpt-4.1-nano', 'params': None, 'creator': 'OpenAI', 'date_pub': '2025-12-30', 'date_priv': '2026-01-11'},
    'gpt-5-nano': {'name': 'gpt-5-nano', 'params': None, 'creator': 'OpenAI', 'date_pub': '2026-01-04', 'date_priv': '2026-01-15'},
    'gpt-5-mini': {'name': 'gpt-5-mini', 'params': None, 'creator': 'OpenAI', 'date_pub': '2025-12-20', 'date_priv': '2026-01-14'},
    'gpt-oss:20b': {'name': 'gpt-oss', 'params': 20, 'creator': 'OpenAI', 'date_pub': '2025-12-20', 'date_priv': '2026-01-14'},
    'gemini-2.5-flash': {'name': 'gemini-2.5-flash', 'params': None, 'creator': 'Google', 'date_pub': '2025-12-28', 'date_priv': '2026-01-11'},
    'gemini-3-flash-preview': {'name': 'gemini-3-flash', 'params': None, 'creator': 'Google', 'date_pub': '2025-12-26', 'date_priv': '2026-01-14'},
    'claude-3-haiku-20240307': {'name': 'claude-3-haiku', 'params': None, 'creator': 'Anthropic', 'date_pub': '2026-01-05', 'date_priv': '2026-01-18'},
    'claude-3-5-haiku-20241022': {'name': 'claude-3-5-haiku', 'params': None, 'creator': 'Anthropic', 'date_pub': '2025-12-28', 'date_priv': '2026-01-16'},
    'claude-haiku-4-5': {'name': 'claude-haiku-4-5', 'params': None, 'creator': 'Anthropic', 'date_pub': '2025-12-21', 'date_priv': '2026-01-16'},
    'qwen3:1.7b': {'name': 'qwen3', 'params': 1.7, 'creator': 'Alibaba', 'date_pub': '2026-01-06', 'date_priv': '2026-01-08'},
    'qwen3:4b': {'name': 'qwen3', 'params': 4, 'creator': 'Alibaba', 'date_pub': '2025-12-23', 'date_priv': '2026-01-09'},
    'qwen3:8b': {'name': 'qwen3', 'params': 8, 'creator': 'Alibaba', 'date_pub': '2026-01-05', 'date_priv': '2026-01-09'},
    'qwen3:14b': {'name': 'qwen3', 'params': 14, 'creator': 'Alibaba', 'date_pub': '2026-01-06', 'date_priv': '2026-01-14'},
    'qwen3:32b': {'name': 'qwen3', 'params': 32, 'creator': 'Alibaba', 'date_pub': '2026-01-06', 'date_priv': '2026-01-15'},
    'qwen3-coder:30b': {'name': 'qwen3-coder', 'params': 30, 'creator': 'Alibaba', 'date_pub': '2026-01-06', 'date_priv': '2026-01-15'},
    'ministral-3:3b': {'name': 'ministral-3', 'params': 3, 'creator': 'Mistral AI', 'date_pub': '2026-01-06', 'date_priv': '2026-01-11'},
    'ministral-3:8b': {'name': 'ministral-3', 'params': 8 , 'creator': 'Mistral AI', 'date_pub': '2026-01-06', 'date_priv': '2026-01-15'},
    'ministral-3:14b': {'name': 'ministral-3', 'params': 14, 'creator': 'Mistral AI', 'date_pub': '2026-01-01', 'date_priv': '2026-01-17'},
    'mistral-small:22b': {'name': 'mistral-small', 'params': 22, 'creator': 'Mistral AI', 'date_pub': '2025-12-20', 'date_priv': '2026-01-19'},
    'magistral:24b': {'name': 'magistral', 'params': 24, 'creator': 'Mistral AI', 'date_pub': '2026-01-05', 'date_priv': '2026-01-19'},
    'mistral-nemo:12b': {'name': 'mistral-nemo', 'params': 12, 'creator': 'Mistral AI', 'date_pub': '2026-01-05', 'date_priv': '2026-01-18'},
    'devstral:24b': {'name': 'devstral', 'params': 24, 'creator': 'Mistral AI', 'date_pub': '2026-01-01', 'date_priv': '2026-01-09'},
    'devstral-small-2:24b': {'name': 'devstral-small-2', 'params': 24, 'creator': 'Mistral AI', 'date_pub': '2026-01-04', 'date_priv': '2026-01-14'},
    'nemotron-3-nano:30b': {'name': 'nemotron-3-nano', 'params': 30, 'creator': 'NVIDIA', 'date_pub': '2026-01-02', 'date_priv': '2026-01-08'},
    'llama3.1:8b': {'name': 'llama3.1', 'params': 8, 'creator': 'Meta', 'date_pub': '2025-12-22', 'date_priv': '2026-01-14'},
    'llama3.2:3b': {'name': 'llama3.2', 'params': 3, 'creator': 'Meta', 'date_pub': '2025-12-29', 'date_priv': '2026-01-11'},
    'cogito:14b': {'name': 'cogito', 'params': 14, 'creator': 'Deep Cogito', 'date_pub': '2025-12-30', 'date_priv': '2026-01-18'},
    'cogito:32b': {'name': 'cogito', 'params': 32, 'creator': 'Deep Cogito', 'date_pub': '2025-12-24', 'date_priv': '2026-01-14'},
    'qwq:32b': {'name': 'qwq', 'params': 32, 'creator': 'Alibaba', 'date_pub': '2026-01-05', 'date_priv': '2026-01-17'},
}

In [136]:
import tqdm 

In [138]:

model_results = {}
model_dfs = []

# Splits to report. Use ['public'] or ['public', 'private'] to cover the other split(s).
for split in ['private']:
    for model in models:
        df = load_df(model, split)

        # Only models that have a log file for this split are reported.
        if df is not None:
            df['model'] = model
            model_dfs.append(df)

            acc, metrics, macro_f1, cm, total_tokens = find_statistics(df)

            # The private split prints the short row (macro-F1, accuracy, tokens).
            print_latex_tab(model, acc, metrics, macro_f1, total_tokens, split=='private')
            print()

# One frame holding the rows of every loaded model, tagged by the `model` column.
df = pd.concat(model_dfs)

\texttt{gpt-5-nano}
& 0.791 & 0.793 & 20M \\

\texttt{gpt-5-mini}
& 0.828 & 0.828 & 19M \\

\texttt{gpt-oss:20b}
& 0.759 & 0.763 & 14M \\

\texttt{gemini-2.5-flash}
& 0.761 & 0.758 & 8M \\

\texttt{gemini-3-flash-preview}
& 0.807 & 0.805 & 12M \\

\texttt{claude-3-5-haiku-20241022}
& 0.671 & 0.673 & 11M \\

\texttt{claude-haiku-4-5}
& 0.799 & 0.792 & 35M \\

\texttt{qwen3:1.7b}
& 0.231 & 0.363 & 2M \\

\texttt{qwen3:4b}
& 0.461 & 0.493 & 11M \\

\texttt{qwen3:8b}
& 0.493 & 0.529 & 10M \\

\texttt{qwen3:14b}
& 0.465 & 0.506 & 9M \\

\texttt{qwen3:32b}
& 0.539 & 0.558 & 9M \\

\texttt{qwen3-coder:30b}
& 0.685 & 0.686 & 22M \\

\texttt{ministral-3:3b}
& 0.332 & 0.369 & 15M \\

\texttt{ministral-3:8b}
& 0.543 & 0.556 & 16M \\

\texttt{ministral-3:14b}
& 0.618 & 0.618 & 23M \\

\texttt{mistral-nemo:12b}
& 0.345 & 0.354 & 8M \\

\texttt{mistral-small:22b}
& 0.296 & 0.379 & 15M \\

\texttt{magistral:24b}
& 0.430 & 0.468 & 18M \\

\texttt{devstral:24b}
& 0.388 & 0.420 & 0M \\

\texttt{devstral

## End

In [1]:
#| hide
import nbdev; nbdev.nbdev_export()