# Benchmark HTR Models

This notebook runs various vision-language models on historical French manuscript images and compares their performance.

In [1]:
import os
import json
from datetime import datetime
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm
from utils import query_model, system_prompt, generate_results_md_table

# Folders used by the notebook: model outputs, reference transcriptions and input images
results_dir, reference_dir, images_dir = (
    Path(folder) for folder in ("résultats", "transcriptions_de_référence", "images")
)

# Only the results and reference folders are created when missing
for output_dir in (results_dir, reference_dir):
    output_dir.mkdir(exist_ok=True)

In [2]:
# Load models configuration.
# The encoding is passed explicitly: without it open() falls back to the platform's
# locale encoding, which is not UTF-8 on every system (e.g. Windows).
with open("models_to_test.json", "r", encoding="utf-8") as f:
    models = json.load(f)

# Get list of images. glob() yields paths in an arbitrary, filesystem-dependent order,
# so the list is sorted to make the processing order identical from one run to the next.
image_files = sorted([*images_dir.glob("*.jpg"), *images_dir.glob("*.png")])
print(f"Found {len(image_files)} images to process")
print(f"Will test {len(models)} models")

Found 15 images to process
Will test 22 models


In [3]:
# Function to determine model metadata
def get_model_metadata(model_id, model_type):
    """Build the metadata dictionary stored alongside each result.

    The editor is whatever precedes the first "/" in ``model_id`` (the whole
    string when there is no "/"). The model type is rendered in French:
    "libre" when ``model_type`` is "open", "propriétaire" for any other value.
    """
    editor_name, _, _ = model_id.partition("/")

    # French label for the model type
    if model_type == "open":
        type_label = "libre"
    else:
        type_label = "propriétaire"

    return dict(editeur=editor_name, modele_type=type_label)

In [4]:
# Process each image with each model in parallel
import concurrent.futures
from functools import partial

# Number of worker threads given to the ThreadPoolExecutor below.
# With a single worker the submitted tasks are executed one at a time.
max_w = 1

# Accumulates the result dictionaries returned by process_image during this run
# (process_image returns None for skipped or failed items, which are not stored).
results = []

def _is_valid_result_file(result_file):
    """Return True when ``result_file`` can be opened and parsed as JSON."""
    try:
        with open(result_file, 'r', encoding='utf-8') as f:
            json.load(f)
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError
        return False
    return True


def _extract_transcription(response_data):
    """Return the transcribed text from either of the two supported response layouts."""
    if 'choices' in response_data and response_data['choices']:
        return response_data['choices'][0]['message']['content']
    if 'result' in response_data:
        # This is for Transkribus API format
        return response_data['result']
    raise ValueError(f"Unexpected response format: {response_data}")


def process_image(img_path, model_info):
    """Transcribe one image with one model and store the outcome as a JSON file.

    Relies on the module-level ``results_dir``, ``query_model`` and
    ``get_model_metadata``.

    Args:
        img_path: Path object pointing to the image to transcribe.
        model_info: Two-item sequence ``(model_id, model_type)``.

    Returns:
        The dictionary written to ``results_dir``, or None when a valid result
        file already exists for this image/model pair or when processing failed
        (the error is printed, not raised).
    """
    model, model_type = model_info
    model_meta = get_model_metadata(model, model_type)

    # Create valid filename by replacing invalid characters
    safe_model_name = model.replace('/', '_').replace('\\', '_').replace(':', '_')
    result_file = results_dir / f"{img_path.stem}_{safe_model_name}.json"

    # Skip work that is already done. The file must also parse as JSON: a file left
    # truncated by an interrupted run is processed again instead of being skipped forever.
    if result_file.exists() and _is_valid_result_file(result_file):
        return None

    try:
        # Query the model (the second returned value, the cost, is not stored here)
        response_data, _cost = query_model(str(img_path), model)

        # Extract the transcription once, honouring both response formats
        transcription = _extract_transcription(response_data)

        # Get usage data with defaults if missing
        usage_data = response_data.get('usage', {})
        if not usage_data or not isinstance(usage_data, dict):
            usage_data = {
                'prompt_tokens': 0,
                'completion_tokens': 0,
                'total_tokens': 0
            }

        # A null 'created'/'started' must not discard an otherwise good transcription.
        # NOTE(review): when 'started' is absent the latency equals 'created' itself,
        # as before - confirm that query_model always sets 'started'.
        created = response_data.get('created')
        started = response_data.get('started')
        latency = created - (started or 0) if created is not None else None

        # Prepare result data
        result_data = {
            "model": model,
            "editeur": model_meta["editeur"],
            "modele_type": model_meta["modele_type"],
            "image": str(img_path),
            "result": transcription,
            "timestamp": datetime.now().isoformat(),
            "model_info": response_data.get('model_info', {}),
            "usage": usage_data,
            "latency": latency
        }

        # Save result to file
        with open(result_file, 'w', encoding='utf-8') as f:
            json.dump(result_data, f, ensure_ascii=False, indent=2)

        return result_data

    except Exception as e:
        # Best effort: one failing image/model pair must not stop the whole benchmark
        print(f"Error processing {img_path} with {model}: {str(e)}")
        return None

# Process models in parallel
with concurrent.futures.ThreadPoolExecutor(max_workers=max_w) as executor:
    for model_info in tqdm(models, desc="Processing models"):
        # Queue every image for the current model on the pool
        pending = [
            executor.submit(process_image, img_path, model_info)
            for img_path in image_files
        ]

        # Walk through the futures in completion order, with a per-model progress bar
        progress = tqdm(
            concurrent.futures.as_completed(pending),
            total=len(pending),
            desc=f"Processing images with {model_info[0]}",
            leave=False,
        )
        for finished in progress:
            outcome = finished.result()
            # None means the item was skipped or failed; nothing to record
            if outcome is None:
                continue
            results.append(outcome)

Processing models:   0%|          | 0/22 [00:00<?, ?it/s]

Processing images with google/gemini-2.0-flash-001:   0%|          | 0/15 [00:00<?, ?it/s]

Processing images with qwen/qwen-vl-plus:free:   0%|          | 0/15 [00:00<?, ?it/s]

Processing images with qwen/qwen2.5-vl-72b-instruct:free:   0%|          | 0/15 [00:00<?, ?it/s]

Processing images with google/gemini-2.0-flash-thinking-exp:free:   0%|          | 0/15 [00:00<?, ?it/s]

Processing images with google/gemini-2.0-flash-exp:free:   0%|          | 0/15 [00:00<?, ?it/s]

Processing images with qwen/qvq-72b-preview:   0%|          | 0/15 [00:00<?, ?it/s]

Processing images with openai/o1:   0%|          | 0/15 [00:00<?, ?it/s]

Processing images with x-ai/grok-2-vision-1212:   0%|          | 0/15 [00:00<?, ?it/s]

Processing images with amazon/nova-lite-v1:   0%|          | 0/15 [00:00<?, ?it/s]

Processing images with openai/gpt-4o-2024-11-20:   0%|          | 0/15 [00:00<?, ?it/s]

Processing images with mistralai/pixtral-large-2411:   0%|          | 0/15 [00:00<?, ?it/s]

Image AN-284AP-18-fasc ms extr Moniteur carriere Sieyes-1789-1799_page_15.png was resized to fit within 2.0MB
Image AN-284AP-18-fasc ms extr Moniteur carriere Sieyes-1789-1799_page_4.png was resized to fit within 2.0MB
Image AN-284AP-4-doss 10_page_30.png was resized to fit within 2.0MB
Image AN-284AP-4-doss 11_page_12.png was resized to fit within 2.0MB
Image AN-284AP-4-doss 11_page_17.png was resized to fit within 2.0MB
Error processing images\AN-284AP-4-doss 11_page_17.png with mistralai/pixtral-large-2411: Unexpected response format - missing 'choices' field: {'error': {'message': 'Provider returned error', 'code': 400, 'metadata': {'raw': '{"object":"error","message":"Image is too large: (8442x10484). Maximum allowed size is (10000, 10000). Image URL: 

Processing images with anthropic/claude-3.5-sonnet:   0%|          | 0/15 [00:00<?, ?it/s]

Image AN-284AP-18-fasc ms extr Moniteur carriere Sieyes-1789-1799_page_15.png was resized to fit within 4.0MB and 7500px max dimension
Image AN-284AP-18-fasc ms extr Moniteur carriere Sieyes-1789-1799_page_4.png was resized to fit within 4.0MB and 7500px max dimension
Image AN-284AP-4-doss 10_page_18.png was resized to fit within 4.0MB and 7500px max dimension
Image AN-284AP-4-doss 10_page_20.png was resized to fit within 4.0MB and 7500px max dimension
Image AN-284AP-4-doss 10_page_30.png was resized to fit within 4.0MB and 7500px max dimension
Image AN-284AP-4-doss 11_page_12.png was resized to fit within 4.0MB and 7500px max dimension
Image AN-284AP-4-doss 11_page_17.png was resized to fit within 4.0MB and 7500px max dimension
Image AN-284AP-4-doss 11_page_28.png was resized to fit within 4.0MB and 7500px max dimension
Image AN-284AP-4-doss 11_page_29.png was resized to fit within 4.0MB and 7500px max dimension
Image AN-284AP-4-doss 11_page_36.png was resized to fit within 4.0MB and 

Processing images with meta-llama/llama-3.2-90b-vision-instruct:   0%|          | 0/15 [00:00<?, ?it/s]

Image AN-284AP-18-fasc ms extr Moniteur carriere Sieyes-1789-1799_page_15.png was resized to fit within 3.0MB and 6000px max dimension
Error processing images\AN-284AP-18-fasc ms extr Moniteur carriere Sieyes-1789-1799_page_15.png with meta-llama/llama-3.2-90b-vision-instruct: Response ended prematurely
Image AN-284AP-18-fasc ms extr Moniteur carriere Sieyes-1789-1799_page_4.png was resized to fit within 3.0MB and 6000px max dimension
Error processing images\AN-284AP-18-fasc ms extr Moniteur carriere Sieyes-1789-1799_page_4.png with meta-llama/llama-3.2-90b-vision-instruct: HTTPSConnectionPool(host='openrouter.ai', port=443): Max retries exceeded with url: /api/v1/chat/completions (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000011EB2AE2330>: Failed to resolve 'openrouter.ai' ([Errno 11001] getaddrinfo failed)"))
Image AN-284AP-4-doss 10_page_18.png was resized to fit within 3.0MB and 6000px max dimension
Error processing images\AN-284AP-4-doss 10_pa

Processing images with qwen/qwen-2-vl-72b-instruct:   0%|          | 0/15 [00:00<?, ?it/s]

Processing images with mistralai/pixtral-12b:   0%|          | 0/15 [00:00<?, ?it/s]

Image AN-284AP-18-fasc ms extr Moniteur carriere Sieyes-1789-1799_page_15.png was resized to fit within 2.0MB
Error processing images\AN-284AP-18-fasc ms extr Moniteur carriere Sieyes-1789-1799_page_15.png with mistralai/pixtral-12b: HTTPSConnectionPool(host='openrouter.ai', port=443): Max retries exceeded with url: /api/v1/chat/completions (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000011EB2AE3170>: Failed to resolve 'openrouter.ai' ([Errno 11001] getaddrinfo failed)"))
Image AN-284AP-18-fasc ms extr Moniteur carriere Sieyes-1789-1799_page_4.png was resized to fit within 2.0MB
Error processing images\AN-284AP-18-fasc ms extr Moniteur carriere Sieyes-1789-1799_page_4.png with mistralai/pixtral-12b: HTTPSConnectionPool(host='openrouter.ai', port=443): Max retries exceeded with url: /api/v1/chat/completions (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000011EB2B3E2A0>: Failed to resolve 'openrouter.ai' ([Errno 1100

Processing images with qwen/qwen-2-vl-7b-instruct:   0%|          | 0/15 [00:00<?, ?it/s]

Processing images with openai/gpt-4o-mini:   0%|          | 0/15 [00:00<?, ?it/s]

Processing images with openai/gpt-4.5-preview:   0%|          | 0/15 [00:00<?, ?it/s]

Processing images with anthropic/claude-3.7-sonnet:   0%|          | 0/15 [00:00<?, ?it/s]

Image AN-284AP-18-fasc ms extr Moniteur carriere Sieyes-1789-1799_page_15.png was resized to fit within 4.0MB and 7500px max dimension
Error processing images\AN-284AP-18-fasc ms extr Moniteur carriere Sieyes-1789-1799_page_15.png with anthropic/claude-3.7-sonnet: HTTPSConnectionPool(host='openrouter.ai', port=443): Max retries exceeded with url: /api/v1/chat/completions (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000011EB2AE3F20>: Failed to resolve 'openrouter.ai' ([Errno 11001] getaddrinfo failed)"))
Image AN-284AP-18-fasc ms extr Moniteur carriere Sieyes-1789-1799_page_4.png was resized to fit within 4.0MB and 7500px max dimension
Error processing images\AN-284AP-18-fasc ms extr Moniteur carriere Sieyes-1789-1799_page_4.png with anthropic/claude-3.7-sonnet: HTTPSConnectionPool(host='openrouter.ai', port=443): Max retries exceeded with url: /api/v1/chat/completions (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000

Processing images with transkribus/CITlab_HTR+:   0%|          | 0/15 [00:00<?, ?it/s]

Error processing images\AN-284AP-4-doss 10_page_18.png with transkribus/CITlab_HTR+: HTTPSConnectionPool(host='transkribus.eu', port=443): Max retries exceeded with url: /TrpServer/rest/auth/login (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000011EB2B3DFD0>: Failed to resolve 'transkribus.eu' ([Errno 11001] getaddrinfo failed)"))
Error processing images\AN-284AP-4-doss 10_page_20.png with transkribus/CITlab_HTR+: HTTPSConnectionPool(host='transkribus.eu', port=443): Max retries exceeded with url: /TrpServer/rest/auth/login (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000011EB2AE0050>: Failed to resolve 'transkribus.eu' ([Errno 11001] getaddrinfo failed)"))
Error processing images\AN-284AP-4-doss 10_page_30.png with transkribus/CITlab_HTR+: HTTPSConnectionPool(host='transkribus.eu', port=443): Max retries exceeded with url: /TrpServer/rest/auth/login (Caused by NameResolutionError("<urllib3.connection.HTTPSConnecti

Processing images with transkribus/PyLaia:   0%|          | 0/15 [00:00<?, ?it/s]

Error processing images\AN-284AP-4-doss 10_page_18.png with transkribus/PyLaia: HTTPSConnectionPool(host='transkribus.eu', port=443): Max retries exceeded with url: /TrpServer/rest/auth/login (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000011EB2B3D220>: Failed to resolve 'transkribus.eu' ([Errno 11001] getaddrinfo failed)"))
Error processing images\AN-284AP-4-doss 10_page_20.png with transkribus/PyLaia: HTTPSConnectionPool(host='transkribus.eu', port=443): Max retries exceeded with url: /TrpServer/rest/auth/login (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000011EB2B3D3D0>: Failed to resolve 'transkribus.eu' ([Errno 11001] getaddrinfo failed)"))
Error processing images\AN-284AP-4-doss 10_page_30.png with transkribus/PyLaia: HTTPSConnectionPool(host='transkribus.eu', port=443): Max retries exceeded with url: /TrpServer/rest/auth/login (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x

Processing images with transkribus/French_18th_Century:   0%|          | 0/15 [00:00<?, ?it/s]

Error processing images\AN-284AP-4-doss 10_page_18.png with transkribus/French_18th_Century: HTTPSConnectionPool(host='transkribus.eu', port=443): Max retries exceeded with url: /TrpServer/rest/auth/login (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000011EB2B3C7A0>: Failed to resolve 'transkribus.eu' ([Errno 11001] getaddrinfo failed)"))
Error processing images\AN-284AP-4-doss 10_page_20.png with transkribus/French_18th_Century: HTTPSConnectionPool(host='transkribus.eu', port=443): Max retries exceeded with url: /TrpServer/rest/auth/login (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000011EB2B3FAA0>: Failed to resolve 'transkribus.eu' ([Errno 11001] getaddrinfo failed)"))
Error processing images\AN-284AP-4-doss 10_page_30.png with transkribus/French_18th_Century: HTTPSConnectionPool(host='transkribus.eu', port=443): Max retries exceeded with url: /TrpServer/rest/auth/login (Caused by NameResolutionError("<urllib3.

In [None]:
# Generate the results table
# NOTE(review): generate_results_md_table is imported from the local `utils` module and is
# called without arguments; presumably it reads the JSON files saved in the results
# folder - confirm against utils.
generate_results_md_table()

# Generate results per page


In [None]:
!python scripts/generate_performance_table.py

In [None]:
!python scripts/generate_viewer_data.py

In [None]:
# TODO: remember to manually regenerate @analyse_modeles.md and the README, which both reuse these results

In [None]:


# Also create a DataFrame for additional analysis.
# `results` only contains the items produced during this session, so the frame is empty
# when nothing new was processed.
df_results = pd.DataFrame(results)

# Add columns for latency and token usage analysis
if not df_results.empty:
    summary_input = df_results.assign(
        # Latency is None when the response carried no 'created' field; a column made only
        # of None has object dtype and cannot be averaged, so coerce it to float/NaN first.
        latency=pd.to_numeric(df_results['latency'], errors='coerce'),
        # Pull the token count out of each usage dict so it can be aggregated like any
        # other numeric column.
        total_tokens=pd.to_numeric(
            df_results['usage'].map(
                lambda usage: usage.get('total_tokens', 0) if isinstance(usage, dict) else 0
            ),
            errors='coerce',
        ),
    )

    # Named aggregation yields the final column labels directly, in the same order as before
    df_summary = summary_input.groupby('model').agg(**{
        'Avg Latency (s)': ('latency', 'mean'),
        'Min Latency (s)': ('latency', 'min'),
        'Max Latency (s)': ('latency', 'max'),
        'Avg Tokens': ('total_tokens', 'mean'),
    }).round(2)

    display(df_summary)

## Additional Performance Metrics

The summary above includes:
- Latency metrics (average, min, max) in seconds
- Average token usage per request

These metrics complement the WER scores and cost analysis in the main results table, providing a more complete picture of each model's performance characteristics.