# Benchmark HTR Models

This notebook benchmarks handwritten text recognition (HTR): it runs several vision-language models on images of historical French manuscripts, saves each transcription, and compares the models' performance.

In [1]:
import json
import os
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm

from utils import query_openrouter, system_prompt, generate_results_md_table

# Folder layout used by the rest of the notebook
results_dir = Path("résultats")
reference_dir = Path("transcriptions_de_référence")
images_dir = Path("images")

# Only results_dir and reference_dir are created when missing;
# images_dir is left untouched here
for _folder in (results_dir, reference_dir):
    _folder.mkdir(exist_ok=True)

In [None]:
# Load models configuration. The encoding is given explicitly (matching how the
# result files are written) so the platform default encoding is never used.
with open("models_to_test.json", "r", encoding="utf-8") as f:
    models = json.load(f)

# Get list of images (.jpg and .png), sorted so that the processing order is
# the same on every run instead of depending on filesystem listing order
image_files = sorted(
    path
    for pattern in ("*.jpg", "*.png")
    for path in images_dir.glob(pattern)
)
print(f"Found {len(image_files)} images to process")
print(f"Will test {len(models)} models")

In [3]:
# Derive descriptive metadata for a model
def get_model_metadata(model_id, model_type):
    """Return the editor and the French licence label for a model.

    Args:
        model_id: Model identifier; the text before the first "/" is used as
            the editor (the whole string when there is no "/").
        model_type: "open" is labelled "libre"; any other value is labelled
            "propriétaire".

    Returns:
        A dict with the keys "editeur" and "modele_type".
    """
    # Everything before the first slash identifies the editor
    editor, _, _ = model_id.partition("/")

    # French label for the licence type
    if model_type == "open":
        label = "libre"
    else:
        label = "propriétaire"

    return {"editeur": editor, "modele_type": label}

In [None]:
# Process each image with each model in parallel
import concurrent.futures
from functools import partial

# Records returned by process_image for this run (None results are not stored)
results = []

# Size of the thread pool; with a single worker the submitted tasks run one
# at a time
max_w = 1

def process_image(img_path, model_info):
    """Transcribe one image with one model and save the result as JSON.

    Args:
        img_path: Path of the image to transcribe.
        model_info: Two-item sequence ``(model_id, model_type)``; both items
            are passed to ``get_model_metadata``.

    Returns:
        The result dict written to ``results_dir``, or ``None`` when a
        readable result file already exists or when the query failed (the
        error is printed instead of being raised).
    """
    model, model_type = model_info
    model_meta = get_model_metadata(model, model_type)

    # Create valid filename by replacing invalid characters
    safe_model_name = model.replace('/', '_').replace('\\', '_').replace(':', '_')
    result_file = results_dir / f"{img_path.stem}_{safe_model_name}.json"

    # Skip only when the existing result file parses as JSON; a truncated or
    # corrupt file (e.g. from an interrupted write) is regenerated instead of
    # being skipped on every later run.
    if result_file.exists():
        try:
            with open(result_file, 'r', encoding='utf-8') as f:
                json.load(f)
        except (OSError, ValueError):
            # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError
            print(f"Reprocessing unreadable result: {result_file}")
        else:
            print(f"Skipping existing result: {result_file}")
            return None

    try:
        # Query the model, timing the call as a fallback latency measurement
        query_started = time.perf_counter()
        response_data, _ = query_openrouter(str(img_path), model)
        elapsed = time.perf_counter() - query_started

        # Extract the transcription from the response
        transcription = response_data['choices'][0]['message']['content']

        # Get usage data with defaults if missing
        usage_data = response_data.get('usage', {})
        if not usage_data or not isinstance(usage_data, dict):
            usage_data = {
                'prompt_tokens': 0,
                'completion_tokens': 0,
                'total_tokens': 0
            }

        # Use the response's own timestamps only when both are present and
        # numeric. Defaulting a missing 'started' to 0 would store the raw
        # 'created' value as the latency, so the measured duration is used
        # instead.
        # NOTE(review): nothing in this notebook sets 'started'; confirm
        # whether query_openrouter adds it to the response.
        created = response_data.get('created')
        started = response_data.get('started')
        if isinstance(created, (int, float)) and isinstance(started, (int, float)):
            latency = created - started
        else:
            latency = elapsed

        # Prepare result data
        result_data = {
            "model": model,
            "editeur": model_meta["editeur"],
            "modele_type": model_meta["modele_type"],
            "image": str(img_path),
            "result": transcription,
            "timestamp": datetime.now().isoformat(),
            "model_info": response_data.get('model_info', {}),
            "usage": usage_data,
            "latency": latency
        }

        # Save result to file
        with open(result_file, 'w', encoding='utf-8') as f:
            json.dump(result_data, f, ensure_ascii=False, indent=2)

        return result_data

    except Exception as e:
        # Best effort: one failing image/model pair must not stop the benchmark
        print(f"Error processing {img_path} with {model}: {str(e)}")
        return None

# Run every image through each model using the thread pool, one model at a time
with concurrent.futures.ThreadPoolExecutor(max_workers=max_w) as executor:
    for model_info in tqdm(models, desc="Processing models"):
        # Queue one task per image for the current model
        pending = {
            executor.submit(process_image, img_path, model_info=model_info): img_path
            for img_path in image_files
        }

        # Consume the tasks in completion order behind a nested progress bar
        progress = tqdm(
            concurrent.futures.as_completed(pending),
            total=len(pending),
            desc=f"Processing images with {model_info[0]}",
            leave=False,
        )
        for task in progress:
            outcome = task.result()
            if outcome is None:
                # process_image returned nothing to record for this image
                continue
            results.append(outcome)

In [None]:
# Generate the results table with the helper imported from utils
generate_results_md_table()

# Keep the records collected during this run in a DataFrame for further analysis
df_results = pd.DataFrame(results)


def _mean_total_tokens(usages):
    """Average the 'total_tokens' entry of a group of usage dicts (missing -> 0)."""
    token_counts = [usage.get('total_tokens', 0) for usage in usages]
    return pd.Series(token_counts).mean()


# Per-model latency and token summary; nothing is shown when no record was collected
if not df_results.empty:
    per_model = df_results.groupby('model')
    aggregations = {
        'latency': ['mean', 'min', 'max'],
        'usage': _mean_total_tokens,
    }
    df_summary = per_model.agg(aggregations).round(2)

    # Flatten the two-level column index into readable labels
    df_summary.columns = ['Avg Latency (s)', 'Min Latency (s)', 'Max Latency (s)', 'Avg Tokens']
    display(df_summary)

## Additional Performance Metrics

The summary above covers only the images processed during this run (results already saved on disk are skipped) and includes:
- Latency metrics (average, min, max) in seconds
- Average token usage per request

These metrics complement the word error rate (WER) scores and cost analysis in the main results table, providing a more complete picture of each model's performance characteristics.