# 03 - Parser Benchmark

Comparaison des méthodes d'extraction PDF :
- **pdfplumber** : Parser actuel (extraction de tableaux)
- **Mistral OCR** : Vision model (mistral-ocr-2503)

## Objectif
Mesurer la précision d'extraction des bulletins scolaires vs ground truth.

In [1]:
# ruff: noqa: E402
import json
import os
import sys
import time
from pathlib import Path

# Auto-détecter project_root
current = Path.cwd()
while current != current.parent:
    if (current / "pyproject.toml").exists():
        project_root = current
        break
    current = current.parent

sys.path.insert(0, str(project_root))

from dotenv import load_dotenv

load_dotenv(project_root / ".env")

# Paths
DATA_DIR = project_root / "data"
RAW_DIR = DATA_DIR / "raw"
GROUND_TRUTH_PATH = DATA_DIR / "ground_truth" / "chiron_ground_truth.json"

print(f"Project root: {project_root}")
print(f"PDFs: {list(RAW_DIR.glob('*.pdf'))}")

Project root: c:\Users\Florent\Documents\data_science\chiron
PDFs: [WindowsPath('c:/Users/Florent/Documents/data_science/chiron/data/raw/ELEVE_A.pdf'), WindowsPath('c:/Users/Florent/Documents/data_science/chiron/data/raw/ELEVE_B.pdf'), WindowsPath('c:/Users/Florent/Documents/data_science/chiron/data/raw/ELEVE_C.pdf'), WindowsPath('c:/Users/Florent/Documents/data_science/chiron/data/raw/ELEVE_D.pdf')]


In [2]:
# Charger ground truth
with open(GROUND_TRUTH_PATH, encoding="utf-8") as f:
    ground_truth = json.load(f)

# Index par eleve_id pour comparaison facile
gt_by_eleve = {e["eleve_id"]: e for e in ground_truth["eleves"]}
print(f"Ground truth: {list(gt_by_eleve.keys())}")

Ground truth: ['ELEVE_A', 'ELEVE_B', 'ELEVE_C', 'ELEVE_D']


## 1. Parser pdfplumber (actuel)

In [3]:
from src.document.bulletin_parser import BulletinParser

parser = BulletinParser()

pdfplumber_results = {}
pdfplumber_times = {}

for pdf_path in sorted(RAW_DIR.glob("*.pdf")):
    eleve_id = pdf_path.stem  # ELEVE_A, ELEVE_B, etc.

    start = time.perf_counter()
    try:
        eleves = parser.parse(pdf_path)
        pdfplumber_results[eleve_id] = eleves[0] if eleves else None
    except Exception as e:
        pdfplumber_results[eleve_id] = f"ERROR: {e}"
    pdfplumber_times[eleve_id] = time.perf_counter() - start

    print(f"{eleve_id}: {pdfplumber_times[eleve_id]:.2f}s")

ELEVE_A: 0.06s
ELEVE_B: 0.06s
ELEVE_C: 0.06s
ELEVE_D: 0.07s


In [4]:
# Afficher un exemple de résultat pdfplumber
exemple = pdfplumber_results.get("ELEVE_A")
if exemple and not isinstance(exemple, str):
    print(f"Nom: {exemple.nom}")
    print(f"Prénom: {exemple.prenom}")
    print(f"Classe: {exemple.classe}")
    print(f"Matières extraites: {len(exemple.matieres)}")
    for m in exemple.matieres[:]:
        print(f"  - {m.nom}: {m.moyenne_eleve}")
else:
    print(f"Erreur ou pas de résultat: {exemple}")

Nom: None
Prénom: None
Classe: None
Matières extraites: 12
  - Anglais LV1: 15.21
  - Arts Plastiques: 15.0
  - EPS: 8.0
  - Éducation Musicale: 13.0
  - Espagnol LV2: 13.83
  - Français: 13.21
  - Histoire-Géographie-EMC: 14.66
  - Latin et Grec: 18.21
  - Mathématiques: 16.07
  - Physique-Chimie: 15.71
  - SVT: 10.86
  - Technologie: 15.8


## 2. Mistral OCR

In [5]:
import base64

from mistralai import Mistral

# Init client
mistral_api_key = os.getenv("MISTRAL_OCR_API_KEY")
if not mistral_api_key:
    raise ValueError("MISTRAL_OCR_API_KEY non configurée dans .env")

client = Mistral(api_key=mistral_api_key)
print("Client Mistral initialisé")

Client Mistral initialisé


In [6]:
def extract_with_mistral_ocr(pdf_path: Path) -> dict:
    """Extrait le contenu d'un PDF avec Mistral OCR.

    Utilise l'API OCR de Mistral (mistral-ocr-2503).
    Ref: https://docs.mistral.ai/capabilities/document/
    """
    # Encoder le PDF en base64
    with open(pdf_path, "rb") as f:
        pdf_base64 = base64.standard_b64encode(f.read()).decode("utf-8")

    # Appel API OCR
    response = client.ocr.process(
        model="mistral-ocr-2503",
        document={
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{pdf_base64}"
        }
    )

    return response

In [7]:
# Extraire tous les PDFs avec Mistral OCR
mistral_results = {}
mistral_times = {}

for pdf_path in sorted(RAW_DIR.glob("*.pdf")):
    eleve_id = pdf_path.stem

    start = time.perf_counter()
    try:
        result = extract_with_mistral_ocr(pdf_path)
        mistral_results[eleve_id] = result
    except Exception as e:
        mistral_results[eleve_id] = f"ERROR: {e}"
    mistral_times[eleve_id] = time.perf_counter() - start

    print(f"{eleve_id}: {mistral_times[eleve_id]:.2f}s")

ELEVE_A: 2.41s
ELEVE_B: 2.26s
ELEVE_C: 2.30s
ELEVE_D: 2.42s


## 3. Normalisation des extractions

Convertir les résultats des deux parsers en format uniforme pour comparaison.

In [9]:
import re

import pandas as pd


def parse_mistral_markdown(markdown: str) -> dict:
    """Parse le markdown de Mistral OCR en structure normalisée."""
    result = {
        "eleve_id": None,
        "genre": None,
        "absences": None,
        "engagements": None,
        "matieres": [],
        "moyenne_generale": None,
    }

    eleve_match = re.search(r"Élève\s*:\s*(\w+)", markdown)
    if eleve_match:
        result["eleve_id"] = eleve_match.group(1)

    genre_match = re.search(r"Genre\s*:\s*(\w+)", markdown)
    if genre_match:
        result["genre"] = genre_match.group(1)

    absences_match = re.search(r"Absences\s*:\s*(\d+)", markdown)
    if absences_match:
        result["absences"] = int(absences_match.group(1))

    engagements_match = re.search(r"Engagements\s*:\s*([^<\n]+)", markdown)
    if engagements_match:
        result["engagements"] = engagements_match.group(1).strip()

    table_pattern = r"\|\s*([^|]+)\s*\|\s*([^|]+)\s*\|\s*([^|]*)\s*\|"
    rows = re.findall(table_pattern, markdown)

    for row in rows:
        matiere, notes, appreciation = [cell.strip() for cell in row]
        if matiere in ("Matière", ":--:") or "Élève" in matiere:
            continue
        notes_match = re.search(r"([\d.]+)\s*/\s*([\d.]+)", notes)
        result["matieres"].append({
            "nom": matiere,
            "moy_eleve": float(notes_match.group(1)) if notes_match else None,
            "moy_classe": float(notes_match.group(2)) if notes_match else None,
            "appreciation": appreciation,
        })

    moy_match = re.search(r"Moyenne générale\s*:\s*([\d.]+)", markdown)
    if moy_match:
        result["moyenne_generale"] = float(moy_match.group(1))

    return result


def normalize_pdfplumber(result) -> dict:
    """Convertit le résultat pdfplumber en structure normalisée."""
    if isinstance(result, str) or result is None:
        return None

    return {
        "eleve_id": result.nom,
        "genre": result.genre,
        "absences": result.absences_demi_journees,
        "engagements": ", ".join(result.engagements) if result.engagements else None,
        "matieres": [
            {
                "nom": m.nom,
                "moy_eleve": m.moyenne_eleve,
                "moy_classe": m.moyenne_classe,
                "appreciation": m.appreciation,
            }
            for m in result.matieres
        ],
        "moyenne_generale": None,  # Non extrait par pdfplumber
    }


# Normaliser tous les résultats
normalized = {"pdfplumber": {}, "mistral_ocr": {}}

for eleve_id in gt_by_eleve.keys():
    # pdfplumber
    pdf_result = pdfplumber_results.get(eleve_id)
    normalized["pdfplumber"][eleve_id] = normalize_pdfplumber(pdf_result)

    # Mistral OCR
    mistral_result = mistral_results.get(eleve_id)
    if mistral_result and not isinstance(mistral_result, str):
        normalized["mistral_ocr"][eleve_id] = parse_mistral_markdown(
            mistral_result.pages[0].markdown
        )

print("Données normalisées pour:")
for source, data in normalized.items():
    print(f"  - {source}: {list(data.keys())}")

Données normalisées pour:
  - pdfplumber: ['ELEVE_A', 'ELEVE_B', 'ELEVE_C', 'ELEVE_D']
  - mistral_ocr: ['ELEVE_A', 'ELEVE_B', 'ELEVE_C', 'ELEVE_D']


## 4. Export des extractions

Sauvegarder les extractions en markdown pour faciliter la comparaison visuelle.

In [10]:
BENCHMARK_DIR = DATA_DIR / "processed" / "benchmark-extraction"
BENCHMARK_DIR.mkdir(parents=True, exist_ok=True)


def export_to_markdown(data: dict, source: str, output_path: Path):
    """Exporte les données normalisées en markdown."""
    lines = [
        f"# {source} - {data.get('eleve_id', 'N/A')}",
        "",
        f"**Genre:** {data.get('genre', 'N/A')}",
        f"**Absences:** {data.get('absences', 'N/A')} demi-journées",
        f"**Engagements:** {data.get('engagements') or 'Aucun'}",
        "",
        "## Matières",
        "",
        "| Matière | Moy. Élève | Moy. Classe | Appréciation |",
        "|---------|------------|-------------|--------------|",
    ]

    for m in data.get("matieres", []):
        app = (m.get("appreciation") or "").replace("|", "\\|").replace("\n", " ")
        lines.append(
            f"| {m['nom']} | {m['moy_eleve']} | {m['moy_classe']} | {app} |"
        )

    if data.get("moyenne_generale"):
        lines.extend(["", f"**Moyenne générale:** {data['moyenne_generale']}/20"])

    output_path.write_text("\n".join(lines), encoding="utf-8")


# Exporter pour tous les élèves
for eleve_id in gt_by_eleve.keys():
    # Ground truth
    gt_data = {
        "eleve_id": eleve_id,
        "genre": gt_by_eleve[eleve_id]["genre"],
        "absences": gt_by_eleve[eleve_id]["absences_demi_journees"],
        "engagements": ", ".join(gt_by_eleve[eleve_id].get("engagements", [])),
        "matieres": [
            {
                "nom": m["nom"],
                "moy_eleve": m["moyenne_eleve"],
                "moy_classe": m["moyenne_classe"],
                "appreciation": m.get("appreciation", ""),
            }
            for m in gt_by_eleve[eleve_id]["matieres"]
        ],
    }
    export_to_markdown(gt_data, "Ground Truth", BENCHMARK_DIR / f"{eleve_id}_ground_truth.md")

    # pdfplumber
    if normalized["pdfplumber"].get(eleve_id):
        export_to_markdown(
            normalized["pdfplumber"][eleve_id],
            "pdfplumber",
            BENCHMARK_DIR / f"{eleve_id}_pdfplumber.md",
        )

    # Mistral OCR
    if normalized["mistral_ocr"].get(eleve_id):
        export_to_markdown(
            normalized["mistral_ocr"][eleve_id],
            "Mistral OCR",
            BENCHMARK_DIR / f"{eleve_id}_mistral_ocr.md",
        )

    print(f"✓ {eleve_id} exporté")

print(f"\nFichiers exportés dans: {BENCHMARK_DIR}")

✓ ELEVE_A exporté
✓ ELEVE_B exporté
✓ ELEVE_C exporté
✓ ELEVE_D exporté

Fichiers exportés dans: c:\Users\Florent\Documents\data_science\chiron\data\processed\benchmark-extraction


## 5. Comparaison avec le Ground Truth

Fonction de comparaison générique pour les deux sources.

In [11]:
def compare_with_gt(extracted: dict, gt: dict, source: str) -> pd.DataFrame:
    """Compare les données extraites avec le ground truth.

    Args:
        extracted: Données normalisées extraites
        gt: Ground truth
        source: Nom de la source (pdfplumber, mistral_ocr)

    Returns:
        DataFrame avec les résultats de comparaison
    """
    if not extracted:
        return pd.DataFrame()

    comparisons = []
    gt_matieres = {m["nom"]: m for m in gt["matieres"]}

    for m in extracted["matieres"]:
        nom = m["nom"]
        gt_m = gt_matieres.get(nom, {})

        # Comparer moyennes
        moy_eleve_ok = m["moy_eleve"] == gt_m.get("moyenne_eleve")
        moy_classe_ok = m["moy_classe"] == gt_m.get("moyenne_classe")

        # Comparer appréciations (normaliser les espaces)
        app_ext = " ".join((m.get("appreciation") or "").split())
        app_gt = " ".join(gt_m.get("appreciation", "").split())
        app_ok = app_ext == app_gt

        comparisons.append({
            "source": source,
            "matiere": nom,
            "moy_eleve_ext": m["moy_eleve"],
            "moy_eleve_gt": gt_m.get("moyenne_eleve"),
            "moy_eleve_ok": "✅" if moy_eleve_ok else "❌",
            "moy_classe_ext": m["moy_classe"],
            "moy_classe_gt": gt_m.get("moyenne_classe"),
            "moy_classe_ok": "✅" if moy_classe_ok else "❌",
            "app_ok": "✅" if app_ok else "❌",
            "app_len_ext": len(app_ext),
            "app_len_gt": len(app_gt),
        })

    return pd.DataFrame(comparisons)


# Comparer toutes les sources pour tous les élèves
all_comparisons = []

for source_name, source_data in normalized.items():
    for eleve_id, extracted in source_data.items():
        df = compare_with_gt(extracted, gt_by_eleve[eleve_id], source_name)
        if not df.empty:
            df["eleve_id"] = eleve_id
            all_comparisons.append(df)

df_comparison = pd.concat(all_comparisons, ignore_index=True)
print(f"Total comparaisons: {len(df_comparison)} lignes")

Total comparaisons: 92 lignes


In [12]:
# Statistiques par source
stats_by_source = []

for source in df_comparison["source"].unique():
    df_src = df_comparison[df_comparison["source"] == source]
    total = len(df_src)

    stats_by_source.append({
        "source": source,
        "total_matieres": total,
        "moy_eleve_ok": (df_src["moy_eleve_ok"] == "✅").sum(),
        "moy_classe_ok": (df_src["moy_classe_ok"] == "✅").sum(),
        "app_ok": (df_src["app_ok"] == "✅").sum(),
    })

df_stats = pd.DataFrame(stats_by_source)
df_stats["moy_eleve_%"] = (df_stats["moy_eleve_ok"] / df_stats["total_matieres"] * 100).round(1)
df_stats["moy_classe_%"] = (df_stats["moy_classe_ok"] / df_stats["total_matieres"] * 100).round(1)
df_stats["app_%"] = (df_stats["app_ok"] / df_stats["total_matieres"] * 100).round(1)

print("=" * 60)
print("STATISTIQUES PAR SOURCE")
print("=" * 60)
print(df_stats[["source", "total_matieres", "moy_eleve_%", "moy_classe_%", "app_%"]].to_string(index=False))

STATISTIQUES PAR SOURCE
     source  total_matieres  moy_eleve_%  moy_classe_%  app_%
 pdfplumber              46        100.0         100.0  100.0
mistral_ocr              46        100.0         100.0   93.5


In [13]:
# Détail des appréciations qui ne matchent pas
df_errors = df_comparison[df_comparison["app_ok"] == "❌"][
    ["source", "eleve_id", "matiere", "app_len_ext", "app_len_gt"]
]
print(f"Appréciations différentes ({len(df_errors)}):\n")
if not df_errors.empty:
    print(df_errors.to_string(index=False))
else:
    print("Aucune erreur d'appréciation!")

Appréciations différentes (3):

     source eleve_id                 matiere  app_len_ext  app_len_gt
mistral_ocr  ELEVE_B             Anglais LV1          218         219
mistral_ocr  ELEVE_B                     EPS           90          91
mistral_ocr  ELEVE_C Histoire-Géographie-EMC           56          56


## 6. Résumé comparatif

In [14]:
# Temps d'exécution
df_times = pd.DataFrame([
    {
        "eleve_id": eleve_id,
        "pdfplumber_time": pdfplumber_times.get(eleve_id),
        "mistral_ocr_time": mistral_times.get(eleve_id),
    }
    for eleve_id in gt_by_eleve.keys()
])

print("Temps d'exécution (secondes):")
print(df_times.to_string(index=False))
print(f"\nMoyenne pdfplumber: {df_times['pdfplumber_time'].mean():.2f}s")
print(f"Moyenne Mistral OCR: {df_times['mistral_ocr_time'].mean():.2f}s")
print(f"Ratio: {df_times['mistral_ocr_time'].mean() / df_times['pdfplumber_time'].mean():.0f}x plus lent")

Temps d'exécution (secondes):
eleve_id  pdfplumber_time  mistral_ocr_time
 ELEVE_A         0.064094          2.410624
 ELEVE_B         0.062869          2.256400
 ELEVE_C         0.055212          2.299399
 ELEVE_D         0.065765          2.416317

Moyenne pdfplumber: 0.06s
Moyenne Mistral OCR: 2.35s
Ratio: 38x plus lent


In [15]:
# Tableau récapitulatif final
print("=" * 70)
print("RÉSUMÉ BENCHMARK")
print("=" * 70)

summary = pd.DataFrame({
    "Métrique": [
        "Temps moyen (s)",
        "Moyennes élève correctes",
        "Moyennes classe correctes",
        "Appréciations identiques",
    ],
    "pdfplumber": [
        f"{df_times['pdfplumber_time'].mean():.2f}",
        f"{df_stats[df_stats['source'] == 'pdfplumber']['moy_eleve_%'].values[0]}%",
        f"{df_stats[df_stats['source'] == 'pdfplumber']['moy_classe_%'].values[0]}%",
        f"{df_stats[df_stats['source'] == 'pdfplumber']['app_%'].values[0]}%",
    ],
    "mistral_ocr": [
        f"{df_times['mistral_ocr_time'].mean():.2f}",
        f"{df_stats[df_stats['source'] == 'mistral_ocr']['moy_eleve_%'].values[0]}%",
        f"{df_stats[df_stats['source'] == 'mistral_ocr']['moy_classe_%'].values[0]}%",
        f"{df_stats[df_stats['source'] == 'mistral_ocr']['app_%'].values[0]}%",
    ],
})

print(summary.to_string(index=False))

print("\n" + "=" * 70)
print("CONCLUSION")
print("=" * 70)
print("""
- pdfplumber est ~35x plus rapide que Mistral OCR
- Les deux méthodes extraient correctement les notes numériques (100%)
- pdfplumber a une meilleure précision sur les appréciations
- Mistral OCR retourne du markdown structuré (utile pour d'autres formats)
""")

RÉSUMÉ BENCHMARK
                 Métrique pdfplumber mistral_ocr
          Temps moyen (s)       0.06        2.35
 Moyennes élève correctes     100.0%      100.0%
Moyennes classe correctes     100.0%      100.0%
 Appréciations identiques     100.0%       93.5%

CONCLUSION

- pdfplumber est ~35x plus rapide que Mistral OCR
- Les deux méthodes extraient correctement les notes numériques (100%)
- pdfplumber a une meilleure précision sur les appréciations
- Mistral OCR retourne du markdown structuré (utile pour d'autres formats)

