# 06 - Workflow Complet Chiron

Ce notebook montre le flow complet de production :

```
PDF original
    |
    v
1. Extraction nom (pdfplumber + regex)      [local]
    |
    v
2. NER variantes (CamemBERT)                [local, ~0.4s]
    |
    v
3. Anonymisation PDF (PyMuPDF)              [local]
    |
    v
4. OCR (Mistral OCR)                        [cloud, ~2s]
    |
    v
5. Parsing structure (EleveExtraction)      [local]
    |
    v
6. Stockage DuckDB                          [local]
    |
    v
7. Generation synthese (LLM)                [cloud, ~5s]
    |
    v
8. Export CSV                               [local]
```

In [None]:
# ruff: noqa: E402
import sys
from pathlib import Path

# Auto-detect project_root: the closest directory, starting at the current
# working directory and walking up through its parents, that contains a
# pyproject.toml file.
current = Path.cwd()
project_root = next(
    (
        folder
        for folder in (current, *current.parents)
        if (folder / "pyproject.toml").exists()
    ),
    None,
)
if project_root is None:
    # Fail with an explicit message instead of letting the next line raise an
    # opaque NameError on an unbound project_root.
    raise FileNotFoundError(
        f"pyproject.toml introuvable depuis {current} ou ses dossiers parents"
    )

# Make the project's `src` package importable from the notebook.
sys.path.insert(0, str(project_root))

from dotenv import load_dotenv

# Load environment variables (e.g. the Mistral API key read further down)
# from the project's .env file.
load_dotenv(project_root / ".env")

# Paths
DATA_DIR = project_root / "data"
RAW_DIR = DATA_DIR / "raw"
DB_DIR = DATA_DIR / "db"
# parents=True: data/ itself may not exist yet, in which case a plain
# mkdir() on data/db raises FileNotFoundError.
DB_DIR.mkdir(parents=True, exist_ok=True)

print(f"Project root: {project_root}")
# Sorted so the listing is stable from one run to the next (glob order is
# not guaranteed).
print(f"PDFs disponibles: {sorted(p.name for p in RAW_DIR.glob('*.pdf'))}")

## 1. Configuration

Affichage de la configuration centralisee depuis `src/llm/config.py`.

In [None]:
from src.document import estimate_mistral_cost
from src.llm.config import settings
from src.llm.pricing import estimate_synthese_cost

# Display the centralized LLM/OCR settings.
print("=== Configuration LLM ===")
print(f"OCR Model     : {settings.mistral_ocr_model}")
print(f"NER Model     : {settings.ner_model}")
print(f"LLM Provider  : OpenAI (default: {settings.default_openai_model})")
print(f"Temperature   : {settings.default_temperature}")
print(f"Max tokens    : {settings.synthese_max_tokens}")
print()

# Pricing
print("=== Tarifs ===")
print(f"Mistral OCR  : ${settings.mistral_ocr_cost_per_1000_pages}/1000 pages")
# Falls back to (0, 0) when the default model has no entry in the pricing table.
openai_price = settings.openai_pricing.get(settings.default_openai_model, (0, 0))
print(f"OpenAI ({settings.default_openai_model}): ${openai_price[0]}/M input, ${openai_price[1]}/M output")
print()

# Cost estimate.
# Uses RAW_DIR and the same sorted/filtered selection as the processing cell,
# so the estimate covers exactly the files the pipeline would pick up.
pdfs_preview = [p for p in sorted(RAW_DIR.glob("*.pdf")) if not p.name.startswith("ELEVE_TEST")]
if pdfs_preview:
    # OCR estimate
    estimate_ocr = estimate_mistral_cost(pdfs_preview)

    # LLM estimate (one synthesis per PDF)
    estimate_llm = estimate_synthese_cost(
        nb_eleves=len(pdfs_preview),
        avg_input_tokens=2000,  # ~report card + prompt
        avg_output_tokens=500,  # ~JSON synthesis
    )

    print("=== Estimation des couts ===")
    print(f"PDFs a traiter : {len(pdfs_preview)}")
    print(f"Pages totales  : {estimate_ocr['pages']}")
    print()
    print(f"OCR Mistral    : ${estimate_ocr['cost_usd']:.4f}")
    print(f"LLM OpenAI     : ${estimate_llm['cost_usd']:.4f} ({estimate_llm['total_tokens']:,} tokens)")
    print(f"  - Par eleve  : ${estimate_llm['cost_per_eleve']:.6f}")
    print(f"TOTAL ESTIME   : ${estimate_ocr['cost_usd'] + estimate_llm['cost_usd']:.4f}")
else:
    print("Aucun PDF trouve dans data/raw/")

## 2. Chargement des modeles

- CamemBERT NER pour la detection des noms
- Client Mistral pour l'OCR

In [None]:
import os

from mistralai import Mistral
from transformers import pipeline

# Load the NER pipeline configured in settings (CamemBERT per the notebook
# text). NOTE(review): `nlp_ner` is not referenced by any later cell visible
# here - confirm whether this load is still needed.
print(f"Chargement NER: {settings.ner_model}...")
ner_options = {"model": settings.ner_model, "aggregation_strategy": "simple"}
nlp_ner = pipeline("ner", **ner_options)
print("  NER charge")

# Initialize the Mistral client; either environment variable is accepted,
# MISTRAL_API_KEY taking precedence.
mistral_api_key = os.getenv("MISTRAL_API_KEY")
if not mistral_api_key:
    mistral_api_key = os.getenv("MISTRAL_OCR_API_KEY")
if not mistral_api_key:
    raise ValueError("MISTRAL_API_KEY ou MISTRAL_OCR_API_KEY requis dans .env")

mistral_client = Mistral(api_key=mistral_api_key)
print(f"  Mistral OCR pret (model: {settings.mistral_ocr_model})")

## 3. Module d'anonymisation

Utilisation directe du module `src.document.anonymizer`.

In [None]:
from src.document.anonymizer import PDFAnonymizer

# Build the anonymizer with its default arguments (per the original note it
# picks up the configured ner_model on its own - not verifiable from here).
anonymizer = PDFAnonymizer()
# NOTE(review): this reads a private attribute of PDFAnonymizer, for display only.
ner_model_used = anonymizer._ner_model_name
print(f"Anonymizer cree avec NER: {ner_model_used}")

## 4. Fonction OCR Mistral

In [None]:
import base64
import json


def extract_with_mistral_ocr(pdf_bytes: bytes) -> dict:
    """Run Mistral OCR on a PDF and return the response as a plain dict.

    The PDF is sent inline as a base64 ``data:`` URL, using the module-level
    ``mistral_client`` and the OCR model name from ``settings``.

    Args:
        pdf_bytes: Raw content of the PDF to process.

    Returns:
        The OCR response, serialized to JSON and parsed back so that the
        caller gets builtin types only.
    """
    encoded_pdf = base64.b64encode(pdf_bytes).decode("utf-8")
    document = {
        "type": "document_url",
        "document_url": f"data:application/pdf;base64,{encoded_pdf}",
    }
    ocr_response = mistral_client.ocr.process(
        model=settings.mistral_ocr_model,
        document=document,
    )
    return json.loads(ocr_response.model_dump_json())


print("Fonction OCR definie")

## 5. Pipeline : Anonymisation + OCR

Traitement du premier PDF disponible dans `data/raw/` (ordre alphabetique), en excluant les fichiers dont le nom commence par `ELEVE_TEST`.

In [None]:
import time

# Candidate PDFs, in sorted order, ignoring the ELEVE_TEST* fixture files.
pdfs = sorted(
    candidate
    for candidate in RAW_DIR.glob("*.pdf")
    if not candidate.name.startswith("ELEVE_TEST")
)

if pdfs:
    # Only the first PDF goes through the rest of the notebook.
    pdf_path = pdfs[0]
    print(f"Traitement de: {pdf_path.name}")
    print("=" * 60)
else:
    print("Aucun PDF trouve dans data/raw/")
    print("Creez un PDF de test ou ajoutez des bulletins.")

In [None]:
# Steps 1-3: anonymization (name extraction, NER variants, PDF rewriting).

# Defaults, so that later cells can safely test these names even when this
# step is skipped (no PDF) or fails.
pdf_anonymise = None
identity = {}

if pdfs:
    # The file name (without extension) is used as the student ID.
    # NOTE(review): this ID is later printed as "ID anonyme" and passed on to
    # storage and generation - confirm the file names never contain the
    # student's real name.
    eleve_id = pdf_path.stem

    start = time.perf_counter()
    try:
        result = anonymizer.anonymize(pdf_path, eleve_id=eleve_id)
        duration_anon = time.perf_counter() - start

        print(f"1. Nom extrait     : '{result.identity.get('nom_complet', 'N/A')}'")
        print(f"2. Variantes NER   : {result.variants_found}")
        print(f"3. Remplacements   : {result.replacements_count}")
        print(f"   Duree anonymisation : {duration_anon:.2f}s")

        # Keep the outputs needed by the following steps.
        pdf_anonymise = result.pdf_bytes
        identity = result.identity

    except Exception as e:
        # Notebook-level boundary: report the error and disable the
        # downstream steps instead of aborting the whole run.
        print(f"Erreur anonymisation: {e}")
        pdf_anonymise = None

In [None]:
# Step 4: OCR of the anonymized PDF.
if pdfs and pdf_anonymise:
    start = time.perf_counter()
    ocr_result = extract_with_mistral_ocr(pdf_anonymise)
    duration_ocr = time.perf_counter() - start

    # Concatenate the markdown of every page: keeping only pages[0] silently
    # dropped the rest of a multi-page report card. For a single-page PDF the
    # result is unchanged.
    markdown_ocr = "\n\n".join(page["markdown"] for page in ocr_result["pages"])
    print(f"4. OCR Mistral     : {len(markdown_ocr)} caracteres ({duration_ocr:.2f}s)")
    print()
    print("--- Apercu du texte OCR (500 premiers chars) ---")
    print(markdown_ocr[:500])

## 6. Parsing structure -> EleveExtraction

Conversion du markdown OCR en objet `EleveExtraction` structure.

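**Note**: Cette etape est simplifiee : seuls le genre, la classe, les absences et les retards sont extraits du texte OCR ; les matieres, notes et appreciations sont pour l'instant des donnees de demonstration codees en dur. En production, un parser plus sophistique les extraira du bulletin.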

In [None]:
import re

from src.core.models import EleveExtraction, MatiereExtraction


def parse_ocr_to_eleve(markdown: str, eleve_id: str, identity: dict) -> EleveExtraction:
    """Build an EleveExtraction from the OCR markdown of a report card.

    Simplified version: only gender, class, absences and late arrivals are
    read from the text. Subjects are hard-coded demonstration data and the
    term is always 1.

    Args:
        markdown: OCR output (markdown text) of the anonymized PDF.
        eleve_id: Identifier to store on the resulting record.
        identity: Identity dict from the anonymization step; its "genre"
            entry, when truthy, takes precedence over text detection.

    Returns:
        The populated EleveExtraction. ``genre``, ``classe``,
        ``absences_demi_journees`` and ``retards`` are None when nothing
        matching is found in the text.
    """
    # Gender: trust the identity first, otherwise look for the word in the text.
    genre = identity.get("genre")
    if not genre:
        if re.search(r"\bFille\b", markdown, re.IGNORECASE):
            genre = "Fille"
        # Accept both the plain and the accented spelling. The previous
        # pattern contained mis-encoded bytes in place of the cedilla, so the
        # accented form could never match.
        elif re.search(r"\bGar[cç]on\b", markdown, re.IGNORECASE):
            genre = "Garcon"

    # Class: rest of the line following "Classe :".
    classe_match = re.search(r"Classe\s*:\s*([^\n]+)", markdown)
    classe = classe_match.group(1).strip() if classe_match else None

    # Absences: number immediately preceding "demi-journ...".
    absences_match = re.search(r"(\d+)\s*demi-journ", markdown)
    absences = int(absences_match.group(1)) if absences_match else None

    # Late arrivals: number immediately preceding "retard".
    retards_match = re.search(r"(\d+)\s*retard", markdown, re.IGNORECASE)
    retards = int(retards_match.group(1)) if retards_match else None

    # TODO: parse subjects, grades and teacher comments from the markdown.
    # For now, fixed demonstration data is returned for every student.
    matieres = [
        MatiereExtraction(
            nom="Mathematiques",
            moyenne_eleve=12.5,
            moyenne_classe=11.8,
            appreciation="Bon trimestre. Travail regulier.",
        ),
        MatiereExtraction(
            nom="Francais",
            moyenne_eleve=14.0,
            moyenne_classe=12.3,
            appreciation="Excellent travail. Participation active.",
        ),
        MatiereExtraction(
            nom="Anglais",
            moyenne_eleve=9.5,
            moyenne_classe=11.0,
            appreciation="Des difficultes persistantes. Doit s'investir davantage.",
        ),
    ]

    return EleveExtraction(
        eleve_id=eleve_id,
        genre=genre,
        classe=classe,
        trimestre=1,  # hard-coded: the term is not detected from the text
        absences_demi_journees=absences,
        retards=retards,
        matieres=matieres,
        raw_text=markdown,
    )


print("Fonction de parsing definie")

In [None]:
# Parse the OCR result.
# Default, so that the later `if pdfs and eleve` guards evaluate to False
# instead of raising NameError when anonymization failed.
eleve = None

if pdfs and pdf_anonymise:
    eleve = parse_ocr_to_eleve(markdown_ocr, eleve_id, identity)

    print("5. Parsing structure")
    print(f"   eleve_id : {eleve.eleve_id}")
    print(f"   genre    : {eleve.genre}")
    print(f"   classe   : {eleve.classe}")
    print(f"   absences : {eleve.absences_demi_journees}")
    print(f"   retards  : {eleve.retards}")
    print(f"   matieres : {len(eleve.matieres)}")
    for m in eleve.matieres:
        print(f"      - {m.nom}: {m.moyenne_eleve}/20 (classe: {m.moyenne_classe})")

## 7. Stockage DuckDB

Initialisation de la base et stockage de l'eleve.

In [None]:
from src.storage.connection import DuckDBConnection
from src.storage.repositories.classe import ClasseRepository
from src.storage.repositories.eleve import EleveRepository
from src.storage.repositories.synthese import SyntheseRepository

# Dedicated database file for this notebook.
TEST_DB = DB_DIR / "chiron_notebook.duckdb"

# Open the connection and make sure the tables exist before any repository
# is created.
conn = DuckDBConnection(TEST_DB)
conn.ensure_tables()

# One repository per entity, all pointing at the same database file.
eleve_repo, synthese_repo, classe_repo = (
    repository_cls(TEST_DB)
    for repository_cls in (EleveRepository, SyntheseRepository, ClasseRepository)
)

print(f"6. DuckDB initialise: {TEST_DB}")

In [None]:
# Make sure the student's class exists in the database.
if pdfs and eleve:
    # Fall back to a placeholder class when none was detected in the OCR text.
    classe_id = eleve.classe or "CLASSE_TEST"

    if classe_repo.get(classe_id):
        print(f"   Classe existante: {classe_id}")
    else:
        from src.storage.repositories.classe import Classe

        new_classe = Classe(classe_id=classe_id, nom=classe_id)
        classe_repo.create(new_classe)
        print(f"   Classe creee: {classe_id}")

In [None]:
# Persist the student record.
if pdfs and eleve:
    # Align the record with the class resolved above (placeholder included).
    eleve.classe = classe_id

    already_stored = eleve_repo.exists(eleve.eleve_id)
    if not already_stored:
        eleve_repo.create(eleve)
        print(f"   Eleve cree: {eleve.eleve_id}")
    else:
        print(f"   Eleve deja existant: {eleve.eleve_id}")
        # Only the subjects are passed to the update call.
        eleve_repo.update(eleve.eleve_id, matieres=eleve.matieres)
        print("   -> Mis a jour")

In [None]:
# Read the student back to check it was stored.
if pdfs and eleve:
    stored = eleve_repo.get(eleve.eleve_id)
    if not stored:
        print("   Verification: ERREUR - eleve non trouve")
    else:
        print("   Verification: OK")
        print(f"   - {len(stored.matieres)} matieres stockees")

## 8. Generation de synthese LLM

Utilisation du `SyntheseGenerator` pour generer une synthese.

In [None]:
from src.generation.generator import SyntheseGenerator

# Build the synthesis generator, explicitly targeting OpenAI with the
# default model from settings.
generator_options = {"provider": "openai", "model": settings.default_openai_model}
generator = SyntheseGenerator(**generator_options)

print(f"7. Generateur LLM: {generator.provider}/{generator.model}")

In [None]:
# Debug aid: show the student data as formatted by the prompt builder.
if pdfs and eleve:
    from src.generation.prompt_builder import format_eleve_data

    print("--- Donnees eleve formatees pour le LLM ---")
    formatted_eleve = format_eleve_data(eleve)
    print(formatted_eleve)

In [None]:
# Generate the synthesis.

# Defaults, so that later cells can safely test these names even when this
# step is skipped or fails.
synthese = None
metadata = None
duration_llm = 0.0

if pdfs and eleve:
    print("Generation en cours...")
    start = time.perf_counter()

    try:
        # Deliberately NOT named `result`: that name holds the anonymization
        # result, which the final summary cell still reads
        # (result.replacements_count).
        generation = generator.generate_with_metadata(
            eleve=eleve,
            max_tokens=settings.synthese_max_tokens,
        )
        duration_llm = time.perf_counter() - start

        synthese = generation.synthese
        metadata = generation.metadata

        print(f"   Duree LLM: {duration_llm:.2f}s")
        print(f"   Tokens: {metadata.get('tokens_total', 'N/A')}")
        print()
        print("--- Synthese generee ---")
        print(synthese.synthese_texte)
        print()
        print(f"Posture: {synthese.posture_generale}")
        print(f"Alertes: {len(synthese.alertes)}")
        for a in synthese.alertes:
            print(f"  - [{a.severite}] {a.matiere}: {a.description}")
        print(f"Reussites: {len(synthese.reussites)}")
        for r in synthese.reussites:
            print(f"  - {r.matiere}: {r.description}")
        print(f"Axes de travail: {synthese.axes_travail}")

    except Exception as e:
        # Notebook-level boundary: report the error and disable the
        # downstream steps instead of aborting the whole run.
        print(f"Erreur generation: {e}")
        synthese = None
        metadata = None

In [None]:
# Store the generated synthesis together with its traceability metadata.
if pdfs and synthese:
    from src.generation.prompt_builder import format_eleve_data
    from src.generation.prompts import CURRENT_PROMPT, get_prompt_hash

    # Hash of the prompt template plus the formatted student data, kept for
    # traceability.
    prompt_hash = get_prompt_hash(CURRENT_PROMPT, format_eleve_data(eleve))

    # Metadata record, built step by step in the same key order as before.
    db_metadata = {"llm_provider": metadata.get("llm_provider", "openai")}
    db_metadata.update(
        (key, metadata.get(key)) for key in ("llm_model", "llm_response_raw")
    )
    db_metadata["prompt_template"] = CURRENT_PROMPT
    db_metadata["prompt_hash"] = prompt_hash
    db_metadata.update(
        (key, metadata.get(key))
        for key in ("tokens_input", "tokens_output", "tokens_total")
    )
    db_metadata["llm_duration_ms"] = int(duration_llm * 1000)
    db_metadata["llm_temperature"] = settings.default_temperature

    # Insert the synthesis; term 1 is used when the record has no term.
    synthese_id = synthese_repo.create(
        eleve_id=eleve.eleve_id,
        synthese=synthese,
        trimestre=eleve.trimestre or 1,
        metadata=db_metadata,
    )

    print(f"8. Synthese sauvegardee: {synthese_id}")
    print(f"   prompt_hash: {prompt_hash[:16]}...")

## 9. Export CSV

Generation d'un fichier CSV avec les syntheses validees.

In [None]:
# Flag the synthesis as validated so that the export step picks it up.
if pdfs and synthese:
    new_status, validated_by = "validated", "notebook_test"
    synthese_repo.update_status(synthese_id, new_status, validated_by)
    print(f"Synthese {synthese_id} marquee comme validee")

In [None]:
# Fetch the validated syntheses and export them to a ';'-separated CSV file.
import csv
import io

if pdfs:
    validated = synthese_repo.get_validated(classe_id, trimestre=1)

    print(f"9. Syntheses validees: {len(validated)}")

    # Build the CSV with the csv module so that every field is quoted and
    # escaped. The previous hand-written formatting only escaped double
    # quotes in synthese_texte, so a quote in any other field broke the file.
    buffer = io.StringIO()
    # The header line stays unquoted, as before.
    buffer.write("eleve_id;synthese_texte;posture_generale;alertes;reussites\n")
    writer = csv.writer(
        buffer,
        delimiter=";",
        quoting=csv.QUOTE_ALL,
        lineterminator="\n",
    )

    for item in validated:
        s = item["synthese"]
        alertes = "; ".join(f"{a.matiere}: {a.description}" for a in s.alertes)
        reussites = "; ".join(f"{r.matiere}: {r.description}" for r in s.reussites)

        writer.writerow(
            [
                item["eleve_id"],
                s.synthese_texte,
                # Rendered through an f-string, as before, so the textual
                # form of the posture value does not change.
                f"{s.posture_generale}",
                alertes,
                reussites,
            ]
        )

    # No trailing newline, as before.
    csv_content = buffer.getvalue().rstrip("\n")

    # Save
    csv_path = DATA_DIR / f"export_syntheses_{classe_id}_T1.csv"
    csv_path.write_text(csv_content, encoding="utf-8")

    print(f"   Export: {csv_path}")
    print()
    print("--- Apercu CSV ---")
    print(csv_content[:500])

## 10. Resume du workflow

In [None]:
# Final recap: artefacts produced, actual costs and timings.
if pdfs:
    from src.llm.pricing import PricingCalculator

    # metadata is None when generation failed; work on a dict in all cases so
    # the recap can still be printed.
    meta = metadata or {}
    # `or 0` also covers keys that are present but set to None, which would
    # break the thousands-separator formatting below.
    tokens_input = meta.get("tokens_input") or 0
    tokens_output = meta.get("tokens_output") or 0

    # Actual LLM cost, computed only when both token counts are known.
    llm_cost = 0.0
    if tokens_input and tokens_output:
        calculator = PricingCalculator("openai", settings.openai_pricing)
        llm_cost = calculator.calculate(
            model=meta.get("llm_model") or settings.default_openai_model,
            prompt_tokens=tokens_input,
            completion_tokens=tokens_output,
        )

    # OCR cost, based on the number of pages actually returned by the OCR
    # step instead of assuming a single page.
    nb_pages = len(ocr_result["pages"])
    ocr_cost = nb_pages / 1000 * settings.mistral_ocr_cost_per_1000_pages

    print("=" * 60)
    print("RESUME DU WORKFLOW")
    print("=" * 60)
    print(f"1. PDF source       : {pdf_path.name}")
    print(f"2. Nom original     : {identity.get('nom_complet', 'N/A')}")
    print(f"3. ID anonyme       : {eleve_id}")
    # NOTE(review): `result` is expected to still be the anonymization
    # result here; this line fails if another cell has rebound that name.
    print(f"4. Anonymisation    : {result.replacements_count} remplacements ({duration_anon:.2f}s)")
    print(f"5. OCR Mistral      : {len(markdown_ocr)} chars ({duration_ocr:.2f}s)")
    print(f"6. Stockage DuckDB  : {TEST_DB.name}")
    print(f"7. Generation LLM   : {meta.get('tokens_total', 'N/A')} tokens ({duration_llm:.2f}s)")
    print(f"8. Export CSV       : {csv_path.name}")
    print()
    print("=== COUTS REELS ===")
    print(f"OCR Mistral ({nb_pages} page{'s' if nb_pages > 1 else ''}) : ${ocr_cost:.4f}")
    print(f"LLM OpenAI           : ${llm_cost:.6f}")
    print(f"  - Input tokens     : {tokens_input:,}")
    print(f"  - Output tokens    : {tokens_output:,}")
    print(f"TOTAL                : ${ocr_cost + llm_cost:.4f}")
    print()
    print(f"Temps total: {duration_anon + duration_ocr + duration_llm:.2f}s")
else:
    print("Aucun PDF traite - ajoutez des PDFs dans data/raw/")

## Nettoyage (optionnel)

In [None]:
# Supprimer la base de test
# Decommenter pour nettoyer:
# TEST_DB.unlink(missing_ok=True)
# print("Base de test supprimee")