In [None]:
!pip install transformers torch torchvision

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
!pip install psutil

Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m78.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3


In [None]:
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-jhj5w5h2
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-jhj5w5h2
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369490 sha256=60b56d36906f0f525b2c56eddee4275bb2439252d65b987beb11469b059c148a
  Stored in directory: /tmp/pip-ephem-wheel-cache-7a1rm11f/wheels/3f/7c/a4/9b490845988bf7a4d

In [None]:
# Réinstaller à chaque lancement du NB
!pip install PyMuPDF



In [None]:
import torch

from transformers import AutoProcessor, AutoModel
from transformers import CLIPProcessor, CLIPModel

from PIL import Image
import fitz # relancer !pip install PyMuPDF
import io

import time
import psutil
import tracemalloc
from sklearn.metrics import f1_score, precision_score, recall_score, hamming_loss, jaccard_score

import os
import numpy as np

In [None]:
# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

device: cuda


In [None]:
# When CUDA is available, print the name of device 0 and its total memory
# (bytes divided by 1024**3).
if torch.cuda.is_available():
    print(f'📊 GPU: {torch.cuda.get_device_name(0)}')
    print(f'💾 VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f}GB')

📊 GPU: Tesla T4
💾 VRAM: 14.7GB


## SET UP DU MEASURE PERFORMANCE_V2

In [None]:
def measure_performance(func, *args, **kwargs):
    """Run ``func`` once and report its duration and resident-memory growth.

    Args:
        func: Callable to measure.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        dict with three keys:
            "result": the value returned by ``func``.
            "time_seconds": elapsed wall-clock time of the call.
            "ram_mb": difference in this process's RSS (bytes / 1024 / 1024)
                between just before and just after the call; it can be
                negative if memory was released in the meantime.

    Any exception raised by ``func`` propagates unchanged.
    """
    process = psutil.Process()
    ram_before = process.memory_info().rss / 1024 / 1024

    # perf_counter is monotonic and high-resolution; time.time() can jump when
    # the system clock is adjusted, which would corrupt the measured interval.
    start = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed = time.perf_counter() - start

    ram_after = process.memory_info().rss / 1024 / 1024

    return {
        "result": result,
        "time_seconds": elapsed,
        "ram_mb": ram_after - ram_before
    }

In [None]:
def measure_performance_v2(func, *args, **kwargs):
    """Run ``func`` once and report time, CPU memory and (if available) GPU memory.

    Args:
        func: Callable to measure.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        dict with:
            "result": the value returned by ``func``.
            "time_seconds": elapsed wall-clock time of the call. When CUDA is
                available the device is synchronised before the clock is
                stopped, so queued GPU work is included.
            "cpu_ram_mb_delta": change in this process's RSS across the call
                (bytes / 1024 / 1024).
            "cpu_ram_peak_mb": peak reported by ``tracemalloc`` (bytes / 1024 / 1024).
                tracemalloc only tracks allocations made through Python's
                allocator, so this is not the process-wide peak.
            "gpu_vram_peak_mb": ``torch.cuda.max_memory_allocated()`` after the
                call (bytes / 1024 / 1024), or ``None`` without CUDA.

    Any exception raised by ``func`` propagates; tracing started here is
    stopped first.
    """
    process = psutil.Process()
    ram_before = process.memory_info().rss / 1024 / 1024

    cuda_available = torch.cuda.is_available()
    if cuda_available:
        # Flush pending GPU work so it is not attributed to this measurement,
        # then restart the peak counter.
        torch.cuda.synchronize()
        torch.cuda.reset_peak_memory_stats()

    # Only start (and later stop) tracing if nobody else is already tracing;
    # otherwise we would switch off the caller's own tracemalloc session.
    # In that case the reported peak may include earlier allocations.
    started_tracing = not tracemalloc.is_tracing()
    if started_tracing:
        tracemalloc.start()

    try:
        start = time.perf_counter()
        result = func(*args, **kwargs)
        if cuda_available:
            # CUDA kernels are launched asynchronously; wait for them so the
            # elapsed time covers the actual computation.
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - start

        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Always undo the tracing we enabled, even when func raises.
        if started_tracing:
            tracemalloc.stop()

    ram_after = process.memory_info().rss / 1024 / 1024

    if cuda_available:
        vram_peak_mb = torch.cuda.max_memory_allocated() / 1024 / 1024
    else:
        vram_peak_mb = None

    return {
        "result": result,
        "time_seconds": elapsed,
        "cpu_ram_mb_delta": ram_after - ram_before,
        "cpu_ram_peak_mb": peak / 1024 / 1024,
        "gpu_vram_peak_mb": vram_peak_mb
    }

## Transformation pdf en image dans le bon dossier

@Aghiles, j'avais mis directement les PP en pdf dans mes dossiers avant la transformation en image.

In [None]:
############################################
### Adjust these paths to your own setup ###
############################################

# Path of the PU_P01_PP01 PDF and root folder for the page images.
# NOTE(review): presumably both are passed to pdf_to_images_split; that call
# is not part of this section - confirm.
pu_p01_pp01 = "/content/drive/MyDrive/Document AI - GroupeSOS/AAP/PU_P01_PP01.pdf"
save_dir = "/content/drive/MyDrive/Document AI - GroupeSOS/Outputs/Save_img_from_pdf"

In [None]:
def pdf_to_images_split(pdf_path, save_dir, zoom=1.0):
    """Render every page of a PDF to ``<name>_page_<n>.jpg`` and return the images.

    The files are written to ``<save_dir>/<name>_folder/`` where ``<name>`` is
    the PDF file name without its extension; the folder is created if needed.

    Args:
        pdf_path: Path of the PDF to convert.
        save_dir: Root directory under which the per-PDF folder is created.
        zoom: Scale factor applied on both axes when rendering a page
            (1.0 keeps the previous hard-coded behaviour; larger values give
            larger images).

    Returns:
        list of PIL images, one per page, in page order.
    """
    # Derive the document name from the file name (extension stripped).
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    pdf_folder = os.path.join(save_dir, f"{pdf_name}_folder")
    os.makedirs(pdf_folder, exist_ok=True)

    print(f"Save directory: {pdf_folder}")

    # The render matrix does not depend on the page, so build it once.
    mat = fitz.Matrix(zoom, zoom)
    images = []

    # The context manager closes the document even if rendering or saving fails.
    with fitz.open(pdf_path) as pdf_doc:
        print(f"PDF pages: {len(pdf_doc)}")

        for page_num in range(len(pdf_doc)):
            page = pdf_doc.load_page(page_num)
            pix = page.get_pixmap(matrix=mat)

            # Go through an in-memory PPM so PIL can read the rendered pixmap.
            img_data = pix.tobytes("ppm")
            img = Image.open(io.BytesIO(img_data))
            images.append(img)

            # File names are 1-based while page indices are 0-based.
            output_name = os.path.join(pdf_folder, f"{pdf_name}_page_{page_num + 1}.jpg")
            img.save(output_name, "JPEG", quality=95)
            print(f"✅ Page {page_num + 1} saved: {output_name}")

    # Sanity check: count the page JPEGs present in the folder for this PDF.
    # Files left over from an earlier run are counted too.
    saved_files = [f for f in os.listdir(pdf_folder)
               if f.startswith(pdf_name) and f.endswith('.jpg') and '_page_' in f]

    print(f"✅ PDF pages: {len(images)}")
    print(f"✅ JPG files: {len(saved_files)}")
    print(f"✅ Match: {len(images) == len(saved_files)}")

    return images

## SET PU_P01_PP01

In [None]:
# Manual ground truth for PU_P01_PP01: one entry per page ("p1" .. "p49"),
# each mapping the labels "text", "table" and "schema" to a boolean.
ground_truth_pu_p01_pp01_multilabel = {
    "p1": {"text": True, "table": False, "schema":False}
    ,"p2": {"text": True, "table": False,"schema":False}
    ,"p3": {"text": True, "table": True, "schema":False}
    ,"p4": {"text": True, "table": False, "schema":False}
    ,"p5": {"text": True, "table": False, "schema":False}
    ,"p6": {"text": True, "table": False, "schema":False}
    ,"p7": {"text": True, "table": False, "schema":True} # Image / map: counted as a schema by definition
    ,"p8": {"text": True, "table": True, "schema":False}
    ,"p9": {"text": True, "table": False,"schema":False}
    ,"p10": {"text": True, "table": False,"schema":False}
    ,"p11": {"text": True, "table": False,"schema":True} # Image: map with a legend
    ,"p12": {"text": True, "table": True, "schema":False}
    ,"p13": {"text": True, "table": True, "schema":False} # table embedded as an image
    ,"p14": {"text": True, "table": False, "schema":False}
    ,"p15": {"text": True, "table": False, "schema":True} # Special case: this is an infographic
    ,"p16": {"text": True, "table": False, "schema":True} # Image: map with a legend
    ,"p17": {"text": True, "table": False, "schema":False}
    ,"p18": {"text": True, "table": False, "schema":False}
    ,"p19": {"text": True, "table": False, "schema":False}
    ,"p20": {"text": True, "table": False, "schema":False}
    ,"p21": {"text": True, "table": False, "schema":False}
    ,"p22": {"text": True, "table": True, "schema":False}
    ,"p23": {"text": True, "table": True, "schema":False}
    ,"p24": {"text": True, "table": False, "schema":False}
    ,"p25": {"text": True, "table": False, "schema":False}
    ,"p26": {"text": True, "table": False, "schema":False}
    ,"p27": {"text": True, "table": False, "schema":False}
    ,"p28": {"text": True, "table": False, "schema":False}
    ,"p29": {"text": True, "table": False, "schema":True} # Image: map with a legend
    ,"p30": {"text": True, "table": True, "schema":False}
    ,"p31": {"text": True, "table": True, "schema":False}
    ,"p32": {"text": True, "table": True, "schema":False}  # End of a table
    ,"p33": {"text": True, "table": False, "schema":False}
    ,"p34": {"text": True, "table": False, "schema":False}
    ,"p35": {"text": True, "table": True, "schema":False}
    ,"p36": {"text": True, "table": True, "schema":False}
    ,"p37": {"text": True, "table": True, "schema":False}
    ,"p38": {"text": True, "table": True, "schema":False}  # Excel-like table
    ,"p39": {"text": True, "table": False, "schema":False}
    ,"p40": {"text": True, "table": False, "schema":False}
    ,"p41": {"text": True, "table": True, "schema":False}
    ,"p42": {"text": True, "table": True, "schema":False}
    ,"p43": {"text": True, "table": True, "schema":False}
    ,"p44": {"text": True, "table": True, "schema":False}
    ,"p45": {"text": True, "table": True, "schema":False}
    ,"p46": {"text": True, "table": False, "schema":False}
    ,"p47": {"text": True, "table": True, "schema":False} # table laid out as a schedule
    ,"p48": {"text": True, "table": True, "schema":False} # table laid out as a schedule
    ,"p49": {"text": True, "table": True, "schema":False} # table laid out as a schedule
}

In [None]:
# Folder that holds the per-page JPEGs of PU_P01_PP01.
PU_P01_PP01_IMG_DIR = (
    "/content/drive/MyDrive/Document AI - GroupeSOS/Outputs/Save_img_from_pdf/PU_P01_PP01_folder"
)

# Maps each page id ("p1" .. "p49") to the path of its JPEG. Generated rather
# than written out by hand so the folder only has to be changed in one place;
# the resulting dict (keys, values and order) is the same as the former literal.
path_pu_p01_pp01 = {
    f"p{page}": f"{PU_P01_PP01_IMG_DIR}/PU_P01_PP01_page_{page}.jpg"
    for page in range(1, 50)
}

## Chargement modèle

In [None]:
def load_clip_patch32(device=None):
    """Load the ``openai/clip-vit-base-patch32`` processor and model.

    Args:
        device: Optional device (string or ``torch.device``) to move the model
            to. When ``None`` (default) the model is returned exactly as
            ``from_pretrained`` created it, which is the previous behaviour.

    Returns:
        tuple ``(processor, model, "Clip-Patch32")``.
    """
    # Single source of truth for the checkpoint so both objects always match.
    checkpoint = "openai/clip-vit-base-patch32"
    processor = CLIPProcessor.from_pretrained(checkpoint)
    model = CLIPModel.from_pretrained(checkpoint)
    if device is not None:
        model = model.to(device)
    return processor, model, "Clip-Patch32"

In [None]:
# Load the CLIP processor, model and display name once for the cells below.
processor_clip, model_clip, model_name_clip = load_clip_patch32()


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


## Baseline avec CLIP

In [None]:
def detect_multilabel_clip_baseline(image_path, processor, model, model_name='CLIP32', thresholds=None):
    """Zero-shot text/table detection on one page image, one prompt per label.

    For each label the image is compared with two prompts, a positive one and
    ``"document without <label>"``. The score of the label is the softmax
    probability of the positive prompt over that pair.

    Args:
        image_path: Path of the page image.
        processor: Processor used to build the model inputs from text + image.
        model: Model whose output exposes ``logits_per_image``.
        model_name: Name copied into the result under ``"model"``.
        thresholds: Optional dict ``{label: threshold}`` overriding the default
            of 0.5 per label. Labels that are not given keep 0.5.

    Returns:
        dict with "model", "predictions" (``{label: score > threshold}``),
        "scores" (``{label: float}``) and "thresholds" (the thresholds used).
    """
    # The context manager releases the file handle deterministically;
    # convert() yields an independent in-memory RGB image.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # One baseline prompt per class.
    prompts = {
        "text": "document with printed text and readable content",
        "table": "document with structured tables and organized data",
    }

    scores = {}
    # Run on whatever device the model's parameters live on.
    device = next(model.parameters()).device
    for label, prompt in prompts.items():
        # Positive prompt first, so column 0 of the softmax is the label's score.
        binary_prompts = [prompt, f"document without {label}"]

        inputs = processor(text=binary_prompts,
                           images=image,
                           return_tensors="pt",
                           padding=True)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():  # inference only, no gradient bookkeeping
            outputs = model(**inputs)
            probs = outputs.logits_per_image.softmax(dim=1)

        scores[label] = float(probs[0][0])

    # Baseline thresholds, optionally overridden by the caller.
    active_thresholds = {"text": 0.5, "table": 0.5}
    if thresholds:
        active_thresholds.update(thresholds)
    predictions = {label: score > active_thresholds[label] for label, score in scores.items()}

    return {
        "model": model_name,
        "predictions": predictions,
        "scores": scores,
        "thresholds": active_thresholds,
    }

## Fonction d'évaluation

F1_score : https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
> classes déséquilibrées (plus de pages text que table)

Hamming Loss : https://scikit-learn.org/stable/modules/generated/sklearn.metrics.hamming_loss.html#sklearn.metrics.hamming_loss
> pour mesurer erreurs par label individuellement

Jaccard : https://scikit-learn.org/stable/modules/generated/sklearn.metrics.jaccard_score.html#sklearn.metrics.jaccard_score
> multilabel (text ET table peuvent coexister)

In [None]:
def evaluate_binary_multilabel_model(image_folder, ground_truth, detect_function, processor, model,
                                     filename_template="PU_P01_PP01_page_{page}.jpg"):
    """Evaluate a text/table detector against a per-page ground truth.

    Args:
        image_folder: Folder containing the page images.
        ground_truth: dict ``{page_id: {"text": bool, "table": bool, ...}}``.
            The page number is taken as ``page_id[1:]`` (e.g. ``"p12"`` -> ``"12"``).
        detect_function: Callable ``(image_path, processor, model)`` returning a
            dict with "predictions" and "scores", each keyed by "text" and "table".
        processor: Forwarded to ``detect_function``.
        model: Forwarded to ``detect_function``.
        filename_template: File name pattern inside ``image_folder``; ``{page}``
            is replaced by the page number. Defaults to the PU_P01_PP01 naming.

    Returns:
        dict with "f1_text", "f1_table", "f1_macro", "hamming_loss",
        "jaccard_macro", "jaccard_micro", "jaccard_samples" and
        "jaccard_per_class" (the last one is per-label, ``average=None``).

    Prints one line per page showing whether each label matched and the scores.
    """
    y_true_text, y_true_table = [], []
    y_pred_text, y_pred_table = [], []
    y_true_multi, y_pred_multi = [], []

    for page_id, true_labels in ground_truth.items():
        image_path = os.path.join(image_folder, filename_template.format(page=page_id[1:]))

        result = detect_function(image_path, processor, model)
        pred_labels = result["predictions"]

        y_true_text.append(true_labels["text"])
        y_true_table.append(true_labels["table"])
        y_pred_text.append(pred_labels["text"])
        y_pred_table.append(pred_labels["table"])

        # Row layout is [text, table] for the multilabel metrics.
        y_true_multi.append([true_labels["text"], true_labels["table"]])
        y_pred_multi.append([pred_labels["text"], pred_labels["table"]])

        # Per-page visual summary. This two-label variant has no schema flag:
        # the former line referenced an undefined `schema_ok` and raised
        # NameError on the first page.
        text_ok = "✅" if true_labels["text"] == pred_labels["text"] else "❌"
        table_ok = "✅" if true_labels["table"] == pred_labels["table"] else "❌"
        print(f" {page_id}: {text_ok} {table_ok} | Text:{result['scores']['text']:.3f} vs Table:{result['scores']['table']:.3f}")

    return {
        "f1_text": f1_score(y_true_text, y_pred_text),
        "f1_table": f1_score(y_true_table, y_pred_table),
        "f1_macro": f1_score(y_true_multi, y_pred_multi, average='macro'),
        "hamming_loss": hamming_loss(y_true_multi, y_pred_multi),
        "jaccard_macro": jaccard_score(y_true_multi, y_pred_multi, average='macro'),
        "jaccard_micro": jaccard_score(y_true_multi, y_pred_multi, average='micro'),
        "jaccard_samples": jaccard_score(y_true_multi, y_pred_multi, average='samples'),
        "jaccard_per_class": jaccard_score(y_true_multi, y_pred_multi, average=None),
    }



In [None]:
def evaluate_binary_multilabel_model_v2(image_folder, ground_truth, detect_function, processor, model,
                                        filename_template="PU_P01_PP01_page_{page}.jpg"):
    """Evaluate a text/table/schema detector against a per-page ground truth.

    Args:
        image_folder: Folder containing the page images.
        ground_truth: dict ``{page_id: {"text": bool, "table": bool, "schema": bool}}``.
            The page number is taken as ``page_id[1:]`` (e.g. ``"p12"`` -> ``"12"``).
        detect_function: Callable ``(image_path, processor, model)`` returning a
            dict with "predictions" and "scores", each keyed by "text", "table"
            and "schema".
        processor: Forwarded to ``detect_function``.
        model: Forwarded to ``detect_function``.
        filename_template: File name pattern inside ``image_folder``; ``{page}``
            is replaced by the page number. Defaults to the PU_P01_PP01 naming.

    Returns:
        dict with "f1_text", "f1_table", "f1_schema", "f1_macro",
        "hamming_loss", "jaccard_macro", "jaccard_micro", "jaccard_samples"
        and "jaccard_per_class" (per-label, ``average=None``).

    Prints one line per page showing whether each label matched and the scores.
    """
    y_true_text, y_true_table, y_true_schema = [], [], []
    y_pred_text, y_pred_table, y_pred_schema = [], [], []
    y_true_multi, y_pred_multi = [], []

    for page_id, true_labels in ground_truth.items():
        image_path = os.path.join(image_folder, filename_template.format(page=page_id[1:]))

        result = detect_function(image_path, processor, model)
        pred_labels = result["predictions"]

        y_true_text.append(true_labels["text"])
        y_true_table.append(true_labels["table"])
        y_true_schema.append(true_labels["schema"])
        y_pred_text.append(pred_labels["text"])
        y_pred_table.append(pred_labels["table"])
        y_pred_schema.append(pred_labels["schema"])

        # Row layout is [text, table, schema] for the multilabel metrics.
        y_true_multi.append([true_labels["text"], true_labels["table"], true_labels["schema"]])
        y_pred_multi.append([pred_labels["text"], pred_labels["table"], pred_labels["schema"]])

        # Per-page visual summary.
        text_ok = "✅" if true_labels["text"] == pred_labels["text"] else "❌"
        table_ok = "✅" if true_labels["table"] == pred_labels["table"] else "❌"
        schema_ok = "✅" if true_labels["schema"] == pred_labels["schema"] else "❌"
        print(f" {page_id}: {text_ok} {table_ok} {schema_ok} | Text:{result['scores']['text']:.3f} vs Table:{result['scores']['table']:.3f} vs Schema:{result['scores']['schema']:.3f}")

    return {
        "f1_text": f1_score(y_true_text, y_pred_text),
        "f1_table": f1_score(y_true_table, y_pred_table),
        "f1_schema": f1_score(y_true_schema, y_pred_schema),
        "f1_macro": f1_score(y_true_multi, y_pred_multi, average='macro'),
        "hamming_loss": hamming_loss(y_true_multi, y_pred_multi),
        "jaccard_macro": jaccard_score(y_true_multi, y_pred_multi, average='macro'),
        "jaccard_micro": jaccard_score(y_true_multi, y_pred_multi, average='micro'),
        "jaccard_samples": jaccard_score(y_true_multi, y_pred_multi, average='samples'),
        "jaccard_per_class": jaccard_score(y_true_multi, y_pred_multi, average=None),
    }


## Baseline test

In [None]:
# Folder with the PU_P01_PP01 page images; passed as image_folder to the evaluation calls.
path_folder_test_baseline = "/content/drive/MyDrive/Document AI - GroupeSOS/Outputs/Save_img_from_pdf/PU_P01_PP01_folder"

In [None]:
# Evaluate the single-prompt CLIP baseline on the annotated pages.
metrics_clip = evaluate_binary_multilabel_model(
    image_folder=path_folder_test_baseline,
    ground_truth=ground_truth_pu_p01_pp01_multilabel,
    detect_function=detect_multilabel_clip_baseline,
    processor=processor_clip,
    model=model_clip,
)

# Summary: scalar metrics first, then the Jaccard variants.
print("\n📊 Résultats CLIP:")
for _title, _key in (
    ("F1 Text", "f1_text"),
    ("F1 Table", "f1_table"),
    ("F1 Macro", "f1_macro"),
    ("Hamming loss", "hamming_loss"),
):
    print(f"{_title}: {metrics_clip[_key]:.3f}")
print("Jaccard: {:.3f} & {:.3f}".format(metrics_clip["jaccard_macro"], metrics_clip["jaccard_micro"]))
print("Jaccard samples :{:.3f}".format(metrics_clip["jaccard_samples"]))
print("Jaccard per class: {}".format(metrics_clip["jaccard_per_class"]))


 p1: ✅ ✅ | Text:0.983 vs Table:0.020
 p2: ✅ ✅ | Text:0.934 vs Table:0.014
 p3: ✅ ❌ | Text:0.978 vs Table:0.106
 p4: ✅ ✅ | Text:0.964 vs Table:0.030
 p5: ✅ ✅ | Text:0.957 vs Table:0.099
 p6: ✅ ✅ | Text:0.969 vs Table:0.017
 p7: ✅ ✅ | Text:0.818 vs Table:0.213
 p8: ✅ ❌ | Text:0.889 vs Table:0.053
 p9: ✅ ✅ | Text:0.968 vs Table:0.044
 p10: ✅ ✅ | Text:0.962 vs Table:0.022
 p11: ✅ ✅ | Text:0.733 vs Table:0.061
 p12: ✅ ❌ | Text:0.948 vs Table:0.319
 p13: ✅ ❌ | Text:0.960 vs Table:0.120
 p14: ✅ ✅ | Text:0.976 vs Table:0.030
 p15: ✅ ❌ | Text:0.905 vs Table:0.027
 p16: ✅ ✅ | Text:0.941 vs Table:0.075
 p17: ✅ ✅ | Text:0.969 vs Table:0.009
 p18: ✅ ✅ | Text:0.968 vs Table:0.018
 p19: ✅ ✅ | Text:0.930 vs Table:0.026
 p20: ✅ ✅ | Text:0.975 vs Table:0.024
 p21: ✅ ✅ | Text:0.923 vs Table:0.032
 p22: ✅ ❌ | Text:0.873 vs Table:0.024
 p23: ✅ ❌ | Text:0.859 vs Table:0.202
 p24: ✅ ✅ | Text:0.884 vs Table:0.060
 p25: ✅ ✅ | Text:0.937 vs Table:0.026
 p26: ✅ ✅ | Text:0.964 vs Table:0.017
 p27: ✅ ✅ | Text:0.94

## Amélioration en essayant consensus multi prompt

Note : j'avais également essayé le multi-prompt sur la classe « text », mais les résultats étaient nettement moins bons ; je suis donc revenu au seul prompt de la baseline pour cette classe.

In [None]:
def detect_multilabel_clip_consensus(image_path, processor, model, model_name='CLIP 32 Consensus', thresholds=None):
    """Zero-shot text/table detection where each label averages several prompts.

    Every prompt variant of a label is compared with ``"document without
    <label>"``; the softmax probability of the variant is its score, and the
    label's score is the mean over its variants. "text" uses a single prompt,
    "table" uses six.

    Args:
        image_path: Path of the page image.
        processor: Processor used to build the model inputs from text + image.
        model: Model whose output exposes ``logits_per_image``.
        model_name: Name copied into the result under ``"model"``.
        thresholds: Optional dict ``{label: threshold}`` overriding the default
            of 0.5 per label. Labels that are not given keep 0.5.

    Returns:
        dict with "model", "predictions" (``{label: score > threshold}``),
        "scores" (``{label: mean score}``) and "thresholds" (the thresholds used).
    """
    # The context manager releases the file handle deterministically;
    # convert() yields an independent in-memory RGB image.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Prompt variants per class.
    prompts_variants = {
        "text": [
            "document with printed text and readable content",                 # Baseline
        ],
        "table": [
            "document with structured tables and organized data",              # Baseline
            "page containing data tables with rows and columns",               # Descriptive
            "administrative form with budget tables and financial data",       # Business context
            "document with tabular information and structured data layout",    # Layout focus
            "page showing organized data grids and numerical tables",          # Data focus
            "document containing structured information in table format",      # Format focus
        ],
    }

    results = {}
    # Run on whatever device the model's parameters live on.
    device = next(model.parameters()).device

    for label, prompts_list in prompts_variants.items():
        scores = []

        for prompt in prompts_list:
            # Positive prompt first, so column 0 of the softmax is its score.
            binary_prompts = [prompt, f"document without {label}"]

            inputs = processor(text=binary_prompts,
                               images=image,
                               return_tensors="pt",
                               padding=True)
            inputs = {key: value.to(device) for key, value in inputs.items()}

            with torch.no_grad():  # inference only, no gradient bookkeeping
                outputs = model(**inputs)
                probs = outputs.logits_per_image.softmax(dim=1)

            scores.append(float(probs[0][0]))

        # Consensus = arithmetic mean of the per-prompt scores.
        results[label] = sum(scores) / len(scores)

    # Baseline thresholds, optionally overridden by the caller.
    active_thresholds = {"text": 0.5, "table": 0.5}
    if thresholds:
        active_thresholds.update(thresholds)
    predictions = {label: score > active_thresholds[label] for label, score in results.items()}

    return {
        "model": model_name,
        "predictions": predictions,
        "scores": results,
        "thresholds": active_thresholds,
    }


In [None]:
# Evaluate the multi-prompt consensus detector on the annotated pages.
metrics_clip_consensus = evaluate_binary_multilabel_model(
    image_folder=path_folder_test_baseline,
    ground_truth=ground_truth_pu_p01_pp01_multilabel,
    detect_function=detect_multilabel_clip_consensus,
    processor=processor_clip,
    model=model_clip,
)

# Summary: scalar metrics first, then the Jaccard variants.
print("\n📊 Résultats CLIP:")
for _title, _key in (
    ("F1 Text", "f1_text"),
    ("F1 Table", "f1_table"),
    ("F1 Macro", "f1_macro"),
    ("Hamming loss", "hamming_loss"),
):
    print(f"{_title}: {metrics_clip_consensus[_key]:.3f}")
print("Jaccard: {:.3f} & {:.3f}".format(metrics_clip_consensus["jaccard_macro"], metrics_clip_consensus["jaccard_micro"]))
print("Jaccard samples :{:.3f}".format(metrics_clip_consensus["jaccard_samples"]))
print("Jaccard per class: {}".format(metrics_clip_consensus["jaccard_per_class"]))

 p1: ✅ ✅ | Text:0.983 vs Table:0.231
 p2: ✅ ✅ | Text:0.934 vs Table:0.070
 p3: ✅ ❌ | Text:0.978 vs Table:0.317
 p4: ✅ ✅ | Text:0.964 vs Table:0.191
 p5: ✅ ✅ | Text:0.957 vs Table:0.344
 p6: ✅ ✅ | Text:0.969 vs Table:0.122
 p7: ✅ ✅ | Text:0.818 vs Table:0.383
 p8: ✅ ❌ | Text:0.889 vs Table:0.303
 p9: ✅ ✅ | Text:0.968 vs Table:0.130
 p10: ✅ ✅ | Text:0.962 vs Table:0.086
 p11: ✅ ✅ | Text:0.733 vs Table:0.097
 p12: ✅ ✅ | Text:0.948 vs Table:0.578
 p13: ✅ ❌ | Text:0.960 vs Table:0.392
 p14: ✅ ✅ | Text:0.976 vs Table:0.260
 p15: ✅ ❌ | Text:0.905 vs Table:0.121
 p16: ✅ ✅ | Text:0.941 vs Table:0.173
 p17: ✅ ✅ | Text:0.969 vs Table:0.117
 p18: ✅ ✅ | Text:0.968 vs Table:0.187
 p19: ✅ ✅ | Text:0.930 vs Table:0.144
 p20: ✅ ✅ | Text:0.975 vs Table:0.258
 p21: ✅ ✅ | Text:0.923 vs Table:0.239
 p22: ✅ ❌ | Text:0.873 vs Table:0.253
 p23: ✅ ✅ | Text:0.859 vs Table:0.581
 p24: ✅ ✅ | Text:0.884 vs Table:0.209
 p25: ✅ ✅ | Text:0.937 vs Table:0.148
 p26: ✅ ✅ | Text:0.964 vs Table:0.199
 p27: ✅ ✅ | Text:0.94

## Consensus v2 : test avec d'autres prompts (chatGPT)

In [None]:
def detect_multilabel_clip_consensus_v2(image_path, processor, model, model_name='CLIP 32 Consensus', thresholds=None):
    """Consensus text/table detection using a second set of table prompts.

    Every prompt variant of a label is compared with ``"document without
    <label>"``; the softmax probability of the variant is its score, and the
    label's score is the mean over its variants. "text" uses a single prompt,
    "table" uses six.

    Args:
        image_path: Path of the page image.
        processor: Processor used to build the model inputs from text + image.
        model: Model whose output exposes ``logits_per_image``.
        model_name: Name copied into the result under ``"model"``.
            NOTE(review): the default is the same string as in
            ``detect_multilabel_clip_consensus``; it is kept for compatibility,
            so pass an explicit name to tell the two variants apart.
        thresholds: Optional dict ``{label: threshold}`` overriding the default
            of 0.5 per label. Labels that are not given keep 0.5.

    Returns:
        dict with "model", "predictions" (``{label: score > threshold}``),
        "scores" (``{label: mean score}``) and "thresholds" (the thresholds used).
    """
    # The context manager releases the file handle deterministically;
    # convert() yields an independent in-memory RGB image.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Prompt variants per class.
    prompts_variants = {
        "text": [
            "document with printed text and readable content",                 # Baseline
        ],
        "table": [
            "document with data tables containing rows and columns",
            "administrative document with structured numerical tables",
            "page showing budget tables and financial data",
            "document containing organized tabular information",
            "form with data grids and structured layouts",
            "document with statistical tables and data charts",
        ],
    }

    results = {}
    # Run on whatever device the model's parameters live on.
    device = next(model.parameters()).device

    for label, prompts_list in prompts_variants.items():
        scores = []

        for prompt in prompts_list:
            # Positive prompt first, so column 0 of the softmax is its score.
            binary_prompts = [prompt, f"document without {label}"]

            inputs = processor(text=binary_prompts,
                               images=image,
                               return_tensors="pt",
                               padding=True)
            inputs = {key: value.to(device) for key, value in inputs.items()}

            with torch.no_grad():  # inference only, no gradient bookkeeping
                outputs = model(**inputs)
                probs = outputs.logits_per_image.softmax(dim=1)

            scores.append(float(probs[0][0]))

        # Consensus = arithmetic mean of the per-prompt scores.
        results[label] = sum(scores) / len(scores)

    # Baseline thresholds, optionally overridden by the caller.
    active_thresholds = {"text": 0.5, "table": 0.5}
    if thresholds:
        active_thresholds.update(thresholds)
    predictions = {label: score > active_thresholds[label] for label, score in results.items()}

    return {
        "model": model_name,
        "predictions": predictions,
        "scores": results,
        "thresholds": active_thresholds,
    }


In [None]:
# Evaluate the second consensus prompt set on the annotated pages.
metrics_clip_consensus_v2 = evaluate_binary_multilabel_model(
    path_folder_test_baseline
    ,ground_truth_pu_p01_pp01_multilabel
    ,detect_multilabel_clip_consensus_v2
    ,processor_clip
    ,model_clip
)

# Report the metrics of THIS run. The previous version printed
# `metrics_clip_consensus`, i.e. the numbers of the first consensus experiment.
print(f"\n📊 Résultats CLIP:")
print(f"F1 Text: {metrics_clip_consensus_v2['f1_text']:.3f}")
print(f"F1 Table: {metrics_clip_consensus_v2['f1_table']:.3f}")
print(f"F1 Macro: {metrics_clip_consensus_v2['f1_macro']:.3f}")
print(f"Hamming loss: {metrics_clip_consensus_v2['hamming_loss']:.3f}")
print(f"Jaccard: {metrics_clip_consensus_v2['jaccard_macro']:.3f} & {metrics_clip_consensus_v2['jaccard_micro']:.3f}")
print(f"Jaccard samples :{metrics_clip_consensus_v2['jaccard_samples']:.3f}")
print(f"Jaccard per class: {metrics_clip_consensus_v2['jaccard_per_class']}")

 p1: ✅ ✅ | Text:0.983 vs Table:0.332
 p2: ✅ ✅ | Text:0.934 vs Table:0.092
 p3: ✅ ❌ | Text:0.978 vs Table:0.358
 p4: ✅ ✅ | Text:0.964 vs Table:0.370
 p5: ✅ ✅ | Text:0.957 vs Table:0.245
 p6: ✅ ✅ | Text:0.969 vs Table:0.123
 p7: ✅ ❌ | Text:0.818 vs Table:0.542
 p8: ✅ ❌ | Text:0.889 vs Table:0.276
 p9: ✅ ✅ | Text:0.968 vs Table:0.107
 p10: ✅ ✅ | Text:0.962 vs Table:0.129
 p11: ✅ ✅ | Text:0.733 vs Table:0.162
 p12: ✅ ✅ | Text:0.948 vs Table:0.567
 p13: ✅ ❌ | Text:0.960 vs Table:0.364
 p14: ✅ ✅ | Text:0.976 vs Table:0.194
 p15: ✅ ❌ | Text:0.905 vs Table:0.169
 p16: ✅ ✅ | Text:0.941 vs Table:0.280
 p17: ✅ ✅ | Text:0.969 vs Table:0.104
 p18: ✅ ✅ | Text:0.968 vs Table:0.221
 p19: ✅ ✅ | Text:0.930 vs Table:0.127
 p20: ✅ ✅ | Text:0.975 vs Table:0.258
 p21: ✅ ✅ | Text:0.923 vs Table:0.263
 p22: ✅ ❌ | Text:0.873 vs Table:0.244
 p23: ✅ ✅ | Text:0.859 vs Table:0.639
 p24: ✅ ✅ | Text:0.884 vs Table:0.220
 p25: ✅ ✅ | Text:0.937 vs Table:0.106
 p26: ✅ ✅ | Text:0.964 vs Table:0.178
 p27: ✅ ✅ | Text:0.94

## Analyse des prompts pour sélectionner les meilleurs prompts

In [None]:
def analyze_prompt_effectiveness(image_path, processor, model, label="table"):
    """Rank candidate prompts for one label by their score on a single image.

    Each candidate prompt is paired with the negative prompt
    ``"document without {label}"``. The score of a candidate is the first
    entry of ``logits_per_image.softmax(dim=1)`` for that pair, i.e. the
    probability mass given to the candidate rather than to the negative prompt.

    Args:
        image_path: Value passed to ``Image.open``.
        processor: Callable invoked as ``processor(text=..., images=...,
            return_tensors="pt", padding=True)``; it must return a mapping
            whose values support ``.to(device)``.
        model: Object exposing ``parameters()`` and returning an output with a
            ``logits_per_image`` attribute when called with the processor
            output as keyword arguments.
        label: ``"table"`` selects the table prompts; any other value selects
            the text prompts. The value is also inserted in the negative prompt.

    Returns:
        A list of ``(prompt_index, prompt_preview, score)`` tuples sorted by
        decreasing score. ``prompt_preview`` is the prompt limited to 50
        characters; ``"..."`` is appended only when characters were dropped.
    """
    preview_len = 50  # maximum number of prompt characters kept in the report

    # Open inside a context manager so the file handle is released as soon as
    # the RGB copy has been made.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    if label == "table":
        prompts = [
            "document with structured tables and organized data"                # baseline
            ,"page containing data tables with rows and columns"                # descriptive
            ,"administrative form with budget tables and financial data"        # business context
            ,"document with tabular information and structured data layout"     # layout focus
            ,"page showing organized data grids and numerical tables"           # data focus
            ,"document containing structured information in table format"       # format focus
        ]
    else:  # any other label falls back to the text prompts
        prompts = [
            "document with printed text and readable content"
            ,"administrative document containing written text"
            ,"page with paragraphs and textual information"
            ,"document showing readable text content and written material"
            ,"form with written instructions and text sections"
        ]

    results = []
    device = next(model.parameters()).device

    for i, prompt in enumerate(prompts):
        # Candidate prompt first, negative prompt second: index 0 of the
        # softmax output is therefore the candidate's probability.
        binary_prompts = [prompt, f"document without {label}"]

        inputs = processor(text=binary_prompts, images=image, return_tensors="pt", padding=True)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            probs = outputs.logits_per_image.softmax(dim=1)

        score = float(probs[0][0])

        # Only mark the preview as truncated when something was actually cut.
        if len(prompt) > preview_len:
            preview = prompt[:preview_len] + "..."
        else:
            preview = prompt
        results.append((i, preview, score))

    # Most effective prompt first.
    results.sort(key=lambda x: x[2], reverse=True)
    return results

In [None]:
# Pages that still need improvement: rank the table prompts for each of them.

for page in [
    "p3", "p8", "p13", "p15", "p22", "p30",
    "p31", "p32", "p35", "p36", "p37", "p38",
]:
    print(f"\n📊 {page} - Analysis table prompts:")
    analysis = analyze_prompt_effectiveness(
        path_pu_p01_pp01[page], processor_clip, model_clip, "table"
    )
    # `rank` stays zero-based; only the displayed value is shifted by one.
    for rank, (idx, prompt, score) in enumerate(analysis):
        print(f"  {rank+1}. [{idx}] {score:.3f} - {prompt}")


📊 p3 - Analysis table prompts:
  1. [3] 0.727 - document with tabular information and structured d...
  2. [2] 0.461 - administrative form with budget tables and financi...
  3. [5] 0.348 - document containing structured information in tabl...
  4. [4] 0.236 - page showing organized data grids and numerical ta...
  5. [0] 0.106 - document with structured tables and organized data...
  6. [1] 0.027 - page containing data tables with rows and columns...

📊 p8 - Analysis table prompts:
  1. [2] 0.679 - administrative form with budget tables and financi...
  2. [5] 0.578 - document containing structured information in tabl...
  3. [3] 0.272 - document with tabular information and structured d...
  4. [4] 0.214 - page showing organized data grids and numerical ta...
  5. [0] 0.053 - document with structured tables and organized data...
  6. [1] 0.019 - page containing data tables with rows and columns...

📊 p13 - Analysis table prompts:
  1. [4] 0.799 - page showing organized data grids an

## Consensus v3 : réduction du nombre de prompts suite à l'analyse

In [None]:
def detect_multilabel_clip_consensus_v3(image_path, processor, model, model_name='CLIP 32 Consensus'):
    """Detect the "text" and "table" labels on one image by prompt consensus.

    For each label, every positive prompt is paired with the negative prompt
    ``"document without {label}"``. The per-prompt score is the first entry of
    ``logits_per_image.softmax(dim=1)`` for that pair, and the label score is
    the plain mean of its per-prompt scores. A label is predicted when its
    score is strictly greater than its threshold (0.5 for both labels).

    Args:
        image_path: Value passed to ``Image.open``.
        processor: Callable invoked as ``processor(text=..., images=...,
            return_tensors="pt", padding=True)``; it must return a mapping
            whose values support ``.to(device)``.
        model: Object exposing ``parameters()`` and returning an output with a
            ``logits_per_image`` attribute when called with the processor
            output as keyword arguments.
        model_name: Free-form name copied into the result.

    Returns:
        A dict with the keys ``"model"``, ``"predictions"`` (label -> bool),
        ``"scores"`` (label -> float) and ``"thresholds"`` (label -> float).
    """
    # Open inside a context manager so the file handle is released as soon as
    # the RGB copy has been made.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Positive prompts per label (table prompts retained after the prompt analysis).
    prompts_variants = {
        "text": [
            "document with printed text and readable content",  # baseline
        ],
        "table": [
            "administrative form with budget tables and financial data",
            "document with tabular information and structured data",
            "document containing structured information in table format",
        ],
    }

    results = {}
    device = next(model.parameters()).device

    for label, prompts_list in prompts_variants.items():
        scores = []

        for prompt in prompts_list:
            # Positive prompt first: index 0 of the softmax output is its probability.
            binary_prompts = [prompt, f"document without {label}"]

            inputs = processor(
                text=binary_prompts,
                images=image,
                return_tensors="pt",
                padding=True,
            )
            inputs = {key: value.to(device) for key, value in inputs.items()}

            with torch.no_grad():  # inference only, no gradient bookkeeping
                outputs = model(**inputs)
                probs = outputs.logits_per_image.softmax(dim=1)

            scores.append(float(probs[0][0]))

        # Consensus = unweighted mean of the per-prompt scores.
        results[label] = sum(scores) / len(scores)

    # Baseline thresholds.
    thresholds = {"text": 0.5, "table": 0.5}
    predictions = {label: score > thresholds[label] for label, score in results.items()}

    return {
        "model": model_name,
        "predictions": predictions,
        "scores": results,
        "thresholds": thresholds,
    }


In [None]:
# Evaluate the consensus v3 detector against the multilabel ground truth.
metrics_clip_consensus_v3 = evaluate_binary_multilabel_model(
    path_folder_test_baseline,
    ground_truth_pu_p01_pp01_multilabel,
    detect_multilabel_clip_consensus_v3,
    processor_clip,
    model_clip,
)

# Print the metrics stored in the evaluation result.
print("\n📊 Résultats CLIP:")
print(f'F1 Text: {metrics_clip_consensus_v3["f1_text"]:.3f}')
print(f'F1 Table: {metrics_clip_consensus_v3["f1_table"]:.3f}')
print(f'F1 Macro: {metrics_clip_consensus_v3["f1_macro"]:.3f}')
print(f'Hamming loss: {metrics_clip_consensus_v3["hamming_loss"]:.3f}')
print(f'Jaccard: {metrics_clip_consensus_v3["jaccard_macro"]:.3f} & {metrics_clip_consensus_v3["jaccard_micro"]:.3f}')
print(f'Jaccard samples :{metrics_clip_consensus_v3["jaccard_samples"]:.3f}')
print(f'Jaccard per class: {metrics_clip_consensus_v3["jaccard_per_class"]}')

 p1: ✅ ✅ | Text:0.983 vs Table:0.417
 p2: ✅ ✅ | Text:0.934 vs Table:0.117
 p3: ✅ ❌ | Text:0.978 vs Table:0.484
 p4: ✅ ✅ | Text:0.964 vs Table:0.309
 p5: ✅ ❌ | Text:0.957 vs Table:0.597
 p6: ✅ ✅ | Text:0.969 vs Table:0.229
 p7: ✅ ✅ | Text:0.818 vs Table:0.452
 p8: ✅ ❌ | Text:0.889 vs Table:0.481
 p9: ✅ ✅ | Text:0.968 vs Table:0.223
 p10: ✅ ✅ | Text:0.962 vs Table:0.124
 p11: ✅ ✅ | Text:0.733 vs Table:0.175
 p12: ✅ ✅ | Text:0.948 vs Table:0.802
 p13: ✅ ❌ | Text:0.960 vs Table:0.487
 p14: ✅ ✅ | Text:0.976 vs Table:0.466
 p15: ✅ ❌ | Text:0.905 vs Table:0.201
 p16: ✅ ✅ | Text:0.941 vs Table:0.309
 p17: ✅ ✅ | Text:0.969 vs Table:0.196
 p18: ✅ ✅ | Text:0.968 vs Table:0.294
 p19: ✅ ✅ | Text:0.930 vs Table:0.240
 p20: ✅ ✅ | Text:0.975 vs Table:0.496
 p21: ✅ ✅ | Text:0.923 vs Table:0.403
 p22: ✅ ❌ | Text:0.873 vs Table:0.456
 p23: ✅ ✅ | Text:0.859 vs Table:0.720
 p24: ✅ ✅ | Text:0.884 vs Table:0.357
 p25: ✅ ✅ | Text:0.937 vs Table:0.284
 p26: ✅ ✅ | Text:0.964 vs Table:0.394
 p27: ✅ ✅ | Text:0.94

## Recherche spécifique pour p15, essai sur Consensus v3_1

In [None]:
import torch.nn.functional as F

In [None]:
def analyze_prompt_effectiveness_p15(image_path, processor, model):
    """Score diagram-oriented prompts against "document with table" on one image.

    Specialised variant of the prompt analysis written for page p15. Each
    candidate prompt is paired with the fixed contrast prompt
    ``"document with table"``; its score is the first entry of
    ``logits_per_image.softmax(dim=1)`` for that pair. Every score is printed
    as it is computed, followed by the full ranking.

    Args:
        image_path: Value passed to ``Image.open``.
        processor: Callable invoked as ``processor(text=..., images=...,
            return_tensors="pt", padding=True)``; it must return a mapping
            whose values support ``.to(device)``.
        model: Object exposing ``parameters()`` and returning an output with a
            ``logits_per_image`` attribute when called with the processor
            output as keyword arguments.

    Returns:
        A list of ``(score, prompt)`` tuples sorted in decreasing order; equal
        scores fall back to comparing the prompt text.
    """
    # Prompts that might work better on p15.
    test_prompts = [
        # Current baseline
        "business process flowchart with colored sections and directional arrows, not planning schedule",

        # Visual elements specific to real diagrams
        "flowchart with interconnected boxes and directional flow arrows",
        "diagram showing process relationships with connecting arrows",
        "business logic diagram with linked components and flow direction",

        # Explicit exclusion of planning documents
        "process flowchart with connections, not scheduling document",
        "conceptual diagram with linked elements, not timeline or calendar",
        "workflow diagram with process flow, not planning grid",

        # Conceptual connections
        "conceptual framework diagram with connected logical elements",
        "process model showing relationships between different components",
    ]

    # Open inside a context manager so the file handle is released as soon as
    # the RGB copy has been made.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    device = next(model.parameters()).device

    scores = []

    print("🔬 ANALYSE SPÉCIALISÉE p15")
    print("="*50)

    for prompt in test_prompts:
        # Candidate first, contrast prompt second: index 0 of the softmax
        # output is the candidate's probability.
        binary_prompts = [prompt, "document with table"]

        inputs = processor(text=binary_prompts, images=image,
                           return_tensors="pt", padding=True)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            probs = outputs.logits_per_image.softmax(dim=1)

        score = float(probs[0][0])
        scores.append((score, prompt))
        print(f"   {score:.3f} | {prompt}")

    # Best-scoring prompt first.
    sorted_scores = sorted(scores, reverse=True)

    print(f"\n🏆 RANKING:")
    for i, (score, prompt) in enumerate(sorted_scores):
        print(f"   {i+1:2d}. {score:.3f} | {prompt}")

    return sorted_scores

In [None]:
def detect_multilabel_clip_consensus_v3_1(image_path, processor, model, model_name='CLIP 32 Consensus'):
    """Detect the "text" and "table" labels on one image by prompt consensus (v3.1).

    Same scheme as consensus v3 with one extra "infographic" table prompt.
    For each label, every positive prompt is paired with the negative prompt
    ``"document without {label}"``. The per-prompt score is the first entry of
    ``logits_per_image.softmax(dim=1)`` for that pair, and the label score is
    the plain mean of its per-prompt scores. A label is predicted when its
    score is strictly greater than its threshold (0.5 for both labels).

    Args:
        image_path: Value passed to ``Image.open``.
        processor: Callable invoked as ``processor(text=..., images=...,
            return_tensors="pt", padding=True)``; it must return a mapping
            whose values support ``.to(device)``.
        model: Object exposing ``parameters()`` and returning an output with a
            ``logits_per_image`` attribute when called with the processor
            output as keyword arguments.
        model_name: Free-form name copied into the result.

    Returns:
        A dict with the keys ``"model"``, ``"predictions"`` (label -> bool),
        ``"scores"`` (label -> float) and ``"thresholds"`` (label -> float).
    """
    # Open inside a context manager so the file handle is released as soon as
    # the RGB copy has been made.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Positive prompts per label.
    prompts_variants = {
        "text": [
            "document with printed text and readable content",  # baseline
        ],
        "table": [
            "administrative form with budget tables and financial data",
            "document with tabular information and structured data",
            "document containing structured information in table format",
            "infographic showing structured data relationships",
        ],
    }

    results = {}
    device = next(model.parameters()).device

    for label, prompts_list in prompts_variants.items():
        scores = []

        for prompt in prompts_list:
            # Positive prompt first: index 0 of the softmax output is its probability.
            binary_prompts = [prompt, f"document without {label}"]

            inputs = processor(
                text=binary_prompts,
                images=image,
                return_tensors="pt",
                padding=True,
            )
            inputs = {key: value.to(device) for key, value in inputs.items()}

            with torch.no_grad():  # inference only, no gradient bookkeeping
                outputs = model(**inputs)
                probs = outputs.logits_per_image.softmax(dim=1)

            scores.append(float(probs[0][0]))

        # Consensus = unweighted mean of the per-prompt scores.
        results[label] = sum(scores) / len(scores)

    # Baseline thresholds.
    thresholds = {"text": 0.5, "table": 0.5}
    predictions = {label: score > thresholds[label] for label, score in results.items()}

    return {
        "model": model_name,
        "predictions": predictions,
        "scores": results,
        "thresholds": thresholds,
    }

In [None]:
# Evaluate the consensus v3.1 detector against the multilabel ground truth.
metrics_clip_consensus_v3_1 = evaluate_binary_multilabel_model(
    path_folder_test_baseline,
    ground_truth_pu_p01_pp01_multilabel,
    detect_multilabel_clip_consensus_v3_1,
    processor_clip,
    model_clip,
)

# Print the metrics stored in the evaluation result.
print("\n📊 Résultats CLIP:")
print(f'F1 Text: {metrics_clip_consensus_v3_1["f1_text"]:.3f}')
print(f'F1 Table: {metrics_clip_consensus_v3_1["f1_table"]:.3f}')
print(f'F1 Macro: {metrics_clip_consensus_v3_1["f1_macro"]:.3f}')
print(f'Hamming loss: {metrics_clip_consensus_v3_1["hamming_loss"]:.3f}')
print(f'Jaccard: {metrics_clip_consensus_v3_1["jaccard_macro"]:.3f} & {metrics_clip_consensus_v3_1["jaccard_micro"]:.3f}')
print(f'Jaccard samples :{metrics_clip_consensus_v3_1["jaccard_samples"]:.3f}')
print(f'Jaccard per class: {metrics_clip_consensus_v3_1["jaccard_per_class"]}')

 p1: ✅ ✅ | Text:0.983 vs Table:0.314
 p2: ✅ ✅ | Text:0.934 vs Table:0.093
 p3: ✅ ❌ | Text:0.978 vs Table:0.366
 p4: ✅ ✅ | Text:0.964 vs Table:0.233
 p5: ✅ ✅ | Text:0.957 vs Table:0.458
 p6: ✅ ✅ | Text:0.969 vs Table:0.175
 p7: ✅ ✅ | Text:0.818 vs Table:0.480
 p8: ✅ ❌ | Text:0.889 vs Table:0.363
 p9: ✅ ✅ | Text:0.968 vs Table:0.173
 p10: ✅ ✅ | Text:0.962 vs Table:0.097
 p11: ✅ ✅ | Text:0.733 vs Table:0.320
 p12: ✅ ✅ | Text:0.948 vs Table:0.628
 p13: ✅ ❌ | Text:0.960 vs Table:0.466
 p14: ✅ ✅ | Text:0.976 vs Table:0.384
 p15: ✅ ❌ | Text:0.905 vs Table:0.397
 p16: ✅ ✅ | Text:0.941 vs Table:0.293
 p17: ✅ ✅ | Text:0.969 vs Table:0.149
 p18: ✅ ✅ | Text:0.968 vs Table:0.221
 p19: ✅ ✅ | Text:0.930 vs Table:0.181
 p20: ✅ ✅ | Text:0.975 vs Table:0.412
 p21: ✅ ✅ | Text:0.923 vs Table:0.304
 p22: ✅ ❌ | Text:0.873 vs Table:0.342
 p23: ✅ ✅ | Text:0.859 vs Table:0.544
 p24: ✅ ✅ | Text:0.884 vs Table:0.281
 p25: ✅ ✅ | Text:0.937 vs Table:0.257
 p26: ✅ ✅ | Text:0.964 vs Table:0.320
 p27: ✅ ✅ | Text:0.94

### Thinking Note

J'ai essayé d'optimiser les seuils (thresholds), mais cela augmentait les faux positifs :
en baissant le seuil, on augmente le rappel mais on fait baisser la précision.

Solution : Amélioration des prompts

Test : Essai des poids puis GridSearch

## Ajout des poids

Test manuel

In [None]:
def detect_multilabel_clip_weighted_v3_1(image_path, processor, model, model_name='CLIP v3.1 Weighted'):
    """Detect the "text" and "table" labels on one image by weighted prompt consensus.

    For each label, every positive prompt is paired with the negative prompt
    ``"document without {label}"``. The per-prompt score is the first entry of
    ``logits_per_image.softmax(dim=1)`` for that pair, and the label score is
    the weighted mean of its per-prompt scores using the hand-tuned weights
    below. A label is predicted when its score is strictly greater than its
    threshold (0.5 for both labels).

    Args:
        image_path: Value passed to ``Image.open``.
        processor: Callable invoked as ``processor(text=..., images=...,
            return_tensors="pt", padding=True)``; it must return a mapping
            whose values support ``.to(device)``.
        model: Object exposing ``parameters()`` and returning an output with a
            ``logits_per_image`` attribute when called with the processor
            output as keyword arguments.
        model_name: Free-form name copied into the result.

    Returns:
        A dict with the keys ``"model"``, ``"predictions"`` (label -> bool),
        ``"scores"`` (label -> float) and ``"thresholds"`` (label -> float).
    """
    # Open inside a context manager so the file handle is released as soon as
    # the RGB copy has been made.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # (prompt, weight) pairs; weights adjusted by hand from observed results.
    prompts_variants = {
        "text": [
            ("document with printed text and readable content", 1.0),
        ],
        "table": [
            # v3 prompts kept with the highest weights (author's notes translated)
            ("administrative form with budget tables and financial data", 1.4),    # strong on Excel-like pages
            ("document with tabular information and structured data", 1.6),        # good general-purpose prompt
            ("document containing structured information in table format", 1.2),   # solid support
            # infographic prompt with a moderate weight (aimed at diagrams)
            ("infographic showing structured data relationships", 1.0),            # helps p15
        ],
    }

    results = {}
    device = next(model.parameters()).device

    for label, prompts_list in prompts_variants.items():
        weighted_scores = []
        total_weight = 0

        for prompt, weight in prompts_list:
            # Positive prompt first: index 0 of the softmax output is its probability.
            binary_prompts = [prompt, f"document without {label}"]

            inputs = processor(
                text=binary_prompts,
                images=image,
                return_tensors="pt",
                padding=True,
            )
            inputs = {key: value.to(device) for key, value in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)
                probs = outputs.logits_per_image.softmax(dim=1)

            weighted_scores.append(float(probs[0][0]) * weight)
            total_weight += weight

        # Weighted mean of the per-prompt scores.
        results[label] = sum(weighted_scores) / total_weight

    thresholds = {"text": 0.5, "table": 0.5}
    predictions = {label: score > thresholds[label] for label, score in results.items()}

    return {
        "model": model_name,
        "predictions": predictions,
        "scores": results,
        "thresholds": thresholds
    }

In [None]:
# Evaluate the hand-weighted v3.1 detector against the multilabel ground truth.
metrics_clip_weighted_v3_1 = evaluate_binary_multilabel_model(
    path_folder_test_baseline,
    ground_truth_pu_p01_pp01_multilabel,
    detect_multilabel_clip_weighted_v3_1,
    processor_clip,
    model_clip,
)

# Print the metrics stored in the evaluation result.
print("\n📊 Résultats CLIP:")
print(f'F1 Text: {metrics_clip_weighted_v3_1["f1_text"]:.3f}')
print(f'F1 Table: {metrics_clip_weighted_v3_1["f1_table"]:.3f}')
print(f'F1 Macro: {metrics_clip_weighted_v3_1["f1_macro"]:.3f}')
print(f'Hamming loss: {metrics_clip_weighted_v3_1["hamming_loss"]:.3f}')
print(f'Jaccard: {metrics_clip_weighted_v3_1["jaccard_macro"]:.3f} & {metrics_clip_weighted_v3_1["jaccard_micro"]:.3f}')
print(f'Jaccard samples :{metrics_clip_weighted_v3_1["jaccard_samples"]:.3f}')
print(f'Jaccard per class: {metrics_clip_weighted_v3_1["jaccard_per_class"]}')

 p1: ✅ ✅ | Text:0.983 vs Table:0.350
 p2: ✅ ✅ | Text:0.934 vs Table:0.096
 p3: ✅ ❌ | Text:0.978 vs Table:0.405
 p4: ✅ ✅ | Text:0.964 vs Table:0.237
 p5: ✅ ✅ | Text:0.957 vs Table:0.495
 p6: ✅ ✅ | Text:0.969 vs Table:0.188
 p7: ✅ ✅ | Text:0.818 vs Table:0.494
 p8: ✅ ❌ | Text:0.889 vs Table:0.375
 p9: ✅ ✅ | Text:0.968 vs Table:0.187
 p10: ✅ ✅ | Text:0.962 vs Table:0.101
 p11: ✅ ✅ | Text:0.733 vs Table:0.302
 p12: ✅ ✅ | Text:0.948 vs Table:0.668
 p13: ✅ ❌ | Text:0.960 vs Table:0.481
 p14: ✅ ✅ | Text:0.976 vs Table:0.401
 p15: ✅ ❌ | Text:0.905 vs Table:0.363
 p16: ✅ ✅ | Text:0.941 vs Table:0.321
 p17: ✅ ✅ | Text:0.969 vs Table:0.157
 p18: ✅ ✅ | Text:0.968 vs Table:0.233
 p19: ✅ ✅ | Text:0.930 vs Table:0.181
 p20: ✅ ✅ | Text:0.975 vs Table:0.430
 p21: ✅ ✅ | Text:0.923 vs Table:0.317
 p22: ✅ ❌ | Text:0.873 vs Table:0.358
 p23: ✅ ✅ | Text:0.859 vs Table:0.570
 p24: ✅ ✅ | Text:0.884 vs Table:0.292
 p25: ✅ ✅ | Text:0.937 vs Table:0.264
 p26: ✅ ✅ | Text:0.964 vs Table:0.339
 p27: ✅ ✅ | Text:0.94

In [None]:
# NOTE(review): this cell runs the same evaluation as the cell right before it
# and overwrites `metrics_clip_weighted_v3_1`. The recorded outputs of the two
# cells differ, presumably because the detector was redefined with other
# weights in between; confirm and keep a single, reproducible run.
metrics_clip_weighted_v3_1 = evaluate_binary_multilabel_model(
    path_folder_test_baseline,
    ground_truth_pu_p01_pp01_multilabel,
    detect_multilabel_clip_weighted_v3_1,
    processor_clip,
    model_clip,
)

# Print the metrics stored in the evaluation result.
print("\n📊 Résultats CLIP:")
print(f'F1 Text: {metrics_clip_weighted_v3_1["f1_text"]:.3f}')
print(f'F1 Table: {metrics_clip_weighted_v3_1["f1_table"]:.3f}')
print(f'F1 Macro: {metrics_clip_weighted_v3_1["f1_macro"]:.3f}')
print(f'Hamming loss: {metrics_clip_weighted_v3_1["hamming_loss"]:.3f}')
print(f'Jaccard: {metrics_clip_weighted_v3_1["jaccard_macro"]:.3f} & {metrics_clip_weighted_v3_1["jaccard_micro"]:.3f}')
print(f'Jaccard samples :{metrics_clip_weighted_v3_1["jaccard_samples"]:.3f}')
print(f'Jaccard per class: {metrics_clip_weighted_v3_1["jaccard_per_class"]}')

 p1: ✅ ✅ | Text:0.983 vs Table:0.369
 p2: ✅ ✅ | Text:0.934 vs Table:0.102
 p3: ✅ ❌ | Text:0.978 vs Table:0.446
 p4: ✅ ✅ | Text:0.964 vs Table:0.255
 p5: ✅ ❌ | Text:0.957 vs Table:0.540
 p6: ✅ ✅ | Text:0.969 vs Table:0.207
 p7: ✅ ✅ | Text:0.818 vs Table:0.496
 p8: ✅ ❌ | Text:0.889 vs Table:0.391
 p9: ✅ ✅ | Text:0.968 vs Table:0.203
 p10: ✅ ✅ | Text:0.962 vs Table:0.111
 p11: ✅ ✅ | Text:0.733 vs Table:0.278
 p12: ✅ ✅ | Text:0.948 vs Table:0.715
 p13: ✅ ❌ | Text:0.960 vs Table:0.494
 p14: ✅ ✅ | Text:0.976 vs Table:0.421
 p15: ✅ ❌ | Text:0.905 vs Table:0.311
 p16: ✅ ✅ | Text:0.941 vs Table:0.342
 p17: ✅ ✅ | Text:0.969 vs Table:0.171
 p18: ✅ ✅ | Text:0.968 vs Table:0.260
 p19: ✅ ✅ | Text:0.930 vs Table:0.193
 p20: ✅ ✅ | Text:0.975 vs Table:0.448
 p21: ✅ ✅ | Text:0.923 vs Table:0.338
 p22: ✅ ❌ | Text:0.873 vs Table:0.370
 p23: ✅ ✅ | Text:0.859 vs Table:0.599
 p24: ✅ ✅ | Text:0.884 vs Table:0.309
 p25: ✅ ✅ | Text:0.937 vs Table:0.263
 p26: ✅ ✅ | Text:0.964 vs Table:0.351
 p27: ✅ ✅ | Text:0.94

## Grid search sur les poids des 3 prompts du consensus v3

In [None]:
import itertools

In [None]:
def detect_multilabel_clip_grid_v3(image_path, processor, model, weights):
    """Lightweight consensus v3 detector with caller-supplied table weights.

    Used by the grid search: the three table prompts of consensus v3 are
    combined by a weighted mean using ``weights``; the single text prompt has
    a fixed weight of 1.0. Per-prompt scores are the first entry of
    ``logits_per_image.softmax(dim=1)`` for the pair
    ``[prompt, "document without {label}"]``. A label is predicted when its
    score is strictly greater than 0.5.

    Args:
        image_path: Value passed to ``Image.open``.
        processor: Callable invoked as ``processor(text=..., images=...,
            return_tensors="pt", padding=True)``; it must return a mapping
            whose values support ``.to(device)``.
        model: Object exposing ``parameters()`` and returning an output with a
            ``logits_per_image`` attribute when called with the processor
            output as keyword arguments.
        weights: Indexable holding at least three numbers, in the order
            (administrative form, tabular information, table format).
            Entries beyond the third are ignored.

    Returns:
        A dict mapping ``"text"`` and ``"table"`` to booleans (predictions only).

    Raises:
        IndexError: If ``weights`` has fewer than three entries.
        ValueError: If the three weights sum to zero (the weighted mean is
            undefined). Previously this surfaced as a ``ZeroDivisionError``
            only after every model pass had been run.
    """
    # Validate the weights before doing any expensive work.
    admin_w, tabular_w, format_w = weights[0], weights[1], weights[2]
    if admin_w + tabular_w + format_w == 0:
        raise ValueError(
            f"table prompt weights must not sum to zero, got {(admin_w, tabular_w, format_w)}"
        )

    # Open inside a context manager so the file handle is released as soon as
    # the RGB copy has been made.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    prompts_variants = {
        "text": [
            ("document with printed text and readable content", 1.0),
        ],
        "table": [
            ("administrative form with budget tables and financial data", admin_w),
            ("document with tabular information and structured data", tabular_w),
            ("document containing structured information in table format", format_w),
        ],
    }

    results = {}
    device = next(model.parameters()).device

    for label, prompts_list in prompts_variants.items():
        weighted_scores = []
        total_weight = 0

        for prompt, weight in prompts_list:
            # Positive prompt first: index 0 of the softmax output is its probability.
            binary_prompts = [prompt, f"document without {label}"]
            inputs = processor(text=binary_prompts, images=image,
                               return_tensors="pt", padding=True)
            inputs = {key: value.to(device) for key, value in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)
                probs = outputs.logits_per_image.softmax(dim=1)

            weighted_scores.append(float(probs[0][0]) * weight)
            total_weight += weight

        # Weighted mean of the per-prompt scores.
        results[label] = sum(weighted_scores) / total_weight

    thresholds = {"text": 0.5, "table": 0.5}
    predictions = {label: score > thresholds[label] for label, score in results.items()}

    return predictions

In [None]:
def lightweight_grid_search_v3(path_dict, ground_truth, processor, model):
    """Small grid search over the three table-prompt weights of consensus v3.

    The baseline weights ``[1.3, 1.8, 1.0]`` are evaluated first, then every
    other combination of the 3x3x3 grid around them. Each candidate is scored
    with ``f1_score`` on the "table" label, using
    ``detect_multilabel_clip_grid_v3`` on every entry of ``path_dict``.
    Progress and a summary are printed.

    Args:
        path_dict: Mapping ``page -> image_path``; every page must also be a
            key of ``ground_truth``.
        ground_truth: Mapping ``page -> {"table": <truth value>, ...}``.
        processor: Forwarded unchanged to ``detect_multilabel_clip_grid_v3``.
        model: Forwarded unchanged to ``detect_multilabel_clip_grid_v3``.

    Returns:
        A dict with:
        ``baseline_f1`` / ``baseline_weights``: score and weights of the baseline;
        ``best_weights`` / ``best_f1``: best weights overall, baseline included
        (the baseline is kept on ties);
        ``improvements``: ``(weights, f1, gain)`` tuples for every combination
        strictly better than the baseline, sorted by decreasing gain.
    """

    def table_f1(weights):
        """Return the table F1 obtained with ``weights`` over all pages."""
        y_true_table, y_pred_table = [], []
        for page, image_path in path_dict.items():
            pred = detect_multilabel_clip_grid_v3(image_path, processor, model, weights)
            y_true_table.append(ground_truth[page]['table'])
            y_pred_table.append(pred['table'])
        return f1_score(y_true_table, y_pred_table)

    print("🔍 GRID SEARCH LÉGER v3 - Optimisation 3 poids")
    print("="*50)

    # Narrow ranges centred on the current weights.
    ranges = {
        'admin_form': [1.2, 1.3, 1.4],        # 1.3 +/- 0.1
        'tabular_info': [1.7, 1.8, 1.9],      # 1.8 +/- 0.1
        'table_format': [0.9, 1.0, 1.1]       # 1.0 +/- 0.1
    }

    # 27 combinations (3 x 3 x 3).
    combinations = list(itertools.product(
        ranges['admin_form'],
        ranges['tabular_info'],
        ranges['table_format']
    ))

    print(f"📊 {len(combinations)} combinaisons à tester")

    current_baseline = [1.3, 1.8, 1.0]  # current weights

    # Evaluate the baseline first; errors here are not swallowed.
    print(f"\n🎯 BASELINE: {current_baseline}")
    baseline_f1 = table_f1(current_baseline)
    print(f"   F1 Table baseline: {baseline_f1:.3f}")

    # Seed the running best with the baseline so that `best_weights` is never
    # None and never worse than the baseline.
    best_f1_table = baseline_f1
    best_weights = tuple(current_baseline)

    improvements = []

    for i, weights in enumerate(combinations):
        if list(weights) == current_baseline:
            continue  # baseline already evaluated

        print(f"\r🧪 Test {i+1}/{len(combinations)}: {weights}", end="")

        try:
            f1_table = table_f1(weights)
        except Exception as e:
            # Deliberate best-effort: report the failing combination and move on.
            print(f"\n❌ Erreur: {e}")
            continue

        if f1_table > baseline_f1:
            improvements.append((weights, f1_table, f1_table - baseline_f1))

        if f1_table > best_f1_table:
            best_f1_table = f1_table
            best_weights = weights

    print(f"\n\n🏆 RÉSULTATS:")
    print("="*40)

    if improvements:
        print(f"✅ {len(improvements)} améliorations trouvées!")

        # Top 5 improvements, largest gain first.
        improvements.sort(key=lambda x: x[2], reverse=True)
        print(f"\n🔥 TOP améliorations:")
        for i, (weights, f1, gain) in enumerate(improvements[:5]):
            # The `+` format flag already prints the sign; no literal "+" needed.
            print(f"   {i+1}. {weights} → F1: {f1:.3f} ({gain:+.3f})")

        print(f"\n🎯 MEILLEUR:")
        best = improvements[0]
        print(f"   Poids: {best[0]}")
        print(f"   F1 Table: {best[1]:.3f}")
        # A relative gain is undefined when the baseline F1 is 0.
        if baseline_f1 > 0:
            relative_gain = f"{best[2]/baseline_f1*100:+.1f}%"
        else:
            relative_gain = "n/a"
        print(f"   Gain: +{best[2]:.3f} ({relative_gain})")

    else:
        print(f"❌ Aucune amélioration trouvée")
        print(f"   Baseline {current_baseline} reste optimal: {baseline_f1:.3f}")

    return {
        'baseline_f1': baseline_f1,
        'baseline_weights': current_baseline,
        'best_weights': best_weights,
        'best_f1': best_f1_table,
        'improvements': improvements
    }

In [None]:
def quick_test_weights(path_dict, ground_truth, processor, model, test_weights):
    """Evaluate one specific set of table weights and return its table F1.

    Runs ``detect_multilabel_clip_grid_v3`` with ``test_weights`` on every
    entry of ``path_dict``, compares the "table" predictions with
    ``ground_truth[page]['table']`` through ``f1_score``, prints the weights
    and the score, and returns the score.
    """
    print(f"⚡ TEST RAPIDE: {test_weights}")

    # One (truth, prediction) pair per page, in path_dict order.
    labelled = []
    for page_id, img_path in path_dict.items():
        outcome = detect_multilabel_clip_grid_v3(img_path, processor, model, test_weights)
        labelled.append((ground_truth[page_id]['table'], outcome['table']))

    expected = [truth for truth, _ in labelled]
    predicted = [guess for _, guess in labelled]

    f1_table = f1_score(expected, predicted)
    print(f"   F1 Table: {f1_table:.3f}")

    return f1_table

In [None]:
# Run the light grid search over the table-prompt weights on every page of
# `path_pu_p01_pp01`.
results = lightweight_grid_search_v3(
    path_pu_p01_pp01,
    ground_truth_pu_p01_pp01_multilabel,
    processor_clip,
    model_clip,
)

🔍 GRID SEARCH LÉGER v3 - Optimisation 3 poids
📊 27 combinaisons à tester

🎯 BASELINE: [1.3, 1.8, 1.0]


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipython-input-34-2777842423.py", line 1, in <cell line: 0>
    results = lightweight_grid_search_v3(path_pu_p01_pp01
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-32-2914503883.py", line 34, in lightweight_grid_search_v3
    pred = detect_multilabel_clip_grid_v3(image_path, processor, model, current_baseline)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-31-3852009679.py", line 30, in detect_multilabel_clip_grid_v3
    outputs = model(**inputs)
              ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  Fil

## Test after Grid

Test des 5 Top améliorations

In [None]:
def detect_multilabel_clip_grid_v3_1(image_path, processor, model, model_name='CLIP v3.1 Weighted'):
    """Detect "text" and "table" with the three v3 prompts and fixed weights (1.2, 1.7, 1.0).

    For each label, every positive prompt is paired with the negative prompt
    ``"document without {label}"``. The per-prompt score is the first entry of
    ``logits_per_image.softmax(dim=1)`` for that pair, and the label score is
    the weighted mean of its per-prompt scores. A label is predicted when its
    score is strictly greater than its threshold (0.5 for both labels).

    Args:
        image_path: Value passed to ``Image.open``.
        processor: Callable invoked as ``processor(text=..., images=...,
            return_tensors="pt", padding=True)``; it must return a mapping
            whose values support ``.to(device)``.
        model: Object exposing ``parameters()`` and returning an output with a
            ``logits_per_image`` attribute when called with the processor
            output as keyword arguments.
        model_name: Free-form name copied into the result.

    Returns:
        A dict with the keys ``"model"``, ``"predictions"`` (label -> bool),
        ``"scores"`` (label -> float) and ``"thresholds"`` (label -> float).
    """
    # Open inside a context manager so the file handle is released as soon as
    # the RGB copy has been made.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # (prompt, weight) pairs; one of the weight sets tried after the grid search.
    prompts_variants = {
        "text": [
            ("document with printed text and readable content", 1.0),
        ],
        "table": [
            # v3 prompts (author's notes translated)
            ("administrative form with budget tables and financial data", 1.2),    # strong on Excel-like pages
            ("document with tabular information and structured data", 1.7),        # good general-purpose prompt
            ("document containing structured information in table format", 1.0),   # solid support
        ],
    }

    results = {}
    device = next(model.parameters()).device

    for label, prompts_list in prompts_variants.items():
        weighted_scores = []
        total_weight = 0

        for prompt, weight in prompts_list:
            # Positive prompt first: index 0 of the softmax output is its probability.
            binary_prompts = [prompt, f"document without {label}"]

            inputs = processor(
                text=binary_prompts,
                images=image,
                return_tensors="pt",
                padding=True,
            )
            inputs = {key: value.to(device) for key, value in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)
                probs = outputs.logits_per_image.softmax(dim=1)

            weighted_scores.append(float(probs[0][0]) * weight)
            total_weight += weight

        # Weighted mean of the per-prompt scores.
        results[label] = sum(weighted_scores) / total_weight

    thresholds = {"text": 0.5, "table": 0.5}
    predictions = {label: score > thresholds[label] for label, score in results.items()}

    return {
        "model": model_name,
        "predictions": predictions,
        "scores": results,
        "thresholds": thresholds
    }

In [None]:
# Evaluate the grid-weighted v3.1 detector against the multilabel ground truth.
metrics_clip_grid_v3_1 = evaluate_binary_multilabel_model(
    path_folder_test_baseline,
    ground_truth_pu_p01_pp01_multilabel,
    detect_multilabel_clip_grid_v3_1,
    processor_clip,
    model_clip,
)

# Print the metrics stored in the evaluation result.
print("\n📊 Résultats CLIP:")
print(f'F1 Text: {metrics_clip_grid_v3_1["f1_text"]:.3f}')
print(f'F1 Table: {metrics_clip_grid_v3_1["f1_table"]:.3f}')
print(f'F1 Macro: {metrics_clip_grid_v3_1["f1_macro"]:.3f}')
print(f'Hamming loss: {metrics_clip_grid_v3_1["hamming_loss"]:.3f}')
print(f'Jaccard: {metrics_clip_grid_v3_1["jaccard_macro"]:.3f} & {metrics_clip_grid_v3_1["jaccard_micro"]:.3f}')
print(f'Jaccard samples :{metrics_clip_grid_v3_1["jaccard_samples"]:.3f}')
print(f'Jaccard per class: {metrics_clip_grid_v3_1["jaccard_per_class"]}')

 p1: ✅ ✅ | Text:0.983 vs Table:0.430
 p2: ✅ ✅ | Text:0.934 vs Table:0.112
 p3: ✅ ✅ | Text:0.978 vs Table:0.511
 p4: ✅ ✅ | Text:0.964 vs Table:0.282
 p5: ✅ ❌ | Text:0.957 vs Table:0.611
 p6: ✅ ✅ | Text:0.969 vs Table:0.234
 p7: ✅ ✅ | Text:0.818 vs Table:0.498
 p8: ✅ ❌ | Text:0.889 vs Table:0.439
 p9: ✅ ✅ | Text:0.968 vs Table:0.228
 p10: ✅ ✅ | Text:0.962 vs Table:0.122
 p11: ✅ ✅ | Text:0.733 vs Table:0.219
 p12: ✅ ✅ | Text:0.948 vs Table:0.801
 p13: ✅ ✅ | Text:0.960 vs Table:0.512
 p14: ✅ ✅ | Text:0.976 vs Table:0.460
 p15: ✅ ❌ | Text:0.905 vs Table:0.223
 p16: ✅ ✅ | Text:0.941 vs Table:0.368
 p17: ✅ ✅ | Text:0.969 vs Table:0.191
 p18: ✅ ✅ | Text:0.968 vs Table:0.292
 p19: ✅ ✅ | Text:0.930 vs Table:0.213
 p20: ✅ ✅ | Text:0.975 vs Table:0.488
 p21: ✅ ✅ | Text:0.923 vs Table:0.379
 p22: ✅ ❌ | Text:0.873 vs Table:0.419
 p23: ✅ ✅ | Text:0.859 vs Table:0.676
 p24: ✅ ✅ | Text:0.884 vs Table:0.341
 p25: ✅ ✅ | Text:0.937 vs Table:0.277
 p26: ✅ ✅ | Text:0.964 vs Table:0.389
 p27: ✅ ✅ | Text:0.94

In [None]:
# Run the same evaluation through measure_performance_v2; the first argument is
# the function to call, the remaining ones are the evaluation's own arguments.
# NOTE(review): measure_performance_v2 is defined elsewhere; confirm how it
# forwards the arguments and what it measures.
perf_grid_v3_1 = measure_performance_v2(
    evaluate_binary_multilabel_model
    ,path_folder_test_baseline
    ,ground_truth_pu_p01_pp01_multilabel
    ,detect_multilabel_clip_grid_v3_1
    ,processor_clip
    ,model_clip
)

 p1: ✅ ❌ | Text:0.983 vs Table:0.613
 p2: ✅ ✅ | Text:0.934 vs Table:0.095
 p3: ✅ ✅ | Text:0.978 vs Table:0.550
 p4: ✅ ✅ | Text:0.964 vs Table:0.200
 p5: ✅ ❌ | Text:0.957 vs Table:0.604
 p6: ✅ ✅ | Text:0.969 vs Table:0.211
 p7: ✅ ❌ | Text:0.818 vs Table:0.602
 p8: ✅ ❌ | Text:0.889 vs Table:0.440
 p9: ✅ ✅ | Text:0.968 vs Table:0.224
 p10: ✅ ✅ | Text:0.962 vs Table:0.088
 p11: ✅ ✅ | Text:0.733 vs Table:0.233
 p12: ✅ ✅ | Text:0.948 vs Table:0.814
 p13: ✅ ✅ | Text:0.960 vs Table:0.536
 p14: ✅ ✅ | Text:0.976 vs Table:0.468
 p15: ✅ ❌ | Text:0.905 vs Table:0.294
 p16: ✅ ✅ | Text:0.941 vs Table:0.448
 p17: ✅ ✅ | Text:0.969 vs Table:0.162
 p18: ✅ ✅ | Text:0.968 vs Table:0.203
 p19: ✅ ✅ | Text:0.930 vs Table:0.138
 p20: ✅ ❌ | Text:0.975 vs Table:0.515
 p21: ✅ ✅ | Text:0.923 vs Table:0.340
 p22: ✅ ❌ | Text:0.873 vs Table:0.475
 p23: ✅ ✅ | Text:0.859 vs Table:0.703
 p24: ✅ ✅ | Text:0.884 vs Table:0.326
 p25: ✅ ✅ | Text:0.937 vs Table:0.340
 p26: ✅ ✅ | Text:0.964 vs Table:0.477
 p27: ✅ ✅ | Text:0.94

In [None]:
# Report the timing and CPU memory figures stored in perf_grid_v3_1.
print(f'⏱️ Temps : {perf_grid_v3_1["time_seconds"]:.2f} sec')
print(f'🧠 RAM CPU delta : {perf_grid_v3_1["cpu_ram_mb_delta"]:.2f} MB')
print(f'📈 RAM CPU pic : {perf_grid_v3_1["cpu_ram_peak_mb"]:.2f} MB')

# A None VRAM peak is reported as "CUDA GPU not available".
if perf_grid_v3_1["gpu_vram_peak_mb"] is None:
    print("⚠️ GPU CUDA non disponible")
else:
    print(f'🚀 VRAM GPU pic : {perf_grid_v3_1["gpu_vram_peak_mb"]:.2f} MB')

⏱️ Temps : 39.69 sec
🧠 RAM CPU delta : 9.54 MB
📈 RAM CPU pic : 4.16 MB
🚀 VRAM GPU pic : 0.00 MB


In [None]:
def detect_multilabel_clip_grid_v3_2(image_path, processor, model, model_name='CLIP v3.2 Weighted'):
    """Label one page image as text and/or table via weighted CLIP prompts (variant v3.2).

    Every label has a list of (positive prompt, weight) pairs. Each positive
    prompt is paired with the negative prompt "document without <label>"; the
    softmax probability assigned to the positive prompt is that prompt's score.
    A label's score is the weighted mean of its prompt scores, and the label is
    predicted when the score is strictly greater than its threshold (0.5).

    Args:
        image_path: Image file to classify; opened with PIL and converted to RGB.
        processor: Called with text=, images=, return_tensors="pt" and
            padding=True; the mapping it returns is moved to the model device.
            Presumably a Hugging Face CLIPProcessor -- confirm against callers.
        model: Exposes parameters() and is called with the processed inputs as
            keyword arguments; its result must provide logits_per_image.
            Presumably a Hugging Face CLIPModel -- confirm against callers.
        model_name: Free-form label returned unchanged under the "model" key.

    Returns:
        dict with the keys "model", "predictions" (label -> bool),
        "scores" (label -> weighted mean score) and "thresholds"
        (label -> float).
    """
    page_image = Image.open(image_path).convert("RGB")

    # (prompt, weight) pairs per label. Author's notes on the table prompts:
    # the first is excellent on Excel-like pages, the second is a very good
    # general-purpose prompt, the third gives solid support.
    weighted_prompts = {
        "text": [
            ("document with printed text and readable content", 1.0),
        ],
        "table": [
            ("administrative form with budget tables and financial data", 1.2),
            ("document with tabular information and structured data", 1.7),
            ("document containing structured information in table format", 1.1),
        ],
    }

    label_scores = {}
    target_device = next(model.parameters()).device

    def positive_probability(positive_prompt, label):
        # Two-way comparison: index 0 is the positive prompt, index 1 the negative one.
        encoded = processor(
            text=[positive_prompt, f"document without {label}"],
            images=page_image,
            return_tensors="pt",
            padding=True,
        )
        encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

        with torch.no_grad():
            pair_probs = model(**encoded).logits_per_image.softmax(dim=1)

        return float(pair_probs[0][0])

    for label, candidates in weighted_prompts.items():
        contributions = [
            positive_probability(positive_prompt, label) * weight
            for positive_prompt, weight in candidates
        ]

        weight_total = 0
        for _, weight in candidates:
            weight_total += weight

        label_scores[label] = sum(contributions) / weight_total

    thresholds = {"text": 0.5, "table": 0.5}
    predictions = {}
    for label, value in label_scores.items():
        predictions[label] = value > thresholds[label]

    return {
        "model": model_name,
        "predictions": predictions,
        "scores": label_scores,
        "thresholds": thresholds,
    }

In [None]:
# Evaluate the v3.2 detector on the baseline test folder, then print the summary metrics.
metrics_clip_grid_v3_2 = evaluate_binary_multilabel_model(
    path_folder_test_baseline,
    ground_truth_pu_p01_pp01_multilabel,
    detect_multilabel_clip_grid_v3_2,
    processor_clip,
    model_clip,
)

print(
    "\n📊 Résultats CLIP:",
    f"F1 Text: {metrics_clip_grid_v3_2['f1_text']:.3f}",
    f"F1 Table: {metrics_clip_grid_v3_2['f1_table']:.3f}",
    f"F1 Macro: {metrics_clip_grid_v3_2['f1_macro']:.3f}",
    f"Hamming loss: {metrics_clip_grid_v3_2['hamming_loss']:.3f}",
    f"Jaccard: {metrics_clip_grid_v3_2['jaccard_macro']:.3f} & {metrics_clip_grid_v3_2['jaccard_micro']:.3f}",
    f"Jaccard samples :{metrics_clip_grid_v3_2['jaccard_samples']:.3f}",
    f"Jaccard per class: {metrics_clip_grid_v3_2['jaccard_per_class']}",
    sep="\n",
)

 p1: ✅ ✅ | Text:0.983 vs Table:0.420
 p2: ✅ ✅ | Text:0.934 vs Table:0.113
 p3: ✅ ✅ | Text:0.978 vs Table:0.507
 p4: ✅ ✅ | Text:0.964 vs Table:0.288
 p5: ✅ ❌ | Text:0.957 vs Table:0.610
 p6: ✅ ✅ | Text:0.969 vs Table:0.235
 p7: ✅ ✅ | Text:0.818 vs Table:0.489
 p8: ✅ ❌ | Text:0.889 vs Table:0.442
 p9: ✅ ✅ | Text:0.968 vs Table:0.228
 p10: ✅ ✅ | Text:0.962 vs Table:0.124
 p11: ✅ ✅ | Text:0.733 vs Table:0.214
 p12: ✅ ✅ | Text:0.948 vs Table:0.800
 p13: ✅ ✅ | Text:0.960 vs Table:0.508
 p14: ✅ ✅ | Text:0.976 vs Table:0.460
 p15: ✅ ❌ | Text:0.905 vs Table:0.218
 p16: ✅ ✅ | Text:0.941 vs Table:0.359
 p17: ✅ ✅ | Text:0.969 vs Table:0.193
 p18: ✅ ✅ | Text:0.968 vs Table:0.296
 p19: ✅ ✅ | Text:0.930 vs Table:0.219
 p20: ✅ ✅ | Text:0.975 vs Table:0.487
 p21: ✅ ✅ | Text:0.923 vs Table:0.383
 p22: ✅ ❌ | Text:0.873 vs Table:0.419
 p23: ✅ ✅ | Text:0.859 vs Table:0.678
 p24: ✅ ✅ | Text:0.884 vs Table:0.343
 p25: ✅ ✅ | Text:0.937 vs Table:0.275
 p26: ✅ ✅ | Text:0.964 vs Table:0.385
 p27: ✅ ✅ | Text:0.94

In [None]:
def detect_multilabel_clip_grid_v3_3(image_path, processor, model, model_name='CLIP v3.3 Weighted'):
    """Label one page image as text and/or table via weighted CLIP prompts (variant v3.3).

    Each positive prompt is scored against the negative prompt
    "document without <label>"; the prompt score is the softmax probability of
    the positive prompt. Per label, the prompt scores are combined into a
    weighted mean, and the label is predicted when that mean is strictly
    greater than its threshold (0.5).

    Args:
        image_path: Image file to classify; opened with PIL and converted to RGB.
        processor: Called with text=, images=, return_tensors="pt" and
            padding=True; the mapping it returns is moved to the model device.
            Presumably a Hugging Face CLIPProcessor -- confirm against callers.
        model: Exposes parameters() and is called with the processed inputs as
            keyword arguments; its result must provide logits_per_image.
            Presumably a Hugging Face CLIPModel -- confirm against callers.
        model_name: Free-form label returned unchanged under the "model" key.

    Returns:
        dict with the keys "model", "predictions" (label -> bool),
        "scores" (label -> weighted mean score) and "thresholds"
        (label -> float).
    """
    page_image = Image.open(image_path).convert("RGB")

    # (prompt, weight) pairs per label. Author's notes on the table prompts:
    # the first is excellent on Excel-like pages, the second is a very good
    # general-purpose prompt, the third gives solid support.
    weighted_prompts = {
        "text": [
            ("document with printed text and readable content", 1.0),
        ],
        "table": [
            ("administrative form with budget tables and financial data", 1.2),
            ("document with tabular information and structured data", 1.8),
            ("document containing structured information in table format", 1.1),
        ],
    }

    label_scores = {}
    target_device = next(model.parameters()).device

    def positive_probability(positive_prompt, label):
        # Two-way comparison: index 0 is the positive prompt, index 1 the negative one.
        encoded = processor(
            text=[positive_prompt, f"document without {label}"],
            images=page_image,
            return_tensors="pt",
            padding=True,
        )
        encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

        with torch.no_grad():
            pair_probs = model(**encoded).logits_per_image.softmax(dim=1)

        return float(pair_probs[0][0])

    for label, candidates in weighted_prompts.items():
        contributions = [
            positive_probability(positive_prompt, label) * weight
            for positive_prompt, weight in candidates
        ]

        weight_total = 0
        for _, weight in candidates:
            weight_total += weight

        label_scores[label] = sum(contributions) / weight_total

    thresholds = {"text": 0.5, "table": 0.5}
    predictions = {}
    for label, value in label_scores.items():
        predictions[label] = value > thresholds[label]

    return {
        "model": model_name,
        "predictions": predictions,
        "scores": label_scores,
        "thresholds": thresholds,
    }

In [None]:
# Evaluate the v3.3 detector on the baseline test folder, then print the summary metrics.
metrics_clip_grid_v3_3 = evaluate_binary_multilabel_model(
    path_folder_test_baseline,
    ground_truth_pu_p01_pp01_multilabel,
    detect_multilabel_clip_grid_v3_3,
    processor_clip,
    model_clip,
)

print(
    "\n📊 Résultats CLIP:",
    f"F1 Text: {metrics_clip_grid_v3_3['f1_text']:.3f}",
    f"F1 Table: {metrics_clip_grid_v3_3['f1_table']:.3f}",
    f"F1 Macro: {metrics_clip_grid_v3_3['f1_macro']:.3f}",
    f"Hamming loss: {metrics_clip_grid_v3_3['hamming_loss']:.3f}",
    f"Jaccard: {metrics_clip_grid_v3_3['jaccard_macro']:.3f} & {metrics_clip_grid_v3_3['jaccard_micro']:.3f}",
    f"Jaccard samples :{metrics_clip_grid_v3_3['jaccard_samples']:.3f}",
    f"Jaccard per class: {metrics_clip_grid_v3_3['jaccard_per_class']}",
    sep="\n",
)

 p1: ✅ ✅ | Text:0.983 vs Table:0.418
 p2: ✅ ✅ | Text:0.934 vs Table:0.113
 p3: ✅ ✅ | Text:0.978 vs Table:0.511
 p4: ✅ ✅ | Text:0.964 vs Table:0.286
 p5: ✅ ❌ | Text:0.957 vs Table:0.612
 p6: ✅ ✅ | Text:0.969 vs Table:0.236
 p7: ✅ ✅ | Text:0.818 vs Table:0.494
 p8: ✅ ❌ | Text:0.889 vs Table:0.436
 p9: ✅ ✅ | Text:0.968 vs Table:0.229
 p10: ✅ ✅ | Text:0.962 vs Table:0.124
 p11: ✅ ✅ | Text:0.733 vs Table:0.221
 p12: ✅ ✅ | Text:0.948 vs Table:0.800
 p13: ✅ ✅ | Text:0.960 vs Table:0.511
 p14: ✅ ✅ | Text:0.976 vs Table:0.459
 p15: ✅ ❌ | Text:0.905 vs Table:0.220
 p16: ✅ ✅ | Text:0.941 vs Table:0.367
 p17: ✅ ✅ | Text:0.969 vs Table:0.193
 p18: ✅ ✅ | Text:0.968 vs Table:0.298
 p19: ✅ ✅ | Text:0.930 vs Table:0.216
 p20: ✅ ✅ | Text:0.975 vs Table:0.486
 p21: ✅ ✅ | Text:0.923 vs Table:0.380
 p22: ✅ ❌ | Text:0.873 vs Table:0.413
 p23: ✅ ✅ | Text:0.859 vs Table:0.671
 p24: ✅ ✅ | Text:0.884 vs Table:0.341
 p25: ✅ ✅ | Text:0.937 vs Table:0.273
 p26: ✅ ✅ | Text:0.964 vs Table:0.383
 p27: ✅ ✅ | Text:0.94

In [None]:
def detect_multilabel_clip_grid_v3_4(image_path, processor, model, model_name='CLIP v3.4 Weighted'):
    """Label one page image as text and/or table via weighted CLIP prompts (variant v3.4).

    Each positive prompt is scored against the negative prompt
    "document without <label>"; the prompt score is the softmax probability of
    the positive prompt. Per label, the prompt scores are combined into a
    weighted mean, and the label is predicted when that mean is strictly
    greater than its threshold (0.5).

    Args:
        image_path: Image file to classify; opened with PIL and converted to RGB.
        processor: Called with text=, images=, return_tensors="pt" and
            padding=True; the mapping it returns is moved to the model device.
            Presumably a Hugging Face CLIPProcessor -- confirm against callers.
        model: Exposes parameters() and is called with the processed inputs as
            keyword arguments; its result must provide logits_per_image.
            Presumably a Hugging Face CLIPModel -- confirm against callers.
        model_name: Free-form label returned unchanged under the "model" key.
            The default names this variant (v3.4) so its results are not
            reported under another variant's label.

    Returns:
        dict with the keys "model", "predictions" (label -> bool),
        "scores" (label -> weighted mean score) and "thresholds"
        (label -> float).
    """
    image = Image.open(image_path).convert("RGB")

    # (prompt, weight) pairs per label. Author's notes on the table prompts:
    # the first is excellent on Excel-like pages, the second is a very good
    # general-purpose prompt, the third gives solid support.
    prompts_variants = {
        "text": [
            ("document with printed text and readable content", 1.0),
        ],
        "table": [
            ("administrative form with budget tables and financial data", 1.2),
            ("document with tabular information and structured data", 1.9),
            ("document containing structured information in table format", 1.1),
        ],
    }

    results = {}
    device = next(model.parameters()).device

    for label, prompts_list in prompts_variants.items():
        weighted_scores = []
        total_weight = 0

        for prompt, weight in prompts_list:
            # Two-way comparison: index 0 is the positive prompt, index 1 the negative one.
            binary_prompts = [prompt, f"document without {label}"]

            inputs = processor(
                text=binary_prompts,
                images=image,
                return_tensors="pt",
                padding=True,
            )
            inputs = {key: value.to(device) for key, value in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)
                probs = outputs.logits_per_image.softmax(dim=1)

            # Probability assigned to the positive prompt.
            score = float(probs[0][0])
            weighted_scores.append(score * weight)
            total_weight += weight

        # Weighted mean of the prompt scores for this label.
        results[label] = sum(weighted_scores) / total_weight

    thresholds = {"text": 0.5, "table": 0.5}
    predictions = {label: score > thresholds[label] for label, score in results.items()}

    return {
        "model": model_name,
        "predictions": predictions,
        "scores": results,
        "thresholds": thresholds,
    }

In [None]:
# Evaluate the v3.4 detector on the baseline test folder, then print the summary metrics.
metrics_clip_grid_v3_4 = evaluate_binary_multilabel_model(
    path_folder_test_baseline,
    ground_truth_pu_p01_pp01_multilabel,
    detect_multilabel_clip_grid_v3_4,
    processor_clip,
    model_clip,
)

print(
    "\n📊 Résultats CLIP:",
    f"F1 Text: {metrics_clip_grid_v3_4['f1_text']:.3f}",
    f"F1 Table: {metrics_clip_grid_v3_4['f1_table']:.3f}",
    f"F1 Macro: {metrics_clip_grid_v3_4['f1_macro']:.3f}",
    f"Hamming loss: {metrics_clip_grid_v3_4['hamming_loss']:.3f}",
    f"Jaccard: {metrics_clip_grid_v3_4['jaccard_macro']:.3f} & {metrics_clip_grid_v3_4['jaccard_micro']:.3f}",
    f"Jaccard samples :{metrics_clip_grid_v3_4['jaccard_samples']:.3f}",
    f"Jaccard per class: {metrics_clip_grid_v3_4['jaccard_per_class']}",
    sep="\n",
)

 p1: ✅ ✅ | Text:0.983 vs Table:0.417
 p2: ✅ ✅ | Text:0.934 vs Table:0.113
 p3: ✅ ✅ | Text:0.978 vs Table:0.514
 p4: ✅ ✅ | Text:0.964 vs Table:0.283
 p5: ✅ ❌ | Text:0.957 vs Table:0.614
 p6: ✅ ✅ | Text:0.969 vs Table:0.237
 p7: ✅ ✅ | Text:0.818 vs Table:0.499
 p8: ✅ ❌ | Text:0.889 vs Table:0.430
 p9: ✅ ✅ | Text:0.968 vs Table:0.229
 p10: ✅ ✅ | Text:0.962 vs Table:0.125
 p11: ✅ ✅ | Text:0.733 vs Table:0.226
 p12: ✅ ✅ | Text:0.948 vs Table:0.799
 p13: ✅ ✅ | Text:0.960 vs Table:0.514
 p14: ✅ ✅ | Text:0.976 vs Table:0.458
 p15: ✅ ✅ | Text:0.905 vs Table:0.222
 p16: ✅ ✅ | Text:0.941 vs Table:0.373
 p17: ✅ ✅ | Text:0.969 vs Table:0.193
 p18: ✅ ✅ | Text:0.968 vs Table:0.299
 p19: ✅ ✅ | Text:0.930 vs Table:0.213
 p20: ✅ ✅ | Text:0.975 vs Table:0.484
 p21: ✅ ✅ | Text:0.923 vs Table:0.377
 p22: ✅ ❌ | Text:0.873 vs Table:0.407
 p23: ✅ ✅ | Text:0.859 vs Table:0.664
 p24: ✅ ✅ | Text:0.884 vs Table:0.339
 p25: ✅ ✅ | Text:0.937 vs Table:0.271
 p26: ✅ ✅ | Text:0.964 vs Table:0.381
 p27: ✅ ✅ | Text:0.94

In [None]:
def detect_multilabel_clip_grid_v3_5(image_path, processor, model, model_name='CLIP v3.5 Weighted'):
    """Label one page image as text and/or table via weighted CLIP prompts (variant v3.5).

    Each positive prompt is scored against the negative prompt
    "document without <label>"; the prompt score is the softmax probability of
    the positive prompt. Per label, the prompt scores are combined into a
    weighted mean, and the label is predicted when that mean is strictly
    greater than its threshold (0.5).

    NOTE(review): the prompts and weights (1.2 / 1.9 / 1.1) are identical to
    those of detect_multilabel_clip_grid_v3_4 -- confirm whether a different
    weight was intended for this variant.

    Args:
        image_path: Image file to classify; opened with PIL and converted to RGB.
        processor: Called with text=, images=, return_tensors="pt" and
            padding=True; the mapping it returns is moved to the model device.
            Presumably a Hugging Face CLIPProcessor -- confirm against callers.
        model: Exposes parameters() and is called with the processed inputs as
            keyword arguments; its result must provide logits_per_image.
            Presumably a Hugging Face CLIPModel -- confirm against callers.
        model_name: Free-form label returned unchanged under the "model" key.

    Returns:
        dict with the keys "model", "predictions" (label -> bool),
        "scores" (label -> weighted mean score) and "thresholds"
        (label -> float).
    """
    page_image = Image.open(image_path).convert("RGB")

    # (prompt, weight) pairs per label. Author's notes on the table prompts:
    # the first is excellent on Excel-like pages, the second is a very good
    # general-purpose prompt, the third gives solid support.
    weighted_prompts = {
        "text": [
            ("document with printed text and readable content", 1.0),
        ],
        "table": [
            ("administrative form with budget tables and financial data", 1.2),
            ("document with tabular information and structured data", 1.9),
            ("document containing structured information in table format", 1.1),
        ],
    }

    label_scores = {}
    target_device = next(model.parameters()).device

    def positive_probability(positive_prompt, label):
        # Two-way comparison: index 0 is the positive prompt, index 1 the negative one.
        encoded = processor(
            text=[positive_prompt, f"document without {label}"],
            images=page_image,
            return_tensors="pt",
            padding=True,
        )
        encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

        with torch.no_grad():
            pair_probs = model(**encoded).logits_per_image.softmax(dim=1)

        return float(pair_probs[0][0])

    for label, candidates in weighted_prompts.items():
        contributions = [
            positive_probability(positive_prompt, label) * weight
            for positive_prompt, weight in candidates
        ]

        weight_total = 0
        for _, weight in candidates:
            weight_total += weight

        label_scores[label] = sum(contributions) / weight_total

    thresholds = {"text": 0.5, "table": 0.5}
    predictions = {}
    for label, value in label_scores.items():
        predictions[label] = value > thresholds[label]

    return {
        "model": model_name,
        "predictions": predictions,
        "scores": label_scores,
        "thresholds": thresholds,
    }

In [None]:
# Evaluate the v3.5 detector on the baseline test folder, then print the summary metrics.
metrics_clip_grid_v3_5 = evaluate_binary_multilabel_model(
    path_folder_test_baseline,
    ground_truth_pu_p01_pp01_multilabel,
    detect_multilabel_clip_grid_v3_5,
    processor_clip,
    model_clip,
)

print(
    "\n📊 Résultats CLIP:",
    f"F1 Text: {metrics_clip_grid_v3_5['f1_text']:.3f}",
    f"F1 Table: {metrics_clip_grid_v3_5['f1_table']:.3f}",
    f"F1 Macro: {metrics_clip_grid_v3_5['f1_macro']:.3f}",
    f"Hamming loss: {metrics_clip_grid_v3_5['hamming_loss']:.3f}",
    f"Jaccard: {metrics_clip_grid_v3_5['jaccard_macro']:.3f} & {metrics_clip_grid_v3_5['jaccard_micro']:.3f}",
    f"Jaccard samples :{metrics_clip_grid_v3_5['jaccard_samples']:.3f}",
    f"Jaccard per class: {metrics_clip_grid_v3_5['jaccard_per_class']}",
    sep="\n",
)

 p1: ✅ ✅ | Text:0.983 vs Table:0.417
 p2: ✅ ✅ | Text:0.934 vs Table:0.113
 p3: ✅ ✅ | Text:0.978 vs Table:0.514
 p4: ✅ ✅ | Text:0.964 vs Table:0.283
 p5: ✅ ❌ | Text:0.957 vs Table:0.614
 p6: ✅ ✅ | Text:0.969 vs Table:0.237
 p7: ✅ ✅ | Text:0.818 vs Table:0.499
 p8: ✅ ❌ | Text:0.889 vs Table:0.430
 p9: ✅ ✅ | Text:0.968 vs Table:0.229
 p10: ✅ ✅ | Text:0.962 vs Table:0.125
 p11: ✅ ✅ | Text:0.733 vs Table:0.226
 p12: ✅ ✅ | Text:0.948 vs Table:0.799
 p13: ✅ ✅ | Text:0.960 vs Table:0.514
 p14: ✅ ✅ | Text:0.976 vs Table:0.458
 p15: ✅ ❌ | Text:0.905 vs Table:0.222
 p16: ✅ ✅ | Text:0.941 vs Table:0.373
 p17: ✅ ✅ | Text:0.969 vs Table:0.193
 p18: ✅ ✅ | Text:0.968 vs Table:0.299
 p19: ✅ ✅ | Text:0.930 vs Table:0.213
 p20: ✅ ✅ | Text:0.975 vs Table:0.484
 p21: ✅ ✅ | Text:0.923 vs Table:0.377
 p22: ✅ ❌ | Text:0.873 vs Table:0.407
 p23: ✅ ✅ | Text:0.859 vs Table:0.664
 p24: ✅ ✅ | Text:0.884 vs Table:0.339
 p25: ✅ ✅ | Text:0.937 vs Table:0.271
 p26: ✅ ✅ | Text:0.964 vs Table:0.381
 p27: ✅ ✅ | Text:0.94

## Amélioration des prompts pour détection tableau Excel-like

In [None]:
# Quick test of the Excel-oriented prompts on the five remaining pages.
excel_pages = ["p3", "p8", "p15", "p32", "p38"]

# Candidate positive prompts for the "table" label.
excel_prompts = [
    "administrative form with budget tables and financial data",
    "document with tabular information and structured data layout",
    (
        "spreadsheet-like table with rows and columns of textual and numerical data, "
        "commonly found in financial, budgetary, or informational reports, "
        "featuring structured headers and organized content for clear data presentation"
    ),
]

# Banner followed by a 60-character rule.
print("🧪 TEST PROMPTS EXCEL - Pages problématiques", "=" * 60, sep="\n")

def analyze_prompt_effectiveness_custom(image_path, processor, model, custom_prompts):
    """Score caller-supplied table prompts on one image and rank them best-first.

    Every prompt is compared against the fixed negative prompt
    "document without table"; its score is the softmax probability given to
    the prompt itself.

    Args:
        image_path: Image file to analyse; opened with PIL and converted to RGB.
        processor: Called with text=, images=, return_tensors="pt" and
            padding=True; the mapping it returns is moved to the model device.
            Presumably a Hugging Face CLIPProcessor -- confirm against callers.
        model: Exposes parameters() and is called with the processed inputs as
            keyword arguments; its result must provide logits_per_image.
            Presumably a Hugging Face CLIPModel -- confirm against callers.
        custom_prompts: Iterable of positive prompts to evaluate.

    Returns:
        List of (index in custom_prompts, prompt, score) tuples sorted by
        descending score; equal scores keep their input order.
    """
    page_image = Image.open(image_path).convert("RGB")
    target_device = next(model.parameters()).device

    def table_probability(candidate):
        # Index 0 of the softmax row belongs to the candidate prompt.
        encoded = processor(
            text=[candidate, "document without table"],
            images=page_image,
            return_tensors="pt",
            padding=True,
        )
        encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

        with torch.no_grad():
            pair_probs = model(**encoded).logits_per_image.softmax(dim=1)

        return float(pair_probs[0][0])

    scored = [
        (position, candidate, table_probability(candidate))
        for position, candidate in enumerate(custom_prompts)
    ]

    # Most effective prompt first.
    return sorted(scored, key=lambda entry: entry[2], reverse=True)

# For every Excel-like page, print the candidate prompts ranked best-first.
for page in excel_pages:
    print(f"\n📊 {page} - Analysis Excel prompts:")
    analysis = analyze_prompt_effectiveness_custom(
        path_pu_p01_pp01[page],
        processor_clip,
        model_clip,
        excel_prompts,
    )
    # Entries are (original prompt index, prompt, score); prompts are cut to 50 characters.
    for rank, (idx, prompt, score) in enumerate(analysis):
        print(f"  {rank+1}. [{idx}] {score:.3f} - {prompt[:50]}...")

🧪 TEST PROMPTS EXCEL - Pages problématiques

📊 p3 - Analysis Excel prompts:
  1. [1] 0.727 - document with tabular information and structured d...
  2. [0] 0.461 - administrative form with budget tables and financi...
  3. [2] 0.315 - spreadsheet-like table with rows and columns of te...

📊 p8 - Analysis Excel prompts:
  1. [0] 0.679 - administrative form with budget tables and financi...
  2. [1] 0.272 - document with tabular information and structured d...
  3. [2] 0.233 - spreadsheet-like table with rows and columns of te...

📊 p15 - Analysis Excel prompts:
  1. [0] 0.285 - administrative form with budget tables and financi...
  2. [1] 0.173 - document with tabular information and structured d...
  3. [2] 0.096 - spreadsheet-like table with rows and columns of te...

📊 p32 - Analysis Excel prompts:
  1. [1] 0.242 - document with tabular information and structured d...
  2. [0] 0.084 - administrative form with budget tables and financi...
  3. [2] 0.050 - spreadsheet-like table with 

## Réflexion pour amélioration détection Excel

On garde ces 2 prompts

- "administrative form with budget tables and financial data"

- "document with tabular information and structured data"

Test excel_v4_1
- avec uniquement ces 2 prompts
- 2 prompts + 1 autre
- 2 nouveaux prompts
- 2 prompts validés + 1 prompt infographie ? Ou on laisse tomber (en attente de validation d'Aghiles)

Quid du poids ?

In [None]:
def detect_multilabel_clip_excel_v4_1(image_path, processor, model, model_name='CLIP v4 Excel'):
    """Label one page image as text and/or table, with prompts aimed at Excel-like tables (v4.1).

    Each positive prompt is scored against the negative prompt
    "document without <label>"; the prompt score is the softmax probability of
    the positive prompt. Per label, the prompt scores are combined into a
    weighted mean, and the label is predicted when that mean is strictly
    greater than its threshold (0.5).

    Args:
        image_path: Image file to classify; opened with PIL and converted to RGB.
        processor: Called with text=, images=, return_tensors="pt" and
            padding=True; the mapping it returns is moved to the model device.
            Presumably a Hugging Face CLIPProcessor -- confirm against callers.
        model: Exposes parameters() and is called with the processed inputs as
            keyword arguments; its result must provide logits_per_image.
            Presumably a Hugging Face CLIPModel -- confirm against callers.
        model_name: Free-form label returned unchanged under the "model" key.

    Returns:
        dict with the keys "model", "predictions" (label -> bool),
        "scores" (label -> weighted mean score) and "thresholds"
        (label -> float).
    """
    page_image = Image.open(image_path).convert("RGB")

    # (prompt, weight) pairs per label. The first table prompt is the one used
    # by the v3 variants; the second is the long spreadsheet description.
    weighted_prompts = {
        "text": [
            ("document with printed text, paragraphs, bullet point lists, and readable content, without grid", 1.0),
        ],
        "table": [
            ("administrative form with budget tables and financial data", 1.1),
            (
                "spreadsheet-like table with rows and columns of textual and numerical data, "
                "commonly found in financial, budgetary, or informational reports, "
                "featuring structured headers and organized content for clear data presentation",
                1.0,
            ),
        ],
    }

    label_scores = {}
    target_device = next(model.parameters()).device

    def positive_probability(positive_prompt, label):
        # Two-way comparison: index 0 is the positive prompt, index 1 the negative one.
        encoded = processor(
            text=[positive_prompt, f"document without {label}"],
            images=page_image,
            return_tensors="pt",
            padding=True,
        )
        encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

        with torch.no_grad():
            pair_probs = model(**encoded).logits_per_image.softmax(dim=1)

        return float(pair_probs[0][0])

    for label, candidates in weighted_prompts.items():
        contributions = [
            positive_probability(positive_prompt, label) * weight
            for positive_prompt, weight in candidates
        ]

        weight_total = 0
        for _, weight in candidates:
            weight_total += weight

        label_scores[label] = sum(contributions) / weight_total

    thresholds = {"text": 0.5, "table": 0.5}
    predictions = {}
    for label, value in label_scores.items():
        predictions[label] = value > thresholds[label]

    return {
        "model": model_name,
        "predictions": predictions,
        "scores": label_scores,
        "thresholds": thresholds,
    }

In [None]:
# Evaluate the v4.1 Excel detector on the baseline test folder, then print the summary metrics.
metrics_clip_excel_v4_1 = evaluate_binary_multilabel_model(
    path_folder_test_baseline,
    ground_truth_pu_p01_pp01_multilabel,
    detect_multilabel_clip_excel_v4_1,
    processor_clip,
    model_clip,
)

print(
    "\n📊 Résultats CLIP:",
    f"F1 Text: {metrics_clip_excel_v4_1['f1_text']:.3f}",
    f"F1 Table: {metrics_clip_excel_v4_1['f1_table']:.3f}",
    f"F1 Macro: {metrics_clip_excel_v4_1['f1_macro']:.3f}",
    f"Hamming loss: {metrics_clip_excel_v4_1['hamming_loss']:.3f}",
    f"Jaccard: {metrics_clip_excel_v4_1['jaccard_macro']:.3f} & {metrics_clip_excel_v4_1['jaccard_micro']:.3f}",
    f"Jaccard samples :{metrics_clip_excel_v4_1['jaccard_samples']:.3f}",
    f"Jaccard per class: {metrics_clip_excel_v4_1['jaccard_per_class']}",
    sep="\n",
)

 p1: ✅ ✅ | Text:0.878 vs Table:0.493
 p2: ✅ ✅ | Text:0.813 vs Table:0.072
 p3: ✅ ❌ | Text:0.982 vs Table:0.391
 p4: ✅ ✅ | Text:0.900 vs Table:0.255
 p5: ✅ ✅ | Text:0.945 vs Table:0.421
 p6: ✅ ✅ | Text:0.947 vs Table:0.089
 p7: ✅ ✅ | Text:0.896 vs Table:0.327
 p8: ✅ ❌ | Text:0.791 vs Table:0.467
 p9: ✅ ✅ | Text:0.940 vs Table:0.139
 p10: ✅ ✅ | Text:0.873 vs Table:0.037
 p11: ✅ ✅ | Text:0.507 vs Table:0.011
 p12: ✅ ✅ | Text:0.937 vs Table:0.818
 p13: ✅ ✅ | Text:0.968 vs Table:0.515
 p14: ✅ ✅ | Text:0.981 vs Table:0.300
 p15: ✅ ✅ | Text:0.963 vs Table:0.195
 p16: ✅ ✅ | Text:0.969 vs Table:0.140
 p17: ✅ ✅ | Text:0.876 vs Table:0.080
 p18: ✅ ✅ | Text:0.833 vs Table:0.044
 p19: ✅ ✅ | Text:0.829 vs Table:0.104
 p20: ✅ ✅ | Text:0.970 vs Table:0.407
 p21: ✅ ✅ | Text:0.873 vs Table:0.291
 p22: ✅ ✅ | Text:0.751 vs Table:0.512
 p23: ✅ ✅ | Text:0.832 vs Table:0.984
 p24: ✅ ✅ | Text:0.884 vs Table:0.431
 p25: ✅ ✅ | Text:0.957 vs Table:0.300
 p26: ✅ ✅ | Text:0.929 vs Table:0.384
 p27: ✅ ✅ | Text:0.95

### Thinking note

si on reprend les résultats "TEST PROMPTS EXCEL - Pages problématiques" et les poids cela devrait donner

- p3: (0.721×1.2 + 0.582×1.1) / 2.3 = 0.655
→ Devrait passer le seuil 0.5 !

- p8: (0.860×1.2 + 0.469×1.1) / 2.3 = 0.672
→ Devrait passer aussi !

- p38: (0.576×1.1 + 0.543×1.2) / 2.3 = 0.559
→ ici aussi

Il n'y a plus de faux positifs, c'est une excellente nouvelle.

- p15: c'est l'infographie (modification du ground truth Table:False)
- p32 : c'est une fin de tableau


Erreur trouvée !
Dans l'analyse, il y a une coquille dans le consensus ; j'ai écrit :
- binary_prompts = [prompt, "document without tables"] → mauvais
- binary_prompts = [prompt, "document without table"] → bon

Les résultats sont cohérents.

In [None]:
# Spot-check page 3 with the v4.1 detector and show its raw scores.
# NOTE(review): the image path is hard-coded; path_pu_p01_pp01["p3"] presumably
# points at the same file -- verify before switching to it.
result_p3 = detect_multilabel_clip_excel_v4_1(
    "/content/drive/MyDrive/Document AI - GroupeSOS/Outputs/Save_img_from_pdf/PU_P01_PP01_folder/PU_P01_PP01_page_3.jpg",
    processor_clip,
    model_clip,
)
print(
    f"Scores p3: {result_p3['scores']}",
    f"Table score: {result_p3['scores']['table']}",
    sep="\n",
)

Scores p3: {'text': 0.9820206761360168, 'table': 0.391020461269047}
Table score: 0.391020461269047


## Ajout de la classe "Schema"

In [None]:
# Open question from the author: add a second prompt?

schema_pages = ["p15", "p47", "p48", "p49"]

# NOTE(review): the printed header says "p15 prompts" for every page in the list.
for page in schema_pages:
    print(f"\n📊 {page} - Analysis p15 prompts:")
    analysis = analyze_prompt_effectiveness_p15(
        path_pu_p01_pp01[page],
        processor_clip,
        model_clip,
    )
    # Entries are unpacked as (score, prompt); prompts are cut to 50 characters.
    for rank, (score, prompt) in enumerate(analysis):
        print(f"  {rank+1}. {score:.3f} - {prompt[:50]}...")


📊 p15 - Analysis p15 prompts:
🔬 ANALYSE SPÉCIALISÉE p15
   0.998 | business process flowchart with colored sections and directional arrows, not planning schedule
   0.300 | flowchart with interconnected boxes and directional flow arrows
   0.208 | diagram showing process relationships with connecting arrows
   0.602 | business logic diagram with linked components and flow direction
   0.921 | process flowchart with connections, not scheduling document
   0.879 | conceptual diagram with linked elements, not timeline or calendar
   0.968 | workflow diagram with process flow, not planning grid
   0.981 | conceptual framework diagram with connected logical elements
   0.935 | process model showing relationships between different components

🏆 RANKING:
    1. 0.998 | business process flowchart with colored sections and directional arrows, not planning schedule
    2. 0.981 | conceptual framework diagram with connected logical elements
    3. 0.968 | workflow diagram with process flow, not 

In [None]:
def detect_multilabel_clip_schema_v5(image_path, processor, model, model_name='CLIP v5_schema'):
    """Label one page image as text, table and/or schema via weighted CLIP prompts (v5).

    Extends the two-label detectors with a "schema" label (diagrams and
    infographics). Each positive prompt is scored against the negative prompt
    "document without <label>"; the prompt score is the softmax probability of
    the positive prompt. Per label, the prompt scores are combined into a
    weighted mean, and the label is predicted when that mean is strictly
    greater than its threshold (0.5).

    Args:
        image_path: Image file to classify; opened with PIL and converted to RGB.
        processor: Called with text=, images=, return_tensors="pt" and
            padding=True; the mapping it returns is moved to the model device.
            Presumably a Hugging Face CLIPProcessor -- confirm against callers.
        model: Exposes parameters() and is called with the processed inputs as
            keyword arguments; its result must provide logits_per_image.
            Presumably a Hugging Face CLIPModel -- confirm against callers.
        model_name: Free-form label returned unchanged under the "model" key.

    Returns:
        dict with the keys "model", "predictions" (label -> bool),
        "scores" (label -> weighted mean score) and "thresholds"
        (label -> float), each covering "text", "table" and "schema".
    """
    page_image = Image.open(image_path).convert("RGB")

    # (prompt, weight) pairs per label. The first table prompt is the one used
    # by the v3 variants; the second is the long spreadsheet description.
    weighted_prompts = {
        "text": [
            ("document with printed text, paragraphs, bullet point lists, and readable content, without grid", 1.0),
        ],
        "table": [
            ("administrative form with budget tables and financial data", 1.1),
            (
                "spreadsheet-like table with rows and columns of textual and numerical data, "
                "commonly found in financial, budgetary, or informational reports, "
                "featuring structured headers and organized content for clear data presentation",
                1.0,
            ),
        ],
        "schema": [
            ("diagram with colored boxes connected by arrows", 1.1),
            ("infographic map with visual elements and legend", 1.3),
        ],
    }

    label_scores = {}
    target_device = next(model.parameters()).device

    def positive_probability(positive_prompt, label):
        # Two-way comparison: index 0 is the positive prompt, index 1 the negative one.
        encoded = processor(
            text=[positive_prompt, f"document without {label}"],
            images=page_image,
            return_tensors="pt",
            padding=True,
        )
        encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

        with torch.no_grad():
            pair_probs = model(**encoded).logits_per_image.softmax(dim=1)

        return float(pair_probs[0][0])

    for label, candidates in weighted_prompts.items():
        contributions = [
            positive_probability(positive_prompt, label) * weight
            for positive_prompt, weight in candidates
        ]

        weight_total = 0
        for _, weight in candidates:
            weight_total += weight

        label_scores[label] = sum(contributions) / weight_total

    thresholds = {"text": 0.5, "table": 0.5, "schema": 0.5}
    predictions = {}
    for label, value in label_scores.items():
        predictions[label] = value > thresholds[label]

    return {
        "model": model_name,
        "predictions": predictions,
        "scores": label_scores,
        "thresholds": thresholds,
    }

In [None]:
# Evaluate the three-label v5 detector with the v2 evaluation routine, then print the summary metrics.
metrics_clip_schema_v5 = evaluate_binary_multilabel_model_v2(
    path_folder_test_baseline,
    ground_truth_pu_p01_pp01_multilabel,
    detect_multilabel_clip_schema_v5,
    processor_clip,
    model_clip,
)

print(
    "\n📊 Résultats CLIP:",
    f"F1 Text: {metrics_clip_schema_v5['f1_text']:.3f}",
    f"F1 Table: {metrics_clip_schema_v5['f1_table']:.3f}",
    f"F1 Schema: {metrics_clip_schema_v5['f1_schema']:.3f}",
    f"F1 Macro: {metrics_clip_schema_v5['f1_macro']:.3f}",
    f"Hamming loss: {metrics_clip_schema_v5['hamming_loss']:.3f}",
    f"Jaccard: {metrics_clip_schema_v5['jaccard_macro']:.3f} & {metrics_clip_schema_v5['jaccard_micro']:.3f}",
    f"Jaccard samples :{metrics_clip_schema_v5['jaccard_samples']:.3f}",
    f"Jaccard per class: {metrics_clip_schema_v5['jaccard_per_class']}",
    sep="\n",
)

 p1: ✅ ✅ ✅ | Text:0.878 vs Table:0.493 vs Schema:0.320
 p2: ✅ ✅ ✅ | Text:0.813 vs Table:0.072 vs Schema:0.087
 p3: ✅ ❌ ✅ | Text:0.982 vs Table:0.391 vs Schema:0.021
 p4: ✅ ✅ ✅ | Text:0.900 vs Table:0.255 vs Schema:0.004
 p5: ✅ ✅ ✅ | Text:0.945 vs Table:0.421 vs Schema:0.009
 p6: ✅ ✅ ✅ | Text:0.947 vs Table:0.089 vs Schema:0.050
 p7: ✅ ✅ ✅ | Text:0.896 vs Table:0.327 vs Schema:0.886
 p8: ✅ ❌ ✅ | Text:0.791 vs Table:0.467 vs Schema:0.014
 p9: ✅ ✅ ✅ | Text:0.940 vs Table:0.139 vs Schema:0.009
 p10: ✅ ✅ ✅ | Text:0.873 vs Table:0.037 vs Schema:0.069
 p11: ✅ ✅ ✅ | Text:0.507 vs Table:0.011 vs Schema:0.699
 p12: ✅ ✅ ✅ | Text:0.937 vs Table:0.818 vs Schema:0.017
 p13: ✅ ✅ ✅ | Text:0.968 vs Table:0.515 vs Schema:0.078
 p14: ✅ ✅ ✅ | Text:0.981 vs Table:0.300 vs Schema:0.026
 p15: ✅ ✅ ✅ | Text:0.963 vs Table:0.195 vs Schema:0.647
 p16: ✅ ✅ ✅ | Text:0.969 vs Table:0.140 vs Schema:0.652
 p17: ✅ ✅ ✅ | Text:0.876 vs Table:0.080 vs Schema:0.008
 p18: ✅ ✅ ✅ | Text:0.833 vs Table:0.044 vs Schema:0.001
 