diff --git a/docling_eval/benchmarks/constants.py b/docling_eval/benchmarks/constants.py index 8f0885a3..3985320a 100644 --- a/docling_eval/benchmarks/constants.py +++ b/docling_eval/benchmarks/constants.py @@ -3,7 +3,7 @@ class BenchMarkColumns(str, Enum): CONVERTER_TYPE = "converter_type" - DOCLING_VERSION = "docling_version" + CONVERTER_VERSION = "converter_version" DOCLING_PIPELINE = "docling_pipeline" STATUS = "status" diff --git a/docling_eval/benchmarks/cvat_annotation/create.py b/docling_eval/benchmarks/cvat_annotation/create.py index d2274b1e..fe3e1740 100644 --- a/docling_eval/benchmarks/cvat_annotation/create.py +++ b/docling_eval/benchmarks/cvat_annotation/create.py @@ -1,66 +1,50 @@ import argparse -import copy import glob import json import logging import os from pathlib import Path -from typing import Dict, Generator, Iterator, List, Optional, Tuple, cast +from typing import Dict, Iterator, List, Optional, Tuple, cast import xmltodict # type: ignore[import] -from datasets import Dataset, load_dataset from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size from docling_core.types.doc.document import ( - DocItem, DoclingDocument, FloatingItem, GraphData, ImageRef, PageItem, - PictureItem, ProvenanceItem, TableData, TableItem, ) -from docling_core.types.doc.labels import ( - DocItemLabel, - GroupLabel, - PictureClassificationLabel, - TableCellLabel, -) +from docling_core.types.doc.labels import DocItemLabel from docling_parse.pdf_parsers import pdf_parser_v2 # type: ignore[import] from PIL import Image # as PILImage from tqdm import tqdm # type: ignore -from docling_eval.benchmarks.constants import BenchMarkColumns, EvaluationModality +from docling_eval.benchmarks.constants import ( + BenchMarkColumns, + ConverterTypes, + EvaluationModality, +) from docling_eval.benchmarks.cvat_annotation.utils import ( - AnnotatedDoc, AnnotatedImage, - AnnotationBBox, - AnnotationLine, AnnotationOverview, BenchMarkDirs, - DocLinkLabel, - TableComponentLabel, - rgb_to_hex, ) from docling_eval.benchmarks.utils import ( - draw_clusters_with_reading_order, - get_binhash, - save_comparison_html_with_clusters, - save_inspection_html, - write_datasets_info, -) -from docling_eval.docling.conversion import create_docling_converter -from docling_eval.docling.utils import ( crop_bounding_box, docling_version, extract_images, from_pil_to_base64uri, get_binary, - insert_images, + get_binhash, save_shard_to_disk, + write_datasets_info, ) +from docling_eval.converters.conversion import create_pdf_docling_converter +from docling_eval.visualisation.visualisations import save_comparison_html_with_clusters # from pydantic import @@ -602,14 +586,14 @@ def create_true_document(basename: str, annot: dict, desc: AnnotatedImage): img_height = page_image.height if pred_doc.pages[page_no] is None: - logging.error(f"Page item is None, skipping ...") + logging.error("Page item is None, skipping ...") continue pred_page_item = pred_doc.pages[page_no] pred_page_imageref = pred_page_item.image if pred_page_imageref is None: - logging.error(f"Page ImageRef is None, skipping ...") + logging.error("Page ImageRef is None, skipping ...") continue assert pred_page_imageref.size.width == img_width @@ -644,14 +628,14 @@ def create_true_document(basename: str, annot: dict, desc: AnnotatedImage): page_no = 1 if (page_no not in true_doc.pages) or (true_doc.pages[page_no] is None): - logging.error(f"Page item is None, skipping ...") + logging.error("Page item is None, skipping ...") continue true_page_item = true_doc.pages[page_no] true_page_imageref = true_page_item.image if true_page_imageref is None: - logging.error(f"Page ImageRef is None, skipping ...") + logging.error("Page ImageRef is None, skipping ...") continue true_page_pilimage = true_page_imageref.pil_image @@ -943,7 +927,7 @@ def create_layout_dataset_from_annotations( # Create Converter image_scale = 2.0 - doc_converter = create_docling_converter(page_image_scale=image_scale) + doc_converter = create_pdf_docling_converter(page_image_scale=image_scale) records = [] for basename, desc, true_doc in tqdm( @@ -1000,7 +984,8 @@ def create_layout_dataset_from_annotations( ) record = { - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_TYPE: ConverterTypes.DOCLING, + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: str(conv_results.status), BenchMarkColumns.DOC_ID: str(basename), BenchMarkColumns.DOC_PATH: str(basename), diff --git a/docling_eval/benchmarks/cvat_annotation/create_dataset_from_pdfs.py b/docling_eval/benchmarks/cvat_annotation/create_dataset_from_pdfs.py index 0f84efa9..c002f0d8 100644 --- a/docling_eval/benchmarks/cvat_annotation/create_dataset_from_pdfs.py +++ b/docling_eval/benchmarks/cvat_annotation/create_dataset_from_pdfs.py @@ -1,11 +1,8 @@ import argparse -import copy import glob import json -import logging import os from pathlib import Path -from typing import Dict, List, Tuple from datasets import Features from datasets import Image as Features_Image @@ -13,21 +10,12 @@ from docling_eval.benchmarks.constants import BenchMarkColumns from docling_eval.benchmarks.utils import ( - add_pages_to_true_doc, - convert_html_table_into_docling_tabledata, - save_comparison_html, - save_comparison_html_with_clusters, - write_datasets_info, -) -from docling_eval.docling.conversion import create_docling_converter -from docling_eval.docling.utils import ( - crop_bounding_box, docling_version, extract_images, - from_pil_to_base64uri, get_binary, save_shard_to_disk, ) +from docling_eval.converters.conversion import create_pdf_docling_converter def parse_args(): @@ -61,7 +49,7 @@ def _write_datasets_info( ): features = Features( { - BenchMarkColumns.DOCLING_VERSION: Value("string"), + BenchMarkColumns.CONVERTER_VERSION: Value("string"), BenchMarkColumns.STATUS: Value("string"), BenchMarkColumns.DOC_ID: Value("string"), # BenchMarkColumns.DOC_PATH: Value("string"), @@ -113,7 +101,7 @@ def main(): os.makedirs(_) # Create Converter - doc_converter = create_docling_converter( + doc_converter = create_pdf_docling_converter( page_image_scale=image_scale, artifacts_path=artifacts_path ) @@ -134,7 +122,7 @@ def main(): ) record = { - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: str(conv_results.status), BenchMarkColumns.DOC_ID: str(os.path.basename(pdf_path)), BenchMarkColumns.PREDICTION: json.dumps(pred_doc.export_to_dict()), diff --git a/docling_eval/benchmarks/cvat_annotation/eval.py b/docling_eval/benchmarks/cvat_annotation/eval.py index 9a650bee..da175aee 100644 --- a/docling_eval/benchmarks/cvat_annotation/eval.py +++ b/docling_eval/benchmarks/cvat_annotation/eval.py @@ -1,13 +1,4 @@ -import argparse import logging -import os -from pathlib import Path - -from huggingface_hub import snapshot_download -from tabulate import tabulate # type: ignore - -from docling_eval.benchmarks.constants import BenchMarkNames, EvaluationModality -from docling_eval.cli.main import evaluate, visualise # Configure logging logging.basicConfig( diff --git a/docling_eval/benchmarks/cvat_annotation/preannotate.py b/docling_eval/benchmarks/cvat_annotation/preannotate.py index a53cfb0c..ff3404a6 100644 --- a/docling_eval/benchmarks/cvat_annotation/preannotate.py +++ b/docling_eval/benchmarks/cvat_annotation/preannotate.py @@ -1,27 +1,15 @@ import argparse -import copy import glob import json import logging import os from pathlib import Path -from typing import Dict, List, Tuple - -from datasets import Dataset, load_dataset -from docling_core.types.doc.base import BoundingBox, CoordOrigin, ImageRefMode, Size -from docling_core.types.doc.document import ( - DocItem, - DoclingDocument, - PictureItem, - TableItem, -) -from docling_core.types.doc.labels import ( - DocItemLabel, - GroupLabel, - PictureClassificationLabel, - TableCellLabel, -) -from pydantic import BaseModel +from typing import List + +from datasets import load_dataset +from docling_core.types.doc.base import BoundingBox, ImageRefMode +from docling_core.types.doc.document import DocItem, DoclingDocument +from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel from tqdm import tqdm # type: ignore from docling_eval.benchmarks.constants import BenchMarkColumns @@ -29,15 +17,13 @@ AnnotatedDoc, AnnotatedImage, AnnotationBBox, - AnnotationLine, AnnotationOverview, BenchMarkDirs, DocLinkLabel, TableComponentLabel, rgb_to_hex, ) -from docling_eval.benchmarks.utils import get_binhash -from docling_eval.docling.utils import insert_images +from docling_eval.benchmarks.utils import get_binhash, insert_images # Configure logging logging.basicConfig( @@ -252,11 +238,11 @@ def create_cvat_preannotation_file_for_single_page( annotated_image.page_nos = [page_no] overview.img_annotations[filename] = annotated_image else: - logging.warning(f"missing pillow image of the page, skipping ...") + logging.warning("missing pillow image of the page, skipping ...") continue else: - logging.warning(f"missing image-ref of the page, skipping ...") + logging.warning("missing image-ref of the page, skipping ...") continue page_bboxes: List[AnnotationBBox] = [] diff --git a/docling_eval/benchmarks/cvat_annotation/utils.py b/docling_eval/benchmarks/cvat_annotation/utils.py index 50305403..4b858bd0 100644 --- a/docling_eval/benchmarks/cvat_annotation/utils.py +++ b/docling_eval/benchmarks/cvat_annotation/utils.py @@ -6,13 +6,8 @@ from pathlib import Path from typing import Dict, List, Tuple -from docling_core.types.doc.base import BoundingBox, CoordOrigin, ImageRefMode, Size -from docling_core.types.doc.labels import ( - DocItemLabel, - GroupLabel, - PictureClassificationLabel, - TableCellLabel, -) +from docling_core.types.doc.base import BoundingBox +from docling_core.types.doc.labels import DocItemLabel from pydantic import BaseModel diff --git a/docling_eval/benchmarks/doclaynet_v1/create.py b/docling_eval/benchmarks/doclaynet_v1/create.py index fada3a70..6b2ccaf4 100644 --- a/docling_eval/benchmarks/doclaynet_v1/create.py +++ b/docling_eval/benchmarks/doclaynet_v1/create.py @@ -5,7 +5,7 @@ import os from pathlib import Path -from datasets import load_dataset, load_from_disk +from datasets import load_dataset from docling_core.types import DoclingDocument from docling_core.types.doc import ( BoundingBox, @@ -13,7 +13,6 @@ DocItemLabel, GroupLabel, ImageRef, - PageItem, ProvenanceItem, Size, TableCell, @@ -22,23 +21,25 @@ from docling_core.types.io import DocumentStream from tqdm import tqdm # type: ignore -from docling_eval.benchmarks.constants import BenchMarkColumns, ConverterTypes +from docling_eval.benchmarks.constants import ( + BenchMarkColumns, + ConverterTypes, + EvaluationModality, +) from docling_eval.benchmarks.utils import ( add_pages_to_true_doc, - save_comparison_html_with_clusters, - write_datasets_info, -) -from docling_eval.docling.conversion import ( - create_docling_converter, - create_vlm_converter, -) -from docling_eval.docling.utils import ( crop_bounding_box, docling_version, extract_images, from_pil_to_base64uri, save_shard_to_disk, + write_datasets_info, +) +from docling_eval.converters.conversion import ( + create_pdf_docling_converter, + create_smol_docling_converter, ) +from docling_eval.visualisation.visualisations import save_comparison_html_with_clusters # Configure logging logging.basicConfig( @@ -184,9 +185,9 @@ def create_dlnv1_e2e_dataset( # Decide which converter type to initialize if converter_type == ConverterTypes.DOCLING: - converter = create_docling_converter(page_image_scale=1.0) + converter = create_pdf_docling_converter(page_image_scale=1.0) else: - converter = create_vlm_converter() + converter = create_smol_docling_converter() if do_viz: viz_dir = output_dir / "visualizations" @@ -205,10 +206,6 @@ def create_dlnv1_e2e_dataset( ): page_hash = doc["metadata"]["page_hash"] - # # TODO: Debug - # if page_hash != "2b49edc9d0a47e4efaaeabf907a8b8b84b747c295dd10a639e2b5265ac258cf5": - # continue - pdf = doc["pdf"] pdf_stream = io.BytesIO(pdf) pdf_stream.seek(0) @@ -238,9 +235,6 @@ def create_dlnv1_e2e_dataset( for l, b, c in zip(labels, bboxes, contents): update(true_doc, current_list, img, old_size, l, b, c) - # TODO: Debug - # print(f"Create doc_id={page_hash}") - if do_viz: save_comparison_html_with_clusters( filename=viz_dir / f"{true_doc.name}-clusters.html", @@ -265,7 +259,7 @@ def create_dlnv1_e2e_dataset( record = { BenchMarkColumns.CONVERTER_TYPE: converter_type, - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: str(conv_results.status), BenchMarkColumns.DOC_ID: page_hash, BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()), @@ -276,6 +270,10 @@ def create_dlnv1_e2e_dataset( BenchMarkColumns.PREDICTION_PICTURES: pred_pictures, BenchMarkColumns.ORIGINAL: pdf_stream.getvalue(), BenchMarkColumns.MIMETYPE: "image/png", + BenchMarkColumns.MODALITIES: [ + EvaluationModality.LAYOUT, + EvaluationModality.READING_ORDER, + ], } pdf_stream.close() records.append(record) diff --git a/docling_eval/benchmarks/dpbench/create.py b/docling_eval/benchmarks/dpbench/create.py index 678f5a0e..a75c28c9 100644 --- a/docling_eval/benchmarks/dpbench/create.py +++ b/docling_eval/benchmarks/dpbench/create.py @@ -1,4 +1,3 @@ -import argparse import json import logging import os @@ -6,14 +5,18 @@ from typing import Dict, Optional from docling.datamodel.pipeline_options import TableFormerMode -from tqdm import tqdm # type: ignore +from tqdm import tqdm + +from docling_eval.visualisation.visualisations import ( # type: ignore + save_comparison_html, + save_comparison_html_with_clusters, +) # Configure logging logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ) -from bs4 import BeautifulSoup # type: ignore from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size from docling_core.types.doc.document import ( DoclingDocument, @@ -25,28 +28,28 @@ from docling_core.types.doc.labels import DocItemLabel from PIL import Image # as PILImage -from docling_eval.benchmarks.constants import BenchMarkColumns, ConverterTypes +from docling_eval.benchmarks.constants import ( + BenchMarkColumns, + ConverterTypes, + EvaluationModality, +) from docling_eval.benchmarks.utils import ( add_pages_to_true_doc, convert_html_table_into_docling_tabledata, - save_comparison_html, - save_comparison_html_with_clusters, - write_datasets_info, -) -from docling_eval.docling.conversion import ( - create_docling_converter, - create_vlm_converter, -) -from docling_eval.docling.models.tableformer.tf_model_prediction import ( - TableFormerUpdater, -) -from docling_eval.docling.utils import ( crop_bounding_box, docling_version, extract_images, from_pil_to_base64uri, get_binary, save_shard_to_disk, + write_datasets_info, +) +from docling_eval.converters.conversion import ( + create_pdf_docling_converter, + create_smol_docling_converter, +) +from docling_eval.converters.models.tableformer.tf_model_prediction import ( + TableFormerUpdater, ) TRUE_HTML_EXPORT_LABELS = { @@ -247,12 +250,12 @@ def create_dpbench_e2e_dataset( ): # Create Converter if converter_type == ConverterTypes.DOCLING: - converter = create_docling_converter(page_image_scale=1.0) + converter = create_pdf_docling_converter(page_image_scale=1.0) else: - converter = create_vlm_converter() + converter = create_smol_docling_converter() # load the groundtruth - with open(dpbench_dir / f"dataset/reference.json", "r") as fr: + with open(dpbench_dir / "dataset/reference.json", "r") as fr: gt = json.load(fr) viz_dir = output_dir / "vizualisations" @@ -329,7 +332,7 @@ def create_dpbench_e2e_dataset( record = { BenchMarkColumns.CONVERTER_TYPE: converter_type, - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: str(conv_results.status), BenchMarkColumns.DOC_ID: str(filename), BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()), @@ -340,6 +343,10 @@ def create_dpbench_e2e_dataset( BenchMarkColumns.PREDICTION_PICTURES: pred_pictures, BenchMarkColumns.ORIGINAL: get_binary(pdf_path), BenchMarkColumns.MIMETYPE: "application/pdf", + BenchMarkColumns.MODALITIES: [ + EvaluationModality.LAYOUT, + EvaluationModality.READING_ORDER, + ], } records.append(record) @@ -367,7 +374,7 @@ def create_dpbench_tableformer_dataset( tf_updater = TableFormerUpdater(mode, artifacts_path=artifacts_path) # load the groundtruth - with open(dpbench_dir / f"dataset/reference.json", "r") as fr: + with open(dpbench_dir / "dataset/reference.json", "r") as fr: gt = json.load(fr) viz_dir = output_dir / "vizualisations" @@ -436,7 +443,7 @@ def create_dpbench_tableformer_dataset( record = { BenchMarkColumns.CONVERTER_TYPE: ConverterTypes.DOCLING, - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: "SUCCESS", BenchMarkColumns.DOC_ID: str(os.path.basename(pdf_path)), BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()), @@ -447,6 +454,9 @@ def create_dpbench_tableformer_dataset( BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures, BenchMarkColumns.PREDICTION_PAGE_IMAGES: pred_page_images, BenchMarkColumns.PREDICTION_PICTURES: pred_pictures, + BenchMarkColumns.MODALITIES: [ + EvaluationModality.TABLE_STRUCTURE, + ], } records.append(record) diff --git a/docling_eval/benchmarks/funsd/create.py b/docling_eval/benchmarks/funsd/create.py index 8a32f6cc..159b67e6 100644 --- a/docling_eval/benchmarks/funsd/create.py +++ b/docling_eval/benchmarks/funsd/create.py @@ -26,16 +26,20 @@ from PIL import Image from tqdm import tqdm # type: ignore -from docling_eval.benchmarks.constants import BenchMarkColumns -from docling_eval.benchmarks.utils import write_datasets_info -from docling_eval.docling.conversion import create_image_converter -from docling_eval.docling.utils import ( +from docling_eval.benchmarks.constants import ( + BenchMarkColumns, + ConverterTypes, + EvaluationModality, +) +from docling_eval.benchmarks.utils import ( crop_bounding_box, docling_version, extract_images, from_pil_to_base64uri, save_shard_to_disk, + write_datasets_info, ) +from docling_eval.converters.conversion import create_image_docling_converter SHARD_SIZE = 1000 @@ -443,7 +447,7 @@ def create_funsd_dataset( splits: List[str] = ["train", "test"], max_items: int = -1, ): - doc_converter = create_image_converter(do_ocr=True, ocr_lang=["en"]) + doc_converter = create_image_docling_converter(do_ocr=True, ocr_lang=["en"]) num_train_rows = 0 num_test_rows = 0 @@ -511,13 +515,18 @@ def create_funsd_dataset( ) record = { - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_TYPE: ConverterTypes.DOCLING, + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.DOC_ID: img_path.stem, BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()), BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images, BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures, BenchMarkColumns.ORIGINAL: img_bytes, BenchMarkColumns.MIMETYPE: "image/png", + BenchMarkColumns.MODALITIES: [ + EvaluationModality.LAYOUT, + EvaluationModality.READING_ORDER, + ], } records.append(record) count += 1 diff --git a/docling_eval/benchmarks/omnidocbench/create.py b/docling_eval/benchmarks/omnidocbench/create.py index 772e93f3..e08ea4dc 100644 --- a/docling_eval/benchmarks/omnidocbench/create.py +++ b/docling_eval/benchmarks/omnidocbench/create.py @@ -1,4 +1,3 @@ -import argparse import glob import json import logging @@ -6,7 +5,6 @@ from pathlib import Path from typing import Optional -from bs4 import BeautifulSoup # type: ignore from docling.datamodel.pipeline_options import TableFormerMode from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size from docling_core.types.doc.document import DoclingDocument, ImageRef, ProvenanceItem @@ -14,28 +12,32 @@ from PIL import Image # as PILImage from tqdm import tqdm # type: ignore -from docling_eval.benchmarks.constants import BenchMarkColumns, ConverterTypes +from docling_eval.benchmarks.constants import ( + BenchMarkColumns, + ConverterTypes, + EvaluationModality, +) from docling_eval.benchmarks.utils import ( add_pages_to_true_doc, convert_html_table_into_docling_tabledata, - save_comparison_html, - save_comparison_html_with_clusters, - write_datasets_info, -) -from docling_eval.docling.conversion import ( - create_docling_converter, - create_vlm_converter, -) -from docling_eval.docling.models.tableformer.tf_model_prediction import ( - TableFormerUpdater, -) -from docling_eval.docling.utils import ( crop_bounding_box, docling_version, extract_images, from_pil_to_base64uri, get_binary, save_shard_to_disk, + write_datasets_info, +) +from docling_eval.converters.conversion import ( + create_pdf_docling_converter, + create_smol_docling_converter, +) +from docling_eval.converters.models.tableformer.tf_model_prediction import ( + TableFormerUpdater, +) +from docling_eval.visualisation.visualisations import ( + save_comparison_html, + save_comparison_html_with_clusters, ) # Configure logging @@ -260,12 +262,12 @@ def create_omnidocbench_e2e_dataset( # Create Converter if converter_type == ConverterTypes.DOCLING: - converter = create_docling_converter(page_image_scale=1.0) + converter = create_pdf_docling_converter(page_image_scale=1.0) else: - converter = create_vlm_converter() + converter = create_smol_docling_converter() # load the groundtruth - with open(omnidocbench_dir / f"OmniDocBench.json", "r") as fr: + with open(omnidocbench_dir / "OmniDocBench.json", "r") as fr: gt = json.load(fr) gt = update_gt_into_map(gt) @@ -290,7 +292,7 @@ def create_omnidocbench_e2e_dataset( pdf_path = Path(page_tuple[1]) # logging.info(f"file: {pdf_path}") - if not os.path.basename(jpg_path) in gt: + if os.path.basename(jpg_path) not in gt: logging.error(f"did not find ground-truth for {os.path.basename(jpg_path)}") continue @@ -356,7 +358,7 @@ def create_omnidocbench_e2e_dataset( record = { BenchMarkColumns.CONVERTER_TYPE: converter_type, - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: "SUCCESS", BenchMarkColumns.DOC_ID: str(os.path.basename(jpg_path)), BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()), @@ -368,6 +370,10 @@ def create_omnidocbench_e2e_dataset( BenchMarkColumns.PREDICTION_PICTURES: pred_pictures, BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images, BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures, + BenchMarkColumns.MODALITIES: [ + EvaluationModality.LAYOUT, + EvaluationModality.READING_ORDER, + ], } records.append(record) @@ -395,7 +401,7 @@ def create_omnidocbench_tableformer_dataset( tf_updater = TableFormerUpdater(mode, artifacts_path=artifacts_path) # load the groundtruth - with open(omnidocbench_dir / f"OmniDocBench.json", "r") as fr: + with open(omnidocbench_dir / "OmniDocBench.json", "r") as fr: gt = json.load(fr) gt = update_gt_into_map(gt) @@ -418,7 +424,7 @@ def create_omnidocbench_tableformer_dataset( pdf_path = Path(page_tuple[1]) # logging.info(f"file: {pdf_path}") - if not os.path.basename(jpg_path) in gt: + if os.path.basename(jpg_path) not in gt: logging.error(f"did not find ground-truth for {os.path.basename(jpg_path)}") continue @@ -474,7 +480,7 @@ def create_omnidocbench_tableformer_dataset( record = { BenchMarkColumns.CONVERTER_TYPE: ConverterTypes.DOCLING, - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: "SUCCESS", BenchMarkColumns.DOC_ID: str(os.path.basename(jpg_path)), BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()), @@ -485,6 +491,7 @@ def create_omnidocbench_tableformer_dataset( BenchMarkColumns.PREDICTION_PICTURES: pred_pictures, BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images, BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures, + BenchMarkColumns.MODALITIES: [EvaluationModality.TABLE_STRUCTURE], } records.append(record) diff --git a/docling_eval/benchmarks/tableformer_huggingface_otsl/create.py b/docling_eval/benchmarks/tableformer_huggingface_otsl/create.py index a253db42..93ba2ff2 100644 --- a/docling_eval/benchmarks/tableformer_huggingface_otsl/create.py +++ b/docling_eval/benchmarks/tableformer_huggingface_otsl/create.py @@ -17,21 +17,23 @@ from docling_core.types.doc.labels import DocItemLabel from tqdm import tqdm # type: ignore -from docling_eval.benchmarks.constants import BenchMarkColumns +from docling_eval.benchmarks.constants import ( + BenchMarkColumns, + ConverterTypes, + EvaluationModality, +) from docling_eval.benchmarks.utils import ( convert_html_table_into_docling_tabledata, - save_comparison_html, -) -from docling_eval.docling.models.tableformer.tf_model_prediction import ( - PageTokens, - TableFormerUpdater, -) -from docling_eval.docling.utils import ( docling_version, extract_images, from_pil_to_base64uri, save_shard_to_disk, ) +from docling_eval.converters.models.tableformer.tf_model_prediction import ( + PageTokens, + TableFormerUpdater, +) +from docling_eval.visualisation.visualisations import save_comparison_html HTML_EXPORT_LABELS = { DocItemLabel.TITLE, @@ -223,7 +225,8 @@ def create_huggingface_otsl_tableformer_dataset( ) record = { - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_TYPE: ConverterTypes.DOCLING, + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: str(ConversionStatus.SUCCESS.value), BenchMarkColumns.DOC_ID: str(os.path.basename(filename)), BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()), @@ -234,11 +237,13 @@ def create_huggingface_otsl_tableformer_dataset( BenchMarkColumns.PREDICTION_PICTURES: pred_pictures, BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images, BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures, + BenchMarkColumns.MODALITIES: [EvaluationModality.TABLE_STRUCTURE], } records.append(record) else: record = { - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_TYPE: ConverterTypes.DOCLING, + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: str(ConversionStatus.FAILURE.value), BenchMarkColumns.DOC_ID: str(os.path.basename(filename)), BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()), @@ -249,6 +254,7 @@ def create_huggingface_otsl_tableformer_dataset( BenchMarkColumns.PREDICTION_PICTURES: pred_pictures, BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images, BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures, + BenchMarkColumns.MODALITIES: [EvaluationModality.TABLE_STRUCTURE], } records.append(record) diff --git a/docling_eval/benchmarks/utils.py b/docling_eval/benchmarks/utils.py index bf54a553..ec0dc992 100644 --- a/docling_eval/benchmarks/utils.py +++ b/docling_eval/benchmarks/utils.py @@ -1,41 +1,33 @@ -import copy +import base64 import hashlib +import io import json import logging +from importlib.metadata import version from io import BytesIO from pathlib import Path -from typing import Dict, List, Optional, Set +from typing import Any, Dict, List, Optional -import pypdfium2 as pdfium +import pandas as pd from bs4 import BeautifulSoup # type: ignore -from datasets import Features +from datasets import Dataset, Features from datasets import Image as Features_Image from datasets import Sequence, Value from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend -from docling.datamodel.base_models import BoundingBox, Cluster, InputFormat, Page +from docling.datamodel.base_models import InputFormat, Page from docling.datamodel.document import InputDocument -from docling.utils.visualization import draw_clusters -from docling_core.types.doc.base import Size +from docling_core.types.doc.base import BoundingBox, Size from docling_core.types.doc.document import ( - DocItem, DoclingDocument, ImageRef, - ImageRefMode, PageItem, TableCell, TableData, ) -from docling_core.types.doc.labels import DocItemLabel -from PIL import Image, ImageDraw, ImageFont +from PIL import Image +from pydantic import AnyUrl from docling_eval.benchmarks.constants import BenchMarkColumns -from docling_eval.docling.constants import ( - HTML_COMPARISON_PAGE, - HTML_COMPARISON_PAGE_WITH_CLUSTERS, - HTML_DEFAULT_HEAD_FOR_COMP, - HTML_INSPECTION, -) -from docling_eval.docling.utils import from_pil_to_base64, from_pil_to_base64uri def get_binhash(binary_data: bytes) -> str: @@ -55,7 +47,7 @@ def write_datasets_info( ): features = Features( { - BenchMarkColumns.DOCLING_VERSION: Value("string"), + BenchMarkColumns.CONVERTER_VERSION: Value("string"), BenchMarkColumns.STATUS: Value("string"), BenchMarkColumns.DOC_ID: Value("string"), BenchMarkColumns.DOC_PATH: Value("string"), @@ -102,6 +94,14 @@ def get_input_document(file: Path | BytesIO) -> InputDocument: ) +def from_pil_to_base64uri(img: Image.Image) -> AnyUrl: + + image_base64 = from_pil_to_base64(img) + uri = AnyUrl(f"data:image/png;base64,{image_base64}") + + return uri + + def add_pages_to_true_doc( pdf_path: Path | BytesIO, true_doc: DoclingDocument, image_scale: float = 1.0 ): @@ -234,382 +234,203 @@ def convert_html_table_into_docling_tabledata( return TableData(num_rows=num_rows, num_cols=num_cols, table_cells=cells) -def save_comparison_html( - filename: Path, - true_doc: DoclingDocument, - pred_doc: DoclingDocument, - page_image: Image.Image, - true_labels: Set[DocItemLabel], - pred_labels: Set[DocItemLabel], -): +def docling_version() -> str: + return version("docling") # may raise PackageNotFoundError - true_doc_html = true_doc.export_to_html( - image_mode=ImageRefMode.EMBEDDED, - html_head=HTML_DEFAULT_HEAD_FOR_COMP, - labels=true_labels, - ) - pred_doc_html = pred_doc.export_to_html( - image_mode=ImageRefMode.EMBEDDED, - html_head=HTML_DEFAULT_HEAD_FOR_COMP, - labels=pred_labels, - ) +def get_binary(file_path: Path): + """Read binary document into buffer.""" + with open(file_path, "rb") as f: + return f.read() - # since the string in srcdoc are wrapped by ', we need to replace all ' by it HTML convention - true_doc_html = true_doc_html.replace("'", "'") - pred_doc_html = pred_doc_html.replace("'", "'") - image_base64 = from_pil_to_base64(page_image) +def map_to_records(item: Dict): + """Map cells from pdf-parser into a records.""" + header = item["header"] + data = item["data"] - """ - # Convert the image to a bytes object + # Create a DataFrame + df = pd.DataFrame(data, columns=header) + return df.to_dict(orient="records") + + +def from_pil_to_base64(img: Image.Image) -> str: + # Convert the image to a base64 str buffered = io.BytesIO() - page_image.save( - buffered, format="PNG" - ) # Specify the format (e.g., JPEG, PNG, etc.) + img.save(buffered, format="PNG") # Specify the format (e.g., JPEG, PNG, etc.) image_bytes = buffered.getvalue() # Encode the bytes to a Base64 string image_base64 = base64.b64encode(image_bytes).decode("utf-8") - """ + return image_base64 - comparison_page = copy.deepcopy(HTML_COMPARISON_PAGE) - comparison_page = comparison_page.replace("BASE64PAGE", image_base64) - comparison_page = comparison_page.replace("TRUEDOC", true_doc_html) - comparison_page = comparison_page.replace("PREDDOC", pred_doc_html) - with open(str(filename), "w") as fw: - fw.write(comparison_page) +def to_base64(item: Dict[str, Any]) -> str: + image_bytes = item["bytes"] + # Wrap the bytes in a BytesIO object + image_stream = BytesIO(image_bytes) -def draw_arrow( - draw: ImageDraw.ImageDraw, - arrow_coords: tuple[float, float, float, float], - line_width: int = 2, - color: str = "red", -): - r""" - Draw an arrow inside the given draw object - """ - x0, y0, x1, y1 = arrow_coords - - # Arrow parameters - start_point = (x0, y0) # Starting point of the arrow - end_point = (x1, y1) # Ending point of the arrow - arrowhead_length = 20 # Length of the arrowhead - arrowhead_width = 10 # Width of the arrowhead - - # Draw the arrow shaft (line) - draw.line([start_point, end_point], fill=color, width=line_width) - - # Calculate the arrowhead points - dx = end_point[0] - start_point[0] - dy = end_point[1] - start_point[1] - angle = (dx**2 + dy**2) ** 0.5 + 0.01 # Length of the arrow shaft - - # Normalized direction vector for the arrow shaft - ux, uy = dx / angle, dy / angle - - # Base of the arrowhead - base_x = end_point[0] - ux * arrowhead_length - base_y = end_point[1] - uy * arrowhead_length - - # Left and right points of the arrowhead - left_x = base_x - uy * arrowhead_width - left_y = base_y + ux * arrowhead_width - right_x = base_x + uy * arrowhead_width - right_y = base_y - ux * arrowhead_width - - # Draw the arrowhead (triangle) - draw.polygon( - [end_point, (left_x, left_y), (right_x, right_y)], - fill=color, - ) - return draw - - -def draw_clusters_with_reading_order( - doc: DoclingDocument, - page_image: Image.Image, - labels: Set[DocItemLabel], - page_no: int = 1, - reading_order: bool = True, -): - - # img = copy.deepcopy(page_image) - img = page_image.copy() - draw = ImageDraw.Draw(img) - - # Load a font (adjust the font size and path as needed) - font = ImageFont.load_default() - try: - font = ImageFont.truetype("arial.ttf", size=15) - except IOError: - font = ImageFont.load_default() + # Open the image using PIL + image = Image.open(image_stream) - x0, y0 = None, None + # Convert the image to a bytes object + buffered = io.BytesIO() + image.save(buffered, format="PNG") # Specify the format (e.g., JPEG, PNG, etc.) + image_bytes = buffered.getvalue() - for item, level in doc.iterate_items(): - if isinstance(item, DocItem): # and item.label in labels: - for prov in item.prov: + # Encode the bytes to a Base64 string + image_base64 = base64.b64encode(image_bytes).decode("utf-8") + return image_base64 - if page_no != prov.page_no: - continue - bbox = prov.bbox.to_top_left_origin( - page_height=doc.pages[prov.page_no].size.height - ) - bbox = bbox.normalized(doc.pages[prov.page_no].size) +def to_pil(uri): - bbox.l = round(bbox.l * img.width) - bbox.r = round(bbox.r * img.width) - bbox.t = round(bbox.t * img.height) - bbox.b = round(bbox.b * img.height) + base64_string = str(uri) + base64_string = base64_string.split(",")[1] - if bbox.b > bbox.t: - bbox.b, bbox.t = bbox.t, bbox.b + # Step 1: Decode the Base64 string + image_data = base64.b64decode(base64_string) - if not reading_order: - x0, y0 = None, None - elif x0 is None and y0 is None: - x0 = (bbox.l + bbox.r) / 2.0 - y0 = (bbox.b + bbox.t) / 2.0 - else: - assert x0 is not None - assert y0 is not None + # Step 2: Open the image using Pillow + image = Image.open(BytesIO(image_data)) - x1 = (bbox.l + bbox.r) / 2.0 - y1 = (bbox.b + bbox.t) / 2.0 + return image - # Arrow parameters - start_point = (x0, y0) # Starting point of the arrow - end_point = (x1, y1) # Ending point of the arrow - arrowhead_length = 20 # Length of the arrowhead - arrowhead_width = 10 # Width of the arrowhead - arrow_color = "red" - line_width = 2 - - # Draw the arrow shaft (line) - draw.line( - [start_point, end_point], fill=arrow_color, width=line_width - ) +def extract_images( + document: DoclingDocument, + pictures_column: str, + page_images_column: str, +): - # Calculate the arrowhead points - dx = end_point[0] - start_point[0] - dy = end_point[1] - start_point[1] - angle = (dx**2 + dy**2) ** 0.5 + 0.01 # Length of the arrow shaft + pictures = [] + page_images = [] - # Normalized direction vector for the arrow shaft - ux, uy = dx / angle, dy / angle + # Save page images + for img_no, picture in enumerate(document.pictures): + if picture.image is not None: + # img = picture.image.pil_image + # pictures.append(to_pil(picture.image.uri)) + pictures.append(picture.image.pil_image) + picture.image.uri = Path(f"{pictures_column}/{img_no}") + + # Save page images + for page_no, page in document.pages.items(): + if page.image is not None: + # img = page.image.pil_image + # img.show() + page_images.append(page.image.pil_image) + page.image.uri = Path(f"{page_images_column}/{page_no}") + + return document, pictures, page_images + + +def insert_images( + document: DoclingDocument, + pictures: List[Dict[str, Any]], + page_images: List[Dict[str, Any]], +): - # Base of the arrowhead - base_x = end_point[0] - ux * arrowhead_length - base_y = end_point[1] - uy * arrowhead_length + # Save page images + for pic_no, picture in enumerate(document.pictures): + if picture.image is not None: + if pic_no < len(pictures): + b64 = to_base64(pictures[pic_no]) - # Left and right points of the arrowhead - left_x = base_x - uy * arrowhead_width - left_y = base_y + ux * arrowhead_width - right_x = base_x + uy * arrowhead_width - right_y = base_y - ux * arrowhead_width + image_ref = document.pictures[pic_no].image + if image_ref is not None: + image_ref.uri = AnyUrl(f"data:image/png;base64,{b64}") + document.pictures[pic_no].image = image_ref + else: + logging.warning(f"image-ref is none for picture {pic_no}") - # Draw the arrowhead (triangle) - draw.polygon( - [end_point, (left_x, left_y), (right_x, right_y)], - fill=arrow_color, + """ + if document.pictures[pic_no].image is not None: + document.pictures[pic_no].image.uri = AnyUrl( + f"data:image/png;base64,{b64}" ) + else: + logging.warning(f"image-ref is none for picture {pic_no}") + """ - x0, y0 = x1, y1 - - # Draw rectangle with only a border - rectangle_color = "blue" - border_width = 1 - draw.rectangle( - [bbox.l, bbox.b, bbox.r, bbox.t], - outline=rectangle_color, - width=border_width, - ) - - # Calculate label size using getbbox - text_bbox = font.getbbox(str(item.label)) - label_width = text_bbox[2] - text_bbox[0] - label_height = text_bbox[3] - text_bbox[1] - label_x = bbox.l - label_y = ( - bbox.b - label_height - ) # - 5 # Place the label above the rectangle - - # Draw label text - draw.text( - (label_x, label_y), - str(item.label), - fill=rectangle_color, - font=font, - ) - - return img - - -def save_comparison_html_with_clusters( - filename: Path, - true_doc: DoclingDocument, - pred_doc: DoclingDocument, - page_image: Image.Image, - true_labels: Set[DocItemLabel], - pred_labels: Set[DocItemLabel], - draw_reading_order: bool = True, -): - if (1 not in true_doc.pages) or (1 not in pred_doc.pages): - logging.error(f"1 not in true_doc.pages -> skipping {filename} ") - return - - def draw_doc_layout(doc: DoclingDocument, image: Image.Image): - r""" - Draw the document clusters and optionaly the reading order - """ - clusters = [] - for idx, (elem, _) in enumerate(doc.iterate_items()): - if not isinstance(elem, DocItem): - continue - if len(elem.prov) == 0: - continue # Skip elements without provenances - prov = elem.prov[0] - - if prov.page_no not in true_doc.pages or prov.page_no != 1: - logging.error(f"{prov.page_no} not in true_doc.pages -> skipping! ") - continue - - tlo_bbox = prov.bbox.to_top_left_origin( - page_height=true_doc.pages[prov.page_no].size.height - ) - cluster = Cluster( - id=idx, - label=elem.label, - bbox=BoundingBox.model_validate(tlo_bbox), - cells=[], - ) - clusters.append(cluster) - - scale_x = image.width / true_doc.pages[1].size.width - scale_y = image.height / true_doc.pages[1].size.height - draw_clusters(image, clusters, scale_x, scale_y) - - return image - - def draw_doc_reading_order(doc: DoclingDocument, image: Image.Image): - r""" - Draw the reading order - """ - draw = ImageDraw.Draw(image) - x0, y0 = None, None - - for elem, _ in doc.iterate_items(): - if not isinstance(elem, DocItem): - continue - if len(elem.prov) == 0: - continue # Skip elements without provenances - prov = elem.prov[0] - - if prov.page_no not in true_doc.pages or prov.page_no != 1: - logging.error(f"{prov.page_no} not in true_doc.pages -> skipping! ") - continue - - tlo_bbox = prov.bbox.to_top_left_origin( - page_height=true_doc.pages[prov.page_no].size.height - ) - ro_bbox = tlo_bbox.normalized(doc.pages[prov.page_no].size) - ro_bbox.l = round(ro_bbox.l * image.width) - ro_bbox.r = round(ro_bbox.r * image.width) - ro_bbox.t = round(ro_bbox.t * image.height) - ro_bbox.b = round(ro_bbox.b * image.height) - - if ro_bbox.b > ro_bbox.t: - ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b - - if x0 is None and y0 is None: - x0 = (ro_bbox.l + ro_bbox.r) / 2.0 - y0 = (ro_bbox.b + ro_bbox.t) / 2.0 + """ else: - assert x0 is not None - assert y0 is not None - - x1 = (ro_bbox.l + ro_bbox.r) / 2.0 - y1 = (ro_bbox.b + ro_bbox.t) / 2.0 + document.pictures[pic_no].image.uri = None + # logging.warning(f"inconsistent number of images in the document ({len(pictures)} != {len(document.pictures)})") + """ + + # Save page images + for page_no, page in document.pages.items(): + if page.image is not None: + # print(f"inserting image to page: {page_no}") + b64 = to_base64(page_images[page_no - 1]) + + image_ref = document.pages[page_no].image + if image_ref is not None: + image_ref.uri = AnyUrl(f"data:image/png;base64,{b64}") + document.pages[page_no].image = image_ref + + return document + + +def save_shard_to_disk( + items: List[Any], + dataset_path: Path, + thread_id: int = 0, + shard_id: int = 0, + features: Optional[Features] = None, + shard_format: str = "parquet", +): + """Save shard of to disk.""" - draw = draw_arrow( - draw, - (x0, y0, x1, y1), - line_width=2, - color="red", - ) - x0, y0 = x1, y1 - return image - - # HTML rendering - true_doc_html = true_doc.export_to_html( - image_mode=ImageRefMode.EMBEDDED, - html_head=HTML_DEFAULT_HEAD_FOR_COMP, - labels=true_labels, - ) + batch = Dataset.from_list(items) # , features=features) - pred_doc_html = pred_doc.export_to_html( - image_mode=ImageRefMode.EMBEDDED, - html_head=HTML_DEFAULT_HEAD_FOR_COMP, - labels=pred_labels, - ) + output_file = dataset_path / f"shard_{thread_id:06}_{shard_id:06}.{shard_format}" + logging.info(f"Saved shard {shard_id} to {output_file} with {len(items)} documents") - # since the string in srcdoc are wrapped by ', we need to replace all ' by it HTML convention - true_doc_html = true_doc_html.replace("'", "'") - pred_doc_html = pred_doc_html.replace("'", "'") + if shard_format == "json": + batch.to_json(output_file) - true_doc_img = draw_doc_layout(true_doc, copy.deepcopy(page_image)) - pred_doc_img = draw_doc_layout(pred_doc, copy.deepcopy(page_image)) + elif shard_format == "parquet": + batch.to_parquet(output_file) - if draw_reading_order: - true_doc_img = draw_doc_reading_order(true_doc, true_doc_img) - pred_doc_img = draw_doc_reading_order(pred_doc, pred_doc_img) + else: + raise ValueError(f"Unsupported shard_format: {shard_format}") - true_doc_img_b64 = from_pil_to_base64(true_doc_img) - pred_doc_img_b64 = from_pil_to_base64(pred_doc_img) + shard_id += 1 - comparison_page = copy.deepcopy(HTML_COMPARISON_PAGE_WITH_CLUSTERS) - comparison_page = comparison_page.replace("BASE64TRUEPAGE", true_doc_img_b64) - comparison_page = comparison_page.replace("TRUEDOC", true_doc_html) - comparison_page = comparison_page.replace("BASE64PREDPAGE", pred_doc_img_b64) - comparison_page = comparison_page.replace("PREDDOC", pred_doc_html) + return shard_id, [], 0 - with open(str(filename), "w") as fw: - fw.write(comparison_page) +def crop_bounding_box(page_image: Image.Image, page: PageItem, bbox: BoundingBox): + """ + Crop a bounding box from a PIL image. + + :param img: PIL Image object + :param l: Left coordinate + :param t: Top coordinate (from bottom-left origin) + :param r: Right coordinate + :param b: Bottom coordinate (from bottom-left origin) + :return: Cropped PIL Image + """ + width = float(page.size.width) + height = float(page.size.height) -def save_inspection_html( - filename: Path, doc: DoclingDocument, labels: Set[DocItemLabel] -): + img_width = float(page_image.width) + img_height = float(page_image.height) - html_doc = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED, labels=labels) - html_doc = html_doc.replace("'", "'") + scale_x = img_width / width + scale_y = img_height / height - page_images = [] - page_template = '