diff --git a/docling_eval/benchmarks/constants.py b/docling_eval/benchmarks/constants.py index 8f0885a3..3985320a 100644 --- a/docling_eval/benchmarks/constants.py +++ b/docling_eval/benchmarks/constants.py @@ -3,7 +3,7 @@ class BenchMarkColumns(str, Enum): CONVERTER_TYPE = "converter_type" - DOCLING_VERSION = "docling_version" + CONVERTER_VERSION = "converter_version" DOCLING_PIPELINE = "docling_pipeline" STATUS = "status" diff --git a/docling_eval/benchmarks/cvat_annotation/create.py b/docling_eval/benchmarks/cvat_annotation/create.py index d2274b1e..fe3e1740 100644 --- a/docling_eval/benchmarks/cvat_annotation/create.py +++ b/docling_eval/benchmarks/cvat_annotation/create.py @@ -1,66 +1,50 @@ import argparse -import copy import glob import json import logging import os from pathlib import Path -from typing import Dict, Generator, Iterator, List, Optional, Tuple, cast +from typing import Dict, Iterator, List, Optional, Tuple, cast import xmltodict # type: ignore[import] -from datasets import Dataset, load_dataset from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size from docling_core.types.doc.document import ( - DocItem, DoclingDocument, FloatingItem, GraphData, ImageRef, PageItem, - PictureItem, ProvenanceItem, TableData, TableItem, ) -from docling_core.types.doc.labels import ( - DocItemLabel, - GroupLabel, - PictureClassificationLabel, - TableCellLabel, -) +from docling_core.types.doc.labels import DocItemLabel from docling_parse.pdf_parsers import pdf_parser_v2 # type: ignore[import] from PIL import Image # as PILImage from tqdm import tqdm # type: ignore -from docling_eval.benchmarks.constants import BenchMarkColumns, EvaluationModality +from docling_eval.benchmarks.constants import ( + BenchMarkColumns, + ConverterTypes, + EvaluationModality, +) from docling_eval.benchmarks.cvat_annotation.utils import ( - AnnotatedDoc, AnnotatedImage, - AnnotationBBox, - AnnotationLine, AnnotationOverview, BenchMarkDirs, - DocLinkLabel, - TableComponentLabel, - rgb_to_hex, ) from docling_eval.benchmarks.utils import ( - draw_clusters_with_reading_order, - get_binhash, - save_comparison_html_with_clusters, - save_inspection_html, - write_datasets_info, -) -from docling_eval.docling.conversion import create_docling_converter -from docling_eval.docling.utils import ( crop_bounding_box, docling_version, extract_images, from_pil_to_base64uri, get_binary, - insert_images, + get_binhash, save_shard_to_disk, + write_datasets_info, ) +from docling_eval.converters.conversion import create_pdf_docling_converter +from docling_eval.visualisation.visualisations import save_comparison_html_with_clusters # from pydantic import @@ -602,14 +586,14 @@ def create_true_document(basename: str, annot: dict, desc: AnnotatedImage): img_height = page_image.height if pred_doc.pages[page_no] is None: - logging.error(f"Page item is None, skipping ...") + logging.error("Page item is None, skipping ...") continue pred_page_item = pred_doc.pages[page_no] pred_page_imageref = pred_page_item.image if pred_page_imageref is None: - logging.error(f"Page ImageRef is None, skipping ...") + logging.error("Page ImageRef is None, skipping ...") continue assert pred_page_imageref.size.width == img_width @@ -644,14 +628,14 @@ def create_true_document(basename: str, annot: dict, desc: AnnotatedImage): page_no = 1 if (page_no not in true_doc.pages) or (true_doc.pages[page_no] is None): - logging.error(f"Page item is None, skipping ...") + logging.error("Page item is None, skipping ...") continue true_page_item = true_doc.pages[page_no] true_page_imageref = true_page_item.image if true_page_imageref is None: - logging.error(f"Page ImageRef is None, skipping ...") + logging.error("Page ImageRef is None, skipping ...") continue true_page_pilimage = true_page_imageref.pil_image @@ -943,7 +927,7 @@ def create_layout_dataset_from_annotations( # Create Converter image_scale = 2.0 - doc_converter = create_docling_converter(page_image_scale=image_scale) + doc_converter = create_pdf_docling_converter(page_image_scale=image_scale) records = [] for basename, desc, true_doc in tqdm( @@ -1000,7 +984,8 @@ def create_layout_dataset_from_annotations( ) record = { - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_TYPE: ConverterTypes.DOCLING, + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: str(conv_results.status), BenchMarkColumns.DOC_ID: str(basename), BenchMarkColumns.DOC_PATH: str(basename), diff --git a/docling_eval/benchmarks/cvat_annotation/create_dataset_from_pdfs.py b/docling_eval/benchmarks/cvat_annotation/create_dataset_from_pdfs.py index 0f84efa9..c002f0d8 100644 --- a/docling_eval/benchmarks/cvat_annotation/create_dataset_from_pdfs.py +++ b/docling_eval/benchmarks/cvat_annotation/create_dataset_from_pdfs.py @@ -1,11 +1,8 @@ import argparse -import copy import glob import json -import logging import os from pathlib import Path -from typing import Dict, List, Tuple from datasets import Features from datasets import Image as Features_Image @@ -13,21 +10,12 @@ from docling_eval.benchmarks.constants import BenchMarkColumns from docling_eval.benchmarks.utils import ( - add_pages_to_true_doc, - convert_html_table_into_docling_tabledata, - save_comparison_html, - save_comparison_html_with_clusters, - write_datasets_info, -) -from docling_eval.docling.conversion import create_docling_converter -from docling_eval.docling.utils import ( - crop_bounding_box, docling_version, extract_images, - from_pil_to_base64uri, get_binary, save_shard_to_disk, ) +from docling_eval.converters.conversion import create_pdf_docling_converter def parse_args(): @@ -61,7 +49,7 @@ def _write_datasets_info( ): features = Features( { - BenchMarkColumns.DOCLING_VERSION: Value("string"), + BenchMarkColumns.CONVERTER_VERSION: Value("string"), BenchMarkColumns.STATUS: Value("string"), BenchMarkColumns.DOC_ID: Value("string"), # BenchMarkColumns.DOC_PATH: Value("string"), @@ -113,7 +101,7 @@ def main(): os.makedirs(_) # Create Converter - doc_converter = create_docling_converter( + doc_converter = create_pdf_docling_converter( page_image_scale=image_scale, artifacts_path=artifacts_path ) @@ -134,7 +122,7 @@ def main(): ) record = { - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: str(conv_results.status), BenchMarkColumns.DOC_ID: str(os.path.basename(pdf_path)), BenchMarkColumns.PREDICTION: json.dumps(pred_doc.export_to_dict()), diff --git a/docling_eval/benchmarks/cvat_annotation/eval.py b/docling_eval/benchmarks/cvat_annotation/eval.py index 9a650bee..da175aee 100644 --- a/docling_eval/benchmarks/cvat_annotation/eval.py +++ b/docling_eval/benchmarks/cvat_annotation/eval.py @@ -1,13 +1,4 @@ -import argparse import logging -import os -from pathlib import Path - -from huggingface_hub import snapshot_download -from tabulate import tabulate # type: ignore - -from docling_eval.benchmarks.constants import BenchMarkNames, EvaluationModality -from docling_eval.cli.main import evaluate, visualise # Configure logging logging.basicConfig( diff --git a/docling_eval/benchmarks/cvat_annotation/preannotate.py b/docling_eval/benchmarks/cvat_annotation/preannotate.py index a53cfb0c..ff3404a6 100644 --- a/docling_eval/benchmarks/cvat_annotation/preannotate.py +++ b/docling_eval/benchmarks/cvat_annotation/preannotate.py @@ -1,27 +1,15 @@ import argparse -import copy import glob import json import logging import os from pathlib import Path -from typing import Dict, List, Tuple - -from datasets import Dataset, load_dataset -from docling_core.types.doc.base import BoundingBox, CoordOrigin, ImageRefMode, Size -from docling_core.types.doc.document import ( - DocItem, - DoclingDocument, - PictureItem, - TableItem, -) -from docling_core.types.doc.labels import ( - DocItemLabel, - GroupLabel, - PictureClassificationLabel, - TableCellLabel, -) -from pydantic import BaseModel +from typing import List + +from datasets import load_dataset +from docling_core.types.doc.base import BoundingBox, ImageRefMode +from docling_core.types.doc.document import DocItem, DoclingDocument +from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel from tqdm import tqdm # type: ignore from docling_eval.benchmarks.constants import BenchMarkColumns @@ -29,15 +17,13 @@ AnnotatedDoc, AnnotatedImage, AnnotationBBox, - AnnotationLine, AnnotationOverview, BenchMarkDirs, DocLinkLabel, TableComponentLabel, rgb_to_hex, ) -from docling_eval.benchmarks.utils import get_binhash -from docling_eval.docling.utils import insert_images +from docling_eval.benchmarks.utils import get_binhash, insert_images # Configure logging logging.basicConfig( @@ -252,11 +238,11 @@ def create_cvat_preannotation_file_for_single_page( annotated_image.page_nos = [page_no] overview.img_annotations[filename] = annotated_image else: - logging.warning(f"missing pillow image of the page, skipping ...") + logging.warning("missing pillow image of the page, skipping ...") continue else: - logging.warning(f"missing image-ref of the page, skipping ...") + logging.warning("missing image-ref of the page, skipping ...") continue page_bboxes: List[AnnotationBBox] = [] diff --git a/docling_eval/benchmarks/cvat_annotation/utils.py b/docling_eval/benchmarks/cvat_annotation/utils.py index 50305403..4b858bd0 100644 --- a/docling_eval/benchmarks/cvat_annotation/utils.py +++ b/docling_eval/benchmarks/cvat_annotation/utils.py @@ -6,13 +6,8 @@ from pathlib import Path from typing import Dict, List, Tuple -from docling_core.types.doc.base import BoundingBox, CoordOrigin, ImageRefMode, Size -from docling_core.types.doc.labels import ( - DocItemLabel, - GroupLabel, - PictureClassificationLabel, - TableCellLabel, -) +from docling_core.types.doc.base import BoundingBox +from docling_core.types.doc.labels import DocItemLabel from pydantic import BaseModel diff --git a/docling_eval/benchmarks/doclaynet_v1/create.py b/docling_eval/benchmarks/doclaynet_v1/create.py index fada3a70..6b2ccaf4 100644 --- a/docling_eval/benchmarks/doclaynet_v1/create.py +++ b/docling_eval/benchmarks/doclaynet_v1/create.py @@ -5,7 +5,7 @@ import os from pathlib import Path -from datasets import load_dataset, load_from_disk +from datasets import load_dataset from docling_core.types import DoclingDocument from docling_core.types.doc import ( BoundingBox, @@ -13,7 +13,6 @@ DocItemLabel, GroupLabel, ImageRef, - PageItem, ProvenanceItem, Size, TableCell, @@ -22,23 +21,25 @@ from docling_core.types.io import DocumentStream from tqdm import tqdm # type: ignore -from docling_eval.benchmarks.constants import BenchMarkColumns, ConverterTypes +from docling_eval.benchmarks.constants import ( + BenchMarkColumns, + ConverterTypes, + EvaluationModality, +) from docling_eval.benchmarks.utils import ( add_pages_to_true_doc, - save_comparison_html_with_clusters, - write_datasets_info, -) -from docling_eval.docling.conversion import ( - create_docling_converter, - create_vlm_converter, -) -from docling_eval.docling.utils import ( crop_bounding_box, docling_version, extract_images, from_pil_to_base64uri, save_shard_to_disk, + write_datasets_info, +) +from docling_eval.converters.conversion import ( + create_pdf_docling_converter, + create_smol_docling_converter, ) +from docling_eval.visualisation.visualisations import save_comparison_html_with_clusters # Configure logging logging.basicConfig( @@ -184,9 +185,9 @@ def create_dlnv1_e2e_dataset( # Decide which converter type to initialize if converter_type == ConverterTypes.DOCLING: - converter = create_docling_converter(page_image_scale=1.0) + converter = create_pdf_docling_converter(page_image_scale=1.0) else: - converter = create_vlm_converter() + converter = create_smol_docling_converter() if do_viz: viz_dir = output_dir / "visualizations" @@ -205,10 +206,6 @@ def create_dlnv1_e2e_dataset( ): page_hash = doc["metadata"]["page_hash"] - # # TODO: Debug - # if page_hash != "2b49edc9d0a47e4efaaeabf907a8b8b84b747c295dd10a639e2b5265ac258cf5": - # continue - pdf = doc["pdf"] pdf_stream = io.BytesIO(pdf) pdf_stream.seek(0) @@ -238,9 +235,6 @@ def create_dlnv1_e2e_dataset( for l, b, c in zip(labels, bboxes, contents): update(true_doc, current_list, img, old_size, l, b, c) - # TODO: Debug - # print(f"Create doc_id={page_hash}") - if do_viz: save_comparison_html_with_clusters( filename=viz_dir / f"{true_doc.name}-clusters.html", @@ -265,7 +259,7 @@ def create_dlnv1_e2e_dataset( record = { BenchMarkColumns.CONVERTER_TYPE: converter_type, - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: str(conv_results.status), BenchMarkColumns.DOC_ID: page_hash, BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()), @@ -276,6 +270,10 @@ def create_dlnv1_e2e_dataset( BenchMarkColumns.PREDICTION_PICTURES: pred_pictures, BenchMarkColumns.ORIGINAL: pdf_stream.getvalue(), BenchMarkColumns.MIMETYPE: "image/png", + BenchMarkColumns.MODALITIES: [ + EvaluationModality.LAYOUT, + EvaluationModality.READING_ORDER, + ], } pdf_stream.close() records.append(record) diff --git a/docling_eval/benchmarks/dpbench/create.py b/docling_eval/benchmarks/dpbench/create.py index 678f5a0e..a75c28c9 100644 --- a/docling_eval/benchmarks/dpbench/create.py +++ b/docling_eval/benchmarks/dpbench/create.py @@ -1,4 +1,3 @@ -import argparse import json import logging import os @@ -6,14 +5,18 @@ from typing import Dict, Optional from docling.datamodel.pipeline_options import TableFormerMode -from tqdm import tqdm # type: ignore +from tqdm import tqdm + +from docling_eval.visualisation.visualisations import ( # type: ignore + save_comparison_html, + save_comparison_html_with_clusters, +) # Configure logging logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ) -from bs4 import BeautifulSoup # type: ignore from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size from docling_core.types.doc.document import ( DoclingDocument, @@ -25,28 +28,28 @@ from docling_core.types.doc.labels import DocItemLabel from PIL import Image # as PILImage -from docling_eval.benchmarks.constants import BenchMarkColumns, ConverterTypes +from docling_eval.benchmarks.constants import ( + BenchMarkColumns, + ConverterTypes, + EvaluationModality, +) from docling_eval.benchmarks.utils import ( add_pages_to_true_doc, convert_html_table_into_docling_tabledata, - save_comparison_html, - save_comparison_html_with_clusters, - write_datasets_info, -) -from docling_eval.docling.conversion import ( - create_docling_converter, - create_vlm_converter, -) -from docling_eval.docling.models.tableformer.tf_model_prediction import ( - TableFormerUpdater, -) -from docling_eval.docling.utils import ( crop_bounding_box, docling_version, extract_images, from_pil_to_base64uri, get_binary, save_shard_to_disk, + write_datasets_info, +) +from docling_eval.converters.conversion import ( + create_pdf_docling_converter, + create_smol_docling_converter, +) +from docling_eval.converters.models.tableformer.tf_model_prediction import ( + TableFormerUpdater, ) TRUE_HTML_EXPORT_LABELS = { @@ -247,12 +250,12 @@ def create_dpbench_e2e_dataset( ): # Create Converter if converter_type == ConverterTypes.DOCLING: - converter = create_docling_converter(page_image_scale=1.0) + converter = create_pdf_docling_converter(page_image_scale=1.0) else: - converter = create_vlm_converter() + converter = create_smol_docling_converter() # load the groundtruth - with open(dpbench_dir / f"dataset/reference.json", "r") as fr: + with open(dpbench_dir / "dataset/reference.json", "r") as fr: gt = json.load(fr) viz_dir = output_dir / "vizualisations" @@ -329,7 +332,7 @@ def create_dpbench_e2e_dataset( record = { BenchMarkColumns.CONVERTER_TYPE: converter_type, - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: str(conv_results.status), BenchMarkColumns.DOC_ID: str(filename), BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()), @@ -340,6 +343,10 @@ def create_dpbench_e2e_dataset( BenchMarkColumns.PREDICTION_PICTURES: pred_pictures, BenchMarkColumns.ORIGINAL: get_binary(pdf_path), BenchMarkColumns.MIMETYPE: "application/pdf", + BenchMarkColumns.MODALITIES: [ + EvaluationModality.LAYOUT, + EvaluationModality.READING_ORDER, + ], } records.append(record) @@ -367,7 +374,7 @@ def create_dpbench_tableformer_dataset( tf_updater = TableFormerUpdater(mode, artifacts_path=artifacts_path) # load the groundtruth - with open(dpbench_dir / f"dataset/reference.json", "r") as fr: + with open(dpbench_dir / "dataset/reference.json", "r") as fr: gt = json.load(fr) viz_dir = output_dir / "vizualisations" @@ -436,7 +443,7 @@ def create_dpbench_tableformer_dataset( record = { BenchMarkColumns.CONVERTER_TYPE: ConverterTypes.DOCLING, - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: "SUCCESS", BenchMarkColumns.DOC_ID: str(os.path.basename(pdf_path)), BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()), @@ -447,6 +454,9 @@ def create_dpbench_tableformer_dataset( BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures, BenchMarkColumns.PREDICTION_PAGE_IMAGES: pred_page_images, BenchMarkColumns.PREDICTION_PICTURES: pred_pictures, + BenchMarkColumns.MODALITIES: [ + EvaluationModality.TABLE_STRUCTURE, + ], } records.append(record) diff --git a/docling_eval/benchmarks/funsd/create.py b/docling_eval/benchmarks/funsd/create.py index 8a32f6cc..159b67e6 100644 --- a/docling_eval/benchmarks/funsd/create.py +++ b/docling_eval/benchmarks/funsd/create.py @@ -26,16 +26,20 @@ from PIL import Image from tqdm import tqdm # type: ignore -from docling_eval.benchmarks.constants import BenchMarkColumns -from docling_eval.benchmarks.utils import write_datasets_info -from docling_eval.docling.conversion import create_image_converter -from docling_eval.docling.utils import ( +from docling_eval.benchmarks.constants import ( + BenchMarkColumns, + ConverterTypes, + EvaluationModality, +) +from docling_eval.benchmarks.utils import ( crop_bounding_box, docling_version, extract_images, from_pil_to_base64uri, save_shard_to_disk, + write_datasets_info, ) +from docling_eval.converters.conversion import create_image_docling_converter SHARD_SIZE = 1000 @@ -443,7 +447,7 @@ def create_funsd_dataset( splits: List[str] = ["train", "test"], max_items: int = -1, ): - doc_converter = create_image_converter(do_ocr=True, ocr_lang=["en"]) + doc_converter = create_image_docling_converter(do_ocr=True, ocr_lang=["en"]) num_train_rows = 0 num_test_rows = 0 @@ -511,13 +515,18 @@ def create_funsd_dataset( ) record = { - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_TYPE: ConverterTypes.DOCLING, + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.DOC_ID: img_path.stem, BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()), BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images, BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures, BenchMarkColumns.ORIGINAL: img_bytes, BenchMarkColumns.MIMETYPE: "image/png", + BenchMarkColumns.MODALITIES: [ + EvaluationModality.LAYOUT, + EvaluationModality.READING_ORDER, + ], } records.append(record) count += 1 diff --git a/docling_eval/benchmarks/omnidocbench/create.py b/docling_eval/benchmarks/omnidocbench/create.py index 772e93f3..e08ea4dc 100644 --- a/docling_eval/benchmarks/omnidocbench/create.py +++ b/docling_eval/benchmarks/omnidocbench/create.py @@ -1,4 +1,3 @@ -import argparse import glob import json import logging @@ -6,7 +5,6 @@ from pathlib import Path from typing import Optional -from bs4 import BeautifulSoup # type: ignore from docling.datamodel.pipeline_options import TableFormerMode from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size from docling_core.types.doc.document import DoclingDocument, ImageRef, ProvenanceItem @@ -14,28 +12,32 @@ from PIL import Image # as PILImage from tqdm import tqdm # type: ignore -from docling_eval.benchmarks.constants import BenchMarkColumns, ConverterTypes +from docling_eval.benchmarks.constants import ( + BenchMarkColumns, + ConverterTypes, + EvaluationModality, +) from docling_eval.benchmarks.utils import ( add_pages_to_true_doc, convert_html_table_into_docling_tabledata, - save_comparison_html, - save_comparison_html_with_clusters, - write_datasets_info, -) -from docling_eval.docling.conversion import ( - create_docling_converter, - create_vlm_converter, -) -from docling_eval.docling.models.tableformer.tf_model_prediction import ( - TableFormerUpdater, -) -from docling_eval.docling.utils import ( crop_bounding_box, docling_version, extract_images, from_pil_to_base64uri, get_binary, save_shard_to_disk, + write_datasets_info, +) +from docling_eval.converters.conversion import ( + create_pdf_docling_converter, + create_smol_docling_converter, +) +from docling_eval.converters.models.tableformer.tf_model_prediction import ( + TableFormerUpdater, +) +from docling_eval.visualisation.visualisations import ( + save_comparison_html, + save_comparison_html_with_clusters, ) # Configure logging @@ -260,12 +262,12 @@ def create_omnidocbench_e2e_dataset( # Create Converter if converter_type == ConverterTypes.DOCLING: - converter = create_docling_converter(page_image_scale=1.0) + converter = create_pdf_docling_converter(page_image_scale=1.0) else: - converter = create_vlm_converter() + converter = create_smol_docling_converter() # load the groundtruth - with open(omnidocbench_dir / f"OmniDocBench.json", "r") as fr: + with open(omnidocbench_dir / "OmniDocBench.json", "r") as fr: gt = json.load(fr) gt = update_gt_into_map(gt) @@ -290,7 +292,7 @@ def create_omnidocbench_e2e_dataset( pdf_path = Path(page_tuple[1]) # logging.info(f"file: {pdf_path}") - if not os.path.basename(jpg_path) in gt: + if os.path.basename(jpg_path) not in gt: logging.error(f"did not find ground-truth for {os.path.basename(jpg_path)}") continue @@ -356,7 +358,7 @@ def create_omnidocbench_e2e_dataset( record = { BenchMarkColumns.CONVERTER_TYPE: converter_type, - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: "SUCCESS", BenchMarkColumns.DOC_ID: str(os.path.basename(jpg_path)), BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()), @@ -368,6 +370,10 @@ def create_omnidocbench_e2e_dataset( BenchMarkColumns.PREDICTION_PICTURES: pred_pictures, BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images, BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures, + BenchMarkColumns.MODALITIES: [ + EvaluationModality.LAYOUT, + EvaluationModality.READING_ORDER, + ], } records.append(record) @@ -395,7 +401,7 @@ def create_omnidocbench_tableformer_dataset( tf_updater = TableFormerUpdater(mode, artifacts_path=artifacts_path) # load the groundtruth - with open(omnidocbench_dir / f"OmniDocBench.json", "r") as fr: + with open(omnidocbench_dir / "OmniDocBench.json", "r") as fr: gt = json.load(fr) gt = update_gt_into_map(gt) @@ -418,7 +424,7 @@ def create_omnidocbench_tableformer_dataset( pdf_path = Path(page_tuple[1]) # logging.info(f"file: {pdf_path}") - if not os.path.basename(jpg_path) in gt: + if os.path.basename(jpg_path) not in gt: logging.error(f"did not find ground-truth for {os.path.basename(jpg_path)}") continue @@ -474,7 +480,7 @@ def create_omnidocbench_tableformer_dataset( record = { BenchMarkColumns.CONVERTER_TYPE: ConverterTypes.DOCLING, - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: "SUCCESS", BenchMarkColumns.DOC_ID: str(os.path.basename(jpg_path)), BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()), @@ -485,6 +491,7 @@ def create_omnidocbench_tableformer_dataset( BenchMarkColumns.PREDICTION_PICTURES: pred_pictures, BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images, BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures, + BenchMarkColumns.MODALITIES: [EvaluationModality.TABLE_STRUCTURE], } records.append(record) diff --git a/docling_eval/benchmarks/tableformer_huggingface_otsl/create.py b/docling_eval/benchmarks/tableformer_huggingface_otsl/create.py index a253db42..93ba2ff2 100644 --- a/docling_eval/benchmarks/tableformer_huggingface_otsl/create.py +++ b/docling_eval/benchmarks/tableformer_huggingface_otsl/create.py @@ -17,21 +17,23 @@ from docling_core.types.doc.labels import DocItemLabel from tqdm import tqdm # type: ignore -from docling_eval.benchmarks.constants import BenchMarkColumns +from docling_eval.benchmarks.constants import ( + BenchMarkColumns, + ConverterTypes, + EvaluationModality, +) from docling_eval.benchmarks.utils import ( convert_html_table_into_docling_tabledata, - save_comparison_html, -) -from docling_eval.docling.models.tableformer.tf_model_prediction import ( - PageTokens, - TableFormerUpdater, -) -from docling_eval.docling.utils import ( docling_version, extract_images, from_pil_to_base64uri, save_shard_to_disk, ) +from docling_eval.converters.models.tableformer.tf_model_prediction import ( + PageTokens, + TableFormerUpdater, +) +from docling_eval.visualisation.visualisations import save_comparison_html HTML_EXPORT_LABELS = { DocItemLabel.TITLE, @@ -223,7 +225,8 @@ def create_huggingface_otsl_tableformer_dataset( ) record = { - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_TYPE: ConverterTypes.DOCLING, + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: str(ConversionStatus.SUCCESS.value), BenchMarkColumns.DOC_ID: str(os.path.basename(filename)), BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()), @@ -234,11 +237,13 @@ def create_huggingface_otsl_tableformer_dataset( BenchMarkColumns.PREDICTION_PICTURES: pred_pictures, BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images, BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures, + BenchMarkColumns.MODALITIES: [EvaluationModality.TABLE_STRUCTURE], } records.append(record) else: record = { - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_TYPE: ConverterTypes.DOCLING, + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: str(ConversionStatus.FAILURE.value), BenchMarkColumns.DOC_ID: str(os.path.basename(filename)), BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()), @@ -249,6 +254,7 @@ def create_huggingface_otsl_tableformer_dataset( BenchMarkColumns.PREDICTION_PICTURES: pred_pictures, BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images, BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures, + BenchMarkColumns.MODALITIES: [EvaluationModality.TABLE_STRUCTURE], } records.append(record) diff --git a/docling_eval/benchmarks/utils.py b/docling_eval/benchmarks/utils.py index bf54a553..ec0dc992 100644 --- a/docling_eval/benchmarks/utils.py +++ b/docling_eval/benchmarks/utils.py @@ -1,41 +1,33 @@ -import copy +import base64 import hashlib +import io import json import logging +from importlib.metadata import version from io import BytesIO from pathlib import Path -from typing import Dict, List, Optional, Set +from typing import Any, Dict, List, Optional -import pypdfium2 as pdfium +import pandas as pd from bs4 import BeautifulSoup # type: ignore -from datasets import Features +from datasets import Dataset, Features from datasets import Image as Features_Image from datasets import Sequence, Value from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend -from docling.datamodel.base_models import BoundingBox, Cluster, InputFormat, Page +from docling.datamodel.base_models import InputFormat, Page from docling.datamodel.document import InputDocument -from docling.utils.visualization import draw_clusters -from docling_core.types.doc.base import Size +from docling_core.types.doc.base import BoundingBox, Size from docling_core.types.doc.document import ( - DocItem, DoclingDocument, ImageRef, - ImageRefMode, PageItem, TableCell, TableData, ) -from docling_core.types.doc.labels import DocItemLabel -from PIL import Image, ImageDraw, ImageFont +from PIL import Image +from pydantic import AnyUrl from docling_eval.benchmarks.constants import BenchMarkColumns -from docling_eval.docling.constants import ( - HTML_COMPARISON_PAGE, - HTML_COMPARISON_PAGE_WITH_CLUSTERS, - HTML_DEFAULT_HEAD_FOR_COMP, - HTML_INSPECTION, -) -from docling_eval.docling.utils import from_pil_to_base64, from_pil_to_base64uri def get_binhash(binary_data: bytes) -> str: @@ -55,7 +47,7 @@ def write_datasets_info( ): features = Features( { - BenchMarkColumns.DOCLING_VERSION: Value("string"), + BenchMarkColumns.CONVERTER_VERSION: Value("string"), BenchMarkColumns.STATUS: Value("string"), BenchMarkColumns.DOC_ID: Value("string"), BenchMarkColumns.DOC_PATH: Value("string"), @@ -102,6 +94,14 @@ def get_input_document(file: Path | BytesIO) -> InputDocument: ) +def from_pil_to_base64uri(img: Image.Image) -> AnyUrl: + + image_base64 = from_pil_to_base64(img) + uri = AnyUrl(f"data:image/png;base64,{image_base64}") + + return uri + + def add_pages_to_true_doc( pdf_path: Path | BytesIO, true_doc: DoclingDocument, image_scale: float = 1.0 ): @@ -234,382 +234,203 @@ def convert_html_table_into_docling_tabledata( return TableData(num_rows=num_rows, num_cols=num_cols, table_cells=cells) -def save_comparison_html( - filename: Path, - true_doc: DoclingDocument, - pred_doc: DoclingDocument, - page_image: Image.Image, - true_labels: Set[DocItemLabel], - pred_labels: Set[DocItemLabel], -): +def docling_version() -> str: + return version("docling") # may raise PackageNotFoundError - true_doc_html = true_doc.export_to_html( - image_mode=ImageRefMode.EMBEDDED, - html_head=HTML_DEFAULT_HEAD_FOR_COMP, - labels=true_labels, - ) - pred_doc_html = pred_doc.export_to_html( - image_mode=ImageRefMode.EMBEDDED, - html_head=HTML_DEFAULT_HEAD_FOR_COMP, - labels=pred_labels, - ) +def get_binary(file_path: Path): + """Read binary document into buffer.""" + with open(file_path, "rb") as f: + return f.read() - # since the string in srcdoc are wrapped by ', we need to replace all ' by it HTML convention - true_doc_html = true_doc_html.replace("'", "'") - pred_doc_html = pred_doc_html.replace("'", "'") - image_base64 = from_pil_to_base64(page_image) +def map_to_records(item: Dict): + """Map cells from pdf-parser into a records.""" + header = item["header"] + data = item["data"] - """ - # Convert the image to a bytes object + # Create a DataFrame + df = pd.DataFrame(data, columns=header) + return df.to_dict(orient="records") + + +def from_pil_to_base64(img: Image.Image) -> str: + # Convert the image to a base64 str buffered = io.BytesIO() - page_image.save( - buffered, format="PNG" - ) # Specify the format (e.g., JPEG, PNG, etc.) + img.save(buffered, format="PNG") # Specify the format (e.g., JPEG, PNG, etc.) image_bytes = buffered.getvalue() # Encode the bytes to a Base64 string image_base64 = base64.b64encode(image_bytes).decode("utf-8") - """ + return image_base64 - comparison_page = copy.deepcopy(HTML_COMPARISON_PAGE) - comparison_page = comparison_page.replace("BASE64PAGE", image_base64) - comparison_page = comparison_page.replace("TRUEDOC", true_doc_html) - comparison_page = comparison_page.replace("PREDDOC", pred_doc_html) - with open(str(filename), "w") as fw: - fw.write(comparison_page) +def to_base64(item: Dict[str, Any]) -> str: + image_bytes = item["bytes"] + # Wrap the bytes in a BytesIO object + image_stream = BytesIO(image_bytes) -def draw_arrow( - draw: ImageDraw.ImageDraw, - arrow_coords: tuple[float, float, float, float], - line_width: int = 2, - color: str = "red", -): - r""" - Draw an arrow inside the given draw object - """ - x0, y0, x1, y1 = arrow_coords - - # Arrow parameters - start_point = (x0, y0) # Starting point of the arrow - end_point = (x1, y1) # Ending point of the arrow - arrowhead_length = 20 # Length of the arrowhead - arrowhead_width = 10 # Width of the arrowhead - - # Draw the arrow shaft (line) - draw.line([start_point, end_point], fill=color, width=line_width) - - # Calculate the arrowhead points - dx = end_point[0] - start_point[0] - dy = end_point[1] - start_point[1] - angle = (dx**2 + dy**2) ** 0.5 + 0.01 # Length of the arrow shaft - - # Normalized direction vector for the arrow shaft - ux, uy = dx / angle, dy / angle - - # Base of the arrowhead - base_x = end_point[0] - ux * arrowhead_length - base_y = end_point[1] - uy * arrowhead_length - - # Left and right points of the arrowhead - left_x = base_x - uy * arrowhead_width - left_y = base_y + ux * arrowhead_width - right_x = base_x + uy * arrowhead_width - right_y = base_y - ux * arrowhead_width - - # Draw the arrowhead (triangle) - draw.polygon( - [end_point, (left_x, left_y), (right_x, right_y)], - fill=color, - ) - return draw - - -def draw_clusters_with_reading_order( - doc: DoclingDocument, - page_image: Image.Image, - labels: Set[DocItemLabel], - page_no: int = 1, - reading_order: bool = True, -): - - # img = copy.deepcopy(page_image) - img = page_image.copy() - draw = ImageDraw.Draw(img) - - # Load a font (adjust the font size and path as needed) - font = ImageFont.load_default() - try: - font = ImageFont.truetype("arial.ttf", size=15) - except IOError: - font = ImageFont.load_default() + # Open the image using PIL + image = Image.open(image_stream) - x0, y0 = None, None + # Convert the image to a bytes object + buffered = io.BytesIO() + image.save(buffered, format="PNG") # Specify the format (e.g., JPEG, PNG, etc.) + image_bytes = buffered.getvalue() - for item, level in doc.iterate_items(): - if isinstance(item, DocItem): # and item.label in labels: - for prov in item.prov: + # Encode the bytes to a Base64 string + image_base64 = base64.b64encode(image_bytes).decode("utf-8") + return image_base64 - if page_no != prov.page_no: - continue - bbox = prov.bbox.to_top_left_origin( - page_height=doc.pages[prov.page_no].size.height - ) - bbox = bbox.normalized(doc.pages[prov.page_no].size) +def to_pil(uri): - bbox.l = round(bbox.l * img.width) - bbox.r = round(bbox.r * img.width) - bbox.t = round(bbox.t * img.height) - bbox.b = round(bbox.b * img.height) + base64_string = str(uri) + base64_string = base64_string.split(",")[1] - if bbox.b > bbox.t: - bbox.b, bbox.t = bbox.t, bbox.b + # Step 1: Decode the Base64 string + image_data = base64.b64decode(base64_string) - if not reading_order: - x0, y0 = None, None - elif x0 is None and y0 is None: - x0 = (bbox.l + bbox.r) / 2.0 - y0 = (bbox.b + bbox.t) / 2.0 - else: - assert x0 is not None - assert y0 is not None + # Step 2: Open the image using Pillow + image = Image.open(BytesIO(image_data)) - x1 = (bbox.l + bbox.r) / 2.0 - y1 = (bbox.b + bbox.t) / 2.0 + return image - # Arrow parameters - start_point = (x0, y0) # Starting point of the arrow - end_point = (x1, y1) # Ending point of the arrow - arrowhead_length = 20 # Length of the arrowhead - arrowhead_width = 10 # Width of the arrowhead - arrow_color = "red" - line_width = 2 - - # Draw the arrow shaft (line) - draw.line( - [start_point, end_point], fill=arrow_color, width=line_width - ) +def extract_images( + document: DoclingDocument, + pictures_column: str, + page_images_column: str, +): - # Calculate the arrowhead points - dx = end_point[0] - start_point[0] - dy = end_point[1] - start_point[1] - angle = (dx**2 + dy**2) ** 0.5 + 0.01 # Length of the arrow shaft + pictures = [] + page_images = [] - # Normalized direction vector for the arrow shaft - ux, uy = dx / angle, dy / angle + # Save page images + for img_no, picture in enumerate(document.pictures): + if picture.image is not None: + # img = picture.image.pil_image + # pictures.append(to_pil(picture.image.uri)) + pictures.append(picture.image.pil_image) + picture.image.uri = Path(f"{pictures_column}/{img_no}") + + # Save page images + for page_no, page in document.pages.items(): + if page.image is not None: + # img = page.image.pil_image + # img.show() + page_images.append(page.image.pil_image) + page.image.uri = Path(f"{page_images_column}/{page_no}") + + return document, pictures, page_images + + +def insert_images( + document: DoclingDocument, + pictures: List[Dict[str, Any]], + page_images: List[Dict[str, Any]], +): - # Base of the arrowhead - base_x = end_point[0] - ux * arrowhead_length - base_y = end_point[1] - uy * arrowhead_length + # Save page images + for pic_no, picture in enumerate(document.pictures): + if picture.image is not None: + if pic_no < len(pictures): + b64 = to_base64(pictures[pic_no]) - # Left and right points of the arrowhead - left_x = base_x - uy * arrowhead_width - left_y = base_y + ux * arrowhead_width - right_x = base_x + uy * arrowhead_width - right_y = base_y - ux * arrowhead_width + image_ref = document.pictures[pic_no].image + if image_ref is not None: + image_ref.uri = AnyUrl(f"data:image/png;base64,{b64}") + document.pictures[pic_no].image = image_ref + else: + logging.warning(f"image-ref is none for picture {pic_no}") - # Draw the arrowhead (triangle) - draw.polygon( - [end_point, (left_x, left_y), (right_x, right_y)], - fill=arrow_color, + """ + if document.pictures[pic_no].image is not None: + document.pictures[pic_no].image.uri = AnyUrl( + f"data:image/png;base64,{b64}" ) + else: + logging.warning(f"image-ref is none for picture {pic_no}") + """ - x0, y0 = x1, y1 - - # Draw rectangle with only a border - rectangle_color = "blue" - border_width = 1 - draw.rectangle( - [bbox.l, bbox.b, bbox.r, bbox.t], - outline=rectangle_color, - width=border_width, - ) - - # Calculate label size using getbbox - text_bbox = font.getbbox(str(item.label)) - label_width = text_bbox[2] - text_bbox[0] - label_height = text_bbox[3] - text_bbox[1] - label_x = bbox.l - label_y = ( - bbox.b - label_height - ) # - 5 # Place the label above the rectangle - - # Draw label text - draw.text( - (label_x, label_y), - str(item.label), - fill=rectangle_color, - font=font, - ) - - return img - - -def save_comparison_html_with_clusters( - filename: Path, - true_doc: DoclingDocument, - pred_doc: DoclingDocument, - page_image: Image.Image, - true_labels: Set[DocItemLabel], - pred_labels: Set[DocItemLabel], - draw_reading_order: bool = True, -): - if (1 not in true_doc.pages) or (1 not in pred_doc.pages): - logging.error(f"1 not in true_doc.pages -> skipping {filename} ") - return - - def draw_doc_layout(doc: DoclingDocument, image: Image.Image): - r""" - Draw the document clusters and optionaly the reading order - """ - clusters = [] - for idx, (elem, _) in enumerate(doc.iterate_items()): - if not isinstance(elem, DocItem): - continue - if len(elem.prov) == 0: - continue # Skip elements without provenances - prov = elem.prov[0] - - if prov.page_no not in true_doc.pages or prov.page_no != 1: - logging.error(f"{prov.page_no} not in true_doc.pages -> skipping! ") - continue - - tlo_bbox = prov.bbox.to_top_left_origin( - page_height=true_doc.pages[prov.page_no].size.height - ) - cluster = Cluster( - id=idx, - label=elem.label, - bbox=BoundingBox.model_validate(tlo_bbox), - cells=[], - ) - clusters.append(cluster) - - scale_x = image.width / true_doc.pages[1].size.width - scale_y = image.height / true_doc.pages[1].size.height - draw_clusters(image, clusters, scale_x, scale_y) - - return image - - def draw_doc_reading_order(doc: DoclingDocument, image: Image.Image): - r""" - Draw the reading order - """ - draw = ImageDraw.Draw(image) - x0, y0 = None, None - - for elem, _ in doc.iterate_items(): - if not isinstance(elem, DocItem): - continue - if len(elem.prov) == 0: - continue # Skip elements without provenances - prov = elem.prov[0] - - if prov.page_no not in true_doc.pages or prov.page_no != 1: - logging.error(f"{prov.page_no} not in true_doc.pages -> skipping! ") - continue - - tlo_bbox = prov.bbox.to_top_left_origin( - page_height=true_doc.pages[prov.page_no].size.height - ) - ro_bbox = tlo_bbox.normalized(doc.pages[prov.page_no].size) - ro_bbox.l = round(ro_bbox.l * image.width) - ro_bbox.r = round(ro_bbox.r * image.width) - ro_bbox.t = round(ro_bbox.t * image.height) - ro_bbox.b = round(ro_bbox.b * image.height) - - if ro_bbox.b > ro_bbox.t: - ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b - - if x0 is None and y0 is None: - x0 = (ro_bbox.l + ro_bbox.r) / 2.0 - y0 = (ro_bbox.b + ro_bbox.t) / 2.0 + """ else: - assert x0 is not None - assert y0 is not None - - x1 = (ro_bbox.l + ro_bbox.r) / 2.0 - y1 = (ro_bbox.b + ro_bbox.t) / 2.0 + document.pictures[pic_no].image.uri = None + # logging.warning(f"inconsistent number of images in the document ({len(pictures)} != {len(document.pictures)})") + """ + + # Save page images + for page_no, page in document.pages.items(): + if page.image is not None: + # print(f"inserting image to page: {page_no}") + b64 = to_base64(page_images[page_no - 1]) + + image_ref = document.pages[page_no].image + if image_ref is not None: + image_ref.uri = AnyUrl(f"data:image/png;base64,{b64}") + document.pages[page_no].image = image_ref + + return document + + +def save_shard_to_disk( + items: List[Any], + dataset_path: Path, + thread_id: int = 0, + shard_id: int = 0, + features: Optional[Features] = None, + shard_format: str = "parquet", +): + """Save shard of to disk.""" - draw = draw_arrow( - draw, - (x0, y0, x1, y1), - line_width=2, - color="red", - ) - x0, y0 = x1, y1 - return image - - # HTML rendering - true_doc_html = true_doc.export_to_html( - image_mode=ImageRefMode.EMBEDDED, - html_head=HTML_DEFAULT_HEAD_FOR_COMP, - labels=true_labels, - ) + batch = Dataset.from_list(items) # , features=features) - pred_doc_html = pred_doc.export_to_html( - image_mode=ImageRefMode.EMBEDDED, - html_head=HTML_DEFAULT_HEAD_FOR_COMP, - labels=pred_labels, - ) + output_file = dataset_path / f"shard_{thread_id:06}_{shard_id:06}.{shard_format}" + logging.info(f"Saved shard {shard_id} to {output_file} with {len(items)} documents") - # since the string in srcdoc are wrapped by ', we need to replace all ' by it HTML convention - true_doc_html = true_doc_html.replace("'", "'") - pred_doc_html = pred_doc_html.replace("'", "'") + if shard_format == "json": + batch.to_json(output_file) - true_doc_img = draw_doc_layout(true_doc, copy.deepcopy(page_image)) - pred_doc_img = draw_doc_layout(pred_doc, copy.deepcopy(page_image)) + elif shard_format == "parquet": + batch.to_parquet(output_file) - if draw_reading_order: - true_doc_img = draw_doc_reading_order(true_doc, true_doc_img) - pred_doc_img = draw_doc_reading_order(pred_doc, pred_doc_img) + else: + raise ValueError(f"Unsupported shard_format: {shard_format}") - true_doc_img_b64 = from_pil_to_base64(true_doc_img) - pred_doc_img_b64 = from_pil_to_base64(pred_doc_img) + shard_id += 1 - comparison_page = copy.deepcopy(HTML_COMPARISON_PAGE_WITH_CLUSTERS) - comparison_page = comparison_page.replace("BASE64TRUEPAGE", true_doc_img_b64) - comparison_page = comparison_page.replace("TRUEDOC", true_doc_html) - comparison_page = comparison_page.replace("BASE64PREDPAGE", pred_doc_img_b64) - comparison_page = comparison_page.replace("PREDDOC", pred_doc_html) + return shard_id, [], 0 - with open(str(filename), "w") as fw: - fw.write(comparison_page) +def crop_bounding_box(page_image: Image.Image, page: PageItem, bbox: BoundingBox): + """ + Crop a bounding box from a PIL image. + + :param img: PIL Image object + :param l: Left coordinate + :param t: Top coordinate (from bottom-left origin) + :param r: Right coordinate + :param b: Bottom coordinate (from bottom-left origin) + :return: Cropped PIL Image + """ + width = float(page.size.width) + height = float(page.size.height) -def save_inspection_html( - filename: Path, doc: DoclingDocument, labels: Set[DocItemLabel] -): + img_width = float(page_image.width) + img_height = float(page_image.height) - html_doc = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED, labels=labels) - html_doc = html_doc.replace("'", "'") + scale_x = img_width / width + scale_y = img_height / height - page_images = [] - page_template = '
Example Image
' - for page_no, page in doc.pages.items(): - # page_img = page.image.pil_image - - if page.image is not None and page.image.pil_image is not None: - - page_img = draw_clusters_with_reading_order( - doc=doc, - page_image=page.image.pil_image, - labels=labels, - page_no=page_no, - reading_order=True, - ) + bbox = bbox.to_top_left_origin(page.size.height) - page_base64 = from_pil_to_base64(page_img) - page_images.append(page_template.replace("BASE64PAGE", page_base64)) + l = bbox.l * scale_x + t = bbox.t * scale_y + r = bbox.r * scale_x + b = bbox.b * scale_y - html_viz = copy.deepcopy(HTML_INSPECTION) - html_viz = html_viz.replace("PREDDOC", html_doc) - html_viz = html_viz.replace("PAGE_IMAGES", "\n".join(page_images)) + # Crop using the converted coordinates + cropped_image = page_image.crop((l, t, r, b)) - with open(str(filename), "w") as fw: - fw.write(html_viz) + return cropped_image diff --git a/docling_eval/docling/__init__.py b/docling_eval/converters/__init__.py similarity index 100% rename from docling_eval/docling/__init__.py rename to docling_eval/converters/__init__.py diff --git a/docling_eval/docling/conversion.py b/docling_eval/converters/conversion.py similarity index 97% rename from docling_eval/docling/conversion.py rename to docling_eval/converters/conversion.py index fee903e6..3629464b 100644 --- a/docling_eval/docling/conversion.py +++ b/docling_eval/converters/conversion.py @@ -28,7 +28,7 @@ logging.getLogger("docling").setLevel(logging.WARNING) -def create_docling_converter( +def create_pdf_docling_converter( page_image_scale: float = 2.0, do_ocr: bool = False, ocr_lang: List[str] = ["en"], @@ -81,7 +81,7 @@ def create_docling_converter( return doc_converter -def create_image_converter( +def create_image_docling_converter( do_ocr: bool = False, ocr_lang: List[str] = ["en"], ocr_engine: OcrEngine = OcrEngine.EASYOCR, @@ -131,7 +131,7 @@ def create_image_converter( return doc_converter -def create_vlm_converter( +def create_smol_docling_converter( timings: bool = True, ): vlm_options = SmolDoclingOptions() diff --git a/docling_eval/docling/models/__init__.py b/docling_eval/converters/models/__init__.py similarity index 100% rename from docling_eval/docling/models/__init__.py rename to docling_eval/converters/models/__init__.py diff --git a/docling_eval/docling/models/reading_order/reading_order_updater.py b/docling_eval/converters/models/reading_order/reading_order_updater.py similarity index 95% rename from docling_eval/docling/models/reading_order/reading_order_updater.py rename to docling_eval/converters/models/reading_order/reading_order_updater.py index 92216c80..11a97ff5 100644 --- a/docling_eval/docling/models/reading_order/reading_order_updater.py +++ b/docling_eval/converters/models/reading_order/reading_order_updater.py @@ -1,5 +1,4 @@ import copy -import json import logging from pathlib import Path from typing import Optional @@ -13,8 +12,6 @@ docling_document_to_legacy, ) -from docling_eval.benchmarks.utils import get_input_document - _log = logging.getLogger(__name__) @@ -38,7 +35,7 @@ def __call__( try: # TODO: Understand why some documents fail here glm_doc = self._nlp_model.apply_on_doc(ds_doc_dict) - except RuntimeError as ex: + except RuntimeError: # print("nlp_model.apply_on_doc()") return None diff --git a/docling_eval/docling/models/tableformer/__init__.py b/docling_eval/converters/models/tableformer/__init__.py similarity index 100% rename from docling_eval/docling/models/tableformer/__init__.py rename to docling_eval/converters/models/tableformer/__init__.py diff --git a/docling_eval/docling/models/tableformer/tf_model_prediction.py b/docling_eval/converters/models/tableformer/tf_model_prediction.py similarity index 95% rename from docling_eval/docling/models/tableformer/tf_model_prediction.py rename to docling_eval/converters/models/tableformer/tf_model_prediction.py index a8f30d4a..a3b2cf50 100644 --- a/docling_eval/docling/models/tableformer/tf_model_prediction.py +++ b/docling_eval/converters/models/tableformer/tf_model_prediction.py @@ -1,21 +1,18 @@ import copy import logging -import os from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple import numpy as np from docling.datamodel.base_models import Cluster, LayoutPrediction, Page, Table -from docling.datamodel.document import ConversionResult, InputDocument +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( AcceleratorDevice, AcceleratorOptions, - PdfPipelineOptions, TableFormerMode, TableStructureOptions, ) from docling.models.table_structure_model import TableStructureModel -from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline from docling_core.types.doc import DocItemLabel from docling_core.types.doc.base import BoundingBox from docling_core.types.doc.document import ( @@ -24,14 +21,11 @@ TableData, TableItem, ) -from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor from docling_parse.pdf_parsers import pdf_parser_v2 -from huggingface_hub import snapshot_download from PIL import Image from pydantic import BaseModel -from docling_eval.benchmarks.utils import get_input_document -from docling_eval.docling.utils import crop_bounding_box, map_to_records +from docling_eval.benchmarks.utils import get_input_document, map_to_records # Configure logging logging.basicConfig( diff --git a/docling_eval/docling/utils.py b/docling_eval/docling/utils.py deleted file mode 100644 index af649b9c..00000000 --- a/docling_eval/docling/utils.py +++ /dev/null @@ -1,274 +0,0 @@ -import base64 -import io -import logging -from importlib.metadata import version -from io import BytesIO -from pathlib import Path -from typing import Any, Dict, List, Optional - -import pandas as pd # import-untyped -from datasets import Dataset, DatasetInfo, Features, concatenate_datasets -from docling_core.types.doc.base import BoundingBox -from docling_core.types.doc.document import DoclingDocument, PageItem -from PIL import Image # as PILImage -from pydantic import AnyUrl - -from docling_eval.docling.constants import HTML_DEFAULT_HEAD - - -def docling_version() -> str: - return version("docling") # may raise PackageNotFoundError - - -def create_styled_html(body: str) -> str: - - html_lines = [ - "", - "", - HTML_DEFAULT_HEAD, - "", - body, - "", - ] - return "".join(html_lines) - - -def get_binary(file_path: Path): - """Read binary document into buffer.""" - with open(file_path, "rb") as f: - return f.read() - - -def map_to_records(item: Dict): - """Map cells from pdf-parser into a records.""" - header = item["header"] - data = item["data"] - - # Create a DataFrame - df = pd.DataFrame(data, columns=header) - return df.to_dict(orient="records") - - -def from_pil_to_base64(img: Image.Image) -> str: - # Convert the image to a base64 str - buffered = io.BytesIO() - img.save(buffered, format="PNG") # Specify the format (e.g., JPEG, PNG, etc.) - image_bytes = buffered.getvalue() - - # Encode the bytes to a Base64 string - image_base64 = base64.b64encode(image_bytes).decode("utf-8") - return image_base64 - - -def from_pil_to_base64uri(img: Image.Image) -> AnyUrl: - - image_base64 = from_pil_to_base64(img) - uri = AnyUrl(f"data:image/png;base64,{image_base64}") - - return uri - - -def to_base64(item: Dict[str, Any]) -> str: - image_bytes = item["bytes"] - - # Wrap the bytes in a BytesIO object - image_stream = BytesIO(image_bytes) - - # Open the image using PIL - image = Image.open(image_stream) - - # Convert the image to a bytes object - buffered = io.BytesIO() - image.save(buffered, format="PNG") # Specify the format (e.g., JPEG, PNG, etc.) - image_bytes = buffered.getvalue() - - # Encode the bytes to a Base64 string - image_base64 = base64.b64encode(image_bytes).decode("utf-8") - return image_base64 - - -def to_pil(uri): - - base64_string = str(uri) - base64_string = base64_string.split(",")[1] - - # Step 1: Decode the Base64 string - image_data = base64.b64decode(base64_string) - - # Step 2: Open the image using Pillow - image = Image.open(BytesIO(image_data)) - - return image - - -def extract_images( - document: DoclingDocument, - pictures_column: str, - page_images_column: str, -): - - pictures = [] - page_images = [] - - # Save page images - for img_no, picture in enumerate(document.pictures): - if picture.image is not None: - # img = picture.image.pil_image - # pictures.append(to_pil(picture.image.uri)) - pictures.append(picture.image.pil_image) - picture.image.uri = Path(f"{pictures_column}/{img_no}") - - # Save page images - for page_no, page in document.pages.items(): - if page.image is not None: - # img = page.image.pil_image - # img.show() - page_images.append(page.image.pil_image) - page.image.uri = Path(f"{page_images_column}/{page_no}") - - return document, pictures, page_images - - -def insert_images( - document: DoclingDocument, - pictures: List[Dict[str, Any]], - page_images: List[Dict[str, Any]], -): - - # Save page images - for pic_no, picture in enumerate(document.pictures): - if picture.image is not None: - if pic_no < len(pictures): - b64 = to_base64(pictures[pic_no]) - - image_ref = document.pictures[pic_no].image - if image_ref is not None: - image_ref.uri = AnyUrl(f"data:image/png;base64,{b64}") - document.pictures[pic_no].image = image_ref - else: - logging.warning(f"image-ref is none for picture {pic_no}") - - """ - if document.pictures[pic_no].image is not None: - document.pictures[pic_no].image.uri = AnyUrl( - f"data:image/png;base64,{b64}" - ) - else: - logging.warning(f"image-ref is none for picture {pic_no}") - """ - - """ - else: - document.pictures[pic_no].image.uri = None - # logging.warning(f"inconsistent number of images in the document ({len(pictures)} != {len(document.pictures)})") - """ - - # Save page images - for page_no, page in document.pages.items(): - if page.image is not None: - # print(f"inserting image to page: {page_no}") - b64 = to_base64(page_images[page_no - 1]) - - image_ref = document.pages[page_no].image - if image_ref is not None: - image_ref.uri = AnyUrl(f"data:image/png;base64,{b64}") - document.pages[page_no].image = image_ref - - return document - - -def save_shard_to_disk( - items: List[Any], - dataset_path: Path, - thread_id: int = 0, - shard_id: int = 0, - features: Optional[Features] = None, - shard_format: str = "parquet", -): - """Save shard of to disk.""" - - batch = Dataset.from_list(items) # , features=features) - - output_file = dataset_path / f"shard_{thread_id:06}_{shard_id:06}.{shard_format}" - logging.info(f"Saved shard {shard_id} to {output_file} with {len(items)} documents") - - if shard_format == "json": - batch.to_json(output_file) - - elif shard_format == "parquet": - batch.to_parquet(output_file) - - else: - raise ValueError(f"Unsupported shard_format: {shard_format}") - - shard_id += 1 - - return shard_id, [], 0 - - -def load_shard_from_disk(parquet_path: Path): - """ - Load Parquet shard into a single Hugging Face dataset. - """ - return Dataset.from_parquet(parquet_path) - - -def load_shards_from_disk(dataset_path: Path): - """ - Load all Parquet shards from a directory into a single Hugging Face dataset. - """ - parquet_files = sorted(list(Path(dataset_path).glob("*.parquet"))) - datasets = [Dataset.from_parquet(str(pfile)) for pfile in parquet_files] - return concatenate_datasets(datasets) - - -def generate_dataset_info( - output_dir: Path, - features: Features, - description: str = "", - license: str = "CC-BY 4.0", - version="1.0.0", -): - """ - Generate dataset_info.json manually for a dataset. - """ - dataset_info = DatasetInfo( - description=description, - features=features, - license="CC-BY 4.0", - version="1.0.0", - ) - dataset_info.save_to_disk(str(output_dir)) - - -def crop_bounding_box(page_image: Image.Image, page: PageItem, bbox: BoundingBox): - """ - Crop a bounding box from a PIL image. - - :param img: PIL Image object - :param l: Left coordinate - :param t: Top coordinate (from bottom-left origin) - :param r: Right coordinate - :param b: Bottom coordinate (from bottom-left origin) - :return: Cropped PIL Image - """ - width = float(page.size.width) - height = float(page.size.height) - - img_width = float(page_image.width) - img_height = float(page_image.height) - - scale_x = img_width / width - scale_y = img_height / height - - bbox = bbox.to_top_left_origin(page.size.height) - - l = bbox.l * scale_x - t = bbox.t * scale_y - r = bbox.r * scale_x - b = bbox.b * scale_y - - # Crop using the converted coordinates - cropped_image = page_image.crop((l, t, r, b)) - - return cropped_image diff --git a/docling_eval/evaluators/base_readingorder_evaluator.py b/docling_eval/evaluators/base_readingorder_evaluator.py index b758fbc1..7921394b 100644 --- a/docling_eval/evaluators/base_readingorder_evaluator.py +++ b/docling_eval/evaluators/base_readingorder_evaluator.py @@ -16,8 +16,8 @@ from tqdm import tqdm # type: ignore from docling_eval.benchmarks.constants import BenchMarkColumns -from docling_eval.benchmarks.utils import draw_arrow from docling_eval.evaluators.stats import DatasetStatistics, compute_stats +from docling_eval.visualisation.visualisations import draw_arrow _log = logging.getLogger(__name__) diff --git a/docling_eval/evaluators/bbox_text_evaluator.py b/docling_eval/evaluators/bbox_text_evaluator.py index 721086ee..67b7ee45 100644 --- a/docling_eval/evaluators/bbox_text_evaluator.py +++ b/docling_eval/evaluators/bbox_text_evaluator.py @@ -1,6 +1,6 @@ import logging from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional import nltk from datasets import load_dataset diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index 78bb64e1..f1bdea42 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -1,6 +1,4 @@ -import copy import glob -import json import logging from pathlib import Path from typing import Dict, List, Tuple diff --git a/docling_eval/evaluators/markdown_text_evaluator.py b/docling_eval/evaluators/markdown_text_evaluator.py index 0e338dda..c5364765 100644 --- a/docling_eval/evaluators/markdown_text_evaluator.py +++ b/docling_eval/evaluators/markdown_text_evaluator.py @@ -1,6 +1,6 @@ import logging from pathlib import Path -from typing import Dict, Iterator, List, Optional, Set, Tuple +from typing import List, Optional, Set import evaluate import nltk diff --git a/docling_eval/evaluators/table_evaluator.py b/docling_eval/evaluators/table_evaluator.py index 76390e51..7edc6794 100644 --- a/docling_eval/evaluators/table_evaluator.py +++ b/docling_eval/evaluators/table_evaluator.py @@ -2,7 +2,6 @@ import logging import random from pathlib import Path -from typing import List, Optional import matplotlib.pyplot as plt from datasets import Dataset, load_dataset @@ -14,7 +13,7 @@ from docling_eval.benchmarks.constants import BenchMarkColumns from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.teds import TEDScorer +from docling_eval.evaluators.teds import TEDScorer _log = logging.getLogger(__name__) @@ -236,7 +235,7 @@ def _evaluate_tables_in_documents( pred_nrows=pred_table.data.num_rows, ) table_evaluations.append(table_evaluation) - except Exception as exc: + except Exception: logging.error( f"Table {table_id} from document {doc_id} could not be compared!" ) diff --git a/docling_eval/utils/teds.py b/docling_eval/evaluators/teds.py similarity index 100% rename from docling_eval/utils/teds.py rename to docling_eval/evaluators/teds.py diff --git a/docling_eval/utils/__init__.py b/docling_eval/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/docling_eval/utils/bleu.py b/docling_eval/utils/bleu.py deleted file mode 100644 index 15381392..00000000 --- a/docling_eval/utils/bleu.py +++ /dev/null @@ -1,53 +0,0 @@ -# from nltk.translate.bleu_score import corpus_bleu - - -# def compute_bleu_scores( -# targets: list[list[str]], predictions: list[list[str]] -# ) -> tuple[list[float], float]: -# r""" -# Compute the BLEU score for the given targets and predictions. - -# Parameters -# ---------- -# targets : List[List[str]] -# The ground truth target sequences. -# predictions : List[List[str]] -# The predicted sequences. Each prediction is a list of tokens. - -# Returns -# ------- -# Tuple[List[float], float] -# A tuple containing: -# - bleu_scores (List[float]): The BLEU scores of the predictions. -# - avg_bleu_score (float): The average BLEU score across all predictions. -# """ -# weights = (0.25, 0.25, 0.25, 0.25) - -# bleu_scores = [ -# corpus_bleu([[tg]], [pred], weights=weights) -# for tg, pred in zip(targets, predictions) -# ] -# return bleu_scores, sum(bleu_scores) / len(bleu_scores) - - -# def compute_bleu_score(target: list[str], prediction: list[str]) -> float: -# r""" -# Compute the BLEU score for the given targetand prediction text. - -# Parameters -# ---------- -# targets : List[List[str]] -# The ground truth target sequences. -# predictions : List[List[str]] -# The predicted sequences. Each prediction is a list of tokens. - -# Returns -# ------- -# bleu_score -# """ -# weights = (0.25, 0.25, 0.25, 0.25) - -# # reference: Ground truth (in BLEU there can be many references) -# # hypothesis: prediction -# bleu = corpus_bleu([[target]], [prediction], weights=weights) -# return bleu diff --git a/docling_eval/utils/evaluation_error.py b/docling_eval/utils/evaluation_error.py deleted file mode 100644 index ef62902a..00000000 --- a/docling_eval/utils/evaluation_error.py +++ /dev/null @@ -1,10 +0,0 @@ -from typing import Optional - - -class EvaluationError(Exception): - r""" - Evaluation error class - """ - - def __init__(self, msg: Optional[str] = None): - Exception.__init__(self, msg) diff --git a/docling_eval/utils/repository.py b/docling_eval/utils/repository.py deleted file mode 100644 index d0391af4..00000000 --- a/docling_eval/utils/repository.py +++ /dev/null @@ -1,57 +0,0 @@ -import logging -import os -import subprocess -from pathlib import Path - -# Configure logging -logging.basicConfig( - level=logging.INFO, # Set the logging level - format="%(asctime)s - %(levelname)s - %(message)s", -) - -logger = logging.getLogger(__name__) - -# TODO: Unused codes below. - - -def is_git_lfs_installed(): - """ - Check if Git LFS is installed. - """ - try: - result = subprocess.run( - ["git", "lfs", "version"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - ) - if result.returncode == 0: - logger.info("Git LFS is installed.") - return True - else: - logger.warning("Git LFS is not installed.") - return False - except FileNotFoundError: - logger.error("Git is not installed.") - return False - - -def clone_repository(repo_url: str, target_directory: Path): - """ - Clone a Git repository to the specified target directory. - """ - if os.path.exists(target_directory): - logger.warning( - f"Target directory '{target_directory}' already exists. Skipping clone." - ) - return - - try: - subprocess.run( - ["git", "clone", repo_url, target_directory], - check=True, - ) - logger.info(f"Repository cloned into '{target_directory}'.") - except subprocess.CalledProcessError as e: - logger.error(f"Failed to clone repository: {e}") - raise diff --git a/docling_eval/docling/constants.py b/docling_eval/visualisation/constants.py similarity index 100% rename from docling_eval/docling/constants.py rename to docling_eval/visualisation/constants.py diff --git a/docling_eval/visualisation/visualisations.py b/docling_eval/visualisation/visualisations.py new file mode 100644 index 00000000..c2120f78 --- /dev/null +++ b/docling_eval/visualisation/visualisations.py @@ -0,0 +1,399 @@ +import copy +import logging +from pathlib import Path +from typing import Set + +from docling.datamodel.base_models import BoundingBox, Cluster +from docling.utils.visualization import draw_clusters +from docling_core.types.doc.document import DocItem, DoclingDocument, ImageRefMode +from docling_core.types.doc.labels import DocItemLabel +from PIL import Image, ImageDraw, ImageFont + +from docling_eval.benchmarks.utils import from_pil_to_base64 +from docling_eval.visualisation.constants import ( + HTML_COMPARISON_PAGE, + HTML_COMPARISON_PAGE_WITH_CLUSTERS, + HTML_DEFAULT_HEAD_FOR_COMP, + HTML_INSPECTION, +) + + +def save_comparison_html( + filename: Path, + true_doc: DoclingDocument, + pred_doc: DoclingDocument, + page_image: Image.Image, + true_labels: Set[DocItemLabel], + pred_labels: Set[DocItemLabel], +): + + true_doc_html = true_doc.export_to_html( + image_mode=ImageRefMode.EMBEDDED, + html_head=HTML_DEFAULT_HEAD_FOR_COMP, + labels=true_labels, + ) + + pred_doc_html = pred_doc.export_to_html( + image_mode=ImageRefMode.EMBEDDED, + html_head=HTML_DEFAULT_HEAD_FOR_COMP, + labels=pred_labels, + ) + + # since the string in srcdoc are wrapped by ', we need to replace all ' by it HTML convention + true_doc_html = true_doc_html.replace("'", "'") + pred_doc_html = pred_doc_html.replace("'", "'") + + image_base64 = from_pil_to_base64(page_image) + + """ + # Convert the image to a bytes object + buffered = io.BytesIO() + page_image.save( + buffered, format="PNG" + ) # Specify the format (e.g., JPEG, PNG, etc.) + image_bytes = buffered.getvalue() + + # Encode the bytes to a Base64 string + image_base64 = base64.b64encode(image_bytes).decode("utf-8") + """ + + comparison_page = copy.deepcopy(HTML_COMPARISON_PAGE) + comparison_page = comparison_page.replace("BASE64PAGE", image_base64) + comparison_page = comparison_page.replace("TRUEDOC", true_doc_html) + comparison_page = comparison_page.replace("PREDDOC", pred_doc_html) + + with open(str(filename), "w") as fw: + fw.write(comparison_page) + + +def draw_arrow( + draw: ImageDraw.ImageDraw, + arrow_coords: tuple[float, float, float, float], + line_width: int = 2, + color: str = "red", +): + r""" + Draw an arrow inside the given draw object + """ + x0, y0, x1, y1 = arrow_coords + + # Arrow parameters + start_point = (x0, y0) # Starting point of the arrow + end_point = (x1, y1) # Ending point of the arrow + arrowhead_length = 20 # Length of the arrowhead + arrowhead_width = 10 # Width of the arrowhead + + # Draw the arrow shaft (line) + draw.line([start_point, end_point], fill=color, width=line_width) + + # Calculate the arrowhead points + dx = end_point[0] - start_point[0] + dy = end_point[1] - start_point[1] + angle = (dx**2 + dy**2) ** 0.5 + 0.01 # Length of the arrow shaft + + # Normalized direction vector for the arrow shaft + ux, uy = dx / angle, dy / angle + + # Base of the arrowhead + base_x = end_point[0] - ux * arrowhead_length + base_y = end_point[1] - uy * arrowhead_length + + # Left and right points of the arrowhead + left_x = base_x - uy * arrowhead_width + left_y = base_y + ux * arrowhead_width + right_x = base_x + uy * arrowhead_width + right_y = base_y - ux * arrowhead_width + + # Draw the arrowhead (triangle) + draw.polygon( + [end_point, (left_x, left_y), (right_x, right_y)], + fill=color, + ) + return draw + + +def draw_clusters_with_reading_order( + doc: DoclingDocument, + page_image: Image.Image, + labels: Set[DocItemLabel], + page_no: int = 1, + reading_order: bool = True, +): + + # img = copy.deepcopy(page_image) + img = page_image.copy() + draw = ImageDraw.Draw(img) + + # Load a font (adjust the font size and path as needed) + font = ImageFont.load_default() + try: + font = ImageFont.truetype("arial.ttf", size=15) + except IOError: + font = ImageFont.load_default() + + x0, y0 = None, None + + for item, level in doc.iterate_items(): + if isinstance(item, DocItem): # and item.label in labels: + for prov in item.prov: + + if page_no != prov.page_no: + continue + + bbox = prov.bbox.to_top_left_origin( + page_height=doc.pages[prov.page_no].size.height + ) + bbox = bbox.normalized(doc.pages[prov.page_no].size) + + bbox.l = round(bbox.l * img.width) + bbox.r = round(bbox.r * img.width) + bbox.t = round(bbox.t * img.height) + bbox.b = round(bbox.b * img.height) + + if bbox.b > bbox.t: + bbox.b, bbox.t = bbox.t, bbox.b + + if not reading_order: + x0, y0 = None, None + elif x0 is None and y0 is None: + x0 = (bbox.l + bbox.r) / 2.0 + y0 = (bbox.b + bbox.t) / 2.0 + else: + assert x0 is not None + assert y0 is not None + + x1 = (bbox.l + bbox.r) / 2.0 + y1 = (bbox.b + bbox.t) / 2.0 + + # Arrow parameters + start_point = (x0, y0) # Starting point of the arrow + end_point = (x1, y1) # Ending point of the arrow + arrowhead_length = 20 # Length of the arrowhead + arrowhead_width = 10 # Width of the arrowhead + + arrow_color = "red" + line_width = 2 + + # Draw the arrow shaft (line) + draw.line( + [start_point, end_point], fill=arrow_color, width=line_width + ) + + # Calculate the arrowhead points + dx = end_point[0] - start_point[0] + dy = end_point[1] - start_point[1] + angle = (dx**2 + dy**2) ** 0.5 + 0.01 # Length of the arrow shaft + + # Normalized direction vector for the arrow shaft + ux, uy = dx / angle, dy / angle + + # Base of the arrowhead + base_x = end_point[0] - ux * arrowhead_length + base_y = end_point[1] - uy * arrowhead_length + + # Left and right points of the arrowhead + left_x = base_x - uy * arrowhead_width + left_y = base_y + ux * arrowhead_width + right_x = base_x + uy * arrowhead_width + right_y = base_y - ux * arrowhead_width + + # Draw the arrowhead (triangle) + draw.polygon( + [end_point, (left_x, left_y), (right_x, right_y)], + fill=arrow_color, + ) + + x0, y0 = x1, y1 + + # Draw rectangle with only a border + rectangle_color = "blue" + border_width = 1 + draw.rectangle( + [bbox.l, bbox.b, bbox.r, bbox.t], + outline=rectangle_color, + width=border_width, + ) + + # Calculate label size using getbbox + text_bbox = font.getbbox(str(item.label)) + label_width = text_bbox[2] - text_bbox[0] + label_height = text_bbox[3] - text_bbox[1] + label_x = bbox.l + label_y = ( + bbox.b - label_height + ) # - 5 # Place the label above the rectangle + + # Draw label text + draw.text( + (label_x, label_y), + str(item.label), + fill=rectangle_color, + font=font, + ) + + return img + + +def save_comparison_html_with_clusters( + filename: Path, + true_doc: DoclingDocument, + pred_doc: DoclingDocument, + page_image: Image.Image, + true_labels: Set[DocItemLabel], + pred_labels: Set[DocItemLabel], + draw_reading_order: bool = True, +): + if (1 not in true_doc.pages) or (1 not in pred_doc.pages): + logging.error(f"1 not in true_doc.pages -> skipping {filename} ") + return + + def draw_doc_layout(doc: DoclingDocument, image: Image.Image): + r""" + Draw the document clusters and optionaly the reading order + """ + clusters = [] + for idx, (elem, _) in enumerate(doc.iterate_items()): + if not isinstance(elem, DocItem): + continue + if len(elem.prov) == 0: + continue # Skip elements without provenances + prov = elem.prov[0] + + if prov.page_no not in true_doc.pages or prov.page_no != 1: + logging.error(f"{prov.page_no} not in true_doc.pages -> skipping! ") + continue + + tlo_bbox = prov.bbox.to_top_left_origin( + page_height=true_doc.pages[prov.page_no].size.height + ) + cluster = Cluster( + id=idx, + label=elem.label, + bbox=BoundingBox.model_validate(tlo_bbox), + cells=[], + ) + clusters.append(cluster) + + scale_x = image.width / true_doc.pages[1].size.width + scale_y = image.height / true_doc.pages[1].size.height + draw_clusters(image, clusters, scale_x, scale_y) + + return image + + def draw_doc_reading_order(doc: DoclingDocument, image: Image.Image): + r""" + Draw the reading order + """ + draw = ImageDraw.Draw(image) + x0, y0 = None, None + + for elem, _ in doc.iterate_items(): + if not isinstance(elem, DocItem): + continue + if len(elem.prov) == 0: + continue # Skip elements without provenances + prov = elem.prov[0] + + if prov.page_no not in true_doc.pages or prov.page_no != 1: + logging.error(f"{prov.page_no} not in true_doc.pages -> skipping! ") + continue + + tlo_bbox = prov.bbox.to_top_left_origin( + page_height=true_doc.pages[prov.page_no].size.height + ) + ro_bbox = tlo_bbox.normalized(doc.pages[prov.page_no].size) + ro_bbox.l = round(ro_bbox.l * image.width) + ro_bbox.r = round(ro_bbox.r * image.width) + ro_bbox.t = round(ro_bbox.t * image.height) + ro_bbox.b = round(ro_bbox.b * image.height) + + if ro_bbox.b > ro_bbox.t: + ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b + + if x0 is None and y0 is None: + x0 = (ro_bbox.l + ro_bbox.r) / 2.0 + y0 = (ro_bbox.b + ro_bbox.t) / 2.0 + else: + assert x0 is not None + assert y0 is not None + + x1 = (ro_bbox.l + ro_bbox.r) / 2.0 + y1 = (ro_bbox.b + ro_bbox.t) / 2.0 + + draw = draw_arrow( + draw, + (x0, y0, x1, y1), + line_width=2, + color="red", + ) + x0, y0 = x1, y1 + return image + + # HTML rendering + true_doc_html = true_doc.export_to_html( + image_mode=ImageRefMode.EMBEDDED, + html_head=HTML_DEFAULT_HEAD_FOR_COMP, + labels=true_labels, + ) + + pred_doc_html = pred_doc.export_to_html( + image_mode=ImageRefMode.EMBEDDED, + html_head=HTML_DEFAULT_HEAD_FOR_COMP, + labels=pred_labels, + ) + + # since the string in srcdoc are wrapped by ', we need to replace all ' by it HTML convention + true_doc_html = true_doc_html.replace("'", "'") + pred_doc_html = pred_doc_html.replace("'", "'") + + true_doc_img = draw_doc_layout(true_doc, copy.deepcopy(page_image)) + pred_doc_img = draw_doc_layout(pred_doc, copy.deepcopy(page_image)) + + if draw_reading_order: + true_doc_img = draw_doc_reading_order(true_doc, true_doc_img) + pred_doc_img = draw_doc_reading_order(pred_doc, pred_doc_img) + + true_doc_img_b64 = from_pil_to_base64(true_doc_img) + pred_doc_img_b64 = from_pil_to_base64(pred_doc_img) + + comparison_page = copy.deepcopy(HTML_COMPARISON_PAGE_WITH_CLUSTERS) + comparison_page = comparison_page.replace("BASE64TRUEPAGE", true_doc_img_b64) + comparison_page = comparison_page.replace("TRUEDOC", true_doc_html) + comparison_page = comparison_page.replace("BASE64PREDPAGE", pred_doc_img_b64) + comparison_page = comparison_page.replace("PREDDOC", pred_doc_html) + + with open(str(filename), "w") as fw: + fw.write(comparison_page) + + +def save_inspection_html( + filename: Path, doc: DoclingDocument, labels: Set[DocItemLabel] +): + + html_doc = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED, labels=labels) + html_doc = html_doc.replace("'", "'") + + page_images = [] + page_template = '
Example Image
' + for page_no, page in doc.pages.items(): + # page_img = page.image.pil_image + + if page.image is not None and page.image.pil_image is not None: + + page_img = draw_clusters_with_reading_order( + doc=doc, + page_image=page.image.pil_image, + labels=labels, + page_no=page_no, + reading_order=True, + ) + + page_base64 = from_pil_to_base64(page_img) + page_images.append(page_template.replace("BASE64PAGE", page_base64)) + + html_viz = copy.deepcopy(HTML_INSPECTION) + html_viz = html_viz.replace("PREDDOC", html_doc) + html_viz = html_viz.replace("PAGE_IMAGES", "\n".join(page_images)) + + with open(str(filename), "w") as fw: + fw.write(html_viz) diff --git a/docs/examples/benchmark_doclaynet_v1.py b/docs/examples/benchmark_doclaynet_v1.py index b16de340..b9518388 100644 --- a/docs/examples/benchmark_doclaynet_v1.py +++ b/docs/examples/benchmark_doclaynet_v1.py @@ -2,8 +2,6 @@ import os from pathlib import Path -from tabulate import tabulate # type: ignore - from docling_eval.benchmarks.constants import BenchMarkNames, EvaluationModality from docling_eval.benchmarks.doclaynet_v1.create import create_dlnv1_e2e_dataset from docling_eval.cli.main import evaluate, visualise diff --git a/docs/examples/benchmark_dpbench.py b/docs/examples/benchmark_dpbench.py index 4f44b453..4d602513 100644 --- a/docs/examples/benchmark_dpbench.py +++ b/docs/examples/benchmark_dpbench.py @@ -3,7 +3,6 @@ from pathlib import Path from huggingface_hub import snapshot_download -from tabulate import tabulate # type: ignore from docling_eval.benchmarks.constants import BenchMarkNames, EvaluationModality from docling_eval.benchmarks.dpbench.create import ( diff --git a/docs/examples/benchmark_omnidocbench.py b/docs/examples/benchmark_omnidocbench.py index cd16e949..76a41dec 100644 --- a/docs/examples/benchmark_omnidocbench.py +++ b/docs/examples/benchmark_omnidocbench.py @@ -3,7 +3,6 @@ from pathlib import Path from huggingface_hub import snapshot_download -from tabulate import tabulate # type: ignore from docling_eval.benchmarks.constants import BenchMarkNames, EvaluationModality from docling_eval.benchmarks.omnidocbench.create import ( diff --git a/docs/examples/package_pdfs.py b/docs/examples/package_pdfs.py index 59b7a043..435b05e1 100644 --- a/docs/examples/package_pdfs.py +++ b/docs/examples/package_pdfs.py @@ -9,15 +9,15 @@ from tqdm import tqdm # type: ignore from docling_eval.benchmarks.constants import BenchMarkColumns -from docling_eval.benchmarks.utils import draw_clusters_with_reading_order -from docling_eval.docling.constants import HTML_INSPECTION -from docling_eval.docling.conversion import create_docling_converter -from docling_eval.docling.utils import ( +from docling_eval.benchmarks.utils import ( docling_version, from_pil_to_base64, get_binary, save_shard_to_disk, ) +from docling_eval.converters.conversion import create_pdf_docling_converter +from docling_eval.visualisation.constants import HTML_INSPECTION +from docling_eval.visualisation.visualisations import draw_clusters_with_reading_order # Configure logging logging.basicConfig( @@ -133,7 +133,7 @@ def main(): os.makedirs(viz_dir, exist_ok=True) # Create Converter - doc_converter = create_docling_converter( + doc_converter = create_pdf_docling_converter( page_image_scale=image_scale, do_ocr=do_ocr ) @@ -149,7 +149,7 @@ def main(): pred_doc = conv_results.document except: record = { - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: str(ConversionStatus.FAILURE.value), BenchMarkColumns.DOC_ID: str(os.path.basename(pdf_file)), BenchMarkColumns.PREDICTION: json.dumps(None), @@ -197,7 +197,7 @@ def main(): fw.write(page) record = { - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: str(conv_results.status.value), BenchMarkColumns.DOC_ID: str(os.path.basename(pdf_file)), BenchMarkColumns.PREDICTION: json.dumps(pred_doc.export_to_dict()), diff --git a/docs/examples/package_pngs.py b/docs/examples/package_pngs.py index 3f5ec122..7dc9b793 100644 --- a/docs/examples/package_pngs.py +++ b/docs/examples/package_pngs.py @@ -7,23 +7,19 @@ from docling.datamodel.base_models import ConversionStatus from docling_core.types.doc.document import ImageRefMode from docling_core.types.doc.labels import DocItemLabel -from PIL import Image as PILImage from tqdm import tqdm # type: ignore from docling_eval.benchmarks.constants import BenchMarkColumns -from docling_eval.benchmarks.utils import draw_clusters_with_reading_order -from docling_eval.docling.constants import HTML_INSPECTION -from docling_eval.docling.conversion import ( - create_docling_converter, - create_image_converter, -) -from docling_eval.docling.utils import ( +from docling_eval.benchmarks.utils import ( docling_version, extract_images, from_pil_to_base64, get_binary, save_shard_to_disk, ) +from docling_eval.converters.conversion import create_image_docling_converter +from docling_eval.visualisation.constants import HTML_INSPECTION +from docling_eval.visualisation.visualisations import draw_clusters_with_reading_order # Configure logging logging.basicConfig( @@ -143,7 +139,7 @@ def main(): os.makedirs(viz_dir, exist_ok=True) # Create Converter - doc_converter = create_image_converter( + doc_converter = create_image_docling_converter( do_ocr=True, ocr_lang=["en"], ocr_engine=OcrEngine.OCRMAC ) @@ -159,7 +155,7 @@ def main(): pred_doc = conv_results.document except: record = { - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: str(ConversionStatus.FAILURE.value), BenchMarkColumns.DOC_ID: str(os.path.basename(img_file)), BenchMarkColumns.DOC_PATH: str(img_file), @@ -222,7 +218,7 @@ def main(): ) record = { - BenchMarkColumns.DOCLING_VERSION: docling_version(), + BenchMarkColumns.CONVERTER_VERSION: docling_version(), BenchMarkColumns.STATUS: str(conv_results.status.value), BenchMarkColumns.DOC_ID: str(os.path.basename(img_file)), BenchMarkColumns.PREDICTION: json.dumps(pred_doc.export_to_dict()), diff --git a/poetry.lock b/poetry.lock index df592bd3..94c257f8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -890,8 +890,8 @@ vlm = ["transformers (>=4.42.0,<4.43.0)", "transformers (>=4.46.0,<5.0.0)"] [package.source] type = "git" url = "https://github.com/DS4SD/docling.git" -reference = "a7a1f32b107f4fd48b0ab89aad2e42f03997113a" -resolved_reference = "a7a1f32b107f4fd48b0ab89aad2e42f03997113a" +reference = "mly/smol-docling-integration" +resolved_reference = "1c75b52f850e19c34ea8a8e0c20021779d1b605e" [[package]] name = "docling-core" @@ -6402,4 +6402,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "78fb87e08a946919849d1daf977abaa304edfc7b5b5dbec2247cd871b3075a6c" +content-hash = "e9f12a7628ff0b035f03c036594641d91086e1493db6117ce8c44d81b92e313a" diff --git a/pyproject.toml b/pyproject.toml index caa6bf9d..f764b45b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,8 @@ datasets = "^3.2.0" apted = "^1.0.3" Distance = "^0.1.3" # docling = "^2.24.0" -docling = { git = "https://github.com/DS4SD/docling.git", rev = "a7a1f32b107f4fd48b0ab89aad2e42f03997113a" } +# docling = { git = "https://github.com/DS4SD/docling.git", rev = "dc3a388aa2fe73e6dab8f7edd0c0ee7c6f692244" } +docling = { git = "https://github.com/DS4SD/docling.git", branch = "mly/smol-docling-integration" } matplotlib = "^3.10.0" torch = "^2.5.1" torchmetrics = "^1.6.0" @@ -100,7 +101,8 @@ module = [ "nltk.*", "huggingface_hub.*", "PIL.*", - "evaluate.*" + "evaluate.*", + "tqdm.*" ] ignore_missing_imports = true