From c0d6ec7a08447e692ea3eb4bda0628a17f778776 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Wed, 26 Mar 2025 20:01:19 +0100 Subject: [PATCH 1/4] More module removal and renaming Signed-off-by: Christoph Auer --- docling_eval/cli/main.py | 3 +- docling_eval/converters/models/__init__.py | 0 .../reading_order/reading_order_updater.py | 56 ---- .../converters/models/tableformer/__init__.py | 0 .../models/tableformer/tf_model_prediction.py | 289 ------------------ docling_eval/datamodels/dataset_record.py | 2 +- .../datamodels/{constants.py => types.py} | 18 ++ .../dataset_builders/doclaynet_v1_builder.py | 2 +- .../dataset_builders/doclaynet_v2_builder.py | 2 +- .../dataset_builders/funsd_builder.py | 2 +- .../otsl_table_dataset_builder.py | 7 +- .../dataset_builders/xfund_builder.py | 2 +- docling_eval/evaluators/base_evaluator.py | 2 +- .../evaluators/base_readingorder_evaluator.py | 2 +- .../evaluators/bbox_text_evaluator.py | 2 +- docling_eval/evaluators/layout_evaluator.py | 2 +- .../evaluators/markdown_text_evaluator.py | 2 +- .../evaluators/readingorder_evaluator_glm.py | 76 ----- docling_eval/evaluators/table_evaluator.py | 2 +- .../{ => legacy}/converters/__init__.py | 0 .../{ => legacy}/converters/conversion.py | 0 docling_eval/legacy/cvat_annotation/create.py | 4 +- .../create_dataset_from_pdfs.py | 4 +- .../legacy/cvat_annotation/preannotate.py | 2 +- docling_eval/legacy/doclaynet_v1/create.py | 10 +- docling_eval/legacy/doclaynet_v2/create.py | 10 +- docling_eval/legacy/dpbench/create.py | 14 +- docling_eval/legacy/funsd/create.py | 4 +- docling_eval/legacy/omnidocbench/create.py | 14 +- .../tableformer_huggingface_otsl/create.py | 8 +- docling_eval/legacy/xfund/create.py | 4 +- .../base_prediction_provider.py | 2 +- .../prediction_providers/docling_provider.py | 2 +- .../prediction_providers/file_provider.py | 2 +- .../tableformer_provider.py | 279 ++++++++++++++++- docling_eval/utils/utils.py | 2 +- docs/examples/benchmark_doclaynet_v1.py | 2 +- docs/examples/benchmark_doclaynet_v2.py | 2 +- docs/examples/benchmark_dpbench.py | 2 +- docs/examples/benchmark_omnidocbench.py | 2 +- .../benchmark_tableformer_fintabnet.py | 2 +- docs/examples/benchmark_tableformer_p1m.py | 2 +- .../benchmark_tableformer_pubtabnet.py | 2 +- docs/examples/create_funsd.py | 2 +- docs/examples/create_xfund.py | 2 +- docs/examples/package_pdfs.py | 4 +- docs/examples/package_pngs.py | 4 +- tests/test_dataset_builder.py | 8 +- 48 files changed, 361 insertions(+), 505 deletions(-) delete mode 100644 docling_eval/converters/models/__init__.py delete mode 100644 docling_eval/converters/models/reading_order/reading_order_updater.py delete mode 100644 docling_eval/converters/models/tableformer/__init__.py delete mode 100644 docling_eval/converters/models/tableformer/tf_model_prediction.py rename docling_eval/datamodels/{constants.py => types.py} (88%) delete mode 100644 docling_eval/evaluators/readingorder_evaluator_glm.py rename docling_eval/{ => legacy}/converters/__init__.py (100%) rename docling_eval/{ => legacy}/converters/conversion.py (100%) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index 82741978..d0174ccd 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -8,7 +8,7 @@ import typer from tabulate import tabulate # type: ignore -from docling_eval.datamodels.constants import ( +from docling_eval.datamodels.types import ( BenchMarkNames, ConverterTypes, EvaluationModality, @@ -284,7 +284,6 @@ def evaluate( json.dump(table_evaluation.model_dump(), fd, 
indent=2, sort_keys=True) elif modality == EvaluationModality.READING_ORDER: - # readingorder_evaluator = ReadingOrderEvaluatorGlm() readingorder_evaluator = ReadingOrderEvaluator() readingorder_evaluation = readingorder_evaluator(idir, split=split) diff --git a/docling_eval/converters/models/__init__.py b/docling_eval/converters/models/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/docling_eval/converters/models/reading_order/reading_order_updater.py b/docling_eval/converters/models/reading_order/reading_order_updater.py deleted file mode 100644 index 11a97ff5..00000000 --- a/docling_eval/converters/models/reading_order/reading_order_updater.py +++ /dev/null @@ -1,56 +0,0 @@ -import copy -import logging -from pathlib import Path -from typing import Optional - -from deepsearch_glm.andromeda_nlp import nlp_model # type: ignore -from docling.utils.glm_utils import to_docling_document -from docling_core.types.doc.document import DoclingDocument -from docling_core.types.doc.labels import DocItemLabel -from docling_core.utils.legacy import ( - doc_item_label_to_legacy_name, - docling_document_to_legacy, ) - -_log = logging.getLogger(__name__) - - -class ReadingOrderUpdater: - def __init__(self): - r""" """ - self._nlp_model = nlp_model(loglevel="error", text_ordering=True) - self._labels_forward_mapping = { - doc_item_label_to_legacy_name(v): v.value for v in DocItemLabel - } - - def __call__( - self, pdf_path: Path, true_doc: DoclingDocument - ) -> Optional[DoclingDocument]: - r""" """ - print(true_doc.name) - # deep copy of the true-document - pred_doc = copy.deepcopy(true_doc) - pred_doc_legacy = docling_document_to_legacy(pred_doc) - ds_doc_dict = pred_doc_legacy.model_dump(by_alias=True, exclude_none=True) - try: - # TODO: Understand why some documents fail here - glm_doc = self._nlp_model.apply_on_doc(ds_doc_dict) - except RuntimeError: - # print("nlp_model.apply_on_doc()") - return None - - # Map from value to key.value before calling to_docling_document - for page_element in glm_doc["page-elements"]: - page_element["name"] = self._labels_forward_mapping[page_element["name"]] - - # When true_doc.name == "ground-truth 01030000000016.pdf" - # pydantic_core._pydantic_core.ValidationError: 1 validation error for TextItem label - # Input should be one of the DocItemLabel literals accepted by TextItem - # [type=literal_error, input_type=DocItemLabel] - pred_doc = to_docling_document(glm_doc) - - return pred_doc diff --git a/docling_eval/converters/models/tableformer/__init__.py b/docling_eval/converters/models/tableformer/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/docling_eval/converters/models/tableformer/tf_model_prediction.py b/docling_eval/converters/models/tableformer/tf_model_prediction.py deleted file mode 100644 index 9aff3c48..00000000 --- a/docling_eval/converters/models/tableformer/tf_model_prediction.py +++ /dev/null @@ -1,289 +0,0 @@ -import copy -import logging -from io import BytesIO -from pathlib import Path -from typing import List, Optional, Tuple - -import numpy as np -from docling.datamodel.base_models import Cluster, LayoutPrediction, Page, Table -from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, - TableFormerMode, - TableStructureOptions, ) -from docling.models.table_structure_model import TableStructureModel -from docling_core.types.doc import DocItemLabel -from docling_core.types.doc.base import BoundingBox -from
docling_core.types.doc.document import ( - DoclingDocument, - TableCell, - TableData, - TableItem, -) -from docling_parse.pdf_parsers import pdf_parser_v2 -from PIL import Image -from pydantic import BaseModel - -from docling_eval.utils.utils import get_input_document - -# Get logger -log = logging.getLogger(__name__) - - -class PageToken(BaseModel): - bbox: BoundingBox - - text: str - id: int - - -class PageTokens(BaseModel): - tokens: List[PageToken] - - height: float - width: float - - -class TableFormerUpdater: - - def __init__( - self, - mode: TableFormerMode, - num_threads: int = 16, - artifacts_path: Optional[Path] = None, - ): - r""" """ - # Init the TableFormer model - table_structure_options = TableStructureOptions(mode=mode) - accelerator_options = AcceleratorOptions( - num_threads=num_threads, device=AcceleratorDevice.AUTO - ) - self._docling_tf_model = TableStructureModel( - enabled=True, - artifacts_path=artifacts_path, - options=table_structure_options, - accelerator_options=accelerator_options, - ) - log.info("Initialize %s", mode) - - def to_np(self, pil_image: Image.Image): - # Convert to NumPy array - np_image = np.array(pil_image) - - # Handle different formats - if np_image.ndim == 3: # RGB or RGBA image - if np_image.shape[2] == 4: # RGBA image - # Discard alpha channel and convert to BGR - np_image = np_image[:, :, :3] # Keep only RGB channels - - # Convert RGB to BGR by reversing the last axis - np_image = np_image[:, :, ::-1] - - return np_image - else: - raise ValueError("Unsupported image format") - - def get_page_cells(self, filename: str): - - parser = pdf_parser_v2("fatal") - - try: - key = "key" - parser.load_document(key=key, filename=filename) - - parsed_doc = parser.parse_pdf_from_key(key=key) - - parser.unload_document(key) - return parsed_doc - - except Exception as exc: - log.error(exc) - - return None - - def _make_internal_page_with_table(self, input_doc, prov): - page = Page(page_no=prov.page_no - 1) - page._backend = input_doc._backend.load_page(page.page_no) - page.cells = list(page._backend.get_text_cells()) - page.size = page._backend.get_size() - - if page._backend is not None and page._backend.is_valid(): - cluster = Cluster( - id=0, - label=DocItemLabel.TABLE, - bbox=prov.bbox.to_top_left_origin(page.size.height), - ) - for cell in page.cells: - overlap = cell.rect.to_bounding_box().intersection_area_with( - cluster.bbox - ) - if cell.rect.to_bounding_box().area() > 0: - overlap_ratio = overlap / cell.rect.to_bounding_box().area() - if overlap_ratio > 0.2: - cluster.cells.append(cell) - - page.predictions.layout = LayoutPrediction(clusters=[cluster]) - - return page - - def replace_tabledata( - self, - pdf_path: Path | BytesIO, - true_doc: DoclingDocument, - ) -> Tuple[bool, DoclingDocument]: - - updated = False - - # deep copy of the true-document - pred_doc = true_doc.model_copy(deep=True) - - input_doc = get_input_document(pdf_path) - if not input_doc.valid: - log.error("could not parse pdf-file") - return False, pred_doc - - conv_res = ConversionResult(input=input_doc) - - # parsed_doc = self.get_page_cells(str(pdf_path)) - # if parsed_doc is None: - # log.error("could not parse pdf-file") - # return False, pred_doc - - # Replace the groundtruth tables with predictions from TableFormer - for item, level in pred_doc.iterate_items(): - if isinstance(item, TableItem): - for prov in item.prov: - page = self._make_internal_page_with_table(input_doc, prov) - - page = next(self._docling_tf_model(conv_res, [page])) # type: ignore - tbl: Table = 
page.predictions.tablestructure.table_map[0] - table_data: TableData = TableData( - num_rows=tbl.num_rows, - num_cols=tbl.num_cols, - table_cells=tbl.table_cells, - ) - - item.data = table_data - page._backend.unload() - - updated = True - - # md = item.export_to_markdown() - # print("prediction from table-former: \n\n", md) - - return updated, pred_doc - - def _tf_predict_with_page_tokens( - self, - page_image: Image.Image, - page_tokens: PageTokens, - table_bbox: Tuple[float, float, float, float], - image_scale: float = 1.0, - ): - r""" - Test the TFPredictor - """ - table_bboxes = [[table_bbox[0], table_bbox[1], table_bbox[2], table_bbox[3]]] - - ocr_page = page_tokens.dict() - - ocr_page["image"] = self.to_np(page_image) - ocr_page["table_bboxes"] = table_bboxes - - # TODO: Here we bypass docling API and we steal the tf_preditor private object :-( - predictor = self._docling_tf_model.tf_predictor - - # Loop over the iocr_pages - tf_output = predictor.multi_table_predict( - ocr_page, - table_bboxes=table_bboxes, - do_matching=True, - correct_overlapping_cells=False, - sort_row_col_indexes=True, - ) - # print("tf-output: ", json.dumps(tf_output, indent=2)) - - table_out = tf_output[0] - - do_cell_matching = True - - table_cells = [] - for element in table_out["tf_responses"]: - - tc = TableCell.model_validate(element) - if do_cell_matching and tc.bbox is not None: - tc.bbox = tc.bbox.scaled(1 / image_scale) - table_cells.append(tc) - - # Retrieving cols/rows, after post processing: - num_rows = table_out["predict_details"]["num_rows"] - num_cols = table_out["predict_details"]["num_cols"] - - table_data = TableData( - num_rows=num_rows, num_cols=num_cols, table_cells=table_cells - ) - - return table_data - - def replace_tabledata_with_page_tokens( - self, - true_doc: DoclingDocument, - true_page_images: List[Image.Image], - page_tokens: Optional[PageTokens] = None, - ) -> Tuple[bool, DoclingDocument]: - - updated = False - - # deep copy of the true-document - pred_doc = copy.deepcopy(true_doc) - - assert len(pred_doc.pages) == 1 - page_size = pred_doc.pages[1].size - - # Replace the groundtruth tables with predictions from TableFormer - for item, level in pred_doc.iterate_items(): - if isinstance(item, TableItem): - for prov in item.prov: - - # md = item.export_to_markdown() - # print("groundtruth: \n\n", md) - - page_image = true_page_images[prov.page_no - 1] - # page_image.show() - - # Ensure that the bbox will be inside the min/max ranges - table_bbox = ( - max(prov.bbox.l, 0.0), - max(prov.bbox.b, 0.0), - min(prov.bbox.r, page_size.width), - min(prov.bbox.t, page_size.height), - ) - - if page_tokens is None: - ptokens = [] - for ix, table_cell in enumerate(item.data.table_cells): - pt = PageToken( - bbox=table_cell.bbox, text=table_cell.text, id=ix - ) - ptokens.append(pt) - page_tokens = PageTokens( - tokens=ptokens, - height=prov.bbox.height, - width=prov.bbox.width, - ) - - table_data = self._tf_predict_with_page_tokens( - page_image=page_image, - page_tokens=page_tokens, - table_bbox=table_bbox, - ) - item.data = table_data - - updated = True - - # md = item.export_to_markdown() - # print("prediction from table-former: \n\n", md) - - return updated, pred_doc diff --git a/docling_eval/datamodels/dataset_record.py b/docling_eval/datamodels/dataset_record.py index b0fd7fe5..50b7bc92 100644 --- a/docling_eval/datamodels/dataset_record.py +++ b/docling_eval/datamodels/dataset_record.py @@ -12,7 +12,7 @@ from docling_core.types.io import DocumentStream from pydantic import BaseModel, 
ConfigDict, Field, model_validator -from docling_eval.datamodels.constants import EvaluationModality, PredictionFormats +from docling_eval.datamodels.types import EvaluationModality, PredictionFormats class DatasetRecord( diff --git a/docling_eval/datamodels/constants.py b/docling_eval/datamodels/types.py similarity index 88% rename from docling_eval/datamodels/constants.py rename to docling_eval/datamodels/types.py index ce2f8b62..31b015e2 100644 --- a/docling_eval/datamodels/constants.py +++ b/docling_eval/datamodels/types.py @@ -1,4 +1,8 @@ from enum import Enum +from typing import List + +from docling_core.types.doc import BoundingBox +from pydantic import BaseModel class BenchMarkColumns(str, Enum): @@ -82,3 +86,17 @@ class PredictionFormats(str, Enum): JSON = "json" YAML = "yaml" DOCTAGS = "doctags" + + +class PageToken(BaseModel): + bbox: BoundingBox + + text: str + id: int + + +class PageTokens(BaseModel): + tokens: List[PageToken] + + height: float + width: float diff --git a/docling_eval/dataset_builders/doclaynet_v1_builder.py b/docling_eval/dataset_builders/doclaynet_v1_builder.py index d5d99101..3969733e 100644 --- a/docling_eval/dataset_builders/doclaynet_v1_builder.py +++ b/docling_eval/dataset_builders/doclaynet_v1_builder.py @@ -20,8 +20,8 @@ from docling_core.types.io import DocumentStream from tqdm import tqdm -from docling_eval.datamodels.constants import BenchMarkColumns, EvaluationModality from docling_eval.datamodels.dataset_record import DatasetRecord +from docling_eval.datamodels.types import BenchMarkColumns, EvaluationModality from docling_eval.dataset_builders.dataset_builder import ( BaseEvaluationDatasetBuilder, HFSource, diff --git a/docling_eval/dataset_builders/doclaynet_v2_builder.py b/docling_eval/dataset_builders/doclaynet_v2_builder.py index 95d7c5dc..9c641618 100644 --- a/docling_eval/dataset_builders/doclaynet_v2_builder.py +++ b/docling_eval/dataset_builders/doclaynet_v2_builder.py @@ -27,8 +27,8 @@ from PIL import Image from tqdm import tqdm -from docling_eval.datamodels.constants import BenchMarkColumns, EvaluationModality from docling_eval.datamodels.dataset_record import DatasetRecord +from docling_eval.datamodels.types import BenchMarkColumns, EvaluationModality from docling_eval.dataset_builders.dataset_builder import BaseEvaluationDatasetBuilder from docling_eval.utils.utils import ( classify_cells, diff --git a/docling_eval/dataset_builders/funsd_builder.py b/docling_eval/dataset_builders/funsd_builder.py index 618aeea7..3bb55c63 100644 --- a/docling_eval/dataset_builders/funsd_builder.py +++ b/docling_eval/dataset_builders/funsd_builder.py @@ -14,8 +14,8 @@ from PIL import Image from tqdm import tqdm -from docling_eval.datamodels.constants import BenchMarkColumns, EvaluationModality from docling_eval.datamodels.dataset_record import DatasetRecord +from docling_eval.datamodels.types import BenchMarkColumns, EvaluationModality from docling_eval.dataset_builders.dataset_builder import BaseEvaluationDatasetBuilder from docling_eval.utils.utils import ( classify_cells, diff --git a/docling_eval/dataset_builders/otsl_table_dataset_builder.py b/docling_eval/dataset_builders/otsl_table_dataset_builder.py index dd0a3fd7..91174bad 100644 --- a/docling_eval/dataset_builders/otsl_table_dataset_builder.py +++ b/docling_eval/dataset_builders/otsl_table_dataset_builder.py @@ -16,9 +16,12 @@ from docling_core.types.io import DocumentStream from tqdm import tqdm -from docling_eval.converters.models.tableformer.tf_model_prediction import PageTokens -from 
docling_eval.datamodels.constants import BenchMarkColumns, EvaluationModality from docling_eval.datamodels.dataset_record import DatasetRecord +from docling_eval.datamodels.types import ( + BenchMarkColumns, + EvaluationModality, + PageTokens, +) from docling_eval.dataset_builders.dataset_builder import ( BaseEvaluationDatasetBuilder, HFSource, diff --git a/docling_eval/dataset_builders/xfund_builder.py b/docling_eval/dataset_builders/xfund_builder.py index d9757d82..d0e4a4d6 100644 --- a/docling_eval/dataset_builders/xfund_builder.py +++ b/docling_eval/dataset_builders/xfund_builder.py @@ -15,8 +15,8 @@ from PIL import Image from tqdm import tqdm -from docling_eval.datamodels.constants import BenchMarkColumns, EvaluationModality from docling_eval.datamodels.dataset_record import DatasetRecord +from docling_eval.datamodels.types import BenchMarkColumns, EvaluationModality from docling_eval.dataset_builders.dataset_builder import BaseEvaluationDatasetBuilder from docling_eval.utils.utils import ( classify_cells, diff --git a/docling_eval/evaluators/base_evaluator.py b/docling_eval/evaluators/base_evaluator.py index 8a7416a2..3cda3466 100644 --- a/docling_eval/evaluators/base_evaluator.py +++ b/docling_eval/evaluators/base_evaluator.py @@ -4,7 +4,7 @@ from pydantic import BaseModel -from docling_eval.datamodels.constants import PredictionFormats +from docling_eval.datamodels.types import PredictionFormats _log = logging.getLogger(__name__) diff --git a/docling_eval/evaluators/base_readingorder_evaluator.py b/docling_eval/evaluators/base_readingorder_evaluator.py index 827bd322..b1e19996 100644 --- a/docling_eval/evaluators/base_readingorder_evaluator.py +++ b/docling_eval/evaluators/base_readingorder_evaluator.py @@ -15,7 +15,7 @@ from pydantic import BaseModel from tqdm import tqdm # type: ignore -from docling_eval.datamodels.constants import BenchMarkColumns +from docling_eval.datamodels.types import BenchMarkColumns from docling_eval.evaluators.stats import DatasetStatistics, compute_stats from docling_eval.visualisation.visualisations import draw_arrow diff --git a/docling_eval/evaluators/bbox_text_evaluator.py b/docling_eval/evaluators/bbox_text_evaluator.py index 8e22d27b..7511310d 100644 --- a/docling_eval/evaluators/bbox_text_evaluator.py +++ b/docling_eval/evaluators/bbox_text_evaluator.py @@ -13,7 +13,7 @@ from pydantic import BaseModel from tqdm import tqdm # type: ignore -from docling_eval.datamodels.constants import BenchMarkColumns # type: ignore +from docling_eval.datamodels.types import BenchMarkColumns # type: ignore from docling_eval.evaluators.stats import DatasetStatistics, compute_stats _log = logging.getLogger(__name__) diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index 8e72cd84..aa4e21a7 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -15,7 +15,7 @@ from torchmetrics.detection.mean_ap import MeanAveragePrecision from tqdm import tqdm # type: ignore -from docling_eval.datamodels.constants import BenchMarkColumns, PredictionFormats +from docling_eval.datamodels.types import BenchMarkColumns, PredictionFormats from docling_eval.evaluators.base_evaluator import BaseEvaluator, DatasetEvaluation from docling_eval.evaluators.stats import DatasetStatistics, compute_stats diff --git a/docling_eval/evaluators/markdown_text_evaluator.py b/docling_eval/evaluators/markdown_text_evaluator.py index f5690a57..8b391eef 100644 --- 
a/docling_eval/evaluators/markdown_text_evaluator.py +++ b/docling_eval/evaluators/markdown_text_evaluator.py @@ -14,7 +14,7 @@ from pydantic import BaseModel from tqdm import tqdm # type: ignore -from docling_eval.datamodels.constants import ( # type: ignore +from docling_eval.datamodels.types import ( # type: ignore BenchMarkColumns, PredictionFormats, ) diff --git a/docling_eval/evaluators/readingorder_evaluator_glm.py b/docling_eval/evaluators/readingorder_evaluator_glm.py deleted file mode 100644 index 7328f635..00000000 --- a/docling_eval/evaluators/readingorder_evaluator_glm.py +++ /dev/null @@ -1,76 +0,0 @@ -import copy -import logging -from typing import Dict, Optional - -from deepsearch_glm.andromeda_nlp import nlp_model # type: ignore -from docling_core.types.doc.document import DoclingDocument -from docling_core.utils.legacy import docling_document_to_legacy - -from docling_eval.evaluators.base_readingorder_evaluator import ( - BaseReadingOrderEvaluator, -) - -_log = logging.getLogger(__name__) - - -class ReadingOrderEvaluatorGlm(BaseReadingOrderEvaluator): - r""" - Evaluate the reading order using the Average Relative Distance metric - """ - - def __init__(self): - self._nlp_model = nlp_model(loglevel="error", text_ordering=True) - - def _get_reading_order_preds( - self, doc_id: str, true_doc: DoclingDocument - ) -> Optional[dict]: - r""" - Return dict with the bboxes and the predicted reading order or None if something goes wrong. - None is also returned if the document contains items with multiple provenances - - Returns - ------- - reading_order: Keys are "bboxes" and "pred_order". Return None if the document is broken. - """ - try: - page_size = true_doc.pages[1].size - - # Convert the bboxes to bottom-left coords before running the GLM - bboxes = [] - for item, level in true_doc.iterate_items(): - pred_len = len(item.prov) # type: ignore - if pred_len > 1: - _log.warning( - "Skipping document %s as it has %s provenances", - doc_id, - pred_len, - ) - return None - - # Convert the bbox to BOTTOM-LEFT origin - bbox = item.prov[0].bbox.to_bottom_left_origin(page_size.height) # type: ignore - item.prov[0].bbox = bbox # type: ignore - bboxes.append(copy.deepcopy(bbox)) - - # Run the reading order model - legacy_doc = docling_document_to_legacy(true_doc) - legacy_doc_dict = legacy_doc.model_dump(by_alias=True, exclude_none=True) - legacy_doc_dict = self._ensure_bboxes_in_legacy_tables(legacy_doc_dict) - glm_doc = self._nlp_model.apply_on_doc(legacy_doc_dict) - - # pred_to_origin_order: predicted order -> original order - pred_to_origin_order: Dict[int, int] = {} - for po, pe in enumerate(glm_doc["page-elements"]): - oo = pe["orig-order"] - pred_to_origin_order[po] = oo - - # pred_order: The index is the predicted order and the value is the original order - pred_order = [ - pred_to_origin_order[x] for x in range(len(pred_to_origin_order)) - ] - - reading_order = {"bboxes": bboxes, "pred_order": pred_order} - return reading_order - except RuntimeError as ex: - _log.error(str(ex)) - return None diff --git a/docling_eval/evaluators/table_evaluator.py b/docling_eval/evaluators/table_evaluator.py index 934beb2a..776dff52 100644 --- a/docling_eval/evaluators/table_evaluator.py +++ b/docling_eval/evaluators/table_evaluator.py @@ -13,7 +13,7 @@ from pydantic import BaseModel from tqdm import tqdm # type: ignore -from docling_eval.datamodels.constants import BenchMarkColumns, PredictionFormats +from docling_eval.datamodels.types import BenchMarkColumns, PredictionFormats from 
docling_eval.evaluators.base_evaluator import BaseEvaluator, DatasetEvaluation from docling_eval.evaluators.stats import DatasetStatistics, compute_stats from docling_eval.evaluators.teds import TEDScorer diff --git a/docling_eval/converters/__init__.py b/docling_eval/legacy/converters/__init__.py similarity index 100% rename from docling_eval/converters/__init__.py rename to docling_eval/legacy/converters/__init__.py diff --git a/docling_eval/converters/conversion.py b/docling_eval/legacy/converters/conversion.py similarity index 100% rename from docling_eval/converters/conversion.py rename to docling_eval/legacy/converters/conversion.py diff --git a/docling_eval/legacy/cvat_annotation/create.py b/docling_eval/legacy/cvat_annotation/create.py index 4a5176e7..1f3126b1 100644 --- a/docling_eval/legacy/cvat_annotation/create.py +++ b/docling_eval/legacy/cvat_annotation/create.py @@ -23,12 +23,12 @@ from PIL import Image # as PILImage from tqdm import tqdm # type: ignore -from docling_eval.converters.conversion import create_pdf_docling_converter -from docling_eval.datamodels.constants import ( +from docling_eval.datamodels.types import ( BenchMarkColumns, ConverterTypes, EvaluationModality, ) +from docling_eval.legacy.converters.conversion import create_pdf_docling_converter from docling_eval.legacy.cvat_annotation.utils import ( AnnotatedImage, AnnotationOverview, diff --git a/docling_eval/legacy/cvat_annotation/create_dataset_from_pdfs.py b/docling_eval/legacy/cvat_annotation/create_dataset_from_pdfs.py index 7107e6a8..a5554450 100644 --- a/docling_eval/legacy/cvat_annotation/create_dataset_from_pdfs.py +++ b/docling_eval/legacy/cvat_annotation/create_dataset_from_pdfs.py @@ -8,8 +8,8 @@ from datasets import Image as Features_Image from datasets import Sequence, Value -from docling_eval.converters.conversion import create_pdf_docling_converter -from docling_eval.datamodels.constants import BenchMarkColumns +from docling_eval.datamodels.types import BenchMarkColumns +from docling_eval.legacy.converters.conversion import create_pdf_docling_converter from docling_eval.utils.utils import ( docling_version, extract_images, diff --git a/docling_eval/legacy/cvat_annotation/preannotate.py b/docling_eval/legacy/cvat_annotation/preannotate.py index c7f1617c..e70b6884 100644 --- a/docling_eval/legacy/cvat_annotation/preannotate.py +++ b/docling_eval/legacy/cvat_annotation/preannotate.py @@ -12,7 +12,7 @@ from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel from tqdm import tqdm # type: ignore -from docling_eval.datamodels.constants import BenchMarkColumns +from docling_eval.datamodels.types import BenchMarkColumns from docling_eval.legacy.cvat_annotation.utils import ( AnnotatedDoc, AnnotatedImage, diff --git a/docling_eval/legacy/doclaynet_v1/create.py b/docling_eval/legacy/doclaynet_v1/create.py index b80ac862..bcfe5e50 100644 --- a/docling_eval/legacy/doclaynet_v1/create.py +++ b/docling_eval/legacy/doclaynet_v1/create.py @@ -20,15 +20,15 @@ from docling_core.types.io import DocumentStream from tqdm import tqdm # type: ignore -from docling_eval.converters.conversion import ( - create_pdf_docling_converter, - create_smol_docling_converter, -) -from docling_eval.datamodels.constants import ( +from docling_eval.datamodels.types import ( BenchMarkColumns, ConverterTypes, EvaluationModality, ) +from docling_eval.legacy.converters.conversion import ( + create_pdf_docling_converter, + create_smol_docling_converter, +) from docling_eval.utils.utils import ( 
add_pages_to_true_doc, crop_bounding_box, diff --git a/docling_eval/legacy/doclaynet_v2/create.py b/docling_eval/legacy/doclaynet_v2/create.py index da360169..52e79f3a 100644 --- a/docling_eval/legacy/doclaynet_v2/create.py +++ b/docling_eval/legacy/doclaynet_v2/create.py @@ -27,15 +27,15 @@ from docling_core.types.io import DocumentStream from tqdm import tqdm # type: ignore -from docling_eval.converters.conversion import ( - create_image_docling_converter, - create_smol_docling_converter, -) -from docling_eval.datamodels.constants import ( +from docling_eval.datamodels.types import ( BenchMarkColumns, ConverterTypes, EvaluationModality, ) +from docling_eval.legacy.converters.conversion import ( + create_image_docling_converter, + create_smol_docling_converter, +) from docling_eval.legacy.doclaynet_v1.create import ( PRED_HTML_EXPORT_LABELS, TRUE_HTML_EXPORT_LABELS, diff --git a/docling_eval/legacy/dpbench/create.py b/docling_eval/legacy/dpbench/create.py index edaf1fdc..85c7c57a 100644 --- a/docling_eval/legacy/dpbench/create.py +++ b/docling_eval/legacy/dpbench/create.py @@ -17,18 +17,16 @@ from PIL import Image # as PILImage from tqdm import tqdm -from docling_eval.converters.conversion import ( - create_pdf_docling_converter, - create_smol_docling_converter, -) -from docling_eval.converters.models.tableformer.tf_model_prediction import ( - TableFormerUpdater, -) -from docling_eval.datamodels.constants import ( +from docling_eval.datamodels.types import ( BenchMarkColumns, ConverterTypes, EvaluationModality, ) +from docling_eval.legacy.converters.conversion import ( + create_pdf_docling_converter, + create_smol_docling_converter, +) +from docling_eval.prediction_providers.tableformer_provider import TableFormerUpdater from docling_eval.utils.utils import ( add_pages_to_true_doc, convert_html_table_into_docling_tabledata, diff --git a/docling_eval/legacy/funsd/create.py b/docling_eval/legacy/funsd/create.py index bf9de885..7e6e15b1 100644 --- a/docling_eval/legacy/funsd/create.py +++ b/docling_eval/legacy/funsd/create.py @@ -12,8 +12,8 @@ from PIL import Image from tqdm import tqdm # type: ignore -from docling_eval.converters.conversion import create_image_docling_converter -from docling_eval.datamodels.constants import BenchMarkColumns, ConverterTypes +from docling_eval.datamodels.types import BenchMarkColumns, ConverterTypes +from docling_eval.legacy.converters.conversion import create_image_docling_converter from docling_eval.utils.utils import ( docling_version, extract_images, diff --git a/docling_eval/legacy/omnidocbench/create.py b/docling_eval/legacy/omnidocbench/create.py index dd36efa1..1fc93c1f 100644 --- a/docling_eval/legacy/omnidocbench/create.py +++ b/docling_eval/legacy/omnidocbench/create.py @@ -12,18 +12,16 @@ from PIL import Image # as PILImage from tqdm import tqdm # type: ignore -from docling_eval.converters.conversion import ( - create_pdf_docling_converter, - create_smol_docling_converter, -) -from docling_eval.converters.models.tableformer.tf_model_prediction import ( - TableFormerUpdater, -) -from docling_eval.datamodels.constants import ( +from docling_eval.datamodels.types import ( BenchMarkColumns, ConverterTypes, EvaluationModality, ) +from docling_eval.legacy.converters.conversion import ( + create_pdf_docling_converter, + create_smol_docling_converter, +) +from docling_eval.prediction_providers.tableformer_provider import TableFormerUpdater from docling_eval.utils.utils import ( add_pages_to_true_doc, convert_html_table_into_docling_tabledata, diff --git 
a/docling_eval/legacy/tableformer_huggingface_otsl/create.py b/docling_eval/legacy/tableformer_huggingface_otsl/create.py index c1f7fbd9..f64bb947 100644 --- a/docling_eval/legacy/tableformer_huggingface_otsl/create.py +++ b/docling_eval/legacy/tableformer_huggingface_otsl/create.py @@ -18,15 +18,13 @@ from docling_core.types.doc.labels import DocItemLabel from tqdm import tqdm # type: ignore -from docling_eval.converters.models.tableformer.tf_model_prediction import ( - PageTokens, - TableFormerUpdater, -) -from docling_eval.datamodels.constants import ( +from docling_eval.datamodels.types import ( BenchMarkColumns, ConverterTypes, EvaluationModality, + PageTokens, ) +from docling_eval.prediction_providers.tableformer_provider import TableFormerUpdater from docling_eval.utils.utils import ( convert_html_table_into_docling_tabledata, docling_version, diff --git a/docling_eval/legacy/xfund/create.py b/docling_eval/legacy/xfund/create.py index d7b4b6a9..03696d27 100644 --- a/docling_eval/legacy/xfund/create.py +++ b/docling_eval/legacy/xfund/create.py @@ -26,8 +26,8 @@ from PIL import Image from tqdm import tqdm # type: ignore -from docling_eval.converters.conversion import create_image_docling_converter -from docling_eval.datamodels.constants import BenchMarkColumns, ConverterTypes +from docling_eval.datamodels.types import BenchMarkColumns, ConverterTypes +from docling_eval.legacy.converters.conversion import create_image_docling_converter from docling_eval.utils.utils import ( crop_bounding_box, docling_version, diff --git a/docling_eval/prediction_providers/base_prediction_provider.py b/docling_eval/prediction_providers/base_prediction_provider.py index 464afbd4..0c025334 100644 --- a/docling_eval/prediction_providers/base_prediction_provider.py +++ b/docling_eval/prediction_providers/base_prediction_provider.py @@ -14,11 +14,11 @@ from docling_core.types.io import DocumentStream from tqdm import tqdm -from docling_eval.datamodels.constants import PredictionFormats from docling_eval.datamodels.dataset_record import ( DatasetRecord, DatasetRecordWithPrediction, ) +from docling_eval.datamodels.types import PredictionFormats from docling_eval.utils.utils import save_shard_to_disk, write_datasets_info from docling_eval.visualisation.visualisations import save_comparison_html_with_clusters diff --git a/docling_eval/prediction_providers/docling_provider.py b/docling_eval/prediction_providers/docling_provider.py index 9bb265b8..4fc552cf 100644 --- a/docling_eval/prediction_providers/docling_provider.py +++ b/docling_eval/prediction_providers/docling_provider.py @@ -4,11 +4,11 @@ from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.document_converter import DocumentConverter, FormatOption -from docling_eval.datamodels.constants import PredictionFormats from docling_eval.datamodels.dataset_record import ( DatasetRecord, DatasetRecordWithPrediction, ) +from docling_eval.datamodels.types import PredictionFormats from docling_eval.prediction_providers.base_prediction_provider import ( BasePredictionProvider, ) diff --git a/docling_eval/prediction_providers/file_provider.py b/docling_eval/prediction_providers/file_provider.py index aa679afd..c5591d4f 100644 --- a/docling_eval/prediction_providers/file_provider.py +++ b/docling_eval/prediction_providers/file_provider.py @@ -7,11 +7,11 @@ from docling_core.types.doc.document import DocTagsDocument, DocTagsPage from PIL import Image -from docling_eval.datamodels.constants import PredictionFormats from 
docling_eval.datamodels.dataset_record import ( DatasetRecord, DatasetRecordWithPrediction, ) +from docling_eval.datamodels.types import PredictionFormats from docling_eval.prediction_providers.base_prediction_provider import ( BasePredictionProvider, ) diff --git a/docling_eval/prediction_providers/tableformer_provider.py b/docling_eval/prediction_providers/tableformer_provider.py index 56802752..b4ebcaa6 100644 --- a/docling_eval/prediction_providers/tableformer_provider.py +++ b/docling_eval/prediction_providers/tableformer_provider.py @@ -1,22 +1,42 @@ import copy -from typing import Dict +import logging +from io import BytesIO +from pathlib import Path +from typing import Dict, List, Optional, Tuple -from docling.datamodel.base_models import ConversionStatus -from docling.datamodel.pipeline_options import TableFormerMode +import numpy as np +from docling.datamodel.base_models import ( + Cluster, + ConversionStatus, + LayoutPrediction, + Page, + Table, +) +from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, + TableFormerMode, + TableStructureOptions, +) +from docling.models.table_structure_model import TableStructureModel +from docling_core.types import DoclingDocument +from docling_core.types.doc import DocItemLabel, TableCell, TableData, TableItem from docling_core.types.io import DocumentStream +from docling_parse.pdf_parsers import pdf_parser_v2 +from PIL import Image -from docling_eval.converters.models.tableformer.tf_model_prediction import ( - TableFormerUpdater, -) -from docling_eval.datamodels.constants import PredictionFormats from docling_eval.datamodels.dataset_record import ( DatasetRecord, DatasetRecordWithPrediction, ) +from docling_eval.datamodels.types import PageToken, PageTokens, PredictionFormats from docling_eval.prediction_providers.base_prediction_provider import ( BasePredictionProvider, ) -from docling_eval.utils.utils import docling_models_version +from docling_eval.utils.utils import docling_models_version, get_input_document + +_log = logging.getLogger(__name__) class TableFormerPredictionProvider(BasePredictionProvider): @@ -65,3 +85,246 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: def info(self) -> Dict: return {"asset": "TableFormer", "version": docling_models_version()} + + +class TableFormerUpdater: + + def __init__( + self, + mode: TableFormerMode, + num_threads: int = 16, + artifacts_path: Optional[Path] = None, + ): + r""" """ + # Init the TableFormer model + table_structure_options = TableStructureOptions(mode=mode) + accelerator_options = AcceleratorOptions( + num_threads=num_threads, device=AcceleratorDevice.AUTO + ) + self._docling_tf_model = TableStructureModel( + enabled=True, + artifacts_path=artifacts_path, + options=table_structure_options, + accelerator_options=accelerator_options, + ) + _log.info("Initialize %s", mode) + + def to_np(self, pil_image: Image.Image): + # Convert to NumPy array + np_image = np.array(pil_image) + + # Handle different formats + if np_image.ndim == 3: # RGB or RGBA image + if np_image.shape[2] == 4: # RGBA image + # Discard alpha channel and convert to BGR + np_image = np_image[:, :, :3] # Keep only RGB channels + + # Convert RGB to BGR by reversing the last axis + np_image = np_image[:, :, ::-1] + + return np_image + else: + raise ValueError("Unsupported image format") + + def get_page_cells(self, filename: str): + + parser = pdf_parser_v2("fatal") + + try: + key = "key" + 
parser.load_document(key=key, filename=filename) + + parsed_doc = parser.parse_pdf_from_key(key=key) + + parser.unload_document(key) + return parsed_doc + + except Exception as exc: + _log.error(exc) + + return None + + def _make_internal_page_with_table(self, input_doc, prov): + page = Page(page_no=prov.page_no - 1) + page._backend = input_doc._backend.load_page(page.page_no) + page.cells = list(page._backend.get_text_cells()) + page.size = page._backend.get_size() + + if page._backend is not None and page._backend.is_valid(): + cluster = Cluster( + id=0, + label=DocItemLabel.TABLE, + bbox=prov.bbox.to_top_left_origin(page.size.height), + ) + for cell in page.cells: + overlap = cell.rect.to_bounding_box().intersection_area_with( + cluster.bbox + ) + if cell.rect.to_bounding_box().area() > 0: + overlap_ratio = overlap / cell.rect.to_bounding_box().area() + if overlap_ratio > 0.2: + cluster.cells.append(cell) + + page.predictions.layout = LayoutPrediction(clusters=[cluster]) + + return page + + def replace_tabledata( + self, + pdf_path: Path | BytesIO, + true_doc: DoclingDocument, + ) -> Tuple[bool, DoclingDocument]: + + updated = False + + # deep copy of the true-document + pred_doc = true_doc.model_copy(deep=True) + + input_doc = get_input_document(pdf_path) + if not input_doc.valid: + _log.error("could not parse pdf-file") + return False, pred_doc + + conv_res = ConversionResult(input=input_doc) + + # parsed_doc = self.get_page_cells(str(pdf_path)) + # if parsed_doc is None: + # log.error("could not parse pdf-file") + # return False, pred_doc + + # Replace the groundtruth tables with predictions from TableFormer + for item, level in pred_doc.iterate_items(): + if isinstance(item, TableItem): + for prov in item.prov: + page = self._make_internal_page_with_table(input_doc, prov) + + page = next(self._docling_tf_model(conv_res, [page])) # type: ignore + tbl: Table = page.predictions.tablestructure.table_map[0] + table_data: TableData = TableData( + num_rows=tbl.num_rows, + num_cols=tbl.num_cols, + table_cells=tbl.table_cells, + ) + + item.data = table_data + page._backend.unload() + + updated = True + + # md = item.export_to_markdown() + # print("prediction from table-former: \n\n", md) + + return updated, pred_doc + + def _tf_predict_with_page_tokens( + self, + page_image: Image.Image, + page_tokens: PageTokens, + table_bbox: Tuple[float, float, float, float], + image_scale: float = 1.0, + ): + r""" + Run the TFPredictor on a page image with pre-computed page tokens. + """ + table_bboxes = [[table_bbox[0], table_bbox[1], table_bbox[2], table_bbox[3]]] + + ocr_page = page_tokens.dict() + + ocr_page["image"] = self.to_np(page_image) + ocr_page["table_bboxes"] = table_bboxes + + # TODO: Here we bypass the docling API and reach into the private tf_predictor object :-( + predictor = self._docling_tf_model.tf_predictor + + # Run the predictor on the assembled OCR page + tf_output = predictor.multi_table_predict( + ocr_page, + table_bboxes=table_bboxes, + do_matching=True, + correct_overlapping_cells=False, + sort_row_col_indexes=True, + ) + # print("tf-output: ", json.dumps(tf_output, indent=2)) + + table_out = tf_output[0] + + do_cell_matching = True + + table_cells = [] + for element in table_out["tf_responses"]: + + tc = TableCell.model_validate(element) + if do_cell_matching and tc.bbox is not None: + tc.bbox = tc.bbox.scaled(1 / image_scale) + table_cells.append(tc) + + # Retrieve cols/rows after post-processing: + num_rows = table_out["predict_details"]["num_rows"] + num_cols = table_out["predict_details"]["num_cols"] + + table_data = TableData( 
num_rows=num_rows, num_cols=num_cols, table_cells=table_cells + ) + + return table_data + + def replace_tabledata_with_page_tokens( + self, + true_doc: DoclingDocument, + true_page_images: List[Image.Image], + page_tokens: Optional[PageTokens] = None, + ) -> Tuple[bool, DoclingDocument]: + + updated = False + + # deep copy of the true-document + pred_doc = copy.deepcopy(true_doc) + + assert len(pred_doc.pages) == 1 + page_size = pred_doc.pages[1].size + + # Replace the groundtruth tables with predictions from TableFormer + for item, level in pred_doc.iterate_items(): + if isinstance(item, TableItem): + for prov in item.prov: + + # md = item.export_to_markdown() + # print("groundtruth: \n\n", md) + + page_image = true_page_images[prov.page_no - 1] + # page_image.show() + + # Ensure that the bbox will be inside the min/max ranges + table_bbox = ( + max(prov.bbox.l, 0.0), + max(prov.bbox.b, 0.0), + min(prov.bbox.r, page_size.width), + min(prov.bbox.t, page_size.height), + ) + + if page_tokens is None: + ptokens = [] + for ix, table_cell in enumerate(item.data.table_cells): + pt = PageToken( + bbox=table_cell.bbox, text=table_cell.text, id=ix + ) + ptokens.append(pt) + page_tokens = PageTokens( + tokens=ptokens, + height=prov.bbox.height, + width=prov.bbox.width, + ) + + table_data = self._tf_predict_with_page_tokens( + page_image=page_image, + page_tokens=page_tokens, + table_bbox=table_bbox, + ) + item.data = table_data + + updated = True + + # md = item.export_to_markdown() + # print("prediction from table-former: \n\n", md) + + return updated, pred_doc diff --git a/docling_eval/utils/utils.py b/docling_eval/utils/utils.py index 738ea76a..a2594a50 100644 --- a/docling_eval/utils/utils.py +++ b/docling_eval/utils/utils.py @@ -30,7 +30,7 @@ from PIL import Image from pydantic import AnyUrl -from docling_eval.datamodels.constants import BenchMarkColumns +from docling_eval.datamodels.types import BenchMarkColumns def get_binhash(binary_data: bytes) -> str: diff --git a/docs/examples/benchmark_doclaynet_v1.py b/docs/examples/benchmark_doclaynet_v1.py index 64cc64eb..7eda76b7 100644 --- a/docs/examples/benchmark_doclaynet_v1.py +++ b/docs/examples/benchmark_doclaynet_v1.py @@ -3,7 +3,7 @@ from pathlib import Path from docling_eval.cli.main import evaluate, visualise -from docling_eval.datamodels.constants import BenchMarkNames, EvaluationModality +from docling_eval.datamodels.types import BenchMarkNames, EvaluationModality from docling_eval.legacy.doclaynet_v1.create import create_dlnv1_e2e_dataset # Configure logging diff --git a/docs/examples/benchmark_doclaynet_v2.py b/docs/examples/benchmark_doclaynet_v2.py index cd0f3749..a3d90c60 100644 --- a/docs/examples/benchmark_doclaynet_v2.py +++ b/docs/examples/benchmark_doclaynet_v2.py @@ -5,7 +5,7 @@ from tabulate import tabulate # type: ignore from docling_eval.cli.main import evaluate, visualise -from docling_eval.datamodels.constants import BenchMarkNames, EvaluationModality +from docling_eval.datamodels.types import BenchMarkNames, EvaluationModality from docling_eval.legacy.doclaynet_v2.create import create_dlnv2_e2e_dataset # Configure logging diff --git a/docs/examples/benchmark_dpbench.py b/docs/examples/benchmark_dpbench.py index e7d3050b..c3155ca2 100644 --- a/docs/examples/benchmark_dpbench.py +++ b/docs/examples/benchmark_dpbench.py @@ -5,7 +5,7 @@ from huggingface_hub import snapshot_download from docling_eval.cli.main import evaluate, visualise -from docling_eval.datamodels.constants import BenchMarkNames, EvaluationModality +from 
docling_eval.datamodels.types import BenchMarkNames, EvaluationModality from docling_eval.legacy.dpbench.create import ( create_dpbench_e2e_dataset, create_dpbench_tableformer_dataset, diff --git a/docs/examples/benchmark_omnidocbench.py b/docs/examples/benchmark_omnidocbench.py index 007576a3..00e51c3f 100644 --- a/docs/examples/benchmark_omnidocbench.py +++ b/docs/examples/benchmark_omnidocbench.py @@ -5,7 +5,7 @@ from huggingface_hub import snapshot_download from docling_eval.cli.main import evaluate, visualise -from docling_eval.datamodels.constants import BenchMarkNames, EvaluationModality +from docling_eval.datamodels.types import BenchMarkNames, EvaluationModality from docling_eval.legacy.omnidocbench.create import ( create_omnidocbench_e2e_dataset, create_omnidocbench_tableformer_dataset, diff --git a/docs/examples/benchmark_tableformer_fintabnet.py b/docs/examples/benchmark_tableformer_fintabnet.py index 70008e07..d7c6fc4a 100644 --- a/docs/examples/benchmark_tableformer_fintabnet.py +++ b/docs/examples/benchmark_tableformer_fintabnet.py @@ -3,7 +3,7 @@ from pathlib import Path from docling_eval.cli.main import evaluate, visualise -from docling_eval.datamodels.constants import BenchMarkNames, EvaluationModality +from docling_eval.datamodels.types import BenchMarkNames, EvaluationModality from docling_eval.legacy.tableformer_huggingface_otsl.create import ( create_fintabnet_tableformer_dataset, ) diff --git a/docs/examples/benchmark_tableformer_p1m.py b/docs/examples/benchmark_tableformer_p1m.py index 9f0ba6a9..8b5baa8b 100644 --- a/docs/examples/benchmark_tableformer_p1m.py +++ b/docs/examples/benchmark_tableformer_p1m.py @@ -3,7 +3,7 @@ from pathlib import Path from docling_eval.cli.main import evaluate, visualise -from docling_eval.datamodels.constants import BenchMarkNames, EvaluationModality +from docling_eval.datamodels.types import BenchMarkNames, EvaluationModality from docling_eval.legacy.tableformer_huggingface_otsl.create import ( create_p1m_tableformer_dataset, ) diff --git a/docs/examples/benchmark_tableformer_pubtabnet.py b/docs/examples/benchmark_tableformer_pubtabnet.py index 2f4810c5..7bb28aa5 100644 --- a/docs/examples/benchmark_tableformer_pubtabnet.py +++ b/docs/examples/benchmark_tableformer_pubtabnet.py @@ -3,7 +3,7 @@ from pathlib import Path from docling_eval.cli.main import evaluate, visualise -from docling_eval.datamodels.constants import BenchMarkNames, EvaluationModality +from docling_eval.datamodels.types import BenchMarkNames, EvaluationModality from docling_eval.legacy.tableformer_huggingface_otsl.create import ( create_pubtabnet_tableformer_dataset, ) diff --git a/docs/examples/create_funsd.py b/docs/examples/create_funsd.py index 69b506b4..e87cccf3 100644 --- a/docs/examples/create_funsd.py +++ b/docs/examples/create_funsd.py @@ -2,7 +2,7 @@ import os from pathlib import Path -from docling_eval.datamodels.constants import BenchMarkNames +from docling_eval.datamodels.types import BenchMarkNames from docling_eval.legacy.funsd.create import create_funsd_dataset # Configure logging diff --git a/docs/examples/create_xfund.py b/docs/examples/create_xfund.py index 2eade602..99d934bf 100644 --- a/docs/examples/create_xfund.py +++ b/docs/examples/create_xfund.py @@ -2,7 +2,7 @@ import os from pathlib import Path -from docling_eval.datamodels.constants import BenchMarkNames +from docling_eval.datamodels.types import BenchMarkNames from docling_eval.legacy.xfund.create import create_xfund_dataset # Configure logging diff --git a/docs/examples/package_pdfs.py 
b/docs/examples/package_pdfs.py index daa58117..e6eb4601 100644 --- a/docs/examples/package_pdfs.py +++ b/docs/examples/package_pdfs.py @@ -8,8 +8,8 @@ from docling_core.types.doc.labels import DocItemLabel from tqdm import tqdm # type: ignore -from docling_eval.converters.conversion import create_pdf_docling_converter -from docling_eval.datamodels.constants import BenchMarkColumns +from docling_eval.datamodels.types import BenchMarkColumns +from docling_eval.legacy.converters.conversion import create_pdf_docling_converter from docling_eval.utils.utils import ( docling_version, from_pil_to_base64, diff --git a/docs/examples/package_pngs.py b/docs/examples/package_pngs.py index 990bf9f7..b90bf749 100644 --- a/docs/examples/package_pngs.py +++ b/docs/examples/package_pngs.py @@ -9,8 +9,8 @@ from docling_core.types.doc.labels import DocItemLabel from tqdm import tqdm # type: ignore -from docling_eval.converters.conversion import create_image_docling_converter -from docling_eval.datamodels.constants import BenchMarkColumns +from docling_eval.datamodels.types import BenchMarkColumns +from docling_eval.legacy.converters.conversion import create_image_docling_converter from docling_eval.utils.utils import ( docling_version, extract_images, diff --git a/tests/test_dataset_builder.py b/tests/test_dataset_builder.py index 47e7e112..1bf4066c 100644 --- a/tests/test_dataset_builder.py +++ b/tests/test_dataset_builder.py @@ -12,7 +12,7 @@ from docling.models.factories import get_ocr_factory from docling_eval.cli.main import evaluate -from docling_eval.datamodels.constants import ( +from docling_eval.datamodels.types import ( BenchMarkNames, EvaluationModality, PredictionFormats, @@ -117,7 +117,7 @@ def test_run_doclaynet_with_doctags_fileprovider(): dataset_layout.retrieve_input_dataset() # fetches the source dataset from HF dataset_layout.save_to_disk( - chunk_size=5, max_num_chunks=1 + chunk_size=80 ) # does all the job of iterating the dataset, making GT+prediction records, and saving them in shards as parquet. 
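# A minimal before/after sketch of the module rename this patch applies throughout
# the repo; both import paths and all names below appear verbatim in this diff:
#
#   # before this patch:
#   # from docling_eval.datamodels.constants import BenchMarkNames, EvaluationModality
#   # after this patch:
#   from docling_eval.datamodels.types import BenchMarkNames, EvaluationModality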
file_provider.create_prediction_dataset( @@ -322,7 +322,7 @@ def test_run_fintabnet_builder(): evaluate( modality=EvaluationModality.TABLE_STRUCTURE, - benchmark=BenchMarkNames.DPBENCH, + benchmark=BenchMarkNames.FINTABNET, idir=target_path / "eval_dataset", odir=target_path / "evaluations" / EvaluationModality.TABLE_STRUCTURE.value, ) @@ -349,7 +349,7 @@ def test_run_p1m_builder(): evaluate( modality=EvaluationModality.TABLE_STRUCTURE, - benchmark=BenchMarkNames.DPBENCH, + benchmark=BenchMarkNames.PUB1M, idir=target_path / "eval_dataset", odir=target_path / "evaluations" / EvaluationModality.TABLE_STRUCTURE.value, ) From 86744a7732ef56e9df6bb10d71305887c59eda11 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Wed, 26 Mar 2025 21:37:12 +0100 Subject: [PATCH 2/4] Small test fixes Signed-off-by: Christoph Auer --- tests/test_dataset_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dataset_builder.py b/tests/test_dataset_builder.py index 1bf4066c..035e17cf 100644 --- a/tests/test_dataset_builder.py +++ b/tests/test_dataset_builder.py @@ -105,7 +105,7 @@ def test_run_doclaynet_with_doctags_fileprovider(): file_provider = FilePredictionProvider( prediction_format=PredictionFormats.DOCTAGS, source_path=Path("./tests/data/doclaynet_v1_doctags_sample"), - do_visualization=True, + do_visualization=False, ignore_missing_files=True, ) From a406f616f68d0afad4d4a7f86e009873926eb467 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Thu, 27 Mar 2025 22:17:47 +0100 Subject: [PATCH 3/4] Small test fixes Signed-off-by: Christoph Auer --- .../dataset_builders/dataset_builder.py | 107 ++++- .../dataset_builders/doclaynet_v1_builder.py | 137 +++--- .../dataset_builders/doclaynet_v2_builder.py | 70 +-- .../dataset_builders/dpbench_builder.py | 110 +++-- .../dataset_builders/funsd_builder.py | 105 ++++- .../dataset_builders/omnidocbench_builder.py | 148 ++++--- .../otsl_table_dataset_builder.py | 111 ++++- .../dataset_builders/xfund_builder.py | 103 ++++- .../base_prediction_provider.py | 230 ++++++++-- .../prediction_providers/docling_provider.py | 60 ++- .../prediction_providers/file_provider.py | 210 ++++++--- .../tableformer_provider.py | 408 ++++++++++++------ tests/test_dataset_builder.py | 15 +- 13 files changed, 1340 insertions(+), 474 deletions(-) diff --git a/docling_eval/dataset_builders/dataset_builder.py b/docling_eval/dataset_builders/dataset_builder.py index a984a23d..7d94e636 100644 --- a/docling_eval/dataset_builders/dataset_builder.py +++ b/docling_eval/dataset_builders/dataset_builder.py @@ -1,8 +1,9 @@ +import logging import os import sys from abc import abstractmethod from pathlib import Path -from typing import Iterable, Optional, Union +from typing import Any, Dict, Iterable, List, Optional, Union from docling.utils.utils import chunkify from huggingface_hub import snapshot_download @@ -11,6 +12,9 @@ from docling_eval.datamodels.dataset_record import DatasetRecord from docling_eval.utils.utils import save_shard_to_disk, write_datasets_info +# Get logger +_log = logging.getLogger(__name__) + class HFSource(BaseModel): repo_id: str @@ -24,29 +28,55 @@ class S3Source(BaseModel): class BaseEvaluationDatasetBuilder: + """ + Base class for dataset builders that create evaluation datasets. + + This class provides common functionality for retrieving datasets, + applying index ranges, and saving processed data to disk. 
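# A usage sketch (assumed, not part of the patch) for the index-window API defined
# below; DocLayNetV1DatasetBuilder and the method names come from this diff, while
# the target path and index values are illustrative only:
#
#   builder = DocLayNetV1DatasetBuilder(
#       target=Path("out"),   # illustrative output directory
#       split="test",
#       begin_index=0,        # inclusive start of the processing window
#       end_index=100,        # exclusive end; -1 would process all items
#   )
#   builder.retrieve_input_dataset()  # must be called before save_to_disk()
#   builder.save_to_disk(chunk_size=80)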
+ """ + def __init__( self, name: str, dataset_source: Union[HFSource, S3Source, Path], target: Path, split: str = "test", + begin_index: int = 0, + end_index: int = -1, ): + """ + Initialize the dataset builder. + + Args: + name: Name of the dataset + dataset_source: Source of the dataset (HuggingFace, S3, or local path) + target: Path where processed dataset will be saved + split: Dataset split to use (train, test, etc.) + begin_index: Start index for processing (inclusive) + end_index: End index for processing (exclusive), -1 means process all + """ self.name = name self.target: Path = target - self.dataset_source = dataset_source - self.dataset_local_path: Optional[Path] = None # TBD + self.dataset_local_path: Optional[Path] = None self.split = split + self.begin_index = begin_index + self.end_index = end_index self.retrieved = False def retrieve_input_dataset(self) -> Path: + """ + Download and retrieve the input dataset. + + Returns: + Path to the retrieved dataset + """ if isinstance(self.dataset_source, HFSource): if not self.dataset_local_path: path_str = snapshot_download( repo_id=self.dataset_source.repo_id, repo_type="dataset", token=self.dataset_source.hf_token, - # local_dir=self.target, ) path: Path = Path(path_str) self.dataset_local_path = path @@ -66,24 +96,82 @@ def retrieve_input_dataset(self) -> Path: ) self.retrieved = True - return path + def get_effective_indices(self, total_items: int) -> tuple[int, int]: + """ + Calculate the effective begin and end indices based on dataset size. + + Args: + total_items: Total number of items available + + Returns: + Tuple of (effective_begin_index, effective_end_index) + """ + begin = self.begin_index if self.begin_index >= 0 else 0 + end = self.end_index if self.end_index > 0 else total_items + end = min(end, total_items) + + if begin >= total_items: + _log.warning( + f"Begin index ({begin}) is greater than or equal to dataset size ({total_items}). " + f"No items will be processed." + ) + begin = total_items + + _log.info( + f"Processing range [{begin}:{end}] out of {total_items} total items " + f"({end - begin} items)" + ) + + return begin, end + + def log_dataset_stats(self, total_items: int, selected_items: int) -> None: + """ + Log dataset statistics for debugging. + + Args: + total_items: Total number of items in the dataset + selected_items: Number of items selected after applying indices + """ + _log.info( + f"Dataset '{self.name}' total items: {total_items}. " + f"Selected range: [{self.begin_index}, {self.end_index}] = {selected_items} items" + ) + @abstractmethod def iterate(self) -> Iterable[DatasetRecord]: + """ + Iterate through the dataset and yield DatasetRecord objects. + + Implementations should respect begin_index and end_index. + + Returns: + Iterable of DatasetRecord objects + """ pass - def save_to_disk(self, chunk_size: int = 80, max_num_chunks: int = sys.maxsize): + def save_to_disk( + self, chunk_size: int = 80, max_num_chunks: int = sys.maxsize + ) -> None: + """ + Save the dataset to disk in chunks. + + Args: + chunk_size: Number of records per chunk + max_num_chunks: Maximum number of chunks to save + """ if not self.retrieved: raise RuntimeError( "You must first retrieve the source dataset. Call retrieve_input_dataset()." 
) test_dir = self.target / self.split - os.makedirs(test_dir, exist_ok=True) + test_dir.mkdir(parents=True, exist_ok=True) count = 0 chunk_count = 0 + for record_chunk in chunkify(self.iterate(), chunk_size): record_chunk = [r.as_record_dict() for r in record_chunk] save_shard_to_disk( @@ -93,8 +181,13 @@ def save_to_disk(self, chunk_size: int = 80, max_num_chunks: int = sys.maxsize): chunk_count += 1 if chunk_count >= max_num_chunks: + _log.info( + f"Reached maximum number of chunks ({max_num_chunks}). Stopping." + ) break + _log.info(f"Saved {count} records in {chunk_count} chunks to {test_dir}") + write_datasets_info( name=self.name, output_dir=self.target, diff --git a/docling_eval/dataset_builders/doclaynet_v1_builder.py b/docling_eval/dataset_builders/doclaynet_v1_builder.py index 3969733e..c2d725ba 100644 --- a/docling_eval/dataset_builders/doclaynet_v1_builder.py +++ b/docling_eval/dataset_builders/doclaynet_v1_builder.py @@ -2,14 +2,16 @@ import logging import os from pathlib import Path -from typing import Iterable +from typing import Iterable, List, Optional, Set +import PIL.Image from datasets import load_dataset from docling_core.types import DoclingDocument from docling_core.types.doc import ( BoundingBox, CoordOrigin, DocItemLabel, + GroupItem, GroupLabel, ImageRef, ProvenanceItem, @@ -37,8 +39,8 @@ # Get logger _log = logging.getLogger(__name__) -# Blacklisted document IDs -BLACKLISTED_DOC_IDS = [ +# Blacklisted document IDs (documents with known issues) +BLACKLISTED_DOC_IDS: Set[str] = { "f556167ac3284665652050b1b0bc1e6f5af27f54f17f27566c60c80f6f134a92", "dbc51622cbe9b8766f44db3b3fda8d0a745da06b9bfec9935bd003d2bdd494c8", "d4c0401fffc04d24e629a9fada23266a3b492ea63e889641b3c33adf815d44e3", @@ -64,9 +66,10 @@ "1763e54be635759ccb66ebb462548f8a40d44567f62cecc5ca26f22acd28e823", "048a570b2e415b653a62313ef82504adfda480c99f69826fcbeb67758ea3c7a4", "0261791e343389682847c913a16789776d0ba41a584901571846c7ddab3cbaa6", -] +} -TRUE_HTML_EXPORT_LABELS = { +# Labels to export in HTML visualization +TRUE_HTML_EXPORT_LABELS: Set[DocItemLabel] = { DocItemLabel.TITLE, DocItemLabel.DOCUMENT_INDEX, DocItemLabel.SECTION_HEADER, @@ -86,7 +89,7 @@ DocItemLabel.FOOTNOTE, } -PRED_HTML_EXPORT_LABELS = { +PRED_HTML_EXPORT_LABELS: Set[DocItemLabel] = { DocItemLabel.TITLE, DocItemLabel.DOCUMENT_INDEX, DocItemLabel.SECTION_HEADER, @@ -105,29 +108,39 @@ DocItemLabel.FOOTNOTE, } -SHARD_SIZE = 100 - class DocLayNetV1DatasetBuilder(BaseEvaluationDatasetBuilder): - """DocLayNet V1 dataset builder implementing the base dataset builder interface.""" + """ + DocLayNet V1 dataset builder implementing the base dataset builder interface. + + This builder processes the DocLayNet V1.2 dataset, which contains document + layout annotations for a variety of document types. + """ def __init__( self, - # prediction_provider: BasePredictionProvider, target: Path, split: str = "test", begin_index: int = 0, end_index: int = -1, ): + """ + Initialize the DocLayNet V1 dataset builder. 
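+
+        A minimal construction sketch (the target path is illustrative):
+
+            builder = DocLayNetV1DatasetBuilder(
+                target=Path("./benchmarks/doclaynet_v1"),
+                begin_index=0,
+                end_index=100,
+            )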
+ + Args: + target: Path where processed dataset will be saved + split: Dataset split to use + begin_index: Start index for processing (inclusive) + end_index: End index for processing (exclusive), -1 means process all + """ super().__init__( name="DocLayNetV1", dataset_source=HFSource(repo_id="ds4sd/DocLayNet-v1.2"), - # prediction_provider=prediction_provider, target=target, split=split, + begin_index=begin_index, + end_index=end_index, ) - self.begin_index = begin_index - self.end_index = end_index self.blacklisted_ids = set(BLACKLISTED_DOC_IDS) self.category_map = { 1: "caption", @@ -144,15 +157,44 @@ def __init__( } @staticmethod - def ltwh_to_ltrb(box): - """Convert left, top, width, height format to left, top, right, bottom.""" + def ltwh_to_ltrb(box: List[float]) -> List[float]: + """ + Convert left, top, width, height format to left, top, right, bottom. + + Args: + box: Box in [left, top, width, height] format + + Returns: + Box in [left, top, right, bottom] format + """ l, t, w, h = box - return l, t, l + w, t + h + return [l, t, l + w, t + h] def update_doc_with_gt( - self, doc: DoclingDocument, current_list, img, old_size, label_str, box, content - ): - """Add an element to the document based on its label type.""" + self, + doc: DoclingDocument, + current_list: Optional[GroupItem], # This was incorrectly typed as str | None + img: PIL.Image.Image, + old_size: Size, + label_str: str, + box: List[float], + content: str, + ) -> Optional[GroupItem]: # Return type should match the parameter type + """ + Add an element to the document based on its label type. + + Args: + doc: DoclingDocument to update + current_list: Current list group for list items + img: Page image + old_size: Original page size + label_str: Element label as string + box: Bounding box coordinates + content: Text content + + Returns: + Updated list group or None + """ # Map string label to DocItemLabel label_map = { "caption": DocItemLabel.CAPTION, @@ -236,7 +278,12 @@ def update_doc_with_gt( return current_list def iterate(self) -> Iterable[DatasetRecord]: - """Iterate through the dataset and yield DatasetRecord objects.""" + """ + Iterate through the dataset and yield DatasetRecord objects. + + Yields: + DatasetRecord objects + """ if not self.retrieved: raise RuntimeError( "You must first retrieve the source dataset. Call retrieve_input_dataset()." @@ -247,27 +294,16 @@ def iterate(self) -> Iterable[DatasetRecord]: # Load dataset from the retrieved path ds = load_dataset("ds4sd/DocLayNet-v1.2", split=self.split) - # Apply index ranges if specified + # Apply HuggingFace's select method for index ranges total_ds_len = len(ds) - begin_index = self.begin_index - end_index = self.end_index - - if end_index == -1 or end_index > total_ds_len: - end_index = total_ds_len - - if begin_index < 0: - begin_index = 0 + begin, end = self.get_effective_indices(total_ds_len) - ds = ds.select(range(begin_index, end_index)) + # Select the range (HuggingFace datasets have a convenient select method) + ds = ds.select(range(begin, end)) selected_ds_len = len(ds) - _log.info( - "Dataset len: %s. 
Selected range: [%s, %s] = %s", - total_ds_len, - begin_index, - end_index, - selected_ds_len, - ) + # Log stats + self.log_dataset_stats(total_ds_len, selected_ds_len) skipped_rows = 0 exported_rows = 0 @@ -331,7 +367,6 @@ def iterate(self) -> Iterable[DatasetRecord]: # Create dataset record record = DatasetRecord( - # predictor_info=self.prediction_provider.info(), doc_id=page_hash, doc_hash=get_binhash(pdf), ground_truth_doc=true_doc, @@ -345,32 +380,6 @@ def iterate(self) -> Iterable[DatasetRecord]: ground_truth_page_images=true_page_images, ) - # Update prediction - # self.update_prediction(record) - - # Extract images from the predicted document if available - # if record.predicted_doc is not None: - # pred_doc, pred_pictures, pred_page_images = extract_images( - # document=record.predicted_doc, - # pictures_column=BenchMarkColumns.PREDICTION_PICTURES, - # page_images_column=BenchMarkColumns.PREDICTION_PAGE_IMAGES, - # ) - # record.predicted_doc = pred_doc - # record.predicted_pictures = pred_pictures - # record.predicted_page_images = pred_page_images - - # # Create visualization if requested - # if self.do_visualization and record.predicted_doc is not None: - # save_comparison_html_with_clusters( - # filename=viz_dir / f"{page_hash}-clusters.html", - # true_doc=true_doc, - # pred_doc=record.predicted_doc, - # page_image=img, - # true_labels=TRUE_HTML_EXPORT_LABELS, - # pred_labels=PRED_HTML_EXPORT_LABELS, - # draw_reading_order=False, - # ) - exported_rows += 1 yield record diff --git a/docling_eval/dataset_builders/doclaynet_v2_builder.py b/docling_eval/dataset_builders/doclaynet_v2_builder.py index 9c641618..29e5b46e 100644 --- a/docling_eval/dataset_builders/doclaynet_v2_builder.py +++ b/docling_eval/dataset_builders/doclaynet_v2_builder.py @@ -4,7 +4,7 @@ import os import re from pathlib import Path -from typing import Dict, Iterable, List, Optional +from typing import Any, Dict, Iterable, List, Optional, Tuple from datasets import load_from_disk from docling_core.types import DoclingDocument @@ -43,36 +43,41 @@ class DocLayNetV2DatasetBuilder(BaseEvaluationDatasetBuilder): - """DocLayNet V2 dataset builder implementing the base dataset builder interface.""" + """ + DocLayNet V2 dataset builder implementing the base dataset builder interface. + + This builder processes the DocLayNet V2 dataset, which contains document + layout annotations and key-value data for a variety of document types. + """ def __init__( self, dataset_path: Path, - # prediction_provider: BasePredictionProvider, target: Path, split: str = "test", - max_items: int = -1, + begin_index: int = 0, + end_index: int = -1, ): """ Initialize the DocLayNet V2 dataset builder. Args: dataset_path: Path to the pre-downloaded dataset - target: Path where the processed dataset will be saved - do_visualization: Whether to create visualizations + target: Path where processed dataset will be saved split: Dataset split to use - max_items: Maximum number of items to process (-1 for all) + begin_index: Start index for processing (inclusive) + end_index: End index for processing (exclusive), -1 means process all """ super().__init__( name="DocLayNetV2: end-to-end", dataset_source=dataset_path, # Local Path to dataset - # prediction_provider=prediction_provider, target=target, split=split, + begin_index=begin_index, + end_index=end_index, ) - self.max_items = max_items - def extract_tokens_and_text(self, s: str): + def extract_tokens_and_text(self, s: str) -> Tuple[List[str], List[str]]: """ Extract tokens and text from a string. 
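For reviewers, a minimal sketch of the splitting behavior this method relies on (the token strings are illustrative): a capturing group in re.split() keeps the <...> delimiters in the output, so tokens and text can be separated in a single pass.

    import re

    pattern = r"(<[^>]+>)"  # same pattern as used in extract_tokens_and_text
    parts = re.split(pattern, "<fcel>Revenue<nl><ecel>")
    tokens = [p for p in parts if p.startswith("<") and p.endswith(">")]
    text_parts = [p for p in parts if p and not (p.startswith("<") and p.endswith(">"))]
    # tokens == ['<fcel>', '<nl>', '<ecel>'] and text_parts == ['Revenue']
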
@@ -80,7 +85,7 @@ def extract_tokens_and_text(self, s: str): s: Input string Returns: - tuple: (tokens, text_parts) + Tuple of (tokens, text_parts) """ # Pattern to match anything enclosed by < > (including the angle brackets themselves) pattern = r"(<[^>]+>)" @@ -104,7 +109,9 @@ def extract_tokens_and_text(self, s: str): return tokens, text_parts - def parse_texts(self, texts, tokens): + def parse_texts( + self, texts: List[str], tokens: List[str] + ) -> Tuple[List[TableCell], List[List[str]]]: """ Parse tokens and texts into table cells. @@ -113,7 +120,7 @@ def parse_texts(self, texts, tokens): tokens: List of tokens Returns: - tuple: (table_cells, split_row_tokens) + Tuple of (table_cells, split_row_tokens) """ split_word = TableToken.OTSL_NL.value split_row_tokens = [ @@ -125,7 +132,9 @@ def parse_texts(self, texts, tokens): r_idx = 0 c_idx = 0 - def count_right(tokens, c_idx, r_idx, which_tokens): + def count_right( + tokens: List[List[str]], c_idx: int, r_idx: int, which_tokens: List[str] + ) -> int: span = 0 c_idx_iter = c_idx while tokens[r_idx][c_idx_iter] in which_tokens: @@ -135,7 +144,9 @@ def count_right(tokens, c_idx, r_idx, which_tokens): return span return span - def count_down(tokens, c_idx, r_idx, which_tokens): + def count_down( + tokens: List[List[str]], c_idx: int, r_idx: int, which_tokens: List[str] + ) -> int: span = 0 r_idx_iter = r_idx while tokens[r_idx_iter][c_idx] in which_tokens: @@ -391,7 +402,9 @@ def populate_key_value_item( # Add the key_value_item to the document. doc.add_key_values(graph=graph, prov=prov) - def create_kv_pairs(self, data): + # The minimal fix for DocLayNetV2Builder is to add type annotation to link_pairs: + + def create_kv_pairs(self, data: Dict[str, Any]) -> List[Dict[str, Any]]: """ Create key-value pairs from document data. 
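Context for the annotation below: mypy cannot infer an element type for a bare `[]`, so subsequent `append` calls fail strict type checking; the explicit annotation resolves this. A minimal sketch (the dictionary keys are illustrative):

    from typing import Any, Dict, List

    link_pairs: List[Dict[str, Any]] = []  # without the annotation, mypy reports "Need type annotation"
    link_pairs.append({"key_id": 0, "value_id": 1})
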
@@ -401,7 +414,7 @@ def create_kv_pairs(self, data): Returns: List of key-value pair dictionaries """ - link_pairs = [] + link_pairs: List[Dict[str, Any]] = [] seg_with_id = {} bbox_with_id = {} @@ -577,23 +590,30 @@ def iterate(self) -> Iterable[DatasetRecord]: # Load dataset ds = load_from_disk(str(self.dataset_source)) - # Set max items - if self.max_items == -1: - max_items = len(ds[self.split]) - else: - max_items = min(self.max_items, len(ds[self.split])) + # Get total number of items in the dataset + total_items = len(ds[self.split]) - _log.info(f"Processing DocLayNetV2 dataset: {max_items} documents") + # Calculate effective indices + begin, end = self.get_effective_indices(total_items) + + # Log stats + self.log_dataset_stats(total_items, end - begin) + _log.info(f"Processing DocLayNetV2 dataset: {end - begin} documents") # Process each document for i, doc in enumerate( tqdm( ds[self.split], - total=max_items, + total=end - begin, desc="Processing DocLayNetV2 documents", ) ): - if i >= max_items: + # Skip documents before begin_index + if i < begin: + continue + + # Stop after end_index + if i >= end: break try: diff --git a/docling_eval/dataset_builders/dpbench_builder.py b/docling_eval/dataset_builders/dpbench_builder.py index 5e13e0e7..27156274 100644 --- a/docling_eval/dataset_builders/dpbench_builder.py +++ b/docling_eval/dataset_builders/dpbench_builder.py @@ -1,8 +1,9 @@ import json +import logging import os from io import BytesIO from pathlib import Path -from typing import Dict, Iterable, List +from typing import Dict, Iterable, List, Set from docling_core.types import DoclingDocument from docling_core.types.doc import ( @@ -33,7 +34,11 @@ get_binhash, ) -TRUE_HTML_EXPORT_LABELS = { +# Get logger +_log = logging.getLogger(__name__) + +# Labels to export in HTML visualization +TRUE_HTML_EXPORT_LABELS: Set[DocItemLabel] = { DocItemLabel.TITLE, DocItemLabel.DOCUMENT_INDEX, DocItemLabel.SECTION_HEADER, @@ -54,7 +59,7 @@ DocItemLabel.FOOTNOTE, } -PRED_HTML_EXPORT_LABELS = { +PRED_HTML_EXPORT_LABELS: Set[DocItemLabel] = { DocItemLabel.TITLE, DocItemLabel.DOCUMENT_INDEX, DocItemLabel.SECTION_HEADER, @@ -76,17 +81,36 @@ class DPBenchDatasetBuilder(BaseEvaluationDatasetBuilder): + """ + DPBench dataset builder implementing the base dataset builder interface. + + This builder processes the DPBench dataset, which contains document + understanding benchmarks for various document types. + """ + def __init__( self, - # prediction_provider: BasePredictionProvider, target: Path, split: str = "test", + begin_index: int = 0, + end_index: int = -1, ): + """ + Initialize the DPBench dataset builder. + + Args: + target: Path where processed dataset will be saved + split: Dataset split to use + begin_index: Start index for processing (inclusive) + end_index: End index for processing (exclusive), -1 means process all + """ super().__init__( name="DPBench", dataset_source=HFSource(repo_id="upstage/dp-bench"), target=target, split=split, + begin_index=begin_index, + end_index=end_index, ) def _update_gt_doc( @@ -97,26 +121,36 @@ def _update_gt_doc( page_image: Image, page_width: float, page_height: float, - ): - + ) -> None: + """ + Update ground truth document with annotations. 
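+
+        Note: the annotation coordinates appear to be normalized fractions of
+        the page, so they are scaled by page_width/page_height below when the
+        bounding box is built.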
+ + Args: + doc: DoclingDocument to update + annots: Annotation data + page: Page object + page_image: Page image + page_width: Page width + page_height: Page height + """ label = annots["category"] + # Extract coordinates min_x = annots["coordinates"][0]["x"] max_x = annots["coordinates"][0]["x"] - min_y = annots["coordinates"][0]["y"] max_y = annots["coordinates"][0]["y"] for coor in annots["coordinates"]: min_x = min(min_x, coor["x"]) max_x = max(max_x, coor["x"]) - min_y = min(min_y, coor["y"]) max_y = max(max_y, coor["y"]) text = annots["content"]["text"].replace("\n", " ") html = annots["content"]["html"] + # Create bounding box bbox = BoundingBox( l=min_x * page_width, r=max_x * page_width, @@ -125,10 +159,13 @@ def _update_gt_doc( coord_origin=CoordOrigin.TOPLEFT, ) + # Create provenance prov = ProvenanceItem(page_no=1, bbox=bbox, charspan=(0, len(text))) + # Crop image element img = crop_bounding_box(page_image=page_image, page=page, bbox=bbox) + # Add element to document based on label if label == "Header": doc.add_text( label=DocItemLabel.PAGE_HEADER, text=text, orig=text, prov=prov @@ -143,7 +180,6 @@ def _update_gt_doc( doc.add_text(label=DocItemLabel.TEXT, text=text, orig=text, prov=prov) elif label == "Index": - # FIXME: ultra approximate solution text = annots["content"]["text"] rows = text.split("\n") @@ -194,7 +230,6 @@ def _update_gt_doc( elif label == "List": doc.add_list_item(text=text, orig=text, prov=prov) - # doc.add_text(label=DocItemLabel.TEXT, text=text, orig=text, prov=prov) elif label == "Caption": doc.add_text(label=DocItemLabel.CAPTION, text=text, orig=text, prov=prov) @@ -204,46 +239,41 @@ def _update_gt_doc( elif label == "Figure": uri = from_pil_to_base64uri(img) - imgref = ImageRef( mimetype="image/png", dpi=72, size=Size(width=img.width, height=img.height), uri=uri, ) - doc.add_picture(prov=prov, image=imgref) elif label == "Table": - table_data = convert_html_table_into_docling_tabledata(table_html=html) - doc.add_table(data=table_data, caption=None, prov=prov) elif label == "Chart": uri = from_pil_to_base64uri(img) - imgref = ImageRef( mimetype="image/png", dpi=72, size=Size(width=img.width, height=img.height), uri=uri, ) - doc.add_picture(prov=prov, image=imgref) - # doc.add_picture(prov=prov) - elif label == "Footnote": doc.add_text(label=DocItemLabel.FOOTNOTE, text=text, orig=text, prov=prov) elif label == "Heading1": doc.add_heading(text=text, orig=text, level=1, prov=prov) - else: - return - def iterate(self) -> Iterable[DatasetRecord]: + """ + Iterate through the dataset and yield DatasetRecord objects. + + Yields: + DatasetRecord objects + """ if not self.retrieved: raise RuntimeError( "You must first retrieve the source dataset. Call retrieve_input_dataset()." 
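As a usage note, the index handling inherited from the base class clamps out-of-range values; a sketch of the expected behavior (the target path is illustrative):

    from pathlib import Path

    from docling_eval.dataset_builders.dpbench_builder import DPBenchDatasetBuilder

    builder = DPBenchDatasetBuilder(
        target=Path("./benchmarks/dpbench"), begin_index=10, end_index=500
    )
    begin, end = builder.get_effective_indices(total_items=100)
    assert (begin, end) == (10, 100)  # end_index is clamped to the dataset size
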
@@ -251,36 +281,47 @@ def iterate(self) -> Iterable[DatasetRecord]: assert self.dataset_local_path is not None - # load the groundtruth - with open(self.dataset_local_path / f"dataset/reference.json", "r") as fr: + # Load the ground truth + reference_path = self.dataset_local_path / "dataset/reference.json" + with open(reference_path, "r") as fr: gt = json.load(fr) - cnt = 0 - for filename, annots in tqdm( - gt.items(), + # Sort the filenames for deterministic ordering + sorted_filenames = sorted(gt.keys()) + total_files = len(sorted_filenames) + + # Apply index range + begin, end = self.get_effective_indices(total_files) + selected_filenames = sorted_filenames[begin:end] + + # Log stats + self.log_dataset_stats(total_files, len(selected_filenames)) + _log.info(f"Processing DP-Bench dataset with {len(selected_filenames)} files") + + for filename in tqdm( + selected_filenames, desc="Processing files for DP-Bench with end-to-end", - total=len(gt), ncols=128, ): - cnt += 1 - - # if cnt == 10: - # break - + # Get annotations for this file + annots = gt[filename] pdf_path = self.dataset_local_path / f"dataset/pdfs/{filename}" - # Create the groundtruth Document + # Create the ground truth Document true_doc = DoclingDocument( name=f"ground-truth {os.path.basename(pdf_path)}" ) true_doc, true_page_images = add_pages_to_true_doc( pdf_path=pdf_path, true_doc=true_doc, image_scale=2.0 ) + assert len(true_page_images) == 1, "len(true_page_images)==1" + # Get page dimensions page_width = true_doc.pages[1].size.width page_height = true_doc.pages[1].size.height + # Process each element in the annotation for elem in annots["elements"]: self._update_gt_doc( true_doc, @@ -291,10 +332,11 @@ def iterate(self) -> Iterable[DatasetRecord]: page_height=page_height, ) + # Get PDF as binary data pdf_bytes = get_binary(pdf_path) - pdf_stream = DocumentStream(name=pdf_path.name, stream=BytesIO(pdf_bytes)) + # Create dataset record record = DatasetRecord( doc_id=str(filename), doc_hash=get_binhash(pdf_bytes), diff --git a/docling_eval/dataset_builders/funsd_builder.py b/docling_eval/dataset_builders/funsd_builder.py index 3bb55c63..2d014196 100644 --- a/docling_eval/dataset_builders/funsd_builder.py +++ b/docling_eval/dataset_builders/funsd_builder.py @@ -29,27 +29,47 @@ class FUNSDDatasetBuilder(BaseEvaluationDatasetBuilder): - """FUNSD Dataset builder implementing the base dataset builder interface.""" + """ + FUNSD Dataset builder implementing the base dataset builder interface. + + This builder handles the Form Understanding in Noisy Scanned Documents (FUNSD) dataset, + which contains form annotations for form understanding tasks. + """ def __init__( self, dataset_source: Path, - # prediction_provider: BasePredictionProvider, target: Path, split: str = "test", - max_items: int = -1, + begin_index: int = 0, + end_index: int = -1, ): + """ + Initialize the FUNSD dataset builder. 
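+
+        A minimal construction sketch (paths are illustrative):
+
+            builder = FUNSDDatasetBuilder(
+                dataset_source=Path("./funsd"),
+                target=Path("./benchmarks/funsd"),
+                split="test",
+            )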
+ + Args: + dataset_source: Path to the dataset source + target: Path where processed dataset will be saved + split: Dataset split to use ('train' or 'test') + begin_index: Start index for processing (inclusive) + end_index: End index for processing (exclusive), -1 means process all + """ super().__init__( name="FUNSD", - dataset_source=dataset_source, # Standard location - # prediction_provider=prediction_provider, + dataset_source=dataset_source, target=target, split=split, + begin_index=begin_index, + end_index=end_index, ) - self.max_items = max_items def retrieve_input_dataset(self) -> Path: - """Download and extract the FUNSD dataset if needed.""" + """ + Download and extract the FUNSD dataset if needed. + + Returns: + Path to the retrieved dataset + """ assert isinstance(self.dataset_source, Path) dataset_path = self.dataset_source @@ -99,7 +119,15 @@ def retrieve_input_dataset(self) -> Path: return dataset_path def convert_bbox(self, bbox_data) -> BoundingBox: - """Convert bbox format to BoundingBox object.""" + """ + Convert bbox format to BoundingBox object. + + Args: + bbox_data: Bounding box data as list or BoundingBox + + Returns: + BoundingBox object + """ if isinstance(bbox_data, list) and len(bbox_data) == 4: return BoundingBox( l=bbox_data[0], t=bbox_data[1], r=bbox_data[2], b=bbox_data[3] @@ -117,7 +145,17 @@ def create_graph_link( value_cell: GraphCell, label: GraphLinkLabel = GraphLinkLabel.TO_VALUE, ) -> GraphLink: - """Create a graph link between key and value cells.""" + """ + Create a graph link between key and value cells. + + Args: + key_cell: Source cell (key) + value_cell: Target cell (value) + label: Link label + + Returns: + GraphLink object + """ return GraphLink( source_cell_id=key_cell.cell_id, target_cell_id=value_cell.cell_id, @@ -127,7 +165,16 @@ def create_graph_link( def get_overall_bbox( self, links: List[GraphLink], cell_dict: Dict[int, GraphCell] ) -> Optional[BoundingBox]: - """Compute the overall bounding box from all cell ids.""" + """ + Compute the overall bounding box from all cell ids. + + Args: + links: List of GraphLink objects + cell_dict: Dictionary mapping cell IDs to GraphCell objects + + Returns: + BoundingBox encompassing all cells, or None if no bounding boxes + """ all_bboxes = [] for link in links: src_prov = cell_dict[link.source_cell_id].prov @@ -145,7 +192,16 @@ def get_overall_bbox( def populate_key_value_item( self, doc: DoclingDocument, funsd_data: dict ) -> DoclingDocument: - """Populate the key-value item from the FUNSD data.""" + """ + Populate the key-value item from the FUNSD data. + + Args: + doc: DoclingDocument to update + funsd_data: FUNSD annotation data + + Returns: + Updated DoclingDocument + """ if "form" not in funsd_data: raise ValueError("Invalid FUNSD data: missing 'form' key.") @@ -224,7 +280,12 @@ def populate_key_value_item( return doc def iterate(self) -> Iterable[DatasetRecord]: - """Iterate through the dataset and yield DatasetRecord objects.""" + """ + Iterate through the dataset and yield DatasetRecord objects. + + Yields: + DatasetRecord objects + """ if not self.retrieved: raise RuntimeError( "You must first retrieve the source dataset. Call retrieve_input_dataset()." @@ -241,17 +302,19 @@ def iterate(self) -> Iterable[DatasetRecord]: raise ValueError(f"Invalid split: {self.split}. 
Expected 'train' or 'test'") # List all PNG images - images = list(image_dir.glob("*.png")) + images = sorted(list(image_dir.glob("*.png"))) + total_images = len(images) - # Limit number of items if specified - if self.max_items > 0 and len(images) > self.max_items: - images = images[: self.max_items] + # Apply index range + begin, end = self.get_effective_indices(total_images) + images = images[begin:end] - total_images = len(images) - _log.info(f"Processing FUNSD {self.split} dataset: {total_images} images") + # Log stats + self.log_dataset_stats(total_images, len(images)) + _log.info(f"Processing FUNSD {self.split} dataset: {len(images)} images") # Process each image - for img_path in tqdm(images, total=total_images): + for img_path in tqdm(images, total=len(images)): try: # Determine annotation path annotation_path = ( @@ -300,7 +363,6 @@ def iterate(self) -> Iterable[DatasetRecord]: # Create dataset record record = DatasetRecord( - # predictor_info=self.prediction_provider.info(), doc_id=img_path.stem, doc_hash=get_binhash(img_bytes), ground_truth_doc=true_doc, @@ -311,9 +373,6 @@ def iterate(self) -> Iterable[DatasetRecord]: ground_truth_page_images=true_page_images, ) - # Update prediction - # self.update_prediction(record) - yield record except Exception as ex: diff --git a/docling_eval/dataset_builders/omnidocbench_builder.py b/docling_eval/dataset_builders/omnidocbench_builder.py index 7189ddf0..ac08a0d0 100644 --- a/docling_eval/dataset_builders/omnidocbench_builder.py +++ b/docling_eval/dataset_builders/omnidocbench_builder.py @@ -4,7 +4,7 @@ import os from io import BytesIO from pathlib import Path -from typing import Iterable, List, Tuple +from typing import Dict, Iterable, List, Tuple from docling_core.types import DoclingDocument from docling_core.types.doc import ( @@ -12,6 +12,7 @@ CoordOrigin, DocItemLabel, ImageRef, + PageItem, ProvenanceItem, Size, ) @@ -36,6 +37,7 @@ # Get logger _log = logging.getLogger(__name__) +# Labels to export in HTML visualization TRUE_HTML_EXPORT_LABELS = { DocItemLabel.TITLE, DocItemLabel.DOCUMENT_INDEX, @@ -79,32 +81,64 @@ class OmniDocBenchDatasetBuilder(BaseEvaluationDatasetBuilder): + """ + OmniDocBench dataset builder implementing the base dataset builder interface. + + This builder processes the OmniDocBench dataset, which contains document + layout annotations for a variety of document types. + """ + def __init__( self, - # prediction_provider: BasePredictionProvider, target: Path, split: str = "test", + begin_index: int = 0, + end_index: int = -1, ): + """ + Initialize the OmniDocBench dataset builder. + + Args: + target: Path where processed dataset will be saved + split: Dataset split to use + begin_index: Start index for processing (inclusive) + end_index: End index for processing (exclusive), -1 means process all + """ super().__init__( name="OmniDocBench: end-to-end", dataset_source=HFSource(repo_id="opendatalab/OmniDocBench"), - # prediction_provider=prediction_provider, target=target, split=split, + begin_index=begin_index, + end_index=end_index, ) - def update_gt_into_map(self, gt): + def update_gt_into_map(self, gt: List[Dict]) -> Dict[str, Dict]: + """ + Convert list of annotation items to a map keyed by image path. 
- result = {} + Args: + gt: List of ground truth annotations + Returns: + Dictionary mapping image paths to their annotations + """ + result = {} for item in gt: path = item["page_info"]["image_path"] result[path] = item - return result def get_filenames(self, omnidocbench_dir: Path) -> List[Tuple[str, str]]: + """ + Get pairs of image and PDF paths from the dataset directory. + + Args: + omnidocbench_dir: Path to the OmniDocBench directory + Returns: + List of (image_path, pdf_path) tuples + """ page_images = sorted(glob.glob(str(omnidocbench_dir / "images/*.jpg"))) page_pdfs = sorted(glob.glob(str(omnidocbench_dir / "ori_pdfs/*.pdf"))) @@ -116,38 +150,49 @@ def get_filenames(self, omnidocbench_dir: Path) -> List[Tuple[str, str]]: def update_doc_with_gt( self, - gt, - true_doc, - page, + gt: Dict, + true_doc: DoclingDocument, + page: PageItem, page_image: Image, page_width: float, page_height: float, - ): - + ) -> DoclingDocument: + """ + Update document with ground truth annotations. + + Args: + gt: Ground truth annotations + true_doc: Document to update + page: Page object + page_image: Page image + page_width: Page width + page_height: Page height + + Returns: + Updated document + """ gt_width = float(gt["page_info"]["width"]) gt_height = float(gt["page_info"]["height"]) for item in gt["layout_dets"]: - label = item["category_type"] - text = f"<omitted text for {label}>" if "text" in item: text = item["text"] + # Find bounding box coordinates min_x = item["poly"][0] max_x = item["poly"][0] - min_y = item["poly"][1] max_y = item["poly"][1] for i in range(0, 4): min_x = min(min_x, item["poly"][2 * i]) max_x = max(max_x, item["poly"][2 * i]) - min_y = min(min_y, item["poly"][2 * i + 1]) max_y = max(max_y, item["poly"][2 * i + 1]) + # Create bounding box bbox = BoundingBox( l=min_x * page_width / gt_width, r=max_x * page_width / gt_width, @@ -156,11 +201,13 @@ def update_doc_with_gt( coord_origin=CoordOrigin.TOPLEFT, ) + # Create provenance prov = ProvenanceItem(page_no=1, bbox=bbox, charspan=(0, len(text))) + # Crop the image element - use page directly since we've updated the signature img = crop_bounding_box(page_image=page_image, page=page, bbox=bbox) - # img.show() + # Add element to document based on label if label == "title": true_doc.add_heading(text=text, orig=text, level=1, prov=prov) @@ -175,7 +222,6 @@ def update_doc_with_gt( ) elif label == "table": - table_data = convert_html_table_into_docling_tabledata( table_html=item["html"] ) @@ -197,16 +243,13 @@ def update_doc_with_gt( ) elif label == "figure": - uri = from_pil_to_base64uri(img) - imgref = ImageRef( mimetype="image/png", dpi=72, size=Size(width=img.width, height=img.height), uri=uri, ) - true_doc.add_picture(prov=prov, image=imgref) elif label == "figure_caption": @@ -270,69 +313,65 @@ def update_doc_with_gt( ) else: - logging.error(f"label {label} is not assigned!") + _log.error(f"label {label} is not assigned!") return true_doc def iterate(self) -> Iterable[DatasetRecord]: + """ + Iterate through the dataset and yield DatasetRecord objects. + + Yields: + DatasetRecord objects + """ if not self.retrieved: raise RuntimeError( "You must first retrieve the source dataset. Call retrieve_input_dataset()." 
) assert self.dataset_local_path is not None - # load the groundtruth + + # Load the ground truth with open(self.dataset_local_path / "OmniDocBench.json", "r") as fr: gt = json.load(fr) gt = self.update_gt_into_map(gt) + # Create visualization directory if needed viz_dir = self.target / "vizualisations" - os.makedirs(viz_dir, exist_ok=True) + viz_dir.mkdir(exist_ok=True) + # Get all file paths page_tuples = self.get_filenames(self.dataset_local_path) + total_items = len(page_tuples) - # Apply index ranges - total_ds_len = len(page_tuples) - - begin_index = 0 - end_index = -1 - # begin_index, end_index = set_selection_range( - # begin_index, end_index, total_ds_len - # ) - page_tuples = page_tuples[begin_index:end_index] - selected_ds_len = len(page_tuples) - _log.info( - "Dataset len: %s. Selected range: [%s, %s] = %s", - total_ds_len, - begin_index, - end_index, - selected_ds_len, - ) + # Apply index range + begin, end = self.get_effective_indices(total_items) + page_tuples = page_tuples[begin:end] + selected_items = len(page_tuples) + + # Log stats + self.log_dataset_stats(total_items, selected_items) for page_tuple in tqdm( page_tuples, - total=selected_ds_len, + total=selected_items, ncols=128, desc="Processing files for OmniDocBench with end-to-end", ): - jpg_path = page_tuple[0] pdf_path = Path(page_tuple[1]) - # logging.info(f"file: {pdf_path}") - if os.path.basename(jpg_path) not in gt: - logging.error( - f"did not find ground-truth for {os.path.basename(jpg_path)}" - ) + # Check if ground truth exists for this image + jpg_basename = os.path.basename(jpg_path) + if jpg_basename not in gt: + _log.error(f"Did not find ground-truth for {jpg_basename}") continue - gt_doc = gt[os.path.basename(jpg_path)] + gt_doc = gt[jpg_basename] - # Create the groundtruth Document - true_doc = DoclingDocument( - name=f"ground-truth {os.path.basename(jpg_path)}" - ) + # Create the ground truth Document + true_doc = DoclingDocument(name=f"ground-truth {jpg_basename}") true_doc, true_page_images = add_pages_to_true_doc( pdf_path=pdf_path, true_doc=true_doc, image_scale=2.0 ) @@ -343,6 +382,7 @@ def iterate(self) -> Iterable[DatasetRecord]: page_width = true_doc.pages[1].size.width page_height = true_doc.pages[1].size.height + # Update document with ground truth true_doc = self.update_doc_with_gt( gt=gt_doc, true_doc=true_doc, @@ -352,12 +392,12 @@ def iterate(self) -> Iterable[DatasetRecord]: page_height=page_height, ) + # Get PDF as binary data pdf_bytes = get_binary(pdf_path) - pdf_stream = DocumentStream(name=pdf_path.name, stream=BytesIO(pdf_bytes)) + # Create dataset record record = DatasetRecord( - # predictor_info=self.prediction_provider.info(), doc_id=str(os.path.basename(jpg_path)), doc_hash=get_binhash(pdf_bytes), ground_truth_doc=true_doc, diff --git a/docling_eval/dataset_builders/otsl_table_dataset_builder.py b/docling_eval/dataset_builders/otsl_table_dataset_builder.py index 91174bad..7c031f2b 100644 --- a/docling_eval/dataset_builders/otsl_table_dataset_builder.py +++ b/docling_eval/dataset_builders/otsl_table_dataset_builder.py @@ -1,7 +1,7 @@ import io import logging from pathlib import Path -from typing import Any, Iterable, List +from typing import Any, Iterable, List, Optional from datasets import load_dataset from docling_core.types import DoclingDocument @@ -14,6 +14,7 @@ Size, ) from docling_core.types.io import DocumentStream +from PIL import Image from tqdm import tqdm from docling_eval.datamodels.dataset_record import DatasetRecord @@ -37,7 +38,12 @@ class 
TableDatasetBuilder(BaseEvaluationDatasetBuilder): - """Base class for table dataset builders.""" + """ + Base class for table dataset builders. + + This class provides common functionality for building datasets + focused on table structure recognition tasks. + """ def __init__( self, @@ -45,18 +51,36 @@ def __init__( dataset_source: HFSource, target: Path, split: str = "test", - max_items: int = -1, + begin_index: int = 0, + end_index: int = -1, ): + """ + Initialize the table dataset builder. + + Args: + name: Name of the dataset + dataset_source: HuggingFace dataset source + target: Path where processed dataset will be saved + split: Dataset split to use + begin_index: Start index for processing (inclusive) + end_index: End index for processing (exclusive), -1 means process all + """ super().__init__( name=name, dataset_source=dataset_source, target=target, split=split, + begin_index=begin_index, + end_index=end_index, ) - self.max_items = max_items def retrieve_input_dataset(self) -> Path: - """Download and extract the dataset.""" + """ + Download and extract the dataset. + + Returns: + Path to the retrieved dataset + """ assert isinstance(self.dataset_source, HFSource) dataset_path = super().retrieve_input_dataset() self.retrieved = True @@ -65,7 +89,17 @@ def retrieve_input_dataset(self) -> Path: def create_page_tokens( self, data: List[Any], height: float, width: float ) -> PageTokens: - """Create page tokens from cell data.""" + """ + Create page tokens from cell data. + + Args: + data: Table cell data + height: Page height + width: Page width + + Returns: + PageTokens object containing token information + """ tokens = [] cnt = 0 for i, row in enumerate(data): @@ -89,7 +123,12 @@ def create_page_tokens( return PageTokens.model_validate(result) def iterate(self) -> Iterable[DatasetRecord]: - """Iterate through the dataset and yield DatasetRecord objects.""" + """ + Iterate through the dataset and yield DatasetRecord objects. + + Yields: + DatasetRecord objects + """ if not self.retrieved: raise RuntimeError( "You must first retrieve the source dataset. Call retrieve_input_dataset()." @@ -99,8 +138,17 @@ def iterate(self) -> Iterable[DatasetRecord]: # Load dataset from the retrieved path ds = load_dataset(self.dataset_source.repo_id, split=self.split) - if self.max_items > 0: - ds = ds.select(range(self.max_items)) + # Apply index range + total_items = len(ds) + begin, end = self.get_effective_indices(total_items) + + # Use HuggingFace's select method for applying range + ds = ds.select(range(begin, end)) + selected_items = len(ds) + + # Log stats + self.log_dataset_stats(total_items, selected_items) + _log.info(f"Processing {self.name} dataset: {selected_items} items") for item in tqdm(ds, desc=f"Processing {self.name} dataset"): try: @@ -211,14 +259,25 @@ def __init__( self, target: Path, split: str = "test", - max_items: int = -1, + begin_index: int = 0, + end_index: int = -1, ): + """ + Initialize the FinTabNet dataset builder. 
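+
+        A minimal construction sketch (the class name and path follow this
+        module's naming pattern and are assumed here; adjust if it differs):
+
+            builder = FinTabNetDatasetBuilder(
+                target=Path("./benchmarks/fintabnet"), end_index=50
+            )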
+ + Args: + target: Path where processed dataset will be saved + split: Dataset split to use + begin_index: Start index for processing (inclusive) + end_index: End index for processing (exclusive), -1 means process all + """ super().__init__( name="FinTabNet", dataset_source=HFSource(repo_id="ds4sd/FinTabNet_OTSL"), target=target, split=split, - max_items=max_items, + begin_index=begin_index, + end_index=end_index, ) @@ -229,14 +288,25 @@ def __init__( self, target: Path, split: str = "val", # PubTabNet uses "val" instead of "test" - max_items: int = -1, + begin_index: int = 0, + end_index: int = -1, ): + """ + Initialize the PubTabNet dataset builder. + + Args: + target: Path where processed dataset will be saved + split: Dataset split to use + begin_index: Start index for processing (inclusive) + end_index: End index for processing (exclusive), -1 means process all + """ super().__init__( name="PubTabNet", dataset_source=HFSource(repo_id="ds4sd/PubTabNet_OTSL"), target=target, split=split, - max_items=max_items, + begin_index=begin_index, + end_index=end_index, ) @@ -247,12 +317,23 @@ def __init__( self, target: Path, split: str = "test", - max_items: int = -1, + begin_index: int = 0, + end_index: int = -1, ): + """ + Initialize the PubTables-1M dataset builder. + + Args: + target: Path where processed dataset will be saved + split: Dataset split to use + begin_index: Start index for processing (inclusive) + end_index: End index for processing (exclusive), -1 means process all + """ super().__init__( name="PubTables-1M", dataset_source=HFSource(repo_id="ds4sd/PubTables-1M_OTSL-v1.1"), target=target, split=split, - max_items=max_items, + begin_index=begin_index, + end_index=end_index, ) diff --git a/docling_eval/dataset_builders/xfund_builder.py b/docling_eval/dataset_builders/xfund_builder.py index d0e4a4d6..f9559c74 100644 --- a/docling_eval/dataset_builders/xfund_builder.py +++ b/docling_eval/dataset_builders/xfund_builder.py @@ -30,24 +30,39 @@ class XFUNDDatasetBuilder(BaseEvaluationDatasetBuilder): - """XFUND Dataset builder implementing the base dataset builder interface.""" + """ + XFUND Dataset builder implementing the base dataset builder interface. + + XFUND is a multilingual form understanding dataset that includes forms in + multiple languages: Chinese, Japanese, Spanish, French, Italian, German, and Portuguese. + """ def __init__( self, dataset_source: Path, - # prediction_provider: BasePredictionProvider, target: Path, split: str = "val", # XFUND uses "val" instead of "test" - max_items: int = -1, + begin_index: int = 0, + end_index: int = -1, ): + """ + Initialize the XFUND dataset builder. + + Args: + dataset_source: Path to the dataset source + target: Path where processed dataset will be saved + split: Dataset split to use ("val" for XFUND) + begin_index: Start index for processing (inclusive) + end_index: End index for processing (exclusive), -1 means process all + """ super().__init__( name="XFUND", - dataset_source=dataset_source, # Local Path to dataset - # prediction_provider=prediction_provider, + dataset_source=dataset_source, target=target, split=split, + begin_index=begin_index, + end_index=end_index, ) - self.max_items = max_items self._langs = [ "zh", "de", @@ -59,7 +74,12 @@ def __init__( ] # Fixed supported languages def retrieve_input_dataset(self) -> Path: - """Download and extract the XFUND dataset if needed.""" + """ + Download and extract the XFUND dataset if needed. 
+ + Returns: + Path to the retrieved dataset + """ assert isinstance(self.dataset_source, Path) dataset_path = self.dataset_source @@ -107,7 +127,15 @@ def retrieve_input_dataset(self) -> Path: return dataset_path def convert_bbox(self, bbox_data) -> BoundingBox: - """Convert bbox format to BoundingBox object.""" + """ + Convert bbox format to BoundingBox object. + + Args: + bbox_data: Bounding box data as list or BoundingBox + + Returns: + BoundingBox object + """ if isinstance(bbox_data, list) and len(bbox_data) == 4: return BoundingBox( l=bbox_data[0], t=bbox_data[1], r=bbox_data[2], b=bbox_data[3] @@ -125,7 +153,17 @@ def create_graph_link( value_cell: GraphCell, label: GraphLinkLabel = GraphLinkLabel.TO_VALUE, ) -> GraphLink: - """Create a graph link between key and value cells.""" + """ + Create a graph link between key and value cells. + + Args: + key_cell: Source cell (key) + value_cell: Target cell (value) + label: Link label + + Returns: + GraphLink object + """ return GraphLink( source_cell_id=key_cell.cell_id, target_cell_id=value_cell.cell_id, @@ -135,7 +173,16 @@ def create_graph_link( def get_overall_bbox( self, links: List[GraphLink], cell_dict: Dict[int, GraphCell] ) -> Optional[BoundingBox]: - """Compute the overall bounding box from all cell ids.""" + """ + Compute the overall bounding box from all cell ids. + + Args: + links: List of GraphLink objects + cell_dict: Dictionary mapping cell IDs to GraphCell objects + + Returns: + BoundingBox encompassing all cells, or None if no bounding boxes + """ all_bboxes = [] for link in links: src_prov = cell_dict[link.source_cell_id].prov @@ -153,7 +200,16 @@ def get_overall_bbox( def populate_key_value_item( self, doc: DoclingDocument, xfund_data: dict ) -> DoclingDocument: - """Populate the key-value item from the XFUND data.""" + """ + Populate the key-value item from the XFUND data. + + Args: + doc: DoclingDocument to update + xfund_data: XFUND annotation data + + Returns: + Updated DoclingDocument + """ if "document" not in xfund_data: raise ValueError("Invalid XFUND data: missing 'document' key.") @@ -232,7 +288,12 @@ def populate_key_value_item( return doc def iterate(self) -> Iterable[DatasetRecord]: - """Iterate through the dataset and yield DatasetRecord objects.""" + """ + Iterate through the dataset and yield DatasetRecord objects. + + Yields: + DatasetRecord objects + """ if not self.retrieved: raise RuntimeError( "You must first retrieve the source dataset. Call retrieve_input_dataset()." 
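For context on the change below: sorting by the image filename before slicing makes subset selection reproducible across runs, unlike the `random.sample` call it replaces. A toy sketch (filenames are illustrative):

    docs = [
        {"img": {"fname": "de_val_2.jpg"}},
        {"img": {"fname": "de_val_0.jpg"}},
        {"img": {"fname": "de_val_1.jpg"}},
    ]
    docs = sorted(docs, key=lambda d: d["img"]["fname"])
    assert [d["img"]["fname"] for d in docs[0:2]] == ["de_val_0.jpg", "de_val_1.jpg"]
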
@@ -252,19 +313,22 @@ def iterate(self) -> Iterable[DatasetRecord]: data = json.load(f) all_documents.extend(data.get("documents", [])) - # Limit number of items if specified - if self.max_items > 0 and len(all_documents) > self.max_items: - import random + # Sort documents by image filename for deterministic ordering + all_documents = sorted(all_documents, key=lambda doc: doc["img"]["fname"]) + total_documents = len(all_documents) - random.seed(42) # For reproducibility - all_documents = random.sample(all_documents, self.max_items) + # Apply index range + begin, end = self.get_effective_indices(total_documents) + selected_documents = all_documents[begin:end] + # Log stats + self.log_dataset_stats(total_documents, len(selected_documents)) _log.info( - f"Processing XFUND {self.split} dataset: {len(all_documents)} documents" + f"Processing XFUND {self.split} dataset: {len(selected_documents)} documents" ) # Process each document - for doc_data in tqdm(all_documents, total=len(all_documents)): + for doc_data in tqdm(selected_documents, total=len(selected_documents)): try: # Get image path img_filename = doc_data["img"]["fname"] @@ -311,7 +375,6 @@ def iterate(self) -> Iterable[DatasetRecord]: assert img.format is not None # Create dataset record record = DatasetRecord( - # predictor_info=self.prediction_provider.info(), doc_id=Path(img_filename).stem, doc_hash=get_binhash(img_bytes), ground_truth_doc=true_doc, diff --git a/docling_eval/prediction_providers/base_prediction_provider.py b/docling_eval/prediction_providers/base_prediction_provider.py index 0c025334..e480a993 100644 --- a/docling_eval/prediction_providers/base_prediction_provider.py +++ b/docling_eval/prediction_providers/base_prediction_provider.py @@ -1,10 +1,10 @@ import copy -import os +import logging import sys from abc import abstractmethod from io import BytesIO from pathlib import Path -from typing import Dict, Optional +from typing import Dict, Iterable, List, Optional, Set, Tuple, Union from datasets import load_dataset from docling.datamodel.base_models import ConversionStatus @@ -22,7 +22,11 @@ from docling_eval.utils.utils import save_shard_to_disk, write_datasets_info from docling_eval.visualisation.visualisations import save_comparison_html_with_clusters -TRUE_HTML_EXPORT_LABELS = { +# Get logger +_log = logging.getLogger(__name__) + +# Default HTML export labels for visualization +TRUE_HTML_EXPORT_LABELS: Set[DocItemLabel] = { DocItemLabel.TITLE, DocItemLabel.DOCUMENT_INDEX, DocItemLabel.SECTION_HEADER, @@ -42,7 +46,7 @@ DocItemLabel.FOOTNOTE, } -PRED_HTML_EXPORT_LABELS = { +PRED_HTML_EXPORT_LABELS: Set[DocItemLabel] = { DocItemLabel.TITLE, DocItemLabel.DOCUMENT_INDEX, DocItemLabel.SECTION_HEADER, @@ -63,14 +67,47 @@ class BasePredictionProvider: + """ + Base class for all prediction providers. + + Prediction providers are responsible for generating predictions from input data + in the form of DoclingDocument objects or other formats. + """ + def __init__( - self, do_visualization: bool = False, ignore_missing_predictions: bool = True + self, + do_visualization: bool = False, + ignore_missing_predictions: bool = True, + true_labels: Optional[Set[DocItemLabel]] = None, + pred_labels: Optional[Set[DocItemLabel]] = None, ): + """ + Initialize the prediction provider. 
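+
+        Subclasses can narrow the visualization label sets; a hypothetical
+        example (SomeProvider stands in for any concrete subclass):
+
+            provider = SomeProvider(
+                do_visualization=True,
+                true_labels={DocItemLabel.TABLE},
+                pred_labels={DocItemLabel.TABLE},
+            )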
+ + Args: + do_visualization: Whether to generate visualizations of predictions + ignore_missing_predictions: Whether to ignore records with missing predictions + true_labels: Set of DocItemLabel to use for ground truth visualization + pred_labels: Set of DocItemLabel to use for prediction visualization + """ self.do_visualization = do_visualization self.ignore_missing_predictions = ignore_missing_predictions + # Label sets for visualization + self.true_labels = true_labels or TRUE_HTML_EXPORT_LABELS + self.pred_labels = pred_labels or PRED_HTML_EXPORT_LABELS + @abstractmethod def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: + """ + Generate a prediction for a dataset record. + + Args: + record: Input dataset record + + Returns: + Dataset record with prediction added + """ pred_record = self.create_dataset_record_with_prediction( record, DoclingDocument(name="dummy"), @@ -79,13 +116,29 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: return pred_record @abstractmethod - def info(self) -> Dict: + def info(self) -> Dict[str, str]: + """ + Get information about the prediction provider. + + Returns: + Dictionary with provider information + """ return {} def visualize_results( self, prediction_record: DatasetRecordWithPrediction, target_dataset_dir: Path - ): - if prediction_record.predicted_doc is not None: + ) -> None: + """ + Create visualizations of prediction results. + + Args: + prediction_record: Record with prediction to visualize + target_dataset_dir: Directory to save visualizations + """ + if ( + prediction_record.predicted_doc is not None + and prediction_record.ground_truth_page_images + ): save_comparison_html_with_clusters( filename=target_dataset_dir / "visualizations" @@ -93,14 +146,20 @@ def visualize_results( true_doc=prediction_record.ground_truth_doc, pred_doc=prediction_record.predicted_doc, page_image=prediction_record.ground_truth_page_images[0], - true_labels=TRUE_HTML_EXPORT_LABELS, - pred_labels=PRED_HTML_EXPORT_LABELS, + true_labels=self.true_labels, + pred_labels=self.pred_labels, draw_reading_order=True, ) @property @abstractmethod def prediction_format(self) -> PredictionFormats: + """ + Get the format of predictions generated by this provider. + + Returns: + Prediction format enum value + """ pass def create_dataset_record_with_prediction( @@ -108,23 +167,41 @@ def create_dataset_record_with_prediction( record: DatasetRecord, predicted_doc: Optional[DoclingDocument] = None, original_prediction: Optional[str] = None, - ): - pred_record = DatasetRecordWithPrediction.model_validate( - { - **record.as_record_dict(), - "predicted_doc": predicted_doc, - "original_prediction": original_prediction, - "prediction_format": self.prediction_format, - } - ) - pred_record.validate_images() # type: ignore - return pred_record + ) -> DatasetRecordWithPrediction: + """ + Create a dataset record with prediction from an input record. + + Args: + record: Input dataset record + predicted_doc: Predicted DoclingDocument + original_prediction: Original prediction text/data + + Returns: + Dataset record with prediction + """ + data = { + **record.as_record_dict(), + "predicted_doc": predicted_doc, + "original_prediction": original_prediction, + "prediction_format": self.prediction_format, + "predictor_info": self.info(), + } + return DatasetRecordWithPrediction.model_validate(data) def add_prediction(self, record: DatasetRecord) -> DatasetRecordWithPrediction: - # This might need customization depending on the input the dataset has. 
- # The default implementation assumes that there is an original file in binary format which is accepted. + """ + Add a prediction to a dataset record. + + Args: + record: Input dataset record + + Returns: + Dataset record with prediction + """ + # Copy the original input data to avoid modifying it input_data = copy.deepcopy(record.original) + # Convert Path to DocumentStream if needed if not isinstance(input_data, DocumentStream): if isinstance(input_data, Path): input_data = DocumentStream( @@ -136,43 +213,114 @@ def add_prediction(self, record: DatasetRecord) -> DatasetRecordWithPrediction: return pred_record + def get_effective_indices( + self, total_items: int, begin_index: int, end_index: int + ) -> Tuple[int, int]: + """ + Calculate the effective begin and end indices based on dataset size. + + Args: + total_items: Total number of items available + begin_index: Start index for processing (inclusive) + end_index: End index for processing (exclusive), -1 means process all + + Returns: + Tuple of (effective_begin_index, effective_end_index) + """ + begin = begin_index if begin_index >= 0 else 0 + end = end_index if end_index > 0 else total_items + end = min(end, total_items) + + if begin >= total_items: + _log.warning( + f"Begin index ({begin}) is greater than or equal to dataset size ({total_items}). " + f"No items will be processed." + ) + begin = total_items + + _log.info( + f"Processing range [{begin}:{end}] out of {total_items} total items " + f"({end - begin} items)" + ) + + return begin, end + def create_prediction_dataset( self, name: str, gt_dataset_dir: Path, target_dataset_dir: Path, split: str = "test", - ): + begin_index: int = 0, + end_index: int = -1, + ) -> None: + """ + Create a prediction dataset from a ground truth dataset. + + Args: + name: Name of the dataset + gt_dataset_dir: Path to ground truth dataset + target_dataset_dir: Path to save prediction dataset + split: Dataset split to process + begin_index: Start index for processing (inclusive) + end_index: End index for processing (exclusive), -1 means process all + """ + # Load the dataset parquet_files = str(gt_dataset_dir / split / "*.parquet") ds = load_dataset("parquet", data_files={split: parquet_files}) - # _log.info(f"oveview of dataset: {ds}") - if ds is not None: - ds_selection = ds[split] - def _iterate_predictions(): + if ds is None: + _log.error(f"Failed to load dataset from {parquet_files}") + return + + ds_selection = ds[split] + total_items = len(ds_selection) + + # Calculate effective indices + begin, end = self.get_effective_indices(total_items, begin_index, end_index) + + # Apply range + if begin > 0 or end < total_items: + ds_selection = ds_selection.select(range(begin, end)) + + selected_items = len(ds_selection) + _log.info( + f"Dataset '{name}' total items: {total_items}. 
" + f"Selected range: [{begin}, {end}] = {selected_items} items" + ) + + def _iterate_predictions() -> Iterable[DatasetRecordWithPrediction]: + """Generate predictions for each record in the dataset.""" for i, data in tqdm( enumerate(ds_selection), desc="Creating predictions", ncols=120, total=len(ds_selection), ): - record = DatasetRecord.model_validate(data) - pred_record = self.add_prediction(record) + try: + record = DatasetRecord.model_validate(data) + pred_record = self.add_prediction(record) - if ( - self.ignore_missing_predictions - and pred_record.status == ConversionStatus.FAILURE - ): - continue + if ( + self.ignore_missing_predictions + and pred_record.status == ConversionStatus.FAILURE + ): + continue - yield pred_record + yield pred_record + except Exception as e: + _log.error(f"Error processing record {i}: {str(e)}") + if not self.ignore_missing_predictions: + raise + # Create output directories test_dir = target_dataset_dir / split - os.makedirs(test_dir, exist_ok=True) + test_dir.mkdir(parents=True, exist_ok=True) if self.do_visualization: - os.makedirs(target_dataset_dir / "visualizations", exist_ok=True) + (target_dataset_dir / "visualizations").mkdir(parents=True, exist_ok=True) + # Process in chunks chunk_size = 80 max_num_chunks = sys.maxsize @@ -192,8 +340,14 @@ def _iterate_predictions(): chunk_count += 1 if chunk_count >= max_num_chunks: + _log.info( + f"Reached maximum number of chunks ({max_num_chunks}). Stopping." + ) break + _log.info(f"Saved {count} records in {chunk_count} chunks to {test_dir}") + + # Write dataset info write_datasets_info( name=name, output_dir=target_dataset_dir, diff --git a/docling_eval/prediction_providers/docling_provider.py b/docling_eval/prediction_providers/docling_provider.py index 4fc552cf..6baeb9dd 100644 --- a/docling_eval/prediction_providers/docling_provider.py +++ b/docling_eval/prediction_providers/docling_provider.py @@ -1,8 +1,9 @@ import copy -from typing import Dict, Optional +from typing import Dict, Optional, Set from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.document_converter import DocumentConverter, FormatOption +from docling_core.types.doc import DocItemLabel from docling_eval.datamodels.dataset_record import ( DatasetRecord, @@ -16,26 +17,66 @@ class DoclingPredictionProvider(BasePredictionProvider): + """ + Prediction provider that uses Docling document converter. + + This provider converts documents using the Docling document converter + with specified format options. + """ + def __init__( self, format_options: Optional[Dict[InputFormat, FormatOption]] = None, do_visualization: bool = False, + ignore_missing_predictions: bool = True, + true_labels: Optional[Set[DocItemLabel]] = None, + pred_labels: Optional[Set[DocItemLabel]] = None, ): - super().__init__(do_visualization=do_visualization) + """ + Initialize the Docling prediction provider. 
+ + Args: + format_options: Dictionary mapping input formats to format options + do_visualization: Whether to generate visualizations + ignore_missing_predictions: Whether to ignore missing predictions + true_labels: Set of DocItemLabel to use for ground truth visualization + pred_labels: Set of DocItemLabel to use for prediction visualization + """ + super().__init__( + do_visualization=do_visualization, + ignore_missing_predictions=ignore_missing_predictions, + true_labels=true_labels, + pred_labels=pred_labels, + ) self.doc_converter = DocumentConverter(format_options=format_options) @property def prediction_format(self) -> PredictionFormats: + """Get the prediction format.""" return PredictionFormats.DOCLING_DOCUMENT - def predict( - self, - record: DatasetRecord, - ) -> DatasetRecordWithPrediction: - assert ( - record.original is not None - ), "stream must be given for docling prediction provider to work." + def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: + """ + Generate a prediction by converting the document. + + Args: + record: Input dataset record + + Returns: + Dataset record with prediction + + Raises: + RuntimeError: If original document stream is not available + """ + if record.original is None: + raise RuntimeError( + "Stream must be given for docling prediction provider to work." + ) + + # Convert the document res = self.doc_converter.convert(copy.deepcopy(record.original)) + + # Create prediction record pred_record = self.create_dataset_record_with_prediction( record, res.document, @@ -46,4 +87,5 @@ def predict( return pred_record def info(self) -> Dict: + """Get information about the prediction provider.""" return {"asset": "Docling", "version": docling_version()} diff --git a/docling_eval/prediction_providers/file_provider.py b/docling_eval/prediction_providers/file_provider.py index c5591d4f..4a1bd583 100644 --- a/docling_eval/prediction_providers/file_provider.py +++ b/docling_eval/prediction_providers/file_provider.py @@ -1,10 +1,14 @@ -from abc import abstractmethod +import logging from pathlib import Path -from typing import Dict, Optional +from typing import Dict, Optional, Set from docling.datamodel.base_models import ConversionStatus -from docling_core.types import DoclingDocument -from docling_core.types.doc.document import DocTagsDocument, DocTagsPage +from docling_core.types.doc import DocItemLabel +from docling_core.types.doc.document import ( + DoclingDocument, + DocTagsDocument, + DocTagsPage, +) from PIL import Image from docling_eval.datamodels.dataset_record import ( @@ -16,16 +20,46 @@ BasePredictionProvider, ) +_log = logging.getLogger(__name__) + class FilePredictionProvider(BasePredictionProvider): + """ + Prediction provider that reads prediction files from a directory. + + This provider supports various file formats like DOCTAGS, MARKDOWN, + JSON, and YAML. + """ + def __init__( self, prediction_format: PredictionFormats, source_path: Path, do_visualization: bool = False, - ignore_missing_files: Optional[bool] = False, + ignore_missing_files: bool = False, + ignore_missing_predictions: bool = True, + true_labels: Optional[Set[DocItemLabel]] = None, + pred_labels: Optional[Set[DocItemLabel]] = None, ): - super().__init__(do_visualization=do_visualization) + """ + Initialize the file prediction provider. 
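+
+        A construction sketch mirroring the test suite:
+
+            provider = FilePredictionProvider(
+                prediction_format=PredictionFormats.DOCTAGS,
+                source_path=Path("./tests/data/doclaynet_v1_doctags_sample"),
+                ignore_missing_files=True,
+            )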
+ + Args: + prediction_format: Format of prediction files + source_path: Path to directory containing prediction files + do_visualization: Whether to generate visualizations + ignore_missing_files: Whether to ignore missing files + ignore_missing_predictions: Whether to ignore missing predictions + true_labels: Set of DocItemLabel to use for ground truth visualization + pred_labels: Set of DocItemLabel to use for prediction visualization + """ + super().__init__( + do_visualization=do_visualization, + ignore_missing_predictions=ignore_missing_predictions, + true_labels=true_labels, + pred_labels=pred_labels, + ) + self._supported_prediction_formats = [ PredictionFormats.DOCTAGS, PredictionFormats.MARKDOWN, @@ -33,48 +67,63 @@ def __init__( PredictionFormats.YAML, ] + # Validate the prediction format + if prediction_format not in self._supported_prediction_formats: + msg = f"Unsupported file prediction format: {prediction_format}." + msg += f" The prediction format must be one of {self._supported_prediction_formats}" + raise RuntimeError(msg) + # Read the input self._prediction_format = prediction_format self._prediction_source_path = source_path self._ignore_missing_files = ignore_missing_files - # Validate the prediction format - if self._prediction_format not in self._supported_prediction_formats: - msg = f"Unsupported file prediction format: {self._prediction_format}." - msg += f" The prediction format must be one of {self._supported_prediction_formats}" - raise RuntimeError(msg) - # Validate if the source_path exists if not self._prediction_source_path.is_dir(): raise RuntimeError(f"Missing source path: {self._prediction_source_path}") - @abstractmethod def info(self) -> Dict: + """Get information about the prediction provider.""" return { - "supported_prediction_formats": self._supported_prediction_formats, - "prediction_format": self._prediction_format, - "source_path": self._prediction_source_path, + "supported_prediction_formats": [ + fmt.value for fmt in self._supported_prediction_formats + ], + "prediction_format": self._prediction_format.value, + "source_path": str(self._prediction_source_path), } - @abstractmethod def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: + """ + Generate a prediction by reading from a file. 
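+
+        The prediction is looked up as <source_path>/<doc_id>.<ext>, where the
+        extension follows the configured format: .dt (DOCTAGS), .md (MARKDOWN),
+        .json (JSON) or .yaml/.yml (YAML).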
+ + Args: + record: Input dataset record + + Returns: + Dataset record with prediction + """ doc_id = record.doc_id raw = None + pred_doc = None + + # Load document based on prediction format if self._prediction_format == PredictionFormats.DOCTAGS: pred_doc = self._load_doctags_doc(doc_id) elif self._prediction_format == PredictionFormats.MARKDOWN: - pred_doc = None raw = self._load_md_raw(doc_id) elif self._prediction_format == PredictionFormats.JSON: pred_doc = self._load_json_doc(doc_id) elif self._prediction_format == PredictionFormats.YAML: pred_doc = self._load_yaml_doc(doc_id) - if pred_doc is None: - status = ConversionStatus.FAILURE - else: - status = ConversionStatus.SUCCESS + # Set status based on whether document was loaded + status = ( + ConversionStatus.SUCCESS + if pred_doc is not None + else ConversionStatus.FAILURE + ) + # Create prediction record pred_record = self.create_dataset_record_with_prediction( record, pred_doc, @@ -84,61 +133,120 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: return pred_record @property - @abstractmethod def prediction_format(self) -> PredictionFormats: - r""" """ + """Get the prediction format.""" return self._prediction_format def _load_doctags_doc(self, doc_id: str) -> Optional[DoclingDocument]: - r"""Load doctags file into DoclingDocument""" + """ + Load doctags file into DoclingDocument. + + Args: + doc_id: Document ID + + Returns: + DoclingDocument or None if file not found + """ # Read the doctags file doctags_fn = self._prediction_source_path / f"{doc_id}.dt" if self._ignore_missing_files and not doctags_fn.is_file(): return None - with open(doctags_fn, "r") as fd: - doctags = fd.read() - - # Check if an optional page image is present - page_image_fn = self._prediction_source_path / f"{doc_id}.png" - page_image = None - if page_image_fn.is_file(): - page_image = Image.open(page_image_fn) + try: + with open(doctags_fn, "r") as fd: + doctags = fd.read() + + # Check if an optional page image is present + page_image_fn = self._prediction_source_path / f"{doc_id}.png" + page_image = None + if page_image_fn.is_file(): + page_image = Image.open(page_image_fn) + + # Build DoclingDocument + doctags_page = DocTagsPage(tokens=doctags, image=page_image) + doctags_doc = DocTagsDocument(pages=[doctags_page]) + doc = DoclingDocument(name=doc_id) + doc.load_from_doctags(doctags_doc) + + return doc + except Exception as e: + _log.error(f"Error loading doctags document {doc_id}: {str(e)}") + if not self._ignore_missing_files: + raise + return None - # Build DoclingDocument - doctags_page = DocTagsPage(tokens=doctags, image=page_image) - doctags_doc = DocTagsDocument(pages=[doctags_page]) - doc = DoclingDocument(name=doc_id) - doc.load_from_doctags(doctags_doc) + def _load_json_doc(self, doc_id: str) -> Optional[DoclingDocument]: + """ + Load DoclingDocument from JSON. 
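+
+        The file is expected at <source_path>/<doc_id>.json and is read with
+        DoclingDocument.load_from_json (see the loader body below).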
- return doc + Args: + doc_id: Document ID - def _load_json_doc(self, doc_id: str) -> Optional[DoclingDocument]: - r"""Load DoclingDocument from json""" + Returns: + DoclingDocument or None if file not found + """ json_fn = self._prediction_source_path / f"{doc_id}.json" if self._ignore_missing_files and not json_fn.is_file(): return None - doc: DoclingDocument = DoclingDocument.load_from_json(json_fn) - return doc + + try: + doc: DoclingDocument = DoclingDocument.load_from_json(json_fn) + return doc + except Exception as e: + _log.error(f"Error loading JSON document {doc_id}: {str(e)}") + if not self._ignore_missing_files: + raise + return None def _load_yaml_doc(self, doc_id: str) -> Optional[DoclingDocument]: - r"""Load DoclingDocument from yaml""" + """ + Load DoclingDocument from YAML. + + Args: + doc_id: Document ID + + Returns: + DoclingDocument or None if file not found + """ + # Try with .yaml extension yaml_fn = self._prediction_source_path / f"{doc_id}.yaml" + + # If not found, try with .yml extension if not yaml_fn.is_file(): - # Try alternative yaml extension yaml_fn = self._prediction_source_path / f"{doc_id}.yml" + if self._ignore_missing_files and not yaml_fn.is_file(): return None - doc: DoclingDocument = DoclingDocument.load_from_yaml(yaml_fn) - return doc + try: + doc: DoclingDocument = DoclingDocument.load_from_yaml(yaml_fn) + return doc + except Exception as e: + _log.error(f"Error loading YAML document {doc_id}: {str(e)}") + if not self._ignore_missing_files: + raise + return None def _load_md_raw(self, doc_id: str) -> Optional[str]: - r"""Load the markdown content""" + """ + Load the markdown content. + + Args: + doc_id: Document ID + + Returns: + Markdown content or None if file not found + """ md_fn = self._prediction_source_path / f"{doc_id}.md" if self._ignore_missing_files and not md_fn.is_file(): return None - with open(md_fn, "r") as fd: - md = fd.read() - return md + try: + with open(md_fn, "r") as fd: + md = fd.read() + return md + except Exception as e: + _log.error(f"Error loading markdown document {doc_id}: {str(e)}") + if not self._ignore_missing_files: + raise + return None diff --git a/docling_eval/prediction_providers/tableformer_provider.py b/docling_eval/prediction_providers/tableformer_provider.py index b4ebcaa6..0d3ff4ff 100644 --- a/docling_eval/prediction_providers/tableformer_provider.py +++ b/docling_eval/prediction_providers/tableformer_provider.py @@ -2,7 +2,7 @@ import logging from io import BytesIO from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Set, Tuple, Union import numpy as np from docling.datamodel.base_models import ( @@ -23,7 +23,6 @@ from docling_core.types import DoclingDocument from docling_core.types.doc import DocItemLabel, TableCell, TableData, TableItem from docling_core.types.io import DocumentStream -from docling_parse.pdf_parsers import pdf_parser_v2 from PIL import Image from docling_eval.datamodels.dataset_record import ( @@ -40,54 +39,126 @@ class TableFormerPredictionProvider(BasePredictionProvider): - def __init__(self, do_visualization: bool = False): - super().__init__(do_visualization=do_visualization) - self.tf_updater = TableFormerUpdater(TableFormerMode.ACCURATE) + """ + Prediction provider that uses TableFormer for table structure prediction. + + This provider is specialized for predicting table structures in documents. 
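+    Ground-truth TableData on each TableItem is replaced with the TableFormer
+    prediction; all other document content passes through unchanged.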
+ """ + + def __init__( + self, + mode: TableFormerMode = TableFormerMode.ACCURATE, + num_threads: int = 16, + artifacts_path: Optional[Path] = None, + do_visualization: bool = False, + ignore_missing_predictions: bool = True, + true_labels: Optional[Set[DocItemLabel]] = None, + pred_labels: Optional[Set[DocItemLabel]] = None, + ): + """ + Initialize the TableFormer prediction provider. + + Args: + mode: TableFormer prediction mode + num_threads: Number of threads for prediction + artifacts_path: Path to artifacts + do_visualization: Whether to generate visualizations + ignore_missing_predictions: Whether to ignore missing predictions + true_labels: Set of DocItemLabel to use for ground truth visualization + pred_labels: Set of DocItemLabel to use for prediction visualization + """ + super().__init__( + do_visualization=do_visualization, + ignore_missing_predictions=ignore_missing_predictions, + true_labels=true_labels, + pred_labels=pred_labels, + ) + self.tf_updater = TableFormerUpdater(mode, num_threads, artifacts_path) @property def prediction_format(self) -> PredictionFormats: + """Get the prediction format.""" return PredictionFormats.DOCLING_DOCUMENT def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: - r""" """ - assert ( - record.ground_truth_doc is not None - ), "true_doc must be given for TableFormer prediction provider to work." + """ + Generate a prediction for table structure. - if record.mime_type == "application/pdf": - assert isinstance(record.original, DocumentStream) + Args: + record: Input dataset record - updated, pred_doc = self.tf_updater.replace_tabledata( - copy.deepcopy(record.original.stream), record.ground_truth_doc - ) - elif record.mime_type == "image/png": - updated, pred_doc = self.tf_updater.replace_tabledata_with_page_tokens( - record.ground_truth_doc, - record.ground_truth_page_images, - ) - else: + Returns: + Dataset record with prediction + + Raises: + RuntimeError: If ground truth doc is not available or if mime type is unsupported + """ + if record.ground_truth_doc is None: raise RuntimeError( - "TableFormerPredictionProvider is missing data to predict on." + "true_doc must be given for TableFormer prediction provider to work." ) - if updated is False: - status = ConversionStatus.FAILURE - else: - status = ConversionStatus.SUCCESS + updated = False + pred_doc = None - pred_record = self.create_dataset_record_with_prediction( - record, - pred_doc, - None, - ) - pred_record.status = status - return pred_record + try: + if record.mime_type == "application/pdf": + if not isinstance(record.original, DocumentStream): + raise RuntimeError( + "Original document must be a DocumentStream for PDF files" + ) + + # Process PDF + updated, pred_doc = self.tf_updater.replace_tabledata( + copy.deepcopy(record.original.stream), record.ground_truth_doc + ) + + elif record.mime_type == "image/png": + # Process image + updated, pred_doc = self.tf_updater.replace_tabledata_with_page_tokens( + record.ground_truth_doc, + record.ground_truth_page_images, + ) + + else: + raise RuntimeError( + f"Unsupported mime type: {record.mime_type}. 
TableFormerPredictionProvider supports 'application/pdf' and 'image/png'"
+                )
+
+            # Set status based on update success
+            status = ConversionStatus.SUCCESS if updated else ConversionStatus.FAILURE
+
+        except Exception as e:
+            _log.error(f"Error in TableFormer prediction: {str(e)}")
+            status = ConversionStatus.FAILURE
+            if not self.ignore_missing_predictions:
+                raise
+            pred_doc = record.ground_truth_doc.model_copy(
+                deep=True
+            )  # Use copy of ground truth as fallback
+
+        # Create prediction record
+        data = {
+            **record.as_record_dict(),
+            "predicted_doc": pred_doc,
+            "original_prediction": None,
+            "prediction_format": self.prediction_format,
+            "predictor_info": self.info(),
+            "status": status,
+        }
+        return DatasetRecordWithPrediction.model_validate(data)

     def info(self) -> Dict:
+        """Get information about the prediction provider."""
         return {"asset": "TableFormer", "version": docling_models_version()}


 class TableFormerUpdater:
+    """
+    Utility class for updating table data using TableFormer.
+
+    This class handles the prediction of table structures using the TableFormer model.
+    """

     def __init__(
         self,
@@ -95,8 +166,15 @@ def __init__(
         num_threads: int = 16,
         artifacts_path: Optional[Path] = None,
     ):
-        r""" """
-        # Init the TableFormer model
+        """
+        Initialize the TableFormer updater.
+
+        Args:
+            mode: TableFormer prediction mode
+            num_threads: Number of threads for prediction
+            artifacts_path: Path to artifacts
+        """
+        # Initialize the TableFormer model
         table_structure_options = TableStructureOptions(mode=mode)
         accelerator_options = AcceleratorOptions(
             num_threads=num_threads, device=AcceleratorDevice.AUTO
@@ -107,55 +185,87 @@ def __init__(
             options=table_structure_options,
             accelerator_options=accelerator_options,
         )
-        _log.info("Initialize %s", mode)
+        _log.info(f"Initialized TableFormer in {mode} mode")
+
+    def to_np(self, pil_image: Image.Image) -> np.ndarray:
+        """
+        Convert PIL image to NumPy array in BGR format.
+
+        Args:
+            pil_image: PIL image
+
+        Returns:
+            NumPy array in BGR format

-    def to_np(self, pil_image: Image.Image):
+        Raises:
+            ValueError: If image format is unsupported
+        """
         # Convert to NumPy array
         np_image = np.array(pil_image)

         # Handle different formats
         if np_image.ndim == 3:  # RGB or RGBA image
             if np_image.shape[2] == 4:  # RGBA image
-                # Discard alpha channel and convert to BGR
-                np_image = np_image[:, :, :3]  # Keep only RGB channels
+                # Discard alpha channel
+                np_image = np_image[:, :, :3]

             # Convert RGB to BGR by reversing the last axis
             np_image = np_image[:, :, ::-1]
-
             return np_image
         else:
             raise ValueError("Unsupported image format")

-    def get_page_cells(self, filename: str):
+    def get_page_cells(self, filename: str) -> Optional[Dict]:
+        """
+        Parse PDF to extract page cells.
+
+        Args:
+            filename: Path to PDF file
+
+        Returns:
+            Parsed document dictionary or None if parsing failed
+        """
+        from docling_parse.pdf_parsers import pdf_parser_v2
+
         parser = pdf_parser_v2("fatal")
         try:
             key = "key"
             parser.load_document(key=key, filename=filename)
-
             parsed_doc = parser.parse_pdf_from_key(key=key)
-
             parser.unload_document(key)
             return parsed_doc
-
         except Exception as exc:
-            _log.error(exc)
+            _log.error(f"Error parsing PDF: {exc}")
+            return None
+
+    def _make_internal_page_with_table(self, input_doc, prov) -> Page:
+        """
+        Create a page object with a table from input document.
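+
+        A single TABLE cluster is derived from the provenance bbox, and the
+        page's text cells that overlap that cluster are attached to it for
+        cell matching.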
-        return None
+
+        Args:
+            input_doc: Input document
+            prov: Provenance item

-    def _make_internal_page_with_table(self, input_doc, prov):
+        Returns:
+            Page object with table
+        """
         page = Page(page_no=prov.page_no - 1)
         page._backend = input_doc._backend.load_page(page.page_no)
-        page.cells = list(page._backend.get_text_cells())
-        page.size = page._backend.get_size()

+        # Only proceed if the page backend is valid
         if page._backend is not None and page._backend.is_valid():
+            page.cells = list(page._backend.get_text_cells())
+            page.size = page._backend.get_size()
+
+            # Create cluster for table
             cluster = Cluster(
                 id=0,
                 label=DocItemLabel.TABLE,
                 bbox=prov.bbox.to_top_left_origin(page.size.height),
             )
+
+            # Add cells that overlap with the cluster
             for cell in page.cells:
                 overlap = cell.rect.to_bounding_box().intersection_area_with(
                     cluster.bbox
                 )
@@ -171,48 +281,70 @@ def _make_internal_page_with_table(self, input_doc, prov):

     def replace_tabledata(
         self,
-        pdf_path: Path | BytesIO,
+        pdf_path: Union[Path, BytesIO],
         true_doc: DoclingDocument,
     ) -> Tuple[bool, DoclingDocument]:
+        """
+        Replace table data in document with predictions from TableFormer.

-        updated = False
+        Args:
+            pdf_path: Path to PDF file or PDF data as BytesIO
+            true_doc: Document with ground truth tables

-        # deep copy of the true-document
+        Returns:
+            Tuple of (success, updated_document)
+        """
+        # Make a deep copy of the document
         pred_doc = true_doc.model_copy(deep=True)

+        # Parse the PDF
         input_doc = get_input_document(pdf_path)
         if not input_doc.valid:
-            _log.error("could not parse pdf-file")
+            _log.error("Could not parse PDF file")
             return False, pred_doc

         conv_res = ConversionResult(input=input_doc)
+        updated = False

-        # parsed_doc = self.get_page_cells(str(pdf_path))
-        # if parsed_doc is None:
-        #    log.error("could not parse pdf-file")
-        #    return False, pred_doc
-
-        # Replace the groundtruth tables with predictions from TableFormer
+        # Process each table item in the document
         for item, level in pred_doc.iterate_items():
             if isinstance(item, TableItem):
                 for prov in item.prov:
-                    page = self._make_internal_page_with_table(input_doc, prov)
-
-                    page = next(self._docling_tf_model(conv_res, [page]))  # type: ignore
-                    tbl: Table = page.predictions.tablestructure.table_map[0]
-                    table_data: TableData = TableData(
-                        num_rows=tbl.num_rows,
-                        num_cols=tbl.num_cols,
-                        table_cells=tbl.table_cells,
-                    )
-
-                    item.data = table_data
-                    page._backend.unload()
-
-                    updated = True
-
-                    # md = item.export_to_markdown()
-                    # print("prediction from table-former: \n\n", md)
+                    page = None
+                    try:
+                        # Create page with table
+                        page = self._make_internal_page_with_table(input_doc, prov)
+
+                        # Materialize the model output (also satisfies mypy)
+                        model_results = list(self._docling_tf_model(conv_res, [page]))
+
+                        if model_results and hasattr(
+                            model_results[0].predictions, "tablestructure"
+                        ):
+                            page = model_results[0]
+                            if (
+                                page.predictions.tablestructure is not None
+                                and hasattr(
+                                    page.predictions.tablestructure, "table_map"
+                                )
+                                and page.predictions.tablestructure.table_map
+                            ):
+                                tbl: Table = page.predictions.tablestructure.table_map[
+                                    0
+                                ]
+                                table_data: TableData = TableData(
+                                    num_rows=tbl.num_rows,
+                                    num_cols=tbl.num_cols,
+                                    table_cells=tbl.table_cells,
+                                )
+
+                                # Update item data
+                                item.data = table_data
+                                updated = True
+
+                    finally:
+                        # Ensure page backend is unloaded to free resources;
+                        # page may still be None if page creation itself failed
+                        if page is not None and page._backend is not None:
+                            page._backend.unload()

         return updated, pred_doc

@@ -222,21 +354,29 @@ def _tf_predict_with_page_tokens(
         page_tokens: PageTokens,
table_bbox: Tuple[float, float, float, float], image_scale: float = 1.0, - ): - r""" - Test the TFPredictor + ) -> TableData: """ - table_bboxes = [[table_bbox[0], table_bbox[1], table_bbox[2], table_bbox[3]]] + Predict table structure from image using page tokens. - ocr_page = page_tokens.dict() + Args: + page_image: Page image + page_tokens: Page tokens + table_bbox: Table bounding box coordinates (l, t, r, b) + image_scale: Image scale factor + Returns: + Predicted table data + """ + # Prepare input for TableFormer + table_bboxes = [[table_bbox[0], table_bbox[1], table_bbox[2], table_bbox[3]]] + ocr_page = page_tokens.dict() ocr_page["image"] = self.to_np(page_image) ocr_page["table_bboxes"] = table_bboxes - # TODO: Here we bypass docling API and we steal the tf_preditor private object :-( + # Get predictor from model predictor = self._docling_tf_model.tf_predictor - # Loop over the iocr_pages + # Run prediction tf_output = predictor.multi_table_predict( ocr_page, table_bboxes=table_bboxes, @@ -244,24 +384,23 @@ def _tf_predict_with_page_tokens( correct_overlapping_cells=False, sort_row_col_indexes=True, ) - # print("tf-output: ", json.dumps(tf_output, indent=2)) + # Extract table data table_out = tf_output[0] - - do_cell_matching = True - table_cells = [] - for element in table_out["tf_responses"]: + # Process each cell + for element in table_out["tf_responses"]: tc = TableCell.model_validate(element) - if do_cell_matching and tc.bbox is not None: + if tc.bbox is not None: tc.bbox = tc.bbox.scaled(1 / image_scale) table_cells.append(tc) - # Retrieving cols/rows, after post processing: + # Get table dimensions num_rows = table_out["predict_details"]["num_rows"] num_cols = table_out["predict_details"]["num_cols"] + # Create table data table_data = TableData( num_rows=num_rows, num_cols=num_cols, table_cells=table_cells ) @@ -274,57 +413,70 @@ def replace_tabledata_with_page_tokens( true_page_images: List[Image.Image], page_tokens: Optional[PageTokens] = None, ) -> Tuple[bool, DoclingDocument]: + """ + Replace table data in document using page tokens and images. 
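+
+        This variant needs no PDF backend: each table is predicted from the
+        rendered page image, and, when page_tokens is omitted, tokens are
+        synthesized from the ground-truth table cells. A sketch of the call
+        as issued by TableFormerPredictionProvider.predict:
+
+            updated, pred_doc = tf_updater.replace_tabledata_with_page_tokens(
+                record.ground_truth_doc,
+                record.ground_truth_page_images,
+            )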
-        updated = False
+
+        Args:
+            true_doc: Document with ground truth tables
+            true_page_images: Page images
+            page_tokens: Optional page tokens

-        # deep copy of the true-document
+        Returns:
+            Tuple of (success, updated_document)
+        """
+        # Make a deep copy of the document
         pred_doc = copy.deepcopy(true_doc)
+        updated = False
+
+        # Ensure document has exactly one page
+        if len(pred_doc.pages) != 1:
+            _log.error("Document must have exactly one page")
+            return False, pred_doc

-        assert len(pred_doc.pages) == 1
         page_size = pred_doc.pages[1].size

-        # Replace the groundtruth tables with predictions from TableFormer
+        # Process each table item
         for item, level in pred_doc.iterate_items():
             if isinstance(item, TableItem):
                 for prov in item.prov:
-
-                    md = item.export_to_markdown()
-                    print("groundtruth: \n\n", md)
-
-                    page_image = true_page_images[prov.page_no - 1]
-                    # page_image.show()
-
-                    # Ensure that the bbox will be inside the min/max ranges
-                    table_bbox = (
-                        max(prov.bbox.l, 0.0),
-                        max(prov.bbox.b, 0.0),
-                        min(prov.bbox.r, page_size.width),
-                        min(prov.bbox.t, page_size.height),
-                    )
-
-                    if page_tokens is None:
-                        ptokens = []
-                        for ix, table_cell in enumerate(item.data.table_cells):
-                            pt = PageToken(
-                                bbox=table_cell.bbox, text=table_cell.text, id=ix
-                            )
-                            ptokens.append(pt)
-                        page_tokens = PageTokens(
-                            tokens=ptokens,
-                            height=prov.bbox.height,
-                            width=prov.bbox.width,
-                        )
-
-                    table_data = self._tf_predict_with_page_tokens(
-                        page_image=page_image,
-                        page_tokens=page_tokens,
-                        table_bbox=table_bbox,
-                    )
-                    item.data = table_data
-
-                    updated = True
-
-                    # md = item.export_to_markdown()
-                    # print("prediction from table-former: \n\n", md)
+                    try:
+                        # Get page image
+                        page_image = true_page_images[prov.page_no - 1]
+
+                        # Ensure bounding box is within page bounds
+                        table_bbox = (
+                            max(prov.bbox.l, 0.0),
+                            max(prov.bbox.b, 0.0),
+                            min(prov.bbox.r, page_size.width),
+                            min(prov.bbox.t, page_size.height),
+                        )
+
+                        # Create page tokens if not provided; keep them local
+                        # so each table derives tokens from its own cells
+                        tokens_for_table = page_tokens
+                        if tokens_for_table is None:
+                            ptokens = []
+                            for ix, table_cell in enumerate(item.data.table_cells):
+                                pt = PageToken(
+                                    bbox=table_cell.bbox, text=table_cell.text, id=ix
+                                )
+                                ptokens.append(pt)
+                            tokens_for_table = PageTokens(
+                                tokens=ptokens,
+                                height=prov.bbox.height,
+                                width=prov.bbox.width,
+                            )
+
+                        # Predict table data
+                        table_data = self._tf_predict_with_page_tokens(
+                            page_image=page_image,
+                            page_tokens=tokens_for_table,
+                            table_bbox=table_bbox,
+                        )
+
+                        # Update item data
+                        item.data = table_data
+                        updated = True
+                    except Exception as e:
+                        _log.error(f"Error predicting table: {str(e)}")
+                        raise

         return updated, pred_doc
diff --git a/tests/test_dataset_builder.py b/tests/test_dataset_builder.py
index 035e17cf..be174462 100644
--- a/tests/test_dataset_builder.py
+++ b/tests/test_dataset_builder.py
@@ -111,8 +111,8 @@ def test_run_doclaynet_with_doctags_fileprovider():

     dataset_layout = DocLayNetV1DatasetBuilder(
         # prediction_provider=docling_provider,
-        target=target_path
-        / "gt_dataset",
+        target=target_path / "gt_dataset",
+        end_index=5,
     )

     dataset_layout.retrieve_input_dataset()  # fetches the source dataset from HF
@@ -222,8 +222,8 @@ def test_run_doclaynet_v1_e2e():

     dataset_layout = DocLayNetV1DatasetBuilder(
         # prediction_provider=docling_provider,
-        target=target_path
-        / "gt_dataset",
+        target=target_path / "gt_dataset",
+        end_index=80,
     )

     dataset_layout.retrieve_input_dataset()  # fetches the source dataset from HF
@@ -365,18 +365,21 @@ def test_run_pubtabnet_builder():
     dataset.retrieve_input_dataset()  # fetches the source dataset from HF

     dataset.save_to_disk(
-        chunk_size=5,
max_num_chunks=1
+        chunk_size=80
     )  # does all the job of iterating the dataset, making GT+prediction records, and saving them in shards as parquet.

     tableformer_provider.create_prediction_dataset(
         name=dataset.name,
+        split="val",
         gt_dataset_dir=target_path / "gt_dataset",
         target_dataset_dir=target_path / "eval_dataset",
+        end_index=25,
     )

     evaluate(
         modality=EvaluationModality.TABLE_STRUCTURE,
-        benchmark=BenchMarkNames.DPBENCH,
+        benchmark=BenchMarkNames.PUBTABNET,
         idir=target_path / "eval_dataset",
         odir=target_path / "evaluations" / EvaluationModality.TABLE_STRUCTURE.value,
+        split="val",
     )

From d1bccd077102921063ec47ce0a7866e6044290c6 Mon Sep 17 00:00:00 2001
From: Christoph Auer
Date: Fri, 28 Mar 2025 11:00:06 +0100
Subject: [PATCH 4/4] Cleanup of tests and more fixes

Signed-off-by: Christoph Auer

---
 .../dataset_builders/doclaynet_v1_builder.py  |  1 +
 .../base_prediction_provider.py               | 21 +++++++--
 docling_eval/utils/utils.py                   | 21 +++++++++
 ...3fd9c23543fbbafc9408cbb372b1f4898095a66.dt |  8 ----
 ...074e8a6810bc4b96f1e7aab53051d0b5adfe38a.dt | 12 ------
 ...49c038171a06ddfcac892c3c6d7e6b4091c642.txt |  9 ++++
 ...33e7e7216d615b9797444f3a220b8768636ff26.dt | 30 -------------
 ...a8b8b84b747c295dd10a639e2b5265ac258cf5.txt | 10 +++++
 ...a87b7cb40eb65eee4483725c1a85c4dc72d141.txt | 18 ++++++++
 ...4eec02c7efd931db5e2a34e66444ee8b8e5079.txt | 14 ++++++
 ...497c8236af331fd67bd13b7f7e506d42b4a7e2.txt | 17 ++++++++
 tests/test_dataset_builder.py                 | 43 ++++++++++++-------
 12 files changed, 135 insertions(+), 69 deletions(-)
 delete mode 100644 tests/data/doclaynet_v1_doctags_sample/0e26a4b4e7ad14e1d25385c173fd9c23543fbbafc9408cbb372b1f4898095a66.dt
 delete mode 100644 tests/data/doclaynet_v1_doctags_sample/0ef90754c44461177b7d29ca9074e8a6810bc4b96f1e7aab53051d0b5adfe38a.dt
 create mode 100644 tests/data/doclaynet_v1_doctags_sample/132a855ee8b23533d8ae69af0049c038171a06ddfcac892c3c6d7e6b4091c642.txt
 delete mode 100644 tests/data/doclaynet_v1_doctags_sample/1a89e84c0f1a1d0c841f8807c33e7e7216d615b9797444f3a220b8768636ff26.dt
 create mode 100644 tests/data/doclaynet_v1_doctags_sample/2b49edc9d0a47e4efaaeabf907a8b8b84b747c295dd10a639e2b5265ac258cf5.txt
 create mode 100644 tests/data/doclaynet_v1_doctags_sample/61d269cb1f2c976191469b891aa87b7cb40eb65eee4483725c1a85c4dc72d141.txt
 create mode 100644 tests/data/doclaynet_v1_doctags_sample/a5acc4c1c47a19543362fccf014eec02c7efd931db5e2a34e66444ee8b8e5079.txt
 create mode 100644 tests/data/doclaynet_v1_doctags_sample/b13563b6fd80bed51928fc4b42497c8236af331fd67bd13b7f7e506d42b4a7e2.txt

diff --git a/docling_eval/dataset_builders/doclaynet_v1_builder.py b/docling_eval/dataset_builders/doclaynet_v1_builder.py
index c2d725ba..861036d8 100644
--- a/docling_eval/dataset_builders/doclaynet_v1_builder.py
+++ b/docling_eval/dataset_builders/doclaynet_v1_builder.py
@@ -381,6 +381,7 @@ def iterate(self) -> Iterable[DatasetRecord]:
                     )
                     exported_rows += 1

+                    _log.debug(page_hash)
                     yield record

                 except Exception as ex:
diff --git a/docling_eval/prediction_providers/base_prediction_provider.py b/docling_eval/prediction_providers/base_prediction_provider.py
index e480a993..2bb77f5a 100644
--- a/docling_eval/prediction_providers/base_prediction_provider.py
+++ b/docling_eval/prediction_providers/base_prediction_provider.py
@@ -19,7 +19,11 @@
     DatasetRecordWithPrediction,
 )
 from docling_eval.datamodels.types import PredictionFormats
-from docling_eval.utils.utils import save_shard_to_disk, write_datasets_info
+from docling_eval.utils.utils import (
+    insert_images_from_pil,
+    save_shard_to_disk,
+
write_datasets_info, +) from docling_eval.visualisation.visualisations import save_comparison_html_with_clusters # Get logger @@ -139,12 +143,23 @@ def visualize_results( prediction_record.predicted_doc is not None and prediction_record.ground_truth_page_images ): + gt_doc = insert_images_from_pil( + prediction_record.ground_truth_doc, + prediction_record.ground_truth_pictures, + prediction_record.ground_truth_page_images, + ) + pred_doc = insert_images_from_pil( + prediction_record.predicted_doc, + prediction_record.predicted_pictures, + prediction_record.predicted_page_images, + ) + save_comparison_html_with_clusters( filename=target_dataset_dir / "visualizations" / f"{prediction_record.doc_id}.html", - true_doc=prediction_record.ground_truth_doc, - pred_doc=prediction_record.predicted_doc, + true_doc=gt_doc, + pred_doc=pred_doc, page_image=prediction_record.ground_truth_page_images[0], true_labels=self.true_labels, pred_labels=self.pred_labels, diff --git a/docling_eval/utils/utils.py b/docling_eval/utils/utils.py index a2594a50..96393d76 100644 --- a/docling_eval/utils/utils.py +++ b/docling_eval/utils/utils.py @@ -10,6 +10,7 @@ from typing import Any, Dict, List, Optional, Tuple import pandas as pd +import PIL.Image from bs4 import BeautifulSoup # type: ignore from datasets import Dataset, Features from datasets import Image as Features_Image @@ -333,6 +334,26 @@ def extract_images( return document, pictures, page_images +def insert_images_from_pil( + document: DoclingDocument, + pictures: List[PIL.Image.Image], + page_images: List[PIL.Image.Image], +): + + # Inject picture images + for pic_no, picture in enumerate(document.pictures): + if picture.image is not None: + if pic_no < len(pictures): + picture.image._pil = pictures[pic_no] + # Inject page images + for page_no, page in document.pages.items(): + if page.image is not None: + if (page_no - 1) < len(page_images): + page.image._pil = page_images[page_no - 1] + + return document + + def insert_images( document: DoclingDocument, pictures: List[Dict[str, Any]], diff --git a/tests/data/doclaynet_v1_doctags_sample/0e26a4b4e7ad14e1d25385c173fd9c23543fbbafc9408cbb372b1f4898095a66.dt b/tests/data/doclaynet_v1_doctags_sample/0e26a4b4e7ad14e1d25385c173fd9c23543fbbafc9408cbb372b1f4898095a66.dt deleted file mode 100644 index 52d0aaa5..00000000 --- a/tests/data/doclaynet_v1_doctags_sample/0e26a4b4e7ad14e1d25385c173fd9c23543fbbafc9408cbb372b1f4898095a66.dt +++ /dev/null @@ -1,8 +0,0 @@ - Problems with automatic license activation -If connections problems occur with the automatic license activation procedure, the system times out after 3 minutes with an error. -Check whether the personal computer that is used to connect to the Storwize V7000 GUI and activate the license can access the internet. If you cannot complete the automatic activation procedure, use the manual activation procedure that is described in 12.3.5, "Activate the license manually" on page 617. -Although authorization codes and encryption license keys use the same format (four groups of four hexadecimal digits), you can only use each of them in the appropriate activation process. If you use a license key when the system expects an authorization code, the system displays an error message, as shown in Figure 12-16. 
-Activate License Automatically Sent machine information to ibm.com Sent machine information to ibm.com Retrieved license keys Retrieved license keys Applied license keys Applied license keysProcessing of authorization code 1234-ABCD-DCBA-4321 has failed for the following reason: The authorization code cannot be decrypted. Processing of authorization code 1234-ABCD-DCBA-4321 has failed for the following reason: The authorization code cannot be decrypted.Ensure that you entered the authorization code correctly and retry the activation request. Ensure that you entered the authorization code correctly and retry the activation request. -616 -Implementing the IBM Storwize V7000 with IBM Spectrum Virtualize V8.2.1 - \ No newline at end of file diff --git a/tests/data/doclaynet_v1_doctags_sample/0ef90754c44461177b7d29ca9074e8a6810bc4b96f1e7aab53051d0b5adfe38a.dt b/tests/data/doclaynet_v1_doctags_sample/0ef90754c44461177b7d29ca9074e8a6810bc4b96f1e7aab53051d0b5adfe38a.dt deleted file mode 100644 index f59aba7e..00000000 --- a/tests/data/doclaynet_v1_doctags_sample/0ef90754c44461177b7d29ca9074e8a6810bc4b96f1e7aab53051d0b5adfe38a.dt +++ /dev/null @@ -1,12 +0,0 @@ - 2010 ANNUAL REPORT | 9 -horizontal wells drilled just to the Bossier may not always hold Haynesville rights. Therefore, Chesapeake and other producers have been drilling aggressively to hold all rights through the Haynesville before the initial three-year term of a typical lease expires. As a result, there has not been much drilling to the Bossier to date. However, once our leases are held by production (HBP) by Haynesville drilling (we expect to be largely complete with HBP drilling by year-end 2011 and completely finished by year-end 2012). We will begin developing the Bossier Shale more aggressively in 2013. In the Bossier play, we own 205,000 net leasehold acres and estimate we could drill up to 2,600 net wells in the years ahead. - -Generating the highest returns in the company, plays like the Oklahoma Colony Granite Wash inspire Chesapeake to find other liquids-rich opportunities. -Marcellus Shale - We first became aware of the Marcellus in 2005 when we were negotiating our $2.2 billion acquisition of Appalachia's second-largest natural gas producer, Columbia Natural Resources, LLC. In 2007 we aggressively accelerated our Marcellus leasehold acquisition efforts and began to prepare for our first drilling activities. By early 2008, we had determined the Marcellus could be prospective over an area of approximately 15 million net acres (approximately five times larger than the prospective Haynesville core area and 10 times larger than the Barnett core area). -After acquiring 1.8 million net leasehold acres, we entered into a joint venture agreement in late 2008 with Oso-based Statoil, one of the -largest and most respected European energy companies. In this transaction, we sold Statoil 32.5% of our Marcellus assets for $3.375 billion in cash and drilling carries. Today, having sold 32.5% of our original 1.8 million net leasehold acres, we have returned to owning 1.7 million net leasehold acres in the play and are the industry's leading leasehold owner, largest producer and most active developer. We are producing from more than 100 net wells in the Marcellus on our 1.7 million net acres, are currently drilling with 32 rigs and estimate we could drill up to 21,000 additional net wells in the years ahead. 
-Colony and Texas Panhandle Granite Wash - These liquids-rich plays generate the company's highest returns (routinely more than 100%) and provided the inspiration -The very significant upward trajectory of value creation that Chesapeake is on today is primarily driven by the quality of our assets, which feature dominant positions in 16 of the 20 most important major unconventional natural gas and liquids plays in the U.S. -for the company to find other liquids-rich plays in 2010. The Granite Wash, and other plays with liquids-rich gas production streams, provide the strongest economics in the industry today because they possess the best of both worlds: high-volume natural gas production along with significant volumes of highly valued liquids that dramatically increase investment returns. -We are producing from approximately 150 net Granite Wash wells, are currently drilling with 16 rigs and estimate we could drill up to 1,700 additional net wells on our 215,000 net leasehold acres in the years ahead. Based on current NYMEX futures prices for natural gas and oil, each Granite Wash well should generate approximately $11.5 million of present value (or up to an undiscounted total of $19.5 billion for all 1,700 wells), making it obvious why finding, leasing and developing more unconventional liquids-rich plays was Chesapeake's number one priority for 2010. We were very successful - \ No newline at end of file diff --git a/tests/data/doclaynet_v1_doctags_sample/132a855ee8b23533d8ae69af0049c038171a06ddfcac892c3c6d7e6b4091c642.txt b/tests/data/doclaynet_v1_doctags_sample/132a855ee8b23533d8ae69af0049c038171a06ddfcac892c3c6d7e6b4091c642.txt new file mode 100644 index 00000000..399df58b --- /dev/null +++ b/tests/data/doclaynet_v1_doctags_sample/132a855ee8b23533d8ae69af0049c038171a06ddfcac892c3c6d7e6b4091c642.txt @@ -0,0 +1,9 @@ + 8 +Leigh Taliaferro, M.D. General Surgeon Abilene, Texas +Leigh Taliaferro, M.D., values consistency. +The Abilene native started his practice 17 years ago and has developed a flourishing business as a general surgeon. He estimates that 90 percent of his practice is for abdominal surgery. With such a busy practice, he finds comfort in having a reliable banking partner. "I have almost every type of business, trust and personal account with First National Bank of Abilene," says Dr. Taliaferro. +"First National is immersed in this city - everywhere you go, they are involved with helping people with their business. It's because of the people who work there - they are leaders ... generous people who make their mark on the bank and on the community. While they may be the biggest bank in town, they sure don't act like it. It's like banking with friends." +Dr. Taliaferro has invested in First Financial Bankshares for more than a decade. "My stock has done nothing but go up in value. They are solid, sound businesspeople. I sleep up well at night knowing that my investments are in good hands." + +"While they may be the biggest bank in town, they sure don't act like it. It's like banking with friends. 
+ \ No newline at end of file diff --git a/tests/data/doclaynet_v1_doctags_sample/1a89e84c0f1a1d0c841f8807c33e7e7216d615b9797444f3a220b8768636ff26.dt b/tests/data/doclaynet_v1_doctags_sample/1a89e84c0f1a1d0c841f8807c33e7e7216d615b9797444f3a220b8768636ff26.dt deleted file mode 100644 index 381a6907..00000000 --- a/tests/data/doclaynet_v1_doctags_sample/1a89e84c0f1a1d0c841f8807c33e7e7216d615b9797444f3a220b8768636ff26.dt +++ /dev/null @@ -1,30 +0,0 @@ - NAVWEPS 00-80T-80 APPLICATION OF AERODYNAMICS TO SPECIFIC PROBLEMS OF FLYING -possibility of a refused takeoff before exceed- ing the refusal speed. To this end, the pilot must carefully evaluate airplane and power- plant performance and judge the acceleration of the airplane by the use of "line speeds." The accelerated motion of the airplane during takeoff roll will define certain relationships be- tween velocity and distance when the acceler- ation of the airplane is normal. By comparison of predicted and actual speeds at various points along the runway, the pilot can evaluate the acceleration and assess the takeoff perform- ance. -An example of an acceleration profile is shown by the second illustration of figure 6.12, where the variation of velocity and distance is defined for the case of uniformly accelerated motion, i.e., constant acceleration. While the case of uniformly accelerated motion doesnot correspond exactly to the takeoff performance of all airplanes, it is sufficiently applicable to illustrate the principle of line speeds and al- celeration checks. If the takeoff acceleration of the airplane were constant, the airplane would develop specific percentages of the take- off speed at specific percentages of the takeoff distance. Representative values from figure 6.12 are as follows: -Percent of takeoff -Percent of takeoff -Percent of takeoff -time -0 -0 -0 -50.0 -50.0 -50.0 -70.7 -70.7 -70.7 -86.5 -86.5 -86.5 -100 -100 -100 -As an example of this uniformly accelerated motion, the airplane upon reaching the half- way point of takeoff roll would have spent 70.7 percent of the total takeoff time and ac- celerated to 70.7 percent of the takeoff speed. -If the airplane has not reached a specific speed at a specific distance, it is obvious that the ac- celeration is below the predicted value and the airplane surely will not achieve the takeoff speed in the specified takeoff distance. Therefore, properly computed line speeds at various -points along the runway will allow the pilot to monitor the takeoff performance and recog- nize a deficiency of acceleration. Of course, a deficiency of acceleration must be recognized prior to reaching some point along the runway where takeoff cannot be safely achieved or refused. -The fundamental principles of refusal speeds and line speeds are applicable equally well to single-engine and multingene airplanes. How- ever, in the case of the multingene airplane additional consideration must be given to the decision to continue or refuse takeoff when engine failure occurs during the takeoff roll. If failure of one engine occurs prior to reaching the refusal speed, takeoff should be discon- nued and the airplane brought to a stop on the remaining runway. If failure of one engine occurs after exceeding the refusal speed, the airplane is committed to continue takeoff with the remaining engines operative or an unsafe caused takeoff. In some cases, the remaining runway may not be sufficient to allow acceleration to the takeoff speed and the airplane can either takeoff or stop on the runway remain- ing. 
To facilitate consideration of this prob- lem, several specific definitions are necessary. -(1) Takeoff and initial climb speed: A speed, usually a fixed percentage above the stall speed, a which the airplane will become airborne and best clear obstacles immediately after takeoff. For a particular airplane in the takeoff con- figuration, this speed (in EAs or CAS) is a function of gross weight but in no circumstances should it be less than the minimum directional control speed for the critical asymmetrical power condition. Generally, the takeoff and initial climb speed is referred to as the "Vz" speed. -(2) Critical engine failure speed: A speed achieved during the takeoff roll at which fail- ure of one engine will require the same distance to continue accelerating with the operative en- gines to accomplish safe takeoff or refuse takeoff and decelerate to a stop utilizing the airplane brakes. At critical engine failure -394 - \ No newline at end of file diff --git a/tests/data/doclaynet_v1_doctags_sample/2b49edc9d0a47e4efaaeabf907a8b8b84b747c295dd10a639e2b5265ac258cf5.txt b/tests/data/doclaynet_v1_doctags_sample/2b49edc9d0a47e4efaaeabf907a8b8b84b747c295dd10a639e2b5265ac258cf5.txt new file mode 100644 index 00000000..26fdebed --- /dev/null +++ b/tests/data/doclaynet_v1_doctags_sample/2b49edc9d0a47e4efaaeabf907a8b8b84b747c295dd10a639e2b5265ac258cf5.txt @@ -0,0 +1,10 @@ + 82 +Nissan Annual Report 2004 +The Company and its domestic consolidated subsidiaries have defined benefit plans, i.e., welfare pension fund plans ('WPPF'), tax-qualified pension plans and lump-sum payment plans, covering substantially all employees who are entitled to lump-sum or annuity payments, the amounts of which are determined by reference to their basic rates of pay, length of service, and the conditions under which termination occurs. Certain foreign consolidated subsidiaries have defined benefit and contribution plans. +The following table sets forth the funded and accrued status of the plans, and the amounts recognized in the consolidated balance sheets as of March 31, 2005 and 2004 for the Company's and the consolidated subsidiaries' defined benefit plans: +Millions of yen2004 Mar. 31, 20052003 Mar. 31, 2004Retirement benefit obligation:As of20042003Plan assets at fair value(%,121,260)(%,141,048)(%)Unfunded retirement benefit obligation:(716,445)(664,314)(6,695,748)Unrecognized net retirement benefit obligation at transition120,718131,6661,128,206Unrecognized actuarial gain or loss154,689152,8671,445,691Unrecognized prior service cost:(66,720)(61,833)(62,355)Net retirement benefit obligation(507,758)(441,614)(4,745,402)Prepaid pension cost:4456524,159Accrued retirement benefits:(508,203)(442,266)(4,749,561) +The substitutional portion of the benefits under the WPPF has been included in the amounts shown in the above table. +The Company received the approval from the Minister of Health, Labor and Welfare ('MLHW') in the year ended March 31, 2003 with respect to its application for exemption from the obligation for benefits related to future employee services under the substitutional portion of the WPPF. Certain domestic consolidated subsidiaries received the same approval from MLHW during the year ended March 31, 2004. 
In accordance with the transitional provision stipulated in "Practical Guidelines for Accounting for Retirement Benefits", the Company and the domestic consolidated subsidiaries accounted for the separation of the substitutional portion of the benefit obligation from the corporate portion of the benefit obligation under their WPPFs as of the dates of approval for their exemption assuming that the transfer to the Japanese government of the substitutional portion of the benefit obligation and related pension plan assets has been completed as of those dates. As a result, the Company recognized a loss of $3,945,945 million for the year ended March 31, 2003 and the domestic consolidated subsidiaries recognized an aggregate gain of $3,669 million and an aggregate loss of $1,587 million for the year ended March 31, 2004. The pension assets to be transferred were calculated at $3,757,770 million for the domestic consolidated subsidiaries at March 31, 2004 and $241,203 million for the Company at March 31, 2003. +The components of retirement benefit expenses for the years ended March 31, 2005, 2004 and 2003 are outlined as follows: +Millions of yen2004 Mar. 31, 20052002 Mar. 31, 2004Thousands of U.S. dollarsService cost:For the years ended200420022004Interest costMar. 31, 200351,54351,54351,543Expected return on plan assets:33,28833,01245,269311,013Amortization of net retirement benefit obligation at transition:(17,999)(15,523)(26,708)(168,215)Amortization of actuarial gain or loss:12,00918,69624,280112,234Amortization of prior service cost:1,29,2891,86,198114,644114,934Other(5,431)(7,049)(7,762)(50,757)Retirement benefit expenses:1795751,673(Gain) loss on return of the substitutional portion of welfare pension fund plans82,14691,77398,091767,720Total:(1,107)(5,594)30,945(10,346)$^{#}$81,039$^{#}$86,179$^{#}$129,036$^{#}$757,374$^{#}$757,374 + \ No newline at end of file diff --git a/tests/data/doclaynet_v1_doctags_sample/61d269cb1f2c976191469b891aa87b7cb40eb65eee4483725c1a85c4dc72d141.txt b/tests/data/doclaynet_v1_doctags_sample/61d269cb1f2c976191469b891aa87b7cb40eb65eee4483725c1a85c4dc72d141.txt new file mode 100644 index 00000000..fe2beae5 --- /dev/null +++ b/tests/data/doclaynet_v1_doctags_sample/61d269cb1f2c976191469b891aa87b7cb40eb65eee4483725c1a85c4dc72d141.txt @@ -0,0 +1,18 @@ + Linking consumers with any time, any place mobile banking +In today's increasingly wireless world, consumers are turning in record numbers to mobile devices for greater convenience and access to banking and information services. + +with the freedom of mobile devices, bank customers can instantly obtain account balances, transfer money and even view a mini-bank statement-or set up instant alerts to monitor their daily account balances, deposit notifications and other personalized information 24 hours a day, 7 days a week. +The exciting potential of wireless is creating unprecedented opportunities for banks to connect with their customers. In Western Europe, the number of mobile banking accounts is expected to reach 31.8 million by 2004. 1 Expanding wireless capabilities are also helping to drive growth in North America, where the number of wireless financial services users is projected to skyrocket to 35 million by 2005. 2 The Asia-Pacific region is forecast at 12 million subscribers of wireless financial services alone in 2003. 3 +The quickly evolving market for mobile banking represents a tremendous opportunity for Euronet Worldwide. 
Last spring we introduced Euronet ® Mobile Banking as the first financial application that offered both secure account access and a personalized accounting alerting system. Among our new mobile banking clients in 2000 were the Bank of Cyprus, for its branches in London and Greece, and the National Bank of Kuwait, for its Lebanon branch, who were both first to market in their regions. +To further strengthen our capabilities, we announced strategic alliances to market and deliver Euronet's suite of mobile banking solutions with Aether Systems, Inc. for the US market and with Sia Communications for the European, Middle Eastern and Asian markets. In addition, we formed similar regional strategic alliances with companies like Stet Hellas Telecommunications S.A., a Greek mobile operator and subsidiary of Telecom Italia Mobile (TIM). +As next-generation mobile technology brings higher data speeds, personalization and other enhancements, we believe the future of mobile banking presents great opportunities for Euronet. +$^{1}$International Data Corporation (September 2000), $^{2}$TowerGroup (September 2000), $^{3}$Meridien Research (August 1999) + + +National Bank of Kuwait-Lebanon +First-to-Market Mobile Banking +To broaden its customer and account base, the National Bank of Kuwait-Lebanon +(NBK-L) wanted to be first in their market with a mobile banking solution. In a tight race with a competing bank, Euronet's mobile solution was integrated quickly into the NBK-LS IT infrastructure, enabling the bank to be the first to deliver services in its market. +Together with GSM operator Libancell, NBK-LS new mobile banking system offers customers any time, any place access to their account information from their GSM telephones. +11 + \ No newline at end of file diff --git a/tests/data/doclaynet_v1_doctags_sample/a5acc4c1c47a19543362fccf014eec02c7efd931db5e2a34e66444ee8b8e5079.txt b/tests/data/doclaynet_v1_doctags_sample/a5acc4c1c47a19543362fccf014eec02c7efd931db5e2a34e66444ee8b8e5079.txt new file mode 100644 index 00000000..10fda7ef --- /dev/null +++ b/tests/data/doclaynet_v1_doctags_sample/a5acc4c1c47a19543362fccf014eec02c7efd931db5e2a34e66444ee8b8e5079.txt @@ -0,0 +1,14 @@ + HON INDUSTRIES Inc. and SUBSIDIARIES +Income Taxes +Significant components of the provision for income taxes are as follows: +(In thousands)200320022001Current:$ 49,721$ 38,966$ 32,393Federal4,1593,4732,442State53,88042,43934,835Deferred(1,054)6,7557,019$ 52,826$ 49,194$ 41,854-A reconciliation of the statutory federal income tax rate to the Company's effective income tax rate is as follows: +200320022001Federal statutory tax rate35.0%35.0%State taxes, net of federal tax effect1.81.6Credit for increasing research activities(2.0)(1.6)Extraterritorial income exclusion(0.5)(1.0)Other - net0.71.0Effective tax rate35.0%35.0%Shareholders' Equity and Earnings Per Share +Deferred income taxes reflect the net tax effects of temporary differences between the carrying amounts of assets and liabilities for financial reporting purposes and the amounts used for income tax purposes. 
Significant components of the Company's deferred tax liabilities and assets are as follows: +(In thousands)200320022001Net long-term deferred tax liabilities:$ (28,103)$ (34,398)$ (38,759)Tax over book depreciation1823,5813,197OPEB obligations4,9123,8212,519Compensation Goodwill(18,044)(14,173)(5,550)Other - net3,3204,055(1,039)Total net long-term deferred tax liabilities(37,733)(37,114)(39,632)Net current deferred tax assets:(2981,5171,119liability accruals4,7544,6174,002Vacation accrual-(3,766)(3,766)Integration accruals4,3435,1011,969Inventory differences5288213,302Deferred income Warranty accruals(5,462)(3,820)-Other - net2,8862,3691,606Total net current deferred tax assets6,982(504)6,708Net deferred tax14,32910,10114,940(liabilities) assets$ (23,404)$ (27,013)$ (24,692)The Company purchased 762,300; 614,580; and 1,472,937 shares of its common stock during 2003, 2002, and 2001, respectively. The par value method of accounting is used for common stock repurchases. The excess of the cost of shares acquired over their par value is allocated to Additional Paid-In Capital, with the excess charged to Retained Earnings. +Common Stock, $1 Par Value200320022001Authorized200,000,000200,000,000200,000,000Issued and outstanding58,238,51958,373,60758,672,933Preferred Stock, $1 Par Value2,000,0002,000,0002,000,000Authorized---Issued and outstanding--- +Numerators:20032002Numerators for both basic and diluted EPS net income$ 98,105,000$ 91,360,000Denominators:Denominator for basic EPS weighted-58,178,73958,789,851Potentially dilutive shares from stock option plans366,614231,220231,220Denominator for diluted EPS$ 58,455,35359,021,07159,021,071Earnings per share - basic$ 1.69$ 1.551.55Earnings per share - diluted$ 1.68$ 1.55 +Certain exercisable and nonexercisable stock options were not included in the computation of diluted EPS for fiscal year 2003 and 2002, because the option prices were greater that the average market prices for the applicable periods. The number of stock options outstanding which met this criterion for 2003 was 20,000, with a range of per share exercise prices of $42.49-$42.98; and for 2002 was 30,000, with a range of per share exercise prices of $28.25-$23.22. +Components of other comprehensive income (loss) consist of the following: +(In thousands)200320022001Foreign currency translation adjustments - net of tax$ 45$ -$ 109Change in unrealized gains (losses) on marketable securities - net of tax(690)(322)42Other comprehensive income (loss)$ (645)$ (322)$ 151 +49 + \ No newline at end of file diff --git a/tests/data/doclaynet_v1_doctags_sample/b13563b6fd80bed51928fc4b42497c8236af331fd67bd13b7f7e506d42b4a7e2.txt b/tests/data/doclaynet_v1_doctags_sample/b13563b6fd80bed51928fc4b42497c8236af331fd67bd13b7f7e506d42b4a7e2.txt new file mode 100644 index 00000000..568068aa --- /dev/null +++ b/tests/data/doclaynet_v1_doctags_sample/b13563b6fd80bed51928fc4b42497c8236af331fd67bd13b7f7e506d42b4a7e2.txt @@ -0,0 +1,17 @@ + REPORT OF INDEPENDENT CERTIFIED PUBLIC ACCOUNTANT S +To the Stockholders and the Board of Directors of Atrion Corporation: +We have audited the accompanying consolidated balance sheets of Atrion Corporation (a Delaware corporation) and Subsidiaries as of December 31, 2003 and 2002, and the related consolidated statements of income, changes in stockholders' equity and cash flows for the years then ended. These financial statements are the responsibility of the Company's management. Our responsibility is to express an opinion on these financial statements based on our audit. 
The financial statements of Atrion Corporation and Subsidiaries as of and for the year in the period ended December 31, 2001, were audited by other auditors who have ceased operations. Those auditors expressed an unqualified opinion on those financial statements in their report dated February 25, 2002. +We conducted our audits in accordance with auditing standards generally accepted in the United States of America. Those standards require that we plan and perform the audit to obtain reasonable assurance about whether the financial statements are free of material misstatement. An audit includes examining, on a test basis, evidence supporting the amounts and disclosures in the financial statements. An audit also includes assessing the accounting principles used and significant estimates made by management as well as evaluating the overall financial statement presentation. We believe that our audits provide a reasonable basis for our opinion. +In our opinion, the financial statements referred to above present fairly, in all material respects, the consolidated financial position of Atrion Corporation and Subsidiaries as of December 31, 2003 and 2002, and the consolidated results of their operations and their consolidated cash flows for the years then ended in conformity with accounting principles generally accepted in the United States of America. +As discussed above, the financial statements of Atrion Corporation and Subsidiaries as of December 31, 2001, and for the year then ended were audited by other auditors who have ceased operations. As described in Note 2, these financial statements have been revised to include the transitional disclosures required by Statement of Financial Accounting Standards No. 142, Goodwill and Other Intangible Assets, which was adopted by the Company as of January 1, 2002. Our audit procedures with respect to the disclosures in Note 2 with respect to 2001 included agreeing the previously reported net income to the previously issued financial statements and the adjustments to reported net income representing amortization expense (including any related tax effects) recognized in those periods related to goodwill to the Company's underlying records obtained from management. We also tested the mathematical accuracy of the reconciliation of adjusted net income to reported net income, and the related income-per-share amounts. In our opinion, the disclosures for 2001 in Note 2 are appropriate. However, we were not engaged to audit, review, or apply any procedures to the 2001 financial statements of the Company other than with respect to such disclosures and, accordingly, we do not express an opinion or any other form of assurance on the 2001 financial statements taken as a whole. + +Grant Thornton LLP Dallas, Texas February 13, 2004 +This is a copy of the audit report previously issued by Arthur Andersen LLP in connection with Atrion Corporation and Subsidiaries Annual Report for the year ended December 31, 2001. This audit report has not been issued by Arthur Andersen LLP in connection with this Annual Report. The consolidated balance sheets as of December 31, 2001 and 2000 and the consolidated statements of income and cash flows for the years ended December 31, 2000 and 1999 referred to in this report have not been included in the accompanying financial statements. 
+To the Stockholders and the Board of Directors of Atrion Corporation: +We have audited the accompanying consolidated balance sheets of Atrion Corporation (a Delaware corporation) and subsidiaries as of December 31, 2001 and 2000 and the related consolidated statements of income and cash flows for each of the three years in the period ended December 31, 2001. These financial statements are the responsibility of the Company's management. Our responsibility is to express an opinion on these financial statements based on our audits. +We conducted our audits in accordance with auditing standards generally accepted in the United States. Those standards require that we plan and perform the audit to obtain reasonable assurance about whether the financial statements are free of material misstatement. An audit includes examining, on a test basis, evidence supporting the amounts and disclosures in the financial statements. An audit also includes assessing the accounting principles used and significant estimates made by management as well as evaluating the overall financial statement presentation. We believe that our audits provide a reasonable basis for our opinion. +In our opinion, the financial statements referred to above present fairly, in all material respects, the financial position of Atrion Corporation and subsidiaries as of December 31, 2001 and 2000 and the results of their operations and their cash flows for each of the three years in the period ended December 31, 2001 in conformity with accounting principles generally accepted in the United States. + +Arthur Andersen LLP Atlanta, Georgia February 25, 2002 +23 + \ No newline at end of file diff --git a/tests/test_dataset_builder.py b/tests/test_dataset_builder.py index be174462..e16fe3e7 100644 --- a/tests/test_dataset_builder.py +++ b/tests/test_dataset_builder.py @@ -1,6 +1,7 @@ from pathlib import Path from typing import List, Optional +import pytest from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( EasyOcrOptions, @@ -69,7 +70,7 @@ def create_docling_prediction_provider( format_options={ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) }, - do_visualization=False, + do_visualization=True, ) @@ -79,11 +80,12 @@ def test_run_dpbench_e2e(): dataset_layout = DPBenchDatasetBuilder( target=target_path / "gt_dataset", + end_index=5, ) dataset_layout.retrieve_input_dataset() # fetches the source dataset from HF dataset_layout.save_to_disk( - chunk_size=5, max_num_chunks=1 + chunk_size=5 ) # does all the job of iterating the dataset, making GT+prediction records, and saving them in shards as parquet. 
docling_provider.create_prediction_dataset( @@ -105,12 +107,11 @@ def test_run_doclaynet_with_doctags_fileprovider(): file_provider = FilePredictionProvider( prediction_format=PredictionFormats.DOCTAGS, source_path=Path("./tests/data/doclaynet_v1_doctags_sample"), - do_visualization=False, + do_visualization=True, ignore_missing_files=True, ) dataset_layout = DocLayNetV1DatasetBuilder( - # prediction_provider=docling_provider, target=target_path / "gt_dataset", end_index=5, ) @@ -127,10 +128,10 @@ def test_run_doclaynet_with_doctags_fileprovider(): ) evaluate( - modality=EvaluationModality.MARKDOWN_TEXT, + modality=EvaluationModality.LAYOUT, benchmark=BenchMarkNames.DOCLAYNETV1, idir=target_path / "eval_dataset", - odir=target_path / "evaluations" / EvaluationModality.MARKDOWN_TEXT.value, + odir=target_path / "evaluations" / EvaluationModality.LAYOUT.value, ) @@ -140,11 +141,12 @@ def test_run_omnidocbench_e2e(): dataset_layout = OmniDocBenchDatasetBuilder( target=target_path / "gt_dataset", + end_index=5, ) dataset_layout.retrieve_input_dataset() # fetches the source dataset from HF dataset_layout.save_to_disk( - chunk_size=5, max_num_chunks=1 + chunk_size=5 ) # does all the job of iterating the dataset, making GT+prediction records, and saving them in shards as parquet. docling_provider.create_prediction_dataset( @@ -167,11 +169,12 @@ def test_run_dpbench_tables(): dataset_tables = DPBenchDatasetBuilder( target=target_path / "gt_dataset", + end_index=5, ) dataset_tables.retrieve_input_dataset() # fetches the source dataset from HF dataset_tables.save_to_disk( - chunk_size=5, max_num_chunks=1 + chunk_size=5 ) # does all the job of iterating the dataset, making GT+prediction records, and saving them in shards as parquet. tableformer_provider.create_prediction_dataset( @@ -194,6 +197,7 @@ def test_run_omnidocbench_tables(): dataset_tables = OmniDocBenchDatasetBuilder( target=target_path / "gt_dataset", + end_index=5, ) dataset_tables.retrieve_input_dataset() # fetches the source dataset from HF @@ -223,12 +227,12 @@ def test_run_doclaynet_v1_e2e(): dataset_layout = DocLayNetV1DatasetBuilder( # prediction_provider=docling_provider, target=target_path / "gt_dataset", - end_index=80, + end_index=5, ) dataset_layout.retrieve_input_dataset() # fetches the source dataset from HF dataset_layout.save_to_disk( - chunk_size=5, max_num_chunks=1 + chunk_size=5 ) # does all the job of iterating the dataset, making GT+prediction records, and saving them in shards as parquet. docling_provider.create_prediction_dataset( @@ -245,18 +249,20 @@ def test_run_doclaynet_v1_e2e(): ) +@pytest.mark.skip("Test needs local data which is unavailable.") def test_run_doclaynet_v2_e2e(): target_path = Path(f"./scratch/{BenchMarkNames.DOCLAYNETV2.value}/") docling_provider = create_docling_prediction_provider(page_image_scale=2.0) dataset_layout = DocLayNetV2DatasetBuilder( - dataset_path=Path("/Users/cau/Documents/Data/doclaynet_v2_benchmark"), + dataset_path=Path("/path/to/doclaynet_v2_benchmark"), target=target_path / "gt_dataset", + end_index=5, ) dataset_layout.retrieve_input_dataset() # fetches the source dataset from HF dataset_layout.save_to_disk( - chunk_size=5, max_num_chunks=1 + chunk_size=5 ) # does all the job of iterating the dataset, making GT+prediction records, and saving them in shards as parquet. 
docling_provider.create_prediction_dataset( @@ -279,11 +285,12 @@ def test_run_funsd(): dataset_layout = FUNSDDatasetBuilder( dataset_source=target_path / "input_dataset", target=target_path / "gt_dataset", + end_index=5, ) dataset_layout.retrieve_input_dataset() # fetches the source dataset from HF dataset_layout.save_to_disk( - chunk_size=5, max_num_chunks=1 + chunk_size=5 ) # does all the job of iterating the dataset, making GT+prediction records, and saving them in shards as parquet. @@ -293,11 +300,12 @@ def test_run_xfund(): dataset_layout = XFUNDDatasetBuilder( dataset_source=target_path / "input_dataset", target=target_path / "gt_dataset", + end_index=5, ) dataset_layout.retrieve_input_dataset() # fetches the source dataset from HF dataset_layout.save_to_disk( - chunk_size=5, max_num_chunks=1 + chunk_size=5 ) # does all the job of iterating the dataset, making GT+prediction records, and saving them in shards as parquet. @@ -307,11 +315,12 @@ def test_run_fintabnet_builder(): dataset = FintabNetDatasetBuilder( target=target_path / "gt_dataset", + end_index=5, ) dataset.retrieve_input_dataset() # fetches the source dataset from HF dataset.save_to_disk( - chunk_size=5, max_num_chunks=1 + chunk_size=5 ) # does all the job of iterating the dataset, making GT+prediction records, and saving them in shards as parquet. tableformer_provider.create_prediction_dataset( @@ -334,11 +343,12 @@ def test_run_p1m_builder(): dataset = PubTables1MDatasetBuilder( target=target_path / "gt_dataset", + end_index=5, ) dataset.retrieve_input_dataset() # fetches the source dataset from HF dataset.save_to_disk( - chunk_size=5, max_num_chunks=1 + chunk_size=5 ) # does all the job of iterating the dataset, making GT+prediction records, and saving them in shards as parquet. tableformer_provider.create_prediction_dataset( @@ -361,6 +371,7 @@ def test_run_pubtabnet_builder(): dataset = PubTabNetDatasetBuilder( target=target_path / "gt_dataset", + end_index=25, ) dataset.retrieve_input_dataset() # fetches the source dataset from HF
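Taken together, the test changes above all exercise the same three-stage flow: build a ground-truth dataset, run a prediction provider over it, then evaluate one modality. The sketch below condenses that flow. Import paths follow this patch's module layout, the provider helper comes from the test module itself, and the keyword arguments to create_prediction_dataset are assumptions, since the diff truncates those calls.

```python
# Condensed sketch of the flow the tests above exercise; not a verbatim test.
from pathlib import Path

from docling_eval.cli.main import evaluate
from docling_eval.datamodels.types import BenchMarkNames, EvaluationModality
from docling_eval.dataset_builders.doclaynet_v1_builder import DocLayNetV1DatasetBuilder
from tests.test_dataset_builder import create_docling_prediction_provider  # helper defined in the test module

target_path = Path(f"./scratch/{BenchMarkNames.DOCLAYNETV1.value}/")

# 1. Build a small ground-truth dataset; end_index=5 keeps test runs fast.
builder = DocLayNetV1DatasetBuilder(target=target_path / "gt_dataset", end_index=5)
builder.retrieve_input_dataset()    # fetches the source dataset from HF
builder.save_to_disk(chunk_size=5)  # writes GT records as parquet shards

# 2. Run a prediction provider over the ground truth.
docling_provider = create_docling_prediction_provider(page_image_scale=2.0)
docling_provider.create_prediction_dataset(  # keyword names here are assumptions
    name="DocLayNetV1",
    gt_dataset_dir=target_path / "gt_dataset",
    target_dataset_dir=target_path / "eval_dataset",
)

# 3. Score one modality of the resulting prediction dataset.
evaluate(
    modality=EvaluationModality.LAYOUT,
    benchmark=BenchMarkNames.DOCLAYNETV1,
    idir=target_path / "eval_dataset",
    odir=target_path / "evaluations" / EvaluationModality.LAYOUT.value,
)
```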