91 changes: 83 additions & 8 deletions docling_eval/cli/main.py
@@ -31,6 +31,9 @@
)
from docling_eval.dataset_builders.doclaynet_v1_builder import DocLayNetV1DatasetBuilder
from docling_eval.dataset_builders.doclaynet_v2_builder import DocLayNetV2DatasetBuilder
from docling_eval.dataset_builders.doclingdpbench_builder import (
DoclingDPBenchDatasetBuilder,
)
from docling_eval.dataset_builders.docvqa_builder import DocVQADatasetBuilder
from docling_eval.dataset_builders.dpbench_builder import DPBenchDatasetBuilder
from docling_eval.dataset_builders.file_dataset_builder import FileDatasetBuilder
@@ -65,20 +68,27 @@
DatasetTableEvaluation,
TableEvaluator,
)
from docling_eval.evaluators.timings_evaluator import (
DatasetTimingsEvaluation,
TimingsEvaluator,
)
from docling_eval.prediction_providers.docling_provider import DoclingPredictionProvider
from docling_eval.prediction_providers.file_provider import FilePredictionProvider
from docling_eval.prediction_providers.tableformer_provider import (
TableFormerPredictionProvider,
)

# Configure logging
logging.getLogger("docling").setLevel(logging.WARNING)
logging.getLogger("PIL").setLevel(logging.WARNING)
logging.getLogger("transformers").setLevel(logging.WARNING)
logging.getLogger("datasets").setLevel(logging.WARNING)
logging.getLogger("filelock").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("docling_ibm_models").setLevel(logging.WARNING)
logging_level = logging.WARNING
# logging_level = logging.DEBUG
logging.getLogger("docling").setLevel(logging_level)
logging.getLogger("PIL").setLevel(logging_level)
logging.getLogger("transformers").setLevel(logging_level)
logging.getLogger("datasets").setLevel(logging_level)
logging.getLogger("filelock").setLevel(logging_level)
logging.getLogger("urllib3").setLevel(logging_level)
logging.getLogger("docling_ibm_models").setLevel(logging_level)
logging.getLogger("matplotlib").setLevel(logging_level)

_log = logging.getLogger(__name__)

@@ -156,6 +166,9 @@ def get_dataset_builder(
if benchmark == BenchMarkNames.DPBENCH:
return DPBenchDatasetBuilder(**common_params) # type: ignore

elif benchmark == BenchMarkNames.DOCLING_DPBENCH:
return DoclingDPBenchDatasetBuilder(**common_params) # type: ignore

elif benchmark == BenchMarkNames.DOCLAYNETV1:
return DocLayNetV1DatasetBuilder(**common_params) # type: ignore

@@ -418,6 +431,16 @@ def evaluate(
if modality == EvaluationModality.END2END:
_log.error("END2END evaluation not supported. ")

elif modality == EvaluationModality.TIMINGS:
timings_evaluator = TimingsEvaluator()
evaluation = timings_evaluator( # type: ignore
idir,
split=split,
)

with open(save_fn, "w") as fd:
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
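# A minimal standalone sketch of the new branch above, assuming TimingsEvaluator
# is callable as shown in this hunk (the benchmark directory is illustrative):
#
#     timings_evaluator = TimingsEvaluator()
#     evaluation = timings_evaluator(Path("./benchmarks/DPBench/eval_dataset"), split="test")
#     print(evaluation.timing_per_document_stats)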

elif modality == EvaluationModality.LAYOUT:
layout_evaluator = LayoutEvaluator()
evaluation = layout_evaluator( # type: ignore
@@ -538,6 +561,31 @@ def visualize(
if modality == EvaluationModality.END2END:
_log.error("END2END visualization not supported")

elif modality == EvaluationModality.TIMINGS:
try:
with open(metrics_filename, "r") as fd:
timings_evaluation = DatasetTimingsEvaluation.model_validate_json(
fd.read()
)

log_and_save_stats(
odir,
benchmark,
modality,
"time_to_solution_per_doc",
timings_evaluation.timing_per_document_stats,
)

log_and_save_stats(
odir,
benchmark,
modality,
"time_to_solution_per_page",
timings_evaluation.timing_per_page_stats,
)
except Exception as e:
_log.error(f"Error processing timings evaluation: {str(e)}")

elif modality == EvaluationModality.LAYOUT:
try:
with open(metrics_filename, "r") as fd:
@@ -554,6 +602,30 @@
layout_evaluation.map_stats,
)

log_and_save_stats(
odir,
benchmark,
modality,
"precision",
layout_evaluation.segmentation_precision_stats,
)

log_and_save_stats(
odir,
benchmark,
modality,
"recall",
layout_evaluation.segmentation_recall_stats,
)

log_and_save_stats(
odir,
benchmark,
modality,
"f1",
layout_evaluation.segmentation_f1_stats,
)
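# For reference, the segmentation statistics saved above follow the standard
# definitions:
#     precision = TP / (TP + FP)
#     recall    = TP / (TP + FN)
#     f1        = 2 * precision * recall / (precision + recall)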

# Append the per-class AP to the layout statistics
data, headers = layout_evaluation.to_table()
content = "\n\n\nAP[0.5:0.05:0.95] per class (reported as %):\n\n"
@@ -724,6 +796,7 @@ def create_gt(
end_index: Annotated[
int, typer.Option(help="End index (exclusive), -1 for all")
] = -1,
chunk_size: Annotated[int, typer.Option(help="Chunk size used when saving the dataset to disk")] = 80,
):
"""Create ground truth dataset only."""
gt_dir = output_dir / "gt_dataset"
@@ -741,7 +814,7 @@
# Retrieve and save the dataset
if dataset_builder.must_retrieve:
dataset_builder.retrieve_input_dataset()
dataset_builder.save_to_disk(chunk_size=80)
dataset_builder.save_to_disk(chunk_size=chunk_size)

_log.info(f"Ground truth dataset created at {gt_dir}")
except ValueError as e:
@@ -841,6 +914,7 @@ def create(
end_index: Annotated[
int, typer.Option(help="End index (exclusive), -1 for all")
] = -1,
chunk_size: Annotated[int, typer.Option(help="Chunk size used when saving the dataset to disk")] = 80,
prediction_provider: Annotated[
Optional[PredictionProviderType],
typer.Option(help="Type of prediction provider to use"),
@@ -861,6 +935,7 @@
split=split,
begin_index=begin_index,
end_index=end_index,
chunk_size=chunk_size,
)

# Then create evaluation if provider specified
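The new chunk_size option is threaded from both create_gt and create into save_to_disk, replacing the previously hard-coded value of 80. A hypothetical invocation (command and flag spellings are inferred from the Typer definitions above, not confirmed against the installed entry point):

docling-eval create-gt --benchmark DoclingDPBench --output-dir ./benchmarks/docling-dpbench --chunk-size 50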
3 changes: 3 additions & 0 deletions docling_eval/datamodels/dataset_record.py
@@ -173,6 +173,7 @@ class DatasetRecordWithPrediction(DatasetRecord):
)
original_prediction: Optional[str] = None
prediction_format: PredictionFormats # some enum type
prediction_timings: Optional[Dict] = Field(alias="prediction_timings", default=None)

predicted_page_images: List[PIL.Image.Image] = Field(
alias="PredictionPageImages", default=[]
@@ -201,13 +202,15 @@ def features(cls):
cls.get_field_alias("mime_type"): Value("string"),
cls.get_field_alias("modalities"): Sequence(Value("string")),
cls.get_field_alias("prediction_format"): Value("string"),
cls.get_field_alias("prediction_timings"): Value("string"),
}

def as_record_dict(self):
record = super().as_record_dict()
record.update(
{
self.get_field_alias("prediction_format"): self.prediction_format.value,
self.get_field_alias("prediction_timings"): self.prediction_timings,
}
)

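The new prediction_timings field is an optional free-form dict; note that the features schema above exposes it as a string column, so it is presumably serialized when records are written to disk. A sketch of how a provider might populate it on a record (the keys are hypothetical; actual keys depend on the prediction provider):

record.prediction_timings = {
    "pipeline_total": 12.3,  # seconds for the whole document
    "layout": 4.1,
    "table_structure": 2.7,
}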
2 changes: 2 additions & 0 deletions docling_eval/datamodels/types.py
@@ -47,12 +47,14 @@ class EvaluationModality(str, Enum):
OCR = "ocr"
KEY_VALUE = "key_value"
QUESTION_ANSWERING = "question_answering"
TIMINGS = "timings"


class BenchMarkNames(str, Enum):

# End-to-End
DPBENCH = "DPBench"
DOCLING_DPBENCH = "DoclingDPBench"
OMNIDOCBENCH = "OmniDocBench"
WORDSCAPE = "WordScape"

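Both enums derive from str, so the new members can be looked up by their string values; a quick sanity check of the additions:

from docling_eval.datamodels.types import BenchMarkNames, EvaluationModality

assert EvaluationModality("timings") is EvaluationModality.TIMINGS
assert BenchMarkNames("DoclingDPBench") is BenchMarkNames.DOCLING_DPBENCH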
103 changes: 103 additions & 0 deletions docling_eval/dataset_builders/doclingdpbench_builder.py
@@ -0,0 +1,103 @@
import json
import logging
import os
from io import BytesIO
from pathlib import Path
from typing import Dict, Iterable, Set

from datasets import load_dataset
from docling_core.types import DoclingDocument
from docling_core.types.io import DocumentStream
from PIL import Image as PILImage

from docling_eval.datamodels.dataset_record import DatasetRecord
from docling_eval.dataset_builders.dataset_builder import (
BaseEvaluationDatasetBuilder,
HFSource,
)
from docling_eval.utils.utils import get_binary, get_binhash

# Get logger
_log = logging.getLogger(__name__)


class DoclingDPBenchDatasetBuilder(BaseEvaluationDatasetBuilder):
"""
DoclingDPBench dataset builder implementing the base dataset builder interface.

This builder processes the DoclingDPBench dataset, which contains document
understanding benchmarks for various document types.
"""

def __init__(
self,
target: Path,
split: str = "test",
begin_index: int = 0,
end_index: int = -1,
):
"""
Initialize the DoclingDPBench dataset builder.

Args:
target: Path where processed dataset will be saved
split: Dataset split to use
begin_index: Start index for processing (inclusive)
end_index: End index for processing (exclusive), -1 means process all
"""
super().__init__(
name="DoclingDPBench",
dataset_source=HFSource(repo_id="ds4sd/docling-dpbench"),
target=target,
split=split,
begin_index=begin_index,
end_index=end_index,
)

self.must_retrieve = True

def iterate(self) -> Iterable[DatasetRecord]:
"""
Iterate through the dataset and yield DatasetRecord objects.

Yields:
DatasetRecord objects
"""
if not self.retrieved and self.must_retrieve:
raise RuntimeError(
"You must first retrieve the source dataset. Call retrieve_input_dataset()."
)

assert self.dataset_local_path is not None
_log.info(f"dataset_local_path: {self.dataset_local_path}")

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("ds4sd/docling-dpbench")

for idx, row in enumerate(ds[self.split]):
    doc_hash = str(get_binhash(row["BinaryDocument"]))
    doc = DoclingDocument.model_validate_json(row["GroundTruthDocument"])

    page_images = [
        PILImage.open(BytesIO(item["bytes"])) for item in row["GroundTruthPageImages"]
    ]
    pictures = [
        PILImage.open(BytesIO(item["bytes"])) for item in row["GroundTruthPictures"]
    ]

    pdf_stream = DocumentStream(
        name=f"ds4sd/docling-dpbench/{idx}", stream=BytesIO(row["BinaryDocument"])
    )

    # Create dataset record
    record = DatasetRecord(
        doc_id=str(row["document_id"]),
        doc_hash=doc_hash,
        ground_truth_doc=doc,
        ground_truth_pictures=pictures,
        ground_truth_page_images=page_images,
        original=pdf_stream,
        mime_type=row["mimetype"],
    )

    yield record
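A minimal usage sketch for the new builder (the target path is illustrative; the source dataset requires Hugging Face authentication, as noted in the code):

from pathlib import Path

from docling_eval.dataset_builders.doclingdpbench_builder import DoclingDPBenchDatasetBuilder

builder = DoclingDPBenchDatasetBuilder(target=Path("./benchmarks/docling-dpbench/gt_dataset"))
builder.retrieve_input_dataset()  # required: must_retrieve is set in __init__
builder.save_to_disk(chunk_size=80)  # writes chunked ground-truth records to the target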
4 changes: 2 additions & 2 deletions docling_eval/dataset_builders/file_dataset_builder.py
@@ -108,7 +108,7 @@ def iterate(self) -> Iterable[DatasetRecord]:
# Create the ground truth Document
true_doc = DoclingDocument(name=f"{filename}")
if mime_type == "application/pdf":
_log.info(f"add_pages_to_true_doc: {filename}")
_log.debug(f"add_pages_to_true_doc: {filename}")
true_doc, _ = add_pages_to_true_doc(
pdf_path=filename, true_doc=true_doc, image_scale=2.0
)
@@ -127,7 +127,7 @@
image=image_ref,
)

_log.info(f"add_pages_to_true_doc: {filename}")
_log.debug(f"add_pages_to_true_doc: {filename}")
true_doc.pages[1] = page_item
else:
raise ValueError(