91 changes: 83 additions & 8 deletions docling_eval/cli/main.py
@@ -31,6 +31,9 @@
)
from docling_eval.dataset_builders.doclaynet_v1_builder import DocLayNetV1DatasetBuilder
from docling_eval.dataset_builders.doclaynet_v2_builder import DocLayNetV2DatasetBuilder
from docling_eval.dataset_builders.doclingdpbench_builder import (
DoclingDPBenchDatasetBuilder,
)
from docling_eval.dataset_builders.docvqa_builder import DocVQADatasetBuilder
from docling_eval.dataset_builders.dpbench_builder import DPBenchDatasetBuilder
from docling_eval.dataset_builders.file_dataset_builder import FileDatasetBuilder
@@ -65,20 +68,27 @@
DatasetTableEvaluation,
TableEvaluator,
)
from docling_eval.evaluators.timings_evaluator import (
DatasetTimingsEvaluation,
TimingsEvaluator,
)
from docling_eval.prediction_providers.docling_provider import DoclingPredictionProvider
from docling_eval.prediction_providers.file_provider import FilePredictionProvider
from docling_eval.prediction_providers.tableformer_provider import (
TableFormerPredictionProvider,
)

# Configure logging
logging.getLogger("docling").setLevel(logging.WARNING)
logging.getLogger("PIL").setLevel(logging.WARNING)
logging.getLogger("transformers").setLevel(logging.WARNING)
logging.getLogger("datasets").setLevel(logging.WARNING)
logging.getLogger("filelock").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("docling_ibm_models").setLevel(logging.WARNING)
logging_level = logging.WARNING
# logging_level = logging.DEBUG
logging.getLogger("docling").setLevel(logging_level)
logging.getLogger("PIL").setLevel(logging_level)
logging.getLogger("transformers").setLevel(logging_level)
logging.getLogger("datasets").setLevel(logging_level)
logging.getLogger("filelock").setLevel(logging_level)
logging.getLogger("urllib3").setLevel(logging_level)
logging.getLogger("docling_ibm_models").setLevel(logging_level)
logging.getLogger("matplotlib").setLevel(logging_level)

_log = logging.getLogger(__name__)

@@ -156,6 +166,9 @@ def get_dataset_builder(
if benchmark == BenchMarkNames.DPBENCH:
return DPBenchDatasetBuilder(**common_params) # type: ignore

elif benchmark == BenchMarkNames.DOCLING_DPBENCH:
return DoclingDPBenchDatasetBuilder(**common_params) # type: ignore

elif benchmark == BenchMarkNames.DOCLAYNETV1:
return DocLayNetV1DatasetBuilder(**common_params) # type: ignore

@@ -418,6 +431,16 @@ def evaluate(
if modality == EvaluationModality.END2END:
_log.error("END2END evaluation not supported. ")

elif modality == EvaluationModality.TIMINGS:
timings_evaluator = TimingsEvaluator()
evaluation = timings_evaluator( # type: ignore
idir,
split=split,
)

with open(save_fn, "w") as fd:
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
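# A minimal standalone sketch of the new branch above, assuming TimingsEvaluator
# is callable as shown in this hunk (the benchmark directory is illustrative):
#
#     timings_evaluator = TimingsEvaluator()
#     evaluation = timings_evaluator(Path("./benchmarks/DPBench/eval_dataset"), split="test")
#     print(evaluation.timing_per_document_stats)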

elif modality == EvaluationModality.LAYOUT:
layout_evaluator = LayoutEvaluator()
evaluation = layout_evaluator( # type: ignore
@@ -538,6 +561,31 @@ def visualize(
if modality == EvaluationModality.END2END:
_log.error("END2END visualization not supported")

elif modality == EvaluationModality.TIMINGS:
try:
with open(metrics_filename, "r") as fd:
timings_evaluation = DatasetTimingsEvaluation.model_validate_json(
fd.read()
)

log_and_save_stats(
odir,
benchmark,
modality,
"time_to_solution_per_doc",
timings_evaluation.timing_per_document_stats,
)

log_and_save_stats(
odir,
benchmark,
modality,
"time_to_solution_per_page",
timings_evaluation.timing_per_page_stats,
)
except Exception as e:
_log.error(f"Error processing timings evaluation: {str(e)}")

elif modality == EvaluationModality.LAYOUT:
try:
with open(metrics_filename, "r") as fd:
@@ -554,6 +602,30 @@
layout_evaluation.map_stats,
)

log_and_save_stats(
odir,
benchmark,
modality,
"precision",
layout_evaluation.segmentation_precision_stats,
)

log_and_save_stats(
odir,
benchmark,
modality,
"recall",
layout_evaluation.segmentation_recall_stats,
)

log_and_save_stats(
odir,
benchmark,
modality,
"f1",
layout_evaluation.segmentation_f1_stats,
)
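# For reference, the segmentation statistics saved above follow the standard
# definitions:
#     precision = TP / (TP + FP)
#     recall    = TP / (TP + FN)
#     f1        = 2 * precision * recall / (precision + recall)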

# Append the per-class AP to the layout statistics
data, headers = layout_evaluation.to_table()
content = "\n\n\nAP[0.5:0.05:0.95] per class (reported as %):\n\n"
@@ -724,6 +796,7 @@ def create_gt(
end_index: Annotated[
int, typer.Option(help="End index (exclusive), -1 for all")
] = -1,
chunk_size: Annotated[int, typer.Option(help="Chunk size used when saving the dataset to disk")] = 80,
):
"""Create ground truth dataset only."""
gt_dir = output_dir / "gt_dataset"
@@ -741,7 +814,7 @@
# Retrieve and save the dataset
if dataset_builder.must_retrieve:
dataset_builder.retrieve_input_dataset()
dataset_builder.save_to_disk(chunk_size=80)
dataset_builder.save_to_disk(chunk_size=chunk_size)

_log.info(f"Ground truth dataset created at {gt_dir}")
except ValueError as e:
@@ -841,6 +914,7 @@ def create(
end_index: Annotated[
int, typer.Option(help="End index (exclusive), -1 for all")
] = -1,
chunk_size: Annotated[int, typer.Option(help="Chunk size used when saving the dataset to disk")] = 80,
prediction_provider: Annotated[
Optional[PredictionProviderType],
typer.Option(help="Type of prediction provider to use"),
@@ -861,6 +935,7 @@
split=split,
begin_index=begin_index,
end_index=end_index,
chunk_size=chunk_size,
)

# Then create evaluation if provider specified
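The new chunk_size option is threaded from both create_gt and create into save_to_disk, replacing the previously hard-coded value of 80. A hypothetical invocation (command and flag spellings are inferred from the Typer definitions above, not confirmed against the installed entry point):

docling-eval create-gt --benchmark DoclingDPBench --output-dir ./benchmarks/docling-dpbench --chunk-size 50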
3 changes: 3 additions & 0 deletions docling_eval/datamodels/dataset_record.py
@@ -173,6 +173,7 @@ class DatasetRecordWithPrediction(DatasetRecord):
)
original_prediction: Optional[str] = None
prediction_format: PredictionFormats # some enum type
prediction_timings: Optional[Dict] = Field(alias="prediction_timings", default=None)

predicted_page_images: List[PIL.Image.Image] = Field(
alias="PredictionPageImages", default=[]
@@ -201,13 +202,15 @@ def features(cls):
cls.get_field_alias("mime_type"): Value("string"),
cls.get_field_alias("modalities"): Sequence(Value("string")),
cls.get_field_alias("prediction_format"): Value("string"),
cls.get_field_alias("prediction_timings"): Value("string"),
}

def as_record_dict(self):
record = super().as_record_dict()
record.update(
{
self.get_field_alias("prediction_format"): self.prediction_format.value,
self.get_field_alias("prediction_timings"): self.prediction_timings,
}
)

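The new prediction_timings field is an optional free-form dict; note that the features schema above exposes it as a string column, so it is presumably serialized when records are written to disk. A sketch of how a provider might populate it on a record (the keys are hypothetical; actual keys depend on the prediction provider):

record.prediction_timings = {
    "pipeline_total": 12.3,  # seconds for the whole document
    "layout": 4.1,
    "table_structure": 2.7,
}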
2 changes: 2 additions & 0 deletions docling_eval/datamodels/types.py
@@ -47,12 +47,14 @@ class EvaluationModality(str, Enum):
OCR = "ocr"
KEY_VALUE = "key_value"
QUESTION_ANSWERING = "question_answering"
TIMINGS = "timings"


class BenchMarkNames(str, Enum):

# End-to-End
DPBENCH = "DPBench"
DOCLING_DPBENCH = "DoclingDPBench"
OMNIDOCBENCH = "OmniDocBench"
WORDSCAPE = "WordScape"

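Both enums derive from str, so the new members can be looked up by their string values; a quick sanity check of the additions:

from docling_eval.datamodels.types import BenchMarkNames, EvaluationModality

assert EvaluationModality("timings") is EvaluationModality.TIMINGS
assert BenchMarkNames("DoclingDPBench") is BenchMarkNames.DOCLING_DPBENCH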
103 changes: 103 additions & 0 deletions docling_eval/dataset_builders/doclingdpbench_builder.py
@@ -0,0 +1,103 @@
import json
import logging
import os
from io import BytesIO
from pathlib import Path
from typing import Dict, Iterable, Set

from datasets import load_dataset
from docling_core.types import DoclingDocument
from docling_core.types.io import DocumentStream
from PIL import Image as PILImage

from docling_eval.datamodels.dataset_record import DatasetRecord
from docling_eval.dataset_builders.dataset_builder import (
BaseEvaluationDatasetBuilder,
HFSource,
)
from docling_eval.utils.utils import get_binary, get_binhash

# Get logger
_log = logging.getLogger(__name__)


class DoclingDPBenchDatasetBuilder(BaseEvaluationDatasetBuilder):
"""
DoclingDPBench dataset builder implementing the base dataset builder interface.

This builder processes the DoclingDPBench dataset, which contains document
understanding benchmarks for various document types.
"""

def __init__(
self,
target: Path,
split: str = "test",
begin_index: int = 0,
end_index: int = -1,
):
"""
Initialize the DoclingDPBench dataset builder.

Args:
target: Path where processed dataset will be saved
split: Dataset split to use
begin_index: Start index for processing (inclusive)
end_index: End index for processing (exclusive), -1 means process all
"""
super().__init__(
name="DoclingDPBench",
dataset_source=HFSource(repo_id="ds4sd/docling-dpbench"),
target=target,
split=split,
begin_index=begin_index,
end_index=end_index,
)

self.must_retrieve = True

def iterate(self) -> Iterable[DatasetRecord]:
"""
Iterate through the dataset and yield DatasetRecord objects.

Yields:
DatasetRecord objects
"""
if not self.retrieved and self.must_retrieve:
raise RuntimeError(
"You must first retrieve the source dataset. Call retrieve_input_dataset()."
)

assert self.dataset_local_path is not None
_log.info(f"dataset_local_path: {self.dataset_local_path}")

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("ds4sd/docling-dpbench")

for idx, row in enumerate(ds[self.split]):
    doc_hash = str(get_binhash(row["BinaryDocument"]))
    doc = DoclingDocument.model_validate_json(row["GroundTruthDocument"])

    page_images = [
        PILImage.open(BytesIO(item["bytes"])) for item in row["GroundTruthPageImages"]
    ]
    pictures = [
        PILImage.open(BytesIO(item["bytes"])) for item in row["GroundTruthPictures"]
    ]

    pdf_stream = DocumentStream(
        name=f"ds4sd/docling-dpbench/{idx}", stream=BytesIO(row["BinaryDocument"])
    )

    # Create dataset record
    record = DatasetRecord(
        doc_id=str(row["document_id"]),
        doc_hash=doc_hash,
        ground_truth_doc=doc,
        ground_truth_pictures=pictures,
        ground_truth_page_images=page_images,
        original=pdf_stream,
        mime_type=row["mimetype"],
    )

    yield record
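A minimal usage sketch for the new builder (the target path is illustrative; the source dataset requires Hugging Face authentication, as noted in the code):

from pathlib import Path

from docling_eval.dataset_builders.doclingdpbench_builder import DoclingDPBenchDatasetBuilder

builder = DoclingDPBenchDatasetBuilder(target=Path("./benchmarks/docling-dpbench/gt_dataset"))
builder.retrieve_input_dataset()  # required: must_retrieve is set in __init__
builder.save_to_disk(chunk_size=80)  # writes chunked ground-truth records to the target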
4 changes: 2 additions & 2 deletions docling_eval/dataset_builders/file_dataset_builder.py
@@ -108,7 +108,7 @@ def iterate(self) -> Iterable[DatasetRecord]:
# Create the ground truth Document
true_doc = DoclingDocument(name=f"{filename}")
if mime_type == "application/pdf":
_log.info(f"add_pages_to_true_doc: {filename}")
_log.debug(f"add_pages_to_true_doc: {filename}")
true_doc, _ = add_pages_to_true_doc(
pdf_path=filename, true_doc=true_doc, image_scale=2.0
)
@@ -127,7 +127,7 @@
image=image_ref,
)

_log.info(f"add_pages_to_true_doc: {filename}")
_log.debug(f"add_pages_to_true_doc: {filename}")
true_doc.pages[1] = page_item
else:
raise ValueError(