From 8e65c337425c3a026966ae323096f120461186c4 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Sat, 26 Apr 2025 12:11:52 +0200 Subject: [PATCH 01/15] added the area-level precision, recall and f1 Signed-off-by: Peter Staar --- docling_eval/cli/main.py | 30 ++++ docling_eval/datamodels/types.py | 1 + .../doclingdpbench_builder.py | 100 ++++++++++++ docling_eval/evaluators/layout_evaluator.py | 143 +++++++++++++++++- 4 files changed, 269 insertions(+), 5 deletions(-) create mode 100644 docling_eval/dataset_builders/doclingdpbench_builder.py diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index e1b566f7..6580e542 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -31,6 +31,9 @@ ) from docling_eval.dataset_builders.doclaynet_v1_builder import DocLayNetV1DatasetBuilder from docling_eval.dataset_builders.doclaynet_v2_builder import DocLayNetV2DatasetBuilder +from docling_eval.dataset_builders.doclingdpbench_builder import ( + DoclingDPBenchDatasetBuilder, +) from docling_eval.dataset_builders.docvqa_builder import DocVQADatasetBuilder from docling_eval.dataset_builders.dpbench_builder import DPBenchDatasetBuilder from docling_eval.dataset_builders.file_dataset_builder import FileDatasetBuilder @@ -156,6 +159,9 @@ def get_dataset_builder( if benchmark == BenchMarkNames.DPBENCH: return DPBenchDatasetBuilder(**common_params) # type: ignore + elif benchmark == BenchMarkNames.DOCLING_DPBENCH: + return DoclingDPBenchDatasetBuilder(**common_params) # type: ignore + elif benchmark == BenchMarkNames.DOCLAYNETV1: return DocLayNetV1DatasetBuilder(**common_params) # type: ignore @@ -554,6 +560,30 @@ def visualize( layout_evaluation.map_stats, ) + log_and_save_stats( + odir, + benchmark, + modality, + "precision", + layout_evaluation.segmentation_precision_stats, + ) + + log_and_save_stats( + odir, + benchmark, + modality, + "recall", + layout_evaluation.segmentation_recall_stats, + ) + + log_and_save_stats( + odir, + benchmark, + modality, + "f1", + layout_evaluation.segmentation_f1_stats, + ) + # Append to layout statistics, the AP per classes data, headers = layout_evaluation.to_table() content = "\n\n\nAP[0.5:0.05:0.95] per class (reported as %):\n\n" diff --git a/docling_eval/datamodels/types.py b/docling_eval/datamodels/types.py index 120ee414..2b1d2966 100644 --- a/docling_eval/datamodels/types.py +++ b/docling_eval/datamodels/types.py @@ -53,6 +53,7 @@ class BenchMarkNames(str, Enum): # End-to-End DPBENCH = "DPBench" + DOCLING_DPBENCH = "DoclingDPBench" OMNIDOCBENCH = "OmniDocBench" WORDSCAPE = "WordScape" diff --git a/docling_eval/dataset_builders/doclingdpbench_builder.py b/docling_eval/dataset_builders/doclingdpbench_builder.py new file mode 100644 index 00000000..63fe49e0 --- /dev/null +++ b/docling_eval/dataset_builders/doclingdpbench_builder.py @@ -0,0 +1,100 @@ +import json +import logging +import os +from io import BytesIO +from pathlib import Path +from typing import Dict, Iterable, Set + +from datasets import load_dataset +from docling_core.types import DoclingDocument +from docling_core.types.io import DocumentStream +from PIL import Image as PILImage + +from docling_eval.datamodels.dataset_record import DatasetRecord +from docling_eval.dataset_builders.dataset_builder import ( + BaseEvaluationDatasetBuilder, + HFSource, +) +from docling_eval.utils.utils import get_binary, get_binhash + + +class DoclingDPBenchDatasetBuilder(BaseEvaluationDatasetBuilder): + """ + DoclingDPBench dataset builder implementing the base dataset builder interface. + + This builder processes the DoclingDPBench dataset, which contains document + understanding benchmarks for various document types. + """ + + def __init__( + self, + target: Path, + split: str = "test", + begin_index: int = 0, + end_index: int = -1, + ): + """ + Initialize the DoclingDPBench dataset builder. + + Args: + target: Path where processed dataset will be saved + split: Dataset split to use + begin_index: Start index for processing (inclusive) + end_index: End index for processing (exclusive), -1 means process all + """ + super().__init__( + name="DoclingDPBench", + dataset_source=HFSource(repo_id="ds4sd/docling-dpbench"), + target=target, + split=split, + begin_index=begin_index, + end_index=end_index, + ) + + self.must_retrieve = True + + def iterate(self) -> Iterable[DatasetRecord]: + """ + Iterate through the dataset and yield DatasetRecord objects. + + Yields: + DatasetRecord objects + """ + if not self.retrieved and self.must_retrieve: + raise RuntimeError( + "You must first retrieve the source dataset. Call retrieve_input_dataset()." + ) + + assert self.dataset_local_path is not None + print(f"dataset_local_path: {self.dataset_local_path}") + + # Login using e.g. `huggingface-cli login` to access this dataset + ds = load_dataset("ds4sd/docling-dpbench") + + for idx, _ in enumerate(ds["test"]): + doc_hash = str(get_binhash(_["BinaryDocument"])) + doc = (DoclingDocument.model_validate_json(_["GroundTruthDocument"]),) + + page_images = [ + PILImage.open(BytesIO(__["bytes"])) for __ in _["GroundTruthPageImages"] + ] + pictures = [ + PILImage.open(BytesIO(__["bytes"])) for __ in _["GroundTruthPictures"] + ] + + pdf_stream = DocumentStream( + name=f"ds4sd/docling-dpbench/{idx}", stream=BytesIO(_["BinaryDocument"]) + ) + + # Create dataset record + record = DatasetRecord( + doc_id=str(_["document_id"]), + doc_hash=doc_hash, + ground_truth_doc=doc[0], + ground_truth_pictures=pictures, + ground_truth_page_images=page_images, + original=pdf_stream, + mime_type=_["mimetype"], + ) + + yield record diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index 63646762..465ed81b 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -59,6 +59,10 @@ class ImageLayoutEvaluation(UnitEvaluation): avg_weighted_label_matched_iou_90: float avg_weighted_label_matched_iou_95: float + segmentation_precision: float + segmentation_recall: float + segmentation_f1: float + class DatasetLayoutEvaluation(DatasetEvaluation): true_labels: Dict[str, int] @@ -78,6 +82,10 @@ class DatasetLayoutEvaluation(DatasetEvaluation): weighted_map_90_stats: DatasetStatistics weighted_map_95_stats: DatasetStatistics + segmentation_precision_stats: DatasetStatistics + segmentation_recall_stats: DatasetStatistics + segmentation_f1_stats: DatasetStatistics + def to_table(self) -> Tuple[List[List[str]], List[str]]: headers = ["label", "Class mAP[0.5:0.95]"] @@ -147,8 +155,15 @@ def __call__( true_labels, pred_labels, intersection_labels = self._find_intersecting_labels( ds_selection ) - intersection_labels_str = "\n" + "\n".join(sorted(intersection_labels)) + true_labels_str = ", ".join(sorted(true_labels)) + logging.info(f"True labels: {true_labels_str}") + + pred_labels_str = ", ".join(sorted(pred_labels)) + logging.info(f"Pred labels: {pred_labels_str}") + + intersection_labels_str = ", ".join(sorted(intersection_labels)) logging.info(f"Intersection labels: {intersection_labels_str}") + # intersection_labels_str = "\n" + "\n".join(sorted(intersection_labels)) doc_ids = [] ground_truths = [] @@ -187,6 +202,9 @@ def __call__( filter_labels=intersection_labels, ) + # logging.info(f"gts: {gts}") + # logging.info(f"preds: {preds}") + if len(gts) > 0: for i in range(len(gts)): doc_ids.append(data[BenchMarkColumns.DOC_ID] + f"-page-{i}") @@ -258,8 +276,19 @@ def __call__( for i, (doc_id, pred, gt) in enumerate( zip(doc_ids, predictions, ground_truths) ): + # logging.info(f"gt: {gt}") + # logging.info(f"pred: {pred}") + + precision, recall, f1 = self._compute_area_level_metrics_for_tensors( + gt_boxes=gt["boxes"], + pred_boxes=pred["boxes"], + page_width=100, + page_height=100, + mask_width=512, + mask_height=512, + ) + # Reset the metric for the next image - # metric.reset() metric = MeanAveragePrecision(iou_type="bbox", class_metrics=True) # Update with single image @@ -293,6 +322,10 @@ def __call__( weighted_map_90_values.append(average_iou_90) weighted_map_95_values.append(average_iou_95) + logging.info( + f"doc: {doc_id}\tprecision: {precision:.2f}, recall: {recall:.2f}, f1: {f1:.2f}, map_50: {map_50:.2f}" + ) + image_evaluation = ImageLayoutEvaluation( name=doc_id, value=average_iou_50, @@ -303,6 +336,9 @@ def __call__( avg_weighted_label_matched_iou_75=average_iou_75, avg_weighted_label_matched_iou_90=average_iou_90, avg_weighted_label_matched_iou_95=average_iou_95, + segmentation_precision=precision, + segmentation_recall=recall, + segmentation_f1=f1, ) evaluations_per_image.append(image_evaluation) if self._intermediate_evaluations_path: @@ -326,6 +362,15 @@ def __call__( weighted_map_75_stats=compute_stats(weighted_map_75_values), weighted_map_90_stats=compute_stats(weighted_map_90_values), weighted_map_95_stats=compute_stats(weighted_map_95_values), + segmentation_precision_stats=compute_stats( + [_.segmentation_precision for _ in evaluations_per_image] + ), + segmentation_recall_stats=compute_stats( + [_.segmentation_recall for _ in evaluations_per_image] + ), + segmentation_f1_stats=compute_stats( + [_.segmentation_f1 for _ in evaluations_per_image] + ), true_labels=true_labels, pred_labels=pred_labels, intersecting_labels=[_.value for _ in intersection_labels], @@ -572,13 +617,10 @@ def _extract_layout_data( for item in items: for prov in item.prov: bbox = prov.bbox.to_top_left_origin(page_height=page_height) - # true_tl_bboxes.append(copy.deepcopy(bbox)) bbox = bbox.normalized(page_size) bbox = bbox.scaled(100.0) - # logging.info(f"ground-truth {page_no}: {page_width, page_height} -> {item.label}, {bbox.coord_origin}: [{bbox.l}, {bbox.t}, {bbox.r}, {bbox.b}]") - bboxes.append([bbox.l, bbox.t, bbox.r, bbox.b]) labels.append(filter_labels.index(self.label_mapping[item.label])) # type: ignore @@ -635,3 +677,94 @@ def _extract_layout_data( # print(pred_tl_bboxes_str) return ground_truths, predictions + + def _compute_area_level_metrics_for_tensors( + self, + gt_boxes: torch.Tensor, + pred_boxes: torch.Tensor, + page_width: int, + page_height: int, + mask_width: int = 512, + mask_height: int = 512, + ) -> Tuple[float, float, float]: + """ + Compute area-level precision, recall, and F1 score for tensor format boxes. + Handles overlapping boxes by using binary masks at the specified resolution. + + Args: + gt_boxes: Ground truth boxes as tensor of shape (N, 4) with [x1, y1, x2, y2] format + pred_boxes: Predicted boxes as tensor of shape (M, 4) with [x1, y1, x2, y2] format + page_width: Width of the original page + page_height: Height of the original page + mask_width: Width of the mask to use for computation (default: 512) + mask_height: Height of the mask to use for computation (default: 512) + + Returns: + Dictionary containing precision, recall, and F1 scores + """ + if gt_boxes.shape[0] == 0: + precision = 1.0 if pred_boxes.shape[0] == 0 else 0.0 + recall = 1.0 + f1 = 1.0 if pred_boxes.shape[0] == 0 else 0.0 + return precision, recall, f1 + + if pred_boxes.shape[0] == 0: + precision = 1.0 + recall = 0.0 + f1 = 0.0 + return precision, recall, f1 + + # Calculate scaling factors (ensure float division) + x_scale = float(mask_width) / float(page_width) + y_scale = float(mask_height) / float(page_height) + + # Create empty masks + gt_mask = torch.zeros((mask_height, mask_width), dtype=torch.bool, device="cpu") + pred_mask = torch.zeros( + (mask_height, mask_width), dtype=torch.bool, device="cpu" + ) + + # Fill ground truth mask + for i in range(gt_boxes.shape[0]): + x1, y1, x2, y2 = gt_boxes[i].tolist() + + # Scale coordinates to mask space + x1, y1 = max(0, int(x1 * x_scale)), max(0, int(y1 * y_scale)) + x2, y2 = min(mask_width, int(x2 * x_scale)), min( + mask_height, int(y2 * y_scale) + ) + + if x2 > x1 and y2 > y1: + gt_mask[y1:y2, x1:x2] = True + + # Fill prediction mask + for i in range(pred_boxes.shape[0]): + x1, y1, x2, y2 = pred_boxes[i].tolist() + + # Scale coordinates to mask space + x1, y1 = max(0, int(x1 * x_scale)), max(0, int(y1 * y_scale)) + x2, y2 = min(mask_width, int(x2 * x_scale)), min( + mask_height, int(y2 * y_scale) + ) + + if x2 > x1 and y2 > y1: + pred_mask[y1:y2, x1:x2] = True + + # Calculate areas (accounting for overlaps) + total_gt_area = torch.sum(gt_mask).item() + total_pred_area = torch.sum(pred_mask).item() + + # Calculate intersection (logical AND of masks) + intersection_mask = torch.logical_and(gt_mask, pred_mask) + total_intersection = torch.sum(intersection_mask).item() + + # Calculate metrics + precision = total_intersection / total_pred_area if total_pred_area > 0 else 0.0 + recall = total_intersection / total_gt_area if total_gt_area > 0 else 0.0 + + # Calculate F1 score + f1 = 0.0 + if precision + recall > 0: + f1 = 2 * (precision * recall) / (precision + recall) + + return precision, recall, f1 From 5b3201ccaa98082005cba59b0a32aa5fcb2b39fd Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Mon, 28 Apr 2025 08:41:51 +0200 Subject: [PATCH 02/15] WIP: adding timing modality Signed-off-by: Peter Staar --- docling_eval/datamodels/types.py | 1 + docling_eval/evaluators/layout_evaluator.py | 17 +++- docling_eval/evaluators/timings.py | 66 +++++++++++++++ docling_eval/visualisation/constants.py | 88 ++++++++++++++++++++ docling_eval/visualisation/visualisations.py | 3 +- poetry.lock | 49 ++--------- pyproject.toml | 2 +- 7 files changed, 176 insertions(+), 50 deletions(-) create mode 100644 docling_eval/evaluators/timings.py diff --git a/docling_eval/datamodels/types.py b/docling_eval/datamodels/types.py index 2b1d2966..04a0fd88 100644 --- a/docling_eval/datamodels/types.py +++ b/docling_eval/datamodels/types.py @@ -47,6 +47,7 @@ class EvaluationModality(str, Enum): OCR = "ocr" KEY_VALUE = "key_value" QUESTION_ANSWERING = "question_answering" + TIMINGS = "timings" class BenchMarkNames(str, Enum): diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index 465ed81b..f6e49c1d 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -145,14 +145,14 @@ def __call__( # Load the dataset split_path = str(ds_path / split / "*.parquet") split_files = glob.glob(split_path) - logging.info("Files: %s", split_files) + logging.info("#-files: %s", len(split_files)) ds = load_dataset("parquet", data_files={split: split_files}) logging.info("Overview of dataset: %s", ds) # Select the split ds_selection: Dataset = ds[split] - true_labels, pred_labels, intersection_labels = self._find_intersecting_labels( + true_labels, pred_labels, intersection_labels, union_labels = self._find_intersecting_labels( ds_selection ) true_labels_str = ", ".join(sorted(true_labels)) @@ -163,7 +163,9 @@ def __call__( intersection_labels_str = ", ".join(sorted(intersection_labels)) logging.info(f"Intersection labels: {intersection_labels_str}") - # intersection_labels_str = "\n" + "\n".join(sorted(intersection_labels)) + + union_labels_str = ", ".join(sorted(union_labels)) + logging.info(f"Union labels: {union_labels_str}") doc_ids = [] ground_truths = [] @@ -547,11 +549,18 @@ def _find_intersecting_labels( """ intersection_labels: List[DocItemLabel] = [] + union_labels: List[DocItemLabel] = [] for label, count in true_labels.items(): + union_labels.append(DocItemLabel(label)) + if label in pred_labels: intersection_labels.append(DocItemLabel(label)) - return true_labels, pred_labels, intersection_labels + for label, count in pred_labels.items(): + if label not in true_labels: + union_labels.append(DocItemLabel(label)) + + return true_labels, pred_labels, intersection_labels, union_labels def _extract_layout_data( self, diff --git a/docling_eval/evaluators/timings.py b/docling_eval/evaluators/timings.py new file mode 100644 index 00000000..73da40b5 --- /dev/null +++ b/docling_eval/evaluators/timings.py @@ -0,0 +1,66 @@ +import glob +import logging +from pathlib import Path + +from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction + +_log = logging.getLogger(__name__) + +class DatasetTimingEvaluation(DatasetEvaluation): + """Dataset timing evaluation.""" + + timing_per_page_stats: DatasetStatistics + +class TimingsEvaluator(BaseEvaluator): + def __init__( + self, + intermediate_evaluations_path: Optional[Path] = None, + prediction_sources: List[PredictionFormats] = [], + ): + if not prediction_sources: + prediction_sources = supported_prediction_formats + super().__init__( + intermediate_evaluations_path=intermediate_evaluations_path, + prediction_sources=prediction_sources, + supported_prediction_formats=supported_prediction_formats, + ) + + def __call__( + self, + ds_path: Path, + split: str = "test", + ) -> DatasetTimingsEvaluation: + logging.info("Loading the split '%s' from: '%s'", split, ds_path) + + # Load the dataset + split_path = str(ds_path / split / "*.parquet") + split_files = glob.glob(split_path) + logging.info("#-files: %s", len(split_files)) + ds = load_dataset("parquet", data_files={split: split_files}) + logging.info("Overview of dataset: %s", ds) + + # Select the split + ds_selection: Dataset = ds[split] + + timings = [] + for i, data in tqdm( + enumerate(ds_selection), + desc="Timings evaluations", + ncols=120, + total=len(ds_selection), + ): + data_record = DatasetRecordWithPrediction.model_validate(data) + + if data_record.status not in self._accepted_status: + _log.error( + "Skipping record without successfull conversion status: %s", doc_id + ) + rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1 + continue + + timings.append(data_record.timings) + + dataset_timing_evaluation = DatasetTimingEvaluation( + timing_per_page_stats=compute_stats([_.time for _ in timings]) + ) + return dataset_layout_evaluation diff --git a/docling_eval/visualisation/constants.py b/docling_eval/visualisation/constants.py index 823c258a..730ea9b6 100644 --- a/docling_eval/visualisation/constants.py +++ b/docling_eval/visualisation/constants.py @@ -109,6 +109,94 @@ """ +HTML_DEFAULT_HEAD_FOR_COMP_v2: str = r""" + + + +Powered by Docling + + +""" + + HTML_COMPARISON_PAGE_v1 = """ diff --git a/docling_eval/visualisation/visualisations.py b/docling_eval/visualisation/visualisations.py index f782fca6..f50c9f0d 100644 --- a/docling_eval/visualisation/visualisations.py +++ b/docling_eval/visualisation/visualisations.py @@ -25,6 +25,7 @@ HTML_COMPARISON_PAGE, HTML_COMPARISON_PAGE_WITH_CLUSTERS, HTML_DEFAULT_HEAD_FOR_COMP, + HTML_DEFAULT_HEAD_FOR_COMP_v2, HTML_INSPECTION, ) @@ -136,7 +137,7 @@ def get_missing_pageimg(width=800, height=1100, text="MISSING PAGE"): html_parts = [ "", "", - HTML_DEFAULT_HEAD_FOR_COMP, + HTML_DEFAULT_HEAD_FOR_COMP_v2, "", ] diff --git a/poetry.lock b/poetry.lock index 5be3a7a7..7f803956 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,38 +1,5 @@ # This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. -[[package]] -name = "accelerate" -version = "1.6.0" -description = "Accelerate" -optional = false -python-versions = ">=3.9.0" -groups = ["main"] -markers = "sys_platform != \"darwin\" or platform_machine != \"x86_64\"" -files = [ - {file = "accelerate-1.6.0-py3-none-any.whl", hash = "sha256:1aee717d3d3735ad6d09710a7c26990ee4652b79b4e93df46551551b5227c2aa"}, - {file = "accelerate-1.6.0.tar.gz", hash = "sha256:28c1ef1846e690944f98b68dc7b8bb6c51d032d45e85dcbb3adb0c8b99dffb32"}, -] - -[package.dependencies] -huggingface-hub = ">=0.21.0" -numpy = ">=1.17,<3.0.0" -packaging = ">=20.0" -psutil = "*" -pyyaml = "*" -safetensors = ">=0.4.3" -torch = ">=2.0.0" - -[package.extras] -deepspeed = ["deepspeed"] -dev = ["bitsandbytes", "black (>=23.1,<24.0)", "datasets", "diffusers", "evaluate", "hf-doc-builder (>=0.3.0)", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-order", "pytest-subtests", "pytest-xdist", "rich", "ruff (>=0.11.2,<0.12.0)", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"] -quality = ["black (>=23.1,<24.0)", "hf-doc-builder (>=0.3.0)", "ruff (>=0.11.2,<0.12.0)"] -rich = ["rich"] -sagemaker = ["sagemaker"] -test-dev = ["bitsandbytes", "datasets", "diffusers", "evaluate", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"] -test-prod = ["parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-order", "pytest-subtests", "pytest-xdist"] -test-trackers = ["comet-ml", "dvclive", "matplotlib", "mlflow", "tensorboard", "wandb"] -testing = ["bitsandbytes", "datasets", "diffusers", "evaluate", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-order", "pytest-subtests", "pytest-xdist", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"] - [[package]] name = "aiohappyeyeballs" version = "2.6.1" @@ -1482,18 +1449,17 @@ files = [ [[package]] name = "docling" -version = "2.30.0" +version = "2.31.0" description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." optional = false python-versions = "<4.0,>=3.9" groups = ["main"] files = [ - {file = "docling-2.30.0-py3-none-any.whl", hash = "sha256:88bc3f988116ea100ef1a025b623c94ae0010c11bc183f4773fb852a627d1d5d"}, - {file = "docling-2.30.0.tar.gz", hash = "sha256:6d31293d84ac9967101e394b7fa1b75be951775c1cb873d18b505e82c8d23c83"}, + {file = "docling-2.31.0-py3-none-any.whl", hash = "sha256:0a23c709aba5d3aa8f193e2211a7d3084af2b451f1c69deafdf81591179de779"}, + {file = "docling-2.31.0.tar.gz", hash = "sha256:1115f4cda7e67c70a6a61395aed65133f4e85e86914bdae5153c10a5ed329a71"}, ] [package.dependencies] -accelerate = {version = ">=1.2.1,<2.0.0", optional = true, markers = "(sys_platform != \"darwin\" or platform_machine != \"x86_64\") and extra == \"vlm\""} beautifulsoup4 = ">=4.12.3,<5.0.0" certifi = ">=2024.7.4" docling-core = {version = ">=2.26.0,<3.0.0", extras = ["chunking"]} @@ -1518,10 +1484,6 @@ requests = ">=2.32.2,<3.0.0" rtree = ">=1.3.0,<2.0.0" scipy = {version = ">=1.6.0,<2.0.0", markers = "python_version >= \"3.10\""} tqdm = ">=4.65.0,<5.0.0" -transformers = [ - {version = ">=4.46.0,<5.0.0", optional = true, markers = "(sys_platform != \"darwin\" or platform_machine != \"x86_64\") and extra == \"vlm\""}, - {version = ">=4.42.0,<4.43.0", optional = true, markers = "sys_platform == \"darwin\" and platform_machine == \"x86_64\" and extra == \"vlm\""}, -] typer = ">=0.12.5,<0.16.0" [package.extras] @@ -4796,7 +4758,7 @@ version = "7.0.0" description = "Cross-platform lib for process and system monitoring in Python. NOTE: the syntax of this script MUST be kept compatible with Python 2.7." optional = false python-versions = ">=3.6" -groups = ["main", "dev"] +groups = ["dev"] files = [ {file = "psutil-7.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25"}, {file = "psutil-7.0.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da"}, @@ -4809,7 +4771,6 @@ files = [ {file = "psutil-7.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:4cf3d4eb1aa9b348dec30105c55cd9b7d4629285735a102beb4441e38db90553"}, {file = "psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456"}, ] -markers = {main = "sys_platform != \"darwin\" or platform_machine != \"x86_64\""} [package.extras] dev = ["abi3audit", "black (==24.10.0)", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest", "pytest-cov", "pytest-xdist", "requests", "rstcheck", "ruff", "setuptools", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "vulture", "wheel"] @@ -7980,4 +7941,4 @@ hyperscalers = ["azure-ai-formrecognizer", "azure-common", "azure-core", "boto3" [metadata] lock-version = "2.1" python-versions = "^3.10" -content-hash = "ad48608fca439c925fd79021a7323b74448f506595a7c79f652a07e9538dbd13" +content-hash = "e953ef80d90d7f96d3c994c96801a8f442597062c8af2769c9e687c833075f1b" diff --git a/pyproject.toml b/pyproject.toml index fc672ad8..4379c140 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ lxml = "^5.3.0" datasets = "^3.2.0" apted = "^1.0.3" Distance = "^0.1.3" -docling = {extras = ["vlm"], version = "^2.28.0"} +docling = "^2.31.0" matplotlib = "^3.10.0" torch = "^2.5.1" torchmetrics = "^1.6.0" From 1a6c4862ab390d73a92e233fbdbaa7c671ebd44a Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Mon, 28 Apr 2025 09:58:54 +0200 Subject: [PATCH 03/15] updated the code with timings Signed-off-by: Peter Staar --- docling_eval/cli/main.py | 16 ++++++++++++ docling_eval/datamodels/dataset_record.py | 2 ++ .../{timings.py => timings_evaluator.py} | 26 +++++++++++++++++-- .../base_prediction_provider.py | 21 +++++++++++++-- .../prediction_providers/docling_provider.py | 14 ++++++++-- docling_eval/visualisation/constants.py | 6 +++++ 6 files changed, 79 insertions(+), 6 deletions(-) rename docling_eval/evaluators/{timings.py => timings_evaluator.py} (73%) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index 6580e542..189f7606 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -49,6 +49,12 @@ from docling_eval.dataset_builders.xfund_builder import XFUNDDatasetBuilder from docling_eval.evaluators.base_evaluator import DatasetEvaluationType from docling_eval.evaluators.bbox_text_evaluator import BboxTextEvaluator + +from docling_eval.evaluators.timings_evaluator import ( + DatasetTimingsEvaluation, + TimingsEvaluator, +) + from docling_eval.evaluators.layout_evaluator import ( DatasetLayoutEvaluation, LayoutEvaluator, @@ -423,7 +429,17 @@ def evaluate( if modality == EvaluationModality.END2END: _log.error("END2END evaluation not supported. ") + + elif modality == EvaluationModality.TIMINGS: + timings_evaluator = TimingsEvaluator() + evaluation = timings_evaluator( # type: ignore + idir, + split=split, + ) + with open(save_fn, "w") as fd: + json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True) + elif modality == EvaluationModality.LAYOUT: layout_evaluator = LayoutEvaluator() evaluation = layout_evaluator( # type: ignore diff --git a/docling_eval/datamodels/dataset_record.py b/docling_eval/datamodels/dataset_record.py index 94c9d710..ec6d74e1 100644 --- a/docling_eval/datamodels/dataset_record.py +++ b/docling_eval/datamodels/dataset_record.py @@ -173,6 +173,7 @@ class DatasetRecordWithPrediction(DatasetRecord): ) original_prediction: Optional[str] = None prediction_format: PredictionFormats # some enum type + prediction_timings: Dict = Field(alias="prediction_timings", default={}) predicted_page_images: List[PIL.Image.Image] = Field( alias="PredictionPageImages", default=[] @@ -201,6 +202,7 @@ def features(cls): cls.get_field_alias("mime_type"): Value("string"), cls.get_field_alias("modalities"): Sequence(Value("string")), cls.get_field_alias("prediction_format"): Value("string"), + cls.get_field_alias("prediction_timings"): Value("string"), } def as_record_dict(self): diff --git a/docling_eval/evaluators/timings.py b/docling_eval/evaluators/timings_evaluator.py similarity index 73% rename from docling_eval/evaluators/timings.py rename to docling_eval/evaluators/timings_evaluator.py index 73da40b5..ed5f543f 100644 --- a/docling_eval/evaluators/timings.py +++ b/docling_eval/evaluators/timings_evaluator.py @@ -1,22 +1,42 @@ import glob import logging from pathlib import Path +from typing import Dict, List, Optional, Tuple + +from tqdm import tqdm + +from datasets import Dataset, load_dataset + +from docling_eval.evaluators.base_evaluator import ( + BaseEvaluator, + DatasetEvaluation, + EvaluationRejectionType, + UnitEvaluation, +) +from docling_eval.evaluators.stats import DatasetStatistics, compute_stats from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction +from docling_eval.datamodels.types import BenchMarkColumns, PredictionFormats _log = logging.getLogger(__name__) -class DatasetTimingEvaluation(DatasetEvaluation): +class DatasetTimingsEvaluation(DatasetEvaluation): """Dataset timing evaluation.""" timing_per_page_stats: DatasetStatistics class TimingsEvaluator(BaseEvaluator): + """Timings evaluator.""" + def __init__( self, intermediate_evaluations_path: Optional[Path] = None, prediction_sources: List[PredictionFormats] = [], ): + supported_prediction_formats: List[PredictionFormats] = [ + PredictionFormats.DOCLING_DOCUMENT, + ] + if not prediction_sources: prediction_sources = supported_prediction_formats super().__init__( @@ -58,7 +78,9 @@ def __call__( rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1 continue - timings.append(data_record.timings) + print("data_record.prediction_timings: ", data_record.prediction_timings) + + timings.append(data_record.prediction_timings) dataset_timing_evaluation = DatasetTimingEvaluation( timing_per_page_stats=compute_stats([_.time for _ in timings]) diff --git a/docling_eval/prediction_providers/base_prediction_provider.py b/docling_eval/prediction_providers/base_prediction_provider.py index d280ed98..c9fcf55d 100644 --- a/docling_eval/prediction_providers/base_prediction_provider.py +++ b/docling_eval/prediction_providers/base_prediction_provider.py @@ -118,6 +118,8 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: Returns: Dataset record with prediction added """ + print("predict") + pred_record = self.create_dataset_record_with_prediction( record, DoclingDocument(name="dummy"), @@ -186,6 +188,7 @@ def create_dataset_record_with_prediction( record: DatasetRecord, predicted_doc: Optional[DoclingDocument] = None, original_prediction: Optional[str] = None, + timings: Optional[dict] = None, ) -> DatasetRecordWithPrediction: """ Create a dataset record with prediction from an input record. @@ -198,6 +201,8 @@ def create_dataset_record_with_prediction( Returns: Dataset record with prediction """ + print("create_dataset_record_with_prediction") + pred_page_images = [] pred_pictures = [] if predicted_doc is not None: @@ -208,6 +213,8 @@ def create_dataset_record_with_prediction( page_images_column=BenchMarkColumns.PREDICTION_PAGE_IMAGES.value, ) + print("timings: ", timings) + data = { **record.as_record_dict(), "predicted_doc": predicted_doc, @@ -215,6 +222,7 @@ def create_dataset_record_with_prediction( "predicted_pictures": pred_pictures, "original_prediction": original_prediction, "prediction_format": self.prediction_format, + "prediction_timings": self.prediction_timings, "predictor_info": self.info(), } return DatasetRecordWithPrediction.model_validate(data) @@ -229,9 +237,12 @@ def add_prediction(self, record: DatasetRecord) -> DatasetRecordWithPrediction: Returns: Dataset record with prediction """ + print("add_prediction") + # Copy the original input data to avoid modifying it input_data = copy.deepcopy(record.original) - + print("copy done") + # Convert Path to DocumentStream if needed if not isinstance(input_data, DocumentStream): if isinstance(input_data, Path): @@ -240,6 +251,8 @@ def add_prediction(self, record: DatasetRecord) -> DatasetRecordWithPrediction: ) record.original = input_data + print("update record") + pred_record = self.predict(record) return pred_record @@ -328,10 +341,14 @@ def _iterate_predictions() -> Iterable[DatasetRecordWithPrediction]: ncols=120, total=len(ds_selection), ): + print(f"record {i}") + try: record = DatasetRecord.model_validate(data) + print("record validated") pred_record = self.add_prediction(record) - + print("adding prediction") + if ( self.ignore_missing_predictions and pred_record.status == ConversionStatus.FAILURE diff --git a/docling_eval/prediction_providers/docling_provider.py b/docling_eval/prediction_providers/docling_provider.py index b86b619b..dd8c185e 100644 --- a/docling_eval/prediction_providers/docling_provider.py +++ b/docling_eval/prediction_providers/docling_provider.py @@ -7,6 +7,8 @@ from docling_core.types.doc import DocItemLabel from pydantic import TypeAdapter +from docling.datamodel.settings import settings + from docling_eval.datamodels.dataset_record import ( DatasetRecord, DatasetRecordWithPrediction, @@ -47,6 +49,7 @@ def __init__( ignore_missing_predictions: bool = True, true_labels: Optional[Set[DocItemLabel]] = None, pred_labels: Optional[Set[DocItemLabel]] = None, + profile_pipeline_timings: bool = True, ): """ Initialize the Docling prediction provider. @@ -65,6 +68,10 @@ def __init__( true_labels=true_labels, pred_labels=pred_labels, ) + + # Enable the profiling to measure the time spent + settings.debug.profile_pipeline_timings = profile_pipeline_timings + self.doc_converter = DocumentConverter(format_options=format_options) @property @@ -84,20 +91,23 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: Raises: RuntimeError: If original document stream is not available - """ + """ if record.original is None: raise RuntimeError( "Stream must be given for docling prediction provider to work." ) # Convert the document + print("Convert the document: ", record.doc_id) res = self.doc_converter.convert(copy.deepcopy(record.original)) - + print("done converting, timings: ", res.timings) + # Create prediction record pred_record = self.create_dataset_record_with_prediction( record, res.document, None, + res.timings ) pred_record.status = res.status diff --git a/docling_eval/visualisation/constants.py b/docling_eval/visualisation/constants.py index 730ea9b6..3a165a1e 100644 --- a/docling_eval/visualisation/constants.py +++ b/docling_eval/visualisation/constants.py @@ -129,6 +129,12 @@ font-size: 0.9em; /* Smaller text */ max-width: 100%; } +td { + width: 25%; +} +.page td { + width:auto; +} /* Create a flex container for columns */ .container { display: flex; From 71ed2488afcee198e34641708a967fa2b5d738f9 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Mon, 28 Apr 2025 13:38:33 +0200 Subject: [PATCH 04/15] added the timings modality Signed-off-by: Peter Staar --- docling_eval/cli/main.py | 33 +++++++++++++---- docling_eval/datamodels/dataset_record.py | 1 + .../doclingdpbench_builder.py | 2 +- docling_eval/evaluators/stats.py | 10 ++++-- docling_eval/evaluators/timings_evaluator.py | 10 +++--- .../base_prediction_provider.py | 35 +++++++++---------- .../prediction_providers/docling_provider.py | 6 ++-- docs/faq.md | 9 +++++ poetry.lock | 6 ++-- 9 files changed, 72 insertions(+), 40 deletions(-) create mode 100644 docs/faq.md diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index 189f7606..83d0946b 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -81,13 +81,15 @@ ) # Configure logging -logging.getLogger("docling").setLevel(logging.WARNING) -logging.getLogger("PIL").setLevel(logging.WARNING) -logging.getLogger("transformers").setLevel(logging.WARNING) -logging.getLogger("datasets").setLevel(logging.WARNING) -logging.getLogger("filelock").setLevel(logging.WARNING) -logging.getLogger("urllib3").setLevel(logging.WARNING) -logging.getLogger("docling_ibm_models").setLevel(logging.WARNING) +logging_level = logging.WARNING +#logging_level = logging.DEBUG +logging.getLogger("docling").setLevel(logging_level) +logging.getLogger("PIL").setLevel(logging_level) +logging.getLogger("transformers").setLevel(logging_level) +logging.getLogger("datasets").setLevel(logging_level) +logging.getLogger("filelock").setLevel(logging_level) +logging.getLogger("urllib3").setLevel(logging_level) +logging.getLogger("docling_ibm_models").setLevel(logging_level) _log = logging.getLogger(__name__) @@ -560,6 +562,23 @@ def visualize( if modality == EvaluationModality.END2END: _log.error("END2END visualization not supported") + elif modality == EvaluationModality.TIMINGS: + try: + with open(metrics_filename, "r") as fd: + timings_evaluation = DatasetTimingsEvaluation.model_validate_json( + fd.read() + ) + + log_and_save_stats( + odir, + benchmark, + modality, + "time_to_solution_", + timings_evaluation.timing_per_page_stats, + ) + except Exception as e: + _log.error(f"Error processing timings evaluation: {str(e)}") + elif modality == EvaluationModality.LAYOUT: try: with open(metrics_filename, "r") as fd: diff --git a/docling_eval/datamodels/dataset_record.py b/docling_eval/datamodels/dataset_record.py index ec6d74e1..930d64d3 100644 --- a/docling_eval/datamodels/dataset_record.py +++ b/docling_eval/datamodels/dataset_record.py @@ -210,6 +210,7 @@ def as_record_dict(self): record.update( { self.get_field_alias("prediction_format"): self.prediction_format.value, + self.get_field_alias("prediction_timings"): self.prediction_timings, } ) diff --git a/docling_eval/dataset_builders/doclingdpbench_builder.py b/docling_eval/dataset_builders/doclingdpbench_builder.py index 63fe49e0..22b4c4d1 100644 --- a/docling_eval/dataset_builders/doclingdpbench_builder.py +++ b/docling_eval/dataset_builders/doclingdpbench_builder.py @@ -66,7 +66,7 @@ def iterate(self) -> Iterable[DatasetRecord]: ) assert self.dataset_local_path is not None - print(f"dataset_local_path: {self.dataset_local_path}") + _log.info(f"dataset_local_path: {self.dataset_local_path}") # Login using e.g. `huggingface-cli login` to access this dataset ds = load_dataset("ds4sd/docling-dpbench") diff --git a/docling_eval/evaluators/stats.py b/docling_eval/evaluators/stats.py index ecd898a4..651de4f6 100644 --- a/docling_eval/evaluators/stats.py +++ b/docling_eval/evaluators/stats.py @@ -74,7 +74,7 @@ def save_histogram(self, figname: Path, name: str = ""): plt.savefig(figname) -def compute_stats(values: List[float]) -> DatasetStatistics: +def compute_stats(values: List[float], max_value_is_one: bool = True, nr_bins: int = 20) -> DatasetStatistics: total: int = len(values) mean: float = statistics.mean(values) if len(values) > 0 else -1 @@ -82,8 +82,12 @@ def compute_stats(values: List[float]) -> DatasetStatistics: std: float = statistics.stdev(values) if len(values) > 0 else -1 logging.info(f"total: {total}, mean: {mean}, median: {median}, std: {std}") - # Compute the histogram with 20 bins between 0 and 1 - hist, bins = np.histogram(values, bins=20, range=(0, 1)) + max_value = max(values) + if max_value_is_one: + max_value = 1.0 + + # Compute the histogram + hist, bins = np.histogram(values, bins=nr_bins, range=(0, max_value)) logging.info(f"#-hist: {len(hist)}, #-bins: {len(bins)}") return DatasetStatistics( diff --git a/docling_eval/evaluators/timings_evaluator.py b/docling_eval/evaluators/timings_evaluator.py index ed5f543f..df351a61 100644 --- a/docling_eval/evaluators/timings_evaluator.py +++ b/docling_eval/evaluators/timings_evaluator.py @@ -70,19 +70,17 @@ def __call__( total=len(ds_selection), ): data_record = DatasetRecordWithPrediction.model_validate(data) - + if data_record.status not in self._accepted_status: _log.error( "Skipping record without successfull conversion status: %s", doc_id ) rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1 continue - - print("data_record.prediction_timings: ", data_record.prediction_timings) timings.append(data_record.prediction_timings) - dataset_timing_evaluation = DatasetTimingEvaluation( - timing_per_page_stats=compute_stats([_.time for _ in timings]) + dataset_timings_evaluation = DatasetTimingsEvaluation( + timing_per_page_stats=compute_stats([_["pipeline_total"] for _ in timings], max_value_is_one=False, nr_bins=32) ) - return dataset_layout_evaluation + return dataset_timings_evaluation diff --git a/docling_eval/prediction_providers/base_prediction_provider.py b/docling_eval/prediction_providers/base_prediction_provider.py index c9fcf55d..5e3f483a 100644 --- a/docling_eval/prediction_providers/base_prediction_provider.py +++ b/docling_eval/prediction_providers/base_prediction_provider.py @@ -14,6 +14,8 @@ from docling_core.types.io import DocumentStream from tqdm import tqdm +from docling.utils.profiling import ProfilingItem + from docling_eval.datamodels.dataset_record import ( DatasetRecord, DatasetRecordWithPrediction, @@ -31,7 +33,7 @@ ) from docling_eval.visualisation.visualisations import save_comparison_html_with_clusters -# Get logger + _log = logging.getLogger(__name__) # Default HTML export labels for visualization @@ -118,8 +120,6 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: Returns: Dataset record with prediction added """ - print("predict") - pred_record = self.create_dataset_record_with_prediction( record, DoclingDocument(name="dummy"), @@ -201,8 +201,6 @@ def create_dataset_record_with_prediction( Returns: Dataset record with prediction """ - print("create_dataset_record_with_prediction") - pred_page_images = [] pred_pictures = [] if predicted_doc is not None: @@ -212,8 +210,6 @@ def create_dataset_record_with_prediction( pictures_column=BenchMarkColumns.PREDICTION_PICTURES.value, page_images_column=BenchMarkColumns.PREDICTION_PAGE_IMAGES.value, ) - - print("timings: ", timings) data = { **record.as_record_dict(), @@ -222,11 +218,23 @@ def create_dataset_record_with_prediction( "predicted_pictures": pred_pictures, "original_prediction": original_prediction, "prediction_format": self.prediction_format, - "prediction_timings": self.prediction_timings, + "prediction_timings": self._prediction_timings(timings), "predictor_info": self.info(), } - return DatasetRecordWithPrediction.model_validate(data) + record = DatasetRecordWithPrediction.model_validate(data) + + return record + + def _prediction_timings(self, timings): + """Get prediction timings.""" + result = {} + for key, val in timings.items(): + if key=="pipeline_total": + result[key] = float(val.avg()) + + return result + def add_prediction(self, record: DatasetRecord) -> DatasetRecordWithPrediction: """ Add a prediction to a dataset record. @@ -237,11 +245,8 @@ def add_prediction(self, record: DatasetRecord) -> DatasetRecordWithPrediction: Returns: Dataset record with prediction """ - print("add_prediction") - # Copy the original input data to avoid modifying it input_data = copy.deepcopy(record.original) - print("copy done") # Convert Path to DocumentStream if needed if not isinstance(input_data, DocumentStream): @@ -251,8 +256,6 @@ def add_prediction(self, record: DatasetRecord) -> DatasetRecordWithPrediction: ) record.original = input_data - print("update record") - pred_record = self.predict(record) return pred_record @@ -341,13 +344,9 @@ def _iterate_predictions() -> Iterable[DatasetRecordWithPrediction]: ncols=120, total=len(ds_selection), ): - print(f"record {i}") - try: record = DatasetRecord.model_validate(data) - print("record validated") pred_record = self.add_prediction(record) - print("adding prediction") if ( self.ignore_missing_predictions diff --git a/docling_eval/prediction_providers/docling_provider.py b/docling_eval/prediction_providers/docling_provider.py index dd8c185e..fe86b312 100644 --- a/docling_eval/prediction_providers/docling_provider.py +++ b/docling_eval/prediction_providers/docling_provider.py @@ -1,6 +1,7 @@ import copy import platform from typing import Dict, List, Optional, Set +import logging from docling.datamodel.base_models import InputFormat from docling.document_converter import DocumentConverter, FormatOption @@ -24,6 +25,8 @@ from docling_eval.utils.utils import docling_version, get_package_version +_log = logging.getLogger(__name__) + class DoclingPredictionProvider(BasePredictionProvider): """ Prediction provider that uses Docling document converter. @@ -71,6 +74,7 @@ def __init__( # Enable the profiling to measure the time spent settings.debug.profile_pipeline_timings = profile_pipeline_timings + _log.info(f"profile_pipeline_timings: {profile_pipeline_timings}") self.doc_converter = DocumentConverter(format_options=format_options) @@ -98,9 +102,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: ) # Convert the document - print("Convert the document: ", record.doc_id) res = self.doc_converter.convert(copy.deepcopy(record.original)) - print("done converting, timings: ", res.timings) # Create prediction record pred_record = self.create_dataset_record_with_prediction( diff --git a/docs/faq.md b/docs/faq.md new file mode 100644 index 00000000..9d2f5c67 --- /dev/null +++ b/docs/faq.md @@ -0,0 +1,9 @@ +# Frequently Asked Questions + +## docling-eval seem stuck + +Add the environment variable (in case HF is not responding), + +```sh +caffeinate HF_HUB_OFFLINE=1 poetry run docling_eval create-eval --benchmark DocLayNetV1 --gt-dir ./benchmarks/DocLayNetV1/gt_dataset --output-dir ./benchmarks/DocLayNetV1/smoldocling_v4 --prediction-provider SmolDocling --end-index 256 +``` \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 7f803956..141cf097 100644 --- a/poetry.lock +++ b/poetry.lock @@ -7331,14 +7331,14 @@ urllib3 = ">=1.26.0" [[package]] name = "typer" -version = "0.15.2" +version = "0.12.5" description = "Typer, build great CLIs. Easy to code. Based on Python type hints." optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "typer-0.15.2-py3-none-any.whl", hash = "sha256:46a499c6107d645a9c13f7ee46c5d5096cae6f5fc57dd11eccbbb9ae3e44ddfc"}, - {file = "typer-0.15.2.tar.gz", hash = "sha256:ab2fab47533a813c49fe1f16b1a370fd5819099c00b119e0633df65f22144ba5"}, + {file = "typer-0.12.5-py3-none-any.whl", hash = "sha256:62fe4e471711b147e3365034133904df3e235698399bc4de2b36c8579298d52b"}, + {file = "typer-0.12.5.tar.gz", hash = "sha256:f592f089bedcc8ec1b974125d64851029c3b1af145f04aca64d69410f0c9b722"}, ] [package.dependencies] From 7ca9bbb679592c135993036e86760726ff1c8d96 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Mon, 28 Apr 2025 13:49:52 +0200 Subject: [PATCH 05/15] reformatted the code Signed-off-by: Peter Staar --- docling_eval/cli/main.py | 20 ++++---- .../doclingdpbench_builder.py | 3 ++ docling_eval/evaluators/layout_evaluator.py | 10 ++-- docling_eval/evaluators/stats.py | 6 ++- docling_eval/evaluators/timings_evaluator.py | 46 +++++++++++++------ .../base_prediction_provider.py | 22 ++++----- .../prediction_providers/docling_provider.py | 18 +++----- docling_eval/visualisation/constants.py | 1 - docling_eval/visualisation/visualisations.py | 2 +- 9 files changed, 71 insertions(+), 57 deletions(-) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index 83d0946b..af07ca02 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -49,12 +49,6 @@ from docling_eval.dataset_builders.xfund_builder import XFUNDDatasetBuilder from docling_eval.evaluators.base_evaluator import DatasetEvaluationType from docling_eval.evaluators.bbox_text_evaluator import BboxTextEvaluator - -from docling_eval.evaluators.timings_evaluator import ( - DatasetTimingsEvaluation, - TimingsEvaluator, -) - from docling_eval.evaluators.layout_evaluator import ( DatasetLayoutEvaluation, LayoutEvaluator, @@ -74,6 +68,10 @@ DatasetTableEvaluation, TableEvaluator, ) +from docling_eval.evaluators.timings_evaluator import ( + DatasetTimingsEvaluation, + TimingsEvaluator, +) from docling_eval.prediction_providers.docling_provider import DoclingPredictionProvider from docling_eval.prediction_providers.file_provider import FilePredictionProvider from docling_eval.prediction_providers.tableformer_provider import ( @@ -82,7 +80,7 @@ # Configure logging logging_level = logging.WARNING -#logging_level = logging.DEBUG +# logging_level = logging.DEBUG logging.getLogger("docling").setLevel(logging_level) logging.getLogger("PIL").setLevel(logging_level) logging.getLogger("transformers").setLevel(logging_level) @@ -431,7 +429,7 @@ def evaluate( if modality == EvaluationModality.END2END: _log.error("END2END evaluation not supported. ") - + elif modality == EvaluationModality.TIMINGS: timings_evaluator = TimingsEvaluator() evaluation = timings_evaluator( # type: ignore @@ -441,7 +439,7 @@ def evaluate( with open(save_fn, "w") as fd: json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True) - + elif modality == EvaluationModality.LAYOUT: layout_evaluator = LayoutEvaluator() evaluation = layout_evaluator( # type: ignore @@ -577,8 +575,8 @@ def visualize( timings_evaluation.timing_per_page_stats, ) except Exception as e: - _log.error(f"Error processing timings evaluation: {str(e)}") - + _log.error(f"Error processing timings evaluation: {str(e)}") + elif modality == EvaluationModality.LAYOUT: try: with open(metrics_filename, "r") as fd: diff --git a/docling_eval/dataset_builders/doclingdpbench_builder.py b/docling_eval/dataset_builders/doclingdpbench_builder.py index 22b4c4d1..6f2178bf 100644 --- a/docling_eval/dataset_builders/doclingdpbench_builder.py +++ b/docling_eval/dataset_builders/doclingdpbench_builder.py @@ -17,6 +17,9 @@ ) from docling_eval.utils.utils import get_binary, get_binhash +# Get logger +_log = logging.getLogger(__name__) + class DoclingDPBenchDatasetBuilder(BaseEvaluationDatasetBuilder): """ diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index f6e49c1d..31ee0a5e 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -152,8 +152,8 @@ def __call__( # Select the split ds_selection: Dataset = ds[split] - true_labels, pred_labels, intersection_labels, union_labels = self._find_intersecting_labels( - ds_selection + true_labels, pred_labels, intersection_labels, union_labels = ( + self._find_intersecting_labels(ds_selection) ) true_labels_str = ", ".join(sorted(true_labels)) logging.info(f"True labels: {true_labels_str}") @@ -165,7 +165,7 @@ def __call__( logging.info(f"Intersection labels: {intersection_labels_str}") union_labels_str = ", ".join(sorted(union_labels)) - logging.info(f"Union labels: {union_labels_str}") + logging.info(f"Union labels: {union_labels_str}") doc_ids = [] ground_truths = [] @@ -496,7 +496,7 @@ def _compute_average_iou_with_labels_across_iou( def _find_intersecting_labels( self, ds: Dataset, - ) -> tuple[dict[str, int], dict[str, int], list[DocItemLabel]]: + ) -> tuple[dict[str, int], dict[str, int], list[DocItemLabel], list[DocItemLabel]]: r""" Compute counters per labels for the groundtruth, prediciton and their intersections @@ -552,7 +552,7 @@ def _find_intersecting_labels( union_labels: List[DocItemLabel] = [] for label, count in true_labels.items(): union_labels.append(DocItemLabel(label)) - + if label in pred_labels: intersection_labels.append(DocItemLabel(label)) diff --git a/docling_eval/evaluators/stats.py b/docling_eval/evaluators/stats.py index 651de4f6..1f379791 100644 --- a/docling_eval/evaluators/stats.py +++ b/docling_eval/evaluators/stats.py @@ -74,7 +74,9 @@ def save_histogram(self, figname: Path, name: str = ""): plt.savefig(figname) -def compute_stats(values: List[float], max_value_is_one: bool = True, nr_bins: int = 20) -> DatasetStatistics: +def compute_stats( + values: List[float], max_value_is_one: bool = True, nr_bins: int = 20 +) -> DatasetStatistics: total: int = len(values) mean: float = statistics.mean(values) if len(values) > 0 else -1 @@ -85,7 +87,7 @@ def compute_stats(values: List[float], max_value_is_one: bool = True, nr_bins: i max_value = max(values) if max_value_is_one: max_value = 1.0 - + # Compute the histogram hist, bins = np.histogram(values, bins=nr_bins, range=(0, max_value)) logging.info(f"#-hist: {len(hist)}, #-bins: {len(bins)}") diff --git a/docling_eval/evaluators/timings_evaluator.py b/docling_eval/evaluators/timings_evaluator.py index df351a61..e0b74a8e 100644 --- a/docling_eval/evaluators/timings_evaluator.py +++ b/docling_eval/evaluators/timings_evaluator.py @@ -3,10 +3,11 @@ from pathlib import Path from typing import Dict, List, Optional, Tuple -from tqdm import tqdm - from datasets import Dataset, load_dataset +from tqdm import tqdm +from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction +from docling_eval.datamodels.types import BenchMarkColumns, PredictionFormats from docling_eval.evaluators.base_evaluator import ( BaseEvaluator, DatasetEvaluation, @@ -15,28 +16,27 @@ ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction -from docling_eval.datamodels.types import BenchMarkColumns, PredictionFormats - _log = logging.getLogger(__name__) + class DatasetTimingsEvaluation(DatasetEvaluation): """Dataset timing evaluation.""" timing_per_page_stats: DatasetStatistics - + + class TimingsEvaluator(BaseEvaluator): """Timings evaluator.""" - + def __init__( self, intermediate_evaluations_path: Optional[Path] = None, - prediction_sources: List[PredictionFormats] = [], + prediction_sources: List[PredictionFormats] = [], ): supported_prediction_formats: List[PredictionFormats] = [ PredictionFormats.DOCLING_DOCUMENT, ] - + if not prediction_sources: prediction_sources = supported_prediction_formats super().__init__( @@ -52,12 +52,18 @@ def __call__( ) -> DatasetTimingsEvaluation: logging.info("Loading the split '%s' from: '%s'", split, ds_path) + rejected_samples: Dict[EvaluationRejectionType, int] = { + EvaluationRejectionType.INVALID_CONVERSION_STATUS: 0, + EvaluationRejectionType.MISSING_PREDICTION: 0, + EvaluationRejectionType.MISMATHCED_DOCUMENT: 0, + } + # Load the dataset split_path = str(ds_path / split / "*.parquet") split_files = glob.glob(split_path) logging.info("#-files: %s", len(split_files)) ds = load_dataset("parquet", data_files={split: split_files}) - logging.info("Overview of dataset: %s", ds) + logging.info("Overview of dataset: %s", ds) # Select the split ds_selection: Dataset = ds[split] @@ -70,17 +76,29 @@ def __call__( total=len(ds_selection), ): data_record = DatasetRecordWithPrediction.model_validate(data) - + + doc_id = data_record.doc_id if data_record.status not in self._accepted_status: _log.error( "Skipping record without successfull conversion status: %s", doc_id ) rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1 continue - + timings.append(data_record.prediction_timings) - + + if rejected_samples[EvaluationRejectionType.MISMATHCED_DOCUMENT] > 0: + logging.error( + "Total mismatched/skipped documents: %s over %s", + rejected_samples[EvaluationRejectionType.MISMATHCED_DOCUMENT], + len(ds_selection), + ) + dataset_timings_evaluation = DatasetTimingsEvaluation( - timing_per_page_stats=compute_stats([_["pipeline_total"] for _ in timings], max_value_is_one=False, nr_bins=32) + timing_per_page_stats=compute_stats( + [_["pipeline_total"] for _ in timings], + max_value_is_one=False, + nr_bins=32, + ) ) return dataset_timings_evaluation diff --git a/docling_eval/prediction_providers/base_prediction_provider.py b/docling_eval/prediction_providers/base_prediction_provider.py index 5e3f483a..01e3c6c2 100644 --- a/docling_eval/prediction_providers/base_prediction_provider.py +++ b/docling_eval/prediction_providers/base_prediction_provider.py @@ -8,14 +8,13 @@ from datasets import load_dataset from docling.datamodel.base_models import ConversionStatus +from docling.utils.profiling import ProfilingItem from docling.utils.utils import chunkify from docling_core.types.doc import DocItemLabel from docling_core.types.doc.document import DoclingDocument from docling_core.types.io import DocumentStream from tqdm import tqdm -from docling.utils.profiling import ProfilingItem - from docling_eval.datamodels.dataset_record import ( DatasetRecord, DatasetRecordWithPrediction, @@ -33,7 +32,6 @@ ) from docling_eval.visualisation.visualisations import save_comparison_html_with_clusters - _log = logging.getLogger(__name__) # Default HTML export labels for visualization @@ -210,7 +208,7 @@ def create_dataset_record_with_prediction( pictures_column=BenchMarkColumns.PREDICTION_PICTURES.value, page_images_column=BenchMarkColumns.PREDICTION_PAGE_IMAGES.value, ) - + data = { **record.as_record_dict(), "predicted_doc": predicted_doc, @@ -221,20 +219,20 @@ def create_dataset_record_with_prediction( "prediction_timings": self._prediction_timings(timings), "predictor_info": self.info(), } - record = DatasetRecordWithPrediction.model_validate(data) - + record = DatasetRecordWithPrediction.model_validate(data) + return record - + def _prediction_timings(self, timings): """Get prediction timings.""" result = {} for key, val in timings.items(): - if key=="pipeline_total": + if key == "pipeline_total": result[key] = float(val.avg()) - + return result - + def add_prediction(self, record: DatasetRecord) -> DatasetRecordWithPrediction: """ Add a prediction to a dataset record. @@ -247,7 +245,7 @@ def add_prediction(self, record: DatasetRecord) -> DatasetRecordWithPrediction: """ # Copy the original input data to avoid modifying it input_data = copy.deepcopy(record.original) - + # Convert Path to DocumentStream if needed if not isinstance(input_data, DocumentStream): if isinstance(input_data, Path): @@ -347,7 +345,7 @@ def _iterate_predictions() -> Iterable[DatasetRecordWithPrediction]: try: record = DatasetRecord.model_validate(data) pred_record = self.add_prediction(record) - + if ( self.ignore_missing_predictions and pred_record.status == ConversionStatus.FAILURE diff --git a/docling_eval/prediction_providers/docling_provider.py b/docling_eval/prediction_providers/docling_provider.py index fe86b312..2a4a2b8c 100644 --- a/docling_eval/prediction_providers/docling_provider.py +++ b/docling_eval/prediction_providers/docling_provider.py @@ -1,15 +1,14 @@ import copy +import logging import platform from typing import Dict, List, Optional, Set -import logging from docling.datamodel.base_models import InputFormat +from docling.datamodel.settings import settings from docling.document_converter import DocumentConverter, FormatOption from docling_core.types.doc import DocItemLabel from pydantic import TypeAdapter -from docling.datamodel.settings import settings - from docling_eval.datamodels.dataset_record import ( DatasetRecord, DatasetRecordWithPrediction, @@ -24,9 +23,9 @@ ) from docling_eval.utils.utils import docling_version, get_package_version - _log = logging.getLogger(__name__) + class DoclingPredictionProvider(BasePredictionProvider): """ Prediction provider that uses Docling document converter. @@ -75,7 +74,7 @@ def __init__( # Enable the profiling to measure the time spent settings.debug.profile_pipeline_timings = profile_pipeline_timings _log.info(f"profile_pipeline_timings: {profile_pipeline_timings}") - + self.doc_converter = DocumentConverter(format_options=format_options) @property @@ -95,7 +94,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: Raises: RuntimeError: If original document stream is not available - """ + """ if record.original is None: raise RuntimeError( "Stream must be given for docling prediction provider to work." @@ -103,13 +102,10 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: # Convert the document res = self.doc_converter.convert(copy.deepcopy(record.original)) - + # Create prediction record pred_record = self.create_dataset_record_with_prediction( - record, - res.document, - None, - res.timings + record, res.document, None, res.timings ) pred_record.status = res.status diff --git a/docling_eval/visualisation/constants.py b/docling_eval/visualisation/constants.py index 3a165a1e..59c12c5a 100644 --- a/docling_eval/visualisation/constants.py +++ b/docling_eval/visualisation/constants.py @@ -203,7 +203,6 @@ """ - HTML_COMPARISON_PAGE_v1 = """ diff --git a/docling_eval/visualisation/visualisations.py b/docling_eval/visualisation/visualisations.py index f50c9f0d..c145922a 100644 --- a/docling_eval/visualisation/visualisations.py +++ b/docling_eval/visualisation/visualisations.py @@ -25,8 +25,8 @@ HTML_COMPARISON_PAGE, HTML_COMPARISON_PAGE_WITH_CLUSTERS, HTML_DEFAULT_HEAD_FOR_COMP, - HTML_DEFAULT_HEAD_FOR_COMP_v2, HTML_INSPECTION, + HTML_DEFAULT_HEAD_FOR_COMP_v2, ) From 6d8f33b1240a7d65952c881f6f3aa167687aa020 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Mon, 28 Apr 2025 14:11:48 +0200 Subject: [PATCH 06/15] fixed the test Signed-off-by: Peter Staar --- docling_eval/evaluators/stats.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docling_eval/evaluators/stats.py b/docling_eval/evaluators/stats.py index 1f379791..d1538547 100644 --- a/docling_eval/evaluators/stats.py +++ b/docling_eval/evaluators/stats.py @@ -84,9 +84,9 @@ def compute_stats( std: float = statistics.stdev(values) if len(values) > 0 else -1 logging.info(f"total: {total}, mean: {mean}, median: {median}, std: {std}") - max_value = max(values) - if max_value_is_one: - max_value = 1.0 + max_value = 1.0 + if not max_value_is_one and len(values) > 0: + max_value = max(values) # Compute the histogram hist, bins = np.histogram(values, bins=nr_bins, range=(0, max_value)) From 29bfe65472098246492af22dc507f68574957650 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 29 Apr 2025 07:15:45 +0200 Subject: [PATCH 07/15] ran the test_run_dpbench_tables with success Signed-off-by: Peter Staar --- poetry.lock | 75 ++++++++++++++++++++++++++++++++++++++------------ pyproject.toml | 2 +- 2 files changed, 58 insertions(+), 19 deletions(-) diff --git a/poetry.lock b/poetry.lock index 141cf097..d6f8f62e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,38 @@ # This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +[[package]] +name = "accelerate" +version = "1.6.0" +description = "Accelerate" +optional = false +python-versions = ">=3.9.0" +groups = ["main"] +markers = "sys_platform != \"darwin\" or platform_machine != \"x86_64\"" +files = [ + {file = "accelerate-1.6.0-py3-none-any.whl", hash = "sha256:1aee717d3d3735ad6d09710a7c26990ee4652b79b4e93df46551551b5227c2aa"}, + {file = "accelerate-1.6.0.tar.gz", hash = "sha256:28c1ef1846e690944f98b68dc7b8bb6c51d032d45e85dcbb3adb0c8b99dffb32"}, +] + +[package.dependencies] +huggingface-hub = ">=0.21.0" +numpy = ">=1.17,<3.0.0" +packaging = ">=20.0" +psutil = "*" +pyyaml = "*" +safetensors = ">=0.4.3" +torch = ">=2.0.0" + +[package.extras] +deepspeed = ["deepspeed"] +dev = ["bitsandbytes", "black (>=23.1,<24.0)", "datasets", "diffusers", "evaluate", "hf-doc-builder (>=0.3.0)", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-order", "pytest-subtests", "pytest-xdist", "rich", "ruff (>=0.11.2,<0.12.0)", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"] +quality = ["black (>=23.1,<24.0)", "hf-doc-builder (>=0.3.0)", "ruff (>=0.11.2,<0.12.0)"] +rich = ["rich"] +sagemaker = ["sagemaker"] +test-dev = ["bitsandbytes", "datasets", "diffusers", "evaluate", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"] +test-prod = ["parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-order", "pytest-subtests", "pytest-xdist"] +test-trackers = ["comet-ml", "dvclive", "matplotlib", "mlflow", "tensorboard", "wandb"] +testing = ["bitsandbytes", "datasets", "diffusers", "evaluate", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-order", "pytest-subtests", "pytest-xdist", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"] + [[package]] name = "aiohappyeyeballs" version = "2.6.1" @@ -1460,6 +1493,7 @@ files = [ ] [package.dependencies] +accelerate = {version = ">=1.2.1,<2.0.0", optional = true, markers = "(sys_platform != \"darwin\" or platform_machine != \"x86_64\") and extra == \"vlm\""} beautifulsoup4 = ">=4.12.3,<5.0.0" certifi = ">=2024.7.4" docling-core = {version = ">=2.26.0,<3.0.0", extras = ["chunking"]} @@ -1484,6 +1518,10 @@ requests = ">=2.32.2,<3.0.0" rtree = ">=1.3.0,<2.0.0" scipy = {version = ">=1.6.0,<2.0.0", markers = "python_version >= \"3.10\""} tqdm = ">=4.65.0,<5.0.0" +transformers = [ + {version = ">=4.42.0,<4.43.0", optional = true, markers = "sys_platform == \"darwin\" and platform_machine == \"x86_64\" and extra == \"vlm\""}, + {version = ">=4.46.0,<5.0.0", optional = true, markers = "(sys_platform != \"darwin\" or platform_machine != \"x86_64\") and extra == \"vlm\""}, +] typer = ">=0.12.5,<0.16.0" [package.extras] @@ -1539,8 +1577,8 @@ docling-core = ">=2.19.0,<3.0.0" huggingface_hub = ">=0.23,<1" jsonlines = ">=3.1.0,<4.0.0" numpy = [ - {version = ">=1.24.4,<3.0.0", markers = "sys_platform != \"darwin\" or platform_machine != \"x86_64\""}, {version = ">=1.24.4,<2.0.0", markers = "sys_platform == \"darwin\" and platform_machine == \"x86_64\""}, + {version = ">=1.24.4,<3.0.0", markers = "sys_platform != \"darwin\" or platform_machine != \"x86_64\""}, ] opencv-python-headless = ">=4.6.0.66,<5.0.0.0" Pillow = ">=10.0.0,<12.0.0" @@ -1550,8 +1588,8 @@ torch = ">=2.2.2,<3.0.0" torchvision = ">=0,<1" tqdm = ">=4.64.0,<5.0.0" transformers = [ - {version = ">=4.42.0,<5.0.0", markers = "sys_platform != \"darwin\" or platform_machine != \"x86_64\""}, {version = ">=4.42.0,<4.43.0", markers = "sys_platform == \"darwin\" and platform_machine == \"x86_64\""}, + {version = ">=4.42.0,<5.0.0", markers = "sys_platform != \"darwin\" or platform_machine != \"x86_64\""}, ] [[package]] @@ -4032,7 +4070,7 @@ description = "CUBLAS native runtime libraries" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3"}, {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b"}, @@ -4046,7 +4084,7 @@ description = "CUDA profiling tools runtime libs." optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a"}, {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb"}, @@ -4060,7 +4098,7 @@ description = "NVRTC native runtime libraries" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198"}, {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338"}, @@ -4074,7 +4112,7 @@ description = "CUDA Runtime native Libraries" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3"}, {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5"}, @@ -4088,7 +4126,7 @@ description = "cuDNN runtime libraries" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f"}, {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-win_amd64.whl", hash = "sha256:6278562929433d68365a07a4a1546c237ba2849852c0d4b2262a486e805b977a"}, @@ -4104,7 +4142,7 @@ description = "CUFFT native runtime libraries" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399"}, {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9"}, @@ -4121,7 +4159,7 @@ description = "CURAND native runtime libraries" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9"}, {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b"}, @@ -4135,7 +4173,7 @@ description = "CUDA solver native runtime libraries" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e"}, {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260"}, @@ -4154,7 +4192,7 @@ description = "CUSPARSE native runtime libraries" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3"}, {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1"}, @@ -4171,7 +4209,7 @@ description = "NVIDIA cuSPARSELt" optional = false python-versions = "*" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:067a7f6d03ea0d4841c85f0c6f1991c5dda98211f6302cb83a4ab234ee95bef8"}, {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9"}, @@ -4185,7 +4223,7 @@ description = "NVIDIA Collective Communication Library (NCCL) Runtime" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8579076d30a8c24988834445f8d633c697d42397e92ffc3f63fa26766d25e0a0"}, ] @@ -4197,7 +4235,7 @@ description = "Nvidia JIT LTO Library" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83"}, {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57"}, @@ -4211,7 +4249,7 @@ description = "NVIDIA Tools Extension" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3"}, {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a"}, @@ -4758,7 +4796,7 @@ version = "7.0.0" description = "Cross-platform lib for process and system monitoring in Python. NOTE: the syntax of this script MUST be kept compatible with Python 2.7." optional = false python-versions = ">=3.6" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "psutil-7.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25"}, {file = "psutil-7.0.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da"}, @@ -4771,6 +4809,7 @@ files = [ {file = "psutil-7.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:4cf3d4eb1aa9b348dec30105c55cd9b7d4629285735a102beb4441e38db90553"}, {file = "psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456"}, ] +markers = {main = "sys_platform != \"darwin\" or platform_machine != \"x86_64\""} [package.extras] dev = ["abi3audit", "black (==24.10.0)", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest", "pytest-cov", "pytest-xdist", "requests", "rstcheck", "ruff", "setuptools", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "vulture", "wheel"] @@ -7291,7 +7330,7 @@ description = "A language and compiler for custom Deep Learning operations" optional = false python-versions = "*" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3e54983cd51875855da7c68ec05c05cf8bb08df361b1d5b69e05e40b0c9bd62"}, {file = "triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8009a1fb093ee8546495e96731336a33fb8856a38e45bb4ab6affd6dbc3ba220"}, @@ -7941,4 +7980,4 @@ hyperscalers = ["azure-ai-formrecognizer", "azure-common", "azure-core", "boto3" [metadata] lock-version = "2.1" python-versions = "^3.10" -content-hash = "e953ef80d90d7f96d3c994c96801a8f442597062c8af2769c9e687c833075f1b" +content-hash = "0567ed7bf16453af1997ddb9ba9c63b6edc80c59fd7b752457f1f2b1f5f4cf77" diff --git a/pyproject.toml b/pyproject.toml index 4379c140..2007eeeb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ lxml = "^5.3.0" datasets = "^3.2.0" apted = "^1.0.3" Distance = "^0.1.3" -docling = "^2.31.0" +docling = {extras = ["vlm"], version = "^2.31.0"} matplotlib = "^3.10.0" torch = "^2.5.1" torchmetrics = "^1.6.0" From 4415bab6a538bdc2dae950ae3fcae86b4bf65c9b Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 29 Apr 2025 07:37:49 +0200 Subject: [PATCH 08/15] commented out test_run_dpbench_tables Signed-off-by: Peter Staar --- tests/test_dataset_builder.py | 54 +++++++++++++++++------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/test_dataset_builder.py b/tests/test_dataset_builder.py index 6ba121ec..fcbce2c0 100644 --- a/tests/test_dataset_builder.py +++ b/tests/test_dataset_builder.py @@ -225,33 +225,33 @@ def test_run_omnidocbench_e2e(): ) -@pytest.mark.dependency( - depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], - scope="session", -) -def test_run_dpbench_tables(): - target_path = Path(f"./scratch/{BenchMarkNames.DPBENCH.value}/") - tableformer_provider = TableFormerPredictionProvider(do_visualization=True) - - tableformer_provider.create_prediction_dataset( - name="DPBench tables eval", - gt_dataset_dir=target_path / "gt_dataset", - target_dataset_dir=target_path / "eval_dataset_tables", - ) - - evaluate( - modality=EvaluationModality.TABLE_STRUCTURE, - benchmark=BenchMarkNames.DPBENCH, - idir=target_path / "eval_dataset_tables", - odir=target_path / "evaluations" / EvaluationModality.TABLE_STRUCTURE.value, - ) - - visualize( - modality=EvaluationModality.TABLE_STRUCTURE, - benchmark=BenchMarkNames.DPBENCH, - idir=target_path / "eval_dataset_tables", - odir=target_path / "evaluations" / EvaluationModality.TABLE_STRUCTURE.value, - ) +#@pytest.mark.dependency( +# depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], +# scope="session", +#) +#def test_run_dpbench_tables(): +# target_path = Path(f"./scratch/{BenchMarkNames.DPBENCH.value}/") +# tableformer_provider = TableFormerPredictionProvider(do_visualization=True) +# +# tableformer_provider.create_prediction_dataset( +# name="DPBench tables eval", +# gt_dataset_dir=target_path / "gt_dataset", +# target_dataset_dir=target_path / "eval_dataset_tables", +# ) +# +# evaluate( +# modality=EvaluationModality.TABLE_STRUCTURE, +# benchmark=BenchMarkNames.DPBENCH, +# idir=target_path / "eval_dataset_tables", +# odir=target_path / "evaluations" / EvaluationModality.TABLE_STRUCTURE.value, +# ) +# +# visualize( +# modality=EvaluationModality.TABLE_STRUCTURE, +# benchmark=BenchMarkNames.DPBENCH, +# idir=target_path / "eval_dataset_tables", +# odir=target_path / "evaluations" / EvaluationModality.TABLE_STRUCTURE.value, +# ) @pytest.mark.skipif( From f78bd2a91c43f95f003f16e351cfb9ee3d6d06d6 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 29 Apr 2025 07:49:58 +0200 Subject: [PATCH 09/15] reformatted code Signed-off-by: Peter Staar --- tests/test_dataset_builder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_dataset_builder.py b/tests/test_dataset_builder.py index fcbce2c0..5f105248 100644 --- a/tests/test_dataset_builder.py +++ b/tests/test_dataset_builder.py @@ -225,11 +225,11 @@ def test_run_omnidocbench_e2e(): ) -#@pytest.mark.dependency( +# @pytest.mark.dependency( # depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], # scope="session", -#) -#def test_run_dpbench_tables(): +# ) +# def test_run_dpbench_tables(): # target_path = Path(f"./scratch/{BenchMarkNames.DPBENCH.value}/") # tableformer_provider = TableFormerPredictionProvider(do_visualization=True) # From ea482ad7192cd820353ea1f16d431514e7bd0a6f Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 29 Apr 2025 08:51:00 +0200 Subject: [PATCH 10/15] found potential bug in base_prediction_provider Signed-off-by: Peter Staar --- .../base_prediction_provider.py | 20 ++++--- tests/test_dataset_builder.py | 54 +++++++++---------- 2 files changed, 40 insertions(+), 34 deletions(-) diff --git a/docling_eval/prediction_providers/base_prediction_provider.py b/docling_eval/prediction_providers/base_prediction_provider.py index 01e3c6c2..dbc73209 100644 --- a/docling_eval/prediction_providers/base_prediction_provider.py +++ b/docling_eval/prediction_providers/base_prediction_provider.py @@ -223,15 +223,21 @@ def create_dataset_record_with_prediction( return record - def _prediction_timings(self, timings): + def _prediction_timings(self, timings: Optional[dict]) -> dict: """Get prediction timings.""" - result = {} - for key, val in timings.items(): - if key == "pipeline_total": - result[key] = float(val.avg()) - - return result + if isinstance(timings, dict): + result = {} + for key, val in timings.items(): + if key == "pipeline_total": + result[key] = float(val.avg()) + + return result + elif timings is None: + return {} + else: + _log.warning(f"unknown type of timings: {timings}") + return {} def add_prediction(self, record: DatasetRecord) -> DatasetRecordWithPrediction: """ diff --git a/tests/test_dataset_builder.py b/tests/test_dataset_builder.py index 5f105248..6ba121ec 100644 --- a/tests/test_dataset_builder.py +++ b/tests/test_dataset_builder.py @@ -225,33 +225,33 @@ def test_run_omnidocbench_e2e(): ) -# @pytest.mark.dependency( -# depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], -# scope="session", -# ) -# def test_run_dpbench_tables(): -# target_path = Path(f"./scratch/{BenchMarkNames.DPBENCH.value}/") -# tableformer_provider = TableFormerPredictionProvider(do_visualization=True) -# -# tableformer_provider.create_prediction_dataset( -# name="DPBench tables eval", -# gt_dataset_dir=target_path / "gt_dataset", -# target_dataset_dir=target_path / "eval_dataset_tables", -# ) -# -# evaluate( -# modality=EvaluationModality.TABLE_STRUCTURE, -# benchmark=BenchMarkNames.DPBENCH, -# idir=target_path / "eval_dataset_tables", -# odir=target_path / "evaluations" / EvaluationModality.TABLE_STRUCTURE.value, -# ) -# -# visualize( -# modality=EvaluationModality.TABLE_STRUCTURE, -# benchmark=BenchMarkNames.DPBENCH, -# idir=target_path / "eval_dataset_tables", -# odir=target_path / "evaluations" / EvaluationModality.TABLE_STRUCTURE.value, -# ) +@pytest.mark.dependency( + depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], + scope="session", +) +def test_run_dpbench_tables(): + target_path = Path(f"./scratch/{BenchMarkNames.DPBENCH.value}/") + tableformer_provider = TableFormerPredictionProvider(do_visualization=True) + + tableformer_provider.create_prediction_dataset( + name="DPBench tables eval", + gt_dataset_dir=target_path / "gt_dataset", + target_dataset_dir=target_path / "eval_dataset_tables", + ) + + evaluate( + modality=EvaluationModality.TABLE_STRUCTURE, + benchmark=BenchMarkNames.DPBENCH, + idir=target_path / "eval_dataset_tables", + odir=target_path / "evaluations" / EvaluationModality.TABLE_STRUCTURE.value, + ) + + visualize( + modality=EvaluationModality.TABLE_STRUCTURE, + benchmark=BenchMarkNames.DPBENCH, + idir=target_path / "eval_dataset_tables", + odir=target_path / "evaluations" / EvaluationModality.TABLE_STRUCTURE.value, + ) @pytest.mark.skipif( From 7d614875b5d1211733688b0465e1a9f7c84751df Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 29 Apr 2025 09:23:09 +0200 Subject: [PATCH 11/15] found potential bug in base_prediction_provider (2) Signed-off-by: Peter Staar --- docling_eval/datamodels/dataset_record.py | 2 +- .../prediction_providers/base_prediction_provider.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docling_eval/datamodels/dataset_record.py b/docling_eval/datamodels/dataset_record.py index 930d64d3..ed88bf7c 100644 --- a/docling_eval/datamodels/dataset_record.py +++ b/docling_eval/datamodels/dataset_record.py @@ -173,7 +173,7 @@ class DatasetRecordWithPrediction(DatasetRecord): ) original_prediction: Optional[str] = None prediction_format: PredictionFormats # some enum type - prediction_timings: Dict = Field(alias="prediction_timings", default={}) + prediction_timings: Optional[Dict] = Field(alias="prediction_timings", default=None) predicted_page_images: List[PIL.Image.Image] = Field( alias="PredictionPageImages", default=[] diff --git a/docling_eval/prediction_providers/base_prediction_provider.py b/docling_eval/prediction_providers/base_prediction_provider.py index dbc73209..c2b64751 100644 --- a/docling_eval/prediction_providers/base_prediction_provider.py +++ b/docling_eval/prediction_providers/base_prediction_provider.py @@ -223,7 +223,7 @@ def create_dataset_record_with_prediction( return record - def _prediction_timings(self, timings: Optional[dict]) -> dict: + def _prediction_timings(self, timings: Optional[dict]) -> Optional[dict]: """Get prediction timings.""" if isinstance(timings, dict): @@ -234,10 +234,10 @@ def _prediction_timings(self, timings: Optional[dict]) -> dict: return result elif timings is None: - return {} + return None else: _log.warning(f"unknown type of timings: {timings}") - return {} + return None def add_prediction(self, record: DatasetRecord) -> DatasetRecordWithPrediction: """ From 8f1190098269181e3d271eec38a7c624da7104bd Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 29 Apr 2025 10:23:26 +0200 Subject: [PATCH 12/15] fixed the timings in base-predictor Signed-off-by: Peter Staar --- docling_eval/evaluators/timings_evaluator.py | 1 + .../prediction_providers/base_prediction_provider.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/docling_eval/evaluators/timings_evaluator.py b/docling_eval/evaluators/timings_evaluator.py index e0b74a8e..d59fb514 100644 --- a/docling_eval/evaluators/timings_evaluator.py +++ b/docling_eval/evaluators/timings_evaluator.py @@ -85,6 +85,7 @@ def __call__( rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1 continue + print(data_record.prediction_timings) timings.append(data_record.prediction_timings) if rejected_samples[EvaluationRejectionType.MISMATHCED_DOCUMENT] > 0: diff --git a/docling_eval/prediction_providers/base_prediction_provider.py b/docling_eval/prediction_providers/base_prediction_provider.py index c2b64751..a642593d 100644 --- a/docling_eval/prediction_providers/base_prediction_provider.py +++ b/docling_eval/prediction_providers/base_prediction_provider.py @@ -229,10 +229,18 @@ def _prediction_timings(self, timings: Optional[dict]) -> Optional[dict]: if isinstance(timings, dict): result = {} for key, val in timings.items(): - if key == "pipeline_total": - result[key] = float(val.avg()) + if isinstance(val, ProfilingItem): + result[key] = val.times + + if len(result) == 0: # datasets does not like empty dicts + _log.warning(f"empty timings: {timings}") + return None + + # import json + # print(json.dumps(result, indent=2)) return result + elif timings is None: return None else: From 728e8b3eca1122d61a97b9118a1b60c6e7fc8bb1 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 29 Apr 2025 10:43:19 +0200 Subject: [PATCH 13/15] removed prints and added logging-level for matplotlib Signed-off-by: Peter Staar --- docling_eval/cli/main.py | 11 ++++++- docling_eval/evaluators/timings_evaluator.py | 31 ++++++++++++++++++-- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index af07ca02..c2c4f032 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -88,6 +88,7 @@ logging.getLogger("filelock").setLevel(logging_level) logging.getLogger("urllib3").setLevel(logging_level) logging.getLogger("docling_ibm_models").setLevel(logging_level) +logging.getLogger("matplotlib").setLevel(logging_level) _log = logging.getLogger(__name__) @@ -571,7 +572,15 @@ def visualize( odir, benchmark, modality, - "time_to_solution_", + "time_to_solution_per_doc", + timings_evaluation.timing_per_document_stats, + ) + + log_and_save_stats( + odir, + benchmark, + modality, + "time_to_solution_per_page", timings_evaluation.timing_per_page_stats, ) except Exception as e: diff --git a/docling_eval/evaluators/timings_evaluator.py b/docling_eval/evaluators/timings_evaluator.py index d59fb514..56192564 100644 --- a/docling_eval/evaluators/timings_evaluator.py +++ b/docling_eval/evaluators/timings_evaluator.py @@ -22,6 +22,7 @@ class DatasetTimingsEvaluation(DatasetEvaluation): """Dataset timing evaluation.""" + timing_per_document_stats: DatasetStatistics timing_per_page_stats: DatasetStatistics @@ -85,7 +86,7 @@ def __call__( rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1 continue - print(data_record.prediction_timings) + # print(data_record.prediction_timings) timings.append(data_record.prediction_timings) if rejected_samples[EvaluationRejectionType.MISMATHCED_DOCUMENT] > 0: @@ -95,11 +96,35 @@ def __call__( len(ds_selection), ) + time_per_doc = [] + time_per_page = [] + + for timing in timings: + + if timing is not None: + for key, val in timing.items(): + if key == "pipeline_total": + time_per_doc.extend(val) + + if key == "layout": + _time_per_page = [0.0 for v in val] + for k2, v2 in timing.items(): + if len(v2) == len(_time_per_page): + for i, v in enumerate(v2): + _time_per_page[i] += v + + time_per_page.extend(_time_per_page) + dataset_timings_evaluation = DatasetTimingsEvaluation( + timing_per_document_stats=compute_stats( + time_per_doc, + max_value_is_one=False, + nr_bins=32, + ), timing_per_page_stats=compute_stats( - [_["pipeline_total"] for _ in timings], + time_per_page, max_value_is_one=False, nr_bins=32, - ) + ), ) return dataset_timings_evaluation From 354e0b7f721a318fe1f5023686fda791f3f87d33 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 29 Apr 2025 10:55:29 +0200 Subject: [PATCH 14/15] found bug in stats Signed-off-by: Peter Staar --- docling_eval/cli/main.py | 5 ++++- docling_eval/evaluators/stats.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index c2c4f032..6fcc56c1 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -796,6 +796,7 @@ def create_gt( end_index: Annotated[ int, typer.Option(help="End index (exclusive), -1 for all") ] = -1, + chunk_size: Annotated[int, typer.Option(help="chunk size")] = 80, ): """Create ground truth dataset only.""" gt_dir = output_dir / "gt_dataset" @@ -813,7 +814,7 @@ def create_gt( # Retrieve and save the dataset if dataset_builder.must_retrieve: dataset_builder.retrieve_input_dataset() - dataset_builder.save_to_disk(chunk_size=80) + dataset_builder.save_to_disk(chunk_size=chunk_size) _log.info(f"Ground truth dataset created at {gt_dir}") except ValueError as e: @@ -913,6 +914,7 @@ def create( end_index: Annotated[ int, typer.Option(help="End index (exclusive), -1 for all") ] = -1, + chunk_size: Annotated[int, typer.Option(help="chunk size")] = 80, prediction_provider: Annotated[ Optional[PredictionProviderType], typer.Option(help="Type of prediction provider to use"), @@ -933,6 +935,7 @@ def create( split=split, begin_index=begin_index, end_index=end_index, + chunk_size=chunk_size, ) # Then create evaluation if provider specified diff --git a/docling_eval/evaluators/stats.py b/docling_eval/evaluators/stats.py index d1538547..218bbfb6 100644 --- a/docling_eval/evaluators/stats.py +++ b/docling_eval/evaluators/stats.py @@ -81,7 +81,7 @@ def compute_stats( mean: float = statistics.mean(values) if len(values) > 0 else -1 median: float = statistics.median(values) if len(values) > 0 else -1 - std: float = statistics.stdev(values) if len(values) > 0 else -1 + std: float = statistics.stdev(values) if len(values) > 1 else 0.0 logging.info(f"total: {total}, mean: {mean}, median: {median}, std: {std}") max_value = 1.0 From 80699367e5989c5998875e95c8b0bdba2d002b34 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 29 Apr 2025 10:57:10 +0200 Subject: [PATCH 15/15] updated the logging Signed-off-by: Peter Staar --- docling_eval/dataset_builders/file_dataset_builder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docling_eval/dataset_builders/file_dataset_builder.py b/docling_eval/dataset_builders/file_dataset_builder.py index 111bd19f..6b2e9d59 100644 --- a/docling_eval/dataset_builders/file_dataset_builder.py +++ b/docling_eval/dataset_builders/file_dataset_builder.py @@ -108,7 +108,7 @@ def iterate(self) -> Iterable[DatasetRecord]: # Create the ground truth Document true_doc = DoclingDocument(name=f"{filename}") if mime_type == "application/pdf": - _log.info(f"add_pages_to_true_doc: {filename}") + _log.debug(f"add_pages_to_true_doc: {filename}") true_doc, _ = add_pages_to_true_doc( pdf_path=filename, true_doc=true_doc, image_scale=2.0 ) @@ -127,7 +127,7 @@ def iterate(self) -> Iterable[DatasetRecord]: image=image_ref, ) - _log.info(f"add_pages_to_true_doc: {filename}") + _log.debug(f"add_pages_to_true_doc: {filename}") true_doc.pages[1] = page_item else: raise ValueError(