diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index e1b566f7..6fcc56c1 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -31,6 +31,9 @@ ) from docling_eval.dataset_builders.doclaynet_v1_builder import DocLayNetV1DatasetBuilder from docling_eval.dataset_builders.doclaynet_v2_builder import DocLayNetV2DatasetBuilder +from docling_eval.dataset_builders.doclingdpbench_builder import ( + DoclingDPBenchDatasetBuilder, +) from docling_eval.dataset_builders.docvqa_builder import DocVQADatasetBuilder from docling_eval.dataset_builders.dpbench_builder import DPBenchDatasetBuilder from docling_eval.dataset_builders.file_dataset_builder import FileDatasetBuilder @@ -65,6 +68,10 @@ DatasetTableEvaluation, TableEvaluator, ) +from docling_eval.evaluators.timings_evaluator import ( + DatasetTimingsEvaluation, + TimingsEvaluator, +) from docling_eval.prediction_providers.docling_provider import DoclingPredictionProvider from docling_eval.prediction_providers.file_provider import FilePredictionProvider from docling_eval.prediction_providers.tableformer_provider import ( @@ -72,13 +79,16 @@ ) # Configure logging -logging.getLogger("docling").setLevel(logging.WARNING) -logging.getLogger("PIL").setLevel(logging.WARNING) -logging.getLogger("transformers").setLevel(logging.WARNING) -logging.getLogger("datasets").setLevel(logging.WARNING) -logging.getLogger("filelock").setLevel(logging.WARNING) -logging.getLogger("urllib3").setLevel(logging.WARNING) -logging.getLogger("docling_ibm_models").setLevel(logging.WARNING) +logging_level = logging.WARNING +# logging_level = logging.DEBUG +logging.getLogger("docling").setLevel(logging_level) +logging.getLogger("PIL").setLevel(logging_level) +logging.getLogger("transformers").setLevel(logging_level) +logging.getLogger("datasets").setLevel(logging_level) +logging.getLogger("filelock").setLevel(logging_level) +logging.getLogger("urllib3").setLevel(logging_level) +logging.getLogger("docling_ibm_models").setLevel(logging_level) +logging.getLogger("matplotlib").setLevel(logging_level) _log = logging.getLogger(__name__) @@ -156,6 +166,9 @@ def get_dataset_builder( if benchmark == BenchMarkNames.DPBENCH: return DPBenchDatasetBuilder(**common_params) # type: ignore + elif benchmark == BenchMarkNames.DOCLING_DPBENCH: + return DoclingDPBenchDatasetBuilder(**common_params) # type: ignore + elif benchmark == BenchMarkNames.DOCLAYNETV1: return DocLayNetV1DatasetBuilder(**common_params) # type: ignore @@ -418,6 +431,16 @@ def evaluate( if modality == EvaluationModality.END2END: _log.error("END2END evaluation not supported. 
") + elif modality == EvaluationModality.TIMINGS: + timings_evaluator = TimingsEvaluator() + evaluation = timings_evaluator( # type: ignore + idir, + split=split, + ) + + with open(save_fn, "w") as fd: + json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True) + elif modality == EvaluationModality.LAYOUT: layout_evaluator = LayoutEvaluator() evaluation = layout_evaluator( # type: ignore @@ -538,6 +561,31 @@ def visualize( if modality == EvaluationModality.END2END: _log.error("END2END visualization not supported") + elif modality == EvaluationModality.TIMINGS: + try: + with open(metrics_filename, "r") as fd: + timings_evaluation = DatasetTimingsEvaluation.model_validate_json( + fd.read() + ) + + log_and_save_stats( + odir, + benchmark, + modality, + "time_to_solution_per_doc", + timings_evaluation.timing_per_document_stats, + ) + + log_and_save_stats( + odir, + benchmark, + modality, + "time_to_solution_per_page", + timings_evaluation.timing_per_page_stats, + ) + except Exception as e: + _log.error(f"Error processing timings evaluation: {str(e)}") + elif modality == EvaluationModality.LAYOUT: try: with open(metrics_filename, "r") as fd: @@ -554,6 +602,30 @@ def visualize( layout_evaluation.map_stats, ) + log_and_save_stats( + odir, + benchmark, + modality, + "precision", + layout_evaluation.segmentation_precision_stats, + ) + + log_and_save_stats( + odir, + benchmark, + modality, + "recall", + layout_evaluation.segmentation_recall_stats, + ) + + log_and_save_stats( + odir, + benchmark, + modality, + "f1", + layout_evaluation.segmentation_f1_stats, + ) + # Append to layout statistics, the AP per classes data, headers = layout_evaluation.to_table() content = "\n\n\nAP[0.5:0.05:0.95] per class (reported as %):\n\n" @@ -724,6 +796,7 @@ def create_gt( end_index: Annotated[ int, typer.Option(help="End index (exclusive), -1 for all") ] = -1, + chunk_size: Annotated[int, typer.Option(help="chunk size")] = 80, ): """Create ground truth dataset only.""" gt_dir = output_dir / "gt_dataset" @@ -741,7 +814,7 @@ def create_gt( # Retrieve and save the dataset if dataset_builder.must_retrieve: dataset_builder.retrieve_input_dataset() - dataset_builder.save_to_disk(chunk_size=80) + dataset_builder.save_to_disk(chunk_size=chunk_size) _log.info(f"Ground truth dataset created at {gt_dir}") except ValueError as e: @@ -841,6 +914,7 @@ def create( end_index: Annotated[ int, typer.Option(help="End index (exclusive), -1 for all") ] = -1, + chunk_size: Annotated[int, typer.Option(help="chunk size")] = 80, prediction_provider: Annotated[ Optional[PredictionProviderType], typer.Option(help="Type of prediction provider to use"), @@ -861,6 +935,7 @@ def create( split=split, begin_index=begin_index, end_index=end_index, + chunk_size=chunk_size, ) # Then create evaluation if provider specified diff --git a/docling_eval/datamodels/dataset_record.py b/docling_eval/datamodels/dataset_record.py index 94c9d710..ed88bf7c 100644 --- a/docling_eval/datamodels/dataset_record.py +++ b/docling_eval/datamodels/dataset_record.py @@ -173,6 +173,7 @@ class DatasetRecordWithPrediction(DatasetRecord): ) original_prediction: Optional[str] = None prediction_format: PredictionFormats # some enum type + prediction_timings: Optional[Dict] = Field(alias="prediction_timings", default=None) predicted_page_images: List[PIL.Image.Image] = Field( alias="PredictionPageImages", default=[] @@ -201,6 +202,7 @@ def features(cls): cls.get_field_alias("mime_type"): Value("string"), cls.get_field_alias("modalities"): Sequence(Value("string")), 
cls.get_field_alias("prediction_format"): Value("string"), + cls.get_field_alias("prediction_timings"): Value("string"), } def as_record_dict(self): @@ -208,6 +210,7 @@ def as_record_dict(self): record.update( { self.get_field_alias("prediction_format"): self.prediction_format.value, + self.get_field_alias("prediction_timings"): self.prediction_timings, } ) diff --git a/docling_eval/datamodels/types.py b/docling_eval/datamodels/types.py index 120ee414..04a0fd88 100644 --- a/docling_eval/datamodels/types.py +++ b/docling_eval/datamodels/types.py @@ -47,12 +47,14 @@ class EvaluationModality(str, Enum): OCR = "ocr" KEY_VALUE = "key_value" QUESTION_ANSWERING = "question_answering" + TIMINGS = "timings" class BenchMarkNames(str, Enum): # End-to-End DPBENCH = "DPBench" + DOCLING_DPBENCH = "DoclingDPBench" OMNIDOCBENCH = "OmniDocBench" WORDSCAPE = "WordScape" diff --git a/docling_eval/dataset_builders/doclingdpbench_builder.py b/docling_eval/dataset_builders/doclingdpbench_builder.py new file mode 100644 index 00000000..6f2178bf --- /dev/null +++ b/docling_eval/dataset_builders/doclingdpbench_builder.py @@ -0,0 +1,103 @@ +import json +import logging +import os +from io import BytesIO +from pathlib import Path +from typing import Dict, Iterable, Set + +from datasets import load_dataset +from docling_core.types import DoclingDocument +from docling_core.types.io import DocumentStream +from PIL import Image as PILImage + +from docling_eval.datamodels.dataset_record import DatasetRecord +from docling_eval.dataset_builders.dataset_builder import ( + BaseEvaluationDatasetBuilder, + HFSource, +) +from docling_eval.utils.utils import get_binary, get_binhash + +# Get logger +_log = logging.getLogger(__name__) + + +class DoclingDPBenchDatasetBuilder(BaseEvaluationDatasetBuilder): + """ + DoclingDPBench dataset builder implementing the base dataset builder interface. + + This builder processes the DoclingDPBench dataset, which contains document + understanding benchmarks for various document types. + """ + + def __init__( + self, + target: Path, + split: str = "test", + begin_index: int = 0, + end_index: int = -1, + ): + """ + Initialize the DoclingDPBench dataset builder. + + Args: + target: Path where processed dataset will be saved + split: Dataset split to use + begin_index: Start index for processing (inclusive) + end_index: End index for processing (exclusive), -1 means process all + """ + super().__init__( + name="DoclingDPBench", + dataset_source=HFSource(repo_id="ds4sd/docling-dpbench"), + target=target, + split=split, + begin_index=begin_index, + end_index=end_index, + ) + + self.must_retrieve = True + + def iterate(self) -> Iterable[DatasetRecord]: + """ + Iterate through the dataset and yield DatasetRecord objects. + + Yields: + DatasetRecord objects + """ + if not self.retrieved and self.must_retrieve: + raise RuntimeError( + "You must first retrieve the source dataset. Call retrieve_input_dataset()." + ) + + assert self.dataset_local_path is not None + _log.info(f"dataset_local_path: {self.dataset_local_path}") + + # Login using e.g. 
`huggingface-cli login` to access this dataset + ds = load_dataset("ds4sd/docling-dpbench") + + for idx, _ in enumerate(ds["test"]): + doc_hash = str(get_binhash(_["BinaryDocument"])) + doc = (DoclingDocument.model_validate_json(_["GroundTruthDocument"]),) + + page_images = [ + PILImage.open(BytesIO(__["bytes"])) for __ in _["GroundTruthPageImages"] + ] + pictures = [ + PILImage.open(BytesIO(__["bytes"])) for __ in _["GroundTruthPictures"] + ] + + pdf_stream = DocumentStream( + name=f"ds4sd/docling-dpbench/{idx}", stream=BytesIO(_["BinaryDocument"]) + ) + + # Create dataset record + record = DatasetRecord( + doc_id=str(_["document_id"]), + doc_hash=doc_hash, + ground_truth_doc=doc[0], + ground_truth_pictures=pictures, + ground_truth_page_images=page_images, + original=pdf_stream, + mime_type=_["mimetype"], + ) + + yield record diff --git a/docling_eval/dataset_builders/file_dataset_builder.py b/docling_eval/dataset_builders/file_dataset_builder.py index 111bd19f..6b2e9d59 100644 --- a/docling_eval/dataset_builders/file_dataset_builder.py +++ b/docling_eval/dataset_builders/file_dataset_builder.py @@ -108,7 +108,7 @@ def iterate(self) -> Iterable[DatasetRecord]: # Create the ground truth Document true_doc = DoclingDocument(name=f"{filename}") if mime_type == "application/pdf": - _log.info(f"add_pages_to_true_doc: {filename}") + _log.debug(f"add_pages_to_true_doc: {filename}") true_doc, _ = add_pages_to_true_doc( pdf_path=filename, true_doc=true_doc, image_scale=2.0 ) @@ -127,7 +127,7 @@ def iterate(self) -> Iterable[DatasetRecord]: image=image_ref, ) - _log.info(f"add_pages_to_true_doc: {filename}") + _log.debug(f"add_pages_to_true_doc: {filename}") true_doc.pages[1] = page_item else: raise ValueError( diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index 63646762..31ee0a5e 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -59,6 +59,10 @@ class ImageLayoutEvaluation(UnitEvaluation): avg_weighted_label_matched_iou_90: float avg_weighted_label_matched_iou_95: float + segmentation_precision: float + segmentation_recall: float + segmentation_f1: float + class DatasetLayoutEvaluation(DatasetEvaluation): true_labels: Dict[str, int] @@ -78,6 +82,10 @@ class DatasetLayoutEvaluation(DatasetEvaluation): weighted_map_90_stats: DatasetStatistics weighted_map_95_stats: DatasetStatistics + segmentation_precision_stats: DatasetStatistics + segmentation_recall_stats: DatasetStatistics + segmentation_f1_stats: DatasetStatistics + def to_table(self) -> Tuple[List[List[str]], List[str]]: headers = ["label", "Class mAP[0.5:0.95]"] @@ -137,19 +145,28 @@ def __call__( # Load the dataset split_path = str(ds_path / split / "*.parquet") split_files = glob.glob(split_path) - logging.info("Files: %s", split_files) + logging.info("#-files: %s", len(split_files)) ds = load_dataset("parquet", data_files={split: split_files}) logging.info("Overview of dataset: %s", ds) # Select the split ds_selection: Dataset = ds[split] - true_labels, pred_labels, intersection_labels = self._find_intersecting_labels( - ds_selection + true_labels, pred_labels, intersection_labels, union_labels = ( + self._find_intersecting_labels(ds_selection) ) - intersection_labels_str = "\n" + "\n".join(sorted(intersection_labels)) + true_labels_str = ", ".join(sorted(true_labels)) + logging.info(f"True labels: {true_labels_str}") + + pred_labels_str = ", ".join(sorted(pred_labels)) + logging.info(f"Pred labels: {pred_labels_str}") + + 
intersection_labels_str = ", ".join(sorted(intersection_labels)) logging.info(f"Intersection labels: {intersection_labels_str}") + union_labels_str = ", ".join(sorted(union_labels)) + logging.info(f"Union labels: {union_labels_str}") + doc_ids = [] ground_truths = [] predictions = [] @@ -187,6 +204,9 @@ def __call__( filter_labels=intersection_labels, ) + # logging.info(f"gts: {gts}") + # logging.info(f"preds: {preds}") + if len(gts) > 0: for i in range(len(gts)): doc_ids.append(data[BenchMarkColumns.DOC_ID] + f"-page-{i}") @@ -258,8 +278,19 @@ def __call__( for i, (doc_id, pred, gt) in enumerate( zip(doc_ids, predictions, ground_truths) ): + # logging.info(f"gt: {gt}") + # logging.info(f"pred: {pred}") + + precision, recall, f1 = self._compute_area_level_metrics_for_tensors( + gt_boxes=gt["boxes"], + pred_boxes=pred["boxes"], + page_width=100, + page_height=100, + mask_width=512, + mask_height=512, + ) + # Reset the metric for the next image - # metric.reset() metric = MeanAveragePrecision(iou_type="bbox", class_metrics=True) # Update with single image @@ -293,6 +324,10 @@ def __call__( weighted_map_90_values.append(average_iou_90) weighted_map_95_values.append(average_iou_95) + logging.info( + f"doc: {doc_id}\tprecision: {precision:.2f}, recall: {recall:.2f}, f1: {f1:.2f}, map_50: {map_50:.2f}" + ) + image_evaluation = ImageLayoutEvaluation( name=doc_id, value=average_iou_50, @@ -303,6 +338,9 @@ def __call__( avg_weighted_label_matched_iou_75=average_iou_75, avg_weighted_label_matched_iou_90=average_iou_90, avg_weighted_label_matched_iou_95=average_iou_95, + segmentation_precision=precision, + segmentation_recall=recall, + segmentation_f1=f1, ) evaluations_per_image.append(image_evaluation) if self._intermediate_evaluations_path: @@ -326,6 +364,15 @@ def __call__( weighted_map_75_stats=compute_stats(weighted_map_75_values), weighted_map_90_stats=compute_stats(weighted_map_90_values), weighted_map_95_stats=compute_stats(weighted_map_95_values), + segmentation_precision_stats=compute_stats( + [_.segmentation_precision for _ in evaluations_per_image] + ), + segmentation_recall_stats=compute_stats( + [_.segmentation_recall for _ in evaluations_per_image] + ), + segmentation_f1_stats=compute_stats( + [_.segmentation_f1 for _ in evaluations_per_image] + ), true_labels=true_labels, pred_labels=pred_labels, intersecting_labels=[_.value for _ in intersection_labels], @@ -449,7 +496,7 @@ def _compute_average_iou_with_labels_across_iou( def _find_intersecting_labels( self, ds: Dataset, - ) -> tuple[dict[str, int], dict[str, int], list[DocItemLabel]]: + ) -> tuple[dict[str, int], dict[str, int], list[DocItemLabel], list[DocItemLabel]]: r""" Compute counters per labels for the groundtruth, prediciton and their intersections @@ -502,11 +549,18 @@ def _find_intersecting_labels( """ intersection_labels: List[DocItemLabel] = [] + union_labels: List[DocItemLabel] = [] for label, count in true_labels.items(): + union_labels.append(DocItemLabel(label)) + if label in pred_labels: intersection_labels.append(DocItemLabel(label)) - return true_labels, pred_labels, intersection_labels + for label, count in pred_labels.items(): + if label not in true_labels: + union_labels.append(DocItemLabel(label)) + + return true_labels, pred_labels, intersection_labels, union_labels def _extract_layout_data( self, @@ -572,13 +626,10 @@ def _extract_layout_data( for item in items: for prov in item.prov: bbox = prov.bbox.to_top_left_origin(page_height=page_height) - # true_tl_bboxes.append(copy.deepcopy(bbox)) bbox = 
bbox.normalized(page_size)
                 bbox = bbox.scaled(100.0)
 
-                # logging.info(f"ground-truth {page_no}: {page_width, page_height} -> {item.label}, {bbox.coord_origin}: [{bbox.l}, {bbox.t}, {bbox.r}, {bbox.b}]")
-
                 bboxes.append([bbox.l, bbox.t, bbox.r, bbox.b])
                 labels.append(filter_labels.index(self.label_mapping[item.label]))  # type: ignore
@@ -635,3 +686,94 @@
         # print(pred_tl_bboxes_str)
 
         return ground_truths, predictions
+
+    def _compute_area_level_metrics_for_tensors(
+        self,
+        gt_boxes: torch.Tensor,
+        pred_boxes: torch.Tensor,
+        page_width: int,
+        page_height: int,
+        mask_width: int = 512,
+        mask_height: int = 512,
+    ) -> Tuple[float, float, float]:
+        """
+        Compute area-level precision, recall, and F1 score for tensor format boxes.
+        Handles overlapping boxes by using binary masks at the specified resolution.
+
+        Args:
+            gt_boxes: Ground truth boxes as tensor of shape (N, 4) with [x1, y1, x2, y2] format
+            pred_boxes: Predicted boxes as tensor of shape (M, 4) with [x1, y1, x2, y2] format
+            page_width: Width of the original page
+            page_height: Height of the original page
+            mask_width: Width of the mask to use for computation (default: 512)
+            mask_height: Height of the mask to use for computation (default: 512)
+
+        Returns:
+            Tuple of (precision, recall, f1) scores
+        """
+        if gt_boxes.shape[0] == 0:
+            precision = 1.0 if pred_boxes.shape[0] == 0 else 0.0
+            recall = 1.0
+            f1 = 1.0 if pred_boxes.shape[0] == 0 else 0.0
+            return precision, recall, f1
+
+        if pred_boxes.shape[0] == 0:
+            precision = 1.0
+            recall = 0.0
+            f1 = 0.0
+            return precision, recall, f1
+
+        # Calculate scaling factors (ensure float division)
+        x_scale = float(mask_width) / float(page_width)
+        y_scale = float(mask_height) / float(page_height)
+
+        # Create empty masks
+        gt_mask = torch.zeros((mask_height, mask_width), dtype=torch.bool, device="cpu")
+        pred_mask = torch.zeros(
+            (mask_height, mask_width), dtype=torch.bool, device="cpu"
+        )
+
+        # Fill ground truth mask
+        for i in range(gt_boxes.shape[0]):
+            x1, y1, x2, y2 = gt_boxes[i].tolist()
+
+            # Scale coordinates to mask space
+            x1, y1 = max(0, int(x1 * x_scale)), max(0, int(y1 * y_scale))
+            x2, y2 = min(mask_width, int(x2 * x_scale)), min(
+                mask_height, int(y2 * y_scale)
+            )
+
+            if x2 > x1 and y2 > y1:
+                gt_mask[y1:y2, x1:x2] = True
+
+        # Fill prediction mask
+        for i in range(pred_boxes.shape[0]):
+            x1, y1, x2, y2 = pred_boxes[i].tolist()
+
+            # Scale coordinates to mask space
+            x1, y1 = max(0, int(x1 * x_scale)), max(0, int(y1 * y_scale))
+            x2, y2 = min(mask_width, int(x2 * x_scale)), min(
+                mask_height, int(y2 * y_scale)
+            )
+
+            if x2 > x1 and y2 > y1:
+                pred_mask[y1:y2, x1:x2] = True
+
+        # Calculate areas (accounting for overlaps)
+        total_gt_area = torch.sum(gt_mask).item()
+        total_pred_area = torch.sum(pred_mask).item()
+
+        # Calculate intersection (logical AND of masks)
+        intersection_mask = torch.logical_and(gt_mask, pred_mask)
+        total_intersection = torch.sum(intersection_mask).item()
+
+        # Calculate metrics
+        precision = total_intersection / total_pred_area if total_pred_area > 0 else 0.0
+        recall = total_intersection / total_gt_area if total_gt_area > 0 else 0.0
+
+        # Calculate F1 score
+        f1 = 0.0
+        if precision + recall > 0:
+            f1 = 2 * (precision * recall) / (precision + recall)
+
+        return precision, recall, f1
diff --git a/docling_eval/evaluators/stats.py b/docling_eval/evaluators/stats.py
index ecd898a4..218bbfb6 100644
--- a/docling_eval/evaluators/stats.py
+++ b/docling_eval/evaluators/stats.py
@@ -74,16 +74,22 @@ def
save_histogram(self, figname: Path, name: str = ""): plt.savefig(figname) -def compute_stats(values: List[float]) -> DatasetStatistics: +def compute_stats( + values: List[float], max_value_is_one: bool = True, nr_bins: int = 20 +) -> DatasetStatistics: total: int = len(values) mean: float = statistics.mean(values) if len(values) > 0 else -1 median: float = statistics.median(values) if len(values) > 0 else -1 - std: float = statistics.stdev(values) if len(values) > 0 else -1 + std: float = statistics.stdev(values) if len(values) > 1 else 0.0 logging.info(f"total: {total}, mean: {mean}, median: {median}, std: {std}") - # Compute the histogram with 20 bins between 0 and 1 - hist, bins = np.histogram(values, bins=20, range=(0, 1)) + max_value = 1.0 + if not max_value_is_one and len(values) > 0: + max_value = max(values) + + # Compute the histogram + hist, bins = np.histogram(values, bins=nr_bins, range=(0, max_value)) logging.info(f"#-hist: {len(hist)}, #-bins: {len(bins)}") return DatasetStatistics( diff --git a/docling_eval/evaluators/timings_evaluator.py b/docling_eval/evaluators/timings_evaluator.py new file mode 100644 index 00000000..56192564 --- /dev/null +++ b/docling_eval/evaluators/timings_evaluator.py @@ -0,0 +1,130 @@ +import glob +import logging +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +from datasets import Dataset, load_dataset +from tqdm import tqdm + +from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction +from docling_eval.datamodels.types import BenchMarkColumns, PredictionFormats +from docling_eval.evaluators.base_evaluator import ( + BaseEvaluator, + DatasetEvaluation, + EvaluationRejectionType, + UnitEvaluation, +) +from docling_eval.evaluators.stats import DatasetStatistics, compute_stats + +_log = logging.getLogger(__name__) + + +class DatasetTimingsEvaluation(DatasetEvaluation): + """Dataset timing evaluation.""" + + timing_per_document_stats: DatasetStatistics + timing_per_page_stats: DatasetStatistics + + +class TimingsEvaluator(BaseEvaluator): + """Timings evaluator.""" + + def __init__( + self, + intermediate_evaluations_path: Optional[Path] = None, + prediction_sources: List[PredictionFormats] = [], + ): + supported_prediction_formats: List[PredictionFormats] = [ + PredictionFormats.DOCLING_DOCUMENT, + ] + + if not prediction_sources: + prediction_sources = supported_prediction_formats + super().__init__( + intermediate_evaluations_path=intermediate_evaluations_path, + prediction_sources=prediction_sources, + supported_prediction_formats=supported_prediction_formats, + ) + + def __call__( + self, + ds_path: Path, + split: str = "test", + ) -> DatasetTimingsEvaluation: + logging.info("Loading the split '%s' from: '%s'", split, ds_path) + + rejected_samples: Dict[EvaluationRejectionType, int] = { + EvaluationRejectionType.INVALID_CONVERSION_STATUS: 0, + EvaluationRejectionType.MISSING_PREDICTION: 0, + EvaluationRejectionType.MISMATHCED_DOCUMENT: 0, + } + + # Load the dataset + split_path = str(ds_path / split / "*.parquet") + split_files = glob.glob(split_path) + logging.info("#-files: %s", len(split_files)) + ds = load_dataset("parquet", data_files={split: split_files}) + logging.info("Overview of dataset: %s", ds) + + # Select the split + ds_selection: Dataset = ds[split] + + timings = [] + for i, data in tqdm( + enumerate(ds_selection), + desc="Timings evaluations", + ncols=120, + total=len(ds_selection), + ): + data_record = DatasetRecordWithPrediction.model_validate(data) + + doc_id = data_record.doc_id 
+ if data_record.status not in self._accepted_status: + _log.error( + "Skipping record without successfull conversion status: %s", doc_id + ) + rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1 + continue + + # print(data_record.prediction_timings) + timings.append(data_record.prediction_timings) + + if rejected_samples[EvaluationRejectionType.MISMATHCED_DOCUMENT] > 0: + logging.error( + "Total mismatched/skipped documents: %s over %s", + rejected_samples[EvaluationRejectionType.MISMATHCED_DOCUMENT], + len(ds_selection), + ) + + time_per_doc = [] + time_per_page = [] + + for timing in timings: + + if timing is not None: + for key, val in timing.items(): + if key == "pipeline_total": + time_per_doc.extend(val) + + if key == "layout": + _time_per_page = [0.0 for v in val] + for k2, v2 in timing.items(): + if len(v2) == len(_time_per_page): + for i, v in enumerate(v2): + _time_per_page[i] += v + + time_per_page.extend(_time_per_page) + + dataset_timings_evaluation = DatasetTimingsEvaluation( + timing_per_document_stats=compute_stats( + time_per_doc, + max_value_is_one=False, + nr_bins=32, + ), + timing_per_page_stats=compute_stats( + time_per_page, + max_value_is_one=False, + nr_bins=32, + ), + ) + return dataset_timings_evaluation diff --git a/docling_eval/prediction_providers/base_prediction_provider.py b/docling_eval/prediction_providers/base_prediction_provider.py index d280ed98..a642593d 100644 --- a/docling_eval/prediction_providers/base_prediction_provider.py +++ b/docling_eval/prediction_providers/base_prediction_provider.py @@ -8,6 +8,7 @@ from datasets import load_dataset from docling.datamodel.base_models import ConversionStatus +from docling.utils.profiling import ProfilingItem from docling.utils.utils import chunkify from docling_core.types.doc import DocItemLabel from docling_core.types.doc.document import DoclingDocument @@ -31,7 +32,6 @@ ) from docling_eval.visualisation.visualisations import save_comparison_html_with_clusters -# Get logger _log = logging.getLogger(__name__) # Default HTML export labels for visualization @@ -186,6 +186,7 @@ def create_dataset_record_with_prediction( record: DatasetRecord, predicted_doc: Optional[DoclingDocument] = None, original_prediction: Optional[str] = None, + timings: Optional[dict] = None, ) -> DatasetRecordWithPrediction: """ Create a dataset record with prediction from an input record. 
@@ -215,9 +216,36 @@ def create_dataset_record_with_prediction( "predicted_pictures": pred_pictures, "original_prediction": original_prediction, "prediction_format": self.prediction_format, + "prediction_timings": self._prediction_timings(timings), "predictor_info": self.info(), } - return DatasetRecordWithPrediction.model_validate(data) + record = DatasetRecordWithPrediction.model_validate(data) + + return record + + def _prediction_timings(self, timings: Optional[dict]) -> Optional[dict]: + """Get prediction timings.""" + + if isinstance(timings, dict): + result = {} + for key, val in timings.items(): + if isinstance(val, ProfilingItem): + result[key] = val.times + + if len(result) == 0: # datasets does not like empty dicts + _log.warning(f"empty timings: {timings}") + return None + + # import json + # print(json.dumps(result, indent=2)) + + return result + + elif timings is None: + return None + else: + _log.warning(f"unknown type of timings: {timings}") + return None def add_prediction(self, record: DatasetRecord) -> DatasetRecordWithPrediction: """ diff --git a/docling_eval/prediction_providers/docling_provider.py b/docling_eval/prediction_providers/docling_provider.py index b86b619b..2a4a2b8c 100644 --- a/docling_eval/prediction_providers/docling_provider.py +++ b/docling_eval/prediction_providers/docling_provider.py @@ -1,8 +1,10 @@ import copy +import logging import platform from typing import Dict, List, Optional, Set from docling.datamodel.base_models import InputFormat +from docling.datamodel.settings import settings from docling.document_converter import DocumentConverter, FormatOption from docling_core.types.doc import DocItemLabel from pydantic import TypeAdapter @@ -21,6 +23,8 @@ ) from docling_eval.utils.utils import docling_version, get_package_version +_log = logging.getLogger(__name__) + class DoclingPredictionProvider(BasePredictionProvider): """ @@ -47,6 +51,7 @@ def __init__( ignore_missing_predictions: bool = True, true_labels: Optional[Set[DocItemLabel]] = None, pred_labels: Optional[Set[DocItemLabel]] = None, + profile_pipeline_timings: bool = True, ): """ Initialize the Docling prediction provider. 
@@ -65,6 +70,11 @@ def __init__(
             true_labels=true_labels,
             pred_labels=pred_labels,
         )
+
+        # Enable the profiling to measure the time spent
+        settings.debug.profile_pipeline_timings = profile_pipeline_timings
+        _log.info(f"profile_pipeline_timings: {profile_pipeline_timings}")
+
         self.doc_converter = DocumentConverter(format_options=format_options)
 
     @property
@@ -95,9 +105,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
 
         # Create prediction record
         pred_record = self.create_dataset_record_with_prediction(
-            record,
-            res.document,
-            None,
+            record, res.document, None, res.timings
         )
         pred_record.status = res.status
 
diff --git a/docling_eval/visualisation/constants.py b/docling_eval/visualisation/constants.py
index 823c258a..59c12c5a 100644
--- a/docling_eval/visualisation/constants.py
+++ b/docling_eval/visualisation/constants.py
@@ -109,6 +109,99 @@
 """
 
+HTML_DEFAULT_HEAD_FOR_COMP_v2: str = r"""
+
+
+
+Powered by Docling
+
+
+"""
+
 HTML_COMPARISON_PAGE_v1 = """
 
diff --git a/docling_eval/visualisation/visualisations.py b/docling_eval/visualisation/visualisations.py
index f782fca6..c145922a 100644
--- a/docling_eval/visualisation/visualisations.py
+++ b/docling_eval/visualisation/visualisations.py
@@ -26,6 +26,7 @@
     HTML_COMPARISON_PAGE_WITH_CLUSTERS,
     HTML_DEFAULT_HEAD_FOR_COMP,
     HTML_INSPECTION,
+    HTML_DEFAULT_HEAD_FOR_COMP_v2,
 )
 
@@ -136,7 +137,7 @@ def get_missing_pageimg(width=800, height=1100, text="MISSING PAGE"):
     html_parts = [
         "",
         "",
-        HTML_DEFAULT_HEAD_FOR_COMP,
+        HTML_DEFAULT_HEAD_FOR_COMP_v2,
         "",
     ]
 
diff --git a/docs/faq.md b/docs/faq.md
new file mode 100644
index 00000000..9d2f5c67
--- /dev/null
+++ b/docs/faq.md
@@ -0,0 +1,9 @@
+# Frequently Asked Questions
+
+## docling-eval seems stuck
+
+If docling-eval appears to be stuck, the Hugging Face Hub may not be responding; set `HF_HUB_OFFLINE=1` so that the locally cached dataset is used, e.g.
+
+```sh
+HF_HUB_OFFLINE=1 caffeinate poetry run docling_eval create-eval --benchmark DocLayNetV1 --gt-dir ./benchmarks/DocLayNetV1/gt_dataset --output-dir ./benchmarks/DocLayNetV1/smoldocling_v4 --prediction-provider SmolDocling --end-index 256
+```
\ No newline at end of file
diff --git a/poetry.lock b/poetry.lock
index 5be3a7a7..d6f8f62e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1482,14 +1482,14 @@ files = [
 [[package]]
 name = "docling"
-version = "2.30.0"
+version = "2.31.0"
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
optional = false python-versions = "<4.0,>=3.9" groups = ["main"] files = [ - {file = "docling-2.30.0-py3-none-any.whl", hash = "sha256:88bc3f988116ea100ef1a025b623c94ae0010c11bc183f4773fb852a627d1d5d"}, - {file = "docling-2.30.0.tar.gz", hash = "sha256:6d31293d84ac9967101e394b7fa1b75be951775c1cb873d18b505e82c8d23c83"}, + {file = "docling-2.31.0-py3-none-any.whl", hash = "sha256:0a23c709aba5d3aa8f193e2211a7d3084af2b451f1c69deafdf81591179de779"}, + {file = "docling-2.31.0.tar.gz", hash = "sha256:1115f4cda7e67c70a6a61395aed65133f4e85e86914bdae5153c10a5ed329a71"}, ] [package.dependencies] @@ -1519,8 +1519,8 @@ rtree = ">=1.3.0,<2.0.0" scipy = {version = ">=1.6.0,<2.0.0", markers = "python_version >= \"3.10\""} tqdm = ">=4.65.0,<5.0.0" transformers = [ - {version = ">=4.46.0,<5.0.0", optional = true, markers = "(sys_platform != \"darwin\" or platform_machine != \"x86_64\") and extra == \"vlm\""}, {version = ">=4.42.0,<4.43.0", optional = true, markers = "sys_platform == \"darwin\" and platform_machine == \"x86_64\" and extra == \"vlm\""}, + {version = ">=4.46.0,<5.0.0", optional = true, markers = "(sys_platform != \"darwin\" or platform_machine != \"x86_64\") and extra == \"vlm\""}, ] typer = ">=0.12.5,<0.16.0" @@ -1577,8 +1577,8 @@ docling-core = ">=2.19.0,<3.0.0" huggingface_hub = ">=0.23,<1" jsonlines = ">=3.1.0,<4.0.0" numpy = [ - {version = ">=1.24.4,<3.0.0", markers = "sys_platform != \"darwin\" or platform_machine != \"x86_64\""}, {version = ">=1.24.4,<2.0.0", markers = "sys_platform == \"darwin\" and platform_machine == \"x86_64\""}, + {version = ">=1.24.4,<3.0.0", markers = "sys_platform != \"darwin\" or platform_machine != \"x86_64\""}, ] opencv-python-headless = ">=4.6.0.66,<5.0.0.0" Pillow = ">=10.0.0,<12.0.0" @@ -1588,8 +1588,8 @@ torch = ">=2.2.2,<3.0.0" torchvision = ">=0,<1" tqdm = ">=4.64.0,<5.0.0" transformers = [ - {version = ">=4.42.0,<5.0.0", markers = "sys_platform != \"darwin\" or platform_machine != \"x86_64\""}, {version = ">=4.42.0,<4.43.0", markers = "sys_platform == \"darwin\" and platform_machine == \"x86_64\""}, + {version = ">=4.42.0,<5.0.0", markers = "sys_platform != \"darwin\" or platform_machine != \"x86_64\""}, ] [[package]] @@ -4070,7 +4070,7 @@ description = "CUBLAS native runtime libraries" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3"}, {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b"}, @@ -4084,7 +4084,7 @@ description = "CUDA profiling tools runtime libs." 
optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a"}, {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb"}, @@ -4098,7 +4098,7 @@ description = "NVRTC native runtime libraries" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198"}, {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338"}, @@ -4112,7 +4112,7 @@ description = "CUDA Runtime native Libraries" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3"}, {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5"}, @@ -4126,7 +4126,7 @@ description = "cuDNN runtime libraries" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f"}, {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-win_amd64.whl", hash = "sha256:6278562929433d68365a07a4a1546c237ba2849852c0d4b2262a486e805b977a"}, @@ -4142,7 +4142,7 @@ description = "CUFFT native runtime libraries" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399"}, {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9"}, @@ -4159,7 +4159,7 @@ description = "CURAND native runtime libraries" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9"}, {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = 
"sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b"}, @@ -4173,7 +4173,7 @@ description = "CUDA solver native runtime libraries" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e"}, {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260"}, @@ -4192,7 +4192,7 @@ description = "CUSPARSE native runtime libraries" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3"}, {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1"}, @@ -4209,7 +4209,7 @@ description = "NVIDIA cuSPARSELt" optional = false python-versions = "*" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:067a7f6d03ea0d4841c85f0c6f1991c5dda98211f6302cb83a4ab234ee95bef8"}, {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9"}, @@ -4223,7 +4223,7 @@ description = "NVIDIA Collective Communication Library (NCCL) Runtime" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8579076d30a8c24988834445f8d633c697d42397e92ffc3f63fa26766d25e0a0"}, ] @@ -4235,7 +4235,7 @@ description = "Nvidia JIT LTO Library" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83"}, {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57"}, @@ -4249,7 +4249,7 @@ description = "NVIDIA Tools Extension" optional = false python-versions = ">=3" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3"}, {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = 
"sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a"}, @@ -7330,7 +7330,7 @@ description = "A language and compiler for custom Deep Learning operations" optional = false python-versions = "*" groups = ["main"] -markers = "platform_machine == \"x86_64\" and platform_system == \"Linux\"" +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3e54983cd51875855da7c68ec05c05cf8bb08df361b1d5b69e05e40b0c9bd62"}, {file = "triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8009a1fb093ee8546495e96731336a33fb8856a38e45bb4ab6affd6dbc3ba220"}, @@ -7370,14 +7370,14 @@ urllib3 = ">=1.26.0" [[package]] name = "typer" -version = "0.15.2" +version = "0.12.5" description = "Typer, build great CLIs. Easy to code. Based on Python type hints." optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "typer-0.15.2-py3-none-any.whl", hash = "sha256:46a499c6107d645a9c13f7ee46c5d5096cae6f5fc57dd11eccbbb9ae3e44ddfc"}, - {file = "typer-0.15.2.tar.gz", hash = "sha256:ab2fab47533a813c49fe1f16b1a370fd5819099c00b119e0633df65f22144ba5"}, + {file = "typer-0.12.5-py3-none-any.whl", hash = "sha256:62fe4e471711b147e3365034133904df3e235698399bc4de2b36c8579298d52b"}, + {file = "typer-0.12.5.tar.gz", hash = "sha256:f592f089bedcc8ec1b974125d64851029c3b1af145f04aca64d69410f0c9b722"}, ] [package.dependencies] @@ -7980,4 +7980,4 @@ hyperscalers = ["azure-ai-formrecognizer", "azure-common", "azure-core", "boto3" [metadata] lock-version = "2.1" python-versions = "^3.10" -content-hash = "ad48608fca439c925fd79021a7323b74448f506595a7c79f652a07e9538dbd13" +content-hash = "0567ed7bf16453af1997ddb9ba9c63b6edc80c59fd7b752457f1f2b1f5f4cf77" diff --git a/pyproject.toml b/pyproject.toml index fc672ad8..2007eeeb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ lxml = "^5.3.0" datasets = "^3.2.0" apted = "^1.0.3" Distance = "^0.1.3" -docling = {extras = ["vlm"], version = "^2.28.0"} +docling = {extras = ["vlm"], version = "^2.31.0"} matplotlib = "^3.10.0" torch = "^2.5.1" torchmetrics = "^1.6.0"