diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py
index e1b566f7..6fcc56c1 100644
--- a/docling_eval/cli/main.py
+++ b/docling_eval/cli/main.py
@@ -31,6 +31,9 @@
 )
 from docling_eval.dataset_builders.doclaynet_v1_builder import DocLayNetV1DatasetBuilder
 from docling_eval.dataset_builders.doclaynet_v2_builder import DocLayNetV2DatasetBuilder
+from docling_eval.dataset_builders.doclingdpbench_builder import (
+    DoclingDPBenchDatasetBuilder,
+)
 from docling_eval.dataset_builders.docvqa_builder import DocVQADatasetBuilder
 from docling_eval.dataset_builders.dpbench_builder import DPBenchDatasetBuilder
 from docling_eval.dataset_builders.file_dataset_builder import FileDatasetBuilder
@@ -65,6 +68,10 @@
     DatasetTableEvaluation,
     TableEvaluator,
 )
+from docling_eval.evaluators.timings_evaluator import (
+    DatasetTimingsEvaluation,
+    TimingsEvaluator,
+)
 from docling_eval.prediction_providers.docling_provider import DoclingPredictionProvider
 from docling_eval.prediction_providers.file_provider import FilePredictionProvider
 from docling_eval.prediction_providers.tableformer_provider import (
@@ -72,13 +79,16 @@
 )

 # Configure logging
-logging.getLogger("docling").setLevel(logging.WARNING)
-logging.getLogger("PIL").setLevel(logging.WARNING)
-logging.getLogger("transformers").setLevel(logging.WARNING)
-logging.getLogger("datasets").setLevel(logging.WARNING)
-logging.getLogger("filelock").setLevel(logging.WARNING)
-logging.getLogger("urllib3").setLevel(logging.WARNING)
-logging.getLogger("docling_ibm_models").setLevel(logging.WARNING)
+logging_level = logging.WARNING
+# logging_level = logging.DEBUG
+logging.getLogger("docling").setLevel(logging_level)
+logging.getLogger("PIL").setLevel(logging_level)
+logging.getLogger("transformers").setLevel(logging_level)
+logging.getLogger("datasets").setLevel(logging_level)
+logging.getLogger("filelock").setLevel(logging_level)
+logging.getLogger("urllib3").setLevel(logging_level)
+logging.getLogger("docling_ibm_models").setLevel(logging_level)
+logging.getLogger("matplotlib").setLevel(logging_level)

 _log = logging.getLogger(__name__)

@@ -156,6 +166,9 @@ def get_dataset_builder(
     if benchmark == BenchMarkNames.DPBENCH:
         return DPBenchDatasetBuilder(**common_params)  # type: ignore

+    elif benchmark == BenchMarkNames.DOCLING_DPBENCH:
+        return DoclingDPBenchDatasetBuilder(**common_params)  # type: ignore
+
     elif benchmark == BenchMarkNames.DOCLAYNETV1:
         return DocLayNetV1DatasetBuilder(**common_params)  # type: ignore

@@ -418,6 +431,16 @@ def evaluate(
     if modality == EvaluationModality.END2END:
         _log.error("END2END evaluation not supported.")

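+    # Aggregate the per-document and per-page conversion timings recorded by the prediction provider.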
") + elif modality == EvaluationModality.TIMINGS: + timings_evaluator = TimingsEvaluator() + evaluation = timings_evaluator( # type: ignore + idir, + split=split, + ) + + with open(save_fn, "w") as fd: + json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True) + elif modality == EvaluationModality.LAYOUT: layout_evaluator = LayoutEvaluator() evaluation = layout_evaluator( # type: ignore @@ -538,6 +561,31 @@ def visualize( if modality == EvaluationModality.END2END: _log.error("END2END visualization not supported") + elif modality == EvaluationModality.TIMINGS: + try: + with open(metrics_filename, "r") as fd: + timings_evaluation = DatasetTimingsEvaluation.model_validate_json( + fd.read() + ) + + log_and_save_stats( + odir, + benchmark, + modality, + "time_to_solution_per_doc", + timings_evaluation.timing_per_document_stats, + ) + + log_and_save_stats( + odir, + benchmark, + modality, + "time_to_solution_per_page", + timings_evaluation.timing_per_page_stats, + ) + except Exception as e: + _log.error(f"Error processing timings evaluation: {str(e)}") + elif modality == EvaluationModality.LAYOUT: try: with open(metrics_filename, "r") as fd: @@ -554,6 +602,30 @@ def visualize( layout_evaluation.map_stats, ) + log_and_save_stats( + odir, + benchmark, + modality, + "precision", + layout_evaluation.segmentation_precision_stats, + ) + + log_and_save_stats( + odir, + benchmark, + modality, + "recall", + layout_evaluation.segmentation_recall_stats, + ) + + log_and_save_stats( + odir, + benchmark, + modality, + "f1", + layout_evaluation.segmentation_f1_stats, + ) + # Append to layout statistics, the AP per classes data, headers = layout_evaluation.to_table() content = "\n\n\nAP[0.5:0.05:0.95] per class (reported as %):\n\n" @@ -724,6 +796,7 @@ def create_gt( end_index: Annotated[ int, typer.Option(help="End index (exclusive), -1 for all") ] = -1, + chunk_size: Annotated[int, typer.Option(help="chunk size")] = 80, ): """Create ground truth dataset only.""" gt_dir = output_dir / "gt_dataset" @@ -741,7 +814,7 @@ def create_gt( # Retrieve and save the dataset if dataset_builder.must_retrieve: dataset_builder.retrieve_input_dataset() - dataset_builder.save_to_disk(chunk_size=80) + dataset_builder.save_to_disk(chunk_size=chunk_size) _log.info(f"Ground truth dataset created at {gt_dir}") except ValueError as e: @@ -841,6 +914,7 @@ def create( end_index: Annotated[ int, typer.Option(help="End index (exclusive), -1 for all") ] = -1, + chunk_size: Annotated[int, typer.Option(help="chunk size")] = 80, prediction_provider: Annotated[ Optional[PredictionProviderType], typer.Option(help="Type of prediction provider to use"), @@ -861,6 +935,7 @@ def create( split=split, begin_index=begin_index, end_index=end_index, + chunk_size=chunk_size, ) # Then create evaluation if provider specified diff --git a/docling_eval/datamodels/dataset_record.py b/docling_eval/datamodels/dataset_record.py index 94c9d710..ed88bf7c 100644 --- a/docling_eval/datamodels/dataset_record.py +++ b/docling_eval/datamodels/dataset_record.py @@ -173,6 +173,7 @@ class DatasetRecordWithPrediction(DatasetRecord): ) original_prediction: Optional[str] = None prediction_format: PredictionFormats # some enum type + prediction_timings: Optional[Dict] = Field(alias="prediction_timings", default=None) predicted_page_images: List[PIL.Image.Image] = Field( alias="PredictionPageImages", default=[] @@ -201,6 +202,7 @@ def features(cls): cls.get_field_alias("mime_type"): Value("string"), cls.get_field_alias("modalities"): Sequence(Value("string")), 
cls.get_field_alias("prediction_format"): Value("string"), + cls.get_field_alias("prediction_timings"): Value("string"), } def as_record_dict(self): @@ -208,6 +210,7 @@ def as_record_dict(self): record.update( { self.get_field_alias("prediction_format"): self.prediction_format.value, + self.get_field_alias("prediction_timings"): self.prediction_timings, } ) diff --git a/docling_eval/datamodels/types.py b/docling_eval/datamodels/types.py index 120ee414..04a0fd88 100644 --- a/docling_eval/datamodels/types.py +++ b/docling_eval/datamodels/types.py @@ -47,12 +47,14 @@ class EvaluationModality(str, Enum): OCR = "ocr" KEY_VALUE = "key_value" QUESTION_ANSWERING = "question_answering" + TIMINGS = "timings" class BenchMarkNames(str, Enum): # End-to-End DPBENCH = "DPBench" + DOCLING_DPBENCH = "DoclingDPBench" OMNIDOCBENCH = "OmniDocBench" WORDSCAPE = "WordScape" diff --git a/docling_eval/dataset_builders/doclingdpbench_builder.py b/docling_eval/dataset_builders/doclingdpbench_builder.py new file mode 100644 index 00000000..6f2178bf --- /dev/null +++ b/docling_eval/dataset_builders/doclingdpbench_builder.py @@ -0,0 +1,103 @@ +import json +import logging +import os +from io import BytesIO +from pathlib import Path +from typing import Dict, Iterable, Set + +from datasets import load_dataset +from docling_core.types import DoclingDocument +from docling_core.types.io import DocumentStream +from PIL import Image as PILImage + +from docling_eval.datamodels.dataset_record import DatasetRecord +from docling_eval.dataset_builders.dataset_builder import ( + BaseEvaluationDatasetBuilder, + HFSource, +) +from docling_eval.utils.utils import get_binary, get_binhash + +# Get logger +_log = logging.getLogger(__name__) + + +class DoclingDPBenchDatasetBuilder(BaseEvaluationDatasetBuilder): + """ + DoclingDPBench dataset builder implementing the base dataset builder interface. + + This builder processes the DoclingDPBench dataset, which contains document + understanding benchmarks for various document types. + """ + + def __init__( + self, + target: Path, + split: str = "test", + begin_index: int = 0, + end_index: int = -1, + ): + """ + Initialize the DoclingDPBench dataset builder. + + Args: + target: Path where processed dataset will be saved + split: Dataset split to use + begin_index: Start index for processing (inclusive) + end_index: End index for processing (exclusive), -1 means process all + """ + super().__init__( + name="DoclingDPBench", + dataset_source=HFSource(repo_id="ds4sd/docling-dpbench"), + target=target, + split=split, + begin_index=begin_index, + end_index=end_index, + ) + + self.must_retrieve = True + + def iterate(self) -> Iterable[DatasetRecord]: + """ + Iterate through the dataset and yield DatasetRecord objects. + + Yields: + DatasetRecord objects + """ + if not self.retrieved and self.must_retrieve: + raise RuntimeError( + "You must first retrieve the source dataset. Call retrieve_input_dataset()." + ) + + assert self.dataset_local_path is not None + _log.info(f"dataset_local_path: {self.dataset_local_path}") + + # Login using e.g. 
+        ds = load_dataset("ds4sd/docling-dpbench")
+
+        for idx, _ in enumerate(ds["test"]):
+            doc_hash = str(get_binhash(_["BinaryDocument"]))
+            doc = DoclingDocument.model_validate_json(_["GroundTruthDocument"])
+
+            page_images = [
+                PILImage.open(BytesIO(__["bytes"])) for __ in _["GroundTruthPageImages"]
+            ]
+            pictures = [
+                PILImage.open(BytesIO(__["bytes"])) for __ in _["GroundTruthPictures"]
+            ]
+
+            pdf_stream = DocumentStream(
+                name=f"ds4sd/docling-dpbench/{idx}", stream=BytesIO(_["BinaryDocument"])
+            )
+
+            # Create dataset record
+            record = DatasetRecord(
+                doc_id=str(_["document_id"]),
+                doc_hash=doc_hash,
+                ground_truth_doc=doc,
+                ground_truth_pictures=pictures,
+                ground_truth_page_images=page_images,
+                original=pdf_stream,
+                mime_type=_["mimetype"],
+            )
+
+            yield record
diff --git a/docling_eval/dataset_builders/file_dataset_builder.py b/docling_eval/dataset_builders/file_dataset_builder.py
index 111bd19f..6b2e9d59 100644
--- a/docling_eval/dataset_builders/file_dataset_builder.py
+++ b/docling_eval/dataset_builders/file_dataset_builder.py
@@ -108,7 +108,7 @@ def iterate(self) -> Iterable[DatasetRecord]:
             # Create the ground truth Document
             true_doc = DoclingDocument(name=f"{filename}")
             if mime_type == "application/pdf":
-                _log.info(f"add_pages_to_true_doc: {filename}")
+                _log.debug(f"add_pages_to_true_doc: {filename}")
                 true_doc, _ = add_pages_to_true_doc(
                     pdf_path=filename, true_doc=true_doc, image_scale=2.0
                 )
@@ -127,7 +127,7 @@ def iterate(self) -> Iterable[DatasetRecord]:
                     image=image_ref,
                 )

-                _log.info(f"add_pages_to_true_doc: {filename}")
+                _log.debug(f"add_pages_to_true_doc: {filename}")
                 true_doc.pages[1] = page_item
             else:
                 raise ValueError(
diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py
index 63646762..31ee0a5e 100644
--- a/docling_eval/evaluators/layout_evaluator.py
+++ b/docling_eval/evaluators/layout_evaluator.py
@@ -59,6 +59,10 @@ class ImageLayoutEvaluation(UnitEvaluation):
     avg_weighted_label_matched_iou_90: float
     avg_weighted_label_matched_iou_95: float

+    segmentation_precision: float
+    segmentation_recall: float
+    segmentation_f1: float
+

 class DatasetLayoutEvaluation(DatasetEvaluation):
     true_labels: Dict[str, int]
@@ -78,6 +82,10 @@ class DatasetLayoutEvaluation(DatasetEvaluation):
     weighted_map_90_stats: DatasetStatistics
     weighted_map_95_stats: DatasetStatistics

+    segmentation_precision_stats: DatasetStatistics
+    segmentation_recall_stats: DatasetStatistics
+    segmentation_f1_stats: DatasetStatistics
+
     def to_table(self) -> Tuple[List[List[str]], List[str]]:
         headers = ["label", "Class mAP[0.5:0.95]"]

@@ -137,19 +145,28 @@ def __call__(
         # Load the dataset
         split_path = str(ds_path / split / "*.parquet")
         split_files = glob.glob(split_path)
-        logging.info("Files: %s", split_files)
+        logging.info("#-files: %s", len(split_files))
         ds = load_dataset("parquet", data_files={split: split_files})
         logging.info("Overview of dataset: %s", ds)

         # Select the split
         ds_selection: Dataset = ds[split]

-        true_labels, pred_labels, intersection_labels = self._find_intersecting_labels(
-            ds_selection
+        true_labels, pred_labels, intersection_labels, union_labels = (
+            self._find_intersecting_labels(ds_selection)
         )
-        intersection_labels_str = "\n" + "\n".join(sorted(intersection_labels))
+        true_labels_str = ", ".join(sorted(true_labels))
+        logging.info(f"True labels: {true_labels_str}")
+
+        pred_labels_str = ", ".join(sorted(pred_labels))
+        logging.info(f"Pred labels: {pred_labels_str}")
+
+        intersection_labels_str = ", ".join(sorted(intersection_labels))
         logging.info(f"Intersection labels: {intersection_labels_str}")

+        union_labels_str = ", ".join(sorted(union_labels))
+        logging.info(f"Union labels: {union_labels_str}")
+
         doc_ids = []
         ground_truths = []
         predictions = []
@@ -187,6 +204,9 @@ def __call__(
                 filter_labels=intersection_labels,
             )

+            # logging.info(f"gts: {gts}")
+            # logging.info(f"preds: {preds}")
+
             if len(gts) > 0:
                 for i in range(len(gts)):
                     doc_ids.append(data[BenchMarkColumns.DOC_ID] + f"-page-{i}")
@@ -258,8 +278,19 @@ def __call__(
         for i, (doc_id, pred, gt) in enumerate(
             zip(doc_ids, predictions, ground_truths)
         ):
+            # logging.info(f"gt: {gt}")
+            # logging.info(f"pred: {pred}")
+
+            precision, recall, f1 = self._compute_area_level_metrics_for_tensors(
+                gt_boxes=gt["boxes"],
+                pred_boxes=pred["boxes"],
+                page_width=100,
+                page_height=100,
+                mask_width=512,
+                mask_height=512,
+            )
+
             # Reset the metric for the next image
-            # metric.reset()
             metric = MeanAveragePrecision(iou_type="bbox", class_metrics=True)

             # Update with single image
@@ -293,6 +324,10 @@ def __call__(
             weighted_map_90_values.append(average_iou_90)
             weighted_map_95_values.append(average_iou_95)

+            logging.info(
+                f"doc: {doc_id}\tprecision: {precision:.2f}, recall: {recall:.2f}, f1: {f1:.2f}, map_50: {map_50:.2f}"
+            )
+
             image_evaluation = ImageLayoutEvaluation(
                 name=doc_id,
                 value=average_iou_50,
@@ -303,6 +338,9 @@ def __call__(
                 avg_weighted_label_matched_iou_75=average_iou_75,
                 avg_weighted_label_matched_iou_90=average_iou_90,
                 avg_weighted_label_matched_iou_95=average_iou_95,
+                segmentation_precision=precision,
+                segmentation_recall=recall,
+                segmentation_f1=f1,
             )
             evaluations_per_image.append(image_evaluation)
             if self._intermediate_evaluations_path:
@@ -326,6 +364,15 @@ def __call__(
             weighted_map_75_stats=compute_stats(weighted_map_75_values),
             weighted_map_90_stats=compute_stats(weighted_map_90_values),
             weighted_map_95_stats=compute_stats(weighted_map_95_values),
+            segmentation_precision_stats=compute_stats(
+                [_.segmentation_precision for _ in evaluations_per_image]
+            ),
+            segmentation_recall_stats=compute_stats(
+                [_.segmentation_recall for _ in evaluations_per_image]
+            ),
+            segmentation_f1_stats=compute_stats(
+                [_.segmentation_f1 for _ in evaluations_per_image]
+            ),
             true_labels=true_labels,
             pred_labels=pred_labels,
             intersecting_labels=[_.value for _ in intersection_labels],
@@ -449,7 +496,7 @@ def _compute_average_iou_with_labels_across_iou(
     def _find_intersecting_labels(
         self,
         ds: Dataset,
-    ) -> tuple[dict[str, int], dict[str, int], list[DocItemLabel]]:
+    ) -> tuple[dict[str, int], dict[str, int], list[DocItemLabel], list[DocItemLabel]]:
         r"""
         Compute counters per labels for the groundtruth, prediciton and their intersections

@@ -502,11 +549,18 @@ def _find_intersecting_labels(
         """
         intersection_labels: List[DocItemLabel] = []
+        union_labels: List[DocItemLabel] = []
         for label, count in true_labels.items():
+            union_labels.append(DocItemLabel(label))
+
             if label in pred_labels:
                 intersection_labels.append(DocItemLabel(label))

-        return true_labels, pred_labels, intersection_labels
+        for label, count in pred_labels.items():
+            if label not in true_labels:
+                union_labels.append(DocItemLabel(label))
+
+        return true_labels, pred_labels, intersection_labels, union_labels

     def _extract_layout_data(
         self,
@@ -572,13 +626,10 @@ def _extract_layout_data(
             for item in items:
                 for prov in item.prov:
                     bbox = prov.bbox.to_top_left_origin(page_height=page_height)
-                    # true_tl_bboxes.append(copy.deepcopy(bbox))

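+                    # Boxes are normalized to the page size and scaled to a 0-100 range before evaluation.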
                     bbox = bbox.normalized(page_size)
                     bbox = bbox.scaled(100.0)

-                    # logging.info(f"ground-truth {page_no}: {page_width, page_height} -> {item.label}, {bbox.coord_origin}: [{bbox.l}, {bbox.t}, {bbox.r}, {bbox.b}]")
-
                     bboxes.append([bbox.l, bbox.t, bbox.r, bbox.b])
                     labels.append(filter_labels.index(self.label_mapping[item.label]))  # type: ignore

@@ -635,3 +686,94 @@
         # print(pred_tl_bboxes_str)

         return ground_truths, predictions
+
+    def _compute_area_level_metrics_for_tensors(
+        self,
+        gt_boxes: torch.Tensor,
+        pred_boxes: torch.Tensor,
+        page_width: int,
+        page_height: int,
+        mask_width: int = 512,
+        mask_height: int = 512,
+    ) -> Tuple[float, float, float]:
+        """
+        Compute area-level precision, recall, and F1 score for tensor format boxes.
+        Handles overlapping boxes by using binary masks at the specified resolution.
+
+        Args:
+            gt_boxes: Ground truth boxes as tensor of shape (N, 4) with [x1, y1, x2, y2] format
+            pred_boxes: Predicted boxes as tensor of shape (M, 4) with [x1, y1, x2, y2] format
+            page_width: Width of the original page
+            page_height: Height of the original page
+            mask_width: Width of the mask to use for computation (default: 512)
+            mask_height: Height of the mask to use for computation (default: 512)
+
+        Returns:
+            Tuple of (precision, recall, f1) scores
+        """
+        if gt_boxes.shape[0] == 0:
+            precision = 1.0 if pred_boxes.shape[0] == 0 else 0.0
+            recall = 1.0
+            f1 = 1.0 if pred_boxes.shape[0] == 0 else 0.0
+            return precision, recall, f1
+
+        if pred_boxes.shape[0] == 0:
+            precision = 1.0
+            recall = 0.0
+            f1 = 0.0
+            return precision, recall, f1
+
+        # Calculate scaling factors (ensure float division)
+        x_scale = float(mask_width) / float(page_width)
+        y_scale = float(mask_height) / float(page_height)
+
+        # Create empty masks
+        gt_mask = torch.zeros((mask_height, mask_width), dtype=torch.bool, device="cpu")
+        pred_mask = torch.zeros(
+            (mask_height, mask_width), dtype=torch.bool, device="cpu"
+        )
+
+        # Fill ground truth mask
+        for i in range(gt_boxes.shape[0]):
+            x1, y1, x2, y2 = gt_boxes[i].tolist()
+
+            # Scale coordinates to mask space
+            x1, y1 = max(0, int(x1 * x_scale)), max(0, int(y1 * y_scale))
+            x2, y2 = min(mask_width, int(x2 * x_scale)), min(
+                mask_height, int(y2 * y_scale)
+            )
+
+            if x2 > x1 and y2 > y1:
+                gt_mask[y1:y2, x1:x2] = True
+
+        # Fill prediction mask
+        for i in range(pred_boxes.shape[0]):
+            x1, y1, x2, y2 = pred_boxes[i].tolist()
+
+            # Scale coordinates to mask space
+            x1, y1 = max(0, int(x1 * x_scale)), max(0, int(y1 * y_scale))
+            x2, y2 = min(mask_width, int(x2 * x_scale)), min(
+                mask_height, int(y2 * y_scale)
+            )
+
+            if x2 > x1 and y2 > y1:
+                pred_mask[y1:y2, x1:x2] = True
+
+        # Calculate areas (accounting for overlaps)
+        total_gt_area = torch.sum(gt_mask).item()
+        total_pred_area = torch.sum(pred_mask).item()
+
+        # Calculate intersection (logical AND of masks)
+        intersection_mask = torch.logical_and(gt_mask, pred_mask)
+        total_intersection = torch.sum(intersection_mask).item()
+
+        # Calculate metrics
+        precision = total_intersection / total_pred_area if total_pred_area > 0 else 0.0
+        recall = total_intersection / total_gt_area if total_gt_area > 0 else 0.0
+
+        # Calculate F1 score
+        f1 = 0.0
+        if precision + recall > 0:
+            f1 = 2 * (precision * recall) / (precision + recall)
+
+        return precision, recall, f1
diff --git a/docling_eval/evaluators/stats.py b/docling_eval/evaluators/stats.py
index ecd898a4..218bbfb6 100644
--- a/docling_eval/evaluators/stats.py
+++ b/docling_eval/evaluators/stats.py
@@ -74,16 +74,22 @@ def save_histogram(self, figname: Path, name: str = ""):
         plt.savefig(figname)


-def compute_stats(values: List[float]) -> DatasetStatistics:
+def compute_stats(
+    values: List[float], max_value_is_one: bool = True, nr_bins: int = 20
+) -> DatasetStatistics:
     total: int = len(values)
     mean: float = statistics.mean(values) if len(values) > 0 else -1
     median: float = statistics.median(values) if len(values) > 0 else -1
-    std: float = statistics.stdev(values) if len(values) > 0 else -1
+    std: float = statistics.stdev(values) if len(values) > 1 else 0.0
     logging.info(f"total: {total}, mean: {mean}, median: {median}, std: {std}")

-    # Compute the histogram with 20 bins between 0 and 1
-    hist, bins = np.histogram(values, bins=20, range=(0, 1))
+    max_value = 1.0
+    if not max_value_is_one and len(values) > 0:
+        max_value = max(values)
+
+    # Compute the histogram
+    hist, bins = np.histogram(values, bins=nr_bins, range=(0, max_value))
     logging.info(f"#-hist: {len(hist)}, #-bins: {len(bins)}")

     return DatasetStatistics(
diff --git a/docling_eval/evaluators/timings_evaluator.py b/docling_eval/evaluators/timings_evaluator.py
new file mode 100644
index 00000000..56192564
--- /dev/null
+++ b/docling_eval/evaluators/timings_evaluator.py
@@ -0,0 +1,130 @@
+import glob
+import logging
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+from datasets import Dataset, load_dataset
+from tqdm import tqdm
+
+from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction
+from docling_eval.datamodels.types import BenchMarkColumns, PredictionFormats
+from docling_eval.evaluators.base_evaluator import (
+    BaseEvaluator,
+    DatasetEvaluation,
+    EvaluationRejectionType,
+    UnitEvaluation,
+)
+from docling_eval.evaluators.stats import DatasetStatistics, compute_stats
+
+_log = logging.getLogger(__name__)
+
+
+class DatasetTimingsEvaluation(DatasetEvaluation):
+    """Dataset timing evaluation."""
+
+    timing_per_document_stats: DatasetStatistics
+    timing_per_page_stats: DatasetStatistics
+
+
+class TimingsEvaluator(BaseEvaluator):
+    """Timings evaluator."""
+
+    def __init__(
+        self,
+        intermediate_evaluations_path: Optional[Path] = None,
+        prediction_sources: List[PredictionFormats] = [],
+    ):
+        supported_prediction_formats: List[PredictionFormats] = [
+            PredictionFormats.DOCLING_DOCUMENT,
+        ]
+
+        if not prediction_sources:
+            prediction_sources = supported_prediction_formats
+        super().__init__(
+            intermediate_evaluations_path=intermediate_evaluations_path,
+            prediction_sources=prediction_sources,
+            supported_prediction_formats=supported_prediction_formats,
+        )
+
+    def __call__(
+        self,
+        ds_path: Path,
+        split: str = "test",
+    ) -> DatasetTimingsEvaluation:
+        logging.info("Loading the split '%s' from: '%s'", split, ds_path)
+
+        rejected_samples: Dict[EvaluationRejectionType, int] = {
+            EvaluationRejectionType.INVALID_CONVERSION_STATUS: 0,
+            EvaluationRejectionType.MISSING_PREDICTION: 0,
+            EvaluationRejectionType.MISMATHCED_DOCUMENT: 0,
+        }
+
+        # Load the dataset
+        split_path = str(ds_path / split / "*.parquet")
+        split_files = glob.glob(split_path)
+        logging.info("#-files: %s", len(split_files))
+        ds = load_dataset("parquet", data_files={split: split_files})
+        logging.info("Overview of dataset: %s", ds)
+
+        # Select the split
+        ds_selection: Dataset = ds[split]
+
+        timings = []
+        for i, data in tqdm(
+            enumerate(ds_selection),
+            desc="Timings evaluations",
+            ncols=120,
+            total=len(ds_selection),
+        ):
+            data_record = DatasetRecordWithPrediction.model_validate(data)
+
+            doc_id = data_record.doc_id
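+            # Records whose conversion failed carry no usable timings and are skipped.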
+            if data_record.status not in self._accepted_status:
+                _log.error(
+                    "Skipping record without successful conversion status: %s", doc_id
+                )
+                rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1
+                continue
+
+            # print(data_record.prediction_timings)
+            timings.append(data_record.prediction_timings)
+
+        if rejected_samples[EvaluationRejectionType.MISMATHCED_DOCUMENT] > 0:
+            logging.error(
+                "Total mismatched/skipped documents: %s over %s",
+                rejected_samples[EvaluationRejectionType.MISMATHCED_DOCUMENT],
+                len(ds_selection),
+            )
+
+        time_per_doc = []
+        time_per_page = []
+
+        for timing in timings:
+
+            if timing is not None:
+                for key, val in timing.items():
+                    if key == "pipeline_total":
+                        time_per_doc.extend(val)
+
+                    if key == "layout":
+                        _time_per_page = [0.0 for v in val]
+                        for k2, v2 in timing.items():
+                            if len(v2) == len(_time_per_page):
+                                for i, v in enumerate(v2):
+                                    _time_per_page[i] += v
+
+                        time_per_page.extend(_time_per_page)
+
+        dataset_timings_evaluation = DatasetTimingsEvaluation(
+            timing_per_document_stats=compute_stats(
+                time_per_doc,
+                max_value_is_one=False,
+                nr_bins=32,
+            ),
+            timing_per_page_stats=compute_stats(
+                time_per_page,
+                max_value_is_one=False,
+                nr_bins=32,
+            ),
+        )
+        return dataset_timings_evaluation
diff --git a/docling_eval/prediction_providers/base_prediction_provider.py b/docling_eval/prediction_providers/base_prediction_provider.py
index d280ed98..a642593d 100644
--- a/docling_eval/prediction_providers/base_prediction_provider.py
+++ b/docling_eval/prediction_providers/base_prediction_provider.py
@@ -8,6 +8,7 @@
 from datasets import load_dataset
 from docling.datamodel.base_models import ConversionStatus
+from docling.utils.profiling import ProfilingItem
 from docling.utils.utils import chunkify
 from docling_core.types.doc import DocItemLabel
 from docling_core.types.doc.document import DoclingDocument
@@ -31,7 +32,6 @@
 )
 from docling_eval.visualisation.visualisations import save_comparison_html_with_clusters

-# Get logger
 _log = logging.getLogger(__name__)

 # Default HTML export labels for visualization
@@ -186,6 +186,7 @@ def create_dataset_record_with_prediction(
         record: DatasetRecord,
         predicted_doc: Optional[DoclingDocument] = None,
         original_prediction: Optional[str] = None,
+        timings: Optional[dict] = None,
     ) -> DatasetRecordWithPrediction:
         """
         Create a dataset record with prediction from an input record.
@@ -215,9 +216,36 @@ def create_dataset_record_with_prediction(
             "predicted_pictures": pred_pictures,
             "original_prediction": original_prediction,
             "prediction_format": self.prediction_format,
+            "prediction_timings": self._prediction_timings(timings),
             "predictor_info": self.info(),
         }
-        return DatasetRecordWithPrediction.model_validate(data)
+        record = DatasetRecordWithPrediction.model_validate(data)
+
+        return record
+
+    def _prediction_timings(self, timings: Optional[dict]) -> Optional[dict]:
+        """Get prediction timings."""
+
+        if isinstance(timings, dict):
+            result = {}
+            for key, val in timings.items():
+                if isinstance(val, ProfilingItem):
+                    result[key] = val.times
+
+            if len(result) == 0:  # datasets does not like empty dicts
+                _log.warning(f"empty timings: {timings}")
+                return None
+
+            # import json
+            # print(json.dumps(result, indent=2))
+
+            return result
+
+        elif timings is None:
+            return None
+        else:
+            _log.warning(f"unknown type of timings: {timings}")
+            return None

     def add_prediction(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
         """
diff --git a/docling_eval/prediction_providers/docling_provider.py b/docling_eval/prediction_providers/docling_provider.py
index b86b619b..2a4a2b8c 100644
--- a/docling_eval/prediction_providers/docling_provider.py
+++ b/docling_eval/prediction_providers/docling_provider.py
@@ -1,8 +1,10 @@
 import copy
+import logging
 import platform
 from typing import Dict, List, Optional, Set

 from docling.datamodel.base_models import InputFormat
+from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, FormatOption
 from docling_core.types.doc import DocItemLabel
 from pydantic import TypeAdapter
@@ -21,6 +23,8 @@
 )
 from docling_eval.utils.utils import docling_version, get_package_version

+_log = logging.getLogger(__name__)
+

 class DoclingPredictionProvider(BasePredictionProvider):
     """
@@ -47,6 +51,7 @@ def __init__(
         ignore_missing_predictions: bool = True,
         true_labels: Optional[Set[DocItemLabel]] = None,
         pred_labels: Optional[Set[DocItemLabel]] = None,
+        profile_pipeline_timings: bool = True,
     ):
         """
         Initialize the Docling prediction provider.
@@ -65,6 +70,11 @@ def __init__(
             true_labels=true_labels,
             pred_labels=pred_labels,
         )
+
+        # Enable profiling to measure the time spent in the conversion pipeline
+        settings.debug.profile_pipeline_timings = profile_pipeline_timings
+        _log.info(f"profile_pipeline_timings: {profile_pipeline_timings}")
+
         self.doc_converter = DocumentConverter(format_options=format_options)

     @property
@@ -95,9 +105,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:

         # Create prediction record
         pred_record = self.create_dataset_record_with_prediction(
-            record,
-            res.document,
-            None,
+            record, res.document, None, res.timings
         )
         pred_record.status = res.status

diff --git a/docling_eval/visualisation/constants.py b/docling_eval/visualisation/constants.py
index 823c258a..59c12c5a 100644
--- a/docling_eval/visualisation/constants.py
+++ b/docling_eval/visualisation/constants.py
@@ -109,6 +109,99 @@
 """

+HTML_DEFAULT_HEAD_FOR_COMP_v2: str = r"""
+ + +