diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index d9a4ffac..e1b566f7 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -73,6 +73,13 @@ # Configure logging logging.getLogger("docling").setLevel(logging.WARNING) +logging.getLogger("PIL").setLevel(logging.WARNING) +logging.getLogger("transformers").setLevel(logging.WARNING) +logging.getLogger("datasets").setLevel(logging.WARNING) +logging.getLogger("filelock").setLevel(logging.WARNING) +logging.getLogger("urllib3").setLevel(logging.WARNING) +logging.getLogger("docling_ibm_models").setLevel(logging.WARNING) + _log = logging.getLogger(__name__) app = typer.Typer( @@ -188,14 +195,17 @@ def get_dataset_builder( name="CVAT", dataset_source=dataset_source, target=target, split=split ) elif benchmark == BenchMarkNames.PLAIN_FILES: - assert dataset_source is not None + if dataset_source is None: + raise ValueError("dataset_source is required for PLAIN_FILES") + return FileDatasetBuilder( name=dataset_source.name, dataset_source=dataset_source, target=target, split=split, + begin_index=begin_index, + end_index=end_index, ) - else: raise ValueError(f"Unsupported benchmark: {benchmark}") @@ -209,7 +219,11 @@ def get_prediction_provider( ): pipeline_options: PaginatedPipelineOptions """Get the appropriate prediction provider with default settings.""" - if provider_type == PredictionProviderType.DOCLING: + if ( + provider_type == PredictionProviderType.DOCLING + or provider_type == PredictionProviderType.OCR_DOCLING + or provider_type == PredictionProviderType.EasyOCR_DOCLING + ): ocr_factory = get_ocr_factory() ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore @@ -238,6 +252,78 @@ def get_prediction_provider( ignore_missing_predictions=True, ) + elif provider_type == PredictionProviderType.MacOCR_DOCLING: + ocr_factory = get_ocr_factory() + + ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore + kind="ocrmac", + ) + + pipeline_options = 
PdfPipelineOptions( + do_ocr=True, + ocr_options=ocr_options, + do_table_structure=True, + ) + + pipeline_options.images_scale = 2.0 + pipeline_options.generate_page_images = True + pipeline_options.generate_picture_images = True + + if artifacts_path is not None: + pipeline_options.artifacts_path = artifacts_path + + return DoclingPredictionProvider( + format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options), + InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options), + }, + do_visualization=do_visualization, + ignore_missing_predictions=True, + ) + + elif provider_type == PredictionProviderType.PDF_DOCLING: + + ocr_factory = get_ocr_factory() + + ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore + kind="easyocr", + ) + + pdf_pipeline_options = PdfPipelineOptions( + do_ocr=False, + ocr_options=ocr_options, # we need to provide OCR options in order to not break the parquet serialization + do_table_structure=True, + ) + + pdf_pipeline_options.images_scale = 2.0 + pdf_pipeline_options.generate_page_images = True + pdf_pipeline_options.generate_picture_images = True + + ocr_pipeline_options = PdfPipelineOptions( + do_ocr=True, + ocr_options=ocr_options, # we need to provide OCR options in order to not break the parquet serialization + do_table_structure=True, + ) + + ocr_pipeline_options.images_scale = 2.0 + ocr_pipeline_options.generate_page_images = True + ocr_pipeline_options.generate_picture_images = True + + if artifacts_path is not None: + pdf_pipeline_options.artifacts_path = artifacts_path + ocr_pipeline_options.artifacts_path = artifacts_path + + return DoclingPredictionProvider( + format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options), + InputFormat.IMAGE: PdfFormatOption( + pipeline_options=ocr_pipeline_options + ), + }, + do_visualization=do_visualization, + ignore_missing_predictions=True, + ) + elif provider_type == PredictionProviderType.SMOLDOCLING: 
pipeline_options = VlmPipelineOptions() @@ -614,9 +700,14 @@ def create_cvat( output_dir: Annotated[Path, typer.Option(help="Output directory")], gt_dir: Annotated[Path, typer.Option(help="Dataset source path")], bucket_size: Annotated[int, typer.Option(help="Size of CVAT tasks")] = 20, + use_predictions: Annotated[bool, typer.Option(help="use predictions")] = False, ): + """Create dataset ready to upload to CVAT starting from (ground-truth) dataset.""" builder = CvatPreannotationBuilder( - dataset_source=gt_dir, target=output_dir, bucket_size=bucket_size + dataset_source=gt_dir, + target=output_dir, + bucket_size=bucket_size, + use_predictions=use_predictions, ) builder.prepare_for_annotation() diff --git a/docling_eval/datamodels/types.py b/docling_eval/datamodels/types.py index 93a5011f..120ee414 100644 --- a/docling_eval/datamodels/types.py +++ b/docling_eval/datamodels/types.py @@ -118,6 +118,11 @@ class PredictionProviderType(str, Enum): """Types of prediction providers available.""" DOCLING = "Docling" + PDF_DOCLING = "PDF_Docling" + OCR_DOCLING = "OCR_Docling" + MacOCR_DOCLING = "MacOCR_Docling" + EasyOCR_DOCLING = "EasyOCR_Docling" + TABLEFORMER = "TableFormer" FILE = "File" SMOLDOCLING = "SmolDocling" diff --git a/docling_eval/dataset_builders/dataset_builder.py b/docling_eval/dataset_builders/dataset_builder.py index a3b96a5e..012c95fd 100644 --- a/docling_eval/dataset_builders/dataset_builder.py +++ b/docling_eval/dataset_builders/dataset_builder.py @@ -7,6 +7,7 @@ import ibm_boto3 # type: ignore from docling.utils.utils import chunkify +from docling_core.types.doc.document import ImageRefMode from huggingface_hub import snapshot_download from pydantic import BaseModel @@ -15,7 +16,6 @@ TRUE_HTML_EXPORT_LABELS, ) from docling_eval.utils.utils import save_shard_to_disk, write_datasets_info -from docling_eval.visualisation.visualisations import save_inspection_html # Get logger _log = logging.getLogger(__name__) @@ -276,10 +276,11 @@ def save_to_disk( 
record_list.append(r.as_record_dict()) if do_visualization: viz_path = self.target / "visualizations" / f"{r.doc_id}.html" - save_inspection_html( + r.ground_truth_doc.save_as_html( filename=viz_path, - doc=r.ground_truth_doc, labels=TRUE_HTML_EXPORT_LABELS, + image_mode=ImageRefMode.EMBEDDED, + split_page_view=True, ) save_shard_to_disk( diff --git a/docling_eval/dataset_builders/file_dataset_builder.py b/docling_eval/dataset_builders/file_dataset_builder.py index d32a9b4e..111bd19f 100644 --- a/docling_eval/dataset_builders/file_dataset_builder.py +++ b/docling_eval/dataset_builders/file_dataset_builder.py @@ -100,7 +100,7 @@ def iterate(self) -> Iterable[DatasetRecord]: for filename in tqdm( selected_filenames, - desc="Processing files for DP-Bench", + desc=f"Processing files for {self.name}", ncols=128, ): mime_type, _ = mimetypes.guess_type(filename) @@ -108,6 +108,7 @@ def iterate(self) -> Iterable[DatasetRecord]: # Create the ground truth Document true_doc = DoclingDocument(name=f"{filename}") if mime_type == "application/pdf": + _log.info(f"add_pages_to_true_doc: {filename}") true_doc, _ = add_pages_to_true_doc( pdf_path=filename, true_doc=true_doc, image_scale=2.0 ) @@ -126,6 +127,7 @@ def iterate(self) -> Iterable[DatasetRecord]: image=image_ref, ) + _log.info(f"add_pages_to_true_doc: {filename}") true_doc.pages[1] = page_item else: raise ValueError( @@ -139,18 +141,20 @@ def iterate(self) -> Iterable[DatasetRecord]: page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, ) - # Get PDF as binary data - pdf_bytes = get_binary(filename) - pdf_stream = DocumentStream(name=filename.name, stream=BytesIO(pdf_bytes)) + # Get source as binary data + source_bytes = get_binary(filename) + source_stream = DocumentStream( + name=filename.name, stream=BytesIO(source_bytes) + ) # Create dataset record record = DatasetRecord( doc_id=str(filename.name), - doc_hash=get_binhash(pdf_bytes), + doc_hash=get_binhash(source_bytes), ground_truth_doc=true_doc, 
ground_truth_pictures=true_pictures, ground_truth_page_images=true_page_images, - original=pdf_stream, + original=source_stream, mime_type=mime_type, ) diff --git a/docling_eval/evaluators/readingorder_evaluator.py b/docling_eval/evaluators/readingorder_evaluator.py index 3c3fa364..01c64fb5 100644 --- a/docling_eval/evaluators/readingorder_evaluator.py +++ b/docling_eval/evaluators/readingorder_evaluator.py @@ -292,7 +292,7 @@ def _show_items(self, true_doc: DoclingDocument): ) text = item.text if isinstance(item, TextItem) else None label = item.label # type: ignore - print(f"True {i}: {level} - {label}: {bbox} - {text}") + # print(f"True {i}: {level} - {label}: {bbox} - {text}") class ReadingOrderVisualizer: diff --git a/docling_eval/prediction_providers/base_prediction_provider.py b/docling_eval/prediction_providers/base_prediction_provider.py index 996cafdd..d280ed98 100644 --- a/docling_eval/prediction_providers/base_prediction_provider.py +++ b/docling_eval/prediction_providers/base_prediction_provider.py @@ -165,7 +165,6 @@ def visualize_results( / f"{prediction_record.doc_id}.html", true_doc=gt_doc, pred_doc=pred_doc, - page_image=prediction_record.ground_truth_page_images[0], true_labels=self.true_labels, pred_labels=self.pred_labels, draw_reading_order=True, diff --git a/docling_eval/prediction_providers/docling_provider.py b/docling_eval/prediction_providers/docling_provider.py index 880e58b4..b86b619b 100644 --- a/docling_eval/prediction_providers/docling_provider.py +++ b/docling_eval/prediction_providers/docling_provider.py @@ -106,7 +106,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: def info(self) -> Dict: """Get information about the prediction provider.""" - return { + result = { "asset": PredictionProviderType.DOCLING, "version": docling_version(), "package_versions": { @@ -128,10 +128,11 @@ def info(self) -> Dict: mode="json", exclude_defaults=True ) if v.pipeline_options is not None - else {} + else None # Parquet 
might not like empty dicts! ), } for k, v in self.doc_converter.format_to_options.items() if k in [InputFormat.PDF, InputFormat.IMAGE] }, } + return result diff --git a/docling_eval/visualisation/constants.py b/docling_eval/visualisation/constants.py index a0a04e3c..823c258a 100644 --- a/docling_eval/visualisation/constants.py +++ b/docling_eval/visualisation/constants.py @@ -309,7 +309,7 @@ display: flex; flex-direction: column; width: 25%; /* Adjust the width of each item */ - height: 100%; /* Adjust height to fill parent container */ + height: 50%; /* Adjust height to fill parent container */ border: 1px solid #ccc; /* Optional: Add borders */ box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.1); /* Optional: Add shadow */ background-color: #fff; /* Optional: Add background */ diff --git a/docling_eval/visualisation/visualisations.py b/docling_eval/visualisation/visualisations.py index 330c634a..f782fca6 100644 --- a/docling_eval/visualisation/visualisations.py +++ b/docling_eval/visualisation/visualisations.py @@ -1,10 +1,16 @@ import copy import logging +import re from pathlib import Path from typing import Set from docling.datamodel.base_models import BoundingBox, Cluster from docling.utils.visualization import draw_clusters +from docling_core.experimental.serializer.html import ( + HTMLDocSerializer, + HTMLOutputStyle, + HTMLParams, +) from docling_core.types.doc.document import ( ContentLayer, DocItem, @@ -23,54 +29,6 @@ ) -def save_comparison_html( - filename: Path, - true_doc: DoclingDocument, - pred_doc: DoclingDocument, - page_image: Image.Image, - true_labels: Set[DocItemLabel], - pred_labels: Set[DocItemLabel], -): - - true_doc_html = true_doc.export_to_html( - image_mode=ImageRefMode.EMBEDDED, - html_head=HTML_DEFAULT_HEAD_FOR_COMP, - labels=true_labels, - ) - - pred_doc_html = pred_doc.export_to_html( - image_mode=ImageRefMode.EMBEDDED, - html_head=HTML_DEFAULT_HEAD_FOR_COMP, - labels=pred_labels, - ) - - # since the string in srcdoc are wrapped by ', we need 
to replace all ' by it HTML convention - true_doc_html = true_doc_html.replace("'", "'") - pred_doc_html = pred_doc_html.replace("'", "'") - - image_base64 = from_pil_to_base64(page_image) - - """ - # Convert the image to a bytes object - buffered = io.BytesIO() - page_image.save( - buffered, format="PNG" - ) # Specify the format (e.g., JPEG, PNG, etc.) - image_bytes = buffered.getvalue() - - # Encode the bytes to a Base64 string - image_base64 = base64.b64encode(image_bytes).decode("utf-8") - """ - - comparison_page = copy.deepcopy(HTML_COMPARISON_PAGE) - comparison_page = comparison_page.replace("BASE64PAGE", image_base64) - comparison_page = comparison_page.replace("TRUEDOC", true_doc_html) - comparison_page = comparison_page.replace("PREDDOC", pred_doc_html) - - with open(str(filename), "w") as fw: - fw.write(comparison_page) - - def draw_arrow( draw: ImageDraw.ImageDraw, arrow_coords: tuple[float, float, float, float], @@ -117,296 +75,151 @@ def draw_arrow( return draw -def draw_clusters_with_reading_order( - doc: DoclingDocument, - page_image: Image.Image, - labels: Set[DocItemLabel], - page_no: int = 1, - reading_order: bool = True, -): - - # img = copy.deepcopy(page_image) - img = page_image.copy() - draw = ImageDraw.Draw(img) - - # Load a font (adjust the font size and path as needed) - font = ImageFont.load_default() - try: - font = ImageFont.truetype("arial.ttf", size=15) - except IOError: - font = ImageFont.load_default() - - x0, y0 = None, None - - for item, level in doc.iterate_items( - included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE} - ): - if isinstance(item, DocItem): # and item.label in labels: - for prov in item.prov: - - if page_no != prov.page_no: - continue - - bbox = prov.bbox.to_top_left_origin( - page_height=doc.pages[prov.page_no].size.height - ) - bbox = bbox.normalized(doc.pages[prov.page_no].size) - - bbox.l = round(bbox.l * img.width) - bbox.r = round(bbox.r * img.width) - bbox.t = round(bbox.t * img.height) - bbox.b 
= round(bbox.b * img.height) - - if bbox.b > bbox.t: - bbox.b, bbox.t = bbox.t, bbox.b - - if not reading_order: - x0, y0 = None, None - elif x0 is None and y0 is None: - x0 = (bbox.l + bbox.r) / 2.0 - y0 = (bbox.b + bbox.t) / 2.0 - else: - assert x0 is not None - assert y0 is not None - - x1 = (bbox.l + bbox.r) / 2.0 - y1 = (bbox.b + bbox.t) / 2.0 - - # Arrow parameters - start_point = (x0, y0) # Starting point of the arrow - end_point = (x1, y1) # Ending point of the arrow - arrowhead_length = 20 # Length of the arrowhead - arrowhead_width = 10 # Width of the arrowhead - - arrow_color = "red" - line_width = 2 - - # Draw the arrow shaft (line) - draw.line( - [start_point, end_point], fill=arrow_color, width=line_width - ) - - # Calculate the arrowhead points - dx = end_point[0] - start_point[0] - dy = end_point[1] - start_point[1] - angle = (dx**2 + dy**2) ** 0.5 + 0.01 # Length of the arrow shaft - - # Normalized direction vector for the arrow shaft - ux, uy = dx / angle, dy / angle - - # Base of the arrowhead - base_x = end_point[0] - ux * arrowhead_length - base_y = end_point[1] - uy * arrowhead_length - - # Left and right points of the arrowhead - left_x = base_x - uy * arrowhead_width - left_y = base_y + ux * arrowhead_width - right_x = base_x + uy * arrowhead_width - right_y = base_y - ux * arrowhead_width - - # Draw the arrowhead (triangle) - draw.polygon( - [end_point, (left_x, left_y), (right_x, right_y)], - fill=arrow_color, - ) - - x0, y0 = x1, y1 - - # Draw rectangle with only a border - rectangle_color = "blue" - border_width = 1 - draw.rectangle( - [bbox.l, bbox.b, bbox.r, bbox.t], - outline=rectangle_color, - width=border_width, - ) - - # Calculate label size using getbbox - text_bbox = font.getbbox(str(item.label)) - label_width = text_bbox[2] - text_bbox[0] - label_height = text_bbox[3] - text_bbox[1] - label_x = bbox.l - label_y = ( - bbox.b - label_height - ) # - 5 # Place the label above the rectangle - - # Draw label text - draw.text( - 
(label_x, label_y), - str(item.label), - fill=rectangle_color, - font=font, - ) - - return img - - def save_comparison_html_with_clusters( filename: Path, true_doc: DoclingDocument, pred_doc: DoclingDocument, - page_image: Image.Image, true_labels: Set[DocItemLabel], pred_labels: Set[DocItemLabel], draw_reading_order: bool = True, ): - if (1 not in true_doc.pages) or (1 not in pred_doc.pages): - logging.error(f"1 not in true_doc.pages -> skipping {filename} ") - return - - def draw_doc_layout(doc: DoclingDocument, image: Image.Image): - r""" - Draw the document clusters and optionaly the reading order - """ - clusters = [] - for idx, (elem, _) in enumerate( - doc.iterate_items( - included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE} - ) - ): - if not isinstance(elem, DocItem): - continue - if len(elem.prov) == 0: - continue # Skip elements without provenances - prov = elem.prov[0] - - if prov.page_no not in true_doc.pages or prov.page_no != 1: - logging.error(f"{prov.page_no} not in true_doc.pages -> skipping! 
") - continue - - tlo_bbox = prov.bbox.to_top_left_origin( - page_height=doc.pages[prov.page_no].size.height - ) - cluster = Cluster( - id=idx, - label=elem.label, - bbox=BoundingBox.model_validate(tlo_bbox), - cells=[], - ) - clusters.append(cluster) - - scale_x = image.width / doc.pages[1].size.width - scale_y = image.height / doc.pages[1].size.height - draw_clusters(image, clusters, scale_x, scale_y) + """Save comparison html with clusters.""" - return image + def get_missing_pageimg(width=800, height=1100, text="MISSING PAGE"): + """Get missing page imgage.""" + import numpy as np + from PIL import Image, ImageDraw, ImageFont - def draw_doc_reading_order(doc: DoclingDocument, image: Image.Image): - r""" - Draw the reading order - """ + # Create a white background image + image = Image.new("RGB", (width, height), color="white") draw = ImageDraw.Draw(image) - x0, y0 = None, None - - for elem, _ in doc.iterate_items( - included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE} - ): - if not isinstance(elem, DocItem): - continue - if len(elem.prov) == 0: - continue # Skip elements without provenances - prov = elem.prov[0] - - if prov.page_no not in true_doc.pages or prov.page_no != 1: - logging.error(f"{prov.page_no} not in true_doc.pages -> skipping! 
") - continue - - tlo_bbox = prov.bbox.to_top_left_origin( - page_height=doc.pages[prov.page_no].size.height - ) - ro_bbox = tlo_bbox.normalized(doc.pages[prov.page_no].size) - ro_bbox.l = round(ro_bbox.l * image.width) - ro_bbox.r = round(ro_bbox.r * image.width) - ro_bbox.t = round(ro_bbox.t * image.height) - ro_bbox.b = round(ro_bbox.b * image.height) - - if ro_bbox.b > ro_bbox.t: - ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b - - if x0 is None and y0 is None: - x0 = (ro_bbox.l + ro_bbox.r) / 2.0 - y0 = (ro_bbox.b + ro_bbox.t) / 2.0 - else: - assert x0 is not None - assert y0 is not None - - x1 = (ro_bbox.l + ro_bbox.r) / 2.0 - y1 = (ro_bbox.b + ro_bbox.t) / 2.0 - - draw = draw_arrow( - draw, - (x0, y0, x1, y1), - line_width=2, - color="red", - ) - x0, y0 = x1, y1 + + # Try to use a standard font or fall back to default + try: + # For larger installations, you might have Arial or other fonts + font = ImageFont.truetype("arial.ttf", size=60) + except IOError: + # Fall back to default font + font = ImageFont.load_default().font_variant(size=60) + + # Get text size to center it + text_width, text_height = ( + draw.textsize(text, font=font) + if hasattr(draw, "textsize") + else (draw.textlength(text, font=font), font.size) + ) + + # Position for the text (centered and angled) + position = ((width - text_width) // 2, (height - text_height) // 2) + + # Draw the watermark text (light gray and rotated) + draw.text(position, text, fill=(200, 200, 200), font=font) + + # Rotate the image 45 degrees to create diagonal watermark effect + image = image.rotate(45, expand=False, fillcolor="white") + return image - # HTML rendering - true_doc_html = true_doc.export_to_html( - image_mode=ImageRefMode.EMBEDDED, - html_head=HTML_DEFAULT_HEAD_FOR_COMP, - labels=true_labels, - ) + true_page_imgs = true_doc.get_visualization(show_label=False) + pred_page_imgs = pred_doc.get_visualization(show_label=False) - pred_doc_html = pred_doc.export_to_html( - image_mode=ImageRefMode.EMBEDDED, - 
html_head=HTML_DEFAULT_HEAD_FOR_COMP, - labels=pred_labels, + true_page_nos = true_page_imgs.keys() + pred_page_nos = pred_page_imgs.keys() + + if true_page_nos != pred_page_nos: + logging.error( + f"incompatible true_page_nos versus pred_page_nos: \ntrue_page_nos: {true_page_nos}\npred_page_nos: {pred_page_nos}" + ) + + page_nos = true_page_nos | pred_page_nos + + html_parts = [ + "", + "", + HTML_DEFAULT_HEAD_FOR_COMP, + "
", + ] + + html_parts.append("| ")
+ html_parts.append(f' | ")
+
+ html_parts.append("")
+ html_parts.append(f" \n{true_doc_page_body}\n ")
+ html_parts.append(" | ")
- html_doc = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED, labels=labels)
- html_doc = html_doc.replace("'", "&#39;")
+ html_parts.append("")
+ html_parts.append(f' | ")
- page_images = []
- page_template = '")
+ html_parts.append(f" \n{pred_doc_page_body}\n ")
+ html_parts.append(" | ")
- if page.image is not None and page.image.pil_image is not None:
+ html_parts.append("