docling-project · PeterStaar-IBM · Apr 25, 2025 · Apr 24, 2025 · Apr 24, 2025 · Apr 24, 2025
diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py
@@ -73,6 +73,13 @@
 
 # Configure logging
 logging.getLogger("docling").setLevel(logging.WARNING)
+logging.getLogger("PIL").setLevel(logging.WARNING)
+logging.getLogger("transformers").setLevel(logging.WARNING)
+logging.getLogger("datasets").setLevel(logging.WARNING)
+logging.getLogger("filelock").setLevel(logging.WARNING)
+logging.getLogger("urllib3").setLevel(logging.WARNING)
+logging.getLogger("docling_ibm_models").setLevel(logging.WARNING)
+
 _log = logging.getLogger(__name__)
 
 app = typer.Typer(
@@ -188,14 +195,17 @@ def get_dataset_builder(
             name="CVAT", dataset_source=dataset_source, target=target, split=split
         )
     elif benchmark == BenchMarkNames.PLAIN_FILES:
-        assert dataset_source is not None
+        if dataset_source is None:
+            raise ValueError("dataset_source is required for PLAIN_FILES")
+
         return FileDatasetBuilder(
             name=dataset_source.name,
             dataset_source=dataset_source,
             target=target,
             split=split,
+            begin_index=begin_index,
+            end_index=end_index,
         )
-
     else:
         raise ValueError(f"Unsupported benchmark: {benchmark}")
 
@@ -209,7 +219,11 @@ def get_prediction_provider(
 ):
     pipeline_options: PaginatedPipelineOptions
     """Get the appropriate prediction provider with default settings."""
-    if provider_type == PredictionProviderType.DOCLING:
+    if (
+        provider_type == PredictionProviderType.DOCLING
+        or provider_type == PredictionProviderType.OCR_DOCLING
+        or provider_type == PredictionProviderType.EasyOCR_DOCLING
+    ):
         ocr_factory = get_ocr_factory()
 
         ocr_options: OcrOptions = ocr_factory.create_options(  # type: ignore
@@ -238,6 +252,78 @@ def get_prediction_provider(
             ignore_missing_predictions=True,
         )
 
+    elif provider_type == PredictionProviderType.MacOCR_DOCLING:
+        ocr_factory = get_ocr_factory()
+
+        ocr_options: OcrOptions = ocr_factory.create_options(  # type: ignore
+            kind="ocrmac",
+        )
+
+        pipeline_options = PdfPipelineOptions(
+            do_ocr=True,
+            ocr_options=ocr_options,
+            do_table_structure=True,
+        )
+
+        pipeline_options.images_scale = 2.0
+        pipeline_options.generate_page_images = True
+        pipeline_options.generate_picture_images = True
+
+        if artifacts_path is not None:
+            pipeline_options.artifacts_path = artifacts_path
+
+        return DoclingPredictionProvider(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
+                InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
+            },
+            do_visualization=do_visualization,
+            ignore_missing_predictions=True,
+        )
+
+    elif provider_type == PredictionProviderType.PDF_DOCLING:
+
+        ocr_factory = get_ocr_factory()
+
+        ocr_options: OcrOptions = ocr_factory.create_options(  # type: ignore
+            kind="easyocr",
+        )
+
+        pdf_pipeline_options = PdfPipelineOptions(
+            do_ocr=False,
+            ocr_options=ocr_options,  # we need to provide OCR options in order to not break the parquet serialization
+            do_table_structure=True,
+        )
+
+        pdf_pipeline_options.images_scale = 2.0
+        pdf_pipeline_options.generate_page_images = True
+        pdf_pipeline_options.generate_picture_images = True
+
+        ocr_pipeline_options = PdfPipelineOptions(
+            do_ocr=True,
+            ocr_options=ocr_options,  # used here: OCR is enabled for image inputs (same EasyOCR options as the PDF pipeline)
+            do_table_structure=True,
+        )
+
+        ocr_pipeline_options.images_scale = 2.0
+        ocr_pipeline_options.generate_page_images = True
+        ocr_pipeline_options.generate_picture_images = True
+
+        if artifacts_path is not None:
+            pdf_pipeline_options.artifacts_path = artifacts_path
+            ocr_pipeline_options.artifacts_path = artifacts_path
+
+        return DoclingPredictionProvider(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
+                InputFormat.IMAGE: PdfFormatOption(
+                    pipeline_options=ocr_pipeline_options
+                ),
+            },
+            do_visualization=do_visualization,
+            ignore_missing_predictions=True,
+        )
+
     elif provider_type == PredictionProviderType.SMOLDOCLING:
         pipeline_options = VlmPipelineOptions()
 
@@ -614,9 +700,14 @@ def create_cvat(
     output_dir: Annotated[Path, typer.Option(help="Output directory")],
     gt_dir: Annotated[Path, typer.Option(help="Dataset source path")],
     bucket_size: Annotated[int, typer.Option(help="Size of CVAT tasks")] = 20,
+    use_predictions: Annotated[bool, typer.Option(help="use predictions")] = False,
 ):
+    """Create dataset ready to upload to CVAT starting from (ground-truth) dataset."""
     builder = CvatPreannotationBuilder(
-        dataset_source=gt_dir, target=output_dir, bucket_size=bucket_size
+        dataset_source=gt_dir,
+        target=output_dir,
+        bucket_size=bucket_size,
+        use_predictions=use_predictions,
     )
     builder.prepare_for_annotation()
 

diff --git a/docling_eval/datamodels/types.py b/docling_eval/datamodels/types.py
@@ -118,6 +118,11 @@ class PredictionProviderType(str, Enum):
     """Types of prediction providers available."""
 
     DOCLING = "Docling"
+    PDF_DOCLING = "PDF_Docling"
+    OCR_DOCLING = "OCR_Docling"
+    MacOCR_DOCLING = "MacOCR_Docling"
+    EasyOCR_DOCLING = "EasyOCR_Docling"
+
     TABLEFORMER = "TableFormer"
     FILE = "File"
     SMOLDOCLING = "SmolDocling"

diff --git a/docling_eval/dataset_builders/dataset_builder.py b/docling_eval/dataset_builders/dataset_builder.py
@@ -7,6 +7,7 @@
 
 import ibm_boto3  # type: ignore
 from docling.utils.utils import chunkify
+from docling_core.types.doc.document import ImageRefMode
 from huggingface_hub import snapshot_download
 from pydantic import BaseModel
 
@@ -15,7 +16,6 @@
     TRUE_HTML_EXPORT_LABELS,
 )
 from docling_eval.utils.utils import save_shard_to_disk, write_datasets_info
-from docling_eval.visualisation.visualisations import save_inspection_html
 
 # Get logger
 _log = logging.getLogger(__name__)
@@ -276,10 +276,11 @@ def save_to_disk(
                 record_list.append(r.as_record_dict())
                 if do_visualization:
                     viz_path = self.target / "visualizations" / f"{r.doc_id}.html"
-                    save_inspection_html(
+                    r.ground_truth_doc.save_as_html(
                         filename=viz_path,
-                        doc=r.ground_truth_doc,
                         labels=TRUE_HTML_EXPORT_LABELS,
+                        image_mode=ImageRefMode.EMBEDDED,
+                        split_page_view=True,
                     )
 
             save_shard_to_disk(

diff --git a/docling_eval/dataset_builders/file_dataset_builder.py b/docling_eval/dataset_builders/file_dataset_builder.py
@@ -100,14 +100,15 @@ def iterate(self) -> Iterable[DatasetRecord]:
 
         for filename in tqdm(
             selected_filenames,
-            desc="Processing files for DP-Bench",
+            desc=f"Processing files for {self.name}",
             ncols=128,
         ):
             mime_type, _ = mimetypes.guess_type(filename)
 
             # Create the ground truth Document
             true_doc = DoclingDocument(name=f"{filename}")
             if mime_type == "application/pdf":
+                _log.info(f"add_pages_to_true_doc: {filename}")
                 true_doc, _ = add_pages_to_true_doc(
                     pdf_path=filename, true_doc=true_doc, image_scale=2.0
                 )
@@ -126,6 +127,7 @@ def iterate(self) -> Iterable[DatasetRecord]:
                     image=image_ref,
                 )
 
+                _log.info(f"add_pages_to_true_doc: {filename}")
                 true_doc.pages[1] = page_item
             else:
                 raise ValueError(
@@ -139,18 +141,20 @@ def iterate(self) -> Iterable[DatasetRecord]:
                 page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
             )
 
-            # Get PDF as binary data
-            pdf_bytes = get_binary(filename)
-            pdf_stream = DocumentStream(name=filename.name, stream=BytesIO(pdf_bytes))
+            # Get source as binary data
+            source_bytes = get_binary(filename)
+            source_stream = DocumentStream(
+                name=filename.name, stream=BytesIO(source_bytes)
+            )
 
             # Create dataset record
             record = DatasetRecord(
                 doc_id=str(filename.name),
-                doc_hash=get_binhash(pdf_bytes),
+                doc_hash=get_binhash(source_bytes),
                 ground_truth_doc=true_doc,
                 ground_truth_pictures=true_pictures,
                 ground_truth_page_images=true_page_images,
-                original=pdf_stream,
+                original=source_stream,
                 mime_type=mime_type,
             )
 

diff --git a/docling_eval/evaluators/readingorder_evaluator.py b/docling_eval/evaluators/readingorder_evaluator.py
@@ -292,7 +292,7 @@ def _show_items(self, true_doc: DoclingDocument):
             )
             text = item.text if isinstance(item, TextItem) else None
             label = item.label  # type: ignore
-            print(f"True {i}: {level} - {label}: {bbox} - {text}")
+            # print(f"True {i}: {level} - {label}: {bbox} - {text}")
 
 
 class ReadingOrderVisualizer:

diff --git a/docling_eval/prediction_providers/base_prediction_provider.py b/docling_eval/prediction_providers/base_prediction_provider.py
@@ -165,7 +165,6 @@ def visualize_results(
                 / f"{prediction_record.doc_id}.html",
                 true_doc=gt_doc,
                 pred_doc=pred_doc,
-                page_image=prediction_record.ground_truth_page_images[0],
                 true_labels=self.true_labels,
                 pred_labels=self.pred_labels,
                 draw_reading_order=True,

diff --git a/docling_eval/prediction_providers/docling_provider.py b/docling_eval/prediction_providers/docling_provider.py
@@ -106,7 +106,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
     def info(self) -> Dict:
         """Get information about the prediction provider."""
 
-        return {
+        result = {
             "asset": PredictionProviderType.DOCLING,
             "version": docling_version(),
             "package_versions": {
@@ -128,10 +128,11 @@ def info(self) -> Dict:
                             mode="json", exclude_defaults=True
                         )
                         if v.pipeline_options is not None
-                        else {}
+                        else None  # None rather than {}: an empty dict maps to a field-less struct, which Parquet may refuse to write
                     ),
                 }
                 for k, v in self.doc_converter.format_to_options.items()
                 if k in [InputFormat.PDF, InputFormat.IMAGE]
             },
         }
+        return result
diff --git a/docling_eval/visualisation/constants.py b/docling_eval/visualisation/constants.py
@@ -309,7 +309,7 @@
             display: flex;
             flex-direction: column;
             width: 25%; /* Adjust the width of each item */
-            height: 100%; /* Adjust height to fill parent container */
+            height: 50%; /* Adjust height to fill parent container */
             border: 1px solid #ccc; /* Optional: Add borders */
             box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.1); /* Optional: Add shadow */
             background-color: #fff; /* Optional: Add background */