Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 95 additions & 4 deletions docling_eval/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,13 @@

# Configure logging
logging.getLogger("docling").setLevel(logging.WARNING)
logging.getLogger("PIL").setLevel(logging.WARNING)
logging.getLogger("transformers").setLevel(logging.WARNING)
logging.getLogger("datasets").setLevel(logging.WARNING)
logging.getLogger("filelock").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("docling_ibm_models").setLevel(logging.WARNING)

_log = logging.getLogger(__name__)

app = typer.Typer(
Expand Down Expand Up @@ -188,14 +195,17 @@ def get_dataset_builder(
name="CVAT", dataset_source=dataset_source, target=target, split=split
)
elif benchmark == BenchMarkNames.PLAIN_FILES:
assert dataset_source is not None
if dataset_source is None:
raise ValueError("dataset_source is required for PLAIN_FILES")

return FileDatasetBuilder(
name=dataset_source.name,
dataset_source=dataset_source,
target=target,
split=split,
begin_index=begin_index,
end_index=end_index,
)

else:
raise ValueError(f"Unsupported benchmark: {benchmark}")

Expand All @@ -209,7 +219,11 @@ def get_prediction_provider(
):
pipeline_options: PaginatedPipelineOptions
"""Get the appropriate prediction provider with default settings."""
if provider_type == PredictionProviderType.DOCLING:
if (
provider_type == PredictionProviderType.DOCLING
or provider_type == PredictionProviderType.OCR_DOCLING
or provider_type == PredictionProviderType.EasyOCR_DOCLING
):
ocr_factory = get_ocr_factory()

ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
Expand Down Expand Up @@ -238,6 +252,78 @@ def get_prediction_provider(
ignore_missing_predictions=True,
)

elif provider_type == PredictionProviderType.MacOCR_DOCLING:
ocr_factory = get_ocr_factory()

ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
kind="ocrmac",
)

pipeline_options = PdfPipelineOptions(
do_ocr=True,
ocr_options=ocr_options,
do_table_structure=True,
)

pipeline_options.images_scale = 2.0
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

if artifacts_path is not None:
pipeline_options.artifacts_path = artifacts_path

return DoclingPredictionProvider(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
},
do_visualization=do_visualization,
ignore_missing_predictions=True,
)

elif provider_type == PredictionProviderType.PDF_DOCLING:

ocr_factory = get_ocr_factory()

ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
kind="easyocr",
)

pdf_pipeline_options = PdfPipelineOptions(
do_ocr=False,
ocr_options=ocr_options, # we need to provide OCR options in order to not break the parquet serialization
do_table_structure=True,
)

pdf_pipeline_options.images_scale = 2.0
pdf_pipeline_options.generate_page_images = True
pdf_pipeline_options.generate_picture_images = True

ocr_pipeline_options = PdfPipelineOptions(
do_ocr=True,
ocr_options=ocr_options, # we need to provide OCR options in order to not break the parquet serialization
do_table_structure=True,
)

ocr_pipeline_options.images_scale = 2.0
ocr_pipeline_options.generate_page_images = True
ocr_pipeline_options.generate_picture_images = True

if artifacts_path is not None:
pdf_pipeline_options.artifacts_path = artifacts_path
ocr_pipeline_options.artifacts_path = artifacts_path

return DoclingPredictionProvider(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
InputFormat.IMAGE: PdfFormatOption(
pipeline_options=ocr_pipeline_options
),
},
do_visualization=do_visualization,
ignore_missing_predictions=True,
)

elif provider_type == PredictionProviderType.SMOLDOCLING:
pipeline_options = VlmPipelineOptions()

Expand Down Expand Up @@ -614,9 +700,14 @@ def create_cvat(
output_dir: Annotated[Path, typer.Option(help="Output directory")],
gt_dir: Annotated[Path, typer.Option(help="Dataset source path")],
bucket_size: Annotated[int, typer.Option(help="Size of CVAT tasks")] = 20,
use_predictions: Annotated[bool, typer.Option(help="use predictions")] = False,
):
"""Create dataset ready to upload to CVAT starting from (ground-truth) dataset."""
builder = CvatPreannotationBuilder(
dataset_source=gt_dir, target=output_dir, bucket_size=bucket_size
dataset_source=gt_dir,
target=output_dir,
bucket_size=bucket_size,
use_predictions=use_predictions,
)
builder.prepare_for_annotation()

Expand Down
5 changes: 5 additions & 0 deletions docling_eval/datamodels/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,11 @@ class PredictionProviderType(str, Enum):
"""Types of prediction providers available."""

DOCLING = "Docling"
PDF_DOCLING = "PDF_Docling"
OCR_DOCLING = "OCR_Docling"
MacOCR_DOCLING = "MacOCR_Docling"
EasyOCR_DOCLING = "EasyOCR_Docling"

TABLEFORMER = "TableFormer"
FILE = "File"
SMOLDOCLING = "SmolDocling"
Expand Down
7 changes: 4 additions & 3 deletions docling_eval/dataset_builders/dataset_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import ibm_boto3 # type: ignore
from docling.utils.utils import chunkify
from docling_core.types.doc.document import ImageRefMode
from huggingface_hub import snapshot_download
from pydantic import BaseModel

Expand All @@ -15,7 +16,6 @@
TRUE_HTML_EXPORT_LABELS,
)
from docling_eval.utils.utils import save_shard_to_disk, write_datasets_info
from docling_eval.visualisation.visualisations import save_inspection_html

# Get logger
_log = logging.getLogger(__name__)
Expand Down Expand Up @@ -276,10 +276,11 @@ def save_to_disk(
record_list.append(r.as_record_dict())
if do_visualization:
viz_path = self.target / "visualizations" / f"{r.doc_id}.html"
save_inspection_html(
r.ground_truth_doc.save_as_html(
filename=viz_path,
doc=r.ground_truth_doc,
labels=TRUE_HTML_EXPORT_LABELS,
image_mode=ImageRefMode.EMBEDDED,
split_page_view=True,
)

save_shard_to_disk(
Expand Down
16 changes: 10 additions & 6 deletions docling_eval/dataset_builders/file_dataset_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,14 +100,15 @@ def iterate(self) -> Iterable[DatasetRecord]:

for filename in tqdm(
selected_filenames,
desc="Processing files for DP-Bench",
desc=f"Processing files for {self.name}",
ncols=128,
):
mime_type, _ = mimetypes.guess_type(filename)

# Create the ground truth Document
true_doc = DoclingDocument(name=f"{filename}")
if mime_type == "application/pdf":
_log.info(f"add_pages_to_true_doc: {filename}")
true_doc, _ = add_pages_to_true_doc(
pdf_path=filename, true_doc=true_doc, image_scale=2.0
)
Expand All @@ -126,6 +127,7 @@ def iterate(self) -> Iterable[DatasetRecord]:
image=image_ref,
)

_log.info(f"add_pages_to_true_doc: {filename}")
true_doc.pages[1] = page_item
else:
raise ValueError(
Expand All @@ -139,18 +141,20 @@ def iterate(self) -> Iterable[DatasetRecord]:
page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
)

# Get PDF as binary data
pdf_bytes = get_binary(filename)
pdf_stream = DocumentStream(name=filename.name, stream=BytesIO(pdf_bytes))
# Get source as binary data
source_bytes = get_binary(filename)
source_stream = DocumentStream(
name=filename.name, stream=BytesIO(source_bytes)
)

# Create dataset record
record = DatasetRecord(
doc_id=str(filename.name),
doc_hash=get_binhash(pdf_bytes),
doc_hash=get_binhash(source_bytes),
ground_truth_doc=true_doc,
ground_truth_pictures=true_pictures,
ground_truth_page_images=true_page_images,
original=pdf_stream,
original=source_stream,
mime_type=mime_type,
)

Expand Down
2 changes: 1 addition & 1 deletion docling_eval/evaluators/readingorder_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ def _show_items(self, true_doc: DoclingDocument):
)
text = item.text if isinstance(item, TextItem) else None
label = item.label # type: ignore
print(f"True {i}: {level} - {label}: {bbox} - {text}")
# print(f"True {i}: {level} - {label}: {bbox} - {text}")


class ReadingOrderVisualizer:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,6 @@ def visualize_results(
/ f"{prediction_record.doc_id}.html",
true_doc=gt_doc,
pred_doc=pred_doc,
page_image=prediction_record.ground_truth_page_images[0],
true_labels=self.true_labels,
pred_labels=self.pred_labels,
draw_reading_order=True,
Expand Down
5 changes: 3 additions & 2 deletions docling_eval/prediction_providers/docling_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
def info(self) -> Dict:
"""Get information about the prediction provider."""

return {
result = {
"asset": PredictionProviderType.DOCLING,
"version": docling_version(),
"package_versions": {
Expand All @@ -128,10 +128,11 @@ def info(self) -> Dict:
mode="json", exclude_defaults=True
)
if v.pipeline_options is not None
else {}
else None # Parquet might not like empty dicts!
),
}
for k, v in self.doc_converter.format_to_options.items()
if k in [InputFormat.PDF, InputFormat.IMAGE]
},
}
return result
2 changes: 1 addition & 1 deletion docling_eval/visualisation/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@
display: flex;
flex-direction: column;
width: 25%; /* Adjust the width of each item */
height: 100%; /* Adjust height to fill parent container */
height: 50%; /* Adjust height to fill parent container */
border: 1px solid #ccc; /* Optional: Add borders */
box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.1); /* Optional: Add shadow */
background-color: #fff; /* Optional: Add background */
Expand Down
Loading