Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docling_eval/benchmarks/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

class BenchMarkColumns(str, Enum):
CONVERTER_TYPE = "converter_type"
DOCLING_VERSION = "docling_version"
CONVERTER_VERSION = "converter_version"
DOCLING_PIPELINE = "docling_pipeline"

STATUS = "status"
Expand Down
51 changes: 18 additions & 33 deletions docling_eval/benchmarks/cvat_annotation/create.py
Original file line number Diff line number Diff line change
@@ -1,66 +1,50 @@
import argparse
import copy
import glob
import json
import logging
import os
from pathlib import Path
from typing import Dict, Generator, Iterator, List, Optional, Tuple, cast
from typing import Dict, Iterator, List, Optional, Tuple, cast

import xmltodict # type: ignore[import]
from datasets import Dataset, load_dataset
from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.document import (
DocItem,
DoclingDocument,
FloatingItem,
GraphData,
ImageRef,
PageItem,
PictureItem,
ProvenanceItem,
TableData,
TableItem,
)
from docling_core.types.doc.labels import (
DocItemLabel,
GroupLabel,
PictureClassificationLabel,
TableCellLabel,
)
from docling_core.types.doc.labels import DocItemLabel
from docling_parse.pdf_parsers import pdf_parser_v2 # type: ignore[import]
from PIL import Image # as PILImage
from tqdm import tqdm # type: ignore

from docling_eval.benchmarks.constants import BenchMarkColumns, EvaluationModality
from docling_eval.benchmarks.constants import (
BenchMarkColumns,
ConverterTypes,
EvaluationModality,
)
from docling_eval.benchmarks.cvat_annotation.utils import (
AnnotatedDoc,
AnnotatedImage,
AnnotationBBox,
AnnotationLine,
AnnotationOverview,
BenchMarkDirs,
DocLinkLabel,
TableComponentLabel,
rgb_to_hex,
)
from docling_eval.benchmarks.utils import (
draw_clusters_with_reading_order,
get_binhash,
save_comparison_html_with_clusters,
save_inspection_html,
write_datasets_info,
)
from docling_eval.docling.conversion import create_docling_converter
from docling_eval.docling.utils import (
crop_bounding_box,
docling_version,
extract_images,
from_pil_to_base64uri,
get_binary,
insert_images,
get_binhash,
save_shard_to_disk,
write_datasets_info,
)
from docling_eval.converters.conversion import create_pdf_docling_converter
from docling_eval.visualisation.visualisations import save_comparison_html_with_clusters

# from pydantic import

Expand Down Expand Up @@ -602,14 +586,14 @@ def create_true_document(basename: str, annot: dict, desc: AnnotatedImage):
img_height = page_image.height

if pred_doc.pages[page_no] is None:
logging.error(f"Page item is None, skipping ...")
logging.error("Page item is None, skipping ...")
continue

pred_page_item = pred_doc.pages[page_no]

pred_page_imageref = pred_page_item.image
if pred_page_imageref is None:
logging.error(f"Page ImageRef is None, skipping ...")
logging.error("Page ImageRef is None, skipping ...")
continue

assert pred_page_imageref.size.width == img_width
Expand Down Expand Up @@ -644,14 +628,14 @@ def create_true_document(basename: str, annot: dict, desc: AnnotatedImage):
page_no = 1

if (page_no not in true_doc.pages) or (true_doc.pages[page_no] is None):
logging.error(f"Page item is None, skipping ...")
logging.error("Page item is None, skipping ...")
continue

true_page_item = true_doc.pages[page_no]

true_page_imageref = true_page_item.image
if true_page_imageref is None:
logging.error(f"Page ImageRef is None, skipping ...")
logging.error("Page ImageRef is None, skipping ...")
continue

true_page_pilimage = true_page_imageref.pil_image
Expand Down Expand Up @@ -943,7 +927,7 @@ def create_layout_dataset_from_annotations(

# Create Converter
image_scale = 2.0
doc_converter = create_docling_converter(page_image_scale=image_scale)
doc_converter = create_pdf_docling_converter(page_image_scale=image_scale)

records = []
for basename, desc, true_doc in tqdm(
Expand Down Expand Up @@ -1000,7 +984,8 @@ def create_layout_dataset_from_annotations(
)

record = {
BenchMarkColumns.DOCLING_VERSION: docling_version(),
BenchMarkColumns.CONVERTER_TYPE: ConverterTypes.DOCLING,
BenchMarkColumns.CONVERTER_VERSION: docling_version(),
BenchMarkColumns.STATUS: str(conv_results.status),
BenchMarkColumns.DOC_ID: str(basename),
BenchMarkColumns.DOC_PATH: str(basename),
Expand Down
Original file line number Diff line number Diff line change
@@ -1,33 +1,21 @@
import argparse
import copy
import glob
import json
import logging
import os
from pathlib import Path
from typing import Dict, List, Tuple

from datasets import Features
from datasets import Image as Features_Image
from datasets import Sequence, Value

from docling_eval.benchmarks.constants import BenchMarkColumns
from docling_eval.benchmarks.utils import (
add_pages_to_true_doc,
convert_html_table_into_docling_tabledata,
save_comparison_html,
save_comparison_html_with_clusters,
write_datasets_info,
)
from docling_eval.docling.conversion import create_docling_converter
from docling_eval.docling.utils import (
crop_bounding_box,
docling_version,
extract_images,
from_pil_to_base64uri,
get_binary,
save_shard_to_disk,
)
from docling_eval.converters.conversion import create_pdf_docling_converter


def parse_args():
Expand Down Expand Up @@ -61,7 +49,7 @@ def _write_datasets_info(
):
features = Features(
{
BenchMarkColumns.DOCLING_VERSION: Value("string"),
BenchMarkColumns.CONVERTER_VERSION: Value("string"),
BenchMarkColumns.STATUS: Value("string"),
BenchMarkColumns.DOC_ID: Value("string"),
# BenchMarkColumns.DOC_PATH: Value("string"),
Expand Down Expand Up @@ -113,7 +101,7 @@ def main():
os.makedirs(_)

# Create Converter
doc_converter = create_docling_converter(
doc_converter = create_pdf_docling_converter(
page_image_scale=image_scale, artifacts_path=artifacts_path
)

Expand All @@ -134,7 +122,7 @@ def main():
)

record = {
BenchMarkColumns.DOCLING_VERSION: docling_version(),
BenchMarkColumns.CONVERTER_VERSION: docling_version(),
BenchMarkColumns.STATUS: str(conv_results.status),
BenchMarkColumns.DOC_ID: str(os.path.basename(pdf_path)),
BenchMarkColumns.PREDICTION: json.dumps(pred_doc.export_to_dict()),
Expand Down
9 changes: 0 additions & 9 deletions docling_eval/benchmarks/cvat_annotation/eval.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,4 @@
import argparse
import logging
import os
from pathlib import Path

from huggingface_hub import snapshot_download
from tabulate import tabulate # type: ignore

from docling_eval.benchmarks.constants import BenchMarkNames, EvaluationModality
from docling_eval.cli.main import evaluate, visualise

# Configure logging
logging.basicConfig(
Expand Down
32 changes: 9 additions & 23 deletions docling_eval/benchmarks/cvat_annotation/preannotate.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,29 @@
import argparse
import copy
import glob
import json
import logging
import os
from pathlib import Path
from typing import Dict, List, Tuple

from datasets import Dataset, load_dataset
from docling_core.types.doc.base import BoundingBox, CoordOrigin, ImageRefMode, Size
from docling_core.types.doc.document import (
DocItem,
DoclingDocument,
PictureItem,
TableItem,
)
from docling_core.types.doc.labels import (
DocItemLabel,
GroupLabel,
PictureClassificationLabel,
TableCellLabel,
)
from pydantic import BaseModel
from typing import List

from datasets import load_dataset
from docling_core.types.doc.base import BoundingBox, ImageRefMode
from docling_core.types.doc.document import DocItem, DoclingDocument
from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel
from tqdm import tqdm # type: ignore

from docling_eval.benchmarks.constants import BenchMarkColumns
from docling_eval.benchmarks.cvat_annotation.utils import (
AnnotatedDoc,
AnnotatedImage,
AnnotationBBox,
AnnotationLine,
AnnotationOverview,
BenchMarkDirs,
DocLinkLabel,
TableComponentLabel,
rgb_to_hex,
)
from docling_eval.benchmarks.utils import get_binhash
from docling_eval.docling.utils import insert_images
from docling_eval.benchmarks.utils import get_binhash, insert_images

# Configure logging
logging.basicConfig(
Expand Down Expand Up @@ -252,11 +238,11 @@ def create_cvat_preannotation_file_for_single_page(
annotated_image.page_nos = [page_no]
overview.img_annotations[filename] = annotated_image
else:
logging.warning(f"missing pillow image of the page, skipping ...")
logging.warning("missing pillow image of the page, skipping ...")
continue

else:
logging.warning(f"missing image-ref of the page, skipping ...")
logging.warning("missing image-ref of the page, skipping ...")
continue

page_bboxes: List[AnnotationBBox] = []
Expand Down
9 changes: 2 additions & 7 deletions docling_eval/benchmarks/cvat_annotation/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,8 @@
from pathlib import Path
from typing import Dict, List, Tuple

from docling_core.types.doc.base import BoundingBox, CoordOrigin, ImageRefMode, Size
from docling_core.types.doc.labels import (
DocItemLabel,
GroupLabel,
PictureClassificationLabel,
TableCellLabel,
)
from docling_core.types.doc.base import BoundingBox
from docling_core.types.doc.labels import DocItemLabel
from pydantic import BaseModel


Expand Down
Loading