210 changes: 23 additions & 187 deletions README.md
@@ -21,199 +21,35 @@ Evaluate docling on various datasets. You can use the CLI

```sh
docling-eval % poetry run evaluate --help
2024-12-20 10:51:57,593 - INFO - PyTorch version 2.5.1 available.

Usage: evaluate [OPTIONS]

╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ * --task -t [create|evaluate|visualize] Evaluation task [default: None] [required] │
│ * --modality -m [end-to-end|layout|tableformer|codeformer] Evaluation modality [default: None] [required] │
│ * --benchmark -b [DPBench|OmniDcoBench|WordScape|PubLayNet|DocLayNetV1|Pub1M|PubTabNet|FinTabNet|WikiTabNet] Benchmark name [default: None] [required] │
│ * --input-dir -i PATH Input directory [default: None] [required] │
│ * --output-dir -o PATH Output directory [default: None] [required] │
│ --help Show this message and exit. │
╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ * --task -t [create|evaluate|visualize] Evaluation task [default: None] [required] │
│ * --modality -m [end-to-end|layout|table_structure|code_transcription|math Evaluation modality [default: None] [required] │
│ _transcription|reading_order|markdown_text|captioning|bbox │
│ es_text] │
│ * --benchmark -b [DPBench|OmniDocBench|WordScape|PubLayNet|DocLayNetV1|DocL Benchmark name [default: None] [required] │
│ ayNetV2|FUNSD|Pub1M|PubTabNet|FinTabNet|WikiTabNet] │
│ * --output-dir -o PATH Output directory [default: None] [required] │
│ --input-dir -i PATH Input directory [default: None] │
│ --converter_type -c [Docling|SmolDocling] Type of document converter [default: Docling] │
│ --split -s TEXT Dataset split [default: test] │
│ --artifacts-path -a PATH Load artifacts from local path [default: None] │
│ --max-items -n INTEGER How many items to load from the original dataset │
│ [default: 1000] │
│ --help Show this message and exit. │
╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
```

## End to End examples
## Benchmarks

### FinTabNet

Using a single command (loading the dataset from Hugging Face: [FinTabNet_OTSL](https://huggingface.co/datasets/ds4sd/FinTabNet_OTSL)),

```sh
poetry run python docs/examples/benchmark_fintabnet.py
```

<details>
<summary><b>Table evaluations for FinTabNet</b></summary>
<br>

👉 Evaluate the dataset:

```sh
poetry run evaluate \
-t evaluate \
-m tableformer \
-b FinTabNet \
-i benchmarks/FinTabNet-dataset/tableformer \
-o benchmarks/FinTabNet-dataset/tableformer
```

[Tableformer evaluation json](docs/evaluations/FinTabNet/evaluation_FinTabNet_tableformer.json)

👉 Visualize the dataset:

```sh
poetry run evaluate \
-t visualize \
-m tableformer \
-b FinTabNet \
-i benchmarks/FinTabNet-dataset/tableformer \
-o benchmarks/FinTabNet-dataset/tableformer
```

![TEDS plot](docs/evaluations/FinTabNet/evaluation_FinTabNet_tableformer-delta_row_col.png)

![TEDS struct only plot](docs/evaluations/FinTabNet/evaluation_FinTabNet_tableformer_TEDS_struct-only.png)

[TEDS struct only report](docs/evaluations/FinTabNet/evaluation_FinTabNet_tableformer_TEDS_struct-only.txt)

![TEDS struct with text plot](docs/evaluations/FinTabNet/evaluation_FinTabNet_tableformer_TEDS_struct-with-text.png)

[TEDS struct with text report](docs/evaluations/FinTabNet/evaluation_FinTabNet_tableformer_TEDS_struct-with-text.txt)

</details>

### DocLayNet v1

Using a single command,

```sh
poetry run python ./docs/examples/benchmark_doclaynet_v1.py
```

This command downloads the DocLayNet v1.1 dataset, runs the evaluations and produces the following files:

<details>
<summary><b>Layout evaluation</b></summary>
<br>

- [Layout evaluation json](docs/evaluations/DocLayNetV1/evaluation_DocLayNetV1_layout.json)
- [mAP[0.5:0.95] report](docs/evaluations/DocLayNetV1/evaluation_DocLayNetV1_layout_mAP_0.5_0.95.txt)
- [mAP[0.5:0.95] plot](docs/evaluations/DocLayNetV1/evaluation_DocLayNetV1_layout_mAP_0.5_0.95.png)

</details>


### Pub1M

Using a single command (loading the dataset from Hugging Face: [Pub1M_OTSL](https://huggingface.co/datasets/ds4sd/Pub1M_OTSL)),

```sh
poetry run python docs/examples/benchmark_p1m.py
```

<details>
<summary><b>Table evaluations for Pub1M</b></summary>
<br>

👉 Evaluate the dataset:

```sh
poetry run evaluate \
-t evaluate \
-m tableformer \
-b Pub1M \
-i benchmarks/Pub1M-dataset/tableformer \
-o benchmarks/Pub1M-dataset/tableformer
```

[Tableformer evaluation json](docs/evaluations/Pub1M/evaluation_Pub1M_tableformer.json)

👉 Visualize the dataset:

```sh
poetry run evaluate \
-t visualize \
-m tableformer \
-b Pub1M \
-i benchmarks/Pub1M-dataset/tableformer \
-o benchmarks/Pub1M-dataset/tableformer
```

![TEDS plot](docs/evaluations/Pub1M/evaluation_Pub1M_tableformer-delta_row_col.png)

![TEDS struct only plot](docs/evaluations/Pub1M/evaluation_Pub1M_tableformer_TEDS_struct-only.png)

[TEDS struct only report](docs/evaluations/Pub1M/evaluation_Pub1M_tableformer_TEDS_struct-only.txt)

![TEDS struct with text plot](docs/evaluations/Pub1M/evaluation_Pub1M_tableformer_TEDS_struct-with-text.png)

[TEDS struct with text report](docs/evaluations/Pub1M/evaluation_Pub1M_tableformer_TEDS_struct-with-text.txt)

</details>


### PubTabNet

Using a single command (loading the dataset from Hugging Face: [Pubtabnet_OTSL](https://huggingface.co/datasets/ds4sd/Pubtabnet_OTSL)),

```sh
poetry run python ./docs/examples/benchmark_pubtabnet.py
```

<details>
<summary><b>Table evaluations for PubTabNet</b></summary>
<br>

👉 Evaluate the dataset:

```sh
poetry run evaluate \
-t evaluate \
-m tableformer \
-b PubTabNet \
-i benchmarks/PubTabNet-dataset/tableformer \
-o benchmarks/PubTabNet-dataset/tableformer
```

[Tableformer evaluation json](docs/evaluations/PubTabNet/evaluation_PubTabNet_tableformer.json)

👉 Visualize the dataset:

```sh
poetry run evaluate \
-t visualize \
-m tableformer \
-b PubTabNet \
-i benchmarks/PubTabNet-dataset/tableformer \
-o benchmarks/PubTabNet-dataset/tableformer
```

![TEDS plot](docs/evaluations/PubTabNet/evaluation_PubTabNet_tableformer-delta_row_col.png)

![TEDS struct only plot](docs/evaluations/PubTabNet/evaluation_PubTabNet_tableformer_TEDS_struct-only.png)

[TEDS struct only report](docs/evaluations/PubTabNet/evaluation_PubTabNet_tableformer_TEDS_struct-only.txt)

![TEDS struct with text plot](docs/evaluations/PubTabNet/evaluation_PubTabNet_tableformer_TEDS_struct-with-text.png)

[TEDS struct with text report](docs/evaluations/PubTabNet/evaluation_PubTabNet_tableformer_TEDS_struct-with-text.txt)


</details>


## DP-Bench

[See DP-Bench benchmarks](docs/DP-Bench_benchmarks.md)


## OmniDocBench

[See OmniDocBench benchmarks](docs/OmniDocBench_benchmarks.md)
- [DP-Bench benchmarks](docs/DP-Bench_benchmarks.md): Text, layout, reading order and table structure evaluation on the DP-Bench dataset.
- [OmniDocBench benchmarks](docs/OmniDocBench_benchmarks.md): Text, layout, reading order and table structure evaluation on the OmniDocBench dataset.
- [DocLayNetV1 benchmarks](docs/DocLayNetv1_benchmarks.md): Text and layout evaluation on the DocLayNet v1.2 dataset.
- [FinTabNet benchmarks](docs/FinTabNet_benchmarks.md): Table structure evaluation on the FinTabNet dataset.
- [PubTabNet benchmarks](docs/PubTabNet_benchmarks.md): Table structure evaluation on the PubTabNet dataset.
- [Pub1M benchmarks](docs/P1M_benchmarks.md): Table structure evaluation on the Pub1M dataset.


## Contributing
13 changes: 12 additions & 1 deletion docling_eval/benchmarks/doclaynet_v1/create.py
@@ -180,6 +180,7 @@ def create_dlnv1_e2e_dataset(
converter_type: ConverterTypes = ConverterTypes.DOCLING,
do_viz: bool = False,
max_items: int = -1, # If -1 take the whole split
do_save_page_text: bool = False,
):
ds = load_dataset(name, split=split)

@@ -217,6 +218,16 @@

pred_doc = conv_results.document

# Debugging aid: dump the VLM-predicted text of each page to a file
if do_save_page_text:
debug_dir = output_dir / "debug"
os.makedirs(debug_dir, exist_ok=True)
if len(conv_results.pages):
for page_id, page in enumerate(conv_results.pages):
page_text_fn = debug_dir / f"{page_hash}_{page_id}.txt"
with open(page_text_fn, "w") as fd:
fd.write(page.predictions.vlm_response.text)

true_doc = DoclingDocument(name=page_hash)
true_doc, true_page_images = add_pages_to_true_doc(
pdf_path=pdf_stream, true_doc=true_doc, image_scale=1.0
@@ -272,7 +283,7 @@ def create_dlnv1_e2e_dataset(
BenchMarkColumns.MIMETYPE: "image/png",
BenchMarkColumns.MODALITIES: [
EvaluationModality.LAYOUT,
EvaluationModality.READING_ORDER,
EvaluationModality.MARKDOWN_TEXT,
],
}
pdf_stream.close()
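
For reference, a minimal sketch of how the new `do_save_page_text` flag might be called. Only the keyword names visible in this diff (`name`, `split`, `converter_type`, `max_items`, `do_save_page_text`, plus the `output_dir` used by the debug block) are taken from the source; the concrete values are illustrative assumptions:

```python
from pathlib import Path

from docling_eval.benchmarks.doclaynet_v1.create import create_dlnv1_e2e_dataset

# Hypothetical invocation: the dataset id and output directory are placeholders.
# The debug dump is only meaningful when the selected converter populates
# page.predictions.vlm_response (i.e. a VLM pipeline such as SmolDocling).
create_dlnv1_e2e_dataset(
    name="ds4sd/DocLayNet-v1.1",
    split="test",
    output_dir=Path("benchmarks/DLNv1-dataset"),
    max_items=50,
    do_save_page_text=True,  # writes <output_dir>/debug/<page_hash>_<page_id>.txt
)
```
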
13 changes: 2 additions & 11 deletions docling_eval/benchmarks/dpbench/create.py
@@ -298,17 +298,6 @@ def create_dpbench_e2e_dataset(
)

if do_viz:
"""
save_comparison_html(
filename=viz_dir / f"{os.path.basename(pdf_path)}-comp.html",
true_doc=true_doc,
pred_doc=pred_doc,
page_image=true_page_images[0],
true_labels=TRUE_HTML_EXPORT_LABELS,
pred_labels=PRED_HTML_EXPORT_LABELS,
)
"""

save_comparison_html_with_clusters(
filename=viz_dir / f"{os.path.basename(pdf_path)}-clusters.html",
true_doc=true_doc,
@@ -345,7 +334,9 @@ def create_dpbench_e2e_dataset(
BenchMarkColumns.MIMETYPE: "application/pdf",
BenchMarkColumns.MODALITIES: [
EvaluationModality.LAYOUT,
EvaluationModality.MARKDOWN_TEXT,
EvaluationModality.READING_ORDER,
EvaluationModality.TABLE_STRUCTURE,
],
}
records.append(record)
5 changes: 1 addition & 4 deletions docling_eval/benchmarks/funsd/create.py
@@ -523,10 +523,7 @@ def create_funsd_dataset(
BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures,
BenchMarkColumns.ORIGINAL: img_bytes,
BenchMarkColumns.MIMETYPE: "image/png",
BenchMarkColumns.MODALITIES: [
EvaluationModality.LAYOUT,
EvaluationModality.READING_ORDER,
],
BenchMarkColumns.MODALITIES: [],
}
records.append(record)
count += 1
13 changes: 2 additions & 11 deletions docling_eval/benchmarks/omnidocbench/create.py
@@ -324,17 +324,6 @@ def create_omnidocbench_e2e_dataset(
)

if do_viz:
"""
save_comparison_html(
filename=viz_dir / f"{os.path.basename(pdf_path)}-comp.html",
true_doc=true_doc,
pred_doc=pred_doc,
page_image=true_page_images[0],
true_labels=TRUE_HTML_EXPORT_LABELS,
pred_labels=PRED_HTML_EXPORT_LABELS,
)
"""

save_comparison_html_with_clusters(
filename=viz_dir / f"{os.path.basename(pdf_path)}-clusters.html",
true_doc=true_doc,
@@ -372,7 +361,9 @@
BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures,
BenchMarkColumns.MODALITIES: [
EvaluationModality.LAYOUT,
EvaluationModality.MARKDOWN_TEXT,
EvaluationModality.READING_ORDER,
EvaluationModality.TABLE_STRUCTURE,
],
}
records.append(record)
2 changes: 1 addition & 1 deletion docling_eval/benchmarks/utils.py
@@ -154,7 +154,7 @@ def yield_cells_from_html_table(
):
soup = BeautifulSoup(table_html, "html.parser")
table = soup.find("table") or soup # Ensure table context
rows = table.find_all("tr")
rows = table.find_all("tr") # type: ignore

max_cols = 0
for row in rows:
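
For context on the `# type: ignore` added above: bs4's type stubs annotate `find()` as returning `Tag | NavigableString | None`, so mypy cannot prove the result supports `find_all()`. A self-contained sketch of the same pattern, using an `isinstance` narrow as the alternative to the ignore comment (the HTML snippet is made up for illustration):

```python
from bs4 import BeautifulSoup
from bs4.element import Tag

table_html = "<table><tr><td>a</td><td>b</td></tr><tr><td>c</td><td>d</td></tr></table>"
soup = BeautifulSoup(table_html, "html.parser")

# find() is typed as Tag | NavigableString | None, hence the narrowing below.
table = soup.find("table") or soup
if isinstance(table, Tag):  # BeautifulSoup itself subclasses Tag, so this covers the fallback
    for row in table.find_all("tr"):
        print([cell.get_text() for cell in row.find_all(["td", "th"])])
```
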
5 changes: 0 additions & 5 deletions docling_eval/cli/main.py
@@ -40,11 +40,6 @@
DatasetMarkdownEvaluation,
MarkdownTextEvaluator,
)

# from docling_eval.evaluators.readingorder_evaluator_glm import (
# DatasetReadingOrderEvaluation,
# ReadingOrderEvaluatorGlm,
# )
from docling_eval.evaluators.readingorder_evaluator import ReadingOrderEvaluator
from docling_eval.evaluators.stats import DatasetStatistics
from docling_eval.evaluators.table_evaluator import (
12 changes: 5 additions & 7 deletions docling_eval/converters/conversion.py
@@ -11,11 +11,11 @@
OcrOptions,
PdfPipelineOptions,
RapidOcrOptions,
SmolDoclingOptions,
TableFormerMode,
TesseractCliOcrOptions,
TesseractOcrOptions,
VlmPipelineOptions,
smoldocling_vlm_conversion_options,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -134,12 +134,10 @@ def create_image_docling_converter(
def create_smol_docling_converter(
timings: bool = True,
):
vlm_options = SmolDoclingOptions()
pipeline_options = VlmPipelineOptions(
generate_page_images=True,
force_backend_text=False,
vlm_options=vlm_options,
)
pipeline_options = VlmPipelineOptions()
pipeline_options.generate_page_images = True
pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
pipeline_options.vlm_options = smoldocling_vlm_conversion_options

converter = DocumentConverter(
format_options={
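
A minimal usage sketch for the reworked helper, assuming it returns the `DocumentConverter` constructed above; the sample file path is a placeholder:

```python
from docling_eval.converters.conversion import create_smol_docling_converter

converter = create_smol_docling_converter()

# Placeholder input path; any PDF or image format supported by docling should work.
result = converter.convert("tests/data/sample_page.png")
print(result.document.export_to_markdown()[:500])  # preview the converted document
```

Compared with the deleted `SmolDoclingOptions` block, the new code reuses the `smoldocling_vlm_conversion_options` preset shipped with docling and switches on FlashAttention 2 via the accelerator options, which only takes effect on CUDA devices.
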
2 changes: 1 addition & 1 deletion docs/CVAT_create_groundtruth.md
@@ -7,7 +7,7 @@ To start creating ground-truth, you first need to have a dataset in parquet form
The first way is to use an existing dataset (e.g. DP-Bench), which may already come with pre-annotated tables, formulas, etc. In that case, you can run the create script of that particular benchmark.

```sh
poetry run python ./docs/examples/benchmark_dpbench.py
poetry run python docs/examples/benchmark_dpbench.py
```

Another way is to create it from PDF or PNG files. In this case, simply run the following scripts,