# Docling

In [1]:
import datetime
import logging
import time
from pathlib import Path
import pandas as pd

In [2]:
# Documents converted by every example in this notebook.
input_doc_paths = [Path("../data/coca-cola-business-and-sustainability-report-2018.pdf")]
# Directory that receives the files written by the examples.
output_dir = Path("./scratch")

# Export multimodal Docling Example

In [1]:
from docling.datamodel.base_models import AssembleOptions, ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.utils.export import generate_multimodal_pages

# Logger named after the current module; used for progress and failure messages.
_log = logging.getLogger(__name__)

# Scale factor that main() assigns to AssembleOptions.images_scale. A scale of 1
# corresponds to a 72 DPI page image, so 2.0 requests 144 DPI.
IMAGE_RESOLUTION_SCALE = 2.0


def main():
    """Convert the input documents and export their pages to one parquet file.

    Each page yielded by ``generate_multimodal_pages`` for a successfully
    converted document becomes one row holding the raw page image bytes, the
    page cells and segments, the three content renderings (``contents``,
    ``contents_md``, ``contents_dt``) and some page metadata. All rows are
    flattened with ``pd.json_normalize`` and written to a timestamped
    ``multimodal_*.parquet`` file inside ``output_dir``.

    Raises:
        RuntimeError: If at least one document failed to convert. The parquet
            file with the successful documents is still written first.
    """
    logging.basicConfig(level=logging.INFO)
    conversion_input = DocumentConversionInput.from_paths(input_doc_paths)

    # Page images must be kept explicitly: without AssembleOptions.images_scale
    # the DocumentConverter discards them to free memory. The same setting
    # defines the image scale, where 1 corresponds to a standard 72 DPI image.
    options = AssembleOptions()
    options.images_scale = IMAGE_RESOLUTION_SCALE
    converter = DocumentConverter(assemble_options=options)

    started_at = time.time()
    results = converter.convert(conversion_input)

    n_succeeded = 0
    n_failed = 0
    output_dir.mkdir(parents=True, exist_ok=True)
    page_rows = []

    for result in results:
        if result.status != ConversionStatus.SUCCESS:
            _log.info(f"Document {result.input.file} failed to convert.")
            n_failed += 1
        else:
            pages = generate_multimodal_pages(result)
            for text, markdown, doctags, cells, segments, page in pages:
                # Express the page image scale as dots per inch.
                dpi = page._default_image_scale * 72

                row = {
                    "document": result.input.file.name,
                    "hash": result.input.document_hash,
                    "page_hash": page.page_hash,
                    "image": {
                        "width": page.image.width,
                        "height": page.image.height,
                        "bytes": page.image.tobytes(),
                    },
                    "cells": cells,
                    "contents": text,
                    "contents_md": markdown,
                    "contents_dt": doctags,
                    "segments": segments,
                    "extra": {
                        # page_no is zero-based; store a one-based page number.
                        "page_num": page.page_no + 1,
                        "width_in_points": page.size.width,
                        "height_in_points": page.size.height,
                        "dpi": dpi,
                    },
                }
                page_rows.append(row)
            n_succeeded += 1

    print("Success count:", n_succeeded)

    # One parquet file collects the rows of all documents.
    frame = pd.json_normalize(page_rows)
    timestamp = datetime.datetime.now()
    output_filename = output_dir / f"multimodal_{timestamp:%Y-%m-%d_%H%M%S}.parquet"
    frame.to_parquet(output_filename)

    elapsed = time.time() - started_at
    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")

    if n_failed > 0:
        raise RuntimeError(
            f"The example failed converting {n_failed} on {len(input_doc_paths)}."
        )


if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 57344.00it/s]
  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Processing document coca-cola-business-and-sustainability-report-2018.pdf
INFO:docling.document_converter:Finished converting page batch time=8.561
INFO:docling.document_converter:Finished converting page batch time=9.153
INFO:docling.document_converter:Finished converting page batch time=13.371
INFO:docling.document_converter:Finished converting page batch time=5.881
INFO:docling.document_converter:Finished converting page batch time=13.749
INFO:docling.document_converter:Finished converting page batch time=8.666
INFO:docling.document_converter:Finished converting page batch time=15.077
INFO:docling.document_converter:Fin

Success count: 1


INFO:__main__:All documents were converted in 186.33 seconds.


### Load

In [None]:
# This block demonstrates how the file can be opened with the HF datasets library
from datasets import Dataset
from PIL import Image

# `output_filename` is a local variable of main(), so it does not exist at this
# level. Pick the newest export instead: the timestamp embedded in the file
# name (YYYY-MM-DD_HHMMSS) sorts chronologically.
parquet_files = sorted(output_dir.glob("multimodal_*.parquet"))
if not parquet_files:
    raise FileNotFoundError(
        f"No multimodal_*.parquet export found in {output_dir}; run the export first."
    )
output_filename = parquet_files[-1]
multimodal_df = pd.read_parquet(output_filename)

# Convert pandas DataFrame to Hugging Face Dataset and load bytes into image
dataset = Dataset.from_pandas(multimodal_df)


def transforms(examples):
    """Rebuild the page image of one dataset row from its raw bytes.

    The nested "image" dict of each exported row was flattened by
    ``pd.json_normalize``, hence the dotted "image.width", "image.height"
    and "image.bytes" column names.
    """
    examples["image"] = Image.frombytes(
        "RGB",
        (examples["image.width"], examples["image.height"]),
        examples["image.bytes"],
        "raw",
    )
    return examples


dataset = dataset.map(transforms)

# Convert a single Document

In [3]:
from docling.document_converter import DocumentConverter

# Convert the first input document with a converter built without any options.
source = input_doc_paths[0]
converter = DocumentConverter()
result = converter.convert_single(source)
md = result.render_as_markdown()

# Create the output directory so this cell also works when run on its own, and
# write UTF-8 explicitly: the platform default encoding may be unable to
# represent every character of the converted document.
output_dir.mkdir(parents=True, exist_ok=True)
with open(output_dir / "markdown.md", "w", encoding="utf-8") as file:
    file.write(md)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 34460.24it/s]
  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


# Custom Conversion

In [None]:
import json
import logging
import time
from pathlib import Path
from typing import Iterable

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)


def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    """Write every successfully converted document to ``output_dir``.

    For each result whose status is ``ConversionStatus.SUCCESS`` four files
    named after the stem of the input file are written: ``.json``
    (``render_as_dict``), ``.txt`` (``render_as_text``), ``.md``
    (``render_as_markdown``) and ``.doctags`` (``render_as_doctags``).
    Results with any other status are logged and counted as failures.

    Args:
        conv_results: Conversion results to export; consumed exactly once.
        output_dir: Target directory, created (with parents) if missing.

    Returns:
        A ``(success_count, failure_count)`` tuple.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem

            # All files are written as UTF-8 explicitly. The platform default
            # encoding (e.g. a legacy code page) may be unable to encode
            # characters of the converted document.

            # Export Deep Search document JSON format:
            with (output_dir / f"{doc_filename}.json").open(
                "w", encoding="utf-8"
            ) as fp:
                json.dump(conv_res.render_as_dict(), fp)

            # Export Text format:
            with (output_dir / f"{doc_filename}.txt").open(
                "w", encoding="utf-8"
            ) as fp:
                fp.write(conv_res.render_as_text())

            # Export Markdown format:
            with (output_dir / f"{doc_filename}.md").open(
                "w", encoding="utf-8"
            ) as fp:
                fp.write(conv_res.render_as_markdown())

            # Export Document Tags format:
            with (output_dir / f"{doc_filename}.doctags").open(
                "w", encoding="utf-8"
            ) as fp:
                fp.write(conv_res.render_as_doctags())

        else:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
    )

    return success_count, failure_count

### PyPdfium without OCR

In [None]:
def main():
    """Convert the input documents with the PyPdfium backend, OCR disabled.

    Table structure recognition is enabled, but without cell matching. The
    conversion results are handed to ``export_documents``, which writes them
    to ``./scratch``.

    Raises:
        RuntimeError: If at least one document failed to convert.
    """
    logging.basicConfig(level=logging.INFO)

    options = PipelineOptions()
    options.do_ocr = False
    options.do_table_structure = True
    options.table_structure_options.do_cell_matching = False

    converter = DocumentConverter(
        pdf_backend=PyPdfiumDocumentBackend,
        pipeline_options=options,
    )

    # Define input files
    documents = DocumentConversionInput.from_paths(input_doc_paths)

    started_at = time.time()
    _, failed = export_documents(
        converter.convert(documents), output_dir=Path("./scratch")
    )
    elapsed = time.time() - started_at

    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")

    if failed > 0:
        raise RuntimeError(
            f"The example failed converting {failed} on {len(input_doc_paths)}."
        )


if __name__ == "__main__":
    main()

### PyPdfium with OCR

In [None]:
def main():
    """Convert the input documents with the PyPdfium backend, OCR enabled.

    Table structure recognition with cell matching is enabled as well. The
    conversion results are handed to ``export_documents``, which writes them
    to ``./scratch``.

    Raises:
        RuntimeError: If at least one document failed to convert.
    """
    logging.basicConfig(level=logging.INFO)
    pipeline_options = PipelineOptions()
    # This is the "with OCR" variant, so OCR has to be switched on. It was
    # previously set to False, which made the run skip OCR entirely.
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=PyPdfiumDocumentBackend,
    )

    # Define input files (named so that the builtin `input` is not shadowed)
    input_files = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    conv_results = doc_converter.convert(input_files)
    # Only the failure count is needed here; the success count is discarded.
    _, failure_count = export_documents(
        conv_results, output_dir=Path("./scratch")
    )

    elapsed = time.time() - start_time

    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
        )


if __name__ == "__main__":
    main()


### Docling Parse without OCR

In [None]:
def main():
    """Convert the input documents with the Docling Parse backend, OCR disabled.

    Table structure recognition with cell matching is enabled. The conversion
    results are handed to ``export_documents``, which writes them to
    ``./scratch``.

    Raises:
        RuntimeError: If at least one document failed to convert.
    """
    logging.basicConfig(level=logging.INFO)

    opts = PipelineOptions()
    opts.do_ocr = False
    opts.do_table_structure = True
    opts.table_structure_options.do_cell_matching = True

    converter = DocumentConverter(
        pdf_backend=DoclingParseDocumentBackend,
        pipeline_options=opts,
    )

    # Define input files
    sources = DocumentConversionInput.from_paths(input_doc_paths)

    t0 = time.time()
    results = converter.convert(sources)
    _, n_failed = export_documents(results, output_dir=Path("./scratch"))
    duration = time.time() - t0

    _log.info(f"All documents were converted in {duration:.2f} seconds.")

    if n_failed > 0:
        raise RuntimeError(
            f"The example failed converting {n_failed} on {len(input_doc_paths)}."
        )


if __name__ == "__main__":
    main()

### Docling Parse with OCR

In [None]:
def main():
    """Convert the input documents with the Docling Parse backend, OCR enabled.

    Table structure recognition with cell matching is enabled as well. The
    conversion results are handed to ``export_documents``, which writes them
    to ``./scratch``.

    Raises:
        RuntimeError: If at least one document failed to convert.
    """
    logging.basicConfig(level=logging.INFO)

    settings = PipelineOptions()
    settings.do_ocr = True
    settings.do_table_structure = True
    settings.table_structure_options.do_cell_matching = True

    converter = DocumentConverter(
        pdf_backend=DoclingParseDocumentBackend,
        pipeline_options=settings,
    )

    # Define input files
    batch = DocumentConversionInput.from_paths(input_doc_paths)

    began = time.time()
    outcome = export_documents(converter.convert(batch), output_dir=Path("./scratch"))
    took = time.time() - began

    # export_documents returns (success_count, failure_count).
    failures = outcome[1]

    _log.info(f"All documents were converted in {took:.2f} seconds.")

    if failures > 0:
        raise RuntimeError(
            f"The example failed converting {failures} on {len(input_doc_paths)}."
        )


if __name__ == "__main__":
    main()