# Docling

In [6]:
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker

In [7]:
source = "../docs/AR for improved learnability.pdf"
converter = DocumentConverter()
result = converter.convert(source)

In [8]:
doc = result.document

In [23]:
chunker = HybridChunker(tokenizer="BAAI/bge-small-en-v1.5")
chunk_iter = chunker.chunk(doc)

Token indices sequence length is longer than the specified maximum sequence length for this model (856 > 512). Running this sequence through the model will result in indexing errors


In [24]:
result.document.pages

{1: PageItem(size=Size(width=595.2760009765625, height=793.7009887695312), image=None, page_no=1),
 2: PageItem(size=Size(width=595.2760009765625, height=793.7009887695312), image=None, page_no=2),
 3: PageItem(size=Size(width=595.2760009765625, height=793.7009887695312), image=None, page_no=3),
 4: PageItem(size=Size(width=595.2760009765625, height=793.7009887695312), image=None, page_no=4),
 5: PageItem(size=Size(width=595.2760009765625, height=793.7009887695312), image=None, page_no=5),
 6: PageItem(size=Size(width=595.2760009765625, height=793.7009887695312), image=None, page_no=6),
 7: PageItem(size=Size(width=595.2760009765625, height=793.7009887695312), image=None, page_no=7),
 8: PageItem(size=Size(width=595.2760009765625, height=793.7009887695312), image=None, page_no=8),
 9: PageItem(size=Size(width=595.2760009765625, height=793.7009887695312), image=None, page_no=9)}

In [27]:
chunk_0 = list(chunk_iter)[-1]

IndexError: list index out of range

In [29]:
result.document.tables

[TableItem(self_ref='#/tables/0', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.TABLE: 'table'>, prov=[ProvenanceItem(page_no=3, bbox=BoundingBox(l=37.23048400878906, t=163.27099609375, r=288.4778747558594, b=53.53057861328125, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 0))], captions=[RefItem(cref='#/texts/47')], references=[], footnotes=[], image=None, data=TableData(table_cells=[TableCell(bbox=BoundingBox(l=43.59700012207031, t=160.01300048828125, r=72.4219970703125, b=145.60400390625, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), row_span=1, col_span=1, start_row_offset_idx=0, end_row_offset_idx=1, start_col_offset_idx=0, end_col_offset_idx=1, text='Cognitive  process', column_header=True, row_header=False, row_section=False), TableCell(bbox=BoundingBox(l=98.36100006103516, t=160.01300048828125, r=162.177001953125, b=154.1649932861328, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), r

In [30]:
result.document.pictures

[PictureItem(self_ref='#/pictures/0', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.PICTURE: 'picture'>, prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=36.224613189697266, t=732.672119140625, r=96.2625732421875, b=665.4491577148438, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 0))], captions=[], references=[], footnotes=[], image=None, annotations=[]),
 PictureItem(self_ref='#/pictures/1', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.PICTURE: 'picture'>, prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=499.68072509765625, t=739.1707763671875, r=557.9953002929688, b=666.0467529296875, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 0))], captions=[], references=[], footnotes=[], image=None, annotations=[]),
 PictureItem(self_ref='#/pictures/2', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY

In [31]:
import logging
import time
from pathlib import Path

In [32]:
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

In [33]:
_log = logging.getLogger(__name__)
IMAGE_RESOLUTION_SCALE = 2.0

In [34]:
def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("../docs/AR for improved learnability.pdf")
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 correspond of a standard 72 DPI image
    # The PdfPipelineOptions.generate_* are the selectors for the document elements which will be enriched
    # with the image field
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    # Save page images
    for page_no, page in conv_res.document.pages.items():
        page_no = page.page_no
        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.pil_image.save(fp, format="PNG")

    # Save images of figures and tables
    table_counter = 0
    picture_counter = 0
    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-table-{table_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

        if isinstance(element, PictureItem):
            picture_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-picture-{picture_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

    # Save markdown with embedded pictures
    md_filename = output_dir / f"{doc_filename}-with-images.md"
    conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

    # Save markdown with externally referenced pictures
    md_filename = output_dir / f"{doc_filename}-with-image-refs.md"
    conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)

    # Save HTML with externally referenced pictures
    html_filename = output_dir / f"{doc_filename}-with-image-refs.html"
    conv_res.document.save_as_html(html_filename, image_mode=ImageRefMode.REFERENCED)

    end_time = time.time() - start_time

    _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")

In [36]:
main()

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'


INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.pipeline.base_pipeline:Processing document AR for improved learnability.pdf
INFO:docling.document_converter:Finished converting document AR for improved learnability.pdf in 50.48 sec.
INFO:__main__:Document converted and figures exported in 52.70 seconds.


In [53]:
chunk_iter = chunker.chunk(result.document)

In [50]:
import spacy
import pandas as pd
import spacy.cli
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")


Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [51]:
structured_data = {
    "Name": None,
    "Job Title": None,
    "Organization": None,
    "Education": []
}

In [None]:
for i, chunk in enumerate(chunk_iter):
    print(f"=== {i} ===")
    print(f"chunk.text:\n{repr(f'{chunk.text[:300]}…')}")
    
    enriched_text = chunker.serialize(chunk=chunk)
    print(f"chunker.serialize(chunk):\n{repr(f'{enriched_text[:300]}…')}")
    temp = nlp(enriched_text)
    structured_data = {
        "Name": [],
        "Job Title": [],
        "Organization": [],
        "Education": [],
        "Date": [],

    }
    for ent in temp.ents:
        if ent.label_ == "PERSON":
            structured_data["Name"].append(ent.text)
        elif ent.label_ == "ORG":
            structured_data["Organization"].append(ent.text)
        elif ent.label_ == "EDUCATION":
            structured_data["Education"].append(ent.text)
        elif ent.label_ == "DATE":
            structured_data["Date"].append(ent.text)

    print(structured_data)
    print()

=== 0 ===
chunk.text:
'glyph<c=11,font=/EHOCRU+TimesNewRomanPS-ItalicMT>glyph<c=21,font=/EHOCRU+TimesNewRomanPS-ItalicMT>glyph<c=19,font=/EHOCRU+TimesNewRomanPS-ItalicMT>glyph<c=21,font=/EHOCRU+TimesNewRomanPS-ItalicMT>glyph<c=23,font=/EHOCRU+TimesNewRomanPS-ItalicMT>glyph<c=12,font=/EHOCRU+TimesNewRomanPS-ItalicMT>\nglyph…'
chunker.serialize(chunk):
'glyph<c=11,font=/EHOCRU+TimesNewRomanPS-ItalicMT>glyph<c=21,font=/EHOCRU+TimesNewRomanPS-ItalicMT>glyph<c=19,font=/EHOCRU+TimesNewRomanPS-ItalicMT>glyph<c=21,font=/EHOCRU+TimesNewRomanPS-ItalicMT>glyph<c=23,font=/EHOCRU+TimesNewRomanPS-ItalicMT>glyph<c=12,font=/EHOCRU+TimesNewRomanPS-ItalicMT>\nglyph…'
{'Name': None, 'Job Title': None, 'Organization': 'ScienceDirect', 'Education': []}

=== 1 ===
chunk.text:
'journal homepage: www.elsevier.com/locate/cirpj…'
chunker.serialize(chunk):
'CIRP Journal of Manufacturing Science and Technology\njournal homepage: www.elsevier.com/locate/cirpj…'
{'Name': None, 'Job Title': None, 'Organization': 'CI

In [56]:
len(result.document.texts)

180