In [10]:
import os
from dotenv import load_dotenv
from langchain_docling.loader import ExportType

load_dotenv()
EMBED_MODEL_ID = os.getenv("MODEL_ID")
EXPORT_TYPE = ExportType.DOC_CHUNKS

In [11]:
from langchain_docling import DoclingLoader

FILE_PATH = "https://arxiv.org/pdf/2408.09869"

loader = DoclingLoader(file_path=FILE_PATH)

docs = loader.load()

for d in docs[:3]:
    print(f"- {d.page_content=}")

Token indices sequence length is longer than the specified maximum sequence length for this model (648 > 512). Running this sequence through the model will result in indexing errors


- d.page_content='Docling Technical Report\nVersion 1.0\nChristoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar\nAI4K Group, IBM Research R¨ uschlikon, Switzerland'
- d.page_content='Abstract\nThis technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models.'
- d.page_content='1 Introduction\nConverting PDF documents back into a machine-processable format has been a major challenge for deca

In [12]:
d.metadata

{'source': 'https://arxiv.org/pdf/2408.09869',
 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta',
  'version': '1.0.0',
  'doc_items': [{'self_ref': '#/texts/8',
    'parent': {'$ref': '#/body'},
    'children': [],
    'content_layer': 'body',
    'label': 'text',
    'prov': [{'page_no': 1,
      'bbox': {'l': 108.0,
       't': 239.37,
       'r': 504.003,
       'b': 143.54600000000005,
       'coord_origin': 'BOTTOMLEFT'},
      'charspan': [0, 792]}]},
   {'self_ref': '#/texts/9',
    'parent': {'$ref': '#/body'},
    'children': [],
    'content_layer': 'body',
    'label': 'text',
    'prov': [{'page_no': 1,
      'bbox': {'l': 108.0,
       't': 135.88800000000003,
       'r': 504.003,
       'b': 83.52099999999996,
       'coord_origin': 'BOTTOMLEFT'},
      'charspan': [0, 488]}]},
   {'self_ref': '#/texts/12',
    'parent': {'$ref': '#/body'},
    'children': [],
    'content_layer': 'body',
    'label': 'text',
    'prov': [{'page_no': 2,
      'bbox': 

In [13]:
d.metadata['dl_meta'].keys()

dict_keys(['schema_name', 'version', 'doc_items', 'headings', 'origin'])

In [14]:
from docling.chunking import HybridChunker
from langchain_docling import DoclingLoader

loader = DoclingLoader(
    file_path=FILE_PATH,
    export_type=EXPORT_TYPE,
    chunker=HybridChunker(tokenizer=EMBED_MODEL_ID),
)

docs_hybrid = loader.load()
for dh in docs_hybrid[:3]:
    print(f"- {d.page_content=}")
dh.metadata

Token indices sequence length is longer than the specified maximum sequence length for this model (612 > 512). Running this sequence through the model will result in indexing errors


- d.page_content='1 Introduction\nConverting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions.\nWith Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and

{'source': 'https://arxiv.org/pdf/2408.09869',
 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta',
  'version': '1.0.0',
  'doc_items': [{'self_ref': '#/texts/8',
    'parent': {'$ref': '#/body'},
    'children': [],
    'content_layer': 'body',
    'label': 'text',
    'prov': [{'page_no': 1,
      'bbox': {'l': 108.0,
       't': 239.37,
       'r': 504.003,
       'b': 143.54600000000005,
       'coord_origin': 'BOTTOMLEFT'},
      'charspan': [0, 792]}]},
   {'self_ref': '#/texts/9',
    'parent': {'$ref': '#/body'},
    'children': [],
    'content_layer': 'body',
    'label': 'text',
    'prov': [{'page_no': 1,
      'bbox': {'l': 108.0,
       't': 135.88800000000003,
       'r': 504.003,
       'b': 83.52099999999996,
       'coord_origin': 'BOTTOMLEFT'},
      'charspan': [0, 488]}]},
   {'self_ref': '#/texts/12',
    'parent': {'$ref': '#/body'},
    'children': [],
    'content_layer': 'body',
    'label': 'text',
    'prov': [{'page_no': 2,
      'bbox': 

In [16]:
dh == d

True

In [None]:
from langchain_postgres import PGVector

vector_store = PGVector(
    embeddings=embeddings,
    collection_name="my_docs",
    connection="postgresql+psycopg://...",
)

In [4]:
import os
import json
import fitz  # PyMuPDF
import psycopg
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

def annotate_pdf(psql_conn, source_filename, output_dir = None, output_filename=None):
    """
    Given a PostgreSQL connection and a source filename (as stored in the "source" key of the cmetadata column),
    this function queries the database for all rows associated with that document and then adds rectangle annotations
    onto the PDF file for each chunk's bounding boxes. Each chunk (as identified by its chunk_index) is assigned a distinct color.
    
    The function converts bounding box coordinates from a bottom-left origin system (as stored in your JSON) to the
    top-left origin system used by PyMuPDF.
    
    Parameters:
      psql_conn: an open psycopg2 connection to your PostgreSQL database.
      source_filename: the file path (or source value) identifying your document.
      output_filename: optional; if provided, the annotated PDF will be saved using this filename. 
                       Otherwise, "_annotated" is appended to the original filename.
    """
    # Query the database for rows where the JSONB column "cmetadata" has the given source.
    query = """
        SELECT cmetadata
        FROM public.langchain_pg_embedding
        WHERE cmetadata->>'source' = %s
        ORDER BY (cmetadata->>'chunk_index')::int;
    """
    with psql_conn.cursor() as cur:
        cur.execute(query, (source_filename,))
        rows = cur.fetchall()

    if not rows:
        print("No entries found for source:", source_filename)
        return

    # Open the PDF document using PyMuPDF.
    doc = fitz.open(source_filename)

    # Define a palette of colors as RGB tuples with each component in [0, 1].
    colors = [
        (1, 0, 0),   # red
        (0, 1, 0),   # green
        (0, 0, 1),   # blue
        (1, 1, 0),   # yellow
        (1, 0, 1),   # magenta
        (0, 1, 1),   # cyan
        # You can add additional colors if needed.
    ]

    # Loop over each row in the query result.
    for row in rows:
        # Each row contains the cmetadata JSON. It might already be a dictionary; if not, parse it.
        cmetadata = row[0]
        if isinstance(cmetadata, str):
            cmetadata = json.loads(cmetadata)
        
        # Retrieve the chunk index to select the annotation color.
        chunk_index = cmetadata.get("chunk_index")
        if chunk_index is None:
            continue

        # Select a color by cycling through the palette.
        color = colors[chunk_index % len(colors)]

        # Retrieve the document-level metadata and its associated items.
        dl_meta = cmetadata.get("dl_meta", {})
        doc_items = dl_meta.get("doc_items", [])
        for item in doc_items:
            # Each document item should contain a list under the key "prov".
            prov_list = item.get("prov", [])
            for prov in prov_list:
                # Get the bounding box dictionary.
                bbox = prov.get("bbox")
                if not bbox:
                    continue
                
                # Retrieve the page number (assumed 1-indexed in your metadata).
                page_no = prov.get("page_no")
                if not page_no or page_no < 1 or page_no > doc.page_count:
                    continue
                page = doc.load_page(page_no - 1)  # PyMuPDF uses 0-indexed pages.
                
                # Extract horizontal coordinates.
                left = bbox.get("l")
                right = bbox.get("r")
                # Extract vertical coordinates from the JSON (using bottom-left origin).
                orig_bottom = bbox.get("b")
                orig_top = bbox.get("t")
                if None in (left, right, orig_bottom, orig_top):
                    continue

                # Determine if the bbox uses bottom-left coordinates.
                coord_origin = bbox.get("coord_origin", "TOPLEFT")  # Default assume top-left if not provided.
                if coord_origin.upper() == "BOTTOMLEFT":
                    # For PyMuPDF (top-left system), transform the y-coordinates.
                    page_height = page.rect.height
                    # Invert the y-coordinates.
                    new_top = page_height - orig_top
                    new_bottom = page_height - orig_bottom
                else:
                    # If already in top-left coordinate system, use as-is.
                    new_top = orig_top
                    new_bottom = orig_bottom

                # Create a rectangle. In PyMuPDF, fitz.Rect(left, top, right, bottom).
                rect = fitz.Rect(left, new_top, right, new_bottom)

                page.draw_rect(
                    rect,
                    color=color,      # Border color (can be None if no border)
                    fill=color,       # Fill color
                    fill_opacity=0.2, # Transparency
                    width=1.0,              # Border width (optional)
                    overlay=True
                )
                # # Add a rectangle annotation to the page.
                # annot = page.addRectAnnot(rect)
                # annot.setColors(stroke=color)   # Set the annotation's border (stroke) color.
                # annot.setBorder(width=1)          # Set the border width if desired.
                # annot.update()

    # Define an output filename if not provided.
    if not output_filename:
        output_filename = 'annotated_' + os.path.basename(source_filename)
        
    output_filepath = os.path.join(output_dir, output_filename) if output_dir else output_filename
    # Save the annotated PDF.
    doc.save(output_filepath, garbage=4, deflate=True)
    doc.close()
    print("Annotated PDF saved as:", output_filename)


# Example usage:
if __name__ == '__main__':
    # Connect to your PostgreSQL database. Adjust connection parameters as needed.
    conn = psycopg.connect(
        dbname=os.getenv('DB_NAME', 'rag_lyme_docs'),
        user=os.getenv('DB_USER', 'postgres'),
        password=os.getenv('DB_PASSWORD', 'your_password'),
        host=os.getenv('DB_HOST', 'localhost'),
    )
    
    # Specify the PDF file (as stored in the JSON metadata "source").
    source_pdf = "test_rag_documents/2012-lyme-legislature.pdf"

    output_dir = os.getenv('ANNOTATED_DOCUMENT_DIRECTORY', None)
    
    # Call the function to create an annotated duplicate of the PDF.
    annotate_pdf(conn, source_pdf, output_dir = output_dir)
    
    conn.close()


Annotated PDF saved as: annotated_2012-lyme-legislature.pdf
