### Convert PDF to Markdown with Docling
Use the open-source Docling OCR library to convert PDFs to Markdown. Chunk each document's Markdown into sections using the LangChain Markdown header splitter. Finally, save the results to a Delta table.

In [0]:
# Install Docling (PDF -> Markdown conversion) and the LangChain text splitters (Markdown chunking)
%pip install docling
%pip install -qU langchain-text-splitters
# Restart the Python process so the packages installed above can be used by the following cells
dbutils.library.restartPython()

Note: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.
Note: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.


In [0]:
from docling.document_converter import DocumentConverter
from langchain_text_splitters import MarkdownHeaderTextSplitter
from pyspark.sql.functions import monotonically_increasing_id

In [0]:
# Change the Volume path to the directory where your PDFs are stored
bronze_volume = "/Volumes/users/david_hurley/vehicle_warranty/"

# List the Volume and keep only the PDF files: any other entry (sub-directories,
# stray non-PDF files) would otherwise be handed to the PDF converter later on.
# The listed paths carry a "dbfs:" prefix, which is removed so the converter
# receives a plain /Volumes/... path.
file_paths = [
    file.path.replace("dbfs:", "")
    for file in dbutils.fs.ls(bronze_volume)
    if file.path.lower().endswith(".pdf")
]
print(file_paths)

['/Volumes/users/david_hurley/vehicle_warranty/Ford.pdf', '/Volumes/users/david_hurley/vehicle_warranty/Mercedes.pdf', '/Volumes/users/david_hurley/vehicle_warranty/Toyota.pdf']


In [0]:
# Instantiate Docling
converter = DocumentConverter()

# Markdown header levels to chunk on, each paired with the metadata name the
# splitter uses for that level. Neither the header list nor the splitter depends
# on the file being processed, so both are built once, outside the loop.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)

# Collect one record per Markdown chunk, across all documents
markdown_data_for_all_docs = []

for file in file_paths:

  # Convert PDF to Markdown
  result = converter.convert(file)
  markdown_document = result.document.export_to_markdown()

  # Split this document's Markdown into sections at the configured headers
  md_header_splits = markdown_splitter.split_text(markdown_document)

  # Add more columns like Page Number or URL as metadata for AI to filter on
  markdown_data_for_all_docs.extend(
      {"chunked_markdown": x.page_content} for x in md_header_splits
  )

# Spark cannot infer a schema from an empty list, so fail early with a message
# that points at the real cause instead of an opaque schema-inference error.
if not markdown_data_for_all_docs:
  raise ValueError(
      "No Markdown chunks were produced; check that file_paths contains readable PDFs."
  )

df = spark.createDataFrame(markdown_data_for_all_docs)

Downloading detection model, please wait. This may take several minutes depending upon your network connection.
Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


In [0]:
# Vector Search needs a primary key: attach a generated id column to every chunk
df = df.withColumn("id", monotonically_increasing_id())

# Persist the chunks to the Silver Delta Table, overwriting any previous contents
(
    df.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("users.david_hurley.vehicle_warranty_markdown")
)