In [0]:
# Install the third-party packages this notebook imports further down.
# docling supplies DocumentConverter; langchain-text-splitters supplies
# MarkdownHeaderTextSplitter.
# NOTE(review): neither package is version-pinned, so a fresh run may resolve different
# releases than the ones this notebook was developed against (the captured install log
# shows docling 2.36.1) — consider pinning; TODO confirm the intended versions.
%pip install docling
# -q keeps pip's output quiet, -U upgrades the package if an older copy is already present.
%pip install -qU langchain-text-splitters
# Restart the Python process after installing — presumably so the newly installed
# packages are the ones picked up by the import cell; confirm against Databricks docs.
dbutils.library.restartPython()

Collecting docling
  Downloading docling-2.36.1-py3-none-any.whl.metadata (10 kB)
Collecting pydantic<3.0.0,>=2.0.0 (from docling)
  Using cached pydantic-2.11.6-py3-none-any.whl.metadata (67 kB)
Collecting docling-core<3.0.0,>=2.29.0 (from docling-core[chunking]<3.0.0,>=2.29.0->docling)
  Downloading docling_core-2.37.0-py3-none-any.whl.metadata (6.5 kB)
Collecting docling-ibm-models<4.0.0,>=3.4.4 (from docling)
  Downloading docling_ibm_models-3.4.4-py3-none-any.whl.metadata (6.4 kB)
Collecting docling-parse<5.0.0,>=4.0.0 (from docling)
  Downloading docling_parse-4.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting pypdfium2<5.0.0,>=4.30.0 (from docling)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
Collecting pydantic-settings<3.0.0,>=2.3.0 (from docling)
  Downloading 

In [0]:
from docling.document_converter import DocumentConverter
from langchain_text_splitters import MarkdownHeaderTextSplitter
from pyspark.sql.functions import monotonically_increasing_id

In [0]:
# List of PDFs to convert to Markdown
bronze_volume = "/Volumes/users/david_hurley/vehicle_warranty/" # WHERE PDFS LIVE

# dbutils.fs.ls returns every entry in the volume, including sub-directories and any
# non-PDF files, so keep only the *.pdf entries (case-insensitive) to avoid handing the
# converter something it was never meant to process.
# Only the leading "dbfs:" scheme is stripped (str.replace would remove every
# occurrence), and the list is sorted so files are processed in a stable order.
file_paths = sorted(
    entry.path.removeprefix("dbfs:")
    for entry in dbutils.fs.ls(bronze_volume)
    if entry.path.lower().endswith(".pdf")
)

In [0]:
# For each PDF, use Docling to convert the whole document to a single Markdown string,
# then use LangChain to split that Markdown on its headers into smaller chunks.
converter = DocumentConverter()

# The header configuration and the splitter do not depend on the file being processed,
# so they are built once here instead of on every loop iteration.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)

# One dict per chunk; the key becomes the column name of the resulting DataFrame.
markdown_data_for_all_docs = []

for file in file_paths:
  result = converter.convert(file)
  markdown_document = result.document.export_to_markdown()
  md_header_splits = markdown_splitter.split_text(markdown_document)

  # Add more columns like Page Number or URL as metadata for AI to filter on
  markdown_data_for_all_docs.extend({"chunked_markdown": x.page_content} for x in md_header_splits)

# Pass the schema explicitly instead of letting Spark infer it from the dict rows:
# inference fails on an empty list (no files found / no chunks produced), whereas an
# explicit schema yields an empty DataFrame with the expected column.
df = spark.createDataFrame(markdown_data_for_all_docs, schema="chunked_markdown string")

In [0]:
# Tag every chunk with an "id" column generated by Spark, then persist the result.
chunk_id = monotonically_increasing_id()
df = df.withColumn("id", chunk_id)

# Overwrite the target table on each run, with the mergeSchema write option enabled.
table_writer = df.write.mode("overwrite").option("mergeSchema", "true")
table_writer.saveAsTable("users.david_hurley.markdown")