In [0]:
%pip install mistralai==1.8.1
%pip install -qU langchain-text-splitters
# NOTE(review): mistralai is pinned, but langchain-text-splitters is installed with -U and no
# version, so a re-run may pick up a different release — consider pinning it as well.
# Restarts the Python process (presumably so the packages installed above are picked up).
dbutils.library.restartPython()

In [0]:
import json
from pathlib import Path
from mistralai import Mistral
from IPython.display import Markdown, display
from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
from langchain_text_splitters import MarkdownHeaderTextSplitter
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, monotonically_increasing_id

# Overview

#### Steps
1. Convert PDF to Markdown (OCR)
2. Chunk Markdown by section headers and add metadata
3. Save Markdown file and write chunked text and metadata to a table (the table is overwritten on each run)
4. Create Vector Search Endpoint and Index (via UI but can use code)
![image](/Workspace/Users/david.hurley@databricks.com/vector-search/artifacts/vector-search-arch.png)

___
___
# 1. PDF --> Markdown
#### Potential OCR choices (many more beyond these):

**Open-Source**
- Docling
- Marker
- PyMuPDF4LLM

**Paid**
- Azure Document Intelligence
- Mistral OCR
- Gemini 2.0 Flash
- Databricks `ai_parse()`

---
___

### Define Data

In [0]:
# included in artifacts
# Volume path that is listed below to find the input files
bronze_volume = "/Volumes/users/david_hurley/vehicle_warranty/"
# Volume path where the combined Markdown of each file is written
markdown_volume = "/Volumes/users/david_hurley/vehicle_warranty_markdown/"
# Table that receives the chunked Markdown rows
silver_table = "users.david_hurley.vehicle_warranty_silver"

# List the contents of the bronze volume; later cells read each entry positionally
# (index 0 as the file path, index 1 as the file name).
file_info = dbutils.fs.ls(bronze_volume)

display(file_info)

### Mistral OCR

In [0]:
# create secret in scope
# The API key is read from a Databricks secret scope instead of being hard-coded in the notebook.
mistral_api_key = dbutils.secrets.get(scope = "david-hurley-fe", key = "mistral-api-key")
# Client object passed to ocr_pdf_mistral() for the upload and OCR calls.
client = Mistral(api_key=mistral_api_key)

In [0]:
def ocr_pdf_mistral(file_path: str, file_name: str, client) -> dict:
    """Upload a PDF to the Mistral API, run OCR on it and return the response as a dict.

    Args:
        file_path: Location of the PDF as a string. A leading ``dbfs:`` scheme
            is stripped so the file can be read through the local filesystem.
        file_name: Name sent with the upload; also used to derive the value
            stored under the ``"model"`` key of the returned dict.
        client: Mistral client providing ``files.upload``,
            ``files.get_signed_url`` and ``ocr.process``.

    Returns:
        The OCR response parsed from its JSON dump. Its ``"model"`` key is set
        (overwriting any existing value) to ``file_name`` without a trailing
        ``.pdf`` extension, i.e. the vehicle model name.
    """
    # Strip only a leading scheme; a blanket replace() would also rewrite a
    # "dbfs:" that happened to appear later in the path.
    scheme = "dbfs:"
    local_path = file_path[len(scheme):] if file_path.startswith(scheme) else file_path

    uploaded_file = client.files.upload(
        file={
            "file_name": file_name,
            "content": Path(local_path).read_bytes(),
        },
        purpose="ocr",
    )

    # NOTE(review): expiry=1 is kept from the original; confirm the unit against the Mistral API.
    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

    pdf_response = client.ocr.process(
        document=DocumentURLChunk(document_url=signed_url.url),
        model="mistral-ocr-latest",
        include_image_base64=False
    )

    json_response = json.loads(pdf_response.model_dump_json())

    # add vehicle model name: drop only the trailing extension so a ".pdf"
    # occurring inside the name is left intact
    extension = ".pdf"
    if file_name.endswith(extension):
        model_name = file_name[:-len(extension)]
    else:
        model_name = file_name
    json_response["model"] = model_name

    return json_response

def combine_and_save_markdown(response: dict, file_name: str, volume: str):
    """Join the Markdown of all pages and save it to a file to avoid rerunning the Mistral API.

    Args:
        response: Dict with a ``"pages"`` list whose items each carry a
            ``"markdown"`` string.
        file_name: Source file name; every ``.pdf`` in it is replaced by
            ``.md`` to form the output file name.
        volume: Directory the Markdown file is written into.

    Returns:
        None. The pages are written to ``{volume}/{name}.md`` separated by a
        blank line.
    """
    output = "\n\n".join(page['markdown'] for page in response['pages'])

    target = f"{volume}/{file_name.replace('.pdf', '.md')}"
    # Explicit UTF-8: OCR output commonly contains non-ASCII characters and the
    # default encoding of open() depends on the platform locale.
    with open(target, "w", encoding="utf-8") as f:
        f.write(output)

def chunk_markdown(page, response):
    """Split one page of Markdown on its headers and label every piece.

    Each returned dict holds the page's ``index`` as ``page_number``, the
    response's ``model`` value, and the text of one split as
    ``chunked_markdown``.
    """
    page_text = page['markdown']

    # Header markers (levels 1-3) the Langchain splitter breaks the text on
    split_levels = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    splitter = MarkdownHeaderTextSplitter(split_levels)

    records = []
    for section in splitter.split_text(page_text):
        records.append(
            {
                "page_number": page['index'],
                "model": response['model'],
                "chunked_markdown": section.page_content,
            }
        )
    return records

In [0]:
# Chunk rows are gathered in a plain list and turned into ONE DataFrame at the end.
# Building a DataFrame per page fails on a page that yields no chunks (no schema can be
# inferred from an empty list) and stacks one union per page onto the query plan.
chunk_rows = []

# for each file convert to Markdown and for each page of Markdown chunk
for file in file_info:

  # entries of the listing are read positionally: path first, then name
  file_path = file[0]
  file_name = file[1]

  response = ocr_pdf_mistral(file_path=file_path, file_name=file_name, client=client)

  combine_and_save_markdown(response=response, file_name=file_name, volume=markdown_volume)

  for page in response['pages']:
    chunk_rows.extend(chunk_markdown(page, response))

# Fail with a clear message instead of an AttributeError on None further down
if not chunk_rows:
  raise ValueError(f"No Markdown chunks were produced from the files in {bronze_volume}")

chunked_markdown_df = spark.createDataFrame(chunk_rows)

# vector index needs a primary key so create one
chunked_markdown_df = chunked_markdown_df.withColumn("id", row_number().over(Window.orderBy(monotonically_increasing_id())))

# save silver table to create vector search index
chunked_markdown_df.write.mode("overwrite").saveAsTable(silver_table)

### AI SQL Document Parsing OCR (Private Preview)

In [0]:
%sql
-- Read every PDF matching the glob as a binary file and run ai_parse on its content,
-- returning one row per file with its path and the parsed result.
SELECT
    path,
    ai_parse(content) AS parsed
FROM
    READ_FILES('/Volumes/users/david_hurley/vehicle_warranty/*.pdf', format => 'binaryFile')

In [0]:
from pyspark.sql.functions import col, expr

# Same ai_parse call through the DataFrame API: load the PDFs as binary files,
# keep the path plus the parsed content, and collect the rows to the driver.
# Note that after .collect() `df` is a list of Row objects, not a DataFrame.
df = (
    spark.read.format("binaryFile")
    .load('/Volumes/users/david_hurley/vehicle_warranty/*.pdf')
    .select(
        col("path"),
        expr("ai_parse(content)").alias("parsed"),
    )
    .collect()
)


In [0]:
%sql
-- parse columns out more than above: run ai_parse once per file in a CTE, then
-- pull the individual fields out of the parsed result
WITH corpus AS (
  SELECT
    path,
    ai_parse(content) AS parsed
  FROM
    READ_FILES('/Volumes/users/david_hurley/vehicle_warranty/*.pdf', format => 'binaryFile')
)
SELECT
  path,
  parsed:document AS document,
  parsed:pages AS pages,
  parsed:elements AS elements,
  parsed:_corrupted_data AS _corrupted_data
FROM corpus;
