In [0]:
%pip install openai httpx fitz PyMuPDF
%restart_python

In [0]:
%run ./utils_prompts

In [0]:
import base64
import fitz 
import pandas as pd
import httpx
from IPython.display import display, Markdown

In [0]:
# Read the API token and API URL from the current notebook context, then
# derive the serving-endpoints base URL from the API URL.
_notebook_ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
DATABRICKS_TOKEN = _notebook_ctx.apiToken().get()
DATABRICKS_BASE_URL = _notebook_ctx.apiUrl().get()
DATABRICKS_URL = "{}/serving-endpoints/".format(DATABRICKS_BASE_URL)

In [0]:


def get_image_from_url(image_url):
    """Download an image over HTTP and return its raw bytes.

    Args:
        image_url: URL of the image to fetch.

    Returns:
        The response body as bytes.

    Raises:
        httpx.HTTPStatusError: If the final response has a non-success status,
            so an error page is never handed back as if it were image data.
    """
    # Follow redirects (httpx does not by default) so CDN-style URLs resolve.
    response = httpx.get(image_url, follow_redirects=True)
    response.raise_for_status()
    return response.content

def base64_encode_image(image):
    """Return the Base64 encoding of ``image`` (bytes) as a text string."""
    encoded_bytes = base64.b64encode(image)
    return str(encoded_bytes, 'utf-8')

def draw_image(image):
    """Decode raw image bytes and display the picture in the notebook.

    Args:
        image: Encoded image file contents as bytes.
    """
    # Decode the argument itself; the previous version read the global
    # `image_bytes`, so it always drew whatever that global happened to hold.
    img = Image.open(BytesIO(image))
    display(img)

def display_markdown(text):
    """Show ``text`` in the notebook output as rendered Markdown."""
    rendered = Markdown(text)
    display(rendered)

In [0]:
from openai import OpenAI

# OpenAI client configured with the notebook's token and the
# serving-endpoints base URL defined above.
client = OpenAI(
    api_key=DATABRICKS_TOKEN,
    base_url=DATABRICKS_URL,
)

def parse_image(image_data, prompt, model, mime_type="image/jpeg", max_tokens=10000):
    """Send one image plus a text prompt to a chat model and return its reply.

    Args:
        image_data: Base64-encoded image contents (text, no data-URL prefix).
        prompt: Instruction text sent alongside the image.
        model: Name of the model to call through the module-level ``client``.
        mime_type: Media type declared in the data URL. Defaults to
            ``"image/jpeg"``; pass e.g. ``"image/png"`` for PNG input.
        max_tokens: Upper bound on generated tokens. Defaults to 10000.

    Returns:
        The first choice's message text with surrounding whitespace removed,
        or an empty string when the response carries no text content.
    """
    # The image travels inline as a data URL in the same user message.
    data_url = f"data:{mime_type};base64,{image_data}"
    response = client.chat.completions.create(
        model=model,
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": data_url}},
            ],
        }],
        max_tokens=max_tokens,
    )

    # `content` can be None; guard so we return "" instead of raising
    # AttributeError on `.strip()`.
    content = response.choices[0].message.content
    return content.strip() if content else ""

In [0]:
from PIL import Image
from io import BytesIO

# Sample image for the demo below; replace with your image URL
image_url = "https://images.prismic.io/formsort/Zn6Ffx5LeNNTwnrT_restrict.png?auto=format,compress"
image_bytes = get_image_from_url(image_url)
draw_image(image_bytes)

In [0]:

# Reuse the bytes already downloaded into `image_bytes` instead of fetching
# the same URL a second time.
image_b64 = base64_encode_image(image_bytes)
md_text = parse_image(image_b64, GENERIC_PROMPT, "databricks-llama-4-maverick")
display_markdown(md_text)

In [0]:
def convert_pdf_to_base64(pdf_path, dpi=300, jpg_quality=80):
    """
    Render every page of a PDF to JPEG and return the pages as base64 strings.

    Pages are processed one at a time in a single thread.

    Args:
        pdf_path: Path to PDF file
        dpi: Rendering resolution; each page is scaled by dpi / 72
        jpg_quality: JPEG quality handed to the pixmap encoder

    Returns:
        pandas DataFrame with columns: page_num (1-based), base64_img,
        doc_id (the given pdf_path). The columns are present even when the
        document has no pages.
    """
    columns = ['page_num', 'base64_img', 'doc_id']

    zoom = dpi / 72
    zoom_matrix = fitz.Matrix(zoom, zoom)

    df_data = []

    doc = fitz.open(pdf_path)
    try:
        num_pages = len(doc)
        print(f"Processing {num_pages} pages at {dpi} DPI...")

        for page_num in range(num_pages):
            if page_num % 25 == 0:  # Progress update every 25 pages
                print(f"Processing page {page_num + 1}/{num_pages}")

            page = doc.load_page(page_num)
            pix = page.get_pixmap(matrix=zoom_matrix, alpha=False)
            # Honour the caller's quality setting (it used to be ignored in
            # favour of a literal 80).
            img_bytes = pix.tobytes("jpeg", jpg_quality=jpg_quality)
            img_base64 = base64.b64encode(img_bytes).decode('utf-8')

            df_data.append({
                'page_num': page_num + 1,
                'base64_img': img_base64,
                'doc_id': pdf_path
            })
    finally:
        # Release the document even if rendering a page raises.
        doc.close()

    print(f"Conversion complete. Generated {len(df_data)} base64 images.")

    return pd.DataFrame(df_data, columns=columns)

In [0]:
# Convert the example PDF into one base64 JPEG row per page.
PDF_PATH = "/Volumes/pdf_parsing/test_parsing/raw_files/aph_example_1.pdf"
df = convert_pdf_to_base64(PDF_PATH)

In [0]:
# Send only the first page of the converted PDF to the model.
first_page_b64 = df.iloc[0]["base64_img"]
text_processed = parse_image(first_page_b64, GENERIC_PROMPT, "databricks-claude-opus-4-1")

In [0]:
# Echo the raw response string as the cell's output (unrendered).
text_processed

In [0]:
# Show the same response rendered as Markdown.
display_markdown(text_processed)