In [10]:
import dotenv

dotenv.load_dotenv()

True

In [19]:
# Convert PDFs to images
import fitz  # PyMuPDF
import os

def convert_pdf_to_images(input_path: str, output_path: str):
    # Open the PDF file
    pdf_path = input_path
    pdf_document = fitz.open(pdf_path)
    
    # Create output directory if it doesn't exist
    output_dir = output_path
    os.makedirs(output_dir, exist_ok=True)
    
    # Iterate through each page
    for page_num in range(len(pdf_document)):
        # Get the page
        page = pdf_document[page_num]
        
        # Convert page to image
        pix = page.get_pixmap()
        
        # Save image
        image_path = f"{output_dir}/page_{page_num + 1}.png"
        pix.save(image_path)
    
    pdf_document.close()

for file_id in ["M000093215.PDF", "M00093381B.PDF", "M000093520.PDF", "M00103924B.PDF"]:
    convert_pdf_to_images(f"data/unstructured/{file_id}", f"data/images/{file_id}")

In [12]:
# Extract key value pairs from images
import base64
import glob
import json

from litellm import supports_vision, completion

model = "openai/gpt-4o"
assert supports_vision(model=model)

def image_to_base64(input_path: str):
    with open(input_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

def extract_kv_pairs(input_path: str):
    base64_image = image_to_base64(input_path)

    image_query_response = completion(
        model=model,
        # response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": "Extract a flat, 1-dimensional list of key value pairs from the following image"},
            {"role": "system", "content": "Return the a markdown bulleted * key: value only"},
            # {"role": "system", "content": "Example: {'pairs': [{'key': key, 'value': value}, {'key': key, 'value': value}]}"},
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
    )

    # return json.loads(image_query_response.choices[0].message.content)
    return image_query_response.choices[0].message.content
print(extract_kv_pairs("data/images/M000093215.PDF/page_1.png"))


- Borough: Manhattan
- Date: Nov 29 1986
- Zoning District: C6-6.5
- 57th Street Wing Cellar: 0.G. 260
- Offices: 4
- Elevator Machine Room: 20
- Gallery: Mechanical Room
- Storage: Cellar Mezzanine
- Mechanical Room: Office
- Locker Room: Elevator
- Machine Room: Storage
- 1st Floor Live Load: 85
- 1st Floor Permitted Use Units: 1,021
- 1st Floor Description of Use: Carnegie Concert Hall
- Trap Room: Lobbies
- Carnegie Hall Lounge: And Bar
- 2nd Floor Live Load: 85
- 2nd Floor Permitted Use Units: 350
- 2nd Floor Description of Use: 1st Tier Boxes of Carnegie Concert Hall
- Recital Hall: Lounge And Dress Rooms
- 3rd Floor Live Load: 85
- 3rd Floor Permitted Use Units: 248
- 3rd Floor Permitted Use Units: 299
- 3rd Floor Description of Use: 2nd Tier Boxes of Carnegie Concert Hall
- Recital Hall: Balcony
- Balcony Live Load: 85
- Recital Hall Balcony: 4th Floor
- 4th Floor Live Load: 85
- 4th Floor Permitted Use Units: 430
- 4th Floor Permitted Use Units: 16
- 4th Floor Description of U

In [13]:
from litellm import embedding

def get_image_embedding(input_path: str):
    with open(input_path, "rb") as image_file:
        response = embedding(model="cohere/embed-english-v3.0", input=[image_to_base64(input_path)])

    return response.data[0]["embedding"]

print(get_image_embedding("data/images/M000093215.PDF/page_1.png"))


[-0.0059661865, -0.024291992, -0.025741577, -0.06628418, 0.0031814575, -0.013214111, -0.03704834, -0.01184082, 0.018310547, 0.010154724, 0.020645142, -0.011627197, -0.03515625, -0.049041748, -0.016174316, -0.026412964, 0.0107040405, -0.0014448166, 0.0061187744, 0.01108551, -0.03289795, -0.0070533752, 0.023712158, 0.024856567, 0.031951904, -0.01876831, -0.015731812, 0.0020427704, -0.01134491, -0.016799927, -0.001745224, 0.07141113, 0.039001465, 0.003698349, -0.0034503937, -0.023147583, -0.014266968, 0.017684937, 0.011039734, 0.012588501, -0.012382507, 0.013626099, -0.04058838, -0.008331299, -0.095214844, -0.023284912, -0.058746338, 0.0068130493, 0.026031494, 0.06286621, 0.027496338, 0.022232056, -0.014823914, 0.13769531, -0.034057617, -0.0059394836, -0.042999268, -0.0069885254, -0.022766113, 0.075927734, 0.047424316, 0.010971069, -0.004421234, 0.00077438354, -0.03366089, 0.033447266, -0.009300232, 0.022567749, 0.05935669, -0.027664185, -0.046081543, 0.02458191, -0.0010118484, 0.00482559

In [14]:
# Create Typesense schema
import typesense

typesense_client = typesense.Client(
    {
        "nodes": [
            {
                "host": "localhost",
                "port": "8108",
                "protocol": "http",
            }
        ],
        "api_key": "admin",
        "connection_timeout_seconds": 2,
    }
)

schema = {
    "name": "pdfs",
    "fields": [
        {"name": "source_filename", "type": "string"},
        {"name": "page_number", "type": "int32"},
        {"name": "content", "type": "string"},
        {"name": "embedding", "type": "float[]", "num_dim": 1024},
    ],
    "enable_nested_fields": True,
}

try:
    typesense_client.collections["pdfs"].delete()
except Exception as e:
    print(e)

typesense_client.collections.create(schema)


[Errno 404] No collection with name `pdfs` found.


{'created_at': 1737841884,
 'default_sorting_field': '',
 'enable_nested_fields': True,
 'fields': [{'facet': False,
   'index': True,
   'infix': False,
   'locale': '',
   'name': 'source_filename',
   'optional': False,
   'sort': False,
   'type': 'string'},
  {'facet': False,
   'index': True,
   'infix': False,
   'locale': '',
   'name': 'page_number',
   'optional': False,
   'sort': True,
   'type': 'int32'},
  {'facet': False,
   'index': True,
   'infix': False,
   'locale': '',
   'name': 'content',
   'optional': False,
   'sort': False,
   'type': 'string'},
  {'facet': False,
   'index': True,
   'infix': False,
   'locale': '',
   'name': 'embedding',
   'num_dim': 1024,
   'optional': False,
   'sort': False,
   'type': 'float[]'}],
 'name': 'pdfs',
 'num_documents': 0,
 'symbols_to_index': [],
 'token_separators': []}

In [15]:
# Insert key value pairs into Typesense

for file_id in ["M000093215.PDF", "M00093381B.PDF", "M000093520.PDF", "M00103924B.PDF"]:
# for file_id in ["M000093215.PDF"]:
    for file_name in glob.glob(f"data/images/{file_id}/*.png"):
        kv_pairs = extract_kv_pairs(file_name)
        page_number = file_name.split("/")[-1].split(".")[0].split("_")[1]

        response = typesense_client.collections["pdfs"].documents.import_(
            documents=[
                {
                    "source_filename": file_id,
                    "page_number": int(page_number),
                    "content": kv_pairs,
                    "embedding": get_image_embedding(file_name),
                }
            ]
        )

        for doc in response:
            print("success:", doc["success"])
            print("error:", doc.get("error", ""))

success: True
error: 
success: True
error: 
success: True
error: 
success: True
error: 
success: True
error: 
success: True
error: 
success: True
error: 
success: True
error: 


In [20]:
typesense_client.collections["pdfs"].retrieve()

{'created_at': 1737841884,
 'default_sorting_field': '',
 'enable_nested_fields': True,
 'fields': [{'facet': False,
   'index': True,
   'infix': False,
   'locale': '',
   'name': 'source_filename',
   'optional': False,
   'sort': False,
   'type': 'string'},
  {'facet': False,
   'index': True,
   'infix': False,
   'locale': '',
   'name': 'page_number',
   'optional': False,
   'sort': True,
   'type': 'int32'},
  {'facet': False,
   'index': True,
   'infix': False,
   'locale': '',
   'name': 'content',
   'optional': False,
   'sort': False,
   'type': 'string'},
  {'facet': False,
   'index': True,
   'infix': False,
   'locale': '',
   'name': 'embedding',
   'num_dim': 1024,
   'optional': False,
   'sort': False,
   'type': 'float[]'}],
 'name': 'pdfs',
 'num_documents': 8,
 'symbols_to_index': [],
 'token_separators': []}

In [17]:
query = "What borough is the property in 1986?"

def gather_query_context(query: str):
  embedding_str = ','.join(str(v) for v in embedding(
    model="cohere/embed-english-v3.0", 
    input=[query]
  ).data[0]["embedding"])

  searches = {
    "searches": [
      {
        "query_by": "content",
        "q": query,
        "exclude_fields": "embedding"
      }
    ]
  }

  search_parameters = {
    "collection": "pdfs",
    "vector_query": f"embedding:([{embedding_str}], alpha: 0.4, k: 4)",
    "per_page": 25,
  }

  results = typesense_client.multi_search.perform(searches, search_parameters)

  return results

gather_query_context(query)


{'results': [{'facet_counts': [],
   'found': 6,
   'hits': [{'document': {'content': '* Borough: MANHATTAN\n* Date: JAN 23 1989\n* No.: 93520\n* Zoning District: C6-6.5\n* 57th Street Wing Cellar O.G.: 260\n* Cellar: 4\n* Cellar: 20\n* Cellar Mezzanine: 85\n* Cellar Mezzanine: 10\n* 1st Floor: 25\n* 1st Floor: 1,021\n* 2nd Floor: 25\n* 2nd Floor: 350\n* 3rd Floor: 25\n* 3rd Floor: 248\n* 3rd Floor: 299\n* Balcony: 25\n* 4th Floor: 25\n* 4th Floor: 430\n* 4th Floor: 10\n* 5th Floor: 25\n* 5th Floor: 563',
      'id': '5',
      'page_number': 1,
      'source_filename': 'M000093520.PDF'},
     'highlight': {'content': {'matched_tokens': ['Borough', '1989'],
       'snippet': '<mark>Borough</mark>: MANHATTAN\n* Date: JAN 23 <mark>1989</mark>\n* No.: 93520\n* Zoning District:'}},
     'highlights': [{'field': 'content',
       'matched_tokens': ['Borough', '1989'],
       'snippet': '<mark>Borough</mark>: MANHATTAN\n* Date: JAN 23 <mark>1989</mark>\n* No.: 93520\n* Zoning District:'}],
 

In [18]:
# Synthesize query using retrieved results
def synthesize_query(query: str):
    # Get context
    context = gather_query_context(query)
    base64_images = []
    
    # Get images from context
    for result in context["results"][0]["hits"]:
        image_base64 = image_to_base64(
            f"data/images/{result['document']['source_filename']}/page_{result['document']['page_number']}.png"
        )

        base64_images.append(image_base64)

    # Respond to query using images
    image_query_response = completion(
        model=model,
        # response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": "Attempt to answer the following query using the following images"},
            {"role": "user", "content": f"Query: {query}"},
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    } for base64_image in base64_images
                ]
            }
        ],
    )

    # return json.loads(image_query_response.choices[0].message.content)
    return image_query_response.choices[0].message.content

print(synthesize_query("What is the first floor used for?"))


The first floor is used for the Carnegie concert hall, a trap room, lobbies, and the Carnegie Hall lounge and bar.
