In [36]:
import dotenv

dotenv.load_dotenv()

True

In [19]:
# Convert PDFs to images
import fitz  # PyMuPDF
import os

def convert_pdf_to_images(input_path: str, output_path: str):
    # Open the PDF file
    pdf_path = input_path
    pdf_document = fitz.open(pdf_path)
    
    # Create output directory if it doesn't exist
    output_dir = output_path
    os.makedirs(output_dir, exist_ok=True)
    
    # Iterate through each page
    for page_num in range(len(pdf_document)):
        # Get the page
        page = pdf_document[page_num]
        
        # Convert page to image
        pix = page.get_pixmap()
        
        # Save image
        image_path = f"{output_dir}/page_{page_num + 1}.png"
        pix.save(image_path)
    
    pdf_document.close()

for file_id in ["M000093215.PDF", "M00093381B.PDF", "M000093520.PDF", "M00103924B.PDF"]:
    convert_pdf_to_images(f"data/unstructured/{file_id}", f"data/images/{file_id}")

In [53]:
# Extract key value pairs from images
import base64
import glob
import json

from litellm import supports_vision, completion

model = "openai/gpt-4o"
assert supports_vision(model=model)

def image_to_base64(input_path: str):
    with open(input_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

def extract_kv_pairs(input_path: str):
    base64_image = image_to_base64(input_path)

    image_query_response = completion(
        model=model,
        # response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": "Extract a flat, 1-dimensional list of key value pairs from the following image"},
            {"role": "system", "content": "Return the a markdown bulleted * key: value only"},
            # {"role": "system", "content": "Example: {'pairs': [{'key': key, 'value': value}, {'key': key, 'value': value}]}"},
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
    )

    # return json.loads(image_query_response.choices[0].message.content)
    return image_query_response.choices[0].message.content
print(extract_kv_pairs("data/images/M000093215.PDF/page_1.png"))


[92m15:01:36 - LiteLLM:INFO[0m: utils.py:2850 - 
LiteLLM completion() model= gpt-4o; provider = openai
INFO: 
LiteLLM completion() model= gpt-4o; provider = openai
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m15:01:41 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler


- Borough: Manhattan
- Date: Nov 29 1986
- Zoning District: C6-6, 5
- 57th Street Wing Cellar Live Load: O.G. 
- 57th Street Wing Cellar Persons: 260 
- 57th Street Wing Cellar Cinema: 4 
- 57th Street Wing Cellar Offices: 20 
- 57th Street Wing Cellar: Elevator Machine Room, Gallery, Mechanical Room, Storage
- Cellar Mezzanine: Mechanical Room, Office, Locker Room, Elevator Machine Room, Storage
- 1st Floor Persons: 85 
- 1st Floor Units: 1,021 
- 1st Floor: Carnegie Concert Hall, Trap Room, Lobbies, Carnegie Hall Lounge and Bar
- 2nd Floor Persons: 85 
- 2nd Floor Units: 350 
- 2nd Floor: 1st Tier Boxes of Carnegie Concert Hall, Recital Hall, Lounge and Dress Rooms
- 3rd Floor Persons: 85 
- 3rd Floor Units 1: 248 
- 3rd Floor Units 2: 299 
- 3rd Floor: 2nd Tier Boxes of Carnegie Concert Hall, Recital Hall
- Balcony Persons: 85 
- Balcony: Recital Hall Balcony
- 4th Floor Persons: 85 
- 4th Floor Units 1: 430 
- 4th Floor Units 2: 16 
- 4th Floor: Dress Circle of Carnegie Hall, Loung

In [55]:
from litellm import embedding

def get_image_embedding(input_path: str):
    with open(input_path, "rb") as image_file:
        response = embedding(model="cohere/embed-english-v3.0", input=[image_to_base64(input_path)])

    return response.data[0]["embedding"]

print(get_image_embedding("data/images/M000093215.PDF/page_1.png"))


INFO: HTTP Request: POST https://api.cohere.ai/v1/embed "HTTP/1.1 200 OK"
[92m15:07:18 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler


[-0.0059547424, -0.024276733, -0.025756836, -0.066223145, 0.0031604767, -0.013214111, -0.03704834, -0.01184082, 0.018325806, 0.010139465, 0.020645142, -0.011619568, -0.03515625, -0.04901123, -0.016159058, -0.026428223, 0.010688782, -0.0014753342, 0.006134033, 0.011054993, -0.032928467, -0.0070343018, 0.023727417, 0.024871826, 0.031951904, -0.018753052, -0.015716553, 0.0020503998, -0.01133728, -0.016815186, -0.0017433167, 0.07147217, 0.038970947, 0.003698349, -0.0034313202, -0.023132324, -0.014259338, 0.017700195, 0.011024475, 0.01260376, -0.012382507, 0.013633728, -0.040618896, -0.00831604, -0.09515381, -0.023269653, -0.058746338, 0.006828308, 0.026031494, 0.062927246, 0.02746582, 0.022216797, -0.014801025, 0.13769531, -0.0340271, -0.0059661865, -0.043029785, -0.0069885254, -0.022750854, 0.075927734, 0.047424316, 0.011009216, -0.00440979, 0.0007534027, -0.03363037, 0.033477783, -0.0093307495, 0.022583008, 0.059326172, -0.027664185, -0.046081543, 0.02456665, -0.0009889603, 0.004802704, 

In [59]:
# Create Typesense schema
import typesense

typesense_client = typesense.Client(
    {
        "nodes": [
            {
                "host": "localhost",
                "port": "8108",
                "protocol": "http",
            }
        ],
        "api_key": "admin",
        "connection_timeout_seconds": 2,
    }
)

schema = {
    "name": "pdfs",
    "fields": [
        {"name": "source_filename", "type": "string"},
        {"name": "page_number", "type": "int32"},
        {"name": "content", "type": "string"},
        {"name": "embedding", "type": "float[]", "num_dim": 1024},
    ],
    "enable_nested_fields": True,
}

try:
    typesense_client.collections["pdfs"].delete()
except Exception as e:
    print(e)

typesense_client.collections.create(schema)


{'created_at': 1737835828,
 'default_sorting_field': '',
 'enable_nested_fields': True,
 'fields': [{'facet': False,
   'index': True,
   'infix': False,
   'locale': '',
   'name': 'source_filename',
   'optional': False,
   'sort': False,
   'stem': False,
   'store': True,
   'type': 'string'},
  {'facet': False,
   'index': True,
   'infix': False,
   'locale': '',
   'name': 'page_number',
   'optional': False,
   'sort': True,
   'stem': False,
   'store': True,
   'type': 'int32'},
  {'facet': False,
   'index': True,
   'infix': False,
   'locale': '',
   'name': 'content',
   'optional': False,
   'sort': False,
   'stem': False,
   'store': True,
   'type': 'string'},
  {'facet': False,
   'hnsw_params': {'M': 16, 'ef_construction': 200},
   'index': True,
   'infix': False,
   'locale': '',
   'name': 'embedding',
   'num_dim': 1024,
   'optional': False,
   'sort': False,
   'stem': False,
   'store': True,
   'type': 'float[]',
   'vec_dist': 'cosine'}],
 'name': 'pdfs',
 

In [69]:
# Insert key value pairs into Typesense

for file_id in ["M000093215.PDF", "M00093381B.PDF", "M000093520.PDF", "M00103924B.PDF"]:
# for file_id in ["M000093215.PDF"]:
    for file_name in glob.glob(f"data/images/{file_id}/*.png"):
        kv_pairs = extract_kv_pairs(file_name)
        page_number = file_name.split("/")[-1].split(".")[0].split("_")[1]

        response = typesense_client.collections["pdfs"].documents.import_(
            documents=[
                {
                    "source_filename": file_id,
                    "page_number": int(page_number),
                    "content": kv_pairs,
                    "embedding": get_image_embedding(file_name),
                }
            ]
        )

        for doc in response:
            print("success:", doc["success"])
            print("error:", doc.get("error", ""))

[92m15:18:07 - LiteLLM:INFO[0m: utils.py:2850 - 
LiteLLM completion() model= gpt-4o; provider = openai
INFO: 
LiteLLM completion() model= gpt-4o; provider = openai
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m15:18:15 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler
INFO: HTTP Request: POST https://api.cohere.ai/v1/embed "HTTP/1.1 200 OK"
[92m15:18:15 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler
[92m15:18:15 - LiteLLM:INFO[0m: utils.py:2850 - 
LiteLLM completion() model= gpt-4o; provider = openai
INFO: 
LiteLLM completion() model= gpt-4o; provider = openai


success: True
error: 


INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m15:18:26 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler
INFO: HTTP Request: POST https://api.cohere.ai/v1/embed "HTTP/1.1 200 OK"
[92m15:18:27 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler
[92m15:18:27 - LiteLLM:INFO[0m: utils.py:2850 - 
LiteLLM completion() model= gpt-4o; provider = openai
INFO: 
LiteLLM completion() model= gpt-4o; provider = openai


success: True
error: 


INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m15:18:32 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler
INFO: HTTP Request: POST https://api.cohere.ai/v1/embed "HTTP/1.1 200 OK"
[92m15:18:33 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler
[92m15:18:33 - LiteLLM:INFO[0m: utils.py:2850 - 
LiteLLM completion() model= gpt-4o; provider = openai
INFO: 
LiteLLM completion() model= gpt-4o; provider = openai


success: True
error: 


INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m15:18:38 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler
INFO: HTTP Request: POST https://api.cohere.ai/v1/embed "HTTP/1.1 200 OK"
[92m15:18:39 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler
[92m15:18:39 - LiteLLM:INFO[0m: utils.py:2850 - 
LiteLLM completion() model= gpt-4o; provider = openai
INFO: 
LiteLLM completion() model= gpt-4o; provider = openai


success: True
error: 


INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m15:18:43 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler
INFO: HTTP Request: POST https://api.cohere.ai/v1/embed "HTTP/1.1 200 OK"
[92m15:18:44 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler
[92m15:18:44 - LiteLLM:INFO[0m: utils.py:2850 - 
LiteLLM completion() model= gpt-4o; provider = openai
INFO: 
LiteLLM completion() model= gpt-4o; provider = openai


success: True
error: 


INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m15:18:49 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler
INFO: HTTP Request: POST https://api.cohere.ai/v1/embed "HTTP/1.1 200 OK"
[92m15:18:50 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler
[92m15:18:50 - LiteLLM:INFO[0m: utils.py:2850 - 
LiteLLM completion() model= gpt-4o; provider = openai
INFO: 
LiteLLM completion() model= gpt-4o; provider = openai


success: True
error: 


INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m15:18:55 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler
INFO: HTTP Request: POST https://api.cohere.ai/v1/embed "HTTP/1.1 200 OK"
[92m15:18:55 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler
[92m15:18:55 - LiteLLM:INFO[0m: utils.py:2850 - 
LiteLLM completion() model= gpt-4o; provider = openai
INFO: 
LiteLLM completion() model= gpt-4o; provider = openai


success: True
error: 


INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m15:19:00 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler
INFO: HTTP Request: POST https://api.cohere.ai/v1/embed "HTTP/1.1 200 OK"
[92m15:19:01 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler


success: True
error: 


In [71]:
typesense_client.collections["pdfs"].retrieve()

{'created_at': 1737835828,
 'default_sorting_field': '',
 'enable_nested_fields': True,
 'fields': [{'facet': False,
   'index': True,
   'infix': False,
   'locale': '',
   'name': 'source_filename',
   'optional': False,
   'sort': False,
   'stem': False,
   'store': True,
   'type': 'string'},
  {'facet': False,
   'index': True,
   'infix': False,
   'locale': '',
   'name': 'page_number',
   'optional': False,
   'sort': True,
   'stem': False,
   'store': True,
   'type': 'int32'},
  {'facet': False,
   'index': True,
   'infix': False,
   'locale': '',
   'name': 'content',
   'optional': False,
   'sort': False,
   'stem': False,
   'store': True,
   'type': 'string'},
  {'facet': False,
   'hnsw_params': {'M': 16, 'ef_construction': 200},
   'index': True,
   'infix': False,
   'locale': '',
   'name': 'embedding',
   'num_dim': 1024,
   'optional': False,
   'sort': False,
   'stem': False,
   'store': True,
   'type': 'float[]',
   'vec_dist': 'cosine'}],
 'name': 'pdfs',
 

In [76]:
query = "What borough is the property in 1986?"

def gather_query_context(query: str):
  embedding_str = ','.join(str(v) for v in embedding(
    model="cohere/embed-english-v3.0", 
    input=[query]
  ).data[0]["embedding"])

  searches = {
    "searches": [
      {
        "query_by": "content",
        "q": query,
        "exclude_fields": "embedding"
      }
    ]
  }

  search_parameters = {
    "collection": "pdfs",
    "vector_query": f"embedding:([{embedding_str}], alpha: 0.4, k: 4)",
    "per_page": 25,
  }

  results = typesense_client.multi_search.perform(searches, search_parameters)

  return results

gather_query_context(query)


INFO: HTTP Request: POST https://api.cohere.ai/v1/embed "HTTP/1.1 200 OK"
[92m15:32:25 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler


{'results': [{'facet_counts': [],
   'found': 4,
   'hits': [{'document': {'content': '- Borough: Manhattan\n- Date: Nov 29 1986\n- Zoning District: C6-6,5\n- 57th Street Wing Cellar, Live Load: O.G.\n- 57th Street Wing Cellar, Building Units: 260\n- 57th Street Wing Cellar, Description of Use: Cinema\n- 57th Street Wing Cellar, Description of Use: Offices\n- 57th Street Wing Cellar, Description of Use: Elevator Machine Room\n- 57th Street Wing Cellar, Description of Use: Gallery\n- 57th Street Wing Cellar, Description of Use: Mechanical Room\n- 57th Street Wing Cellar, Description of Use: Storage\n- 57th Street Wing Cellar Mezzanine, Building Units: 4\n- 57th Street Wing Cellar Mezzanine, Description of Use: Mechanical Room\n- 57th Street Wing Cellar Mezzanine, Description of Use: Office\n- 57th Street Wing Cellar Mezzanine, Description of Use: Locker Room, Elevator\n- 57th Street Wing Cellar Mezzanine, Description of Use: Machine Room, Storage\n- 57th Street Wing 1st Floor, Live Load

In [78]:
# Synthesize query using retrieved results
def synthesize_query(query: str):
    # Get context
    context = gather_query_context(query)
    base64_images = []
    
    # Get images from context
    for result in context["results"][0]["hits"]:
        image_base64 = image_to_base64(
            f"data/images/{result['document']['source_filename']}/page_{result['document']['page_number']}.png"
        )

        base64_images.append(image_base64)

    # Respond to query using images
    image_query_response = completion(
        model=model,
        # response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": "Attempt to answer the following query using the following images"},
            {"role": "user", "content": f"Query: {query}"},
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    } for base64_image in base64_images
                ]
            }
        ],
    )

    # return json.loads(image_query_response.choices[0].message.content)
    return image_query_response.choices[0].message.content

print(synthesize_query("What is the first floor used for?"))


INFO: HTTP Request: POST https://api.cohere.ai/v1/embed "HTTP/1.1 200 OK"
[92m15:39:58 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler
[92m15:39:58 - LiteLLM:INFO[0m: utils.py:2850 - 
LiteLLM completion() model= gpt-4o; provider = openai
INFO: 
LiteLLM completion() model= gpt-4o; provider = openai
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m15:40:02 - LiteLLM:INFO[0m: utils.py:1030 - Wrapper: Completed Call, calling success_handler
INFO: Wrapper: Completed Call, calling success_handler


The first floor is used for:

- Carnegie Concert Hall
- Trap Room
- Lobbies
- Carnegie Hall Lounge and Bar
