### Import libraries

In [None]:
import os
from dotenv import load_dotenv
from util.load_config import load_config
from vector_store.weaviate import WeaviateCollectionManager
from parser.colqwen import Colqwen
from weaviate.classes.query import MetadataQuery
import base64
from io import BytesIO
from PIL import Image
from openai import OpenAI
from typing import List, Dict, Union

---

### Auxiliary Functions

In [None]:
def scale_image(image: Image.Image, new_height: int = 1024) -> Image.Image:
    """
    Scale an image to a new height while maintaining the aspect ratio.

    Args:
        image: Source image; only its ``size`` and ``resize`` are used.
        new_height: Target height in pixels. Must be positive.

    Returns:
        The result of ``image.resize`` with height ``new_height`` and a width
        proportional to the source aspect ratio (truncated to an integer,
        but never below one pixel).

    Raises:
        ValueError: If ``new_height`` is not positive or the source image has
            zero height.
    """
    if new_height <= 0:
        raise ValueError(f"new_height must be positive, got {new_height}")

    width, height = image.size
    if height == 0:
        raise ValueError("cannot scale an image with zero height")

    # Multiply before dividing so only one float rounding happens; dividing
    # first can land just below an exact integer and truncate one pixel short.
    # Clamp to 1 because truncation yields 0 for very narrow images and a
    # zero-sized target cannot be resized to.
    new_width = max(1, int(new_height * width / height))

    return image.resize((new_width, new_height))


def load_and_scale_image(image_base64: str, new_height: int = 1024) -> Image.Image:
    """
    Decode a base64-encoded image and resize it to ``new_height`` pixels tall.

    Decoding is delegated to ``decode_base64_to_image`` and resizing to
    ``scale_image``.
    """
    return scale_image(decode_base64_to_image(image_base64), new_height)

def decode_base64_to_image(base64_str: str) -> Image.Image:
    """
    Decode a base64 string and open it as a PIL image converted to RGB mode.
    """
    buffer = BytesIO(base64.b64decode(base64_str))
    decoded = Image.open(buffer)
    return decoded.convert("RGB")


def build_flexible_message_payload(
    base64_images: List[str],
    user_text: str,
    system_prompt: str = "You are an intelligent assistant that summarizes the visual content of images.",
    additional_user_content: Union[List[Dict[str, Union[str, Dict]]], None] = None,
    image_mime_type: str = "image/png",
) -> List[Dict[str, Union[str, List[Dict[str, Union[str, Dict]]]]]]:
    """
    Build a flexible multimodal message payload for GPT-4o.

    Args:
        base64_images: Base64-encoded images; each becomes an ``image_url``
            content part carrying a ``data:`` URL.
        user_text: Text placed after the images in the user message.
        system_prompt: Content of the system message.
        additional_user_content: Optional extra content parts appended after
            the text part. The list passed in is not modified.
        image_mime_type: MIME type written into every image data URL.
            Defaults to ``"image/png"``, the value that was previously
            hard-coded.

    Returns:
        A two-element list: the system message followed by the user message,
        whose ``content`` is the image parts, the text part, then any
        additional parts.
    """
    user_content: List[Dict[str, Union[str, Dict]]] = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:{image_mime_type};base64,{img}"},
        }
        for img in base64_images
    ]
    user_content.append({"type": "text", "text": user_text})

    if additional_user_content:
        user_content.extend(additional_user_content)

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]


def ask_gpt4o(
    open_ai_client,
    messages: List[Dict],
    model: str = "gpt-4o",
    temperature: float = 0.3,
) -> str:
    """
    Send chat messages to an OpenAI chat-completions model and return the reply.

    Args:
        open_ai_client: Client exposing ``chat.completions.create``.
        messages: Chat messages to send.
        model: Model name passed to the API. Defaults to ``"gpt-4o"``, the
            value that was previously hard-coded.
        temperature: Sampling temperature passed to the API. Defaults to
            ``0.3``, the value that was previously hard-coded.

    Returns:
        The ``content`` of the first choice's message.
        NOTE(review): the annotation promises ``str``; confirm the SDK never
        returns ``None`` content for these requests.
    """
    response = open_ai_client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    return response.choices[0].message.content

---

### Setup

In [None]:
# Load environment variables via python-dotenv, then read the OpenAI key.
# os.getenv returns None when OPENAI_APIKEY is not set.
load_dotenv()
open_ai_key = os.getenv("OPENAI_APIKEY")

# Create the OpenAI client that is later passed to ask_gpt4o
open_ai_client = OpenAI(api_key=open_ai_key)

# Load the project configuration from config.yaml
config = load_config('config.yaml')

# Connect to Weaviate and fetch the "colqwen" collection used for retrieval.
# NOTE: "your-ec2-public-ip" is a placeholder; replace it with the real host before running.
manager = WeaviateCollectionManager(config=config)
manager.connect(connection_type="ec2", host="your-ec2-public-ip")
collection = manager.get_collection("colqwen")

# Initialize the ColQwen wrapper used to embed query text.
# NOTE(review): device_map="gpu" is not a standard torch device string ("cuda");
# presumably the Colqwen wrapper translates it — confirm.
model = Colqwen(model_name="vidore/colqwen2-v1.0", device_map="gpu", attn_implementation="eager")

---

### Retrieval

In [None]:
# Example query for the retrieval demo; embedded with the ColQwen wrapper.
query_text = "What is the model architecture?"
query_embedding = model.multi_vectorize_text(query_text)

# Vector search on the "colqwen_vector" named vector: top 3 objects, with
# distance metadata requested so it can be printed alongside each hit.
image_response = collection.query.near_vector(
    # Assumes query_embedding is a torch tensor: move to CPU, cast to float,
    # and convert to plain Python lists for the client — TODO confirm.
    near_vector=query_embedding.cpu().float().numpy().tolist(),
    target_vector="colqwen_vector",
    limit=3,
    return_metadata=MetadataQuery(distance=True)
)

In [None]:
# Show every retrieved object (distance + scaled image) and collect the raw
# base64 strings so they can be sent to the summarizer afterwards.
print(f'For query: {query_text}')

returned_images = []
for hit in image_response.objects:
    print('Distance:', hit.metadata.distance)

    # Look the property up once and reuse it for display and collection.
    page_b64 = hit.properties.get('base64_image')
    display(load_and_scale_image(page_b64, new_height=1024))
    returned_images.append(page_b64)

print("##"*30)

---

### Summarizer with OpenAI

In [None]:
# System prompt for the summarizer: instructs the model to analyze the
# supplied images, relate them to the user's query, and answer factually.
system_prompt = """
You are a highly capable multimodal assistant designed to interpret and summarize the content of images with accuracy, clarity, and context-awareness. 

When given images and a related text query from the user, your goal is to:

- Carefully analyze the visual content in each image
- Extract key details such as objects, people, text, actions, scenes, or relationships
- Connect the visual content with the user's question
- Summarize the most important, relevant information without speculation or hallucination
- Be clear, concise, and informative — using bullet points or paragraphs depending on the context

If the user query is vague or open-ended, provide a general but insightful summary of the visual information. If the user query is specific, tailor your response directly to answering their question.

Always prioritize factual accuracy. If you are unsure about something in an image, state it with appropriate caution (e.g., "this appears to be...").

You may be shown multiple images at once. If so, compare or summarize them together if relevant.

Respond professionally and helpfully.
"""

In [None]:
# Combine the retrieved images, the user's query, and the system prompt
# into the chat payload expected by ask_gpt4o.
messages = build_flexible_message_payload(
    returned_images,
    query_text,
    system_prompt,
)

In [None]:
# Send the assembled messages to GPT-4o and print its reply
result = ask_gpt4o(open_ai_client, messages=messages)
print(result)

---

### Unrelated question

In [None]:
# Query for the unrelated-question demo; it is embedded and searched exactly
# like the first query.
query_text = "How do attention-based strategies improve the outcome of negotiations in business models?"

query_embedding = model.multi_vectorize_text(query_text)

# Vector search on the "colqwen_vector" named vector: top 3 objects, with
# distance metadata requested so it can be printed alongside each hit.
image_response = collection.query.near_vector(
    # Assumes query_embedding is a torch tensor: move to CPU, cast to float,
    # and convert to plain Python lists for the client — TODO confirm.
    near_vector=query_embedding.cpu().float().numpy().tolist(),
    target_vector="colqwen_vector",
    limit=3,
    return_metadata=MetadataQuery(distance=True)
)

In [None]:
# Show every retrieved object (distance + scaled image) and collect the raw
# base64 strings so they can be sent to the summarizer afterwards.
print(f'For query: {query_text}')

returned_images = []
for hit in image_response.objects:
    print('Distance:', hit.metadata.distance)

    # Look the property up once and reuse it for display and collection.
    page_b64 = hit.properties.get('base64_image')
    display(load_and_scale_image(page_b64, new_height=1024))
    returned_images.append(page_b64)

print("##"*30)

In [None]:
# System prompt for the unrelated-question demo: the summarizer instructions
# plus a section telling the model to flag images that do not match the query
# and to say it is answering from general knowledge instead.
system_prompt = """
You are a highly capable multimodal assistant designed to interpret and summarize the content of images with accuracy, clarity, and context-awareness. 

When given images and a related text query from the user, your goal is to:

- Carefully analyze the visual content in each image
- Extract key details such as objects, people, text, actions, scenes, or relationships
- Connect the visual content with the user's question
- Summarize the most important, relevant information without speculation or hallucination
- Be clear, concise, and informative — using bullet points or paragraphs depending on the context

If the user query is vague or open-ended, provide a general but insightful summary of the visual information. If the user query is specific, tailor your response directly to answering their question.

Always prioritize factual accuracy. If you are unsure about something in an image, state it with appropriate caution (e.g., "this appears to be...").

You may be shown multiple images at once. If so, compare or summarize them together if relevant.

If the images provided to you are not related to text query from the user, your goal is to:
- Say that the provided images are unrelated to the user query before answering
- Say that you will provide the best answer based on your general knowledge and not what was given to you

Respond professionally and helpfully.
"""

In [None]:
# Combine the retrieved images, the user's query, and the system prompt
# into the chat payload expected by ask_gpt4o.
messages = build_flexible_message_payload(
    returned_images,
    query_text,
    system_prompt,
)

In [None]:
# Send the assembled messages to GPT-4o and print its reply
result = ask_gpt4o(open_ai_client, messages=messages)
print(result)

---