In [135]:
import base64
import requests
from io import BytesIO
from PIL import Image
import os
import torch
from visual_bge.modeling import Visualized_BGE
from pymilvus import (
    utility,
    CollectionSchema, DataType, FieldSchema, model,
    connections, Collection, AnnSearchRequest, RRFRanker,
)
from tqdm import tqdm

In [136]:
# Connection settings are read from the environment so no credentials live in the notebook.
# os.getenv returns None for an unset variable, in which case None is handed to connect().
# NOTE(review): "ZILLIS" may be a misspelling of "Zilliz" — confirm these names match the
# variables actually exported in your environment.
ENDPOINT = os.getenv('ZILLIS_ENDPOINT')
TOKEN = os.getenv('ZILLIS_TOKEN')
connections.connect(uri=ENDPOINT, token=TOKEN)

# Name of the collection that the cells below drop, recreate, populate and search.
COLLECTION_NAME = "odprt_index"

In [137]:
### IDENTITY / PROVENANCE FIELDS

# Primary key; auto_id=True means no value is supplied for it in the inserted records.
AUTO_ID = FieldSchema(
    name="auto_id",
    dtype=DataType.INT64,
    is_primary=True,
    auto_id=True
)

# Caller-supplied document identifier (the ingestion cell stores the image path here).
DOC_ID = FieldSchema(
    name="doc_id",
    dtype=DataType.VARCHAR,
    max_length=500
)

# Where the document came from; declared with default_value "NA".
DOC_SOURCE = FieldSchema(
    name="doc_source",
    dtype=DataType.VARCHAR,
    max_length=1000,
    default_value="NA"
)

### TEXT FEATURES

# Raw text of the document; declared with an empty-string default.
TEXT = FieldSchema(
    name="text",
    dtype=DataType.VARCHAR,
    max_length=50000,
    default_value=""
)

# Dense text vector. The image-only records in this notebook fill it with 1024 zeros.
# NOTE(review): dim=1024 differs from the 768-dim fields below, so it presumably belongs
# to a different text encoder — confirm which model is meant to populate it.
TEXT_DENSE_EMBEDDING = FieldSchema(
    name="text_dense_embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=1024
)

# Sparse text vector (no dim is declared for this field).
TEXT_SPARSE_EMBEDDING = FieldSchema(
    name="text_sparse_embedding",
    dtype=DataType.SPARSE_FLOAT_VECTOR
)

### IMAGE FEATURES

# Text description of the image (the ingestion cell stores the generated summary here).
DESCRIPTION = FieldSchema(
    name="description",
    dtype=DataType.VARCHAR,
    max_length=5000,
    default_value=""
)

# Embedding of the description text.
# NOTE(review): dim=768 presumably matches the Visualized_BGE base model output — confirm.
DESCRIPTION_EMBEDDING = FieldSchema(
    name="description_embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=768
)

# Embedding of the image itself; same width as the description embedding.
IMAGE_EMBEDDING = FieldSchema(
    name="image_embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=768 # Image embedding dim
)

### DEFINING THE SCHEMA

# All fields above in one schema; enable_dynamic_field=True is set in addition to the
# declared fields.
SCHEMA = CollectionSchema(
    fields=[AUTO_ID, DOC_ID, DOC_SOURCE, TEXT, TEXT_DENSE_EMBEDDING, TEXT_SPARSE_EMBEDDING, DESCRIPTION, DESCRIPTION_EMBEDDING, IMAGE_EMBEDDING],
    description="Schema for indexing documents and images",
    enable_dynamic_field=True
)

In [138]:
def create_collection(collection_name, schema):
    """Return a handle to ``collection_name``, creating the collection when it is missing.

    Args:
        collection_name: Name of the collection to look up or create.
        schema: Schema used only when the collection has to be created; it is
            ignored when the collection already exists.

    Returns:
        A ``Collection`` object bound to ``collection_name``.
    """
    already_present = utility.has_collection(collection_name)
    if not already_present:
        # New collection: built from the given schema on the 'default' connection, 2 shards.
        return Collection(name=collection_name, schema=schema, using='default', shards_num=2)

    # Existing collection: report it and hand back a handle by name only.
    print(f"Collection '{collection_name}' already exists")
    return Collection(name=collection_name)

def drop_collection(collection_name):
    """Release and drop ``collection_name`` if it exists; otherwise only report its absence.

    Args:
        collection_name: Name of the collection to remove.

    Returns:
        None. A status line is printed in both cases.
    """
    if not utility.has_collection(collection_name):
        print(f"Collection '{collection_name}' does not exist")
        return

    # Release the collection first, then drop it.
    Collection(name=collection_name).release()
    utility.drop_collection(collection_name)
    print(f"Collection '{collection_name}' has been dropped")

# WARNING: destructive. Every run of this cell drops any existing collection with this
# name, and then creates it again from SCHEMA, so the notebook starts from an empty collection.
drop_collection(COLLECTION_NAME)
collection = create_collection(collection_name=COLLECTION_NAME, schema = SCHEMA)

Collection 'odprt_index' has been dropped


In [140]:
# Embedding Model
# Visualized_BGE is built from a BGE text backbone name plus a weights file; the cells
# below use it to encode both the image summaries and the images themselves.
# NOTE(review): model_weight is a relative path — the .pth file is expected in the
# working directory; confirm where it should be obtained from.
embedding_model = Visualized_BGE(model_name_bge="BAAI/bge-base-en-v1.5", model_weight="Visualized_base_en_v1.5.pth")
# Switch to eval mode; as the last expression of the cell this also displays the module repr.
embedding_model.eval()

Visualized_BGE(
  (bge_encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=T

In [148]:
# Sample images to index, addressed relative to the notebook's working directory.
image_paths = [
    f"sample_images/{file_name}"
    for file_name in ("34.png", "43.png", "46.png", "160.png", "archi.png")
]

def encode_image(img):
    """Serialise an image to PNG in memory and return it as base64 text.

    Args:
        img: Object exposing ``save(fp, format=...)``, e.g. an image opened with
            ``Image.open``.

    Returns:
        The PNG bytes, base64-encoded and decoded to a UTF-8 ``str``.
    """
    png_buffer = BytesIO()
    img.save(png_buffer, format="PNG")
    png_bytes = png_buffer.getvalue()
    return base64.b64encode(png_bytes).decode("utf-8")

# Initialise Hyperbolic API Details
# The key is read from the environment; if HYPERBOLIC_API_KEY is unset the header
# becomes "Bearer None" and the API is expected to reject the request.
api_key = os.getenv("HYPERBOLIC_API_KEY")
api = "https://api.hyperbolic.xyz/v1/chat/completions"
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}",
}

# List to store summaries, one per entry of image_paths and in the same order.
# It is rebuilt from scratch on every run so re-executing the cell never appends duplicates.
summaries = []

# Loop through each image path, load the image, encode it, and get the summary
for image_path in image_paths:
    # Load and encode the image; the context manager closes the underlying file
    # handle once the PNG bytes have been produced.
    with Image.open(image_path) as img:
        base64_img = encode_image(img)

    # Create the API payload
    payload = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Generate me a summary of this image."},
                    {
                        "type": "image_url",
                        # encode_image always produces PNG bytes, so the data URL
                        # must declare image/png (it previously claimed image/jpeg).
                        "image_url": {"url": f"data:image/png;base64,{base64_img}"},
                    },
                ],
            }
        ],
        "model": "Qwen/Qwen2-VL-7B-Instruct",
        "max_tokens": 300,  # Cap the summary length (intended to fit the BGE text encoder)
        "temperature": 0.7,
        "top_p": 0.9,
    }

    # Send the request to the API. Without a timeout a stalled connection would
    # block the notebook indefinitely.
    response = requests.post(api, headers=headers, json=payload, timeout=120)

    # Parse the body once and reuse it for both the debug print and the extraction.
    response_json = response.json()
    print(response_json)

    # Extract the summary from the response. `or [{}]` also covers an empty or null
    # "choices" value, which would otherwise raise IndexError/TypeError on [0].
    choices = response_json.get("choices") or [{}]
    summary = choices[0].get("message", {}).get("content", "No summary available")
    summaries.append(summary)

    # Print the summary for the current image
    print(f"Summary for {image_path}: {summary}")

{'id': 'chat-58c3912dbeb2452ab369575fe311f4ed', 'object': 'chat.completion', 'created': 1739176612, 'model': 'Qwen/Qwen2-VL-7B-Instruct', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'The image is a line graph titled "More voters say it \'really matters\' who wins the presidency than at any point in the last 20 years." The graph shows the percentage of registered voters who believe it really matters who wins the presidential election and those who believe things will be pretty much the same regardless of who is elected, from 2000 to 2020.\n\nKey points from the graph:\n- In 2020, 83% of registered voters believe it really matters who wins the presidential election.\n- In 2020, 16% of registered voters believe things will be pretty much the same regardless of who is elected.\n- There is a noticeable increase in the percentage of voters who believe it really matters who wins the presidential election from 2016 to 2020.\n- The percentage of voters who believe thing

In [149]:
def generate_embeddings(image_path, text_summary):
    """Encode one image, and its summary when one exists, with ``embedding_model``.

    Args:
        image_path: Path passed to ``embedding_model.encode(image=...)``.
        text_summary: Summary text. It is skipped when falsy or equal to the
            sentinel "No summary available".

    Returns:
        ``(description_embedding, image_embedding)``: the first row of each encoder
        output converted to a Python list. ``description_embedding`` is an empty
        list when the summary was skipped.
        NOTE(review): an empty list does not match the 768-dim schema field, so
        inserting such a record will presumably be rejected — confirm the desired
        fallback.
    """
    has_summary = bool(text_summary) and text_summary != "No summary available"

    # Inference only: no gradients are needed for either encoding.
    with torch.no_grad():
        if has_summary:
            description_embedding = embedding_model.encode(text=text_summary).tolist()[0]
        else:
            description_embedding = []

        image_embedding = embedding_model.encode(image=image_path).tolist()[0]

    return description_embedding, image_embedding

# Parallel lists: position i holds the embeddings for image_paths[i] / summaries[i].
# Both lists are rebuilt from scratch each time the cell runs.
description_embeddings = []
image_embeddings = []

# Generate embeddings
# summaries is indexed by position, so it must hold one entry per image path;
# a shorter list raises IndexError here.
for i, path in enumerate(image_paths):
    description_embedding, image_embedding = generate_embeddings(path, summaries[i])
    description_embeddings.append(description_embedding)
    image_embeddings.append(image_embedding)

In [152]:
# One record per image, keyed by the field names declared in the schema.
# The text-only fields carry placeholders because these records describe images.
data = [
    {
        "doc_id": image_paths[i],
        "doc_source": "sample_images",
        "text": "",
        "text_dense_embedding": [0.0] * 1024,  # Placeholder for text dense embedding
        "text_sparse_embedding": [],  # Empty sparse vector
        "description": summaries[i],
        "description_embedding": description_embeddings[i],  # Embedding of the generated summary (empty list if none was produced)
        "image_embedding": image_embeddings[i],  # Embedding of the image itself
    }
    for i in range(len(image_paths))
]


In [153]:
from tqdm import tqdm

def batch_ingestion(collection, data, batch_size=100):
    """Insert ``data`` into ``collection`` in consecutive slices, with a progress bar.

    Args:
        collection: Object exposing ``insert(batch)``.
        data: Sliceable sequence of records; an empty sequence results in no inserts.
        batch_size: Maximum number of records per ``insert`` call. Defaults to 100,
            the value that was previously hard-coded.

    Returns:
        None.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    total_elements = len(data)
    # Ceiling division so a final partial batch is counted in the progress total.
    total_batches = (total_elements + batch_size - 1) // batch_size

    for start in tqdm(range(0, total_elements, batch_size),
                      total=total_batches,
                      desc="Ingesting batches"):
        # Slicing past the end is safe, so the last batch is simply shorter.
        collection.insert(data[start:start + batch_size])


In [154]:
# Insert the image records built above into the freshly created collection.
batch_ingestion(collection=collection, data = data)

Ingesting batches: 100%|██████████| 1/1 [00:00<00:00,  1.71it/s]


In [155]:
def create_all_indexes(collection: Collection) -> None:
    """Create one index per vector field of ``collection`` and then load it.

    Indexes are created in this order: dense text, sparse text, description,
    image. A progress line is printed after each one and after the final load.

    Args:
        collection: Collection whose vector fields are indexed.

    Returns:
        None.
    """
    # (field_name, index_name, index_params, message printed once the index exists)
    index_specs = (
        (
            "text_dense_embedding",
            "dense_embeddings_index",
            {
                "metric_type": "COSINE",
                "index_type": "HNSW",
                "params": {"M": 5, "efConstruction": 512},
            },
            "Dense embeddings index created",
        ),
        (
            "text_sparse_embedding",
            "sparse_embeddings_index",
            {
                "metric_type": "IP",
                "index_type": "SPARSE_INVERTED_INDEX",
                "params": {"drop_ratio_build": 0.2},
            },
            "Sparse embeddings index created",
        ),
        (
            # No explicit build params are passed for the two 768-dim fields.
            "description_embedding",
            "description_embedding_index",
            {"metric_type": "COSINE", "index_type": "HNSW"},
            "description_embedding index created",
        ),
        (
            "image_embedding",
            "image_embedding_index",
            {"metric_type": "COSINE", "index_type": "HNSW"},
            "image_embedding index created",
        ),
    )

    for field_name, index_name, index_params, done_message in index_specs:
        collection.create_index(
            field_name=field_name,
            index_params=index_params,
            index_name=index_name,
        )
        print(done_message)

    # Load the collection once every index has been created.
    collection.load()
    print("Collection loaded")

In [156]:
# Build the vector indexes and load the collection so it can be searched.
create_all_indexes(collection)

Dense embeddings index created
Sparse embeddings index created
description_embedding index created
image_embedding index created
Collection loaded


In [159]:
def hybrid_search(query: str, limit: int = 3) -> str:
    """Search image descriptions and image embeddings for ``query`` and format the hits.

    The query text is embedded once with ``embedding_model``; the same vector is
    searched against both ``description_embedding`` and ``image_embedding``. The two
    result lists are fused with ``RRFRanker``.

    Args:
        query: Free-text search query.
        limit: Number of candidates requested per field and number of fused hits
            returned. Defaults to 3, the value that was previously hard-coded.

    Returns:
        One "Doc_id / Description" entry per hit, joined by blank lines. An empty
        string when there are no hits.
    """
    # Inference only, consistent with generate_embeddings.
    with torch.no_grad():
        query_embedding = embedding_model.encode(text=query).tolist()[0]

    # `ef` is the HNSW search-time parameter; M and efConstruction only apply when the
    # index is built, so passing them here had no effect on the search.
    # ef must not be smaller than the number of requested results.
    ef = max(64, limit)

    search_requests = [
        AnnSearchRequest(
            data=[query_embedding],  # query text embedding
            anns_field=vector_field,
            param={"metric_type": "COSINE", "params": {"ef": ef}},
            limit=limit,
        )
        # The same query vector is matched against the summary embedding and the image embedding.
        for vector_field in ("description_embedding", "image_embedding")
    ]

    search_results = collection.hybrid_search(
        reqs=search_requests,
        output_fields=['doc_id', 'description'],
        # using RRFRanker here for reranking
        rerank=RRFRanker(),
        limit=limit,
    )

    # A single query vector was sent, so only the first result list is relevant.
    hits = search_results[0]

    context = [
        f"Doc_id: {hit.doc_id} \n Description: {hit.description}"
        for hit in hits
    ]
    return "\n\n".join(context)

In [161]:
# Example query; as the last expression of the cell, the formatted context string is displayed.
hybrid_search("Covid-19")

'Doc_id: sample_images/160.png \n Description: The image presents a pie chart from the Pew Research Center that summarizes public opinion on the need for new COVID-19 aid and the urgency of Congress acting on this issue. The chart is based on a survey of the general public conducted from November 12 to 17, 2020.\n\nThe pie chart is divided into three main sections:\n\n1. **80% Necessary**: This section is the largest, indicating that 80% of the respondents believe another economic assistance package is necessary.\n2. **As soon as possible, by the current Congress and President Trump**: This section represents 68% of the respondents, suggesting that they want Congress to act on this issue as soon as possible, specifically under the current leadership.\n3. **After the presidential inauguration in January**: This section is the smallest, with 11% of the respondents suggesting that Congress should wait until after the presidential inauguration in January to consider another aid package.\n4