In [3]:
from opensearchpy import OpenSearch
import pandas as pd
import re
import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
import shutil
import random
from PIL import Image
from scipy.sparse import bsr_matrix
from scipy import sparse
import json
from dotenv import load_dotenv
from tqdm import tqdm


# Set up connection

In [4]:
load_dotenv(dotenv_path="../backend/.env")

OPENSEARCH_URL = os.getenv("OPENSEARCH_URL", "http://localhost:9200")
USERNAME = os.getenv("OPENSEARCH_USER", "admin")
PASSWORD = os.getenv("OPENSEARCH_PASS", "admin")

host = OPENSEARCH_URL.replace("https://", "").replace("http://", "").split(":")[0]
port = 443 if OPENSEARCH_URL.startswith("https") else 9200

client = OpenSearch(
    hosts=[{"host": host, "port": port}],
    http_auth=(USERNAME, PASSWORD),
    use_ssl=OPENSEARCH_URL.startswith("https"),
    verify_certs=False
)

print(f'running on {OPENSEARCH_URL}')

running on https://search-opensearch-cluster-wu5ju7kko4epwsbhzljedkegwm.aos.eu-north-1.on.aws


# Clean data

In [5]:
masks_folder = "data/npz/"
clip_folder = "data/npz_clip/"

df = pd.read_csv("data/omniart_CLIP_embeddings.csv")

### Extract masks and corresponding embeddings

In [None]:
def find_bounding_boxes(image, padding=0):
    boxes = []
    unique_labels = np.unique(image)

    for label in unique_labels:
        # Find the positions of all pixels with this label
        box = {}
        positions = np.where(image == label)
        y_coords, x_coords = positions

        # Determine the minimum and maximum coordinates
        min_y, min_x = y_coords.min(), x_coords.min()
        max_y, max_x = y_coords.max(), x_coords.max()

        # Calculate height and width and add padding
        width = int(max_x - min_x + 1 + 2 * padding)
        height = int(max_y - min_y + 1 + 2 * padding)
        min_x = int(max(min_x - padding, 0))
        min_y = int(max(min_y - padding, 0))

        centroid = (min_x + width // 2, min_y + height // 2)

        centroid_percentage = (centroid[0] / image.shape[1], centroid[1] / image.shape[0])

        bbox = (min_x,  min_x + width, min_y, min_y + height)

        box["mask_id"] = label
        box["region"] = bbox
        box["width"] = width
        box["height"] = height
        box["centroid"] = centroid_percentage
        binary = np.copy(image)
        binary[binary != label] = 0
        binary[binary == label] = 1
        box["data"] =  binary[bbox[0]: bbox[1], bbox[2]: bbox[3]]
        # box["data"] = binary[bbox[2]: bbox[3], bbox[0]: bbox[1]]
        boxes.append(box)

        # boxes[label] = (min_x, min_y, width, height)

    return boxes

In [7]:
# iteratev over df, print every row
masks = []
empty = []

for index, row in tqdm(df.iterrows(), desc="Processing painting", unit="painting", total=df.shape[0]):  
    try:
        omni_id = row['omni_id']

        # Reconstruct the 3D array of masks
        loaded_data = bsr_matrix(sparse.load_npz(os.path.join(masks_folder,f'{omni_id}_sparse_masks.npz')))

        segmented_img = loaded_data.toarray()
        
        # Calculate bbbox and centroid
        bounding_boxes = find_bounding_boxes(segmented_img)
        bounding_boxes = list(map(lambda x: {**x, "omni_id": omni_id}, bounding_boxes))


        # Get CLIP data
        clip_data = np.load(os.path.join(clip_folder,f'{omni_id}_embeddings.npz'))

        for mask in bounding_boxes:
            try:
                mask_id = mask["mask_id"]
                mask["CLIP_embeddings"] = clip_data[f'segment_{mask["mask_id"]}']
                masks.append(mask)
            except:
                empty.append(f'{omni_id}_{mask_id}')
    
    except :
        empty.append(omni_id)
        continue

mask_df = pd.DataFrame(masks)

Processing painting:   0%|          | 0/3000 [00:00<?, ?painting/s]

Processing painting: 100%|██████████| 3000/3000 [02:02<00:00, 24.54painting/s]


### Add Id column

In [8]:
mask_df["id"] = mask_df.index

# Upload data

In [10]:
INDEX_NAME = "masks"  

# Convert DataFrame to a list of dictionaries
df_dict = mask_df.to_dict(orient='records')

### Test the first entry

In [11]:
# # Upload test
# response = client.index(index=INDEX_NAME, body=df_dict[0])
# doc_id = response['_id']
# print(f"✅ Indexed mask: {doc_id}")

In [12]:
# # Delete test
# delete_response = client.delete(index=INDEX_NAME, id=doc_id)
# print(f"🗑️ Deleted mask: {delete_response['result']}")

### Iterate on all entries

In [None]:
# Index each painting
failed = []

for mask in tqdm(df_dict, desc="Indexing masks", unit="mask"):  
    try:
        response = client.index(index=INDEX_NAME, body=mask)
    except Exception as e:
        failed.append(mask)
        print(f"❌ Error indexing painting: {e}")

In [14]:
len(failed)

1

# Check Upload

In [15]:
response = client.search(
    index=INDEX_NAME,
    body={
        "query": { "match_all": {} },
        "size": 10  # limit number of docs returned
    }
)

for hit in response["hits"]["hits"]:
    print(json.dumps(hit["_source"], indent=2))



{
  "mask_id": 1,
  "region": [
    0,
    360,
    0,
    489
  ],
  "width": 360,
  "height": 489,
  "centroid": [
    0.5,
    0.49897750511247446
  ],
  "data": [
    [
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1,