In [6]:

import pymongo
from pymongo.database import Database
from pymongo.operations import SearchIndexModel


MONGODB_CONNECTION_STRING = "mongodb://localhost:27017/?directConnection=true"
MONGODB_DB_NAME = "imagetalk"

# Connect to CosmosDB (MongoDB API)
client = pymongo.MongoClient(MONGODB_CONNECTION_STRING)
# Replace with your database name
db: Database = client[MONGODB_DB_NAME]

image_collection = db["images"]
processed_collection = db["processed"]

In [7]:
# initialize collections
image_collection.create_index("image_id", unique=True)
processed_collection.create_index("file", unique=True)

DIMENSIONS = 1536  # <-- set this to match your embedding size
vector_index = SearchIndexModel(
    name="embedding_vector_index",
    type="vectorSearch",
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",         # field that stores your embedding array
                "numDimensions": DIMENSIONS,  # must match len(embedding)
                "similarity": "cosine"       # or "euclidean" or "dotProduct"
            }
        ]
    },
)

# 3) Create the index
image_collection.create_search_index(model=vector_index)

ServerSelectionTimeoutError: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 68d595868cd0b8685d6084f7, topology_type: Single, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>

In [None]:
import glob
import pandas as pd

embedded_files = glob.glob(
    "../../../data/works_with_images_no_text_partitioned_embedded.parquet/*.parquet")

for file in embedded_files:
    # check if file has been processed
    if processed_collection.find_one({"file": file}):
        print(f"Skipping {file}")
        continue
    print(f"Indexing {file}")
    df = pd.read_parquet(file)[["image_id", "embedding"]]
    records = df.to_dict(orient="records")    
    image_collection.insert_many(records)
    # add file to processed collection
    processed_collection.insert_one({"file": file})
    break

Indexing ../../../data/works_with_images_no_text_partitioned_embedded.parquet/part-03817-44ca329b-08d0-4e7d-85ca-f4c7b3d59f87-c000.snappy.parquet
{'image_id': 'https://iiif.wellcomecollection.org/image/b3009446x_0790.jp2/full/604,1024/0/default.jpg', 'embedding': array([-0.05557 , -0.01134 ,  0.012115, ..., -0.02177 ,  0.02693 ,
        0.01979 ], shape=(1536,), dtype=float16)}
