In [1]:

import os

import pymongo
from pymongo.database import Database
from pymongo.operations import SearchIndexModel


# Connection settings.
# The connection string embeds credentials, so it is taken from the
# MONGODB_CONNECTION_STRING environment variable when that is set; the literal
# below is only the fallback used when the variable is absent.
MONGODB_CONNECTION_STRING = os.environ.get(
    "MONGODB_CONNECTION_STRING",
    "mongodb://user:pass@localhost:27017/?directConnection=true",
)
# Database name, overridable through the MONGODB_DB_NAME environment variable.
MONGODB_DB_NAME = os.environ.get("MONGODB_DB_NAME", "imagetalk")
DIMENSIONS = 1536  # <-- set this to match your embedding size


# Connect to the MongoDB-compatible deployment.
# NOTE(review): the original comment said "CosmosDB (MongoDB API)", but the
# fallback URI targets localhost -- confirm which deployment is intended.
client = pymongo.MongoClient(MONGODB_CONNECTION_STRING)
db: Database = client[MONGODB_DB_NAME]

# "images" holds one document per image_id with its embedding;
# "processed" records which parquet files have already been loaded.
image_collection = db["images"]
processed_collection = db["processed"]

In [None]:
# initialize collections

# UNCOMMENT the following lines when starting from a fresh (empty) database:
# they create the unique indexes and the vector search index.


# image_collection.create_index("image_id", unique=True)
# processed_collection.create_index("file", unique=True)

# vector_index = SearchIndexModel(
#     name="embedding_vector_index",
#     type="vectorSearch",
#     definition={
#         "fields": [
#             {
#                 "type": "vector",
#                 "path": "embedding",         # field that stores your embedding array
#                 "numDimensions": DIMENSIONS,  # must match len(embedding)
#                 "similarity": "cosine"       # or "euclidean" or "dotProduct"
#             }
#         ]
#     },
# )

# # Create the vector search index on the images collection
# image_collection.create_search_index(model=vector_index)

'embedding_vector_index'

In [3]:
import glob
import pandas as pd
import numpy as np
import tqdm
from pymongo import UpdateOne
from pymongo.errors import BulkWriteError

# Every embedded parquet partition to load. The list is sorted so that files
# are processed in the same order on every run (glob's own ordering depends on
# the file system).
embedded_files = sorted(
    glob.glob(
        "../../../data/works_with_images_no_text_partitioned_embedded.parquet/*.parquet"
    )
)

BATCH_SIZE = 1000          # upserts sent per bulk_write; tune for memory / speed
PROGRESS_EVERY = 5000       # log every N attempted upserts


def normalise_embedding(e, dimensions=None):
    """Convert one embedding to a plain ``list[float]``, or ``None`` if unusable.

    Args:
        e: The raw embedding: ``None``, a list/tuple of numbers, or anything
            ``np.asarray`` can turn into a float32 array.
        dimensions: Required vector length. Defaults to the notebook-level
            ``DIMENSIONS`` constant when omitted.

    Returns:
        A list of Python floats of length ``dimensions``. List/tuple inputs are
        converted element by element with ``float``; other inputs go through a
        float32 array first. Returns ``None`` when the input is ``None``, has
        the wrong length/shape, or cannot be converted to floats.
    """
    if dimensions is None:
        dimensions = DIMENSIONS
    if e is None:
        return None
    if isinstance(e, (list, tuple)):
        if len(e) != dimensions:
            return None
        try:
            return [float(x) for x in e]
        except (TypeError, ValueError, OverflowError):
            # A non-numeric element (None, nested list, bad string, ...) makes
            # the whole vector unusable; report it as None rather than raising
            # so a single bad row does not abort the DataFrame.apply call.
            return None
    try:
        arr = np.asarray(e, dtype=np.float32)
    except Exception:
        return None
    # Rejects wrong lengths as well as scalars and multi-dimensional input.
    if arr.shape != (dimensions,):
        return None
    return arr.tolist()

DUPLICATE_KEY_ERROR_CODE = 11000  # MongoDB write-error code for a duplicate key


def _bulk_upsert(collection, ops):
    """Send one unordered batch of upserts and report the outcome.

    Args:
        collection: Collection the batch is written to.
        ops: Non-empty list of ``UpdateOne`` operations.

    Returns:
        A ``(upserted, duplicates)`` tuple: the number of documents newly
        inserted and the number of operations rejected with a duplicate-key
        error (which are deliberately ignored).

    Raises:
        BulkWriteError: Re-raised when the batch failed for any reason other
            than duplicate keys, so the caller never records a file as
            processed after a partial load.
    """
    try:
        result = collection.bulk_write(ops, ordered=False)
    except BulkWriteError as bwe:
        details = bwe.details
        write_errors = details.get("writeErrors", [])
        duplicates = sum(
            1 for we in write_errors if we.get("code") == DUPLICATE_KEY_ERROR_CODE
        )
        if duplicates != len(write_errors) or details.get("writeConcernErrors"):
            raise
        # Upserts that succeeded in the same batch still count.
        return details.get("nUpserted", 0), duplicates
    return result.upserted_count, 0


# Load every embedded parquet file into the images collection. Files that are
# already recorded in the processed collection are skipped, so the cell can be
# re-run to resume after an interruption.
for file in tqdm.tqdm(embedded_files):
    if processed_collection.find_one({"file": file}):
        # Already successfully processed earlier
        continue

    df = pd.read_parquet(file)[["image_id", "embedding"]]

    # Drop null embeddings & duplicates within the file to reduce work
    before = len(df)
    df = df[df["embedding"].notnull()].drop_duplicates(subset=["image_id"]).copy()

    # Normalise embeddings; rows whose embedding cannot be normalised are dropped
    df["embedding"] = df["embedding"].apply(normalise_embedding)
    df = df[df["embedding"].notnull()]
    after = len(df)
    if after == 0:
        # Nothing usable in this file; record it so it is not re-read next run
        processed_collection.insert_one({"file": file})
        continue

    ops = []
    # Per-file counters, kept for inspection after the cell has run
    attempted = 0
    upserted_estimate = 0
    duplicate_conflicts = 0

    for rec in df.to_dict(orient="records"):
        # $setOnInsert + upsert: insert the record only if its image_id is new,
        # leaving any existing document untouched
        ops.append(
            UpdateOne(
                {"image_id": rec["image_id"]},
                {"$setOnInsert": rec},
                upsert=True,
            )
        )
        if len(ops) >= BATCH_SIZE:
            upserted, duplicates = _bulk_upsert(image_collection, ops)
            upserted_estimate += upserted
            duplicate_conflicts += duplicates
            attempted += len(ops)
            ops = []

    # Flush remaining
    if ops:
        upserted, duplicates = _bulk_upsert(image_collection, ops)
        upserted_estimate += upserted
        duplicate_conflicts += duplicates
        attempted += len(ops)

    # Only reached when every batch was written (or hit duplicates only)
    processed_collection.insert_one({"file": file})

  0%|          | 0/17707 [00:00<?, ?it/s]

100%|██████████| 17707/17707 [55:56<00:00,  5.27it/s]  


In [5]:

# Query embedding: its length must match the index dimensions, so it is built
# from DIMENSIONS rather than a separate hard-coded number.
# <-- replace with a real embedding from your model
query_vector = [0.5] * DIMENSIONS

# Vector search aggregation: find the nearest stored embeddings, then keep
# only the image_id and the similarity score of each hit.
pipeline = [
    {
        "$vectorSearch": {
            "index": "embedding_vector_index",           # name of your search index
            "path": "embedding",          # field where embeddings are stored
            "queryVector": query_vector,  # your query vector
            "numCandidates": 200,         # how many ANN candidates to consider
            "limit": 5                    # top k results
        }
    },
    {
        "$project": {
            "image_id": 1,
            "score": {"$meta": "vectorSearchScore"}
        }
    }
]

results = list(image_collection.aggregate(pipeline))

for r in results:
    print(r)

{'_id': ObjectId('68da64cb9b5d3a5ad80a92c5'), 'image_id': 'https://iiif.wellcomecollection.org/image/b31650594_0160.jp2/full/718,1024/0/default.jpg', 'score': 0.5447544455528259}
{'_id': ObjectId('68da63499b5d3a5ad8025e0d'), 'image_id': 'https://iiif.wellcomecollection.org/image/b2811680x_0369.jp2/full/587,1024/0/default.jpg', 'score': 0.5430546998977661}
{'_id': ObjectId('68d5d2e54e032aa3a270d853'), 'image_id': 'https://iiif.wellcomecollection.org/image/b18024439_0877.JP2/full/718,1024/0/default.jpg', 'score': 0.542113721370697}
{'_id': ObjectId('68da66e79b5d3a5ad814e0e7'), 'image_id': 'https://iiif.wellcomecollection.org/image/b31660575_0097.jp2/full/614,1024/0/default.jpg', 'score': 0.5397946834564209}
{'_id': ObjectId('68da64679b5d3a5ad8088ed3'), 'image_id': 'https://iiif.wellcomecollection.org/image/b31809972_0008.jp2/full/660,1024/0/default.jpg', 'score': 0.538773775100708}
