## Testing weaviate_manager.py Module

This notebook exercises `weaviate_manager.py`, which handles the connection to the catalogues for both updates and search. For a full update example, see `update_catalogue_HM.py`.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from my_mirror_on_cloud import weaviate_manager as wm

In [None]:
import pandas as pd

In [None]:
import weaviate.classes.config as wc

### Simple connection


In [None]:
# Smoke test: use WeaviateManager as a context manager and print the
# collections it reports.
with wm.WeaviateManager() as weaviate:
    # The client handle is fetched for ad-hoc inspection; this cell does not use it further.
    client = weaviate.get_client()
    print(weaviate.list_collections())

## Testing Search with small Farfetch catalogue (description)

In [None]:
# Load the Farfetch results file (JSON Lines: one JSON record per line) and
# preview the first rows.
df = pd.read_json("../results/farfetch.jsonl", lines=True)
df.head()

In [None]:
# Keep only the rows whose "model" is qwen2.5vl:7b, restricted to the two
# columns stored in the test collection, with a fresh 0..n-1 index.
is_qwen_row = df["model"] == "qwen2.5vl:7b"
df_selected = df.loc[is_qwen_row, ["image_name", "description"]].reset_index(drop=True)
df_selected.head()

In [None]:
### Create the test collection (Snowflake vectorizer applied to the description text)

collection_name = "test_collection_farfetch"

# Both stored properties are plain text fields.
farfetch_properties = [
    wc.Property(name=field_name, data_type=wc.DataType.TEXT)
    for field_name in ("image_name", "description")
]

# Single named vector, computed from the "description" property only.
description_vector = wc.Configure.Vectors.text2vec_weaviate(
    name="main_vector",
    model="Snowflake/snowflake-arctic-embed-l-v2.0",
    source_properties=["description"],
)

with wm.WeaviateManager() as weaviate:
    # NOTE(review): force_creation=True presumably replaces an existing
    # collection of the same name -- confirm in weaviate_manager.
    client = weaviate.create_collection(
        collection_name=collection_name,
        properties=farfetch_properties,
        vector_config=[description_vector],
        force_creation=True,
    )

    # Sanity check: the collection is listed and exposes the expected properties.
    print(weaviate.list_collections())
    print(weaviate.get_properties_of_collection(collection_name))

In [None]:
### Format the data for batch inserting: one {"properties": {...}} dict per row

formated_data = [
    {"properties": row_dict}
    for row_dict in df_selected.to_dict(orient="records")
]
formated_data[:3]

In [None]:
### Fill the collection with the formatted records (batch_size=200, progress shown)

with wm.WeaviateManager() as weaviate:
    weaviate.batch_insert_objects_to_collection(
        collection_name=collection_name,
        objects_data=formated_data,
        batch_size=200,
        show_progress=True
    )

In [None]:
### Searching in the collection with a free-text query

# collection_name is set again so this cell can be run on its own.
collection_name = "test_collection_farfetch"
query_text = "red dress with flowers"
# `limit` is also read by the image-grid cell further down.
limit = 5

with wm.WeaviateManager() as weaviate:
    # `results` is kept as a notebook global: it is printed and plotted by the next cells.
    results = weaviate.search_by_text(
        collection_name=collection_name,
        query=query_text,
        limit=limit
    )
    

In [None]:
# Pretty-print the raw search response for inspection.
from pprint import pprint
pprint(results)

In [None]:
import math
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from pathlib import Path

# Display the images returned by the last search in a grid of `cols` columns.
relative_path = "../data/farfetch/images/"
images = [relative_path + item.properties["image_name"] for item in results.objects]

cols = 5
# Size the grid from the number of hits actually returned (at least one row so
# an empty result still renders an empty figure instead of failing).
rows = max(1, math.ceil(len(images) / cols))
# squeeze=False always returns a 2-D array of axes, so flatten() works for any grid shape.
fig, axes = plt.subplots(rows, cols, figsize=(2*cols, 2*rows), squeeze=False)
axes = axes.flatten()

for i, ax in enumerate(axes):
    # Axis decorations are hidden everywhere; cells beyond the last hit stay blank.
    ax.axis('off')
    if i >= len(images):
        continue
    img = mpimg.imread(str(images[i]))
    ax.imshow(img)
    ax.set_title(Path(images[i]).name, fontsize=8)

plt.tight_layout()
plt.show()


## Testing Search with small Farfetch catalogue (fashionclip)

In [None]:
# Name of the second test collection; the following cells create it with a
# self-provided "vector_fashionclip" vector and fill it.
collection_name = "test_collection_farfetch2"

In [None]:
# Vectors are supplied by the notebook (not computed by Weaviate) under the
# name "vector_fashionclip", indexed with HNSW using cosine distance.
fashionclip_vector = wc.Configure.Vectors.self_provided(
    name="vector_fashionclip",
    vector_index_config=wc.Configure.VectorIndex.hnsw(
        distance_metric=wc.VectorDistances.COSINE
    ),
)

with wm.WeaviateManager() as weaviate:
    # NOTE(review): force_creation=True presumably replaces an existing
    # collection of the same name -- confirm in weaviate_manager.
    client = weaviate.create_collection(
        collection_name=collection_name,
        properties=[wc.Property(name="image_name", data_type=wc.DataType.TEXT)],
        vector_config=[fashionclip_vector],
        force_creation=True,
    )

    # Sanity check: the collection is listed and exposes the expected properties.
    print(weaviate.list_collections())
    print(weaviate.get_properties_of_collection(collection_name))

In [None]:
# Get image vectors
from pathlib import Path
from my_mirror_on_cloud import embedding_manager as em

# Sort the paths: glob order depends on the filesystem, sorting keeps runs reproducible.
image_list = sorted(str(p) for p in Path('../data/farfetch/images').glob('*.jpg'))
if not image_list:
    # Fail with a clear message rather than an IndexError on image_list[0] below.
    raise FileNotFoundError("No .jpg images found in ../data/farfetch/images")
print(f"Found {len(image_list)} images, ", image_list[0])

vectors = em.vectorize_images(
    image_list, model_name="fashion-clip", batch_size=100, use_float16=True,
)

# One object per image: the file name as property, and the embedding stored
# under "vector_fashionclip" (the vector name configured on the collection).
# NOTE(review): assumes vectorize_images returns one result per input path, in
# the same order -- zip() would silently truncate otherwise.
formated_data = [
    {
        "properties": {
            "image_name": Path(img).name,
        },
        "vectors": {
            "vector_fashionclip": vector["embedding"]
        }
    }
    for img, vector in zip(image_list, vectors)
]

In [None]:
### Fill the collection with the image objects and their fashion-clip vectors

with wm.WeaviateManager() as weaviate:
    weaviate.batch_insert_objects_to_collection(
        collection_name=collection_name,
        objects_data=formated_data,
        batch_size=200,
        show_progress=True
    )

In [None]:
## Load one image from H&M for search, and display it

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Query image; its path is reused by the vector-search cell below.
image1 =  "../data/h-and-m-personalized-fashion-recommendations/images/093/0930409001.jpg"

# A 1x1 grid returns a single Axes object (not an array), hence no flatten() here.
fig, axes = plt.subplots(1, 1, figsize=(3, 3))

img = mpimg.imread(image1)
axes.imshow(img)
axes.axis('off')  

plt.tight_layout()
plt.show()

In [None]:
# Embed the query image with the same "fashion-clip" model used to fill the
# collection; the first (only) result's "embedding" entry is the query vector.
query_vector = em.vectorize_images([image1], model_name="fashion-clip")[0]["embedding"]

with wm.WeaviateManager() as weaviate:
    # target_vector selects the named vector configured on the collection.
    results = weaviate.search_by_vector(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=5,
        target_vector="vector_fashionclip"
    )

In [None]:
## Try the same search with a fashion-clip vector generated from text

# Embed the text query with fashion-clip so it can be compared to the stored image vectors.
query_text_vector = em.vectorize_texts(
    ["red dress with flowers"], model_name="fashion-clip"
)[0]["embedding"]

with wm.WeaviateManager() as weaviate:
    results = weaviate.search_by_vector(
        collection_name=collection_name,
        query_vector=query_text_vector,
        limit=5,
        target_vector="vector_fashionclip",
        certainty=0.1 # had to lower certainty to get any results (default is 0.7)
    )

In [None]:
# Display the objects returned by the last search.
results.objects

In [None]:
import math

# Display the images returned by the last vector search in a grid of `cols`
# columns. mpimg, plt and Path come from earlier cells of this notebook.
relative_path = "../data/farfetch/images/"
images = [relative_path + item.properties["image_name"] for item in results.objects]

cols = 5
# Size the grid from the number of hits actually returned rather than from a
# `limit` global set by an unrelated cell (at least one row so an empty result
# still renders an empty figure instead of failing).
rows = max(1, math.ceil(len(images) / cols))
# squeeze=False always returns a 2-D array of axes, so flatten() works for any grid shape.
fig, axes = plt.subplots(rows, cols, figsize=(2*cols, 2*rows), squeeze=False)
axes = axes.flatten()

for i, ax in enumerate(axes):
    # Axis decorations are hidden everywhere; cells beyond the last hit stay blank.
    ax.axis('off')
    if i >= len(images):
        continue
    img = mpimg.imread(str(images[i]))
    ax.imshow(img)
    ax.set_title(Path(images[i]).name, fontsize=8)

plt.tight_layout()
plt.show()

## Full Detailed Example with H&M catalogue (but the script is better)

In [None]:
import my_mirror_on_cloud.vector_store as vs

In [None]:
import pandas as pd

In [None]:
# Local catalogue store backed by the file at ../data/catalogue_v1.db (relative path).
CatalogStore = vs.LocalCatalogStore(db_path="../data/catalogue_v1.db")


In [None]:
# Display what get_all_columns() reports for the store.
CatalogStore.get_all_columns()

In [None]:
# Fetch the image records from the store and preview the first two.
Catalog = CatalogStore.get_all_images()
Catalog[0:2]

In [None]:
# Tabular view of the catalogue records; the cells below read its
# "image_path" and "embeddings" columns.
df = pd.DataFrame(Catalog)
df.head()

In [None]:
from weaviate.util import generate_uuid5

# Derive each object's UUID from its image path (generate_uuid5 is applied to
# every path, one at a time).
image_paths_column = df["image_path"]
df["uuid"] = image_paths_column.apply(generate_uuid5)
df.head()

In [None]:
from tqdm.notebook import tqdm

In [None]:
all_data = []

# Flatten every catalogue row into a single dict: the uuid, the image path and,
# for each embedding attached to the row, its vector / timestamp / confidence
# under keys suffixed with the model name.
# total= gives tqdm a length so it shows a real progress bar (itertuples() has no len()).
for row in tqdm(df.itertuples(), total=len(df)):
    item_data = {
        "uuid": row.uuid,
        "image_path": row.image_path,
    }
    for embedding in row.embeddings:
        # Hyphens are stripped from the model name once, so all three keys share
        # the same suffix (e.g. "fashion-clip" -> "vector_fashionclip").
        model = embedding['model_name'].replace("-", "")
        item_data[f"vector_{model}"] = embedding['embedding']
        item_data[f"timestamp_{model}"] = embedding['timestamp']
        item_data[f"confidence_{model}"] = embedding['confidence']
    # TODO: row.tags are not exported yet; when they are, use key names that do
    # not collide with the timestamp_/confidence_ keys written above.
    all_data.append(item_data)


In [None]:
from itertools import chain

def get_unique_keys(list_of_dicts):
    """Return every distinct key found in a list of dictionaries.

    Keys come back in first-seen order, so the result is the same from one
    run to the next (going through a ``set`` would order string keys
    differently per interpreter session because of hash randomisation).

    Args:
        list_of_dicts: Iterable of dictionaries to scan.

    Returns:
        A list holding each key exactly once; empty when the input is empty.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    all_keys = chain.from_iterable(d.keys() for d in list_of_dicts)
    return list(dict.fromkeys(all_keys))

unique_keys = get_unique_keys(all_data)

# Split the keys into three groups in a single pass: the uuid, the vectors
# (prefix "vector_") and everything else (stored as plain properties).
vector_keys, nonvector_keys, uuid_keys = [], [], []
for name in unique_keys:
    if name == "uuid":
        uuid_keys.append(name)
    elif name.startswith("vector_"):
        vector_keys.append(name.replace('-', ''))
    else:
        nonvector_keys.append(name.replace('-', ''))

print("Vector keys:", vector_keys)
print("Non-vector keys:", nonvector_keys)
print("UUID keys:", uuid_keys)

In [None]:
# Reshape every flat record into {"vector": {...}, "properties": {...}, "uuid": ...},
# dropping entries that are missing or None.
# NOTE(review): the Farfetch fashion-clip cell stores its vectors under the key
# "vectors" whereas this one uses "vector" -- confirm which key
# batch_insert_objects_to_collection expects.
formated_data = []
for record in all_data:
    # Only the entries that carry an actual value.
    present = {name: value for name, value in record.items() if value is not None}
    entry = {
        "vector": {name: present[name] for name in vector_keys if name in present},
        "properties": {name: present[name] for name in nonvector_keys if name in present},
        "uuid": None,
    }
    for name in uuid_keys:
        if name in present:
            entry["uuid"] = present[name]
    formated_data.append(entry)

In [None]:
# Inspect the first formatted object to check its layout before inserting.
formated_data[0]

In [None]:
# One self-provided named vector per vector key found in the catalogue data,
# each indexed with HNSW using cosine distance.
hm_vector_config = [
    wc.Configure.Vectors.self_provided(
        name=vector_name,
        vector_index_config=wc.Configure.VectorIndex.hnsw(
            distance_metric=wc.VectorDistances.COSINE
        ),
    )
    for vector_name in vector_keys
]

# NOTE(review): only the fashionclip timestamp/confidence properties are
# declared here; nonvector_keys may hold the same fields for other models --
# confirm how undeclared properties are handled on insert.
hm_properties = [
    wc.Property(name="image_path", data_type=wc.DataType.TEXT),
    wc.Property(name="timestamp_fashionclip", data_type=wc.DataType.TEXT),
    wc.Property(name="confidence_fashionclip", data_type=wc.DataType.NUMBER),
]

# Create the collection and fill it within the same managed connection.
with wm.WeaviateManager() as weaviate:
    client = weaviate.create_collection(
        collection_name="Catalogue_HM",
        force_creation=True,
        properties=hm_properties,
        vector_config=hm_vector_config,
    )
    weaviate.batch_insert_objects_to_collection(
        collection_name="Catalogue_HM",
        objects_data=formated_data,
        batch_size=50,
        show_progress=True
    )