## Testing vector_store.py module

Helpers for handling the local SQLite database and Weaviate collections.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import my_mirror_on_cloud.vector_store as vs
import my_mirror_on_cloud.text_generation as tg

In [None]:
# Create the local catalogue store on the database file ../data/catalogue_v1.db
# (the path is relative to the notebook's working directory).
store = vs.LocalCatalogStore(db_path="../data/catalogue_v1.db")
# Bare expression: the notebook shows the store's repr as the cell output.
store

In [None]:
# Sample catalogue image reused by the single-image walkthrough in the cells below.
image1 = "../data/h-and-m-personalized-fashion-recommendations/images/093/0930409001.jpg"

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Preview the sample on a single small (3 x 3 inch) axes with the axis decorations hidden.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(3, 3))

img = mpimg.imread(image1)
axes.imshow(img)
axes.axis("off")

plt.tight_layout()
plt.show()

In [None]:
from pprint import pprint

# Analyse the sample image with the qwen2.5vl:7b model. The keyword settings are
# collected in one mapping so they can be inspected or tweaked before the call.
analysis_options = {
    "model_name": "qwen2.5vl:7b",
    "prompt_type": "description_only",
    "max_width": 256,
}
description = tg.analyze_clothing_image(image1, **analysis_options)
pprint(description)

In [None]:
# Derive the tags and the processing status from the analysis result computed above.
tags=tg.get_tags_from_analysis(analysis=description)

processing_status = tg.get_processing_status(analysis=description)

# Bare tuple expression: shown as the cell output for a quick visual check.
processing_status, tags

In [None]:
# Store the sample image with its tags and processing status; no embeddings are supplied.
# NOTE(review): the return value is presumably the stored record's identifier — confirm in vector_store.
image_id = store.insert_image(image1, tags=tags, processing=processing_status, embeddings=None)

In [None]:
# Close the store; later cells construct a new LocalCatalogStore on the same database file.
store.close()

## Analyzing the full catalogue!



In [None]:
# Collect every *.jpg file below the H&M images directory (recursive search).
from pathlib import Path

image_paths = Path('../data/h-and-m-personalized-fashion-recommendations/images').rglob('*.jpg')
image_list = list(image_paths)
# Guard the preview: indexing an empty list raises IndexError, which would hide the
# real problem (wrong working directory or missing dataset) behind a traceback.
first_image = image_list[0] if image_list else None
print(f"Found {len(image_list)} images, {first_image}")

In [None]:
# Construct a new store on the catalogue database for the full-catalogue run
# (the handle used for the single-image walkthrough was closed earlier).
store = vs.LocalCatalogStore(db_path="../data/catalogue_v1.db")

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Analyse every catalogue image with the model below and store the resulting tags.
# Images the store already reports as processed by this model are skipped, so the
# cell can be re-run without repeating finished work.
model_name = "qwen2.5vl:7b"
prompt_type = "description_only"

for img_path in tqdm(image_list):
    img_path_str = str(img_path)

    if store.is_model_processed(img_path_str, model_name):
        continue

    description = tg.analyze_clothing_image(
        img_path_str,
        model_name=model_name,
        max_width=256,
        prompt_type=prompt_type,
    )
    tags = tg.get_tags_from_analysis(analysis=description)
    processing_status = tg.get_processing_status(analysis=description)
    store.insert_image(img_path_str, tags=tags, processing=processing_status, embeddings=None)

    # Query the catalogue size once and reuse it for both the check and the message.
    stored_total = len(store.get_all_images())
    if stored_total % 100 == 0:
        print(f"Processed {stored_total} images so far.")



In [None]:
# Show the most recently computed processing status (left over from the last analysed image).
print(processing_status)

## Tags from main catalogue

In [None]:
import pandas as pd

In [None]:
# Collect every *.jpg file below the H&M images directory (recursive search).
from pathlib import Path

image_paths = Path('../data/h-and-m-personalized-fashion-recommendations/images').rglob('*.jpg')
image_list = list(image_paths)
# Guard the preview: indexing an empty list raises IndexError, which would hide the
# real problem (wrong working directory or missing dataset) behind a traceback.
first_image = image_list[0] if image_list else None
print(f"Found {len(image_list)} images, {first_image}")

In [None]:
# Load the article metadata table from articles.csv and preview its first rows.
df = pd.read_csv("../data/h-and-m-personalized-fashion-recommendations/articles.csv")
df.head()

In [None]:
# Inspect the table's dimensions and column labels.
df.shape, df.columns

In [None]:
# Inspect the index_name column.
df['index_name']

In [None]:
# List the distinct product type names (used later as the "product_type" tag).
df["product_type_name"].unique()

In [None]:
# List the distinct colour group names (used later as the "colour" tag).
df["colour_group_name"].unique()

In [None]:
# List the distinct index group names; these are the values translated into "genre" below.
df["index_group_name"].unique()

In [None]:
# Translate each index group into the coarser audience label stored in "genre".
# Series.map leaves NaN for any group that is not listed in the table.
index_group_to_genre = {
    "Menswear": "Male",
    "Ladieswear": "Female",
    "Baby/Children": "Child",
    "Divided": "All",
    "Sport": "All",
}
df["genre"] = df["index_group_name"].map(index_group_to_genre)

In [None]:
from datetime import datetime, timezone
from tqdm.notebook import tqdm

In [None]:
# Build the string id that is compared against image file stems: the article id
# rendered as text with a "0" prepended.
df["id"] = "0" + df["article_id"].astype(str)
df["id"]

In [None]:
# Store the catalogue's own metadata (from the articles table) as tags for every
# image, recorded under the pseudo-model name "original".
store = vs.LocalCatalogStore(db_path="../data/catalogue_v1.db")
model_name = "original"
# NOTE(review): force_update is not consulted below — the insert always passes
# force_update=True, so existing entries are overwritten on every run.
force_update = False

# Build the id -> attributes lookup once instead of filtering the whole DataFrame
# several times per image. drop_duplicates keeps the first row per id, matching
# the "first match wins" behaviour of a boolean filter followed by .values[0].
article_lookup = (
    df.drop_duplicates(subset="id")
    .set_index("id")[["product_type_name", "colour_group_name", "genre"]]
    .to_dict(orient="index")
)

# Iterate the list directly (not an enumerate object) so tqdm knows the total.
for img_path in tqdm(image_list):
    article = article_lookup.get(img_path.stem)
    if article is None:
        # No metadata row for this image: skip it rather than storing tags that
        # belong to a previously processed image.
        continue

    img_path_str = str(img_path)
    tags = {
        "model_name": model_name,
        "confidence": 1.0,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "product_type": article["product_type_name"],
        "colour": article["colour_group_name"],
        "gender": article["genre"],
    }
    processing_status = {model_name: True}
    store.insert_image(img_path_str, tags=tags, processing=processing_status, embeddings=None, force_update=True)

## Checking the catalogue

In [None]:
# Construct a store on the catalogue database to inspect what has been written so far.
store = vs.LocalCatalogStore(db_path="../data/catalogue_v1.db")

In [None]:
# Fetch every stored image record and load the records into a DataFrame for inspection.
Catalog = store.get_all_images()
df_cat = pd.DataFrame(Catalog)


In [None]:
from collections import Counter
from itertools import chain

def count_key_occurrences(list_of_dicts):
    """Tally how often each key occurs across a collection of dictionaries.

    Args:
        list_of_dicts: Iterable of dictionaries whose keys are counted.

    Returns:
        A Counter mapping each key to the number of dictionaries that contain it.
    """
    tally = Counter()
    for mapping in list_of_dicts:
        tally.update(mapping.keys())
    return tally

def count_key_occurrences_nested(list_of_list_of_dicts):
    """Tally key occurrences across a list of lists of dictionaries.

    Args:
        list_of_list_of_dicts: Iterable whose items are themselves iterables
            of dictionaries.

    Returns:
        A Counter mapping each key to the number of inner dictionaries that
        contain it, summed over all sublists.
    """
    return Counter(
        key
        for group in list_of_list_of_dicts
        for mapping in group
        for key in mapping.keys()
    )

# Count the keys of the per-image processing-status dictionaries.
key_counts = count_key_occurrences(df_cat["processing_status"].to_list())
print(key_counts)
# Count the keys used across the per-image lists of tag dictionaries
# (key_counts is reused, so only this second result remains bound afterwards).
key_counts = count_key_occurrences_nested(df_cat["tags"].to_list())
print(key_counts)
# Illustrative output shape only (placeholder values): Counter({'model1': 5, 'model2': 3, 'model3': 2, ...})