## Testing the embedding_manager.py module

For clothing image vectorization

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import my_mirror_on_cloud.embedding_manager as em

In [None]:
from tqdm.notebook import tqdm

## Get H&M data

In [None]:
# Collect the path of every H&M product image (*.jpg) below the dataset folder.
from pathlib import Path

image_dir = Path('../data/h-and-m-personalized-fashion-recommendations/images')
image_paths = image_dir.rglob('*.jpg')
# rglob yields files in no guaranteed order; sorting keeps slices such as
# image_list[:100] identical from one run (or machine) to the next.
image_list = sorted(str(p) for p in image_paths)

if image_list:
    print(f"Found {len(image_list)} images, {image_list[0]}")
else:
    # Report a missing/empty dataset instead of failing with IndexError on image_list[0].
    print(f"Found 0 images under {image_dir} - check that the dataset has been downloaded")

## Simple vectorization

In [None]:
# Quick smoke test: vectorize only the first image (slice [:1]) with max_width=256.

test_vectors = em.vectorize_images(
    image_list[:1],
    model_name="fashion-clip",
    batch_size=32,
    max_width=256,
    use_float16=True,
)

In [None]:
# Same single-image call as the previous cell, but with max_width=2056;
# the result overwrites test_vectors.
test_vectors = em.vectorize_images(
    image_list[:1],
    model_name="fashion-clip",
    batch_size=32,
    max_width=2056,
    use_float16=True,
)

In [None]:
# Display the "embedding" entry of the first element returned by the last vectorize_images call.
test_vectors[0]["embedding"]

In [None]:
# Without preprocessing: time the vectorization of the first 100 images when
# max_width is not passed. The result is compared with time_with in the next cell.

import time

# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()
embeddings2 = em.vectorize_images(
    image_list[:100], model_name="fashion-clip", batch_size=32, use_float16=True
)
time_without = time.perf_counter() - start

In [None]:
# With preprocessing: same 100 images, this time passing max_width=256.
# Relies on `time` and `time_without` from the previous cell.
# perf_counter is a monotonic clock intended for measuring elapsed time;
# time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()
embeddings2 = em.vectorize_images(
    image_list[:100], model_name="fashion-clip", batch_size=32, max_width=256, use_float16=True
)
time_with = time.perf_counter() - start

print(f"Without preprocessing: {time_without:.2f}s")
print(f"With preprocessing: {time_with:.2f}s")

In [None]:
# Display the first element returned by the most recent timing run.
embeddings2[0]

## Vectorize the full image list and store the embeddings

In [None]:
import my_mirror_on_cloud.vector_store as vs

In [None]:
# Create the catalogue store that the cells below use to check and save per-image results.
# NOTE(review): presumably backed by a local SQLite file (the commented-out migration
# helper further down opens this same path with sqlite3) - confirm in vector_store.
store = vs.LocalCatalogStore(db_path="../data/catalogue_v1.db")

In [None]:
# Vectorize every image not yet processed by `model_name` and save the results
# in the catalogue store, sending at most `max_files` paths per vectorize_images call.
batch_size = 32              # forwarded to em.vectorize_images as batch_size
max_files = 1000             # number of image paths accumulated before each flush
model_name = "fashion-clip"
force_update = False         # True: re-vectorize images the store already has for model_name

batch_path = []
last_index = len(image_list) - 1
for index, img_path in enumerate(tqdm(image_list)):
    # Queue the image unless the store already holds it for this model.
    if force_update or not store.is_model_processed(img_path, model_name):
        batch_path.append(img_path)

    # Flush when the chunk is full, and always at the end of the list. The
    # end-of-list check is evaluated even when the last image itself was
    # skipped, so a trailing partial chunk is never silently dropped.
    if batch_path and (len(batch_path) >= max_files or index == last_index):
        embeddings = em.vectorize_images(
            batch_path, model_name=model_name, batch_size=batch_size, use_float16=True,
        )
        for item in embeddings:
            processing_status = em.get_processing_status(item)
            embedding = em.get_embeddings_from_analysis(item)
            store.insert_image(
                item["image_path"],
                processing=processing_status,
                embeddings=embedding,
                # NOTE(review): always True, independent of the force_update flag
                # above - presumably so an existing row gets overwritten; confirm.
                force_update=True
            )

        batch_path = []

In [None]:
# import sqlite3
# import shutil

# def migrate_embeddings_empty_column(db_path: str):
#     """
#     Migrate table structure when embedding column is empty.
#     Keeps all data except the old embedding column.
#     """
    
#     # Step 1: Create backup
#     backup_path = db_path + ".backup"
#     shutil.copy2(db_path, backup_path)
#     print(f"✅ Backup created at: {backup_path}")

#     conn = sqlite3.connect(db_path)
#     cur = conn.cursor()

#     try:
#         # Step 2: Begin migration
#         cur.execute("PRAGMA foreign_keys=off;")
#         cur.execute("BEGIN TRANSACTION;")
        
#         # Step 3: Rename old table
#         cur.execute("ALTER TABLE images RENAME TO images_old;")
        
#         # Step 4: Create new table with updated schema
#         cur.execute("""
#         CREATE TABLE images (
#             id INTEGER PRIMARY KEY AUTOINCREMENT,
#             image_path TEXT NOT NULL,
#             image_hash TEXT UNIQUE NOT NULL,
#             embeddings BLOB NULL,                    -- New BLOB column for multiple embeddings
#             tags TEXT DEFAULT '[]',                  
#             processing_status TEXT DEFAULT '{}',     
#             date_added TIMESTAMP DEFAULT CURRENT_TIMESTAMP
#         );
#         """)
        
#         # Step 5: Copy data (excluding old empty embedding column)
#         cur.execute("""
#         INSERT INTO images (id, image_path, image_hash, tags, processing_status, date_added)
#         SELECT id, image_path, image_hash, tags, processing_status, date_added
#         FROM images_old;
#         """)
        
#         # Step 6: Clean up
#         cur.execute("DROP TABLE images_old;")
#         cur.execute("COMMIT;")
#         cur.execute("PRAGMA foreign_keys=on;")
        
#         print("✅ Migration completed successfully!")
#         print("📊 All data preserved, new embeddings column ready for use")
        
#     except Exception as e:
#         cur.execute("ROLLBACK;")
#         print(f"❌ Migration failed: {e}")
#         # Restore from backup
#         shutil.copy2(backup_path, db_path)
#         raise
        
#     finally:
#         conn.close()

# # Usage
# migrate_embeddings_empty_column("../data/catalogue_v1.db")

In [None]:
# Read the stored record of the first image back from the store and display its "processing_status" entry.
store.get_image_by_path(image_list[0])["processing_status"]

In [None]:
# Show the keys of the last processing_status assigned inside the batch loop above.
# NOTE: the name is only defined if that loop stored at least one image in this session.
processing_status.keys()