In [1]:
# Code Cell 1: Evaluate Indexing Strategies Metrics
# This cell loads (or simulates) a probe image, extracts its embedding, and then adds a random gallery of embeddings to each index.
# It times the search query and prints out the retrieved metadata for each indexing strategy (BruteForce, LSH, HNSW).
import os
# OMP workaround
os.environ["KMP_DUPLICATE_LIB_OK"]="True" 
import time
import numpy as np
from PIL import Image
from modules.extraction.preprocessing import Preprocessing
from modules.extraction.embedding import Embedding
from modules.retrieval.index.bruteforce import FaissBruteForce
from modules.retrieval.index.lsh import FaissLSH
from modules.retrieval.index.hnsw import FaissHNSW

# Probe image used as the query. NOTE: this is an absolute, machine-specific path;
# point it at any local face image. If it cannot be loaded, a random query is used instead.
probe_image_path = "C:\\Users\\Putna\\OneDrive - Johns Hopkins\\Documents\\Johns Hopkins\\Creating AI Enabled Systems\\SP25\\ironclad\\storage\\probe\\Aaron_Sorkin\\Aaron_Sorkin_0002.jpg"  # Replace with your actual image path.
embedding_dim = 512  # Adjust this if your embedding model outputs a different dimension.

# Seed NumPy's global RNG so the simulated gallery (and the fallback query, if used)
# are identical on every Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Initialize preprocessing and embedding extraction modules.
preprocessing = Preprocessing(image_size=160)
embedding_extractor = Embedding(pretrained="casia-webface", device="cpu")

# Load and process the probe image; if anything fails, fall back to a random embedding
# so the rest of the benchmark can still run (deliberate best-effort).
try:
    probe_image = Image.open(probe_image_path)
    processed_probe = preprocessing.process(probe_image)
    query_embedding = embedding_extractor.encode(processed_probe)
except Exception as e:
    print("Error loading probe image, using a random query embedding. Details:", e)
    query_embedding = np.random.rand(embedding_dim).astype('float32')

# Simulate a gallery of embeddings and associated metadata.
num_gallery = 1000
gallery_embeddings = np.random.rand(num_gallery, embedding_dim).astype('float32')
gallery_metadata = [f"Person_{i}" for i in range(num_gallery)]
k = 5  # Number of nearest neighbors to retrieve.

# Build the (1, dim) query once, outside the timed region, so the measurement covers
# only the search call. The index search is given a C-contiguous float32 array.
query_vector = np.ascontiguousarray(query_embedding, dtype='float32').reshape(1, -1)

# Initialize different indexing strategies.
strategies = {
    "BruteForce": FaissBruteForce(dim=embedding_dim, metric="euclidean"),
    "LSH": FaissLSH(dim=embedding_dim, nbits=128),
    "HNSW": FaissHNSW(dim=embedding_dim, M=32, efConstruction=40)
}

# Dictionary to store query results, keyed by strategy name.
query_results = {}

for name, index_obj in strategies.items():
    # Add gallery embeddings to the index (not part of the timed region).
    index_obj.add_embeddings(gallery_embeddings, gallery_metadata)

    # Time only the search query. perf_counter() is a monotonic, high-resolution clock;
    # time.time() can be too coarse to resolve sub-millisecond searches.
    start_time = time.perf_counter()
    distances, indices = index_obj.index.search(query_vector, k)
    elapsed_time = time.perf_counter() - start_time

    # Retrieve metadata for the found indices. FAISS pads missing neighbours with
    # label -1, which is not a valid gallery position, so those entries are skipped.
    meta_results = [index_obj.get_metadata(int(idx)) for idx in indices[0] if idx >= 0]

    query_results[name] = {
        "query_time_sec": elapsed_time,
        "distances": distances,
        "indices": indices,
        "metadata": meta_results
    }

# Print the results for each indexing strategy. Six decimals (microseconds) are shown
# because individual searches on a 1000-vector gallery take well under a millisecond.
for strategy, results in query_results.items():
    print(f"Strategy: {strategy}")
    print(f"  Query Time: {results['query_time_sec']:.6f} sec")
    print(f"  Retrieved Metadata: {results['metadata']}")
    print("-------------------------------------------------")


Strategy: BruteForce
  Query Time: 0.0000 sec
  Retrieved Metadata: ['Person_367', 'Person_617', 'Person_846', 'Person_974', 'Person_681']
-------------------------------------------------
Strategy: LSH
  Query Time: 0.0000 sec
  Retrieved Metadata: ['Person_157', 'Person_50', 'Person_743', 'Person_264', 'Person_126']
-------------------------------------------------
Strategy: HNSW
  Query Time: 0.0010 sec
  Retrieved Metadata: ['Person_617', 'Person_846', 'Person_974', 'Person_797', 'Person_282']
-------------------------------------------------


In [2]:
# Code Cell 2: Evaluate Embedding Extraction Performance
# This cell measures how long it takes to extract an embedding from a probe image.
# Replace the placeholder image path with an actual image file to generate real metrics.

import time
from PIL import Image
from modules.extraction.preprocessing import Preprocessing
from modules.extraction.embedding import Embedding

# Probe image to embed. NOTE: this is an absolute, machine-specific path; point it at
# any local face image. If it cannot be loaded, a random tensor is used as input instead.
probe_image_path = "C:\\Users\\Putna\\OneDrive - Johns Hopkins\\Documents\\Johns Hopkins\\Creating AI Enabled Systems\\SP25\\ironclad\\storage\\probe\\Aaron_Sorkin\\Aaron_Sorkin_0002.jpg"
# Initialize preprocessing and embedding modules.
preprocessing = Preprocessing(image_size=160)
embedding_extractor = Embedding(pretrained="casia-webface", device="cpu")

# Load and process the probe image (kept outside the timed region).
try:
    probe_image = Image.open(probe_image_path)
    processed_probe = preprocessing.process(probe_image)
except Exception as e:
    print("Error loading probe image. Using a random tensor as input. Details:", e)
    # torch is only needed for this fallback, so it is imported lazily here.
    import torch
    processed_probe = torch.rand(1, 3, 160, 160)

# Measure the time for embedding extraction only. perf_counter() is a monotonic,
# high-resolution clock; time.time() can be too coarse for a measurement this short.
start_time = time.perf_counter()
embedding_vector = embedding_extractor.encode(processed_probe)
elapsed_time = time.perf_counter() - start_time

print(f"Embedding shape: {embedding_vector.shape}")
print(f"Time taken for embedding extraction: {elapsed_time:.4f} seconds")


Embedding shape: (512,)
Time taken for embedding extraction: 0.0205 seconds


In [3]:
# Code Cell 3: Evaluate Impact of Similarity Metrics on Search Results
# This cell compares search performance using different similarity metrics (Euclidean and Cosine)
# on a simulated gallery using the FaissBruteForce index. It prints out query times and retrieved metadata.

import time
import numpy as np
from modules.retrieval.index.bruteforce import FaissBruteForce

# Seed NumPy's global RNG so the simulated gallery and query are identical on every run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Define embedding dimension and simulate a gallery.
embedding_dim = 512
num_gallery = 1000
gallery_embeddings = np.random.rand(num_gallery, embedding_dim).astype('float32')
gallery_metadata = [f"Person_{i}" for i in range(num_gallery)]
k = 5  # Number of nearest neighbors to retrieve.

# Generate a random query embedding (or use a real one if available).
query_embedding = np.random.rand(embedding_dim).astype('float32')
query_vector = query_embedding.reshape(1, -1)

# faiss is needed only for L2-normalising the cosine query; import it once, before the loop.
import faiss

# Evaluate for Euclidean and Cosine similarity metrics.
metrics = ["euclidean", "cosine"]
metric_results = {}

for metric in metrics:
    # Initialize the brute-force index with the selected metric.
    index_obj = FaissBruteForce(dim=embedding_dim, metric=metric)
    index_obj.add_embeddings(gallery_embeddings, gallery_metadata)

    # Work on a private copy of the query for each metric. faiss.normalize_L2 normalises
    # its argument IN PLACE, and query_vector is a view of query_embedding, so normalising
    # the shared array would silently change the query seen by any metric evaluated
    # afterwards (making results depend on the order of `metrics`).
    metric_query = query_vector.copy()

    # If using cosine, normalize the query vector so that inner product corresponds to cosine similarity.
    # NOTE(review): this presumes FaissBruteForce normalises the gallery vectors itself
    # when metric == "cosine" — confirm against its implementation.
    if metric == "cosine":
        faiss.normalize_L2(metric_query)

    # Time only the search call, using a monotonic high-resolution clock.
    start_time = time.perf_counter()
    distances, indices = index_obj.index.search(metric_query, k)
    elapsed_time = time.perf_counter() - start_time

    meta_results = [index_obj.get_metadata(int(idx)) for idx in indices[0]]
    metric_results[metric] = {
        "query_time_sec": elapsed_time,
        "distances": distances,
        "indices": indices,
        "metadata": meta_results
    }

# Print the search results for each similarity metric. Six decimals (microseconds) are
# shown because searches on a 1000-vector gallery take well under a millisecond.
for metric, results in metric_results.items():
    print(f"Similarity Metric: {metric}")
    print(f"  Query Time: {results['query_time_sec']:.6f} sec")
    print(f"  Retrieved Metadata: {results['metadata']}")
    print("-------------------------------------------------")


Similarity Metric: euclidean
  Query Time: 0.0005 sec
  Retrieved Metadata: ['Person_151', 'Person_838', 'Person_368', 'Person_275', 'Person_569']
-------------------------------------------------
Similarity Metric: cosine
  Query Time: 0.0000 sec
  Retrieved Metadata: ['Person_838', 'Person_368', 'Person_151', 'Person_275', 'Person_743']
-------------------------------------------------
