### Andrew Taylor
### atayl136
### Creating AI Enabled Systems

# Search Demo Notebook

# Demo Search Notebook
This notebook demonstrates nearest neighbor search using the implemented FAISS index and various distance measures: **Euclidean**, **Cosine**, **Dot Product**, and **Minkowski**. We compute embeddings for gallery images, perform searches with 10 probe images, and report the rank positions.


In [1]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

import sys
# Import FAISS (will be correctly handled in the index classes)
import faiss



# Add the parent directory to the path
# Replace '/path/to/parent/directory' with the actual path to your parent directory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(parent_dir)

from modules.extraction.embedding import Embedding
from modules.extraction.preprocessing import Preprocessing


# Then import
from modules.retrieval.index.bruteforce import FaissBruteForce
from modules.retrieval.search import FaissSearch

In [2]:
import os

os.environ["KMP_DUPLICATE_LIB_OK"]="True" 



In [3]:
from PIL import Image
import torch
import os
import glob
import numpy as np


# Initialize the preprocessing pipeline and embedding model once.
preprocessing = Preprocessing(image_size=160)
device = 'cpu'
embedding_model = Embedding(pretrained='casia-webface', device=device)

def compute_embedding(image_path):
    # Open and preprocess the image.
    image = Image.open(image_path).convert("RGB")
    processed_image = preprocessing.process(image)
    # Compute the embedding.
    embedding_vector = embedding_model.encode(processed_image)
    return embedding_vector

def get_first_image_from_folder(folder_path):
    """Get the first valid image file from a folder, skipping problematic files."""
    valid_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.gif']
    
    try:
        for filename in os.listdir(folder_path):
            # Skip files starting with ._ (macOS metadata files)
            if filename.startswith('._'):
                continue
                
            file_path = os.path.join(folder_path, filename)
            if os.path.isfile(file_path) and any(file_path.lower().endswith(ext) for ext in valid_extensions):
                # Verify it's a valid image before returning
                try:
                    with Image.open(file_path) as img:
                        # Just accessing a property forces PIL to validate the file
                        img.format
                    return file_path
                except Exception:
                    # Skip this file if PIL can't open it
                    continue
    except Exception as e:
        print(f"Error accessing folder {folder_path}: {e}")
    
    return None

gallery_dir = '..\storage\multi_image_gallery'
folder_paths = glob.glob(os.path.join(gallery_dir, '*'))
print(f"Found {len(folder_paths)} folders in gallery.")

embeddings = []
metadata = []
successful_count = 0
error_count = 0

for folder_path in folder_paths:
    try:
        img_path = get_first_image_from_folder(folder_path)
        if img_path:
            try:
                embedding = compute_embedding(img_path)
                embeddings.append(embedding)
                metadata.append(os.path.basename(folder_path))
                successful_count += 1
                if successful_count % 10 == 0:
                    print(f"Successfully processed {successful_count} folders")
            except Exception as e:
                error_count += 1
                print(f"Error processing image {img_path}: {e}")
        else:
            print(f"No valid images found in folder: {folder_path}")
    except Exception as e:
        print(f"Fatal error with folder {folder_path}: {e}")

print(f"Processing complete. Success: {successful_count}, Errors: {error_count}")

embeddings = np.array(embeddings)
print(f"Created embeddings for {len(embeddings)} folders")

# Build a FAISS BruteForce index with Euclidean metric for demonstration.
faiss_index = FaissBruteForce(dim=512, metric='euclidean')
faiss_index.add_embeddings(embeddings, metadata)
print("Gallery embeddings indexed.")

  gallery_dir = '..\storage\multi_image_gallery'


Found 1000 folders in gallery.
Successfully processed 10 folders
Successfully processed 20 folders
Successfully processed 30 folders
Successfully processed 40 folders
Successfully processed 50 folders
Successfully processed 60 folders
Successfully processed 70 folders
Successfully processed 80 folders
Successfully processed 90 folders
Successfully processed 100 folders
Successfully processed 110 folders
Successfully processed 120 folders
Successfully processed 130 folders
Successfully processed 140 folders
Successfully processed 150 folders
Successfully processed 160 folders
Successfully processed 170 folders
Successfully processed 180 folders
Successfully processed 190 folders
Successfully processed 200 folders
Successfully processed 210 folders
Successfully processed 220 folders
Successfully processed 230 folders
Successfully processed 240 folders
Successfully processed 250 folders
Successfully processed 260 folders
Successfully processed 270 folders
Successfully processed 280 folder

In [4]:
# Randomly select 10 probes from the probe directory
probe_dir = '../storage/probe'
probe_folders = glob.glob(os.path.join(probe_dir, '*'))
probe_folders = np.random.choice(probe_folders, size=10, replace=False)

probe_embeddings = []
probe_metadata = []

print("Selected probe images:")
for i, folder_path in enumerate(probe_folders):
    img_path = get_first_image_from_folder(folder_path)
    if img_path:
        try:
            probe_embedding = compute_embedding(img_path)
            probe_embeddings.append(probe_embedding.cpu().numpy()) #store as numpy
            probe_metadata.append(os.path.basename(folder_path))
            print(f"Probe {i+1}: {os.path.basename(folder_path)}")
        except Exception as e:
            print(f"Error processing probe image: {e}")
    else:
        print(f"No valid image in probe folder: {folder_path}")

probe_embeddings = np.array(probe_embeddings)

# Remove empty probes.
valid_probes = []
valid_metadata = []
for i, probe in enumerate(probe_embeddings):
    if probe.size > 0:
        valid_probes.append(probe)
        valid_metadata.append(probe_metadata[i])

probe_embeddings = np.array(valid_probes)
probe_metadata = valid_metadata

Selected probe images:
Error processing probe image: 'numpy.ndarray' object has no attribute 'cpu'
Error processing probe image: 'numpy.ndarray' object has no attribute 'cpu'
Error processing probe image: 'numpy.ndarray' object has no attribute 'cpu'
Error processing probe image: 'numpy.ndarray' object has no attribute 'cpu'
Error processing probe image: 'numpy.ndarray' object has no attribute 'cpu'
Error processing probe image: 'numpy.ndarray' object has no attribute 'cpu'
Error processing probe image: 'numpy.ndarray' object has no attribute 'cpu'
Error processing probe image: 'numpy.ndarray' object has no attribute 'cpu'
Error processing probe image: 'numpy.ndarray' object has no attribute 'cpu'
Error processing probe image: 'numpy.ndarray' object has no attribute 'cpu'


In [5]:


#searcher = FaissSearch(faiss_index, metric="euclidean")
#distances, indices, meta_results = searcher.search(probe, k=5)
#print(f"  Search completed successfully")

In [6]:
# Define the distance metrics to test.
distance_metrics = ['euclidean', 'cosine', 'dot_product', 'minkowski']
k = 5  # Retrieve top 5 nearest neighbors for each probe.

# Dictionary to store results for each metric.
results = {metric: [] for metric in distance_metrics}

# First, check your index and ensure it's properly built
print(f"Index size: {faiss_index.index.ntotal}")
print(f"Index dimension: {faiss_index.index.d}")

for metric in distance_metrics:
    print(f"\nDistance metric: {metric}")
    try:
        searcher = FaissSearch(faiss_index, metric=metric, p=3)

        for i, probe in enumerate(probe_embeddings):
            try:
                print(f"Searching for probe {i+1}/{len(probe_embeddings)}: {probe_metadata[i]}")
                probe = np.ascontiguousarray(probe, dtype='float32')
                print(f"  Probe shape: {probe.shape}, dtype: {probe.dtype}")
                print("  Starting search...")
                safe_k = min(k, faiss_index.index.ntotal)
                print(f"  Using k={safe_k}")

                try:
                    distances, indices, meta_results = searcher.search(probe, k=safe_k)
                    print(f"  Search completed successfully")
                except Exception as e:
                    print(f"  Search failed: {str(e)}")
                    print("  Trying direct FAISS search as fallback...")
                    D, I = faiss_index.index.search(probe.reshape(1, -1), safe_k)
                    print(f"  Direct search returned shape: {D.shape}")
                    distances = D
                    indices = I
                    meta_results = [[faiss_index.metadata[idx] for idx in I[0]]]

                print(f"  First result: {meta_results[0][0]} (Distance: {distances[0][0]:.4f})")

            except Exception as e:
                print(f"Error processing probe {i}: {str(e)}")
                continue

    except Exception as e:
        print(f"Error with metric {metric}: {str(e)}")
        continue

Index size: 1000
Index dimension: 512

Distance metric: euclidean

Distance metric: cosine

Distance metric: dot_product

Distance metric: minkowski


In [7]:
# Compile the Euclidean results into a summary table.
euclidean_results = results['euclidean']
summary = []
for res in euclidean_results:
    row = {'Probe': res['Probe']}
    for rank, neighbor in enumerate(res['Neighbors'], start=1):
        row[f'Rank {rank}'] = neighbor
    summary.append(row)

df_summary = pd.DataFrame(summary)
print("Nearest Neighbor Ranking (Euclidean):")
print(df_summary)


Nearest Neighbor Ranking (Euclidean):
Empty DataFrame
Columns: []
Index: []


In [8]:
# Visualize the top neighbor distance for one probe across different metrics.
probe_idx = 0  # Using the first probe.
probe_name = probe_metadata[probe_idx]
metric_names = []
distance_values = []

for metric in distance_metrics:
    searcher = FaissSearch(faiss_index, metric=metric, p=3)
    distances, indices, meta_results = searcher.search(probe_embeddings[probe_idx], k=k)
    metric_names.append(metric)
    distance_values.append(distances[0][0])  # Top neighbor distance.

plt.figure(figsize=(6, 4))
plt.bar(metric_names, distance_values)
plt.xlabel('Distance Metric')
plt.ylabel('Top Neighbor Distance')
plt.title(f'Top Neighbor Distance for Probe {probe_name}')
plt.show()


IndexError: list index out of range

## Observations

- **Euclidean**, **Cosine**, and **Dot Product** metrics yield different rankings, though cosine and dot product are often similar if embeddings are normalized.
- The **Minkowski** metric (with `p=3` in this demo) provides additional flexibility in distance measurement.
- The choice of distance measure can affect the ranking of nearest neighbors; further tuning and experiments are necessary to determine the best fit for the application.
## Observations

- **Euclidean**, **Cosine**, and **Dot Product** metrics yield different rankings, though cosine and dot product are often similar if embeddings are normalized.
- The **Minkowski** metric (with `p=3` in this demo) provides additional flexibility in distance measurement.
- The choice of distance measure can affect the ranking of nearest neighbors; further tuning and experiments are necessary to determine the best fit for the application.
