# Forensic Similarity Search Notebook
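This notebook demonstrates image similarity search: CLIP embeds every image as a unit-length vector, and FAISS finds the gallery images closest to a small set of reference images. Matching images are copied to an output directory.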

## Usage Instructions
Below is the original usage and description extracted from the script:
  
```python
#!/usr/bin/env python3
"""
forensic_similarity_search.py

Usage:
  python forensic_similarity_search.py \
      --ref_dir  /evidence/suspects/john_doe_faces \
      --gallery  /evidence/phone_dump/DCIM \
      --out_dir  ./matches \
      --threshold 0.25 \
      --top_k 10
"""
```

In [None]:
# !pip install torch open_clip_torch faiss-cpu pillow tqdm numpy

Collecting torch
  Downloading torch-2.7.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting open_clip_torch
  Downloading open_clip_torch-2.32.0-py3-none-any.whl.metadata (31 kB)
Collecting faiss-cpu
  Using cached faiss_cpu-1.11.0-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Collecting pillow
  Using cached pillow-11.2.1-cp312-cp312-win_amd64.whl.metadata (9.1 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting numpy
  Downloading numpy-2.3.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting filelock (from torch)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Downloading typing_extensions-4.14.0-py3-none-any.whl.metadata (3.0 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Using cached j

In [1]:
# ---------- 1. Imports ----------
import argparse, os, shutil, glob
import numpy as np
from tqdm import tqdm
from PIL import Image
import torch, open_clip
import faiss

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ---------- 2. Initialize CLIP and Define embed() ----------
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the OpenCLIP ViT-L-14 checkpoint onto the chosen device. Of the three
# returned values only the model and the last transform are used below.
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-L-14",
    pretrained="laion2b_s32b_b82k",
    device=device,
)
model.eval()  # inference only: no training happens in this notebook

def embed(img_path: str) -> np.ndarray:
    """Return the L2-normalised CLIP image embedding of one image file.

    Args:
        img_path: Path to an image file that PIL can open.

    Returns:
        A 1-D unit-length vector. Its length is the model's embedding size
        (768 for the ViT-L-14 model this notebook loads).

    Raises:
        Any exception raised by PIL while opening/decoding the file or by
        the model while encoding it; callers decide whether to skip or abort.
    """
    # The context manager releases the file handle deterministically, even for
    # multi-frame formats or when preprocessing fails. convert() has already
    # decoded the pixels, so nothing reads from the file after the block.
    with Image.open(img_path) as im:
        batch = preprocess(im.convert("RGB")).unsqueeze(0).to(device)
    with torch.no_grad():
        # Flatten the (1, d) batch output to a plain vector before normalising.
        vec = model.encode_image(batch).cpu().numpy().reshape(-1)
    # Small epsilon guards against division by zero for a degenerate embedding.
    vec /= np.linalg.norm(vec) + 1e-10
    return vec

In [3]:
# ---------- 3. Reference and Gallery Collection Functions ----------
def collect_refs(ref_dir: str) -> np.ndarray:
    """Embed every readable image located directly inside ``ref_dir``.

    Sub-directories are ignored, and files that cannot be embedded are
    reported with a ``[warn]`` line and skipped, in the same style as
    ``collect_gallery``. Files are processed in sorted order so the row order
    of the result is reproducible.

    Args:
        ref_dir: Directory holding the reference images.

    Returns:
        Array of shape (n_ref, d), one embedding per usable reference image.

    Raises:
        ValueError: If no file in ``ref_dir`` could be embedded.
    """
    candidates = sorted(
        p for p in glob.glob(os.path.join(ref_dir, "*")) if os.path.isfile(p)
    )
    ref_embeds = []
    for p in candidates:
        try:
            ref_embeds.append(embed(p))
        except Exception as e:  # unreadable or non-image file: warn and move on
            print(f"[warn] {p}: {e}")
    if not ref_embeds:
        # np.stack([]) would raise a ValueError with an unhelpful message.
        raise ValueError(f"No usable reference images found in: {ref_dir}")
    return np.stack(ref_embeds)  # shape (n_ref, d)

def build_index(vectors: np.ndarray) -> faiss.Index:
    """Build a flat inner-product FAISS index over the rows of ``vectors``.

    For unit-length rows the inner product returned by a search is the
    cosine similarity.

    Args:
        vectors: 2-D array with one vector per row.

    Returns:
        A ``faiss.IndexFlatIP`` containing every row, cast to float32.
    """
    dim = vectors.shape[1]
    as_float32 = vectors.astype("float32")
    ip_index = faiss.IndexFlatIP(dim)
    ip_index.add(as_float32)
    return ip_index

def collect_gallery(gallery_dir: str, embed_dim: int = 768):
    """Embed every file under ``gallery_dir`` (searched recursively).

    Files that cannot be embedded are reported with a ``[warn]`` line and
    represented by an all-zero row, so that row ``i`` of the returned matrix
    always corresponds to ``paths[i]``.

    Args:
        gallery_dir: Root directory of the image tree to scan.
        embed_dim: Embedding size used only when it cannot be inferred from a
            successful embedding (empty gallery, or every file failed).

    Returns:
        ``(paths, vectors)``: the sorted list of file paths and a float32
        array of shape (len(paths), d). For an empty gallery this is
        ``([], array of shape (0, embed_dim))`` rather than an exception, so
        callers can test ``len(paths) == 0``.
    """
    pattern = os.path.join(gallery_dir, "**", "*.*")
    # "*.*" also matches directories whose names contain a dot; keep files only.
    paths = sorted(p for p in glob.glob(pattern, recursive=True) if os.path.isfile(p))
    if not paths:
        return paths, np.empty((0, embed_dim), dtype="float32")

    embeds = []
    for p in tqdm(paths, desc="Embedding gallery"):
        try:
            embeds.append(embed(p))
        except Exception as e:
            print(f"[warn] {p}: {e}")
            embeds.append(None)  # filled with a zero row below

    # Size the placeholder from a real embedding so it stays correct if the
    # model (and therefore the embedding size) is changed.
    dim = next((v.shape[-1] for v in embeds if v is not None), embed_dim)
    placeholder = np.zeros(dim, dtype="float32")  # keeps index alignment
    rows = [placeholder if v is None else v for v in embeds]
    return paths, np.stack(rows).astype("float32", copy=False)

In [4]:
# ---------- 4. Search and Copy Function ----------
def search_and_copy(ref_vecs, gallery_vecs, gallery_paths, out_dir, threshold=0.25, top_k=10):
    """Copy gallery images that are close to any reference image into ``out_dir``.

    Each reference vector is searched against an inner-product index of the
    gallery; a gallery image is a hit when its cosine distance
    (``1 - similarity``) to at least one reference is ``<= threshold``.

    Args:
        ref_vecs: Iterable of reference embeddings (unit-length vectors).
        gallery_vecs: 2-D array of gallery embeddings, row ``i`` belonging to
            ``gallery_paths[i]``.
        gallery_paths: Source file paths aligned with ``gallery_vecs``.
        out_dir: Destination directory; created if missing.
        threshold: Maximum cosine distance for a match (lower = tighter).
        top_k: Number of nearest neighbours examined per reference.

    Files are copied under their base name. When two matches from different
    folders share a base name, later ones get a ``_1``, ``_2``, ... suffix
    so no match overwrites another.
    """
    os.makedirs(out_dir, exist_ok=True)
    index = build_index(gallery_vecs)
    hits = set()
    for ref_v in ref_vecs:
        # For unit-length vectors, dot-product equals cosine similarity.
        sims, idxs = index.search(ref_v.astype("float32")[None, :], top_k)
        for s, i in zip(sims[0], idxs[0]):
            if i < 0:
                # FAISS pads with label -1 when fewer than top_k results exist.
                continue
            if 1 - s <= threshold:  # cosine distance
                hits.add(int(i))

    # Sorted order makes the copy sequence (and any suffixes) reproducible.
    used_names = set()
    for i in sorted(hits):
        src = gallery_paths[i]
        stem, ext = os.path.splitext(os.path.basename(src))
        name = stem + ext
        suffix = 1
        # Compare case-insensitively so names also stay distinct on
        # case-insensitive filesystems.
        while name.lower() in used_names:
            name = f"{stem}_{suffix}{ext}"
            suffix += 1
        used_names.add(name.lower())
        shutil.copy2(src, os.path.join(out_dir, name))
    print(f"Copied {len(hits)} matching images to {out_dir}")

In [None]:
# # ---------- 5. Main CLI Wiring ----------
# def main():
#     ap = argparse.ArgumentParser()
#     ap.add_argument("--ref_dir",  required=True, help="Dir with 3–4 suspect images")
#     ap.add_argument("--gallery",  required=True, help="Dir tree of images to scan")
#     ap.add_argument("--out_dir",  default="./matches")
#     ap.add_argument("--threshold", type=float, default=0.25,
#                     help="max cosine distance (lower=tighter)")
#     ap.add_argument("--top_k",    type=int,   default=10,
#                     help="how many neighbours per ref to examine")
#     args = ap.parse_args()

#     ref_vecs         = collect_refs(args.ref_dir)
#     g_paths, g_vecs  = collect_gallery(args.gallery)
#     search_and_copy(ref_vecs, g_vecs, g_paths,
#                     args.out_dir, args.threshold, args.top_k)

# if __name__ == "__main__":
#     main()

: 

In [None]:
# ---------- 5. Run without CLI ----------
# Inline parameters - modify these paths as needed.
# Paths are built with os.path.join so they resolve on any OS, not only where
# "\" is the path separator.
ref_dir = os.path.join("datasets", "images", "face", "reference_images")  # Directory containing 3–4 suspect images
gallery = os.path.join("datasets", "images", "face", "gallery")           # Directory tree of images to scan
out_dir = os.path.join("datasets", "images", "face", "matched_images")    # Output directory for matched images
threshold = 0.25                                # Maximum cosine distance (lower=tighter)
top_k = 10                                      # Number of neighbors per reference to examine

print("Collecting reference embeddings...")
ref_files = glob.glob(os.path.join(ref_dir, "*"))
# Check the gallery up front too, so an empty or mistyped gallery path is
# reported before any time is spent embedding.
gallery_files = glob.glob(os.path.join(gallery, "**", "*.*"), recursive=True)
if not ref_files:
    print(f"[ERROR] No images found in reference directory: {ref_dir}")
elif not gallery_files:
    print(f"[ERROR] No images found in gallery directory: {gallery}")
else:
    ref_vecs = collect_refs(ref_dir)
    print("Collecting gallery embeddings...")
    g_paths, g_vecs = collect_gallery(gallery)
    if len(g_paths) == 0:
        print(f"[ERROR] No images found in gallery directory: {gallery}")
    else:
        print("Performing search and copying matches...")
        search_and_copy(ref_vecs, g_vecs, g_paths, out_dir, threshold, top_k)
        print("Done!")

Collecting reference embeddings...
Collecting gallery embeddings...


Embedding gallery: 100%|██████████| 130/130 [06:53<00:00,  3.18s/it]


Performing search and copying matches...
