In [2]:
import os
import shutil
import numpy as np
import faiss
from deepface import DeepFace
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from PIL import Image, UnidentifiedImageError
import tensorflow as tf

In [3]:
# Enable on-demand GPU memory allocation for TensorFlow (the backend DeepFace runs on).
# This does not force GPU usage; it only turns on memory growth for the GPUs that
# TensorFlow reports as visible.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if physical_devices:
    try:
        # Memory growth has to be configured identically on every visible GPU,
        # so apply it to all of them rather than only the first one.
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("✅ GPU is enabled for DeepFace!")
    except RuntimeError as e:
        # Raised when the GPUs have already been initialized (e.g. this cell is
        # re-run late in a session); the setting can no longer be changed then.
        print(f"Could not enable GPU memory growth: {e}")
else:
    print("No GPU detected; DeepFace will run on CPU.")

✅ GPU is enabled for DeepFace!


In [4]:
# Define input and output directories
INPUT_FOLDER = "./temp/input"
OUTPUT_FOLDER = "./temp/output"

# Ensure output folder exists. exist_ok=True avoids the check-then-create race
# and keeps the cell safe to re-run.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# DeepFace model for feature extraction
DEEPFACE_MODEL = "VGG-Face"  # Another DeepFace model name (e.g. "ArcFace") can be swapped in here

In [5]:
# Function to extract deep learning-based feature embeddings
def extract_embedding(image_path):
    """Return the DeepFace embedding of an image as a float32 vector.

    Args:
        image_path: Path of the image handed to ``DeepFace.represent``.

    Returns:
        A ``np.float32`` array built from the ``"embedding"`` entry of the
        first result returned by DeepFace, or ``None`` if anything raised
        while computing it (the error is printed and the image is skipped).
    """
    try:
        # enforce_detection=False is passed through so DeepFace does not raise
        # when it cannot detect a face.
        representations = DeepFace.represent(
            img_path=image_path,
            model_name=DEEPFACE_MODEL,
            enforce_detection=False,
        )
        first_result = representations[0]
        vector = np.array(first_result["embedding"], dtype=np.float32)
    except Exception as e:
        print(f"Skipping {image_path}: DeepFace error -> {e}")
        return None
    return vector

In [11]:
# Step 1: Extract feature embeddings for all images
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

# os.listdir returns entries in arbitrary order. Sorting makes the run
# reproducible, which matters because the processing order decides which member
# of a group of duplicates is kept later on.
image_paths = [
    os.path.join(INPUT_FOLDER, file) for file in sorted(os.listdir(INPUT_FOLDER))
    if file.lower().endswith(IMAGE_EXTENSIONS)
]

# Collect rows in a separate list so `image_embeddings` is only ever an array.
embedding_rows = []
valid_image_paths = []

print(f"Extracting feature embeddings from {len(image_paths)} images...")

for img_path in tqdm(image_paths, desc="Extracting embeddings"):
    embedding = extract_embedding(img_path)
    # extract_embedding returns None for images it could not process; those are
    # dropped so that row i of the matrix always matches valid_image_paths[i].
    if embedding is not None:
        embedding_rows.append(embedding)
        valid_image_paths.append(img_path)

# One row per successfully processed image, stored as float32.
image_embeddings = np.array(embedding_rows, dtype=np.float32)

if not valid_image_paths:
    print(f"Warning: no usable images found in {INPUT_FOLDER}; nothing to index.")

Extracting feature embeddings from 50 images...


Extracting embeddings: 100%|██████████| 50/50 [00:03<00:00, 15.71it/s]


In [12]:
# Step 2: Build FAISS index for fast nearest neighbor search
# Fail early with a clear message: when no embeddings were extracted the array is
# not 2-D, and reading shape[1] below would raise an opaque IndexError instead.
if image_embeddings.ndim != 2 or image_embeddings.shape[0] == 0:
    raise ValueError(
        "No image embeddings available to index; check INPUT_FOLDER and the Step 1 output."
    )

dimension = image_embeddings.shape[1]  # Get embedding size
index = faiss.IndexFlatL2(dimension)  # L2 distance for similarity search
# FAISS expects C-contiguous float32 input; this is a no-op when it already is.
index.add(np.ascontiguousarray(image_embeddings, dtype=np.float32))  # Add all image embeddings

In [13]:
# Step 3: Identify distinct images
#
# For every image, its nearest neighbours in embedding space are re-checked with
# DeepFace.verify. An image is treated as a duplicate when it verifies against a
# neighbour that has not been processed yet; consequently the last-processed
# member of a group of matching images is the one copied to OUTPUT_FOLDER.
NEIGHBORS_K = 10  # neighbours fetched per query, including the query image itself

unique_images = []
checked_images = set()

# Never request more neighbours than the index holds: FAISS pads missing results
# with -1, and valid_image_paths[-1] would silently pick the last image.
k = min(NEIGHBORS_K, index.ntotal)

print("Identifying distinct images using FAISS nearest neighbor search...")

for i, img_path in tqdm(enumerate(valid_image_paths), total=len(valid_image_paths), desc="Verifying images"):
    # Find similar images (slice keeps the 2-D shape FAISS expects for a query batch)
    _, indices = index.search(image_embeddings[i:i + 1], k=k)

    is_duplicate = False
    for j in map(int, indices[0]):
        # Skip padding entries and the image itself. The self-match is filtered
        # by index because it is not guaranteed to be ranked first when several
        # embeddings are at the same distance (e.g. byte-identical copies).
        if j < 0 or j == i:
            continue

        candidate_path = valid_image_paths[j]
        if candidate_path in checked_images:
            continue

        try:
            # Use DeepFace for final verification, with the same model that
            # produced the embeddings so both stages stay consistent.
            result = DeepFace.verify(
                img1_path=img_path,
                img2_path=candidate_path,
                model_name=DEEPFACE_MODEL,
                enforce_detection=False,
            )
            if result.get("verified", False):
                is_duplicate = True
                break  # Stop if duplicate found
        except Exception as e:
            # Only this pair is skipped; the remaining neighbours are still checked.
            print(f"Skipping comparison {img_path} vs {candidate_path}: DeepFace verification error -> {e}")
            continue

    # If not duplicate, save it
    if not is_duplicate:
        unique_images.append(img_path)
        shutil.copy(img_path, os.path.join(OUTPUT_FOLDER, os.path.basename(img_path)))

    checked_images.add(img_path)

print(f"Processing complete. Found {len(unique_images)} distinct images.")
print(f"Distinct images saved in: {OUTPUT_FOLDER}")

Identifying distinct images using FAISS nearest neighbor search...


Verifying images: 100%|██████████| 50/50 [00:07<00:00,  6.40it/s]

Processing complete. Found 9 distinct images.
Distinct images saved in: ./temp/output



