In [1]:
import os
import shutil
import numpy as np
import faiss
from deepface import DeepFace
from tqdm import tqdm
from PIL import Image, UnidentifiedImageError
import tensorflow as tf

In [2]:
# Force TensorFlow to use GPU
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    print("✅ GPU is enabled for DeepFace!")

✅ GPU is enabled for DeepFace!


In [3]:
# Define input and output directories
INPUT_FOLDER = "./unfiltered2"
OUTPUT_FOLDER = "./unfiltered3"

# Emotion categories (Subfolder names)
EMOTION_CLASSES = ["anger", "contempt", "disgust", "fear", "happy", "neutral", "sad", "surprise"]

# DeepFace model for feature extraction
DEEPFACE_MODEL = "VGG-Face"  # Alternative: "ArcFace", "Facenet"

In [5]:
# Ensure output structure exists
def initialize_output_folders():
    if not os.path.exists(OUTPUT_FOLDER):
        os.makedirs(OUTPUT_FOLDER)
    
    for emotion in EMOTION_CLASSES:
        emotion_folder = os.path.join(OUTPUT_FOLDER, emotion)
        if not os.path.exists(emotion_folder):
            os.makedirs(emotion_folder)

In [6]:
# Function to extract deep learning-based feature embeddings
def extract_embedding(image_path):
    try:
        embedding = DeepFace.represent(img_path=image_path, model_name=DEEPFACE_MODEL, enforce_detection=False)[0]["embedding"]
        return np.array(embedding, dtype=np.float32)
    except Exception as e:
        print(f"Skipping {image_path}: DeepFace error -> {e}")
        return None

In [7]:
# Process images for a specific emotion category
def process_emotion_class(emotion):
    input_path = os.path.join(INPUT_FOLDER, emotion)
    output_path = os.path.join(OUTPUT_FOLDER, emotion)

    # Ensure the input category folder exists
    if not os.path.exists(input_path):
        print(f"Skipping {emotion}: No such folder in input dataset.")
        return

    # Get all image paths in the category
    image_paths = [
        os.path.join(input_path, file) for file in os.listdir(input_path)
        if file.lower().endswith((".jpg", ".jpeg", ".png"))
    ]

    if not image_paths:
        print(f"Skipping {emotion}: No images found.")
        return

    print(f"\n🔹 Processing '{emotion}' folder ({len(image_paths)} images)...")

    # Step 1: Extract feature embeddings
    image_embeddings = []
    valid_image_paths = []

    for img_path in tqdm(image_paths, desc=f"Extracting embeddings ({emotion})"):
        embedding = extract_embedding(img_path)
        if embedding is not None:
            image_embeddings.append(embedding)
            valid_image_paths.append(img_path)

    if not valid_image_paths:
        print(f"Skipping '{emotion}': No valid images after feature extraction.")
        return

    image_embeddings = np.array(image_embeddings)

    # Step 2: Build FAISS index for fast nearest neighbor search
    dimension = image_embeddings.shape[1]  # Get embedding size
    index = faiss.IndexFlatL2(dimension)  # L2 distance for similarity search
    index.add(image_embeddings)  # Add all image embeddings

    # Step 3: Identify distinct images
    unique_images = []
    checked_images = set()

    print(f"Identifying distinct images for '{emotion}'...")

    for i, img_path in tqdm(enumerate(valid_image_paths), total=len(valid_image_paths), desc=f"Verifying ({emotion})"):
        if img_path in checked_images:
            continue

        # Find similar images
        distances, indices = index.search(np.array([image_embeddings[i]]), k=10)  # Get top-10 nearest images

        is_duplicate = False
        for j in indices[0][1:]:  # Skip self-match
            if valid_image_paths[j] in checked_images:
                continue

            try:
                # Use DeepFace for final verification
                result = DeepFace.verify(img1_path=img_path, img2_path=valid_image_paths[j], enforce_detection=False)
                if result.get("verified", False):
                    is_duplicate = True
                    break  # Stop if duplicate found
            except Exception as e:
                print(f"Skipping {img_path}: DeepFace verification error -> {e}")
                continue

        # If not duplicate, save it
        if not is_duplicate:
            unique_images.append(img_path)
            shutil.copy(img_path, os.path.join(output_path, os.path.basename(img_path)))

        checked_images.add(img_path)

    print(f"✅ Processing completed for '{emotion}'. Found {len(unique_images)} distinct images.")
    print(f"📂 Distinct images saved in: {output_path}\n")

In [8]:
# Run the script
if __name__ == "__main__":
    initialize_output_folders()

    # Process each emotion category separately
    for emotion in EMOTION_CLASSES:
        process_emotion_class(emotion)

    print("🎉 All categories processed successfully!")


🔹 Processing 'anger' folder (15729 images)...


Extracting embeddings (anger): 100%|██████████| 15729/15729 [15:15<00:00, 17.17it/s]


Identifying distinct images for 'anger'...


Verifying (anger): 100%|██████████| 15729/15729 [34:01<00:00,  7.70it/s] 


✅ Processing completed for 'anger'. Found 1568 distinct images.
📂 Distinct images saved in: ./unfiltered3\anger


🔹 Processing 'contempt' folder (17006 images)...


Extracting embeddings (contempt): 100%|██████████| 17006/17006 [21:00<00:00, 13.49it/s]


Identifying distinct images for 'contempt'...


Verifying (contempt): 100%|██████████| 17006/17006 [39:01<00:00,  7.26it/s] 


✅ Processing completed for 'contempt'. Found 1826 distinct images.
📂 Distinct images saved in: ./unfiltered3\contempt


🔹 Processing 'disgust' folder (16149 images)...


Extracting embeddings (disgust): 100%|██████████| 16149/16149 [15:41<00:00, 17.16it/s]


Identifying distinct images for 'disgust'...


Verifying (disgust): 100%|██████████| 16149/16149 [33:17<00:00,  8.09it/s] 


✅ Processing completed for 'disgust'. Found 1714 distinct images.
📂 Distinct images saved in: ./unfiltered3\disgust


🔹 Processing 'fear' folder (15203 images)...


Extracting embeddings (fear): 100%|██████████| 15203/15203 [15:52<00:00, 15.96it/s]


Identifying distinct images for 'fear'...


Verifying (fear): 100%|██████████| 15203/15203 [32:17<00:00,  7.84it/s]


✅ Processing completed for 'fear'. Found 1359 distinct images.
📂 Distinct images saved in: ./unfiltered3\fear


🔹 Processing 'happy' folder (16715 images)...


Extracting embeddings (happy): 100%|██████████| 16715/16715 [17:33<00:00, 15.86it/s]


Identifying distinct images for 'happy'...


Verifying (happy): 100%|██████████| 16715/16715 [35:20<00:00,  7.88it/s] 


✅ Processing completed for 'happy'. Found 1454 distinct images.
📂 Distinct images saved in: ./unfiltered3\happy


🔹 Processing 'neutral' folder (16710 images)...


Extracting embeddings (neutral): 100%|██████████| 16710/16710 [17:27<00:00, 15.95it/s]


Identifying distinct images for 'neutral'...


Verifying (neutral): 100%|██████████| 16710/16710 [36:59<00:00,  7.53it/s]


✅ Processing completed for 'neutral'. Found 1547 distinct images.
📂 Distinct images saved in: ./unfiltered3\neutral


🔹 Processing 'sad' folder (15014 images)...


Extracting embeddings (sad): 100%|██████████| 15014/15014 [14:29<00:00, 17.27it/s]


Identifying distinct images for 'sad'...


Verifying (sad): 100%|██████████| 15014/15014 [29:52<00:00,  8.38it/s] 


✅ Processing completed for 'sad'. Found 1419 distinct images.
📂 Distinct images saved in: ./unfiltered3\sad


🔹 Processing 'surprise' folder (16109 images)...


Extracting embeddings (surprise): 100%|██████████| 16109/16109 [17:13<00:00, 15.59it/s]


Identifying distinct images for 'surprise'...


Verifying (surprise): 100%|██████████| 16109/16109 [35:10<00:00,  7.63it/s] 

✅ Processing completed for 'surprise'. Found 1132 distinct images.
📂 Distinct images saved in: ./unfiltered3\surprise

🎉 All categories processed successfully!



