In [1]:
import os
import shutil
import numpy as np
import faiss
from deepface import DeepFace
from tqdm import tqdm
from PIL import Image, UnidentifiedImageError
import tensorflow as tf

In [2]:
# Let TensorFlow (used by DeepFace) allocate GPU memory on demand instead of
# reserving all of it up front.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if physical_devices:
    try:
        # Memory growth has to be configured identically on every visible GPU;
        # enabling it on the first device only is rejected on multi-GPU hosts.
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("âœ… GPU is enabled for DeepFace!")
    except RuntimeError as e:
        # Memory growth can only be changed before the GPUs are initialized,
        # e.g. this cell was re-run in a kernel that already used the GPU.
        print(f"Could not enable GPU memory growth: {e}")
else:
    print("No GPU detected by TensorFlow.")

âœ… GPU is enabled for DeepFace!


In [4]:
# Define input and output directories.
# process_emotion_class reads images from INPUT_FOLDER/<emotion> and copies the
# ones it keeps into OUTPUT_FOLDER/<emotion>.
INPUT_FOLDER = "./unfiltered2"
OUTPUT_FOLDER = "./unfiltered4"

# Emotion categories; each name doubles as the sub-folder name under both
# INPUT_FOLDER and OUTPUT_FOLDER.
EMOTION_CLASSES = ["anger", "contempt", "disgust", "fear", "happy", "neutral", "sad", "surprise"]

# DeepFace model passed to DeepFace.represent in extract_embedding
DEEPFACE_MODEL = "ArcFace"  # Other model names to try: "Facenet", "DeepID"

In [5]:
# FAISS Parameters
# Number of nearest neighbours (k) requested from the FAISS index for each
# image. The query image is itself stored in the index, so it counts towards k.
NUM_NEIGHBORS = 5  # Reduced from 10 to 5 (less aggressive similarity search)

# DeepFace Verification Parameters
# A pair of images whose DeepFace.verify "distance" is below this value is
# treated as a duplicate. Raising it drops more images; lowering it keeps more.
SIMILARITY_THRESHOLD = 0.4  # Author's note: 0.4-0.5 is a good range for keeping distinct images

In [6]:
# Ensure output structure exists
def initialize_output_folders(output_folder=None, emotions=None):
    """Create the output root and one sub-folder per emotion class.

    Args:
        output_folder: Root directory to create. Defaults to OUTPUT_FOLDER.
        emotions: Iterable of sub-folder names to create under the root.
            Defaults to EMOTION_CLASSES.

    Returns:
        None. Directories that already exist are left untouched, so the
        function can be called repeatedly.

    Raises:
        OSError: If a path exists but is not a directory, or cannot be created.
    """
    if output_folder is None:
        output_folder = OUTPUT_FOLDER
    if emotions is None:
        emotions = EMOTION_CLASSES

    # exist_ok=True replaces the racy "check if it exists, then create" pattern.
    os.makedirs(output_folder, exist_ok=True)
    for emotion in emotions:
        os.makedirs(os.path.join(output_folder, emotion), exist_ok=True)

In [7]:
# Function to extract deep learning-based feature embeddings
def extract_embedding(image_path):
    """Return the DeepFace embedding of an image as a float32 NumPy vector.

    DeepFace.represent is called with the DEEPFACE_MODEL model and
    enforce_detection=False; the "embedding" of the first entry it returns
    is used.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        A 1-D np.float32 array, or None if anything in the extraction raised
        (the error is printed and the image is meant to be skipped).
    """
    try:
        representations = DeepFace.represent(
            img_path=image_path,
            model_name=DEEPFACE_MODEL,
            enforce_detection=False,
        )
        first_entry = representations[0]
        vector = np.array(first_entry["embedding"], dtype=np.float32)
    except Exception as e:
        print(f"Skipping {image_path}: DeepFace error -> {e}")
        return None
    return vector

In [8]:
# Process images for a specific emotion category
def process_emotion_class(emotion):
    """Copy the distinct images of one emotion class into the output folder.

    Steps:
      1. Embed every .jpg/.jpeg/.png file in INPUT_FOLDER/<emotion> with
         extract_embedding; images that fail to embed are left out.
      2. Put the embeddings in an exact L2 FAISS index and look up the
         NUM_NEIGHBORS nearest entries of every image.
      3. For each image, run DeepFace.verify against those neighbours that
         have not been processed yet. If any pair's distance is below
         SIMILARITY_THRESHOLD the image is dropped as a duplicate; otherwise
         it is copied to OUTPUT_FOLDER/<emotion>.

    Args:
        emotion: Name of the emotion class, i.e. the sub-folder to process.

    Returns:
        None. Progress and the number of kept images are printed. The function
        returns early (with a message) when the input folder is missing, holds
        no images, or no image could be embedded.
    """
    input_path = os.path.join(INPUT_FOLDER, emotion)
    output_path = os.path.join(OUTPUT_FOLDER, emotion)

    if not os.path.exists(input_path):
        print(f"Skipping {emotion}: No such folder in input dataset.")
        return

    # Get all image paths in the category. Sorted so that the processing order
    # (and therefore which member of a group of near-duplicates is kept) does
    # not depend on the order in which the OS lists the directory.
    image_paths = [
        os.path.join(input_path, file) for file in sorted(os.listdir(input_path))
        if file.lower().endswith((".jpg", ".jpeg", ".png"))
    ]

    if not image_paths:
        print(f"Skipping {emotion}: No images found.")
        return

    print(f"\nðŸ”¹ Processing '{emotion}' folder ({len(image_paths)} images)...")

    # Step 1: Extract feature embeddings
    image_embeddings = []
    valid_image_paths = []

    for img_path in tqdm(image_paths, desc=f"Extracting embeddings ({emotion})"):
        embedding = extract_embedding(img_path)
        if embedding is not None:
            image_embeddings.append(embedding)
            valid_image_paths.append(img_path)

    if not valid_image_paths:
        print(f"Skipping '{emotion}': No valid images after feature extraction.")
        return

    # FAISS works on a C-contiguous float32 matrix (one row per image).
    image_embeddings = np.ascontiguousarray(image_embeddings, dtype=np.float32)

    # Step 2: Build FAISS index for fast nearest neighbor search
    dimension = image_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(image_embeddings)

    # Never request more neighbours than there are vectors: FAISS pads missing
    # results with -1, which Python list indexing would silently read as
    # "the last image".
    k = min(NUM_NEIGHBORS, len(valid_image_paths))
    # One batched search for all images instead of one search call per image.
    _, neighbor_ids = index.search(image_embeddings, k)

    # Make sure the destination exists even if initialize_output_folders was
    # not run beforehand.
    os.makedirs(output_path, exist_ok=True)

    # Step 3: Identify distinct images
    unique_images = []

    print(f"Identifying distinct images for '{emotion}'...")

    for i, img_path in tqdm(enumerate(valid_image_paths), total=len(valid_image_paths), desc=f"Verifying ({emotion})"):
        is_duplicate = False
        for j in neighbor_ids[i]:
            # Skip, in one comparison:
            #   j < 0  -> FAISS padding (no such neighbour)
            #   j == i -> the image itself; with identical embeddings it is not
            #             guaranteed to be the first hit, so it is skipped by
            #             index rather than by position
            #   j < i  -> images that were already processed in this loop
            if j <= i:
                continue

            try:
                # Use DeepFace for final verification.
                # NOTE(review): no model_name is passed, so DeepFace's own
                # default model is used here rather than DEEPFACE_MODEL;
                # confirm this is intended before changing it, since
                # SIMILARITY_THRESHOLD is tied to the model in use.
                result = DeepFace.verify(img1_path=img_path, img2_path=valid_image_paths[j], enforce_detection=False)
                similarity_score = result.get("distance", 1.0)  # Lower is more similar

                if similarity_score < SIMILARITY_THRESHOLD:
                    is_duplicate = True
                    break  # Stop if duplicate found

            except Exception as e:
                # Best effort: a failed comparison counts as "not a duplicate".
                print(f"Skipping {img_path}: DeepFace verification error -> {e}")
                continue

        # If not duplicate, save it
        if not is_duplicate:
            unique_images.append(img_path)
            shutil.copy(img_path, os.path.join(output_path, os.path.basename(img_path)))

    print(f"âœ… Processing completed for '{emotion}'. Found {len(unique_images)} distinct images.")
    print(f"ðŸ“‚ Distinct images saved in: {output_path}\n")

In [9]:
# Run the script: create the output folder tree, then filter every emotion
# class in turn.
if __name__ == "__main__":
    initialize_output_folders()

    # Process each emotion category separately.
    # process_emotion_class prints a "Skipping ..." message and returns early
    # for classes with no input folder, no images, or no usable embeddings.
    for emotion in EMOTION_CLASSES:
        process_emotion_class(emotion)

    # Printed unconditionally, i.e. also when some classes were skipped above.
    print("ðŸŽ‰ All categories processed successfully!")


ðŸ”¹ Processing 'anger' folder (15729 images)...


Extracting embeddings (anger):   0%|          | 0/15729 [00:00<?, ?it/s]

25-01-18 12:45:47 - arcface_weights.h5 will be downloaded to C:\Users\Tuf\.deepface/weights\arcface_weights.h5


Downloading...
From: https://github.com/serengil/deepface_models/releases/download/v1.0/arcface_weights.h5
To: C:\Users\Tuf\.deepface\weights\arcface_weights.h5
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 137M/137M [00:29<00:00, 4.58MB/s]
Extracting embeddings (anger): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 15729/15729 [34:02<00:00,  7.70it/s] 


Identifying distinct images for 'anger'...


Verifying (anger): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 15729/15729 [40:08<00:00,  6.53it/s] 


âœ… Processing completed for 'anger'. Found 10634 distinct images.
ðŸ“‚ Distinct images saved in: ./unfiltered4\anger


ðŸ”¹ Processing 'contempt' folder (17006 images)...


Extracting embeddings (contempt): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 17006/17006 [34:53<00:00,  8.12it/s]


Identifying distinct images for 'contempt'...


Verifying (contempt): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 17006/17006 [38:16<00:00,  7.41it/s] 


âœ… Processing completed for 'contempt'. Found 9235 distinct images.
ðŸ“‚ Distinct images saved in: ./unfiltered4\contempt


ðŸ”¹ Processing 'disgust' folder (16149 images)...


Extracting embeddings (disgust): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16149/16149 [30:34<00:00,  8.80it/s]


Identifying distinct images for 'disgust'...


Verifying (disgust): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16149/16149 [35:01<00:00,  7.68it/s] 


âœ… Processing completed for 'disgust'. Found 8651 distinct images.
ðŸ“‚ Distinct images saved in: ./unfiltered4\disgust


ðŸ”¹ Processing 'fear' folder (15203 images)...


Extracting embeddings (fear): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 15203/15203 [34:34<00:00,  7.33it/s]


Identifying distinct images for 'fear'...


Verifying (fear): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 15203/15203 [46:56<00:00,  5.40it/s] 


âœ… Processing completed for 'fear'. Found 9445 distinct images.
ðŸ“‚ Distinct images saved in: ./unfiltered4\fear


ðŸ”¹ Processing 'happy' folder (16715 images)...


Extracting embeddings (happy): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16715/16715 [36:51<00:00,  7.56it/s]


Identifying distinct images for 'happy'...


Verifying (happy): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16715/16715 [58:39<00:00,  4.75it/s] 


âœ… Processing completed for 'happy'. Found 13190 distinct images.
ðŸ“‚ Distinct images saved in: ./unfiltered4\happy


ðŸ”¹ Processing 'neutral' folder (16710 images)...


Extracting embeddings (neutral): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16710/16710 [35:23<00:00,  7.87it/s]


Identifying distinct images for 'neutral'...


Verifying (neutral): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16710/16710 [51:06<00:00,  5.45it/s] 


âœ… Processing completed for 'neutral'. Found 12380 distinct images.
ðŸ“‚ Distinct images saved in: ./unfiltered4\neutral


ðŸ”¹ Processing 'sad' folder (15014 images)...


Extracting embeddings (sad): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 15014/15014 [30:51<00:00,  8.11it/s]


Identifying distinct images for 'sad'...


Verifying (sad): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 15014/15014 [39:01<00:00,  6.41it/s] 


âœ… Processing completed for 'sad'. Found 10638 distinct images.
ðŸ“‚ Distinct images saved in: ./unfiltered4\sad


ðŸ”¹ Processing 'surprise' folder (16109 images)...


Extracting embeddings (surprise): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16109/16109 [34:36<00:00,  7.76it/s]


Identifying distinct images for 'surprise'...


Verifying (surprise): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16109/16109 [47:27<00:00,  5.66it/s] 

âœ… Processing completed for 'surprise'. Found 9479 distinct images.
ðŸ“‚ Distinct images saved in: ./unfiltered4\surprise

ðŸŽ‰ All categories processed successfully!



