In [None]:
import os
import imagehash
from PIL import Image
from tqdm import tqdm

def remove_duplicates(root_dirs):
    """Delete images whose average hash matches an image seen earlier.

    Every directory in *root_dirs* is scanned one level deep: each of its
    sub-directories (called ``emotion`` here) is listed, and files directly
    inside it with a .png/.jpg/.jpeg/.bmp extension are hashed with
    ``imagehash.average_hash`` on the grayscale image.  The first file seen
    for a given hash is kept; every later file with an equal hash is removed
    from disk.

    The table of seen hashes is shared across all of *root_dirs*, so a file
    in a later root that matches a file in an earlier root is deleted too.
    Entries are visited in sorted order so that the surviving copy is the
    same on every run.

    Files that cannot be opened, hashed or removed are reported on stdout
    and skipped; they do not abort the scan.

    Args:
        root_dirs: Iterable of directory paths to scan, in order.

    Returns:
        The number of files that were deleted.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp')
    seen_hashes = {}  # image hash -> path of the copy that is kept
    removed = 0

    for root_dir in root_dirs:
        print(f"Scanning: {root_dir}")
        # os.listdir order is arbitrary; sorting makes the kept copy stable.
        for emotion in sorted(os.listdir(root_dir)):
            emotion_dir = os.path.join(root_dir, emotion)
            if not os.path.isdir(emotion_dir):
                continue

            filenames = sorted(os.listdir(emotion_dir))
            for filename in tqdm(filenames, desc=f"Checking {emotion}"):
                if not filename.lower().endswith(image_suffixes):
                    continue
                file_path = os.path.join(emotion_dir, filename)

                # Deliberately broad: one unreadable or undeletable file
                # must not stop the rest of the scan.
                try:
                    # Context manager closes the underlying file handle;
                    # without it one descriptor leaks per image.
                    with Image.open(file_path) as img:
                        image_hash = imagehash.average_hash(img.convert("L"))

                    kept_path = seen_hashes.get(image_hash)
                    if kept_path is None:
                        seen_hashes[image_hash] = file_path
                    elif os.path.realpath(kept_path) != os.path.realpath(file_path):
                        os.remove(file_path)
                        removed += 1
                    # else: the very same file was reached again (repeated or
                    # overlapping roots, symlinks) -- deleting it would
                    # destroy the only copy, so leave it alone.

                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

    print(f"\n✅ Finished. Removed {removed} duplicate images.")
    return removed

# Run the de-duplication over both dataset splits. "train" is listed first,
# so when an image in "test" hashes the same as one already seen in "train",
# it is the "test" copy that gets deleted.
remove_duplicates(root_dirs=["train", "test"])


Scanning: train


Checking angry: 100%|██████████| 4801/4801 [00:01<00:00, 3019.53it/s]
Checking disgust: 100%|██████████| 465/465 [00:00<00:00, 3253.06it/s]
Checking fear: 100%|██████████| 4837/4837 [00:01<00:00, 3181.76it/s]
Checking happy: 100%|██████████| 8636/8636 [00:03<00:00, 2807.39it/s]
Checking neutral: 100%|██████████| 6132/6132 [00:02<00:00, 2495.03it/s]
Checking sad: 100%|██████████| 5960/5960 [00:02<00:00, 2318.83it/s]
Checking surprise: 100%|██████████| 3237/3237 [00:01<00:00, 2110.12it/s]


Scanning: test


Checking angry: 100%|██████████| 301/301 [00:00<00:00, 2064.94it/s]
Checking disgust: 100%|██████████| 31/31 [00:00<00:00, 2032.16it/s]
Checking fear: 100%|██████████| 322/322 [00:00<00:00, 2094.16it/s]
Checking happy: 100%|██████████| 778/778 [00:00<00:00, 2080.13it/s]
Checking neutral: 100%|██████████| 376/376 [00:00<00:00, 2059.10it/s]
Checking sad: 100%|██████████| 372/372 [00:00<00:00, 2097.88it/s]
Checking surprise: 100%|██████████| 208/208 [00:00<00:00, 2114.43it/s]


✅ Finished. Removed 0 duplicate images.



