In [1]:
import os
import hashlib
from collections import defaultdict, Counter

def get_file_hash(filepath):

    hash_md5 = hashlib.md5()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def check_duplicates(dataset_dir):
    splits = ['train', 'val', 'test']
    hash_dict = defaultdict(list)

    for split in splits:
        split_dir = os.path.join(dataset_dir, split)
        for root, _, files in os.walk(split_dir):
            for fname in files:
                fpath = os.path.join(root, fname)
                file_hash = get_file_hash(fpath)

                cls_name = os.path.basename(os.path.dirname(fpath))
                hash_dict[file_hash].append((split, cls_name, fpath))

    duplicates = {h: paths for h, paths in hash_dict.items() if len(paths) > 1}

    if duplicates:
        print("Duplicate images detected across splits")
        class_counter = Counter()

        for h, paths in duplicates.items():
            for _, cls, _ in paths:
                class_counter[cls] += 1

            print(f"\nHash: {h}")
            for split, cls, fpath in paths:
                print(f"  [{split}] ({cls}) {fpath}")

        print("\nDuplicate image count by class:")
        for cls, count in class_counter.most_common():
            print(f"  {cls}: {count} duplicates")
    else:
        print("No duplicates found between train/val/test.")

if __name__ == "__main__":
    DATASET_DIR = r"D:\dataset_split"  
    check_duplicates(DATASET_DIR)


Duplicate images detected across splits

Hash: 32cb22521edc69a5503a6749446dd522
  [train] (Tomato_healthy) D:\dataset_split\train\Tomato_healthy\068e324c-faf6-40d6-8f83-578907f1cac5___GH_HL Leaf 466.1.JPG
  [train] (Tomato_healthy) D:\dataset_split\train\Tomato_healthy\34c81c57-e1fa-49dd-a49d-34fe8b2385fe___GH_HL Leaf 466.1.JPG

Hash: ea0a44df2e4e44c6ccacc31f067a6284
  [train] (Tomato_healthy) D:\dataset_split\train\Tomato_healthy\1af0bfe1-4bcf-4b8b-be66-5d0953eb647e___GH_HL Leaf 482.2.JPG
  [train] (Tomato_healthy) D:\dataset_split\train\Tomato_healthy\cfd491d6-4af5-4728-8f0e-0d330a07174a___GH_HL Leaf 482.2.JPG

Hash: 2e53ad1ec3f810e2ff4c83400dc69ec9
  [train] (Tomato_healthy) D:\dataset_split\train\Tomato_healthy\37203047-d8ba-43f7-b31e-d496c41c569c___GH_HL Leaf 389.JPG
  [train] (Tomato_healthy) D:\dataset_split\train\Tomato_healthy\505465db-407b-4e0a-8110-7479dad5261c___GH_HL Leaf 389.JPG

Hash: 3dab5eb1192b5c43fd3015490cf0b18f
  [train] (Tomato_healthy) D:\dataset_split\train\Toma