In [3]:
import os
import shutil
from imagededup.methods import PHash
from imagededup.methods import CNN
from imagededup.utils import plot_duplicates

In [17]:
def remove_duplicates(input_dir, output_dir, max_distance=10,
                      extensions=('.png', '.jpg', '.jpeg')):
    """
    Copy the images of ``input_dir`` that are not near-duplicates into ``output_dir``.

    Nothing is deleted from ``input_dir``: files flagged by the perceptual-hash
    comparison are simply not copied. Files already present in ``output_dir``
    are left in place (same-named files are overwritten by the copy).

    :param input_dir: Path to folder containing all frames
    :param output_dir: Path to save unique frames (created if missing)
    :param max_distance: PHash hamming distance threshold (0-64). Lower values
                         require images to be more alike before one of them is
                         treated as a duplicate, so fewer images are dropped.
                         Start with 10 and adjust as needed.
    :param extensions: File-name suffixes (case-insensitive) that count as images
                       when listing ``input_dir``. A single string is accepted too.
    :return: None. A three-line summary is printed to stdout.
    """
    # Initialize perceptual hasher
    phasher = PHash()

    # File names the hasher marks as redundant. Stored as a set so the
    # membership test in the copy loop is O(1) instead of a scan per file.
    duplicates = set(phasher.find_duplicates_to_remove(image_dir=input_dir,
                                                       max_distance_threshold=max_distance))

    # Create output directory if not exists
    os.makedirs(output_dir, exist_ok=True)

    # Normalise the suffixes once; file names are lower-cased before matching.
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    # Sorted listing keeps the copy order (and the printed counts) deterministic.
    all_files = sorted(f for f in os.listdir(input_dir) if f.lower().endswith(suffixes))

    # Copy every image that was not flagged as a duplicate.
    copied_count = 0
    for file in all_files:
        if file in duplicates:
            continue
        src_path = os.path.join(input_dir, file)
        dst_path = os.path.join(output_dir, file)
        shutil.copy2(src_path, dst_path)  # copy2 also carries over file metadata
        copied_count += 1

    print(f"Original images: {len(all_files)}")
    print(f"Unique images kept: {copied_count}")
    print(f"Removed duplicates: {len(all_files) - copied_count}")


In [23]:
if __name__ == "__main__":
    # ---------- Run settings: edit these before executing the cell ----------
    # Folder holding the extracted frames to deduplicate.
    INPUT_FOLDER = "C:/Users/fahri/OneDrive/Documents/Skripsi/data/processed/data_09_mei_2025/video 3/images"
    # Folder that receives the frames kept after deduplication.
    OUTPUT_FOLDER = "C:/Users/fahri/OneDrive/Documents/Skripsi/data/processed/data_09_mei_2025/deduplicate_video3"
    # Hamming-distance threshold passed to remove_duplicates
    # (0 = identical, 64 = max different). This run uses 7.
    MAX_DISTANCE = 7
    # -------------------------------------------------------------------------

    remove_duplicates(
        input_dir=INPUT_FOLDER,
        output_dir=OUTPUT_FOLDER,
        max_distance=MAX_DISTANCE,
    )

2025-05-15 12:48:31,576: INFO Start: Calculating hashes...
100%|██████████| 3103/3103 [02:14<00:00, 22.99it/s]
2025-05-15 12:50:49,471: INFO End: Calculating hashes!
2025-05-15 12:50:49,476: INFO Start: Evaluating hamming distances for getting duplicates
2025-05-15 12:50:49,478: INFO Start: Retrieving duplicates using BKTree algorithm
100%|██████████| 3103/3103 [01:29<00:00, 34.67it/s]
2025-05-15 12:52:25,160: INFO End: Retrieving duplicates using BKTree algorithm
2025-05-15 12:52:25,164: INFO End: Evaluating hamming distances for getting duplicates


Original images: 3103
Unique images kept: 500
Removed duplicates: 2603
