In [5]:
import os
from PIL import Image
import numpy as np
from pathlib import Path
from tqdm import tqdm

def dhash(image, hash_size=8):
    """
    Compute a difference hash (dHash) of an image and return it as a hex string.

    The image is converted to grayscale and resized to
    (hash_size + 1) x hash_size pixels. Each pixel is then compared with its
    left-hand neighbour, producing hash_size bits per row.

    Args:
        image: A PIL image (anything exposing ``convert`` and ``resize``).
        hash_size: Number of rows, and of comparison bits per row.

    Returns:
        A hex string of fixed length ``hash_size * ceil(hash_size / 4)``.
        Every row contributes the same number of zero-padded hex digits, so
        two hashes built with the same ``hash_size`` line up position by
        position.
    """
    # 1. Grayscale, then shrink; the extra column gives hash_size
    #    neighbour comparisons per row.
    image = image.convert('L').resize((hash_size + 1, hash_size), Image.Resampling.LANCZOS)
    pixels = np.asarray(image)

    # 2. True where a pixel is brighter than the pixel to its left.
    diff = pixels[:, 1:] > pixels[:, :-1]

    # 3. Pack each row into an integer (column 0 is the least significant
    #    bit) and render it zero-padded. Without padding, a row value below
    #    0x10 would emit one digit instead of two, giving variable-length
    #    hashes whose characters no longer correspond between images.
    hex_width = (hash_size + 3) // 4
    return "".join(
        f"{sum(1 << i for i, bit in enumerate(row) if bit):0{hex_width}x}"
        for row in diff
    )

def find_duplicates(image_dir, threshold=2):
    """
    Find duplicate or near-duplicate images directly inside a directory.

    Every *.jpg, *.jpeg, *.png and *.webp file (any letter case) in
    ``image_dir`` is hashed with ``dhash``. Each image is compared against the
    images kept so far; the first kept image within ``threshold`` differing
    bits marks it as a duplicate. Files that cannot be opened or hashed are
    reported on stdout and skipped.

    Args:
        image_dir: Directory to scan (not recursive).
        threshold: Maximum number of differing hash bits for two images to
            count as duplicates (0 means identical hashes).

    Returns:
        A list of ``(duplicate_path, kept_path, distance)`` tuples.
    """
    image_dir = Path(image_dir)
    # Sorted so that which file of a pair is treated as the "original" does
    # not depend on the file system's listing order.
    image_paths = sorted(
        list(image_dir.glob("*.[jJ][pP][gG]"))
        + list(image_dir.glob("*.[jJ][pP][eE][gG]"))
        + list(image_dir.glob("*.[pP][nN][gG]"))
        + list(image_dir.glob("*.[wW][eE][bB][pP]"))
    )

    hashes = {}  # hash value as int -> path of the first image with that hash
    duplicates = []

    print(f"üîç ƒêang qu√©t {len(image_paths)} ·∫£nh trong {image_dir}...")

    for path in tqdm(image_paths):
        try:
            with Image.open(path) as img:
                # Parse the hex digest so distances can be measured in bits.
                bits = int(dhash(img), 16)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan.
            print(f"‚ö†Ô∏è L·ªói khi x·ª≠ l√Ω {path}: {e}")
            continue

        for existing_bits, existing_path in hashes.items():
            # True Hamming distance: number of set bits in the XOR.
            # Comparing hex characters would count a nibble that differs in
            # four bits the same as one that differs in a single bit.
            distance = bin(bits ^ existing_bits).count("1")

            if distance <= threshold:
                duplicates.append((path, existing_path, distance))
                break
        else:
            # No kept image is close enough: keep this one as a reference.
            hashes[bits] = path

    return duplicates

def main():
    """
    Scan the configured image folder, list near-duplicates, and optionally delete them.

    Calls ``find_duplicates`` with a threshold of 2, prints every reported
    pair, then asks for confirmation on stdin. Only an answer of 'y'
    (case-insensitive) deletes the duplicate file of each pair; the file it
    was matched against is left in place.
    """
    # Change this path to point at your own image directory.
    target_dir = r"d:\Computer Vision\Computer-Vision Project\Computer-Vision-\data\images\normal"

    duplicates = find_duplicates(target_dir, threshold=2)

    # Nothing to report: stop before prompting.
    if not duplicates:
        print("Kh√¥ng t√¨m th·∫•y ·∫£nh tr√πng l·∫∑p!")
        return

    print(f"\nüì¢ T√¨m th·∫•y {len(duplicates)} c·∫∑p ·∫£nh t∆∞∆°ng ƒë·ªìng:")
    for dup, original, dist in duplicates:
        print(f"  - [{dist}] {dup.name} gi·ªëng v·ªõi {original.name}")

    answer = input("\nB·∫°n c√≥ mu·ªën x√≥a c√°c ·∫£nh tr√πng l·∫∑p n√†y kh√¥ng? (y/n): ")

    # Anything other than 'y' / 'Y' cancels the deletion.
    if answer.lower() != 'y':
        print("\n ƒê√£ h·ªßy l·ªánh x√≥a.")
        return

    for dup, _original, _dist in duplicates:
        try:
            os.remove(dup)
        except Exception as e:
            # Report the failure and carry on with the remaining files.
            print(f" Kh√¥ng th·ªÉ x√≥a {dup.name}: {e}")
        else:
            print(f"ƒê√£ x√≥a: {dup.name}")

    print("\n ƒê√£ d·ªçn d·∫πp xong!")
if __name__ == "__main__":
    main()


🔍 Đang quét 127 ảnh trong d:\Computer Vision\Computer-Vision Project\Computer-Vision-\data\images\normal...


  0%|          | 0/127 [00:00<?, ?it/s]

100%|██████████| 127/127 [00:00<00:00, 187.05it/s]

Không tìm thấy ảnh trùng lặp!



