In [1]:
! pip install imagehash Pillow




[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# ============================
# MEMORY-EFFICIENT DUPLICATE FOLDER CHECK (RESNET50)
# ============================

import os
import torch
import torchvision.transforms as transforms
import torchvision.models as models
from PIL import Image
import numpy as np
from tqdm import tqdm
from torch.nn.functional import normalize

# ------------------------------
# Load Pretrained ResNet50
# ------------------------------
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# torchvision >= 0.13 deprecates `pretrained=True` in favour of an explicit
# `weights=` argument. IMAGENET1K_V1 is the checkpoint that `pretrained=True`
# resolves to, so embeddings (and any cached .npy files) are unchanged.
if hasattr(models, "ResNet50_Weights"):
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
else:
    # Older torchvision releases only understand the legacy flag.
    model = models.resnet50(pretrained=True)

# Drop the last child module (the classification layer) so the network
# outputs pooled features instead of class logits.
model = torch.nn.Sequential(*list(model.children())[:-1])  # remove classifier
model.to(device)
model.eval()  # inference mode: freezes batch-norm statistics

# Preprocessing applied to every image before it is fed to the network:
# fixed 224x224 resize, tensor conversion, then per-channel normalisation.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# ------------------------------
# Convert image → embedding
# ------------------------------
def get_embedding(img_path):
    """Return an L2-normalised feature vector for one image, or None if it cannot be read.

    The image is opened with PIL, converted to RGB, preprocessed with the
    module-level ``transform`` and passed through the module-level ``model``
    on ``device``. The model output is flattened to 1-D and L2-normalised.

    Args:
        img_path: Path of the image file to embed.

    Returns:
        A 1-D ``numpy.ndarray`` holding the normalised embedding, or ``None``
        when the file cannot be opened or decoded as an image.
    """
    try:
        # The context manager guarantees the file handle is released even if
        # decoding fails part-way through.
        with Image.open(img_path) as raw:
            img = raw.convert("RGB")
    except Exception as exc:
        # Best-effort: skip unreadable files, but report them. `Exception`
        # (unlike a bare `except:`) lets KeyboardInterrupt / SystemExit through,
        # so a long run can still be interrupted.
        # tqdm.write prints without corrupting an active progress bar.
        tqdm.write(f"Skipping unreadable image {img_path}: {exc}")
        return None

    # Add a leading batch dimension of size 1 and move to the model's device.
    img_t = transform(img).unsqueeze(0).to(device)

    # No gradients are needed for feature extraction.
    with torch.no_grad():
        emb = model(img_t)

    emb = emb.flatten()          # collapse all dimensions into a single vector
    emb = normalize(emb, dim=0)  # unit length, so a dot product is a cosine similarity
    return emb.cpu().numpy()

# ------------------------------
# Process single folder
# ------------------------------
def compute_folder_embeddings(folder_path, save_path):
    """Embed every image in a folder, caching the result as a ``.npy`` file.

    If ``save_path`` already exists it is loaded and returned as-is, without
    looking at ``folder_path``. Otherwise each file in ``folder_path`` whose
    name ends in .jpg, .jpeg or .png (case-insensitive) is passed to
    ``get_embedding``; results that are ``None`` are skipped. The collected
    embeddings are saved to ``save_path`` and returned.

    Args:
        folder_path: Directory containing the images.
        save_path: Location of the cached embeddings file.

    Returns:
        ``numpy.ndarray`` built from the list of per-image embeddings.
    """
    # Cache hit: reuse what a previous run stored.
    if os.path.exists(save_path):
        return np.load(save_path)

    accepted_suffixes = (".jpg", ".jpeg", ".png")
    image_names = [
        entry for entry in os.listdir(folder_path)
        if entry.lower().endswith(accepted_suffixes)
    ]

    progress = tqdm(image_names, desc=f"Processing {os.path.basename(folder_path)}")
    # Lazily embed each image as the progress bar advances.
    candidates = (get_embedding(os.path.join(folder_path, name)) for name in progress)
    stacked = np.array([vec for vec in candidates if vec is not None])

    np.save(save_path, stacked)
    return stacked

# ------------------------------
# Folder similarity (cosine)
# ------------------------------
def folder_similarity(emb1, emb2):
    """Score how well the embeddings in ``emb1`` are matched by those in ``emb2``.

    For every row of ``emb1`` the largest dot product against the rows of
    ``emb2`` is taken (its best match); the score is the mean of those maxima.
    The measure is directional: swapping the arguments can give a different
    value. Rows are expected to be unit-normalised (as ``get_embedding``
    produces), in which case the dot product is the cosine similarity.

    Args:
        emb1: 2-D array of embeddings, one per row.
        emb2: 2-D array of embeddings, one per row.

    Returns:
        The mean best-match score, or ``0`` if either input is empty.
    """
    # With nothing on one side there is nothing to match.
    if not (len(emb1) and len(emb2)):
        return 0

    candidates_T = emb2.T

    # One row at a time keeps peak memory at a single row of scores rather
    # than a full len(emb1) x len(emb2) matrix.
    best_matches = [np.dot(query, candidates_T).max() for query in emb1]

    return np.mean(best_matches)

# ------------------------------
# DETECT FOLDERS WITH >90% DUPLICATION
# ------------------------------
def detect_duplicate_folders(root_path, threshold=0.90):
    """Find pairs of sub-folders of ``root_path`` whose images largely duplicate each other.

    Every immediate sub-folder is embedded with ``compute_folder_embeddings``
    (cached beside the folder as ``<folder>_embeddings.npy``), then every
    unordered pair of folders is scored with ``folder_similarity``.

    ``folder_similarity(a, b)`` is directional: it measures how well the
    images of ``a`` are matched inside ``b``. Both directions are therefore
    checked, so a folder that is contained in a larger one is found no matter
    which of the two sorts first.

    Args:
        root_path: Directory whose immediate sub-directories are compared.
        threshold: Minimum similarity (0-1) for a pair to be reported.

    Returns:
        A list of ``(name_a, name_b, similarity)`` tuples, where
        ``similarity`` is ``folder_similarity`` of folder ``a`` against
        folder ``b``. A pair is reported at most once; the earlier->later
        direction takes precedence when both directions meet the threshold.
    """
    # os.listdir returns entries in arbitrary order; sort (case-insensitively)
    # so the comparison order and the output are reproducible across platforms.
    names = sorted(os.listdir(root_path), key=str.lower)
    folders = [os.path.join(root_path, d) for d in names
               if os.path.isdir(os.path.join(root_path, d))]

    duplicates = []

    # Step 1: compute/load embeddings
    folder_embeddings = {
        f: compute_folder_embeddings(f, f"{f}_embeddings.npy")
        for f in folders
    }

    # Step 2: compare folders
    for i, f1 in enumerate(folders):
        for f2 in folders[i + 1:]:
            emb1, emb2 = folder_embeddings[f1], folder_embeddings[f2]

            forward = folder_similarity(emb1, emb2)
            if forward >= threshold:
                duplicates.append((os.path.basename(f1),
                                   os.path.basename(f2),
                                   forward))
                continue

            # The reverse direction is only computed when the forward one did
            # not already flag the pair.
            backward = folder_similarity(emb2, emb1)
            if backward >= threshold:
                duplicates.append((os.path.basename(f2),
                                   os.path.basename(f1),
                                   backward))

    return duplicates


# ===============================
# RUN
# ===============================

# Root directory whose immediate sub-folders are compared against each other.
root_path = r"E:\merged_waste_dataset"   # <-- CHANGE THIS

# Uses the default threshold (0.90) of detect_duplicate_folders.
duplicates = detect_duplicate_folders(root_path)

# Each entry is (folder name, folder name, similarity); the similarity is
# printed as a percentage with two decimals.
for f1, f2, sim in duplicates:
    print(f"ORIGINAL : {f1}   |   DUPLICATE : {f2}   |   Similarity: {sim*100:.2f}%")




Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to C:\Users\HP/.cache\torch\hub\checkpoints\resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:17<00:00, 5.81MB/s]
Processing 1. Polythene: 100%|██████████| 120/120 [00:30<00:00,  3.99it/s]
Processing 10.Mask: 100%|██████████| 129/129 [00:45<00:00,  2.86it/s]
Processing 4. Glass: 100%|██████████| 120/120 [00:40<00:00,  3.00it/s]
Processing 5. Wire: 100%|██████████| 120/120 [00:37<00:00,  3.23it/s]
Processing 6. Glaves: 100%|██████████| 125/125 [00:40<00:00,  3.12it/s]
Processing 7. Empty medicine packet: 100%|██████████| 131/131 [00:43<00:00,  3.00it/s]
Processing 7. Shell of Malta: 100%|██████████| 126/126 [00:40<00:00,  3.07it/s]
Processing background_ground: 100%|██████████| 2470/2470 [05:36<00:00,  7.34it/s]
Processing Battery: 100%|██████████| 300/300 [00:38<00:00,  7.73it/s]
Processing food_dishes: 100%|██████████| 4230/4230 [09:09<00:00,  7.70it/s]
Processing food_items: 100%|██████████| 3862/3862 [08:19<00:00,  7.73it/s]
Processing food_recyclable: 100%|██████████| 2571/2571 [05:43<00:00,  7.48it/s]
Processing fruits_and_vegetables: 100%|

ORIGINAL : food_dishes   |   DUPLICATE : mixed   |   Similarity: 100.00%
ORIGINAL : food_items   |   DUPLICATE : mixed   |   Similarity: 99.99%
ORIGINAL : food_recyclable   |   DUPLICATE : mixed   |   Similarity: 99.99%
ORIGINAL : fruits_and_vegetables   |   DUPLICATE : mixed   |   Similarity: 99.99%
ORIGINAL : gauze   |   DUPLICATE : glove_pair_latex   |   Similarity: 91.57%
ORIGINAL : gauze   |   DUPLICATE : glove_single_latex   |   Similarity: 91.24%
ORIGINAL : glove_pair_latex   |   DUPLICATE : glove_single_latex   |   Similarity: 94.46%
ORIGINAL : glove_pair_nitrile   |   DUPLICATE : glove_single_nitrile   |   Similarity: 94.56%
ORIGINAL : glove_pair_nitrile   |   DUPLICATE : shoe_cover_pair   |   Similarity: 91.18%
ORIGINAL : glove_pair_nitrile   |   DUPLICATE : shoe_cover_single   |   Similarity: 92.11%
ORIGINAL : glove_pair_surgery   |   DUPLICATE : glove_single_surgery   |   Similarity: 92.52%
ORIGINAL : glove_single_nitrile   |   DUPLICATE : glove_single_surgery   |   Similar