In [246]:
import os
from pathlib import Path
import hashlib
import ipyplot
import numpy as np
from PIL import Image

### Step 1: Navigate over folders and subfolders


First, let us see how we can navigate over folders and subfolders.

In [10]:
path = './data'
# os.walk visits the root folder and then every folder below it, yielding
# the folder path together with the names of its sub-folders and files.
for dirpath, dirnames, filenames in os.walk(path):
    report = (
        ('Current path:', dirpath),
        ('Directories:', dirnames),
        ('Files:', filenames),
    )
    for label, value in report:
        print(label, value)
    # Blank line between folders keeps the listing readable.
    print()

Current path: ./data
Directories: ['f2', '.ipynb_checkpoints', 'f1']
Files: ['.DS_Store']

Current path: ./data/f2
Directories: []
Files: ['1653 copy2.jpg', '1637 copy2.jpg', '1654 copy2.jpg', '2827.jpg', '2828.jpg']

Current path: ./data/.ipynb_checkpoints
Directories: []
Files: ['Untitled-checkpoint.ipynb']

Current path: ./data/f1
Directories: ['f3']
Files: ['1637.jpg', '1636.jpg', '1806.jpg', '.DS_Store', '1653 copy.jpg', '1654.jpg', '1654 copy.jpg', '1653.jpg', '2211.jpg', '2218.jpg', '1831.jpg', '1637 copy.jpg', '1836.jpg']

Current path: ./data/f1/f3
Directories: []
Files: ['1653 copy.jpg', '1654 copy.jpg', '2827.jpg', '2828.jpg']



### Step 2: Build a dict with file hashes as keys and lists of file paths as values


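We can use the file hashes as keys: files with identical content produce the same hash, so each key maps to the list of paths that share that content.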


In [236]:
path = './data'


def file_md5(file_path, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is read in blocks of ``chunk_size`` bytes inside a ``with``
    statement, so a large file is never loaded into memory in one piece and
    the file handle is closed even if reading fails.

    MD5 is used here only to compare file contents, not for security.
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as handle:
        # iter(callable, sentinel) keeps calling read() until it returns b'' (EOF).
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def group_files_by_hash(root_dir):
    """Map each content hash to the files below ``root_dir`` that have it.

    Returns a dict whose keys are MD5 hex digests and whose values are lists
    of ``Path`` objects, in the order ``os.walk`` reaches them. Every file is
    hashed, whatever its name or extension. A hash with more than one path
    means those files have identical content.
    """
    hashes = dict()
    for root, folders, files in os.walk(root_dir):
        for file in files:
            file_path = Path(os.path.join(root, file))
            hashes.setdefault(file_md5(file_path), []).append(file_path)
    return hashes


# `path` stays bound to the root folder; the per-file paths live inside the helper.
file_hashes = group_files_by_hash(path)

In [237]:
# Keep only the groups in which more than one path shares the same hash.
duplicate_file_paths = [
    paths for paths in file_hashes.values() if len(paths) > 1
]

In [238]:
# Display the groups of paths whose files share a hash.
duplicate_file_paths

[[PosixPath('data/f2/1653 copy2.jpg'),
  PosixPath('data/f1/1653 copy.jpg'),
  PosixPath('data/f1/1653.jpg'),
  PosixPath('data/f1/f3/1653 copy.jpg')],
 [PosixPath('data/f2/1637 copy2.jpg'),
  PosixPath('data/f1/1637.jpg'),
  PosixPath('data/f1/1637 copy.jpg')],
 [PosixPath('data/f2/1654 copy2.jpg'),
  PosixPath('data/f1/1654.jpg'),
  PosixPath('data/f1/1654 copy.jpg'),
  PosixPath('data/f1/f3/1654 copy.jpg')],
 [PosixPath('data/f2/2827.jpg'), PosixPath('data/f1/f3/2827.jpg')],
 [PosixPath('data/f2/2828.jpg'), PosixPath('data/f1/f3/2828.jpg')]]

### Step 3: Show duplicates

Now let us plot the duplicates from `duplicate_file_paths`

In [243]:
# Take the second group of duplicates (index 1) and turn its paths into strings.
duplicates = list(map(str, duplicate_file_paths[1]))
# Load every file of that group as a 128x128 thumbnail.
# NOTE(review): these thumbnails do not appear to be used by the cells visible here - confirm.
images = []
for f in duplicates:
    images.append(Image.open(f).resize((128, 128)))

In [245]:
# Plot each group of duplicates in turn; the images are handed to ipyplot
# as path strings rather than as loaded image objects.
for duplicates in duplicate_file_paths:
    images = list(map(str, duplicates))
    ipyplot.plot_images(images)

### Step 4: For each of the duplicate images, remove all but one

Within each group of duplicates we keep the file with the shortest filename. This should keep "image.jpeg" rather than "image_copy.jpeg" or "image2.jpeg".

In [249]:
def get_len(x):
    """Return the length of the stem of path ``x`` (its file name without the suffix)."""
    return len(x.stem)


# Try the selection on the first group of duplicates: the index of the path
# with the shortest stem is the file that would be kept.
duplicates = duplicate_file_paths[0]
keep_index = np.argmin([get_len(candidate) for candidate in duplicates])

In [253]:

# For every group of identical files, keep the one with the shortest stem
# and delete all the others. np.argmin returns the first minimum, so on a
# tie the earliest path in the group is the one that is kept.
for duplicates in duplicate_file_paths:
    keep_index = np.argmin(list(map(get_len, duplicates)))
    for i, duplicate in enumerate(duplicates):
        if keep_index == i:
            continue
        try:
            os.remove(duplicate)
        except FileNotFoundError:
            # duplicate_file_paths is not refreshed after a deletion, so
            # re-running this cell meets files that are already gone;
            # report them instead of aborting the whole clean-up.
            print(f"already removed {duplicate}")
        else:
            # Only report a removal once it has actually succeeded.
            print(f"removed file {duplicate}")

removed file data/f2/1653 copy2.jpg
removed file data/f1/1653 copy.jpg
removed file data/f1/f3/1653 copy.jpg
removed file data/f2/1637 copy2.jpg
removed file data/f1/1637 copy.jpg
removed file data/f2/1654 copy2.jpg
removed file data/f1/1654 copy.jpg
removed file data/f1/f3/1654 copy.jpg
removed file data/f1/f3/2827.jpg
removed file data/f1/f3/2828.jpg
