# Find Duplicate Photos

In [1]:
from src.data_pipeline import DataManager as DM
import src.metadata_handler as mh
%matplotlib inline

In [2]:
import Image
from iptcinfo import IPTCInfo, c_datasets
import os
import sys
import hashlib
from collections import defaultdict
import struct


## Methods

In [3]:
def _hash_jpeg(fh):
    # thank you stack overflow (with minor edits): 
    # http://stackoverflow.com/questions/10075065/compute-hash-of-only-the-core-image-data-excluding-metadata-for-an-image
    _hash =  hashlib.sha1()
    # _hash = hashlib.md5()
    assert fh.read(2) == "\xff\xd8"
    while True:
        marker,length = struct.unpack(">2H", fh.read(4))
        assert marker & 0xff00 == 0xff00
        if marker == 0xFFDA: # Start of stream
            _hash.update(fh.read())
            break
        else:
            fh.seek(length-2, os.SEEK_CUR)
    return _hash.hexdigest()

def printhash(file_path):
    """Print the image-stream hash of the JPEG stored at ``file_path``.

    Args:
        file_path: path of the JPEG file to hash.

    Errors raised while opening the file or by ``_hash_jpeg`` propagate to
    the caller.
    """
    # Binary mode: a JPEG is raw bytes, and text mode may translate line
    # endings or attempt to decode the contents before they are hashed.
    with open(file_path, 'rb') as fh:
        # Parenthesised single-argument form prints the same on Python 2 and 3.
        print(_hash_jpeg(fh))

def find_duplicates(directory, extensions=('.jpg',)):
    """Group files under ``directory`` whose JPEG image streams hash alike.

    The directory tree is walked recursively; every file whose name ends
    with one of ``extensions`` (case-insensitive) is hashed with
    ``_hash_jpeg`` and grouped by digest.

    Args:
        directory: root directory to search.
        extensions: iterable of filename suffixes (each including the dot)
            that identify files to hash. Defaults to ``('.jpg',)``, which
            matches the original behaviour. Pass a tuple or list, not a
            bare string.

    Returns:
        A list of lists of file paths; each inner list holds two or more
        paths that produced the same digest.

    Errors raised while opening a file or by ``_hash_jpeg`` propagate to
    the caller.
    """
    suffixes = tuple(ext.lower() for ext in extensions)
    duplicate_dict = defaultdict(list)
    for p, dirs, files in os.walk(directory):
        for ff in files:
            if ff.lower().endswith(suffixes):
                file_path = os.path.join(p, ff)
                # Binary mode: text mode may translate or decode the bytes
                # before they are hashed.
                with open(file_path, 'rb') as fh:
                    duplicate_dict[_hash_jpeg(fh)].append(file_path)
    # .values() rather than .itervalues() so this runs on Python 2 and 3.
    return [v for v in duplicate_dict.values() if len(v) > 1]



In [6]:
%%time
# Baseline for comparison: open the image through the Image module, turn
# the full pixel sequence into the string form of a Python list, and
# SHA-1 that string.
image = Image.open('data/first_sample/EK000004-2.JPG')
pixels = str(list(image.getdata()))

hash_img = hashlib.sha1(pixels)
hex_dig = hash_img.hexdigest()

print hex_dig

4dd5fb0d9c96a4134f53f722458e34624f38ce53
CPU times: user 3.2 s, sys: 290 ms, total: 3.49 s
Wall time: 3.55 s


In [5]:
%%time
# Time the segment-skipping hash on the same file used by the pixel-based cell.
printhash('data/first_sample/EK000004-2.JPG')

3a6b5be86fdf3c956bb4c6c0d6e6491a55938d2f
CPU times: user 1.9 ms, sys: 1.62 ms, total: 3.52 ms
Wall time: 2.61 ms


## Testing

In [None]:
# Sample paths for the (currently commented-out) printhash loop in the next cell.
# NOTE(review): the names suggest three copies of EK000004 plus one distinct
# image -- confirm against the actual files.
testdupes = ['data/first_sample/EK000010.JPG', 
         'data/first_sample/EK000004.JPG', 
         'data/first_sample/EK000004-2.JPG', 
         'data/first_sample/EK000004 copy.JPG']   

In [None]:
# %%time
# for item in testdupes:
#     printhash(item)

In [None]:
%%time
# Walk data/second_sample/ and keep the groups of .jpg paths whose
# image-stream hashes coincide.
duplicates = find_duplicates('data/second_sample/')

In [7]:
# Build an IPTCInfo object for each of the two zephy test images.
zephy1 = IPTCInfo('data/zephy1.jpg')
zephy2 = IPTCInfo('data/zephy2.jpg')

In [None]:
# zephy1.data['caption/abstract'] = "Data Scientist and his (wife's) dog!"
# zephy1.data['keywords'].append('canine')
# zephy1.data['keywords'].append('homo sapien')
# zephy1.save()

In [8]:
# Show both IPTCInfo objects, separated by a blank line, for side-by-side comparison.
print zephy1
print
print zephy2

charset: None
{'original transmission reference': 'fXp-KYJdAwxGlEsyU0AM', 'supplemental category': [], 'contact': [], 'special instructions': 'FBMD01000a9a0d0000775c0000ffb100000eb3000040b6000088e50000f9730100a78c0100aa9201009c9b010051b20200', 'keywords': ['canine', 'homo sapien'], 'caption/abstract': "Data Scientist and his (wife's) dog!"}

charset: None
{'keywords': [], 'original transmission reference': 'fXp-KYJdAwxGlEsyU0AM', 'supplemental category': [], 'special instructions': 'FBMD01000a9a0d0000775c0000ffb100000eb3000040b6000088e50000f9730100a78c0100aa9201009c9b010051b20200', 'contact': []}


In [9]:
# Hash both zephy files; the recorded output shows identical digests even
# though the IPTC data printed in the previous cell differs between them.
printhash('data/zephy1.jpg')
printhash('data/zephy2.jpg')

17f65583d4b3d6485eb5b25273811c2dc592437e
17f65583d4b3d6485eb5b25273811c2dc592437e
