# Find Duplicate Photos

In [1]:
from src.data_pipeline import DataManager as DM
import src.metadata_handler as mh
import src.duplicate_detector as dud

# NOTE: filtering warnings (below) does not silence the IPTCInfo warning, so it is left disabled
# import warnings
# warnings.filterwarnings('ignore')

In [2]:
import Image
from iptcinfo import IPTCInfo, c_datasets
import json
import os
import sys
import hashlib
from collections import defaultdict
import struct

%matplotlib inline

## Methods
http://stackoverflow.com/questions/10075065/compute-hash-of-only-the-core-image-data-excluding-metadata-for-an-image

In [3]:
def _hash_jpeg(fh):
    """Return the SHA-1 hex digest of a JPEG's scan data, ignoring metadata.

    Walks the segments that follow the SOI marker, skipping each one by its
    declared length, until the Start Of Scan marker (0xFFDA) is found.
    Everything after that marker and its 2-byte length field, through end of
    file, is hashed. Two files that differ only in the skipped segments
    therefore produce the same digest.

    Args:
        fh: a seekable file object opened in binary mode, positioned at the
            start of the JPEG data.

    Returns:
        The hexadecimal SHA-1 digest as a string.

    Raises:
        ValueError: if the stream does not start with the SOI marker, ends
            before a Start Of Scan marker is found, or contains a malformed
            segment header.
    """
    # thank you stack overflow (with minor edits):
    # http://stackoverflow.com/questions/10075065/compute-hash-of-only-the-core-image-data-excluding-metadata-for-an-image
    digest = hashlib.sha1()

    # Explicit checks rather than ``assert``: under ``python -O`` asserts are
    # removed, which would also remove the read that consumes the SOI marker.
    if fh.read(2) != b"\xff\xd8":
        raise ValueError("not a JPEG stream: missing SOI marker (0xFFD8)")

    while True:
        header = fh.read(4)
        if len(header) != 4:
            raise ValueError(
                "truncated JPEG stream: no Start Of Scan marker found")
        marker, length = struct.unpack(">2H", header)
        if marker & 0xff00 != 0xff00:
            raise ValueError("invalid JPEG segment marker: 0x%04x" % marker)

        if marker == 0xFFDA:  # Start of scan
            # Hash in chunks so the whole image never has to sit in memory;
            # the digest is the same as hashing fh.read() in one go.
            for chunk in iter(lambda: fh.read(65536), b""):
                digest.update(chunk)
            return digest.hexdigest()

        # The length field counts its own two bytes, so anything below 2 is
        # malformed and would make the seek move backwards.
        if length < 2:
            raise ValueError(
                "invalid JPEG segment length %d for marker 0x%04x"
                % (length, marker))
        fh.seek(length - 2, os.SEEK_CUR)

def printhash(file_path):
    """Print the digest that `_hash_jpeg` computes for one JPEG file.

    Args:
        file_path: path of the JPEG file to hash.
    """
    # Binary mode is required: JPEG data is not text, and a text-mode handle
    # translates newlines on Windows (and decodes to str on Python 3).
    with open(file_path, 'rb') as fh:
        # Single-argument print(...) produces the same output whether it is
        # parsed as a Python 2 statement or a Python 3 call.
        print(_hash_jpeg(fh))

def find_duplicates(directory):
    """Group the .jpg files under *directory* by their `_hash_jpeg` digest.

    Args:
        directory: root of the tree to scan; walked recursively with os.walk.

    Returns:
        A list of lists of file paths. Each inner list holds two or more
        files whose `_hash_jpeg` digest is identical.
    """
    duplicate_dict = defaultdict(list)
    count = 0
    for p, dirs, files in os.walk(directory):
        for ff in files:
            # Only .jpg files (any letter case); skip the weird non-jpg files
            # whose names start with '._' (created by picasa?).
            if not ff.lower().endswith('.jpg') or ff.startswith('._'):
                continue
            count += 1
            if count % 100 == 0:
                # Progress marker every 100th file; single-argument form
                # prints the same text on Python 2 and Python 3.
                print("processing file:  " + ff)
            file_path = os.path.join(p, ff)
            # Binary mode: JPEG data must not go through text-mode newline
            # translation or decoding.
            with open(file_path, 'rb') as fh:
                duplicate_dict[_hash_jpeg(fh)].append(file_path)
    # Keep only digests shared by more than one file.
    return [v for v in duplicate_dict.values() if len(v) > 1]

def listphotos(directory):
    """Return the paths of all .jpg files found anywhere under *directory*.

    The extension match ignores letter case. Files whose names begin with
    '._' are left out (weird non-jpg files, possibly created by picasa).

    Args:
        directory: root of the tree to scan; walked recursively with os.walk.

    Returns:
        A list of paths, each built by joining the containing directory
        with the file name, in os.walk order.
    """
    return [
        os.path.join(root, name)
        for root, _subdirs, names in os.walk(directory)
        for name in names
        if name.lower().endswith('.jpg') and not name.startswith('._')
    ]

In [None]:
%%time
# Alternative approach: hash the decoded pixel values instead of the raw
# JPEG stream. Presumably the timing baseline for printhash -- TODO confirm.
image = Image.open('../data/first_sample/EK000004-2.JPG')
# Turn the full pixel sequence into one string so it can be fed to sha1.
pixels = str(list(image.getdata()))

hash_img = hashlib.sha1(pixels)
hex_dig = hash_img.hexdigest()

print hex_dig

In [None]:
%%time
# Hash the same sample file via printhash (which uses _hash_jpeg).
printhash('../data/first_sample/EK000004-2.JPG')

## Testing

In [None]:
# Sample paths for manual hash checks. The file names suggest the three
# EK000004* files are copies of one photo -- TODO confirm.
testdupes = ['../data/first_sample/EK000010.JPG', 
         '../data/first_sample/EK000004.JPG', 
         '../data/first_sample/EK000004-2.JPG', 
         '../data/first_sample/EK000004 copy.JPG']   

In [None]:
# %%time
# for item in testdupes:
#     printhash(item)

In [None]:
# Collect the path of every .jpg file under the second sample directory.
photolist = listphotos('../data/second_sample/')
# Last expression of the cell: shows how many photos were found.
len(photolist)

In [4]:
%%time
# Each entry of `duplicates` is a list of paths sharing one image hash.
duplicates = find_duplicates('../data/second_sample/')
# Number of duplicate groups, not the number of duplicate files.
print len(duplicates)

processing file:  PICT0173.JPG
processing file:  EK000054.jpg
processing file:  01080240.JPG
processing file:  01140340.JPG
processing file:  01210440.JPG
processing file:  02040453.JPG
processing file:  02200773.JPG
processing file:  03100935.JPG
processing file:  PICT0106.JPG
processing file:  PICT0242.JPG
processing file:  PICT0360.JPG
processing file:  PICT0487.JPG
processing file:  PICT0628.JPG
processing file:  PICT0749.JPG
47
CPU times: user 2.01 s, sys: 1.13 s, total: 3.15 s
Wall time: 6.55 s


## Check that identical photos with different metadata have the same hash

In [5]:
# These start off as identical copies of the same image
zephy1 = IPTCInfo('../data/zephy1.jpg')
zephy2 = IPTCInfo('../data/zephy2.jpg')

In [None]:
# zephy1.data['caption/abstract'] = "Data Scientist and his (wife's) dog!"
# zephy1.data['keywords'].append('canine')
# zephy1.data['keywords'].append('homo sapien')
# zephy1.save()

In [6]:
z1 = mh.build_dictionary('../data/zephy1.jpg')
z2 = mh.build_dictionary('../data/zephy2.jpg')

LL = [z1, z2]

for k in max(LL).iterkeys():
    results = [d[k] if k in d else '<no value>' for d in LL]
    if results[0] != results[1]:
        print '\t', k, results

	keywords [['canine', 'homo sapien'], []]
	file_path ['data/zephy1.jpg', 'data/zephy2.jpg']
	caption/abstract ["Data Scientist and his (wife's) dog!", '<no value>']


In [7]:
# Print the image hash of both copies; the two printed digests can then be
# compared by eye.
printhash('../data/zephy1.jpg')
printhash('../data/zephy2.jpg')

17f65583d4b3d6485eb5b25273811c2dc592437e
17f65583d4b3d6485eb5b25273811c2dc592437e


## How to Choose Between Identical Images

In [None]:
%%time

# Open an IPTCInfo object for every photo found by listphotos; the cell is
# run under %%time, so this measures the cost of doing that for all files.
infolist = []
for photo in photolist:
    info = IPTCInfo(photo)
    infolist.append(info)
    
# Report how many files were processed.
print '\n accessing info for {} files\n'.format(len(photolist))


In [8]:
duplicate_data = [[mh.build_dictionary(i) for i in sublist] for sublist in duplicates]

count = 0
for sublist in duplicate_data:
    if len(sublist) == 2:
        print count
        count +=1
        for k in max(sublist).iterkeys():
            results = [d[k] if k in d else '<no value>' for d in sublist]
            if results[0] != results[1]:
                print '\t', k, results
        print

0
	sub-location ['Hope Mountain', 'Hope Mountain-2015-1']
	by-line ['Ken Vanden Heuvel ', 'Ken Vanden Heuvel']
	object name ['Lynx ', '<no value>']
	file_path ['data/second_sample/Hope Mountain/Hope Mountain-2015-1/01.01.15-02.01.15/11240040.JPG', 'data/second_sample/Hope Mountain/Hope Mountain-2015-1/11.22.14-2.1.15/11240040.JPG']
	caption/abstract ['M2E6L0-0R350B362', '1']

1
	sub-location ['Hope Mountain', 'Hope Mountain-2015-1']
	by-line ['Ken Vanden Heuvel ', 'Ken Vanden Heuvel']
	object name ['Lynx ', '<no value>']
	file_path ['data/second_sample/Hope Mountain/Hope Mountain-2015-1/01.01.15-02.01.15/11240036.JPG', 'data/second_sample/Hope Mountain/Hope Mountain-2015-1/11.22.14-2.1.15/11240036.JPG']
	caption/abstract ['M2E6L0-0R350B362', '1']

2
	sub-location ['Hope Mountain', 'Hope Mountain-2015-1']
	by-line ['Ken Vanden Heuvel ', 'Ken Vanden Heuvel']
	keywords [['snowshoe hare'], ['unidentified']]
	object name ['Lynx ', '<no value>']
	file_path ['data/second_sample/Hope Mountain/

In [9]:
# Same file name (11300135.JPG) stored in two different date-range folders;
# build the metadata dictionary for each and show their keywords.
dup5a = mh.build_dictionary('../data/second_sample/Hope Mountain/Hope Mountain-2015-1/01.01.15-02.01.15/11300135.JPG')
dup5b = mh.build_dictionary('../data/second_sample/Hope Mountain/Hope Mountain-2015-1/11.22.14-2.1.15/11300135.JPG')
print 'dup5a keywords: ', dup5a['keywords']
print 'dup5b keywords: ', dup5b['keywords']

dup5a keywords:  ['snowshoe hare']
dup5b keywords:  ['unidentified']


In [10]:
sublist = duplicate_data[1]
print
for k in max(sublist).iterkeys():
    if sublist[0] != sublist[1]:
        print k, ": ", sublist[0].get(k, '<no value>')
        print k, ": ", sublist[1].get(k, '<no value>')


date created :  20141124
date created :  20141124
copyright notice :  Conservation Northwest
copyright notice :  Conservation Northwest
sub-location :  Hope Mountain
sub-location :  Hope Mountain-2015-1
by-line :  Ken Vanden Heuvel 
by-line :  Ken Vanden Heuvel
supplemental category :  []
supplemental category :  []
contact :  []
contact :  []
time created :  185349
time created :  185349
keywords :  ['unidentified']
keywords :  ['unidentified']
object name :  Lynx 
object name :  <no value>
file_path :  data/second_sample/Hope Mountain/Hope Mountain-2015-1/01.01.15-02.01.15/11240036.JPG
file_path :  data/second_sample/Hope Mountain/Hope Mountain-2015-1/11.22.14-2.1.15/11240036.JPG
caption/abstract :  M2E6L0-0R350B362
caption/abstract :  1


## Full List of Duplicates:

In [27]:
# Load the duplicate list from its JSON file (presumably written by an
# earlier run; the code that produces it is not shown here).
with open('../data/duplicate_list.json', 'r') as fh:
    all_duplicates = json.load(fh)
