https://pypi.org/project/ImageHash/

https://stackoverflow.com/questions/998662/what-is-image-hashing-used-for (erklärt die Verfahren)

M.Sc. thesis zum p-hash http://www.phash.org/docs/pubs/thesis_zauner.pdf

Noch mehr zu den Distanzen:

https://tech.okcupid.com/evaluating-perceptual-image-hashes-okcupid/
https://content-blockchain.org/research/testing-different-image-hash-functions/


In [133]:
%matplotlib inline

import os
import sys
import math
from datetime import datetime

# image hash dependencies
from PIL import Image
import imagehash

import numpy as np
from scipy.spatial.distance import hamming
from jellyfish import jaro_distance
import scipy.cluster.hierarchy as scipycluster

import matplotlib.pyplot as plt

# PPN of the medium that serves as the comparison base; the other PPNs are compared against it
comparison_base_ppn="PPN745219993"
# directory that is scanned for PPN* sub-directories and that receives the generated HTML files;
# it must end with a path separator because paths are built by plain string concatenation
basePath="/Users/david/src/python/StabiHacks/sbbget/sbbget_downloads.leske_mini/download_temp/"
#basePath="/Users/david/src/python/StabiHacks/sbbget/sbbget_downloads.leske/download_temp/"

In [116]:
def printLog(text):
    """Write ``text`` to stdout, prefixed with the current timestamp in brackets and a tab.

    ``text`` has to be a string. The output stream is flushed right away so the
    message becomes visible immediately, see:
    http://stackoverflow.com/questions/230751/how-to-flush-output-of-python-print
    """
    timestamp = str(datetime.now())
    # join() rejects non-string parts with a TypeError, just like string concatenation
    line = "".join(("[", timestamp, "]\t", text))
    print(line, flush=True)

In [117]:
# Check all sub-directories starting with "PPN", as each PPN stands for a different medium.
# Plain files that merely carry a "PPN" prefix are skipped so that they are not registered
# as (empty) media, and the names are sorted because os.listdir() returns its entries in
# arbitrary order.
ppnDirs = sorted(
    entry
    for entry in os.listdir(basePath)
    if entry.startswith("PPN") and os.path.isdir(os.path.join(basePath, entry))
)
# maps each PPN to a list of files belonging to it; the lists start out empty
dirsPerPPN = {ppnDir: [] for ppnDir in ppnDirs}

In [118]:
# Walk every PPN directory below basePath, search for *_FULLTEXT and *_TIFF
# directories and associate the files found there with their PPN.

# each dictionary uses a PPN as key and a list of file paths (or tuples) as value;
# the base_ variables hold the corresponding values of comparison_base_ppn
base_fulltextFilePaths = []
# entries are (physicalID, file path, perceptual hash) tuples
base_jpgFilePaths = []
# NOTE: base_jpgHashes and jpgHashes are initialised here but never filled by this
# cell; the hashes are stored as the third element of the jpg tuples instead
base_jpgHashes = []
fulltextFilePaths = dict()
jpgFilePaths = dict()
jpgHashes = dict()

printLog("Analysing directories and calculating image hashes...")
for ppn in ppnDirs:
    isBase = ppn == comparison_base_ppn
    if not isBase:
        fulltextFilePaths[ppn] = []
        jpgFilePaths[ppn] = []
        jpgHashes[ppn] = []
    # os.path.join works whether or not basePath ends with a path separator
    for dirpath, dirnames, files in os.walk(os.path.join(basePath, ppn)):
        for name in files:
            filePath = os.path.join(dirpath, name)
            if dirpath.endswith("_FULLTEXT"):
                # if we found a fulltext directory, only add XML files, i.e., the ALTO candidate files
                if name.endswith(".xml") or name.endswith(".XML"):
                    if isBase:
                        base_fulltextFilePaths.append(filePath)
                    else:
                        fulltextFilePaths[ppn].append(filePath)
                    dirsPerPPN[ppn].append(filePath)
            if dirpath.endswith("_TIFF"):
                # if we found an image directory, only add JPEG files
                if name.endswith(".jpg") or name.endswith(".JPG"):
                    # the physical ID is the part of the directory path between "FILE_" and "_TIFF"
                    tokens = dirpath.split("FILE_")
                    physicalID = tokens[1].replace("_TIFF", "")

                    # the context manager closes the image file as soon as the hash is computed;
                    # the result is not stored in a variable called `hash` to keep the builtin usable
                    with Image.open(filePath) as image:
                        pageHash = imagehash.phash(image)
                    if isBase:
                        base_jpgFilePaths.append((physicalID, filePath, pageHash))
                    else:
                        jpgFilePaths[ppn].append((physicalID, filePath, pageHash))
printLog("Done.")

[2021-03-16 14:43:55.510683]	Analysing directories and calculating image hashes...
[2021-03-16 14:43:58.449171]	Done.


In [119]:
# DistanceMetric has been available from sklearn.metrics since scikit-learn 1.0 and the
# old sklearn.neighbors alias was removed in later releases, so try the current location
# first and fall back to the old one for older installations.
try:
    from sklearn.metrics import DistanceMetric
except ImportError:
    from sklearn.neighbors import DistanceMetric
# metric object for the Hamming distance
dist = DistanceMetric.get_metric('hamming')

In [120]:
# Print, for every PPN, the number of entries collected in dirsPerPPN (reported as
# "pages") next to the number of fulltext files, followed by the overall page total.
totalPages = 0
for ppn, collected in dirsPerPPN.items():
    pageCount = len(collected)
    # the comparison base keeps its fulltext files in a separate list
    fulltextList = base_fulltextFilePaths if ppn == comparison_base_ppn else fulltextFilePaths[ppn]
    fulltextCount = len(fulltextList)
    print("%s with %i pages and %i fulltext files." % (ppn, pageCount, fulltextCount))
    totalPages = totalPages + pageCount
print("Total pages: %i" % totalPages)

PPN745219993 with 10 pages and 10 fulltext files.
PPN745232752 with 10 pages and 10 fulltext files.
PPN745219993_copy with 10 pages and 10 fulltext files.
PPN745236499 with 10 pages and 10 fulltext files.
Total pages: 40


In [124]:
# Write one HTML overview per PPN (thumbnails sorted by physical ID) and a
# comparison.html that embeds all overviews as iframes.
iframe_references=[]

# create the overview for the comparison base
# we have to sort the file paths by their physical ID before output
base_sorted_by_physID = sorted(base_jpgFilePaths, key=lambda tup: tup[0])
html="<body style='background-color:grey;'>"
for phys_id,file_path,hash_val in base_sorted_by_physID:
    html+="<img src='"+file_path+"' width='150px' alt='"+phys_id+"'/>\n"
html+="</body>"

# name the file after the comparison base itself instead of whatever value a
# previously executed loop happened to leave behind in `ppn`
base_overview_path=basePath+"base_overview_"+comparison_base_ppn+".html"
with open(base_overview_path, "w") as f:
    f.write(html)

iframe_references.append(base_overview_path)

for ppn in jpgFilePaths:
    # we have to sort the file paths by their physical ID before output
    sorted_by_physID = sorted(jpgFilePaths[ppn], key=lambda tup: tup[0])
    html=""
    for i,(phys_id,file_path,hash_val) in enumerate(sorted_by_physID):
        html+="<img src='"+file_path+"' width='150px' alt='"+phys_id+"'/>\n"
        if i<len(base_sorted_by_physID):
            # difference between the hash of the base page at the same position and this page's hash
            hash_diff=base_sorted_by_physID[i][2]-hash_val
            html+=str(hash_diff)
        else:
            # this PPN has more pages than the comparison base: nothing to compare against
            html+="n/a"

    overview_path=basePath+"overview_"+ppn+".html"
    with open(overview_path, "w") as f:
        f.write(html)

    iframe_references.append(overview_path)

# embed every overview in one page; the src attribute is closed by exactly one quote
html=""
for iframe in iframe_references:
    html+="<iframe src='"+iframe+"' width='100%' height='260px'></iframe> <br />"

with open(basePath+"comparison.html", "w") as f:
    f.write(html)

In [166]:
# For every page of the comparison base, compare its hash with the hashes of the pages of
# other_ppn that lie in a window around the same position, write the result to one HTML
# file per base page and embed all of them as iframes in comparison_matching.html.
iframe_references=[]

# the window surrounding the current page that will be used for comparisons
# the window has to be odd-sized as it consists of the current page plus the same
# number of pages before and after it
comparison_window=5
# only odd windows are allowed, even sizes are enlarged by one
if comparison_window%2==0:
    comparison_window+=1

window_offset=comparison_window//2
max_len=len(base_sorted_by_physID)

# normalisation constant for the hash differences; max. hamming distance is 8*8
max_hamming_distance=64

# the window will be constructed as follows:
#           /-------comparison_window-----\
# [...][lower_bound][ ][current_pos][ ][upper_bound][...]
#
# the lower and upper bound will never be outside the bounds of the other PPN's page list

# alternative candidate: "PPN745232752"; for debugging: "PPN745219993_copy"
other_ppn="PPN745236499"
# entries are (physicalID, file path, hash) tuples, sorted by physical ID
sorted_by_physID = sorted(jpgFilePaths[other_ppn], key=lambda tup: tup[0])
other_len=len(sorted_by_physID)

for current_pos in range(0,max_len):
    # restrict lower/upper bound to the bounds of the list that is actually sliced;
    # if the other PPN has fewer pages than the base, the window may be empty
    lower_bound=max(current_pos-window_offset,0)
    upper_bound=min(current_pos+window_offset,other_len-1)
    window=sorted_by_physID[lower_bound:upper_bound+1]

    base_phys_id,base_path,base_hash=base_sorted_by_physID[current_pos]
    base_phys_id_nr=int(base_phys_id)
    html="<img src='"+base_path+"' width='150px' />"
    html+="<p>"+str(base_phys_id_nr)+"</p><br />"

    for other_phys_id,other_path,other_hash in window:
        # compute the hash difference pair by pair instead of broadcasting over
        # NumPy object arrays, then normalise it
        diff=(base_hash-other_hash)/max_hamming_distance
        phys_id_nr=int(other_phys_id)
        # weight grows logarithmically with the distance between the physical IDs, but is never below 1
        distance_multiplicator=max(math.log(abs(base_phys_id_nr-phys_id_nr)+1),1)
        html+="<img src='"+other_path+"' width='150px' /> "+"δ: "+str(diff*distance_multiplicator)

    outPath=basePath+"diff_current_"+str(current_pos)+".html"
    iframe_references.append(outPath)
    # the output contains a non-ASCII character, so do not depend on the platform's default encoding
    with open(outPath, "w", encoding="utf-8") as f:
        f.write(html)

# embed every per-page result in one page; the src attribute is closed by exactly one quote
html=""
for iframe in iframe_references:
    html+="<iframe src='"+iframe+"' width='100%' height='260px'></iframe> <br />"

with open(basePath+"comparison_matching.html", "w", encoding="utf-8") as f:
    f.write(html)