In [25]:
%matplotlib inline

import os
import sys
from datetime import datetime

# image hash dependencies
from PIL import Image
import imagehash

import numpy as np
from scipy.spatial.distance import hamming
from jellyfish import jaro_distance
import scipy.cluster.hierarchy as scipycluster

import matplotlib.pyplot as plt

comparison_base_ppn="PPN745219993"
basePath="/Users/david/src/python/StabiHacks/sbbget/sbbget_downloads.leske_mini/download_temp/"
#basePath="/Users/david/src/python/StabiHacks/sbbget/sbbget_downloads.leske/download_temp/"

In [33]:
def printLog(text):
    now = str(datetime.now())
    print("[" + now + "]\t" + text)
    # forces to output the result of the print command immediately, see: http://stackoverflow.com/questions/230751/how-to-flush-output-of-python-print
    sys.stdout.flush()

In [34]:
# check all subdirectories startings with PPN as each PPN stands for a different medium
dirsPerPPN = dict()
ppnDirs=[]
for x in os.listdir(basePath):
    if x.startswith("PPN"):
        dirsPerPPN[x]=[]
        ppnDirs.append(x)

In [45]:
# browse all directories below sbbGetBasePath and search for *_FULLTEXT and *_TIFF directories
# and associate each with its PPN

# each dictionary will use a PPN as key and an array of file paths (or the like) as value
# base_ denote the values of comparison_base_ppn
base_fulltextFilePaths = []
base_jpgFilePaths = []
base_jpgHashes=[]
fulltextFilePaths = dict()
jpgFilePaths = dict()
jpgHashes=dict()

printLog("Analysing directories and calculating image hashes...")
for ppn in ppnDirs:
    if ppn==comparison_base_ppn:
        pass
    else:
        fulltextFilePaths[ppn] = []
        jpgFilePaths[ppn] = []
        jpgHashes[ppn]=[]
    for dirpath, dirnames, files in os.walk(basePath+ppn):
        for name in files:
            if dirpath.endswith("_FULLTEXT"):
                # if we found a fulltext directory, only add XML files, i.e., the ALTO candidate files
                if name.endswith(".xml") or name.endswith(".XML"):
                    if ppn==comparison_base_ppn:
                        base_fulltextFilePaths.append(os.path.join(dirpath, name))
                    else:
                        fulltextFilePaths[ppn].append(os.path.join(dirpath, name))
                    dirsPerPPN[ppn].append(os.path.join(dirpath, name))
            if dirpath.endswith("_TIFF"):
                # if we found a image directory, only add JPEG files,
                if name.endswith(".jpg") or name.endswith(".JPG"):
                    tokens=dirpath.split("FILE_")
                    physicalID=tokens[1].replace("_TIFF","")

                    hash=imagehash.phash(Image.open(os.path.join(dirpath, name)))
                    if ppn==comparison_base_ppn:
                        base_jpgFilePaths.append((physicalID,os.path.join(dirpath, name),hash))
                    else:
                        jpgFilePaths[ppn].append((physicalID,os.path.join(dirpath, name),hash))
printLog("Done.")                

[2021-03-14 19:48:02.514162]	Analysing directories and calculating image hashes...
[2021-03-14 19:48:04.674189]	Done.


In [46]:
from sklearn.neighbors import DistanceMetric
dist = DistanceMetric.get_metric('hamming')

In [48]:
totalPages=0
for ppn in dirsPerPPN:
    l=len(dirsPerPPN[ppn])
    l_fulltext=0
    if ppn==comparison_base_ppn:
        l_fulltext=len(base_fulltextFilePaths)
    else:
        l_fulltext=len(fulltextFilePaths[ppn])
    print("%s with %i pages and %i fulltext files."%(ppn,l,l_fulltext))
    totalPages+=l
print("Total pages: %i"%totalPages)

PPN745219993 with 20 pages and 10 fulltext files.
PPN745232752 with 20 pages and 10 fulltext files.
PPN745236499 with 20 pages and 10 fulltext files.
Total pages: 60


In [64]:
iframe_references=[]
# create the overview for the comparison base
# we have to sort the file path by their physical ID before output
base_sorted_by_physID = sorted(base_jpgFilePaths, key=lambda tup: tup[0])
html="<body style='background-color:grey;'>"
for phys_id,file_path,hash_val in base_sorted_by_physID:
    html+="<img src='"+file_path+"' width='150px' alt='"+phys_id+"'/>\n"
html+="</body>"

f = open(basePath+"base_overview_"+ppn+".html", "w")
f.write(html)
f.close()

iframe_references.append(basePath+"base_overview_"+ppn+".html")

for ppn in jpgFilePaths:
    # we have to sort the file path by their physical ID before output
    sorted_by_physID = sorted(jpgFilePaths[ppn], key=lambda tup: tup[0])
    html=""
    for i,v in enumerate(sorted_by_physID):
        phys_id,file_path,hash_val=v
        html+="<img src='"+file_path+"' width='150px' alt='"+phys_id+"'/>\n"
        hash_diff=base_sorted_by_physID[i][2]-hash_val
        html+=str(hash_diff)

    f = open(basePath+"overview_"+ppn+".html", "w")
    f.write(html)
    f.close()
    
    iframe_references.append(basePath+"overview_"+ppn+".html")

html=""
for iframe in iframe_references: 
    html+="<iframe src='"+iframe+"'' width='100%' height='260px'></iframe> <br />"

f = open(basePath+"comparison.html", "w")
f.write(html)
f.close()

https://pypi.org/project/ImageHash/

https://stackoverflow.com/questions/998662/what-is-image-hashing-used-for (erkl√§rt die Verfahren)

M.Sc. thesis zum p-hash http://www.phash.org/docs/pubs/thesis_zauner.pdf

noch mehr zu den Distanzen:

https://tech.okcupid.com/evaluating-perceptual-image-hashes-okcupid/
https://content-blockchain.org/research/testing-different-image-hash-functions/
