In [10]:
import hashlib
import os
from difflib import SequenceMatcher

from PIL import Image, ImageChops
from pdf2image import convert_from_path

In [25]:
# Step 1: Simple comparison of Hashes

In [11]:
# Compare Hashes
def _sha1_of_file(path, chunk_size=65536):
    """Return the SHA-1 hex digest of the file at ``path``.

    The file is read in binary mode, ``chunk_size`` bytes at a time, so the
    whole file never has to be held in memory.
    """
    digest = hashlib.sha1()
    with open(path, "rb") as stream:
        # iter(callable, sentinel) keeps calling read() until it returns b''
        # (end of file), so no placeholder value is needed before the loop.
        for chunk in iter(lambda: stream.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def hash_file(fileName1, fileName2):
    """Compute the SHA-1 digests of two files.

    Parameters
    ----------
    fileName1, fileName2 : str or path-like
        Paths of the files to hash. Each file is opened in binary mode.

    Returns
    -------
    tuple of (str, str)
        The hexadecimal SHA-1 digests (160 bits, i.e. 40 hex characters each)
        of ``fileName1`` and ``fileName2``, in that order.

    Raises
    ------
    OSError
        If either file cannot be opened or read.
    """
    return _sha1_of_file(fileName1), _sha1_of_file(fileName2)



In [12]:
def comparePDFhashes(file1, file2):
    """Print whether two files have the same digest.

    Both digests come from a single ``hash_file(file1, file2)`` call; the
    function only prints a verdict and returns ``None``.
    """
    digest_first, digest_second = hash_file(file1, file2)
    files_match = digest_first == digest_second

    if files_match:
        print("These files are identical")
    else:
        print("These files are not identical")

In [13]:
# Run the hash check on two resume PDFs. Both paths are relative, so the files
# are looked up in the notebook's current working directory.
comparePDFhashes('DSriVallabha-Resume.pdf', 'DSVResume-OverLeaf.pdf')

These files are not identical


In [26]:
# Step 2: Convert file to image and compare differences

In [14]:
def PDF2Image(filename):
    """Render each page of a PDF to a JPEG file and return the file names.

    Pages are written to the current working directory as
    ``<stem>_pageNN.jpg``, where ``<stem>`` is the PDF's file name without its
    directory and final extension, and ``NN`` is the zero-based page index
    padded to two digits.

    Parameters
    ----------
    filename : str
        Path of the PDF to convert.

    Returns
    -------
    list of str
        The names of the saved page images, in page order.
    """
    # basename/splitext use the platform's path separator and strip only the
    # final extension, so "docs/cv.v2.pdf" yields "cv.v2" rather than "cv".
    lastname = os.path.splitext(os.path.basename(filename))[0]
    print(filename, lastname)

    pglist = []
    # convert_from_path returns one image object per PDF page.
    for page_index, page in enumerate(convert_from_path(filename)):
        pgname = lastname + '_page' + str(page_index).zfill(2) + '.jpg'
        page.save(pgname, 'JPEG')
        pglist.append(pgname)
    return pglist

In [17]:
def diffbetweenimages(il1, il2):
    """Save a per-page difference image for two lists of image files.

    The lists are walked in parallel; for each pair the pixel-wise difference
    from ``ImageChops.difference`` is written to the current working directory
    as ``diff_page_<n>.jpg``, with ``n`` starting at 1.

    Parameters
    ----------
    il1, il2 : list of str
        Paths of the images to compare; entry ``i`` of ``il1`` is compared
        with entry ``i`` of ``il2``.

    Raises
    ------
    AssertionError
        If the two lists have different lengths.
    """
    assert len(il1) == len(il2), 'both lists are not of equal length'

    for count, (x, y) in enumerate(zip(il1, il2), start=1):
        # Image.open is lazy and leaves the file open; the context managers
        # release both handles even if the comparison or the save raises.
        with Image.open(x) as img1, Image.open(y) as img2:
            diff = ImageChops.difference(img1, img2)
            diff.save('diff_page_' + str(count) + '.jpg')

In [27]:
# Step 3: Put the two together
# The function below compares the file hashes, then renders each PDF page to a JPEG and saves per-page difference images to disk

In [28]:
def compareTwoPDFs(file1, file2):
    """Compare two PDFs by hash and then by rendered page images.

    Prints the two file names, runs ``comparePDFhashes`` on them, converts
    each PDF to page images with ``PDF2Image`` and hands both page lists to
    ``diffbetweenimages``. Returns ``None``.
    """
    print('File names are {}, {}'.format(file1, file2))

    # Stage 1: byte-level check via hashes.
    print('Checking hash sums of the two files')
    comparePDFhashes(file1, file2)

    # Stage 2: render both documents (first file, then second), then diff them page by page.
    pages_first = PDF2Image(file1)
    pages_second = PDF2Image(file2)
    diffbetweenimages(pages_first, pages_second)

In [23]:
# Full comparison run on two resume PDFs (paths relative to the working directory).
# NOTE(review): the saved output under this cell lists different file names
# (DSVresume.pdf, DSVResume-OverLeaf.pdf) than the arguments passed here, so it
# appears to come from an earlier run; re-execute the cell to refresh it.
compareTwoPDFs('DSVResume1.pdf', 'DSVResume2.pdf')

File names are DSVresume.pdf, DSVResume-OverLeaf.pdf
Checking hash sums of the two files
These files are not identical
DSVresume.pdf DSVresume
DSVResume-OverLeaf.pdf DSVResume-OverLeaf
