In [302]:
##################################################
## Script to manually generate checksum for bags
## and check if internal checksum matches
##################################################
## GNU General Public License, version 2
## Author: UTSC DSU 
##################################################

In [2]:
import xml.etree.ElementTree as ET
import pandas as pd
import configparser
import os
import zipfile
import hashlib

In [41]:
def find_latest_file_version(zip, datastream_ids_to_check):
    """Resolve each requested datastream id to its highest-versioned file.

    Archive member names are split on '.'; the first component is treated
    as the datastream id and the second as the version number
    (e.g. 'MODS.5.xml' -> id 'MODS', version 5).

    Parameters
    ----------
    zip : object exposing ``namelist()`` (e.g. ``zipfile.ZipFile``)
        Archive whose member names are searched.
    datastream_ids_to_check : iterable of str
        Datastream ids (e.g. 'MODS') or exact file names (e.g. 'MODS.3.xml').

    Returns
    -------
    list of str
        One entry per requested id, in the same order: the name of the
        highest-versioned matching file, or the id itself, unchanged, when
        no member matches (this lets callers pass an exact file name
        through).
    """
    all_files = zip.namelist()
    latest_files = []

    for datastream_id in datastream_ids_to_check:
        latest_file = ''
        latest_ver = None

        for name in all_files:
            file_parts = name.split('.')
            if file_parts[0] != datastream_id:
                continue

            # Compare versions numerically so that 10 ranks above 9; a
            # missing or non-numeric version ranks below every real one
            # but still lets the file be picked if it is the only match.
            try:
                version = int(file_parts[1])
            except (IndexError, ValueError):
                version = -1

            if latest_ver is None or version > latest_ver:
                latest_file = name
                latest_ver = version

        # if a file exists add it, otherwise pass the id through untouched
        latest_files.append(latest_file if latest_file != '' else datastream_id)

    return latest_files

In [43]:
#Manually generate checksums for datastream_ids_to_check files 
#within the initial_zip data folder 
#(gets latest version of file or just the file you input)
def manually_generate_checksums(initial_zip, datastream_ids_to_check):
    """Compute MD5 checksums for datastream files stored inside a bag zip.

    The bag zip is expected to contain
    ``<bag folder>/data/<bag name>_foxml_atomzip.zip``, where ``<bag folder>``
    is the zip's file name without its extension and ``<bag name>`` is that
    folder name with its first four characters removed
    (presumably the 'Bag-' prefix -- verify against your bag naming).

    Parameters
    ----------
    initial_zip : str
        Path to the outer bag zip file (may include directories).
    datastream_ids_to_check : iterable of str
        Datastream ids (latest version is used) or exact file names.

    Returns
    -------
    dict
        Maps the base name of every file that was found to its MD5 hex
        digest. Files that do not exist in the inner zip are reported on
        stdout and skipped.

    Raises
    ------
    KeyError
        If the inner ``*_foxml_atomzip.zip`` is not present in the bag.
    """
    # Paths inside the archive are relative to the bag folder, so any
    # directory part of the path on disk must not leak into them.
    bag_root = os.path.splitext(os.path.basename(initial_zip))[0]
    bag_location = bag_root + '/data/'
    bag_name = bag_root[4:]

    #the location for the foxml files
    bag_zip_location = bag_location + bag_name + '_foxml_atomzip.zip'

    #store the files and their checksums in a dict
    manual_checksum_list = {}

    # Context managers guarantee both archives (and the stream connecting
    # them) are closed even if hashing fails part-way through.
    with zipfile.ZipFile(initial_zip) as init_zip, \
            init_zip.open(bag_zip_location) as inner_stream, \
            zipfile.ZipFile(inner_stream) as foxml_zip:
        print("Bag zip location is " + bag_zip_location)

        #list of files to check; swap for foxml_zip.namelist()
        #to look through all files within the foxml folder
        filenames = find_latest_file_version(foxml_zip, datastream_ids_to_check)

        for filename in filenames:
            filename = filename.strip()
            try:
                member = foxml_zip.open(filename)
            except KeyError:
                # ZipFile.open raises KeyError for a name not in the archive;
                # any other error is a real problem and is allowed to surface.
                print('File named \'' + filename + '\' does not exist (ignoring it)')
                continue

            #generate md5 checksum for the file, reading in chunks so
            #large datastreams are never loaded into memory at once
            hash_md5 = hashlib.md5()
            with member:
                for chunk in iter(lambda: member.read(4096), b""):
                    hash_md5.update(chunk)
            generated_checksum = hash_md5.hexdigest()

            print('Generated manual checksum for \'' + filename + '\'')
            manual_checksum_list[os.path.basename(filename)] = generated_checksum

    print('Finished manually generating checksum for all files within data folder')
    return manual_checksum_list
    
# Example run: 'obj1' has no match, 'MODS' resolves to its latest version,
# and 'MODS.3.xml' is looked up as an exact file name.
manual_checksum_list = manually_generate_checksums(
    initial_zip='Bag-utsc_2503.zip',
    datastream_ids_to_check=['obj1', 'MODS', 'MODS.3.xml'],
)
#print(manual_checksum_list)

Bag zip location is Bag-utsc_2503/data/utsc_2503_foxml_atomzip.zip
File named 'obj1' does not exist (ignoring it)
Generated manual checksum for 'MODS.5.xml'
Generated manual checksum for 'MODS.3.xml'
Finished manually generating checksum for all files within data folder


In [44]:
#Get the internal checksums stored in the foxml.xml file and compare them
#against the manually generated checksums (calls the manually_generate_checksums function)
def datastream_checksum_validator(initial_zip_loc, datastream_ids_to_check):
    """Check manually generated checksums against those stored in foxml.xml.

    Checksums are generated with ``manually_generate_checksums`` and then
    compared with the ``DIGEST`` attribute of the ``contentDigest`` element
    found under the ``datastreamVersion`` element whose ``ID`` equals the
    file name without its extension (e.g. 'MODS.5.xml' -> 'MODS.5').

    Parameters
    ----------
    initial_zip_loc : str
        Path to the outer bag zip file (may include directories).
    datastream_ids_to_check : iterable of str
        Datastream ids (latest version is used) or exact file names.

    Returns
    -------
    bool
        False as soon as one stored checksum differs from the generated one
        (remaining files are not checked); True otherwise. Files without a
        stored checksum are reported as skipped and do not affect the result.
    """
    foxml_ns = '{info:fedora/fedora-system:def/foxml#}'

    # Paths inside the archive are relative to the bag folder, so any
    # directory part of the path on disk must not leak into them.
    bag_root = os.path.splitext(os.path.basename(initial_zip_loc))[0]
    bag_xmldata_path = bag_root + '/data/' + 'foxml.xml'

    #call to generate checksums from files in datastream_ids_to_check
    manual_checksum_list = manually_generate_checksums(initial_zip_loc, datastream_ids_to_check)

    # The tree is fully parsed inside the with-block, so the archive can be
    # closed before the comparison starts.
    with zipfile.ZipFile(initial_zip_loc) as init_zip:
        with init_zip.open(bag_xmldata_path) as foxml_file:
            root = ET.parse(foxml_file).getroot()

    #every "datastreamVersion" tag in the xml file (its ID names a file version)
    datastreamVersion_list = root.findall(
        './/' + foxml_ns + 'datastream/.//' + foxml_ns + 'datastreamVersion')

    print('Comparing with internal checksum stored at: ' + bag_xmldata_path)

    for filename, generated_checksum in manual_checksum_list.items():
        print('\n')
        datastream_id = os.path.splitext(filename)[0]
        found_internal_checksum = False

        for c_tag in datastreamVersion_list:
            if c_tag.get('ID') != datastream_id:
                continue

            #contentDigest holds the internally stored checksum, if any
            # NOTE(review): assumes the stored digest is MD5 -- the TYPE
            # attribute is not inspected; confirm for your repository.
            digest_tag = c_tag.find('.//' + foxml_ns + 'contentDigest')
            if digest_tag is None:
                continue

            found_internal_checksum = True
            internal_checksum = digest_tag.get('DIGEST')

            if internal_checksum == generated_checksum:
                print("PASS: File " + c_tag.get('ID') + " with internal checksum \'"
                      + internal_checksum + "\' matches manual checksum")
            else:
                #checksums dont match: show both side by side and stop
                print("FAIL: File " + c_tag.get('ID') +
                      " has a conflicting generated and internal checksum as follows:")
                data = {'internally stored checksum': [internal_checksum],
                        'manually generated checksum': [generated_checksum]}
                df = pd.DataFrame(data)
                print(df)
                return False

        if not found_internal_checksum:
            # Make it visible that nothing was compared, rather than letting
            # the file pass silently.
            print("SKIP: File " + datastream_id +
                  " has no internally stored checksum to compare against")

    return True

# Example run; left as a bare expression so the notebook shows the result.
datastream_checksum_validator(
    initial_zip_loc='Bag-utsc_2503.zip',
    datastream_ids_to_check=['MODS', 'MODS.3.xml', 'OBJ'],
)

Bag zip location is Bag-utsc_2503/data/utsc_2503_foxml_atomzip.zip
Generated manual checksum for 'MODS.5.xml'
Generated manual checksum for 'MODS.3.xml'
Generated manual checksum for 'OBJ.0.tif'
Finished manually generating checksum for all files within data folder
Comparing with internal checksum stored at: Bag-utsc_2503/data/foxml.xml


PASS: File MODS.5 with internal checksum 'b0b35a2844e06229ff07313795023f77' matches manual checksum


PASS: File MODS.3 with internal checksum 'b0b35a2844e06229ff07313795023f77' matches manual checksum


PASS: File OBJ.0 with internal checksum 'ac869cd3959e69cddf397f58900bcbf4' matches manual checksum


True