In [1]:
# imports necessary libraries
import xml.etree.ElementTree as ET
import yaml
import os
from datetime import datetime

# Defining the structure of the script

In this section, we define all the classes and functions which we will later need to process the data.

First, we create classes. In a nutshell, a class is a blueprint for creating objects that have certain common characteristics and behavior. It allows related data (attributes) and behavior (methods) to be grouped together.



In [None]:
# creates a class to describe linked pairs of local ids and their corresponding authority ids

class LinkedPair:
    """A single link between a project-local id and an authority-file id.

    :param local_id: identifier of the entity inside the project.
    :param authority_id: identifier of the same entity in the authority file.
    :param authority_label: name of the authority file the id belongs to.
    """

    def __init__(self, local_id, authority_id, authority_label):
        self.authority_label = authority_label
        self.authority_id = authority_id
        self.local_id = local_id

    def paired_entities(self):
        """Return the BEACON-like statement "<authority id>||<local id>"."""
        return "{}||{}".format(self.authority_id, self.local_id)

    def show_pair(self):
        """Print the BEACON-like statement of this pair."""
        print(self.paired_entities())


# creates a class to describe the collection of linked pairs


class LinkedCollection():
    """Collection of linked pairs for one register, grouped by authority type.

    :param register_data: settings of the register; ``register_data['target']``
        is written to the ``#TARGET`` header line of every BEACON file.
    :param register_type: name of the register (used in the output filename).
    :param authority_data: mapping of authority type -> base URL (prefix) of
        that authority file.

    ``self.data`` maps each authority type to the list of pairs collected
    for it.
    """

    # rdfs:seeAlso lives in the http:// RDF Schema namespace; the https://
    # spelling is a different URI and is not the seeAlso property.
    RELATION = "http://www.w3.org/2000/01/rdf-schema#seeAlso"

    def __init__(self, register_data, register_type, authority_data):
        self.register_data = register_data
        self.register_type = register_type
        self.authority_data = authority_data
        self.data = {}

    def add_linkedpair(self, local_id, authority_id, authority_type):
        """Add one (local id, authority id) pair to the collection.

        The authority id may be given as a full URI or as the bare id; the
        base URL configured for ``authority_type`` is removed if present.
        Surrounding whitespace is stripped, and empty/missing ids are skipped
        with a printed note instead of raising.

        :param local_id: project id of the entity.
        :param authority_id: authority id or full authority URI (may be None).
        :param authority_type: key of ``authority_data`` the id belongs to.
        :raises KeyError: if ``authority_type`` is not in ``authority_data``.
        """
        prefix = self.authority_data[authority_type]

        # an empty <idno/> has text None; pretty-printed XML may add whitespace
        authority_id = (authority_id or "").strip()
        if not authority_id:
            print(f'Skipping "{local_id}": empty "{authority_type}" identifier')
            return

        if authority_id.startswith(prefix):
            authority_id = authority_id[len(prefix):]

        # adds the (now clean) pair to the collection
        new_pair = LinkedPair(local_id, authority_id, authority_type)
        self.data.setdefault(authority_type, []).append(new_pair)

    def show_pairs(self):
        """Print every collected pair, indented by two spaces."""
        for pairs in self.data.values():
            for pair in pairs:
                print(f"  {pair.paired_entities()}")

    def show_authority_types(self):
        """Print the list of authority types that have at least one pair."""
        print(list(self.data.keys()))

    @staticmethod
    def _meta_line(field, value):
        """Return the header line for an optional field, or '' if it is unset."""
        if value is None or value == '':
            return ''
        return f'#{field}: {value}\n'

    def write_BEACON(self, header_data=None):
        """Write one BEACON file per authority type into the output folder.

        Files are named ``output/BEACON_<register_type>_<authority type>.txt``.

        :param header_data: optional mapping with the optional header values
            ``name``, ``contact`` and ``message``. When omitted, a module-level
            ``header_data`` is used if one exists (this keeps existing
            ``write_BEACON()`` calls working); missing or empty values simply
            leave the corresponding header line out.
        """
        if header_data is None:
            header_data = globals().get("header_data")
        header_data = header_data or {}

        # optional header lines are the same for every file
        name = self._meta_line("NAME", header_data.get("name"))
        contact = self._meta_line("CONTACT", header_data.get("contact"))
        message = self._meta_line("MESSAGE", header_data.get("message"))

        # iterates through all linked pairs per type of authority data
        for authority_label, pairs in self.data.items():
            filename = f"output/BEACON_{self.register_type}_{authority_label}.txt"
            # make sure the target folder exists even if no earlier cell created it
            os.makedirs(os.path.dirname(filename), exist_ok=True)

            target = self.register_data['target']
            prefix = self.authority_data[authority_label]
            timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

            header = (
                "#FORMAT: BEACON\n"
                f"{name}"
                f"#PREFIX: {prefix}\n"
                f"#TARGET: {target}\n"
                f"{contact}{message}"
                f"#RELATION: {self.RELATION}\n"
                f"#TIMESTAMP:{timestamp}\n"
                "\n"
            )

            # writes the file and saves it
            with open(filename, "w", encoding="utf-8") as f:
                f.write(header)
                f.writelines(f"{pair.paired_entities()}\n" for pair in pairs)
            print(f"{filename} written successfully.")

            

Then we define some functions. A function in Python is a reusable block of code that performs a specific task. When a task is needed more than once in a program, it makes sense to create a function instead of repeating the same lines several times.

Here we define the functions:

## save_log()
This function saves a log of entities missing authority data.

## iter_items()
This function crawls through the TEI-XML and collects the values of the 'xml:id' attribute (which contains the project id) and of the idno elements (their type and value), while keeping track of which values belong together. Since this has to be repeated many times, we define it as a function.

In [3]:
def save_log(register_type,missing_authority):
    """ Creates a log of all items with no authority file associated to them.

    The log is written to ``output/noids/<register_type>_noids.txt``; the
    folder is created if it does not exist yet. One item is written per line,
    or the word "None" if the list is empty.

    :param register_type: type of register being processed, ex. Names.
    :param missing_authority: list of entities with missing authority files.
    """
    log_path = f"output/noids/{register_type}_noids.txt"
    # do not depend on an earlier cell having created the folder
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    with open(log_path, "w", encoding="utf-8") as f:
        f.write("items with no associated authority file:\n")
        if missing_authority:
            f.writelines(f"{item},\n" for item in missing_authority)
        else:
            f.write("None")
    print("list of missing authority files written successfully.")


def iter_items(tei_list):
    """ Iterates over all entities in a given tei:list

    For every child element the project id (``xml:id``) is read and each of
    its ``tei:idno`` children is added to the current collection. Entities
    without any ``tei:idno`` are appended to ``missing_authority``.

    The function takes only the list as argument; everything else is read
    from module-level names that must be set before the call: ``ns``,
    ``authority_data``, ``multiple_auth_types``, ``collection`` and
    ``missing_authority``.

    :param tei_list: tei list element that contains all entities of a register
        (``None`` is accepted and reported instead of raising).
    """
    if tei_list is None:
        # root.find() returns None when the configured list is not in the file
        print("No register list found; nothing to iterate over.")
        return

    xml_id = f"{{{ns['xml']}}}id"

    for child in tei_list:
        project_id = child.get(xml_id)
        if project_id is None:
            # e.g. a <head> inside the list: not an entity, nothing to link
            print(f'Skipping <{child.tag}> without xml:id')
            continue

        idnos = child.findall("tei:idno", ns)
        if not idnos:
            missing_authority.append(project_id)
            continue

        for idno in idnos:
            if multiple_auth_types:
                # a missing @type yields None and is reported just below
                authority_type = idno.get("type")
            else:
                # only one authority file configured: every idno belongs to it
                authority_type = next(iter(authority_data))

            if authority_type not in authority_data:
                print(f'Some authority types might be wrong: check "{project_id}": "{authority_type}"')
                continue

            if idno.text is None or not idno.text.strip():
                print(f'Empty "{authority_type}" idno for "{project_id}"; skipped.')
                continue

            collection.add_linkedpair(project_id, idno.text, authority_type)


# Reading the data

In the next code block, we read config.yaml and check that the .xml file containing the indices can be found. We also create an output folder to store our results.

In [4]:
# check if the output folders exist and create them if they do not
# ("output/noids" is nested, so this also creates "output")
os.makedirs("output/noids", exist_ok=True)


# read settings from the config.yaml
# (explicit encoding so the file is read the same way on every platform)
with open("config.yaml", "r", encoding="utf-8") as file:
    cfg = yaml.safe_load(file)


# read header data settings; the section holds optional values only,
# so a missing or empty section becomes an empty mapping
header_data = cfg.get('header_data') or {}


# define and validate config data
idx_path = cfg['file_location']  # path of the xml file
if not os.path.isfile(idx_path):
    # raise instead of calling exit(): in a notebook exit() ends the kernel
    # session, whereas an exception stops the run and shows what went wrong
    raise FileNotFoundError(f"No .xml file located at {idx_path!r}.")

print(f"{idx_path} file successfully located.")

Data/indices.xml file successfully located.


# Parsing, Extracting and Saving

In this last code-block, we parse our indices.xml file and process it, looking for the patterns we identified in order to save the data we want.
For the processing, formatting and saving of the output data, we use the previously defined classes and functions.

In [5]:
# parse the .xml file
print("parsing...")
tree = ET.parse(idx_path)
root = tree.getroot()

# read namespaces from the config file
ns = cfg['namespaces']

# check if there are more than one authority file type
authority_data = cfg['authority_files']
multiple_auth_types = len(authority_data) > 1

# go over each of the register/index types we want;
# `register` says which element and attributes to look for, e.g. "listPerson"
for register_type, register in cfg['register_types'].items():

    # create an empty container to store all project ids without any authority data
    # (iter_items() appends to this module-level list)
    missing_authority = []

    print(f'extracting info about "{register_type}"')

    # start to collect the data (iter_items() fills this module-level collection)
    collection = LinkedCollection(register, register_type, authority_data)

    # build the path to the element that holds all entities
    xpath = f".//tei:{register['element']}"

    # narrow it down if a specific attribute value is configured; a YAML null,
    # an empty value and the string 'None' all mean "no attribute filter"
    attribute_type = register.get('attribute_type')
    if attribute_type not in (None, '', 'None'):
        xpath += f"[@{attribute_type}='{register['attribute_value']}']"

    tei_list = root.find(xpath, ns)
    if tei_list is None:
        # nothing to process: skip instead of writing a misleading empty log
        print(f'No element matching "{xpath}" found for "{register_type}"; skipping.')
        continue

    # runs the function that iterates over each entity
    iter_items(tei_list)

    # collection.show_pairs()  # uncomment to inspect the collected pairs

    # after collecting all data, we can save the BEACON file
    collection.write_BEACON()

    # and the log of missing authority data
    save_log(register_type, missing_authority)
    

            
            


parsing...
extracting info about "names"
output/BEACON_names_viaf.txt written successfully.
output/BEACON_names_gnd.txt written successfully.
output/BEACON_names_wikidata.txt written successfully.
list of misisng authority files written successfully.
extracting info about "periodicals"
Some authority types might be wrong: check "J2": "bnd"
Some authority types might be wrong: check "J5": "bnd"
Some authority types might be wrong: check "J7": "bnd"
Some authority types might be wrong: check "J11": "bnd"
Some authority types might be wrong: check "J12": "bnd"
Some authority types might be wrong: check "J24": "bnd"
Some authority types might be wrong: check "J29": "bnd"
Some authority types might be wrong: check "J37": "bnd"
Some authority types might be wrong: check "J56": "bnd"
output/BEACON_periodicals_wikidata.txt written successfully.
output/BEACON_periodicals_viaf.txt written successfully.
output/BEACON_periodicals_gnd.txt written successfully.
list of misisng authority files writte