# Parse a LIDO file and extract its elements

In [133]:
import os
import re
from pprint import pprint
from typing import List, Optional

import bs4  # BeautifulSoup

In [19]:
# Root folder of the data files (a relative path).
DATA_DIR = "../../data/"
# Name of the institution; used below as a path component under "raw/".
institution = "graphik_portal"

In [20]:

# Folder (relative to DATA_DIR) that holds this institution's raw LIDO files.
dir_path = f"raw/{institution}/lido/"
# List the entries of that folder; os.listdir returns them in arbitrary order.
flist = os.listdir(DATA_DIR + dir_path)
# Bare expression so the notebook displays the listing.
flist

['Technisches_Beispiel.xml', 'Albrecht_Duerer.xml']

In [75]:
# Pick the second entry of the directory listing and parse it as XML
# with Beautiful Soup.
fname = flist[1]
fpath = os.path.join(DATA_DIR, dir_path, fname)

# NOTE(review): no explicit encoding is passed to open(), so the platform
# default is used when decoding the file -- confirm it matches the XML.
with open(fpath, 'r') as f:
    # Read the whole file into a single string.
    content = f.read()

bs_content = bs4.BeautifulSoup(content, "xml")

In [93]:
def get_titles(object_identification_wrap: bs4.element.Tag) -> List[str]:
    """
    Collect the titles of an object.

    Looks at every ``lido:titleSet`` element below
    ``object_identification_wrap`` and takes the text of the first
    ``lido:appellationValue`` found in each set. Title sets without an
    appellation value are skipped.

    :param object_identification_wrap: tag to search, e.g. a
        ``lido:objectIdentificationWrap`` element
    :return: the title strings, in the order ``find_all`` yields the title
        sets (empty list if none are found)
    """
    titles = []
    for title_set in object_identification_wrap.find_all("lido:titleSet"):
        title = title_set.find("lido:appellationValue")
        # find() returns None when the set has no appellation value.
        if title is not None:
            titles.append(title.text)

    return titles

In [94]:
# Collect every lido:objectIdentificationWrap element of the parsed document
# and display how many were found.
object_identification_wraps = bs_content.find_all("lido:objectIdentificationWrap")
len(object_identification_wraps)

1

In [100]:
# All tags whose name matches the pattern below.
lido_tags = bs_content.find_all(re.compile("(lido).+"))

# Distinct values of the "lido:type" attribute, stored as dict keys (in order
# of first appearance) with empty-string placeholders as values.
lido_types = dict.fromkeys(
    (tag.attrs["lido:type"] for tag in lido_tags if "lido:type" in tag.attrs),
    "",
)


In [107]:
# Maps the type URIs seen in "lido:type" attributes to readable labels.
lido_id_to_vocab = {
    # Getty AAT identifiers
    'http://vocab.getty.edu/aat/300417206': 'published_titles',
    'http://vocab.getty.edu/aat/300417214': 'series_titles',
    # LIDO terminology identifiers
    'http://terminology.lido-schema.org/lido00451': 'Preview image (en)',
    'http://terminology.lido-schema.org/lido00464': 'Provided image',
}

In [112]:
# Wrapper element whose resource representations are collected.
# Values tried earlier for this lookup: "lido:descriptiveMetadata",
# "lido:objectIdentificationWrap".
el = "lido:administrativeMetadata"

# find() returns None when the wrapper is absent; treat that as "no
# resources" instead of failing with an AttributeError.
container = bs_content.find(el)
if container is not None:
    resource_reps = container.find_all("lido:resourceRepresentation")
else:
    resource_reps = []
resources = []

for resource_rep in resource_reps:
    # 'lido_type' is always set (None when the attribute is missing);
    # 'url' is only added when a lido:linkResource child exists.
    resource_dict = {}

    url = resource_rep.find("lido:linkResource")
    if url is not None:
        resource_dict['url'] = url.text

    resource_dict['lido_type'] = resource_rep.attrs.get('lido:type')
    resources.append(resource_dict)

# Bare expression so the notebook displays the collected resources.
resources

[{'url': 'http://www.bildindex.de/bilder/m/mi11813c13',
  'lido_type': 'http://terminology.lido-schema.org/lido00451'},
 {'url': 'http://www.bildindex.de/bilder/d/mi11813c13',
  'lido_type': 'http://terminology.lido-schema.org/lido00464'}]

In [129]:
def find_lido_element_text(lido: bs4.element.Tag,
                           element_name: str) -> Optional[str]:
    """
    Return the text of the first ``element_name`` element found below ``lido``.

    A missing element is an expected outcome and is reported by returning
    ``None``; it is not printed. Any other error is allowed to propagate
    so that real problems are not hidden.

    :param lido: tag to search in (``None`` is tolerated and yields ``None``)
    :param element_name: name of the element to look for,
        e.g. ``"lido:objectPublishedID"``
    :return: the element's text, or ``None`` if no such element exists
    """
    if lido is None:
        return None

    # find() returns None when nothing matches.
    element = lido.find(element_name)
    if element is None:
        return None

    return element.text
    

In [130]:
def get_lido_id(lido: bs4.element.Tag) -> Optional[str]:
    """
    Return the text of the record's ``lido:lidoRecID`` element.

    Example of the element being read:
    <lido:lidoRecID lido:type="http://terminology.lido-schema.org/lido00100" lido:source="http://ld.zdb-services.de/resource/organisations/DE-MUS-079214">DE-MUS-079214/lido/05091715,T,001</lido:lidoRecID>

    :param lido: tag to search in, e.g. a ``lido:lido`` record
    :return: the record ID text, or ``None`` if it could not be read
    """
    # Same lookup as every other single-element read, so use the shared helper.
    return find_lido_element_text(lido, "lido:lidoRecID")

In [141]:

# Every lido:lido element of the document; each one appears to be the main
# element that carries all the metadata of one object.
lidos = bs_content.find_all('lido:lido')

for lido in lidos:

    # Fields extracted so far: the published object ID and the LIDO record ID.
    lido_dict = {
        'object_published_id': find_lido_element_text(lido, "lido:objectPublishedID"),
        'lido_id': get_lido_id(lido),
    }

    # Still to be extracted:
    #   - image_url
    #   - detail_url
    #   - host_isil
    #   - creditline
    #   - title
    #   - artist
    #   - material_technique
    #   - earliestDate
    #   - latestDate
    #   - classification

    # add record to list / write to db
    pprint(lido_dict, indent=2)

{ 'lido_id': 'DE-MUS-079214/lido/05091715,T,001',
  'object_published_id': 'DE-MUS-079214/object/05091715,T,001'}
