# parse lido file and extract elements

In [63]:
import bs4 #beautifulSoup
import os
import re
from pprint import pprint

```
recordID
```
<hr/>

```
<lido:recordID lido:type=http://terminology.lido-schema.org/lido00100 lido:source=https://culture.ld.admin.ch/isil/CH-000511-9>21566</lido:recordID>
```


	imageURL	invNr	person	date	title	classification	matTec	institutionIsil	recordURL	imageLicence	timestamp

# Mapping Lido to imageSearch database fields

We need to map the LIDO elements to the fields in our database.  

Our database metadata fields are:

```
id | record_id | created_date | title | image_url | record_url | inventory_number | person | date | classification | material_technique | institution_isil | image_licence | year_min | year_max | classification_id_id | institution_isil_id_id
```

These correspond to the following LIDO elements:


`record_id`

```html
<lido:recordID lido:type=http://terminology.lido-schema.org/lido00100 lido:source=https://culture.ld.admin.ch/isil/CH-000511-9>21566</lido:recordID>
```


`title`

```html
<lido:titleWrap>
    <lido:titleSet>
        <lido:appellationValue>Porträt von Albrecht Dürer dem Älteren</lido:appellationValue>
    </lido:titleSet>
</lido:titleWrap>
```

`record_url`

```html
<lido:recordInfoLink>https://doi.org/10.16903/ethz-grs-D_008883</lido:recordInfoLink>
```

`person`

```html
<lido:displayActorInRole>Hollar, Wenzel (1607 - 1677)</lido:displayActorInRole>
 
<lido:displayActorInRole>Dürer, Albrecht (1471 - 1528), nach</lido:displayActorInRole>
```



`image_url`

```html
<lido:resourceRepresentation lido:type=http://terminology.lido-schema.org/lido00451>
<lido:linkResource>https://e-gs.ethz.ch/eMP/eMuseumPlus?service=ImageAsset&amp;module=collection&amp;objectId=21566&amp;resolution=mediumImageResolution</lido:linkResource>
</lido:resourceRepresentation>
```

`year_min`  `year_max`


```html
<lido:eventDate>
    <lido:displayDate>1498</lido:displayDate>
    <lido:date>
        <lido:earliestDate>1498</lido:earliestDate>
        <lido:latestDate>1498</lido:latestDate>
    </lido:date>
</lido:eventDate>
```

`classification`

```html
<lido:classification lido:type="Objektklassifikation">
<lido:conceptID lido:type=http://terminology.lido-schema.org/lido00099 lido:source=http://vocab.getty.edu/aat>http://vocab.getty.edu/aat/300041273</lido:conceptID>
<lido:term>Druckgraphik</lido:term>
</lido:classification>
```

`image_licence`

```html
<lido:rightsResource>
<lido:rightsType>
<lido:conceptID lido:type=http://terminology.lido-schema.org/lido00099>https://creativecommons.org/publicdomain/zero/1.0/</lido:conceptID>
<lido:term>CC0 1.0 Universal (CC0 1.0)</lido:term>
</lido:rightsType>
</lido:rightsResource>
```

`material_technique`

```html
<lido:displayMaterialsTech>Radierung</lido:displayMaterialsTech>
 
 
<lido:termMaterialsTech lido:type=http://terminology.lido-schema.org/lido00132>
<lido:conceptID lido:type=http://terminology.lido-schema.org/lido00099 lido:source=http://vocab.getty.edu/aat>http://vocab.getty.edu/aat/300053241</lido:conceptID>
<lido:term>Radierung</lido:term>
</lido:termMaterialsTech>
```


## functions

In [209]:
def get_titles(object_identification_wrap: bs4.element.Tag):
    """
    Collect the titles listed below the given element.

    Every ``lido:titleSet`` found below ``object_identification_wrap`` is
    inspected and the text of its first ``lido:appellationValue`` is taken.

    Parameters
    ----------
    object_identification_wrap : bs4.element.Tag
        Element that is searched for ``lido:titleSet`` children.

    Returns
    -------
    list of str
        One entry per title set that has an appellation value, in document
        order. Empty when no title is found.
    """
    titles = []
    for title_set in object_identification_wrap.find_all("lido:titleSet"):
        title = title_set.find("lido:appellationValue")
        # title sets without an appellation value are skipped
        if title is not None:
            titles.append(title.text)

    return titles

In [210]:
def find_lido_element_text(lido: bs4.element.Tag, 
                            element_name:str) -> str:
    """
    Generic lookup: return the text of the first ``element_name`` element.

    Parameters
    ----------
    lido : bs4.element.Tag
        Element to search in.
    element_name : str
        Qualified name of the element to look for, e.g. ``"lido:objectPublishedID"``.

    Returns
    -------
    str or None
        Text of the first matching element, or None when ``lido`` is None or
        no such element exists.
    """
    # explicit checks instead of catching every exception, so that genuine
    # errors are not hidden behind a printed message
    element = lido.find(element_name) if lido is not None else None
    if element is None:
        return None

    return element.text
    

In [211]:
def get_lido_id(lido: bs4.element.Tag):
    """
    Return the text of the first ``lido:lidoRecID`` element.

    Example element:

    <lido:lidoRecID lido:type="http://terminology.lido-schema.org/lido00100" lido:source="http://ld.zdb-services.de/resource/organisations/DE-MUS-079214">DE-MUS-079214/lido/05091715,T,001</lido:lidoRecID>

    Returns
    -------
    str or None
        The id text, or None when ``lido`` is None or the element is missing.
    """
    # explicit checks instead of catching every exception and printing it
    rec_id = lido.find("lido:lidoRecID") if lido is not None else None
    if rec_id is None:
        return None

    return rec_id.text

In [212]:
def get_image_url(lido: bs4.element.Tag) -> str:
    """
    Return the image url of a record.

    Assumes there is only one image and that only the resource representation
    with lido:type 'http://terminology.lido-schema.org/lido00451' is used.

    Parameters
    ----------
    lido : bs4.element.Tag
        Element to search in.

    Returns
    -------
    str or None
        Text of the ``lido:linkResource`` inside the first matching
        ``lido:resourceRepresentation``, or None if either element is not found.
    """
    resource_rep = lido.find(
        "lido:resourceRepresentation",
        attrs={"lido:type": "http://terminology.lido-schema.org/lido00451"},
    )
    if resource_rep is None:
        return None

    # look inside the representation that was just found; the previous code
    # read an undefined name here and depended on leftover kernel state
    url = resource_rep.find("lido:linkResource")
    if url is None:
        return None

    return url.text

In [213]:
def find_record_id(lido: bs4.element.Tag) -> str:
    """
    Return the text of the first ``lido:recordID`` element whose ``lido:type``
    is 'http://terminology.lido-schema.org/lido00100', or None if there is none.
    """
    wanted_attrs = {"lido:type":"http://terminology.lido-schema.org/lido00100"}
    record_id = lido.find("lido:recordID", attrs=wanted_attrs)

    return record_id.text if record_id else None

In [214]:
def find_record_url(lido: bs4.element.Tag) -> str:
    """
    Return the text of the first ``lido:recordInfoLink`` element, or None if
    the element is not found.
    """
    link = lido.find("lido:recordInfoLink")
    if not link:
        return None

    return link.text

In [215]:
def find_inventory_number(lido: bs4.element.Tag) -> str:
    """
    Return the text of the first ``lido:workID`` element whose ``lido:type`` is
    "Inventarnummer", or None if there is none.
    """
    work_id = lido.find("lido:workID", attrs={"lido:type":"Inventarnummer"})
    if not work_id:
        return None

    return work_id.text


In [216]:
def find_persons(lido) -> [str,]:
    """
    Return the text of every ``lido:displayActorInRole`` element in document
    order, or None when no such element exists.
    """
    actors = lido.find_all("lido:displayActorInRole")
    if not actors:
        return None

    return [actor.text for actor in actors]

In [217]:

def find_relevant_dates(lido) -> [str,]:
    """
    Collect the date strings of all events whose type is considered relevant.

    For every ``lido:event`` the event type is read from the ``lido:conceptID``
    (lido:type 'http://terminology.lido-schema.org/lido00099') inside
    ``lido:eventType``. If that id is one of ``event_types_to_find``, the dates
    of the event are gathered with ``find_min_max_years_within_event``.

    Parameters
    ----------
    lido
        Element to search for ``lido:event`` children.

    Returns
    -------
    list of str
        All date strings of the relevant events; empty when there are none.
    """
    event_types_to_find = [
        "http://terminology.lido-schema.org/lido00486",
        "http://terminology.lido-schema.org/lido00484",
        "http://terminology.lido-schema.org/lido00487",
        "http://terminology.lido-schema.org/lido00228",
    ]
    concept_id_type = "http://terminology.lido-schema.org/lido00099"
    found_dates = []

    for event in lido.find_all("lido:event"):
        event_type = event.find("lido:eventType")
        if event_type is None:
            # an event without a type cannot be classified; skip it instead of crashing
            continue

        concept_id = event_type.find("lido:conceptID", attrs={"lido:type": concept_id_type})
        if concept_id is None:
            continue

        # strip so that whitespace around the URI does not prevent a match
        if concept_id.text.strip() in event_types_to_find:
            found_dates += find_min_max_years_within_event(event)

    return found_dates

In [218]:
def find_min_max_years_within_event(lido) -> [str,]:
    """
    Collect the text of every date element below the given element.

    Despite its name this function does not compute a minimum or maximum; it
    only gathers the raw date strings. Reducing them to years is left to the
    caller.

    Parameters
    ----------
    lido
        Element (typically a single event) to search in.

    Returns
    -------
    list of str
        Text of all ``lido:displayDate`` elements, followed by all
        ``lido:earliestDate`` and then all ``lido:latestDate`` elements.
        Empty when none are present.
    """
    date_types = ["lido:displayDate", "lido:earliestDate", "lido:latestDate"]

    return [
        date.text
        for date_type in date_types
        for date in lido.find_all(date_type)
    ]

In [219]:
def get_min_max_year_from_dates(found_dates:[str,]) -> (int, int):
    """
    Reduce a list of date strings to the earliest and latest year.

    The year is taken from the first four characters of each (stripped)
    string. Entries whose first four characters are not a number, e.g.
    free-text display dates, are ignored.

    Parameters
    ----------
    found_dates : list of str
        Date strings such as "1498" or "1498-01-01". May be empty or None.

    Returns
    -------
    (int, int) or (None, None)
        ``(year_min, year_max)``, or ``(None, None)`` when no year could be
        extracted.
    """
    years = []
    for date_string in found_dates or []:
        try:
            years.append(int(str(date_string).strip()[:4]))
        except ValueError:
            # not a leading four-digit year; other entries may still qualify
            continue

    if not years:
        return None, None

    return min(years), max(years)

In [220]:
def find_title(lido: bs4.element.Tag) -> str:
    """
    Return the first title of a record.

    There can be multiple titles; this function only uses the first
    ``lido:titleSet`` and the first ``lido:appellationValue`` inside it.

    Returns
    -------
    str or None
        The title text, or None when ``lido`` is None, there is no title set,
        or the title set has no appellation value.
    """
    title_set = lido.find("lido:titleSet") if lido is not None else None
    if title_set is None:
        return None

    title_element = title_set.find("lido:appellationValue")
    # a title set without an appellation value previously raised on `.text`
    if title_element is None:
        return None

    return title_element.text

In [221]:
def find_classification(lido: bs4.element.Tag) -> str:
    """
    Return the term of the first ``lido:classification`` element.

    Returns
    -------
    str or None
        Text of the first ``lido:term`` inside the first classification, or
        None when the classification or its term is missing.
    """
    classification = lido.find("lido:classification")
    if classification is None:
        return None

    term = classification.find("lido:term")
    # a classification may carry only a conceptID and no term
    if term is None:
        return None

    return term.text

In [222]:
def find_insitution_isil(lido)->str:
    """
    Return the text of the first ``lido:legalBodyID`` element whose
    ``lido:type`` is 'http://terminology.lido-schema.org/lido00099', or None
    if there is none.
    """
    wanted_attrs = {"lido:type":"http://terminology.lido-schema.org/lido00099"}
    legal_body_id = lido.find("lido:legalBodyID", attrs=wanted_attrs)

    return legal_body_id.text if legal_body_id else None

def find_insitution_name(lido)->str:
    """
    Return the name of the institution (legal body) of a record.

    Returns
    -------
    str or None
        Text of the first ``lido:appellationValue`` inside the first
        ``lido:legalBodyName``, or None when either element is missing.
    """
    legal_body_name = lido.find("lido:legalBodyName")
    if legal_body_name is None:
        return None

    appellation = legal_body_name.find("lido:appellationValue")
    # guard against a legalBodyName without an appellation value
    if appellation is None:
        return None

    return appellation.text


In [223]:
def find_image_licence(lido) -> str:
    """
    Return the licence term of the image.

    Returns
    -------
    str or None
        Text of the first ``lido:term`` inside the first
        ``lido:rightsResource``, or None when either element is missing.
    """
    rights_resource = lido.find("lido:rightsResource")
    # records without any rights information previously raised here
    if rights_resource is None:
        return None

    term = rights_resource.find("lido:term")
    if term is None:
        return None

    return term.text

In [224]:
def find_credit_line(lido) -> str:
    """
    Return the text of the first ``lido:creditLine`` element, or None if the
    element is not found.
    """
    credit_line = lido.find("lido:creditLine")

    return credit_line.text if credit_line else None

In [225]:
def find_material_techniques(lido) -> [str,]:
    """
    Return the text of every ``lido:displayMaterialsTech`` element in document
    order, or None when no such element exists.
    """
    entries = lido.find_all("lido:displayMaterialsTech")
    if not entries:
        return None

    return [entry.text for entry in entries]

In [226]:
def find_all_lido_mappings(lido) -> dict:
    """
    Build the dictionary of mapped fields for a single record.

    Each key is filled by the dedicated extractor function of this notebook.
    ``year_min`` and ``year_max`` are derived from the dates returned by
    ``find_relevant_dates``.

    Parameters
    ----------
    lido
        Element holding one record.

    Returns
    -------
    dict
        Keys: record_id, object_published_id, lido_id, image_url, record_url,
        inventory_number, institution_isil, institution_name, image_licence,
        credit_line, title, person, material_technique, year_min, year_max,
        classification.
    """
    # identifiers, links and descriptive fields: one extractor per key
    lido_dict = {
        'record_id': find_record_id(lido),
        'object_published_id': find_lido_element_text(lido,"lido:objectPublishedID"),
        'lido_id': get_lido_id(lido),
        'image_url': get_image_url(lido),
        'record_url': find_record_url(lido),
        'inventory_number': find_inventory_number(lido),
        'institution_isil': find_insitution_isil(lido),
        'institution_name': find_insitution_name(lido),
        'image_licence': find_image_licence(lido),
        'credit_line': find_credit_line(lido),
        'title': find_title(lido),
        'person': find_persons(lido),
        'material_technique': find_material_techniques(lido),
    }

    # year range spanned by the dates of the relevant events
    year_min, year_max = get_min_max_year_from_dates(find_relevant_dates(lido))
    lido_dict['year_min'] = year_min
    lido_dict['year_max'] = year_max

    lido_dict['classification'] = find_classification(lido)

    return lido_dict

# MAIN

In [227]:
# root of the data directory (relative path)
DATA_DIR = "../../data/"
# institution whose files are processed; used to build the raw/<institution>/lido/ path
institution="graphik_portal"

In [228]:

# directory with the institution's LIDO files, relative to DATA_DIR
dir_path = f"raw/{institution}/lido/"
# names of the files found in that directory
flist = os.listdir(os.path.join(DATA_DIR, dir_path))
flist

['Technisches_Beispiel.xml', 'Albrecht_Duerer.xml']

In [229]:
# load a LIDO file and parse it with Beautiful Soup
fname = flist[1]
fpath = os.path.join(DATA_DIR, dir_path, fname)
# Read with an explicit encoding so that non-ASCII characters (umlauts etc.)
# decode the same way on every platform instead of using the locale default.
# NOTE(review): assumes the LIDO exports are UTF-8 encoded - confirm.
with open(fpath, 'r', encoding='utf-8') as f:
    content = f.read()

# parse the whole document with the XML parser
bs_content = bs4.BeautifulSoup(content, "xml")

In [230]:

# find all the individual LIDO records in the parsed document
lidos = bs_content.find_all('lido:lido')

# lido:lido appears to be the main element that contains all the metadata for
# one object, so each one is mapped separately
for lido in lidos:

    lido_dict = find_all_lido_mappings(lido)

    # TODO: add record to list / write to db; for now the mapping is only pretty-printed
    pprint(lido_dict,indent=2)

{ 'classification': 'Druckgrafik (Visuelles Werk)',
  'credit_line': 'gemeinfrei',
  'image_licence': 'Rechte vorbehalten - Freier Zugang',
  'image_url': 'http://www.bildindex.de/bilder/m/mi11813c13',
  'institution_isil': 'http://ld.zdb-services.de/resource/organisations/DE-MUS-079214',
  'institution_name': 'Wallraf-Richartz-Museum - Fondation Corboud (Köln)',
  'inventory_number': '25367',
  'lido_id': 'DE-MUS-079214/lido/05091715,T,001',
  'material_technique': [ 'Holzschnitt auf Papier',
                          'Turm mit Krone, Briquet 15863'],
  'object_published_id': 'DE-MUS-079214/object/05091715,T,001',
  'person': [ 'Albrecht Dürer (1471-1928), Inventor',
              'Albrecht Dürer (1471-1928), Holzschneider',
              'Albrecht Dürer (1471-1928), Verleger',
              'Metropolitan Museum of Art (New York, NY)'],
  'record_id': 't2/05091715,T,001',
  'record_url': 'https://www.bildindex.de/document/obj05091715?part=1',
  'title': 'Die Jungfrau erscheint Johanne