In [None]:
#| default_exp europena

# Europeana parsers

> Parsers for Europeana newspapers

**Note:** some of these parsers are likely to be more generic, but we'll develop them for Europeana newspapers for now. Once they have been tested on other collections they may be moved to a core module.

In [None]:
#|hide
from nbdev.showdoc import *


In [None]:
#|export
import io
import sys
import xml.etree.ElementTree as ET
from functools import lru_cache
from statistics import mean
from statistics import stdev
from typing import Any
from typing import Optional

from toolz import memoize

## ALTO Processing

ALTO is an XML format commonly used to store the output of Optical Character Recognition (OCR) software.

In [None]:
from pathlib import Path

Create test data 

In [None]:
# Recursively collect every .xml file found under the local test_data directory.
xmls = list(Path("test_data").rglob("*.xml"))

In [None]:
# Display the first collected path as a quick sanity check.
xmls[0]

Path('test_data/9200396/BibliographicResource_3000118435009/63.xml')

In [None]:
# filenames = !rg -l "Illustration" test_data 

In [None]:
def alto_parse(alto, **kargs):
    """Parse an ALTO XML document and determine which ALTO namespace it uses.

    Parameters
    ----------
    alto : path or file object
        Anything accepted by `xml.etree.ElementTree.parse`.
    **kargs
        Forwarded unchanged to `ET.parse`.

    Returns
    -------
    tuple or None
        `(alto, xml, xmlns)` -- the original input, the parsed `ElementTree`
        and the namespace URI -- when the namespace is one of the registered
        ALTO namespaces. `None` (after printing a message) when the file
        cannot be parsed or its namespace is missing / not registered.
    """
    # `alto` may be a plain string path or a file object without `.name`,
    # so never assume the attribute exists when building messages.
    source_name = getattr(alto, 'name', alto)
    try:
        xml = ET.parse(alto, **kargs)
    except ET.ParseError as e:
        print(f"Parser Error in file '{alto}': {e}")
        # Nothing was parsed, so there is no tree to inspect further.
        return None
    # Register ALTO namespaces
    # https://www.loc.gov/standards/alto/ | https://github.com/altoxml
    # alto-bnf (unofficial) BnF ALTO dialect - for further info see
    # http://bibnum.bnf.fr/alto_prod/documentation/alto_prod.html
    namespace = {'alto-1': 'http://schema.ccs-gmbh.com/ALTO',
                 'alto-2': 'http://www.loc.gov/standards/alto/ns-v2#',
                 'alto-3': 'http://www.loc.gov/standards/alto/ns-v3#',
                 'alto-4': 'http://www.loc.gov/standards/alto/ns-v4#',
                 'alto-bnf': 'http://bibnum.bnf.fr/ns/alto_prod'}
    # Extract namespace from document root: ElementTree spells a namespaced
    # tag as '{uri}local', so the part before '}' is the namespace URI.
    root = xml.getroot()
    tag_namespace = root.tag.split('}')[0].strip('{')
    if 'http://' in tag_namespace:
        xmlns = tag_namespace
    else:
        # Fallback: use the first whitespace-delimited token of the first
        # root attribute value (read directly instead of via the dict repr,
        # which broke when the root carried more than one attribute).
        first_value = next(iter(root.attrib.values()), None)
        if first_value is None:
            sys.stderr.write(
                f'\nERROR: File "{source_name}": no namespace declaration found.')
            xmlns = 'no_namespace_found'
        else:
            xmlns = first_value.split(' ')[0]
    if xmlns in namespace.values():
        return alto, xml, xmlns
    sys.stdout.write(f'\nERROR: File "{source_name}": namespace {xmlns} is not registered.\n')
    return None

In [None]:
# On success alto_parse returns an (input, parsed tree, namespace URI) triple.
fname, xml, ns = alto_parse(xmls[0])
fname

Path('test_data/9200396/BibliographicResource_3000118435009/63.xml')

In [None]:
def get_alto_text(xml, xmlns, join_lines=True):
    """Extract the text and word-confidence statistics from a parsed ALTO tree.

    Parameters
    ----------
    xml : ElementTree or Element
        Parsed ALTO document; searched with `iterfind`.
    xmlns : str
        Namespace URI used to qualify the `TextLine` / `String` tags.
    join_lines : bool
        Accepted for backward compatibility; currently has no effect.

    Returns
    -------
    tuple
        `(text, mean_ocr, std_ocr)` where `text` is every word joined by a
        single space, `mean_ocr` is the mean of the `WC` values (`None` when
        no `String` carries a `WC`) and `std_ocr` is their sample standard
        deviation (`None` when fewer than two `WC` values exist).
    """
    words = []
    confidences = []
    # Find all <TextLine> elements
    for text_line in xml.iterfind('.//{%s}TextLine' % xmlns):
        # Find all <String> elements that are direct children of the line
        for string in text_line.findall('{%s}String' % xmlns):
            attrs = string.attrib
            # WC may be absent, so use .get() rather than indexing.
            wc = attrs.get('WC')
            if wc is not None:
                confidences.append(float(wc))
            subs_type = attrs.get('SUBS_TYPE') or ''
            subs_content = attrs.get('SUBS_CONTENT')
            if subs_content is not None and 'HypPart1' in subs_type:
                # First half of a hyphenated word: emit the full word once.
                text = subs_content
            elif subs_content is not None and 'HypPart2' in subs_type:
                # Second half: the full word was already emitted by HypPart1,
                # so skip it instead of re-appending a stale value.
                continue
            else:
                # Plain word (or incomplete substitution info): use @CONTENT.
                text = attrs.get('CONTENT')
            if text is not None:
                words.append(text)
    # mean() needs at least one value and stdev() at least two.
    mean_ocr = mean(confidences) if confidences else None
    std_ocr = stdev(confidences) if len(confidences) > 1 else None
    return " ".join(words), mean_ocr, std_ocr

In [None]:
# Smoke test of get_alto_text on the sample file parsed above.
text, wc, std_ocr = get_alto_text(xml, ns)
# Truthiness check: fails on empty text or on a statistic that is exactly 0.0.
assert all([text, wc, std_ocr])
assert isinstance(text,str)
assert isinstance(wc,float)
assert isinstance(std_ocr,float)

In [None]:
def alto_illustrations(xml, xmlns):
    """Collect the bounding box of every <Illustration> element in an ALTO tree.

    Each box is returned as the string "HEIGHT,WIDTH,VPOS,HPOS", built from
    the element's attributes of the same names, in document order.
    Raises TypeError if one of those attributes is missing.
    """
    illustration_path = './/{%s}Illustration' % xmlns
    coordinate_keys = ('HEIGHT', 'WIDTH', 'VPOS', 'HPOS')
    return [
        ','.join(node.attrib.get(key) for key in coordinate_keys)
        for node in xml.iterfind(illustration_path)
    ]


In [None]:
# Bounding boxes for the sample page; an empty list means no <Illustration> elements were found.
alto_illustrations(xml,ns)

[]

In [None]:
def get_illustrations(xmls):
    """Lazily yield each path in `xmls` whose text contains "Illustration".

    Every file is read line by line and reading stops at the first matching
    line, so each path is yielded at most once per occurrence in `xmls`.
    """
    for path in xmls:
        with open(path,"r") as handle:
            # any() short-circuits on the first line that mentions the marker.
            mentions_illustration = any("Illustration" in row for row in handle)
        if mentions_illustration:
            yield path

In [None]:
# set() drops duplicate paths, so the order of the resulting list is arbitrary.
illustration_xmls = list(set(get_illustrations(xmls)))


In [None]:
# url = f'http://www.europeana.eu/api/v2/search.json?wskey={API_KEY}&query=europeana_collectionName="9200300_Ag_EU_TEL_a0600_Newspapers_ONB"'


In [None]:
# import requests

In [None]:

# r = requests.get(url)

In [None]:
# id = '9200300/BibliographicResource_3000095610170'
# url = f'https://api.europeana.eu/record/v2/{id}.json?wskey={API_KEY}'
# r = requests.get(url)
# data = r.json()['object']['proxies'][1]

In [None]:
# def simplify(obj: Any, key:Optional[str]=None):
#     if type(obj) != dict:
#         return obj
#     r = {}
#     for k, v in obj.items():
#         if k == "def":
#             r[key] = simplify(obj[k], k)
#         else:
#             r[f"{key}-{k}"] = simplify(obj[k], k)
#     return r




In [None]:
# @lru_cache(maxsize=512)
# def metadata_for(id):
#     r = f'https://api.europeana.eu/record/v2/{id}.json?wskey={API_KEY}'
#     if not (r := requests.get(r)):
#         return None
#     try:
#         data = r.json()['object']['proxies'][1]
#         output = {}
#         for k, v in data.items():
#             item = simplify(v, k)
#             if type(item) == dict:
#                 for k2, v2 in item.items():
#                     output[k2] = v2
#             else:
#                 output[k] = item
#         return output
#     except Exception as e:
#         print(e)
#         return None

In [None]:
# test_id = '9200300/BibliographicResource_3000095610170'


In [None]:
# metadata_for(id)

{'about': '/proxy/provider/9200300/BibliographicResource_3000095610170',
 'dcIdentifier': ['http://data.theeuropeanlibrary.org/BibliographicResource/3000095610170'],
 'dcLanguage': ['deu'],
 'dcRelation': ['http://de.wikipedia.org/wiki/Neuigkeits-Welt-Blatt'],
 'dcSubject': ['http://d-nb.info/gnd/4067510-5'],
 'dcTitle': ['Neuigkeits-Welt-Blatt'],
 'dcType': ['http://data.europeana.eu/concept/base/18'],
 'dcType-en': ['Newspaper Title', 'Serial'],
 'dctermsHasPart': ['http://data.europeana.eu/item/9200300/BibliographicResource_3000095593161',
  'http://data.europeana.eu/item/9200300/BibliographicResource_3000095594186',
  'http://data.europeana.eu/item/9200300/BibliographicResource_3000095595319',
  'http://data.europeana.eu/item/9200300/BibliographicResource_3000095595686',
  'http://data.europeana.eu/item/9200300/BibliographicResource_3000095597317',
  'http://data.europeana.eu/item/9200300/BibliographicResource_3000095597743',
  'http://data.europeana.eu/item/9200300/BibliographicRe