In [None]:
# | default_exp europena

# Europeana newspaper parsers

> Parsers for Europeana newspapers

The goal of this code is to create a pipeline for parsing the [Europeana newspaper bulk downloads](https://pro.europeana.eu/page/iiif#download) and converting the original ALTO XML formats + metadata into a format that can be ingested easily into the 🤗 [datasets](https://huggingface.co/docs/datasets/index) library and cons

 for #BigLAM. This code is mostly collated from other places. We used [nbdev](https://nbdev.fast.ai/) to give our code some:

- basic tests
- some basic documentation 
- an easy way to install it as a Python package.

**Note:** some of these parsers are likely to be more generic, but we'll develop them for Europeana newspapers for now. Once they have been tested on other collections they may be moved to a core module.

In [None]:
# %load_ext rich

In [None]:
# |hide
from nbdev.showdoc import *

In [None]:
# |export
import io
import os
import xml
import xml.etree.ElementTree as ET
from concurrent.futures import ProcessPoolExecutor, as_completed
# from dataclaises import asdict, dataclass, field
from functools import lru_cache
from pathlib import Path
from statistics import mean, stdev
from attrs import asdict

from typing import Any, Dict, Iterable, List, Optional, Union
from attrs import define, field

import xmltodict
from toolz import partition_all
from tqdm.auto import tqdm

## ALTO Processing

ALTO is an XML format commonly used to store the output of Optical Character Recognition software.

### Create test data

A small amount of test data is included in the repository and versioned with Git. To avoid bloating the repository too much, the cell below can either be run as part of the tests (including slow tests) or run locally to give yourself a larger amount of data to test methods with.

In [None]:
# |slow
# !aria2c -x 4 -d test_data/ --auto-file-renaming=false ftp://download.europeana.eu/newspapers/fulltext/alto/9200357.zip

In [None]:
# !unzip test_data/9200357.zip -d test_data/

In [None]:
xmls = list(Path("test_data").rglob("*.xml"))
len(xmls)

108432

In [None]:
xmls[:2]

[Path('test_data/9200357/BibliographicResource_3000095232526/8.xml'),
 Path('test_data/9200357/BibliographicResource_3000095232526/9.xml')]

## Parse ALTO XMLs

The first step is to parse the XML file from disk into an ElementTree that we can use for other tasks.
Borrowed from: https://github.com/cneud/alto-tools/blob/master/alto_tools.py

In [None]:
# |export
from loguru import logger

In [None]:
# |export
def alto_parse(alto: Union[str, Path], **kwargs):
    """Parse an ALTO XML file and identify its ALTO namespace.

    Args:
        alto: Path to the ALTO XML file, as a string or a `Path`.
        **kwargs: Extra keyword arguments forwarded to `ET.parse`.

    Returns:
        A tuple `(alto, tree, xmlns)` holding the input as given, the parsed
        element tree and the namespace URI, or `None` when the file is not
        well-formed XML or its namespace is not one of the registered ALTO
        namespaces. Both failure cases are logged.
    """
    # Label used in log messages. A plain string has no `.name` attribute, so
    # normalise through `Path` instead of calling `alto.name` directly.
    alto_name = Path(alto).name if isinstance(alto, (str, Path)) else alto
    try:
        tree = ET.parse(alto, **kwargs)
    except ET.ParseError as e:
        logger.error(f"Parser Error in file '{alto}': {e}")
        return None
    # Register ALTO namespaces
    # https://www.loc.gov/standards/alto/ | https://github.com/altoxml
    # alto-bnf (unofficial) BnF ALTO dialect - for further info see
    # http://bibnum.bnf.fr/alto_prod/documentation/alto_prod.html
    namespace = {
        "alto-1": "http://schema.ccs-gmbh.com/ALTO",
        "alto-2": "http://www.loc.gov/standards/alto/ns-v2#",
        "alto-3": "http://www.loc.gov/standards/alto/ns-v3#",
        "alto-4": "http://www.loc.gov/standards/alto/ns-v4#",
        "alto-5": "http://schema.ccs-gmbh.com/docworks/version20/alto-1-4.xsd",
        "alto-bnf": "http://bibnum.bnf.fr/ns/alto_prod",
    }
    # Extract namespace from document root: ElementTree renders a namespaced
    # tag as "{uri}local", so the text before "}" is the namespace URI.
    root = tree.getroot()
    tag_namespace = root.tag.split("}")[0].strip("{")
    if "http://" in str(tag_namespace):
        xmlns = tag_namespace
    else:
        # No namespace on the root tag: fall back to scraping a URI out of the
        # string form of the root attributes (e.g. a schema location value).
        try:
            xmlns = str(root.attrib).split(" ")[1].strip("}").strip("'")
        except IndexError:
            logger.warning(f"File {alto_name}: no namespace declaration found.")
            xmlns = "no_namespace_found"
    if xmlns in namespace.values():
        return alto, tree, xmlns
    logger.warning(f"File {alto_name}: namespace {xmlns} is not registered.")
    return None

In [None]:
xmls[0]

Path('test_data/9200357/BibliographicResource_3000095232526/8.xml')

In [None]:
fname, xml, ns = alto_parse(xmls[0])
assert all([fname, xml, ns])
# An empty file is not well-formed XML, so `alto_parse` should log a parser
# error and return None instead of raising.
Path("fake.xml").touch(exist_ok=True)
bad_xml = alto_parse(Path("fake.xml"))
assert bad_xml is None
# Remove the scratch file so it does not linger in the working directory.
Path("fake.xml").unlink()

2022-08-09 09:51:26.547 | ERROR    | __main__:alto_parse:7 - Parser Error in file 'fake.xml': no element found: line 1, column 0


In [None]:
# |export
def get_alto_text(xml, xmlns, join_lines=True):
    """Extract the text and OCR confidence statistics from a parsed ALTO tree.

    Args:
        xml: Parsed ALTO document (anything exposing `iterfind`, e.g. an
            `ElementTree`).
        xmlns: Namespace URI used by the ALTO elements in `xml`.
        join_lines: Currently unused; kept so existing callers keep working.

    Returns:
        A tuple `(text, mean_ocr, std_ocr)`:
        - `text`: all words joined by single spaces ("" when there are none).
        - `mean_ocr`: mean of the `WC` attribute values, or None if no
          `<String>` carries one.
        - `std_ocr`: sample standard deviation of the `WC` values, or None
          when fewer than two values are available.
    """
    words = []
    confidences = []
    # Find all <TextLine> elements
    for text_line in xml.iterfind(".//{%s}TextLine" % xmlns):
        # Find all <String> elements
        for string in text_line.findall("{%s}String" % xmlns):
            attributes = string.attrib
            # WC is optional, so do not assume it is present.
            wc = attributes.get("WC")
            if wc is not None:
                confidences.append(float(wc))
            subs_type = attributes.get("SUBS_TYPE") or ""
            subs_content = attributes.get("SUBS_CONTENT")
            if subs_content is not None and "HypPart1" in subs_type:
                # First half of a hyphenated word: emit the full word once.
                word = subs_content
            elif subs_content is not None and "HypPart2" in subs_type:
                # Second half: the full word was already emitted with part 1.
                continue
            else:
                # Get value of attribute @CONTENT for ordinary strings
                word = attributes.get("CONTENT")
            if word is not None:
                words.append(word)
    mean_ocr = mean(confidences) if confidences else None
    # A sample standard deviation needs at least two data points.
    std_ocr = stdev(confidences) if len(confidences) >= 2 else None
    return " ".join(words), mean_ocr, std_ocr

In [None]:
# Every returned value must be truthy and of the expected type.
text, wc, std_ocr = get_alto_text(xml, ns)
for returned_value, expected_type in ((text, str), (wc, float), (std_ocr, float)):
    assert returned_value
    assert isinstance(returned_value, expected_type)

In [None]:
# |export
def alto_illustrations(xml, xmlns):
    """Extract the bounding boxes of all illustrations in a parsed ALTO tree.

    Args:
        xml: Parsed ALTO document (anything exposing `iterfind`).
        xmlns: Namespace URI used by the ALTO elements in `xml`.

    Returns:
        One list per `<Illustration>` element, in document order, holding its
        `[HEIGHT, WIDTH, VPOS, HPOS]` attributes as floats. An attribute that
        is absent from the element is reported as None.
    """

    def _to_float(value):
        # `float(None)` raises, so pass missing attributes through as None.
        return None if value is None else float(value)

    coordinate_attributes = ("HEIGHT", "WIDTH", "VPOS", "HPOS")
    # Find all <Illustration> elements and read their coordinates
    return [
        [_to_float(illustration.attrib.get(name)) for name in coordinate_attributes]
        for illustration in xml.iterfind(".//{%s}Illustration" % xmlns)
    ]

In [None]:
alto_illustrations(xml, ns)

[[113.0, 563.0, 2858.0, 512.0],
 [178.0, 774.0, 4211.0, 363.0],
 [474.0, 1107.0, 4519.0, 366.0],
 [147.0, 2312.0, 466.0, 1540.0],
 [113.0, 186.0, 732.0, 1746.0],
 [110.0, 172.0, 734.0, 2020.0],
 [112.0, 170.0, 738.0, 2884.0],
 [108.0, 214.0, 737.0, 3444.0],
 [194.0, 719.0, 1327.0, 1743.0],
 [158.0, 254.0, 1363.0, 3412.0],
 [102.0, 966.0, 2766.0, 2920.0],
 [364.0, 1679.0, 2889.0, 1686.0],
 [560.0, 341.0, 3670.0, 3562.0],
 [105.0, 1020.0, 4236.0, 2383.0],
 [129.0, 1524.0, 4656.0, 1574.0],
 [37.0, 387.0, 3597.0, 2054.0]]

In [None]:
def get_illustrations(xmls):
    """Lazily yield the files whose text mentions "Illustration".

    This is a cheap line-by-line substring scan, not an XML parse; reading a
    file stops at the first matching line.

    Args:
        xmls: Iterable of file paths to scan.

    Yields:
        Each entry of `xmls` that has at least one line containing the
        substring "Illustration", in input order.
    """
    for file in xmls:
        # Undecodable bytes are replaced rather than raising: only an ASCII
        # substring is searched for, so they cannot affect the result.
        with open(file, "r", encoding="utf-8", errors="replace") as f:
            has_illustration = any("Illustration" in line for line in f)
        # Yield only after the file is closed, so a consumer that stops early
        # (e.g. `take(10, ...)`) does not leave a file handle open.
        if has_illustration:
            yield file

In [None]:
from toolz import take

In [None]:
illustration_xmls = list(take(10, get_illustrations(xmls)))

In [None]:
illustration_xmls

[Path('test_data/9200357/BibliographicResource_3000095232526/8.xml'),
 Path('test_data/9200357/BibliographicResource_3000095232526/9.xml'),
 Path('test_data/9200357/BibliographicResource_3000095232526/12.xml'),
 Path('test_data/9200357/BibliographicResource_3000095232526/13.xml'),
 Path('test_data/9200357/BibliographicResource_3000095232526/11.xml'),
 Path('test_data/9200357/BibliographicResource_3000095232526/10.xml'),
 Path('test_data/9200357/BibliographicResource_3000095232526/4.xml'),
 Path('test_data/9200357/BibliographicResource_3000095232526/5.xml'),
 Path('test_data/9200357/BibliographicResource_3000095232526/6.xml'),
 Path('test_data/9200357/BibliographicResource_3000095232526/2.xml')]

## Newspaper page container 



In [None]:
#|export
@define(slots=True)
class NewspaperPageAlto:
    """Container for the data extracted from one ALTO page file.

    `item_id` is not passed to the constructor; it is derived from `fname`
    once the other attributes have been set.
    """

    # Path of the ALTO XML file this page was parsed from.
    fname: Union[str, Path]
    text: Optional[str]
    mean_ocr: Optional[float]
    std_ocr: Optional[float]
    bounding_boxes: List[Union[float, None]]
    item_id: str = field(init=False)

    def _get_id(self):
        """Join the two path components that precede the file name with "/"."""
        path_parts = Path(self.fname).parts
        enclosing_dirs = path_parts[-3:-1]
        return "/".join(enclosing_dirs)

    def __attrs_post_init__(self):
        """Fill in `item_id` after attrs has initialised the other fields."""
        self.item_id = self._get_id()

In [None]:
# |export
def parse_newspaper_page(xml_fname: Union[str, Path]):
    """Parse one ALTO page file into a `NewspaperPageAlto`.

    Args:
        xml_fname: Path to the ALTO XML file.

    Returns:
        A `NewspaperPageAlto` with the page text, OCR confidence statistics
        and illustration bounding boxes.

    Raises:
        ValueError: If `alto_parse` returns None for the file, i.e. it could
            not be parsed or its namespace is not registered.
    """
    parsed = alto_parse(xml_fname)
    if parsed is None:
        # Fail with a clear message instead of an opaque tuple-unpacking error.
        raise ValueError(f"Could not parse '{xml_fname}' as a supported ALTO file")
    _, tree, namespace = parsed
    text, mean_ocr, std_ocr = get_alto_text(tree, namespace)
    bounding_boxes = alto_illustrations(tree, namespace)
    return NewspaperPageAlto(xml_fname, text, mean_ocr, std_ocr, bounding_boxes)

In [None]:
page = parse_newspaper_page(xmls[20])

In [None]:
# `isinstance` needs real types: use `type(None)` rather than `None`, which
# raises TypeError as soon as the value is not an instance of the first type.
assert page
assert isinstance(page, NewspaperPageAlto)
assert isinstance(page.text, (str, type(None)))
assert isinstance(page.mean_ocr, (float, type(None)))
assert isinstance(page.std_ocr, (float, type(None)))
assert isinstance(page.bounding_boxes, list)
assert isinstance(page.item_id, str)

In [None]:
page

NewspaperPageAlto(fname=Path('test_data/9200357/BibliographicResource_3000095237868/3.xml'), text='KŁ7Z< MONITOR POLSKI — Dnia 30! fi- 11 -1 i HiliMiiiiia MiaiHiai Hmma i IM bb aafti t hu Mubhmi aa; ia MmjTawa M in IS2. POSTANOWIENIE Ministrów Przemyśla i Haidło «az Skarb* w ftied- minie zatwierdzenia statets spółki akeyjstj f. f. „Gaiti- niS", Spółka Akcyjna w Krakowie".. Na zasadzie art. 1 Ustawy z d». 29-go kwietnia I&I9 roku o zatwierdzaniu i zmianie stetotów spófek akcyjnych, zatwierdzamy dołączony do niniejszego sta tut spółki akc. pod firmą: „Garbnik", Spółka Akcyjna w Krakowie". Minister Skarbu: (—) Steczkowski. Minister Przemysł* i i Handlu: (—) S. Pmanowski. Warszawa, dnia 16 marca 1921 roku. STATUT Spółki Akcyjnej pod Erm§: „GARBNIK" SPÓŁKA AKCYJNA W KRAKOWIE. \\ Przedmiot, firma, siedziba oraz czas trwania Spółki § 1. W celu: 1) wykonania na własny rachunek lub na zlecenie osób trzecich\', interesów handlowych i komisowych ar tykułami pomocniczymi garbarskimi z dziedziny pa

## Get metadata 

The next step is to create some functionality to get metadata for the items. There are two possible ways we can do this:
- via the metadata download dumps
- via the Europeana API


In [None]:
# |slow
#!aria2c -x 4 -d test_metadata -o metadata.zip --auto-file-renaming=false ftp://download.europeana.eu/newspapers/metadata/9200357.zip

In [None]:
# !unzip -d test_data_metadata test_metadata/metadata.zip

In [None]:
metadata_examples = list(take(500, Path("test_data_metadata").rglob("*.xml")))

In [None]:
# |export
@define(slots=True)
class NewspaperPageMetadata:
    """Container for the metadata read from one metadata XML file.

    After initialisation:
    - `languages` given as a comma-separated string is split into a list.
    - a string `title` is cut at its first "-" and stripped of spaces.
    - `metadata_xml_fname` is converted to `str`.
    """

    metadata_xml_fname: Union[str, Path]
    title: Optional[str]
    date: Optional[str]
    languages: Union[List[str], str, None]
    item_iiif_url: Optional[str]
    all_metadata_dict: Dict[Any, Any]

    def __attrs_post_init__(self):
        """Normalise the raw values passed to the constructor."""
        if isinstance(self.languages, str):
            self.languages = self.languages.split(",")
        # `title` is Optional, so only normalise it when it really is a string
        # (a missing title would otherwise raise AttributeError here).
        if isinstance(self.title, str):
            self.title = self.title.split("-")[0].strip(" ")
        self.metadata_xml_fname = str(self.metadata_xml_fname)

In [None]:
# |export
def get_metadata_from_xml(xml_file: Union[Path, str]):
    """Read a metadata XML file into a `NewspaperPageMetadata`.

    Args:
        xml_file: Path to the metadata XML file.

    Returns:
        A `NewspaperPageMetadata` holding the title, issue date, language(s)
        and image URL, plus the whole parsed `rdf:RDF` mapping.

    Raises:
        KeyError: If one of the expected elements is missing from the file.
    """
    # Open in binary mode so the XML parser decodes the bytes itself (honouring
    # the document's own encoding declaration) instead of relying on the
    # platform's default text encoding.
    with open(xml_file, "rb") as f:
        parsed = xmltodict.parse(f)
    metadata = parsed["rdf:RDF"]
    provided_cho = metadata["edm:ProvidedCHO"]
    title = provided_cho["dc:title"]
    date = provided_cho["dcterms:issued"]
    languages = provided_cho["dc:language"]
    iiif_url = metadata["ore:Aggregation"]["edm:isShownBy"]["@rdf:resource"]
    return NewspaperPageMetadata(xml_file, title, date, languages, iiif_url, metadata)

In [None]:
# `type(None)` instead of `None`: `isinstance` raises TypeError on a bare None
# as soon as `languages` is not a list.
for metadata_xml in metadata_examples:
    metadata = get_metadata_from_xml(metadata_xml)
    assert metadata
    assert isinstance(metadata.languages, (list, type(None)))

In [None]:
metadata

NewspaperPageMetadata(metadata_xml_fname='test_data_metadata/http%3A%2F%2Fdata.theeuropeanlibrary.org%2FBibliographicResource%2F3000095246027.edm.xml', title='Głos Śląski', date='1917-10-30', languages=['pl'], item_iiif_url='https://iiif.europeana.eu/image/PUXTDGFKG2LLRLBO3B6Y5KYMC62PLD4T2LERZFPAH324CH53ASPQ/presentation_images/87b09e00-0235-11e6-a696-fa163e2dd531/node-1/image/NLP/Głos_Śląski/1917/10/30/00742/full/full/0/default.jpg', all_metadata_dict={'@xmlns:cc': 'http://creativecommons.org/ns#', '@xmlns:ore': 'http://www.openarchives.org/ore/terms/', '@xmlns:owl': 'http://www.w3.org/2002/07/owl#', '@xmlns:svcs': 'http://rdfs.org/sioc/services#', '@xmlns:skos': 'http://www.w3.org/2004/02/skos/core#', '@xmlns:rdfs': 'http://www.w3.org/2000/01/rdf-schema#', '@xmlns:rdaGr2': 'http://rdvocab.info/ElementsGr2/', '@xmlns:rdagrp2': 'http://rdvocab.info/ElementsGr2/', '@xmlns:edm': 'http://www.europeana.eu/schemas/edm/', '@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance', '@xmlns:rdf

## Linking metadata and XML
We need to be able to link the data we got from the ALTO XML with our metadata and smush them together. We have our `page.item_id` attribute, which will hopefully be sufficient to grab the related metadata file. 

In [None]:
page.item_id

'9200357/BibliographicResource_3000095237868'

In [None]:
# |export
def get_metadata_for_page(
    page: NewspaperPageAlto, metadata_directory: Optional[str] = None
):
    """Load the metadata record that belongs to a parsed ALTO page.

    The metadata file name is built from the part of `page.item_id` after its
    last "_".

    Args:
        page: Parsed page whose `item_id` identifies the item.
        metadata_directory: Directory holding the `*.edm.xml` metadata files.
            When None, the current working directory is used.

    Returns:
        The `NewspaperPageMetadata` produced by `get_metadata_from_xml`.
    """
    # Without this, the default None would be interpolated as the literal
    # directory name "None".
    directory = "." if metadata_directory is None else metadata_directory
    short_id = page.item_id.split("_")[-1]
    metadata_xml = f"{directory}/http%3A%2F%2Fdata.theeuropeanlibrary.org%2FBibliographicResource%2F{short_id}.edm.xml"
    return get_metadata_from_xml(metadata_xml)

In [None]:
# The metadata file found for the page must carry the page's short item id.
metadata = get_metadata_for_page(page, metadata_directory="test_data_metadata")
assert metadata
assert isinstance(metadata, NewspaperPageMetadata)
short_item_id = page.item_id.rsplit("_", 1)[-1]
assert short_item_id in metadata.metadata_xml_fname

## Issue processor

In [None]:
# |export
@define(slots=True)
class NewspaperPage:
    """A newspaper page combining its ALTO-derived fields with its metadata.

    `multi_language` is not passed to the constructor; it is computed after
    initialisation from the cleaned `languages` value.
    """

    fname: Union[str, Path]
    text: Optional[str]
    mean_ocr: Optional[float]
    std_ocr: Optional[float]
    bounding_boxes: List[Union[float, None]]
    item_id: str
    metadata_xml_fname: Union[str, Path]
    title: Optional[str]
    date: Optional[str]
    languages: Union[List[str], None]
    item_iiif_url: Optional[str]
    # all_metadata_dict: Dict[Any, Any]
    multi_language: bool = field(init=False)

    def __attrs_post_init__(self):
        """Stringify both paths, drop "==" language entries, set the flag."""
        self.fname = str(self.fname)
        self.metadata_xml_fname = str(self.metadata_xml_fname)
        has_language_list = isinstance(self.languages, list)
        if has_language_list:
            # "==" entries are placeholders, not language codes.
            self.languages = [code for code in self.languages if code != "=="]
        # True only when more than one language remains after filtering.
        self.multi_language = has_language_list and len(self.languages) > 1

In [None]:
# |export
def process_newspaper_page(
    xml_file: Union[str, Path], metadata_directory: Optional[str] = None
) -> "NewspaperPage":
    """Parse an ALTO page and merge it with its metadata.

    Args:
        xml_file: Path to the ALTO XML file of the page.
        metadata_directory: Directory holding the metadata XML files; passed
            through to `get_metadata_for_page`.

    Returns:
        A `NewspaperPage` built from the fields of the parsed page and of its
        metadata, without the bulky `all_metadata_dict`.
    """
    alto_page = parse_newspaper_page(xml_file)
    page_metadata = get_metadata_for_page(
        alto_page, metadata_directory=metadata_directory
    )
    metadata_fields = asdict(page_metadata)
    # `NewspaperPage` has no field for the raw metadata mapping.
    metadata_fields.pop("all_metadata_dict")
    return NewspaperPage(**asdict(alto_page), **metadata_fields)

In [None]:
xmls[0]

Path('test_data/9200357/BibliographicResource_3000095232526/8.xml')

In [None]:
# Each of the first 32 files must produce a truthy page object.
sample_pages = (
    process_newspaper_page(xml_path, metadata_directory="test_data_metadata")
    for xml_path in xmls[:32]
)
assert all(sample_pages)

In [None]:
process_newspaper_page(xmls[0], metadata_directory="test_data_metadata")

NewspaperPage(fname='test_data/9200357/BibliographicResource_3000095232526/8.xml', text='k,- :oło =z * -i : i ; r. . a: k Podziękowanie. \'ii.".*.- .*-ó h7 »ka naistn ceezniej**e podzię- ownym Punu>in :-,r\'ys»ko!n St .k >wski*j i ?.6 nip, fraz Panom .>rt st m: Mis.Tń- •j, ■ rof. l wft!i-jott«k!<?raa, >•••.-.» >■\' i "•zk .lv *ł i\' < ■? r ::i ••I. oraz ch-i- te; oper -wej, Przy Ki -rnu, Pa- vsk.- mu i Rybickiemu. er.\'.2 firm e „Ri.-g.-rt i Gm- za !>? v :n t(Ti sok ne uJ.yczenie forter -iarti na kon- urzą\'.zi.myiri z nr eto na doebó. u. 24 !i<?a w s.iU T .w hyejo nie- ; nu żi.ych ucz-Timo. S^ro :U>-vzr.io. W oęł^zi-niu .Podwieczorek* Vi »\' nfc Criiial na K ło >vi isow .Ni ,.!j, mylnie wy- rru w a n-!*^ .Dnia 8- go grudnia, t. j w piątek, od P>-.z. p-ej do \'d-ej wie-.-iorem". winno być: .Duia S-go grudnia, t. j. w sobotę, od g-<dz. 5-oj do" l-ej u ieczo- re:n". Przy stoliku 7- z as i,-idą. ojtócz już wymienio nych, r,p. bzy monostwo JdurtelińBcy oraz p. Włodzi mierz harnecki. 12.\':

## Load into datasets

In [None]:
page = process_newspaper_page(xmls[200], metadata_directory="test_data_metadata")
page

NewspaperPage(fname='test_data/9200357/BibliographicResource_3000095238478/2.xml', text='— Zwycłeżenl zwvefęzcy. »Oaz. Robotnlczaa wskazuje pód tym tytu ł em na przeobrażenie, jakie się dokonało w Grecyi Pisze ona: „Wiadomo, że król grecki Konstantyn, szwagier Wilhelma U, ni\'- chciał pozwolić na wciągnięcie swego ubogiego kraju w wojnę i ze wszystkich sił starał się utrzynać Grecyc w neutralności; ale przeciwdziałał te- wu zagorzały zwolennik ententy Venizelos, który nie epsczął, aż przeprowadził detronizacyę i wygnanie króla i wciągnięcie Grecyi do koalicyi i do wojny. Grecya wyszła z wojny wzbogacona nowymi obszara mi na Bałkanie i nowemi pos ; adłościami w Azyi Mniej szej. ale gospodarczo zrujnowana doszczętnie. Nędza i głód, szerzące się wśród zwycięskich Greków, skie rowały ich umysły przeciw wszechpotężnemu dotąd prezydentowi ministrów, istnemu dyktatorowi Venize- Icsrwi i na je-o g\'owę zaczęty się sypać przekleństwa ludności za zubożenie kraju, za oszukanie nadziei przy wiązyw

## Dump to parquet

In [None]:
# |export
from datasets import Dataset
from datasets import Value, Sequence, Features

In [None]:
# |export
# Explicit column schema for the datasets built from `NewspaperPage` records.
# `id` and `length` are left at their defaults (None and -1 respectively).
features = Features(
    {
        "fname": Value("string"),
        "text": Value("string"),
        "mean_ocr": Value("float64"),
        "std_ocr": Value("float64"),
        # A variable-length list of variable-length lists of floats.
        "bounding_boxes": Sequence(Sequence(Value("float64"))),
        "item_id": Value("string"),
        "metadata_xml_fname": Value("string"),
        "title": Value("string"),
        "date": Value("string"),
        "languages": Sequence(Value("string")),
        "item_iiif_url": Value("string"),
        "multi_language": Value("bool"),
    }
)


In [None]:
# |export
@logger.catch()
def process_batch(xml_batch: Iterable[Union[str, Path]], metadata_directory=None):
    """Process a batch of ALTO files into a single `Dataset`.

    Args:
        xml_batch: Paths of the ALTO XML files in this batch.
        metadata_directory: Directory holding the metadata XML files; passed
            through to `process_newspaper_page`.

    Returns:
        A `Dataset` following the module-level `features` schema, with one
        row per input file. An empty batch gives a dataset with zero rows.
        Exceptions raised here are handled by the `logger.catch()` decorator
        rather than by this function.
    """
    rows = [
        asdict(process_newspaper_page(xml, metadata_directory=metadata_directory))
        for xml in xml_batch
    ]
    if not rows:
        # No first row to take the column names from: build the empty columns
        # from the schema instead of failing on `rows[0]`.
        return Dataset.from_dict({name: [] for name in features}, features=features)

    # Turn the list of row dicts into a dict of column lists.
    columns = {key: [row[key] for row in rows] for key in rows[0]}

    return Dataset.from_dict(columns, features=features)

In [None]:
# A batch of 32 files must give 32 rows with exactly these columns, in order.
expected_columns = [
    "fname",
    "text",
    "mean_ocr",
    "std_ocr",
    "bounding_boxes",
    "item_id",
    "metadata_xml_fname",
    "title",
    "date",
    "languages",
    "item_iiif_url",
    "multi_language",
]
ds = process_batch(xmls[:32], metadata_directory="test_data_metadata")
assert len(ds) == 32
assert len(ds.column_names) == 12
assert ds.column_names == expected_columns

In [None]:
ds.features

{'fname': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'mean_ocr': Value(dtype='float64', id=None),
 'std_ocr': Value(dtype='float64', id=None),
 'bounding_boxes': Sequence(feature=Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), length=-1, id=None),
 'item_id': Value(dtype='string', id=None),
 'metadata_xml_fname': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'date': Value(dtype='string', id=None),
 'languages': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'item_iiif_url': Value(dtype='string', id=None),
 'multi_language': Value(dtype='bool', id=None)}

In [None]:
# |export
import multiprocessing 

def process(
    xml_files: Iterable[Union[str, Path]],
    batch_size: int = 32,
    metadata_directory: Optional[str] = None,
    max_workers: Optional[int] = None,
):
    """Process ALTO files in parallel, one `process_batch` call per batch.

    Args:
        xml_files: Paths of the ALTO XML files to process. Any iterable is
            accepted; it is materialised into a list up front.
        batch_size: Number of files handed to each `process_batch` call.
        metadata_directory: Directory holding the metadata XML files.
        max_workers: Number of worker processes. When falsy, the value of
            `multiprocessing.cpu_count()` is used.

    Returns:
        A list with the result of each `process_batch` call, in the order the
        batches were submitted.
    """
    # Materialise first so generators (which have no `len`) are supported.
    xml_files = list(xml_files)
    # Round up: a final, partially filled batch still counts as one batch.
    total_batches = (len(xml_files) + batch_size - 1) // batch_size
    if not max_workers:
        max_workers = multiprocessing.cpu_count()
    futures = []
    with tqdm(total=total_batches) as pbar:
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            for batch in partition_all(batch_size, xml_files):
                future = executor.submit(
                    process_batch, list(batch), metadata_directory=metadata_directory
                )
                # Advance the progress bar as each batch finishes.
                future.add_done_callback(lambda _finished: pbar.update(1))
                futures.append(future)
    # Leaving the executor block waits for every future, so collecting in
    # submission order is non-blocking and keeps the output deterministic.
    return [future.result() for future in futures]



In [None]:
datasets = process(xmls[:10], metadata_directory="test_data_metadata")

0it [00:00, ?it/s]

In [None]:
from datasets import concatenate_datasets

In [None]:
len(xmls)

108432

In [None]:
dataset = concatenate_datasets(datasets)
dataset

Dataset({
    features: ['fname', 'text', 'mean_ocr', 'std_ocr', 'bounding_boxes', 'item_id', 'metadata_xml_fname', 'title', 'date', 'languages', 'item_iiif_url', 'multi_language'],
    num_rows: 10
})