In [1]:
import xml.etree.ElementTree as ET

In [2]:
def xml2text(xml_path, output_file=None):
    # ALTO XML uses a default namespace
    ns = {'alto': 'http://www.loc.gov/standards/alto/ns-v3#'}

    # Parse the XML file
    tree = ET.parse(xml_path)
    root = tree.getroot()

    lines = []

    # Traverse each TextLine in the XML
    for text_line in root.findall('.//alto:TextLine', ns):
        words = [string.attrib.get("CONTENT", "") for string in text_line.findall("alto:String", ns)]
        line_text = " ".join(words)
        if line_text.strip():
            lines.append(line_text)

    full_text = "\n".join(lines)

    # Optionally save to file
    if output_file:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(full_text)

In [1]:
import requests

api_key = "OYSi9Dygc0XZ0Nvq2vgPxe4oXNmomCtWWZHM7CVd3Fo7iC0qKge1748029090188"  
headers = {
    "Authorization": f'OAuth oauth_consumer_key="{api_key}"',
    "Accept": "application/json",
}

item_url = f"https://api.deutsche-digitale-bibliothek.de/items/SYPRUTDU2J2V62O2SDIJE6LFPD6G2VBB"
response = requests.get(item_url, headers=headers)

In [2]:
print(response.status_code)

200


In [10]:
data = response.json()
issued = data['edm']['RDF']['ProvidedCHO']['issued']
publisher = data['edm']['RDF']['ProvidedCHO']['publisher']['$']
print(f"Issued: {issued}, Type: {type(issued)}")
print(f"Publisher: {publisher}, Type: {type(publisher)}")

Issued: 1920-01-25, Type: <class 'str'>
Publisher: Berlin : Vorwärts-Verlag, Type: <class 'str'>


In [None]:
def extract_metadata(data):
    record = data.get("source", {}).get("record", {})
    metadata = {
        "publisher": "Unknown",
        "publication_date": "Unknown"
    }

    try:
        mods = record.get("mods:mods", {})
        metadata["publisher"] = mods.get("mods:originInfo", {}).get("mods:publisher", "Unknown")
        metadata["publication_date"] = mods.get("mods:originInfo", {}).get("mods:dateIssued", "Unknown")
    except Exception:
        pass  # Return default "Unknown" if keys are missing

    return metadata

In [9]:

meta = extract_metadata(data)
issued = data['edm']['RDF']['ProvidedCHO']['issued']
publisher = data['edm']['RDF']['ProvidedCHO']['publisher']['$']
print(meta)

{'title': 'Unknown', 'publisher': 'Unknown', 'publication_date': 'Unknown'}


In [None]:
{
'properties': {
    'item-id': 'SYPRUTDU2J2V62O2SDIJE6LFPD6G2VBB', 
    'dataset-id': '30678710177457952dKCx', 
    'dataset-label': 'Gesamtlieferung (Zeitungsportal) - Bibliothek FES (oid1611770703208) - METS/MODS', 
    'revision-id': '2', 
    'ingest-date': '2023-02-24T14:18:52+0100', 
    'cortex-type': 'Kultur', 
    'mapping-version': '4.5'}, 
'edm': {
    'RDF': {
        'Aggregation': {
            '@about': 'http://www.zvdd.de/record/DE-Bo133/VW37045/dmd', 
            'aggregatedCHO': {'@resource': 'http://www.deutsche-digitale-bibliothek.de/item/SYPRUTDU2J2V62O2SDIJE6LFPD6G2VBB'}, 
            'dataProvider': ['Bibliothek der Friedrich-Ebert-Stiftung', {'@resource': 'http://www.deutsche-digitale-bibliothek.de/organization/CZTZO4SBNHW34JVKYRWW67725WWGLZA5'}], 
            'isShownAt': {'@resource': 'http://fes.imageware.de/fes/web/index.html?open=VW37045'}, 
            'isShownBy': {'@resource': 'https://library.fes.de/ddb/vw37045/VW37045_01.jpg'}, 
            'provider': 'Deutsche Digitale Bibliothek', 
            'rights': [{'@resource': 'http://creativecommons.org/licenses/by-nc-sa/3.0/'}, {'@resource': 'http://creativecommons.org/licenses/by-nc-sa/3.0/de/'}], 
            '$': '\n'}, 
        'Agent': [
            {
                '@about': 'http://www.deutsche-digitale-bibliothek.de/organization/CZTZO4SBNHW34JVKYRWW67725WWGLZA5', 
                'prefLabel': 'Bibliothek der Friedrich-Ebert-Stiftung', 
                'type': {'@resource': 'http://ddb.vocnet.org/sparte/sparte002'}, 
                'isPartOf': {'@resource': 'http://www.deutsche-digitale-bibliothek.de/organization/WQFEVKHWJSOL5OGCEQHKHOVCB3K65SSV'}, 
                '$': '\n'
            }, 
            {
                '@about': 'http://www.deutsche-digitale-bibliothek.de/organization/WQFEVKHWJSOL5OGCEQHKHOVCB3K65SSV', 
                'prefLabel': 'Archiv der sozialen Demokratie der Friedrich-Ebert-Stiftung', 
                '$': '\n'
            }, 
            {
                '@about': 'MRYYE54BVOWI5TUZGRXZQRD2YN44WISO', 
                'prefLabel': 'Vorwärts-Verlag', 
                'wasPresentAt': {'@resource': 'EIK7JCGU775QPYFHAYGTZ3V5GCFOLMWX'}, 
                '$': '\n'
            }], 
        'Concept': [
            {'@about': 'http://ddb.vocnet.org/sparte/sparte002', 'notation': 'sec_02', '$': '\n'}, 
            {'@about': 'http://ddb.vocnet.org/medientyp/mt003', 'notation': 'mediatype_003', '$': '\n'}, 
            {'@about': 'http://ddb.vocnet.org/medientyp/mt003', 'notation': 'mediatype_003', '$': '\n'}, 
            {'@about': 'http://ddb.vocnet.org/hierarchietyp/ht014', 'prefLabel': {'@lang': 'de', '$': 'Heft'}, '$': '\n'}], 
        'WebResource': [
            {
                '@about': 'http://fes.imageware.de/fes/web/index.html?open=VW37045', 
                'type': {'@resource': 'http://ddb.vocnet.org/medientyp/mt003'}, 
                '$': '\n'
            }, 
            {
                '@about': 'https://library.fes.de/ddb/vw37045/VW37045_01.jpg', 
                'rights': {'@resource': 'http://creativecommons.org/licenses/by-nc-sa/3.0/de/'}, 
                'type': {'@resource': 'http://ddb.vocnet.org/medientyp/mt003'}, 
                'created': '2021', 
                '$': '\n'
            }], 
        'ProvidedCHO': {
            '@about': 'http://www.deutsche-digitale-bibliothek.de/item/SYPRUTDU2J2V62O2SDIJE6LFPD6G2VBB', 
            'hasType': {'@resource': 'http://ddb.vocnet.org/hierarchietyp/ht014'}, 
            'hasMet': {'@resource': 'EIK7JCGU775QPYFHAYGTZ3V5GCFOLMWX'}, 
            'type': {'@lang': 'de', '$': 'Heft'}, 
            'identifier': ['http://fes.imageware.de/fes/web/index.html?open=VW37045', 'http://www.zvdd.de/object/DE-Bo133/VW37045/log', 'Bibliothek der Friedrich-Ebert-Stiftung -- XX 500'], 
            'language': {'@resource': 'http://id.loc.gov/vocabulary/iso639-2/ger'}, 
            'publisher': {'@lang': 'zxx', '$': 'Berlin : Vorwärts-Verlag'}, 
            'title': {'@lang': 'zxx', '$': 'Vorwärts'}, 
            'extent': '12 Seiten', 
            'issued': '1920-01-25', 
            'hierarchyType': 'htype_014', 
            '$': '\n'}, 'LinguisticSystem': {'@about': 'http://id.loc.gov/vocabulary/iso639-2/ger', 'value': 'ger', '$': '\n'}, 'Event': {'@about': 'EIK7JCGU775QPYFHAYGTZ3V5GCFOLMWX', 'hasType': {'@resource': 'http://terminology.lido-schema.org/eventType/publication'}, 'P11_had_participant': {'@resource': 'MRYYE54BVOWI5TUZGRXZQRD2YN44WISO'}, 'occuredAt': {'@resource': 'JGSW5TGAKM2VD6TCPHQPQUNXS4IGD6DR'}, '$': '\n'}, 'TimeSpan': {'@about': 'JGSW5TGAKM2VD6TCPHQPQUNXS4IGD6DR', 'begin': '1920-01-25', 'end': '1920-01-25', '$': '\n'}, '$': '\n'}}, 'indexing-profile': {'item-id': 'SYPRUTDU2J2V62O2SDIJE6LFPD6G2VBB', 'facet': [{'@name': 'affiliate_fct', 'value': 'Vorwärts-Verlag'}, {'@name': 'affiliate_fct_autocomplete', 'value': 'Vorwärts-Verlag'}, {'@name': 'affiliate_fct_role', 'value': ['Vorwärts-Verlag', 'Vorwärts-Verlag_1_affiliate_fct_involved']}, {'@name': 'affiliate_fct_role_autocomplete', 'value': ['Vorwärts-Verlag', 'Vorwärts-Verlag_1_affiliate_fct_involved']}, {'@name': 'keywords_fct', 'value': 'Heft'}, {'@name': 'keywords_fct_autocomplete', 'value': 'Heft'}, {'@name': 'keywords_fct_normdata', 'value': 'http://ddb.vocnet.org/hierarchietyp/ht014'}, {'@name': 'objecttype_fct', 'value': 'Heft'}, {'@name': 'objecttype_fct_autocomplete', 'value': 'Heft'}, {'@name': 'language_fct', 'value': 'ger'}, {'@name': 'type_fct', 'value': 'mediatype_003'}, {'@name': 'provider_fct', 'value': ['Bibliothek der Friedrich-Ebert-Stiftung', 'Archiv der sozialen Demokratie der Friedrich-Ebert-Stiftung']}, {'@name': 'provider_fct_autocomplete', 'value': ['Bibliothek der Friedrich-Ebert-Stiftung', 'Archiv der sozialen Demokratie der Friedrich-Ebert-Stiftung']}, {'@name': 'provider_fct_normdata', 'value': ['http://www.deutsche-digitale-bibliothek.de/organization/CZTZO4SBNHW34JVKYRWW67725WWGLZA5', 'http://www.deutsche-digitale-bibliothek.de/organization/WQFEVKHWJSOL5OGCEQHKHOVCB3K65SSV']}, {'@name': 'sector_fct', 'value': 'sec_02'}, {'@name': 'last_update', 'value': '2023-02-24T14:18:52+0100'}]}, 'preview': {'@media': 'text', '@type': 'Kultur', 'title': 'Vorwärts', 'subtitle': '1920-01-25', 'thumbnail': {'@href': 'e861414f-6d88-4bf1-bfe7-48b98c17ffdc'}}, 'view': {'item': {'identifier': 'http://www.zvdd.de/object/DE-Bo133/VW37045/log', 'label': 'Vorwärts', 'title': 'Vorwärts', 'subtitle': '', 'rights': '', 'metadata-rights': 'http://creativecommons.org/licenses/by-nc-sa/3.0/de/', 'origin': 'http://fes.imageware.de/fes/web/index.html?open=VW37045', 'category': 'Kultur', 'media': 'text', 'thumbnail': '', 'latitude': '', 'longitude': '', 'viewers': {'viewer': {'name': {'@id': 'dfgKey', '$': 'DFG-Viewer'}, 'url': 'http://dfg-viewer.de/show/?tx_dlf[id]/tx_dlf[page]'}}, 'license': {'@resource': 'http://creativecommons.org/licenses/by-nc-sa/3.0/de/', '$': ''}, 'institution': {'url': '', 'id': 'CZTZO4SBNHW34JVKYRWW67725WWGLZA5', 'name': 'Bibliothek der Friedrich-Ebert-Stiftung', 'logo-institution-ddbid': 'CZTZO4SBNHW34JVKYRWW67725WWGLZA5'}, 'fields': [{'@usage': 'display', 'field': [{'@id': 'flex_bibl_003', 'name': 'Dokumenttyp', 'value': 'Heft'}, {'@id': 'flex_bibl_004', 'name': 'Erschienen in', 'value': 'Vorwärts'}, {'@id': 'flex_bibl_008', 'name': 'Sprache', 'value': 'Deutsch'}, {'@id': 'flex_bibl_009', 'name': 'Umfang', 'value': '12 Seiten'}, {'@id': 'flex_bibl_013a', 'name': 'PURL', 'value': 'http://fes.imageware.de/fes/web/index.html?open=VW37045'}, {'@id': 'flex_bibl_014', 'name': 'Standort', 'value': 'Bibliothek der Friedrich-Ebert-Stiftung -- XX 500'}]}, {'@usage': 'index', 'field': [{'@id': 'title', '@resource': 'http://purl.org/dc/elements/1.1/title', 'name': 'Title', 'value': 'Vorwärts'}, {'@id': 'ranking', 'name': 'Ranking', 'value': 'Heft'}, {'@id': 'begin_time', 'name': 'Anfang', 'value': '700925'}, {'@id': 'end_time', 'name': 'Ende', 'value': '700925'}, {'@id': 'license', 'name': 'Lizenz', 'value': 'http://creativecommons.org/licenses/by-nc-sa/3.0/de/'}, {'@id': 'license_group', 'name': 'Lizenzstatus', 'value': 'rights_002'}, {'@id': 'digitalisat', 'name': 'digitalisat', 'value': 'true'}, {'@id': 'dataset_id', 'name': 'dataset_id', 'value': '30678710177457952dKCx'}, {'@id': 'dataset_label', 'name': 'dataset_label', 'value': 'Gesamtlieferung (Zeitungsportal) - Bibliothek FES (oid1611770703208) - METS/MODS'}, {'@id': 'delivery_id', 'name': 'delivery_id', 'value': '30698936812512492Jhis'}, {'@id': 'supplier_id', 'name': 'supplier_id', 'value': 'oid1643883383221'}, {'@id': 'mapping_version', 'name': 'mapping_version', 'value': '4.5'}, {'@id': 'md_format', 'name': 'Metadata Format', 'value': 'newspaper-mets'}, {'@id': 'source_format', 'name': 'source_format', 'value': 'http://www.loc.gov/METS/'}]}]}},




