Import lxml library 

In [1]:
from lxml import etree

Declare input directory 

In [2]:
tree = etree.parse('input.xml')

Declare TEI namespace

In [3]:
ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

Get root of XML document and store in var

In [4]:
root = tree.getroot()

Store useful URIs in variables for later (one from xml:base, one from TEI element id)

In [5]:
base_uri = root.get('{http://www.w3.org/XML/1998/namespace}base')
edition_id = root.get('{http://www.w3.org/XML/1998/namespace}id')

In [6]:
from rdflib import Graph, Literal, BNode, Namespace, RDF, URIRef, RDFS

In [7]:
from rdflib.namespace import XSD, DCTERMS, OWL
agrelon = Namespace("https://d-nb.info/standards/elementset/agrelon#")
crm = Namespace("http://www.cidoc-crm.org/cidoc-crm/")
frbroo = Namespace("http://iflastandards.info/ns/fr/frbr/frbroo/")
pro = Namespace("http://purl.org/spar/pro/")
proles = Namespace("http://www.essepuntato.it/2013/10/politicalroles/")
prov = Namespace("http://www.w3.org/ns/prov#")
schema = Namespace("https://schema.org/")
tvc = Namespace("http://www.essepuntato.it/2012/04/tvc/")

In [8]:
g = Graph()

## Person

In [9]:
def subject(person):
    #person_uri = URIRef(base_uri + '/person/' + person_id)
    g.add( (person_uri, RDF.type, schema.Person))

In [10]:
def sameas(person):    
    same_as = person.get('sameAs').split()
    i = 0
    while i < len(same_as):
        same_as_uri = URIRef(same_as[i])
        g.add( (person_uri, OWL.sameAs, same_as_uri))
        i += 1

In [11]:
def persname(person):
    persname = person.find('./tei:persName', ns)
    label = persname.text
    label_lang = persname.get('{http://www.w3.org/XML/1998/namespace}lang')
    if label_lang is not None:
        g.add( (person_uri, RDFS.label, Literal(label, lang=label_lang)))
    else:
        g.add( (person_uri, RDFS.label, Literal(label)))

In [12]:
def referenced_person(person_id):
    ref = './tei:text//tei:persName[@ref="#' + person_id + '"]'
    for referenced_person in root.findall(ref, ns):
        parent = referenced_person.getparent()
        parent_id = parent.get('{http://www.w3.org/XML/1998/namespace}id')
        parent_uri = URIRef(base_uri + '/text/' + parent_id)
        g.add( (person_uri, DCTERMS.isReferencedBy, parent_uri))
        g.add( (parent_uri, RDF.type, frbroo.F23_Expression_Fragment))
        g.add( (parent_uri, frbroo.R15i_is_fragment_of, URIRef(base_uri + '/' + edition_id)))

In [13]:
def perstype(person):
    listperson = person.find('./...', ns)
    perstype = listperson.get('type')
    perscorr = listperson.get('corresp')
    if perstype is not None:
        g.add( (person_uri, DCTERMS.description, Literal(perstype)))
    if perscorr is not None and perscorr.startswith('http'):
        g.add( (person_uri, DCTERMS.subject, URIRef(perscorr)))

In [14]:
for person in root.findall('.//tei:person', ns):
    person_id = person.get('{http://www.w3.org/XML/1998/namespace}id')
    person_uri = URIRef(base_uri + '/person/' + person_id)
    person_ref = '#' + person_id
    subject(person)
    sameas(person)
    persname(person)
    referenced_person(person_id)
    perstype(person)

## Event

In [15]:
def partic_event(person):     
    for event in person.findall('./tei:event', ns):
        event_id = event.get('{http://www.w3.org/XML/1998/namespace}id')
        partic_event_uri = URIRef(base_uri + '/' + person_id + '-in-' + event_id)
        if event is not None:
            g.add( (person_uri, pro.holdsRoleInTime, partic_event_uri))

In [16]:
def role_in_event(person):
    for event in person.findall('./tei:event', ns):
        event_id = event.get('{http://www.w3.org/XML/1998/namespace}id')
        persName = person.find('./tei:persName', ns)
        label = persName.text
        rit_uri = URIRef(base_uri + '/rit/' + person_id + '-at-' + event_id)
        pers_in_event = event.find('./tei:desc/tei:persName', ns)
    
        g.add( (rit_uri, RDF.type, pro.RoleInTime))
        
        if pers_in_event is not None and pers_in_event.get('ref') == person_ref and pers_in_event.get('role') is not None:
            role_uri = URIRef(base_uri + '/role/' + pers_in_event.get('role'))
            g.add( (rit_uri, pro.withRole, role_uri))
            g.add( (role_uri, RDF.type, pro.Role))
        
        if pers_in_event.get('corresp') is not None:
            g.add( (role_uri, OWL.sameAs, pro.Role))
            g.add( (role_uri, RDFS.label, pro.Role))
            corresp_role_uri = URIRef(pers_in_event.get('corresp'))
            g.add( (role_uri, OWL.sameAs, corresp_role_uri))
            role_label = pers_in_event.get('role')
            g.add( (role_uri, RDFS.label, Literal(role_label)))
        else:
            g.add( (rit_uri, pro.withRole, URIRef(base_uri + '/role/participant')))
            role_uri = URIRef(base_uri + '/role/participant')
            g.add( (role_uri, RDF.type, pro.Role))
            g.add( (role_uri, OWL.sameAs, URIRef('http://wordnet-rdf.princeton.edu/id/10421528-n')))
            g.add( (role_uri, RDFS.label, Literal('participant'))) 

        g.add( (rit_uri, tvc.atTime, URIRef(base_uri + '/tvc/' + event_id + '-time')))
        g.add( (rit_uri, pro.relatesToEntity, URIRef(base_uri + '/event/' + event_id)))

        place = event.find('./tei:desc/tei:placeName', ns)
        if place > 1:
            place_of_event = place.get('type="place_of_event"')
            g.add( (rit_uri, proles.relatesToPlace, URIRef(base_uri + '/place/' + place.get('ref').replace("#", ""))))
        elif event.find('./tei:desc/tei:placeName', ns) == 1:
            g.add( (rit_uri, proles.relatesToPlace, URIRef(base_uri + '/place/' + place.get('ref').replace("#", ""))))       

In [17]:
def event_time():
    g.add( (event_time_uri, RDF.type, URIRef('http://www.ontologydesignpatterns.org/cp/owl/timeinterval.owl#TimeInterval')))
    if event.get('when') is not None:
        g.add( (event_time_uri, OWL.hasIntervalStartDate, Literal(event.get('when'), datatype=XSD.date)))
        g.add( (event_time_uri, OWL.hasIntervalEndDate, Literal(event.get('when'), datatype=XSD.date)))
    if event.get('from') is not None:
        g.add( (event_time_uri, OWL.hasIntervalStartDate, Literal(event.get('from'), datatype=XSD.date)))
    if event.get('to') is not None:
        g.add( (event_time_uri, OWL.hasIntervalEndDate, Literal(event.get('to'), datatype=XSD.date)))

In [18]:
def event_desc():
    g.add( (event_uri, RDF.type, crm.E5_Event))
    g.add( (event_uri, RDF.type, schema.Event))
    if event.find('./tei:label', ns) is not None:
        label = event.find('./tei:label', ns).text
        g.add( (event_uri, RDFS.label, Literal(label)))
    if evtype is not None:
        g.add( (event_uri, DCTERMS.description, Literal(evtype)))
    if evcorr is not None and evcorr.startswith('http'):
        g.add( (event_uri, DCTERMS.subject, URIRef(evcorr)))

In [19]:
def event_source():
    source = event.find('./tei:bibl', ns)
    if source is not None:
        source_id = source.get('{http://www.w3.org/XML/1998/namespace}id')
        source_uri = URIRef(base_uri + '/source/' + source_id)
        g.add( (event_uri, prov.hasPrimarySource, source_uri))
        for event_source in root.findall('.//tei:event//tei:bibl', ns):
            g.add( (source_uri, RDF.type, prov.PrimarySource))
            if event_source.find('./tei:author', ns) is not None and event_source.find('./tei:author', ns).get('ref') is not None:
                author_ref = event_source.find('./tei:author', ns).get('ref')
                author_id = author_ref.split('#')
                g.add( (source_uri, DCTERMS.creator, URIRef(base_uri + '/person/' + author_id[1])))
            if event_source.find('.tei:title', ns) is not None:
                g.add( (source_uri, DCTERMS.title, Literal(event_source.find('.tei:title', ns).text)))
            if event_source.get('sameAs') is not None:
                sameAs = event_source.get('sameAs')
                if sameAs.startswith('http'):
                    g.add( (source_uri, OWL.sameAs, URIRef(event_source.get('sameAs')))) 
            if event_source.find('.tei:date', ns) is not None:
                evdate = event_source.find('.tei:date', ns)
                g.add( (source_uri, DCTERMS.date, Literal(evdate.get('when'), datatype=XSD.date)))

Call functions

In [20]:
for person in root.findall('.//tei:person', ns):
    person_id = person.get('{http://www.w3.org/XML/1998/namespace}id')
    person_uri = URIRef(base_uri + '/person/' + person_id)
    person_ref = '#' + person_id
    partic_event(person)
    role_in_event(person)

In [21]:
for event in root.findall('.//tei:event', ns):
    event_id = event.get('{http://www.w3.org/XML/1998/namespace}id')
    event_time_uri = URIRef(base_uri + '/' + event_id + '-time')
    event_uri = URIRef(base_uri + '/event/' + event_id)
    evcorr = event.get('corresp')
    evtype = event.get('type')
    event_time()
    event_desc()
    event_source()

## Relation

In [22]:
def relation(person):
    for relation in root.findall('.//tei:listRelation/tei:relation', ns):
        person_ref = '#' + person_id
        if relation.get('active') is not None and relation.get('active') == person_ref:
            passive = relation.get('passive').replace("#", "").split()
            i = 0
            while i < len(passive):
                g.add( (person_uri, agrelon[relation.get('name')], URIRef(base_uri + '/' + passive[i])))
                i += 1
        elif relation.get('mutual') is not None:
            relentity = relation.get('mutual').split()
            if person_ref in relentity:
                mutual = relation.get('mutual').replace("#", "").replace(person_id, "").split()
                i = 0
                while i < len(mutual):
                    g.add( (person_uri, agrelon[relation.get('name')], URIRef(base_uri + '/' + mutual[i])))
                    i += 1

In [23]:
for person in root.findall('.//tei:person', ns):
    person_id = person.get('{http://www.w3.org/XML/1998/namespace}id')
    person_uri = URIRef(base_uri + '/person/' + person_id)
    person_ref = '#' + person_id
    relation(person)

## Place

In [24]:
def place_subject(place):
    g.add( (place_uri, RDF.type, schema.Place))

In [25]:
def place_sameas(place):
    same_as = place.get('sameAs').split()
    i = 0
    while i < len(same_as):
        same_as_uri = URIRef(same_as[i])
        g.add( (place_uri, OWL.sameAs, same_as_uri))
        i += 1

In [26]:
def placename(place):
    placename = place.find('./tei:placeName', ns)
    label = placename.text
    label_lang = placename.get('{http://www.w3.org/XML/1998/namespace}lang')
    if label_lang is not None:
        g.add( (place_uri, RDFS.label, Literal(label, lang=label_lang)))
    else:
        g.add( (place_uri, RDFS.label, Literal(label)))

In [27]:
def referenced_place(place_id):
    ref = './/tei:placeName[@ref="#' + place_id + '"]'
    for referenced_place in root.findall(ref, ns):
        parent = referenced_place.getparent()
        parent_id = parent.get('{http://www.w3.org/XML/1998/namespace}id')
        parent_uri = URIRef(base_uri + '/text/' + parent_id)
        g.add( (place_uri, DCTERMS.isReferencedBy, parent_uri))
        g.add( (parent_uri, RDF.type, frbroo.F23_Expression_Fragment))
        g.add( (parent_uri, frbroo.R15i_is_fragment_of, URIRef(base_uri + '/' + edition_id)))

Call functions

In [28]:
for place in root.findall('.//tei:place', ns):
    place_id = place.get('{http://www.w3.org/XML/1998/namespace}id')
    place_uri = URIRef(base_uri + '/place/' + place_id)
    place_ref = '#' + place_id
    place_subject(place)
    place_sameas(place)
    placename(place)
    referenced_place(place_id)

In [29]:
# bind prefix
g.bind("agrelon", agrelon)
g.bind("crm", crm)
g.bind("frbroo", frbroo)
g.bind("dcterms", DCTERMS)
g.bind("schema", schema)
g.bind("owl", OWL)
g.bind("pro", pro)
g.bind("proles", proles)
g.bind("prov", prov)
g.bind("tvc", tvc)

In [30]:
print g.serialize(format='n3')

@prefix agrelon: <https://d-nb.info/standards/elementset/agrelon#> .
@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix frbroo: <http://iflastandards.info/ns/fr/frbr/frbroo/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix pro: <http://purl.org/spar/pro/> .
@prefix proles: <http://www.essepuntato.it/2013/10/politicalroles/> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix schema: <https://schema.org/> .
@prefix tvc: <http://www.essepuntato.it/2012/04/tvc/> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<http://example.org/ev01-time> a <http://www.ontologydesignpatterns.org/cp/owl/timeinterval.owl#TimeInterval> ;
    owl:hasIntervalEndDate "-0399"^^xsd:date ;
    owl:hasIntervalStartDate "-0399"^^xsd:date .

<http://example.org/ev02-time> a

In [31]:
g.serialize(destination="output.xml", format='xml')