In [296]:
# Colab-only setup: mount Google Drive at /content/drive. The CSV lookup tables
# and the play XML files read further down are all opened from
# '/content/drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [329]:
import re
from lxml import etree
import html as html_
!pip install titlecase
from titlecase import titlecase
!pip install googletrans
from googletrans import Translator
translator = Translator()
import csv
import json
import datetime
from xml.etree import ElementTree as ET
from copy import deepcopy



In [0]:
# Namespace part (in '{uri}' form) of attributes in the predefined XML namespace.
# The code below builds attribute keys as prefix + 'id', prefix + 'lang' and
# prefix + 'base' to read, set or delete xml:id, xml:lang and xml:base.
prefix = '{http://www.w3.org/XML/1998/namespace}'

In [0]:
def preprocess_xml(xml):
    """Apply textual repairs to the raw XML string before it is parsed.

    The substitutions run in the order listed; the patched string is returned.
    """
    repairs = (
        # replace soft hyphen by true hyphen
        (r'\xad', '-'),
        # add the missing '#' in front of the second speaker reference
        (r'(<sp who="#spCarl) (spDoktorn">)', r'\1 #\2'),
        # turn the mismatched closing tag "</tei:idno>" of a dracor <idno> into
        # "</idno>" (so that etree can parse the file)
        (r'(<idno type="dracor">)([^<>]*?)</tei:idno>', r'\1\2</idno>'),
    )
    for pattern, replacement in repairs:
        xml = re.sub(pattern, replacement, xml)
    return xml

In [0]:
def get_root(xml):
    """Parse an XML string and return its root element.

    The parser is created with remove_blank_text=True.
    """
    return etree.fromstring(xml, parser=etree.XMLParser(remove_blank_text=True))

In [0]:
def add_particDesc_to_profileDesc(profileDesc):
    """Append a new, empty <particDesc> to `profileDesc` and return it."""
    return etree.SubElement(profileDesc, 'particDesc')

In [0]:
def add_listPerson_to_particDesc(particDesc):
    """Append a new, empty <listPerson> to `particDesc` and return it."""
    return etree.SubElement(particDesc, 'listPerson')

In [0]:
def add_textClass_to_profileDesc(profileDesc):
    """Append a new, empty <textClass> to `profileDesc` and return it."""
    return etree.SubElement(profileDesc, 'textClass')

In [0]:
def add_keywords_to_textClass(textClass):
    """Append a new, empty <keywords> to `textClass` and return it."""
    return etree.SubElement(textClass, 'keywords')

In [0]:
def add_term_to_keywords(keywords):
    """Append a placeholder <term type="genreTitle" subtype=""> to `keywords`.

    The term starts with an empty subtype and empty text; it is returned so the
    caller can fill both in later.
    """
    genre_term = etree.SubElement(keywords, 'term', type='genreTitle', subtype='')
    genre_term.text = ''
    return genre_term

In [0]:
def add_docTitle_to_front(front):
    """Append a new, empty <docTitle> to `front` and return it."""
    return etree.SubElement(front, 'docTitle')

In [0]:
def add_titlePart_main_to_docTitle(docTitle, title):
    """Append <titlePart type="main"> holding `title` to `docTitle` and return it.

    Double quotes at either end of `title` are stripped before it is stored.
    """
    main_part = etree.SubElement(docTitle, 'titlePart', type='main')
    main_part.text = title.strip('"')
    return main_part

In [0]:
def add_titlePart_sub_to_docTitle(docTitle, title):
    """Append <titlePart type="sub"> holding `title` (unchanged) to `docTitle` and return it."""
    sub_part = etree.SubElement(docTitle, 'titlePart', type='sub')
    sub_part.text = title
    return sub_part

In [0]:
def inner_xml(element):
    """Return the content of `element` as a string.

    The result is the element's leading text (empty when there is none)
    followed by every direct child serialised with ET.tostring.
    """
    pieces = [element.text or '']
    for child in element:
        pieces.append(ET.tostring(child, 'unicode'))
    return ''.join(pieces)

In [0]:
def replace_func(string):
    """Return `string` with the lower-case letters ä, å, ö, é, è replaced by a, a, o, e, e.

    Upper-case letters and every other character are left untouched.
    """
    ascii_table = str.maketrans(dict(zip('äåöéè', 'aaoee')))
    return string.translate(ascii_table)

In [0]:
def recursive_clean(div):
    """Clean `div` and all of its descendants in place.

    For every visited node:
    * an <epilogue> loses its xml:id attribute;
    * an <sp> gets each reference in its `who` attribute rewritten: the leading
      '#' (and an 'sp' right after it, if present) is dropped, an underscore is
      inserted between a capitalised word and a following capital letter, '#'
      is put back in front, and the joined result is passed through
      replace_func;
    * non-empty text has runs of two or more whitespace characters collapsed to
      one space, is stripped, and has any remaining newline turned into a space.
    """
    xml_id = prefix + 'id'
    if div.tag == 'epilogue' and xml_id in div.attrib:
        del div.attrib[xml_id]

    if div.tag == 'sp' and 'who' in div.attrib:
        references = []
        for raw_ref in div.attrib['who'].split():
            # drop '#', plus the 'sp' prefix when the reference starts with '#sp'
            bare = raw_ref[3:] if raw_ref.startswith('#sp') else raw_ref[1:]
            spaced = re.sub(r'([A-ZÅÄÖ][a-zåäö]+)(?=[A-ZÅÄÖ])', r'\1 ', bare)
            references.append('#' + spaced.replace(' ', '_'))
        div.attrib['who'] = replace_func(' '.join(references))

    for child in div:
        recursive_clean(child)

    if div.text:
        collapsed = re.sub(r'\s{2,}', ' ', div.text).strip()
        div.text = re.sub('\n', ' ', collapsed)

In [0]:
def _load_csv_mapping(path, key_column, value_column):
    """Read a comma-separated file and map one column to another.

    The first row is treated as a header and skipped. For every remaining row,
    row[key_column] becomes a key and row[value_column] its value; a later row
    with the same key overwrites an earlier one.
    """
    # newline='' is what the csv module asks for, so that newlines inside
    # quoted fields are interpreted by the reader and not by the file object.
    with open(path, 'r', encoding='utf-8', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # skip header
        return {row[key_column]: row[value_column] for row in reader}


# Looked up in process_root by the author's assembled name; only the part of
# the value after the last '/' is used there.
authors_to_wikidata_urls = _load_csv_mapping(
    '/content/drive/My Drive/SweDraCor_authors_Wikidata_ids.csv', 0, 1)

# Looked up in process_root by the play's main title (surrounding double
# quotes stripped); only the part of the value after the last '/' is used.
plays_to_wikidata_urls = _load_csv_mapping(
    '/content/drive/My Drive/SweDraCor_plays_Wikidata_ids.csv', 1, 3)

def process_root(root):
    """Restructure one parsed play in place and write the result to Google Drive.

    `root` is the root element of a play whose tags have already had their
    namespaces stripped. The function expects exactly two children
    (teiHeader, text) and a fixed child layout below them; most deviations
    surface as ValueError / IndexError from the unpacking statements.

    Steps, in order:
      1. titleStmt: drop an editionStmt if present, tag the Swedish title(s)
         with xml:lang="sv", add English titles obtained from
         `translator.translate(..., src='sv', dest='en')`, set the author's
         `key` from `authors_to_wikidata_urls`, remove all remaining titleStmt
         children.
      2. publicationStmt: overwrite publisher/idno/licence and append a
         wikidata <idno> from `plays_to_wikidata_urls`.
      3. sourceDesc: replace the first child (biblStruct or listBibl) by a
         <bibl type="digitalSource"> containing a <bibl type="originalSource">.
      4. profileDesc / revisionDesc: add particDesc/listPerson,
         textClass/keywords/term and a <change> stamped with today's date.
      5. text: move an epilogue and the trailing front matter (up to and
         including the castList) into body, add a docTitle to front, clean
         the body divisions and normalise every sp/@who.
      6. Build the listPerson entries for all speakers from the person lists
         found in the last <div> of <back>.
      7. Serialise, apply whitespace clean-up regexes and write the file to
         '/content/drive/My Drive/swedracor/converted_plays/'.

    Returns None. Raises KeyError when the author name or the main title is
    missing from the Wikidata lookup tables, or when a speaker has no matching
    entry in the person lists. Two diagnostic lines are printed per play.
    """
    teiHeader, text = root.getchildren()
    fileDesc, encodingDesc, profileDesc = teiHeader.getchildren()
    try:
        titleStmt, publicationStmt, sourceDesc = fileDesc.getchildren()
    except ValueError:
        titleStmt, editionStmt, publicationStmt, sourceDesc = fileDesc.getchildren()
        # NB: <edition>eDrama</edition>
        fileDesc.remove(editionStmt)

    titleStmt_children = titleStmt.getchildren()

    sub_title = None
    if titleStmt_children[1].tag == 'title':
        # there is a sub-title
        if titleStmt_children[2].tag == 'title':
            # two sub-titles: the second is folded into the first in parentheses
            main_title, sub_title, sub_title_two, author, *others = titleStmt_children
            sub_title.text += ' (' + sub_title_two.text + ')'
        else:
            main_title, sub_title, author, *others = titleStmt_children
    else:
        # there is no sub-title
        main_title, author, *others = titleStmt_children

    # add xml:lang attribute to the title(s):
    main_title.set(prefix + "lang", "sv")
    if main_title.text is not None:
        main_title.text = main_title.text.strip('"')
    if sub_title is not None:
        sub_title.set(prefix + "lang", "sv")

    # add translations (English) of titles to the 'titleStmt':
    main_title_en = etree.Element('title', type='main')

    # set xml:lang
    main_title_en.set(prefix + "lang", "en")

    main_title_text = main_title.text
    if main_title_text is None:
        # the title text sits in child elements: join all text pieces with spaces
        main_title_text = ''
        for iter_main_title in main_title.itertext():
            main_title_text += iter_main_title.strip() + ' '
        main_title_text = main_title_text.strip()
    en_translation = translator.translate(main_title_text, src='sv', dest='en').text

    # capitalize non-functional words
    main_title_en.text = titlecase(en_translation).strip().strip('"')
    # insert after the Swedish main title
    main_title.addnext(main_title_en)

    if sub_title is not None:
        sub_title_en = etree.Element('title', type='sub')

        # set xml:lang
        sub_title_en.set(prefix + "lang", "en")

        en_translation = translator.translate(sub_title.text, src='sv', dest='en').text
        # capitalize non-functional words
        sub_title_en.text = titlecase(en_translation).strip()
        # insert after the Swedish sub-title
        sub_title.addnext(sub_title_en)

    # assemble the author's name from the parts found under the author's first child
    forename, nameLink, surname = [None] * 3
    for child in author.getchildren()[0].getchildren():
        if child.tag == 'forename':
            forename = child.text
        elif child.tag == 'nameLink':
            nameLink = child.text
        elif child.tag == 'surname':
            surname = child.text

    name = ''
    if forename is not None:
        name += forename + ' '
    if nameLink is not None:
        name += nameLink + ' '
    if surname is not None:
        name += surname
    name = name.strip()
    if not name:
        # no structured name parts: fall back to the plain text, which must
        # consist of exactly two whitespace-separated words
        name = author.getchildren()[0].text
        forename, surname = name.split()

    # NB: remove xml:id
    if prefix + 'id' in author.attrib:
        del author.attrib[prefix + 'id']
    # add key
    author.set('key', 'wikidata:' + authors_to_wikidata_urls[name].split('/')[-1])

    # NB: delete reference information (sponsors/funders/editors/principals/encoders)
    for other in others:
        titleStmt.remove(other)

    publisher, address, first_idno, second_idno, availability = publicationStmt.getchildren()

    publisher.text = 'DraCor'
    publisher.set(prefix + "id", "dracor")

    # remove address
    publicationStmt.remove(address)

    first_idno.set('type', 'URL')
    first_idno.text = 'https://dracor.org'

    second_idno.set(prefix + "base", "https://dracor.org/id/")

    del availability.attrib['status']

    # empty the licence element and refill it with a CC0 statement
    licence = availability.getchildren()[0]
    del licence.attrib['target']
    for child in licence.getchildren():
        licence.remove(child)

    ab = etree.Element('ab')
    ab.text = 'CC0'
    licence.append(ab)

    ref = etree.Element('ref', target='https://creativecommons.org/publicdomain/zero/1.0/')
    ref.text = 'Licence'
    licence.append(ref)

    idno = etree.Element('idno', type='wikidata')
    idno.set(prefix + "base", "https://www.wikidata.org/entity/")
    idno.text = plays_to_wikidata_urls[main_title_text.strip('"')].split('/')[-1]
    publicationStmt.append(idno)

    # detach the original bibliographic description; its parts are re-used below
    biblStruct = sourceDesc.getchildren()[0]

    sourceDesc.remove(biblStruct)

    if biblStruct.tag == 'listBibl':
        # NB: listBibl
        print("listBibl here")
        biblStruct = biblStruct.getchildren()[0]

    biblStruct_children = biblStruct.getchildren()
    monogr = None
    for biblStruct_child in biblStruct_children:
        if biblStruct_child.tag == 'monogr':
            monogr = biblStruct_child
            break

    titles, idno, imprint = [None] * 3

    for monogr_child in monogr.getchildren():
        if monogr_child.tag == 'title':
            if titles is None:
                titles = []
            titles.append(monogr_child)
        elif monogr_child.tag == 'idno':
            idno = monogr_child
        elif monogr_child.tag == 'imprint':
            imprint = monogr_child

    # a wrapper <title> without own text whose last child is the <idno>:
    # take the idno from there and use the remaining children as titles
    if titles[0].text is None and titles[0][-1].tag == 'idno':
        idno = titles[0][-1]
        titles = titles[0][:-1]

    url = idno.text

    pubPlace, publisher, date, biblScopes, respStmt = [None] * 5
    for imprint_child in imprint.getchildren():
        if imprint_child.tag == 'pubPlace':
            pubPlace = imprint_child
        elif imprint_child.tag == 'publisher':
            publisher = imprint_child
        elif imprint_child.tag == 'date':
            date = imprint_child
        elif imprint_child.tag == 'biblScope':
            if biblScopes is None:
                biblScopes = []
            biblScopes.append(imprint_child)
        elif imprint_child.tag == 'respStmt':
            respStmt = imprint_child
        # any other imprint child is not captured here, but is still moved
        # into the new bibl below together with the rest

    when = None
    if date is not None:
        when = date.attrib['when']

    bibl = etree.Element('bibl', type='digitalSource')
    sourceDesc.append(bibl)

    for title in titles:
        bibl.append(title)

    idno = etree.Element('idno', type='URL')
    idno.text = url
    bibl.append(idno)

    availability = etree.Element('availability', status='free')
    p = etree.Element('p')
    p.text = 'In the public domain.'
    availability.append(p)
    bibl.append(availability)

    inner_bibl = etree.Element('bibl', type='originalSource')
    bibl.append(inner_bibl)

    # the imprint children are moved into the outer bibl, i.e. they end up
    # AFTER inner_bibl among bibl's children
    for imprint_child in imprint.getchildren():
        bibl.append(imprint_child)

    # The print date is only filled in when the imprint carried a <date>;
    # without one an empty <date type="print"/> is emitted, because neither an
    # attribute value nor a string concatenation can take None.
    date = etree.Element('date', type='print')
    if when is not None:
        date.set('when', when)
        date.text = '"' + when + '"' + ' ' + '(' + url + ')'
    inner_bibl.append(date)

    date = etree.Element('date', type='premiere')
    inner_bibl.append(date)

    # Keep a direct handle on the "written" date: it is filled from the byline
    # further down, and it cannot be found reliably by position because the
    # imprint children follow inner_bibl inside bibl.
    written_date = etree.Element('date', type='written')
    inner_bibl.append(written_date)

    # NB: remove encodingDesc
    # teiHeader.remove(encodingDesc)

    particDesc = add_particDesc_to_profileDesc(profileDesc)
    listPerson = add_listPerson_to_particDesc(particDesc)
    textClass = add_textClass_to_profileDesc(profileDesc)
    keywords = add_keywords_to_textClass(textClass)
    term = add_term_to_keywords(keywords)
    # revisionDesc
    revisionDesc = etree.Element('revisionDesc')
    teiHeader.append(revisionDesc)
    listChange = etree.Element('listChange')
    revisionDesc.append(listChange)
    change = etree.Element('change', when=datetime.datetime.now().strftime("%Y-%m-%d"))
    change.text = "(%s) convert from source" % ('eg') # replace if necessary
    listChange.append(change)

    # delete xml:id attribute of text
    del text.attrib[prefix + 'id']

    # text children: three layouts are accepted
    #   anchor, front, body, back, anchor[, comments...]
    #   anchor, body, back, anchor
    #   anchor, front, body, back          (last child is not an anchor)
    front = None
    try:
        anchor, front, body, back, anchor, *comments = text.getchildren()
    except ValueError:
        anchor, body, back, anchor = text.getchildren()
        if anchor.tag != 'anchor':
            anchor, front, body, back = text.getchildren()

    # an epilogue leading the back matter belongs to the body
    if back.getchildren()[0].tag == 'epilogue':
        body.append(back.getchildren()[0])

    try:
        if front is not None:
            castList = front.getchildren()[-1]
        else:
            raise ValueError('No front tag')

        # Move everything that follows the castList in front to the start of
        # body (inserting at index 0 from last to first keeps the order) ...
        while castList.tag != 'castList':
            body.insert(0, castList)
            castList = front.getchildren()[-1]

        # ... and finally the castList itself, which becomes body's first child
        body.insert(0, castList)
    except (ValueError, IndexError):
        # no front, or front ran empty without containing a castList
        castList = None

    if front is not None:
        docTitle = add_docTitle_to_front(front)
        add_titlePart_main_to_docTitle(docTitle, main_title_text)

    if sub_title is not None and front is not None:
        add_titlePart_sub_to_docTitle(docTitle, sub_title.text)

    # A trailing <byline> is split off only when body has other children
    # besides it; otherwise every child of body is treated as a division.
    body_children = body.getchildren()
    if len(body_children) >= 2 and body_children[-1].tag == 'byline':
        *divs, byline = body_children
    else:
        byline = None
        divs = body_children

    for div in divs:
        recursive_clean(div)

    # second pass over every <sp> in the document: same rewriting of @who as in
    # recursive_clean, but additionally lower-cased
    for sp in root.xpath('//sp'):
        if 'who' not in sp.attrib:
            continue
        tmp = ['#' + elem[1 + 2 * (elem.startswith('#sp')):] for elem in sp.attrib['who'].split()]
        for i, elem in enumerate(tmp):
            tmp[i] = re.sub(r'([A-ZÅÄÖ][a-zåäö]+)(?=[A-ZÅÄÖ])', r'\1 ', elem[1:]).lower()
            tmp[i] = '#' + tmp[i].replace(' ', '_')
        sp.attrib['who'] = replace_func(' '.join(tmp))

    if byline is not None:
        date = byline.getchildren()[0]
        # only a plain four-digit year is accepted; a child without text is skipped
        if re.fullmatch(r'[0-9]{4,4}', date.text or '') is not None:
            written_date.set('when', date.text)
            written_date.text = date.text

    # locate the last <div> among the children of back
    tmp_ind = 1
    while back.getchildren()[len(back.getchildren()) - tmp_ind].tag != 'div':
        tmp_ind += 1
    div = back.getchildren()[len(back.getchildren()) - tmp_ind]

    # the person list is the first child of that div, or the second when the
    # first child is something else (a comment, judging by the variable name)
    try:
        tmp_listPerson, listOrg, *_ = div.getchildren()
        if tmp_listPerson.tag != 'listPerson':
            comment, tmp_listPerson, listOrg, *_ = div.getchildren()
    except ValueError:
        tmp_listPerson, = div.getchildren()
        if tmp_listPerson.tag != 'listPerson':
            comment, tmp_listPerson, = div.getchildren()

    # a typed listPerson directly under the div means there is no wrapping
    # listPerson: use the div itself as the container of the typed lists
    if 'type' in tmp_listPerson.attrib:
        tmp_listPerson = div

    noncast_listPerson = None
    try:
        cast_listPerson, noncast_listPerson, *_ = tmp_listPerson.getchildren()
        # NOTE(review): `'type' in noncast_listPerson` tests element membership,
        # not attributes, so with lxml this condition looks like it can never
        # be true and the merge branch below is only reached through an
        # unpacking error (where it would fail again). `.attrib` was presumably
        # intended; left unchanged because enabling the branch moves <person>
        # nodes out of the serialised tree -- confirm before changing.
        if 'type' in noncast_listPerson and noncast_listPerson.attrib['type'] == 'cast':
            raise ValueError
    except ValueError:
        cast_listPerson_epilogue, cast_listPerson_main, *_ = tmp_listPerson.getchildren()
        cast_listPerson = etree.Element('listPerson')
        for inner_listPerson in [cast_listPerson_epilogue, cast_listPerson_main]:
            for child in inner_listPerson:
                cast_listPerson.append(child)

    # collect the distinct speaker references in order of first appearance
    list_of_speakers = []
    set_of_speakers = set()
    for sp in root.xpath('//sp'):
        if 'who' not in sp.attrib:
            continue
        whos = sp.attrib['who'].split()
        for who in whos:
            if who not in set_of_speakers:
                set_of_speakers.add(who)
                list_of_speakers.append(who)

    # drop the leading '#' (and an 'sp' prefix that is followed by a capital)
    list_of_speakers = [re.sub(r'sp([A-Z].*)', r'\1', elem[1:]) for elem in list_of_speakers]

    xml_id_to_sex = {}
    xml_id_to_fullname = {}
    xml_id_to_type_of_person = {}

    # walk both person lists, descending through up to two levels of nested
    # listPerson, and record sex / type / display name for every speaker
    for type_listPerson in [cast_listPerson, noncast_listPerson]:
        for entry in type_listPerson.getchildren():
            if entry.tag in ['person', 'personGrp', 'listPerson']:
                if entry.tag == 'listPerson':
                    childs = entry.getchildren()
                else:
                    childs = [entry]
                for candidate in childs:
                    if candidate.tag == 'listPerson':
                        inner_children = candidate.getchildren()
                    else:
                        inner_children = [candidate]
                    for inner_child in inner_children:
                        if prefix + 'id' not in inner_child.attrib:
                            continue
                        # normalise the xml:id the same way as sp/@who above
                        xml_id = inner_child.attrib[prefix + 'id']
                        xml_id = re.sub(r'([A-ZÅÄÖ][a-zåäö]+)(?=[A-ZÅÄÖ])', r'\1 ', xml_id).lower()
                        xml_id = xml_id.replace(' ', '_')
                        xml_id = replace_func(xml_id)
                        if xml_id in list_of_speakers:
                            if inner_child.tag == 'person':
                                xml_id_to_type_of_person[xml_id] = 'person'
                            else:
                                xml_id_to_type_of_person[xml_id] = 'personGrp'
                            if 'sex' not in inner_child.attrib:
                                xml_id_to_sex[xml_id] = 'UNKNOWN'
                            else:
                                xml_id_to_sex[xml_id] = inner_child.attrib['sex'].upper()
                            try:
                                persName, *_ = inner_child.getchildren()
                                if persName.tag not in ['person', 'personGrp', 'occupation', 'persName', 'note', 'age']:
                                    comment, persName, *_ = inner_child.getchildren()
                            except ValueError:
                                # no usable child: derive the name from the id
                                xml_id_to_fullname[xml_id] = ' '.join([elem.capitalize() for elem in xml_id.split('_')])
                                continue
                            full_text = ''
                            try:
                                # work on a copy so that removing comments does
                                # not touch the real tree
                                tmp = deepcopy(inner_child)
                                for comment in tmp.xpath('//comment()'):
                                    comment_parent = comment.getparent()
                                    comment_parent.remove(comment)
                                # join the text of every child with ', '
                                for name_part in tmp.getchildren():  # persName
                                    if name_part.text is not None:
                                        if re.sub(r'\s{2,}', ' ', name_part.text):
                                            full_text += re.sub(r'\s{2,}', ' ', name_part.text).strip() + ', '
                                    else:
                                        was_text = ''
                                        for part_of_text in name_part.itertext():
                                            full_text += re.sub(r'\s{2,}', ' ', part_of_text).strip() + ' '
                                            was_text += re.sub(r'\s{2,}', ' ', part_of_text).strip()
                                        full_text = full_text.strip()
                                        if was_text:
                                            full_text += ', '
                            except ValueError:
                                continue
                            full_text = full_text.strip(' ,')
                            xml_id_to_fullname[xml_id] = re.sub(r'\s{2,}', r' ', full_text)

    # NB: here
    # diagnostics: all six sets are empty when every speaker was matched
    print(set(list_of_speakers) ^ set(xml_id_to_sex.keys()), set(list_of_speakers) ^ set(xml_id_to_fullname.keys()), set(list_of_speakers) ^ set(xml_id_to_type_of_person.keys()))
    print(set(list_of_speakers) - set(xml_id_to_sex.keys()), set(list_of_speakers) - set(xml_id_to_fullname.keys()), set(list_of_speakers) - set(xml_id_to_type_of_person.keys()))

    # one <person>/<personGrp> per speaker in the header's listPerson
    for xml_id in list_of_speakers:
        if xml_id_to_type_of_person[xml_id] == 'person':
            person = etree.Element('person')
            person.set(prefix + "id", xml_id)
            person.set('sex', xml_id_to_sex[xml_id])
            persName = etree.Element('persName')
            persName.text = xml_id_to_fullname[xml_id]
            person.append(persName)
            listPerson.append(person)
        else:
            personGrp = etree.Element('personGrp')
            personGrp.set(prefix + "id", xml_id)
            personGrp.set('sex', xml_id_to_sex[xml_id])
            name = etree.Element('name')
            name.text = xml_id_to_fullname[xml_id]
            personGrp.append(name)
            listPerson.append(personGrp)

    for stage in text.xpath('//stage'):
        if 'who' in stage.attrib:
            stage.attrib['who'] = replace_func(stage.attrib['who'].lower())

    # derive the genre from the first word of the translated sub-title
    if sub_title is not None:
        if sub_title_en.text.split()[0].lower() in ['comedy', 'drama', 'tragedy']:
            term.attrib['subtype'] = sub_title_en.text.split()[0].lower()
            term.text = '"' + sub_title.text + '"'

    result = etree.tostring(root.getroottree(), pretty_print=True, xml_declaration=True, encoding="UTF-8").decode('utf-8')
    result = html_.unescape(result)
    # whitespace and punctuation clean-up on the serialised text; the order of
    # the substitutions matters
    result = re.sub(r'([^\s])(<[^/])', r'\1 \2', result)
    # each pass removes at most one whitespace run per <p>, hence the repetition
    for _ in range(10):
        result = re.sub(r'(<p(?: rend=\"[^><]*?\")?>(?:(?!</p>).)*?)\s{2,}((?:(?!</p>).)*?</p>)', r'\1 \2', result, count=0, flags=re.DOTALL)

    result = re.sub(r'(</[^>]+>) (</[^>]+>)', r'\1\2', result)
    result = re.sub(r'/>([^\s])', r'/> \1', result)
    result = re.sub(r'\s+</stage>', r'</stage>', result)
    result = re.sub(r'\( <', r'(<', result)
    result = re.sub(r',([^\s])', r', \1', result)
    result = re.sub(r' ([,):])', r'\1', result)
    result = re.sub(r'([(]) ', r'\1', result)
    result = re.sub(r'\s+([^<\s])', r' \1', result)
    result = re.sub(r' (</)', r'\1', result)
    result = re.sub(r'(<[^/][^<>]*?>) <', r'\1<', result)
    result = re.sub(r' \.', '.', result)
    result = re.sub(r'- (<pb [^<>]*?/>) ', r'-\1', result)

    result = re.sub(r'\n\s+([^><]*?)</stage>', r' \1</stage>', result)
    # file name: "<surname>-<title words joined by '-'>.xml", lower-cased and
    # passed through replace_func; anything up to the last '/' in the title
    # part is dropped
    with open('/content/drive/My Drive/swedracor/converted_plays/' + replace_func(surname.lower() + '-' + '-'.join(main_title_text.lower().split()).split('/')[-1]) + '.xml', 'w', encoding='utf-8') as f:
        f.write(result)

In [347]:
# Names of the source plays (whitespace-separated, as pasted from a directory
# listing); they are processed in the order produced by split().
play_files = """AgrellA-Domd.xml                                    LefflerAC-HurManGorGodt.xml       StrindbergA-InforDoden.xml
AgrellA-EnHufvudsak.xml                             LefflerAC-MosterMalvina.xml       StrindbergA-Kamraterna.xml
AgrellA-EnLektion.xml                               LefflerAC-SannaKvinnor.xml        StrindbergA-Lycko-PersResa.xml
AgrellA-Ensam.xml                                   LefflerAC-Skadespelerskan.xml     StrindbergA-Marodorer.xml
AgrellA-Hvarfor.xml                                 LefflerAC-UnderToffeln.xml        StrindbergA-MasterOlof.xml
AgrellA-IngridEnDodsKarlekssaga.xml                 LindheW-Modrar.xml                StrindbergA-Moderskarlek.xml
AgrellA-Smastadslif.xml                             LundbergE-ForlatMig.xml           StrindbergA-Paria.xml
AureliusH-FarmorsFodelsedag.xml                     MallingM-FruLeonora.xml           StrindbergA-Samum.xml
BarthelsonA-Efterspel.xml                           MarholmL-Otteringning.xml         StrindbergA-TillDamaskus.xml
BenedictssonV-Final.xml                             MeyersonG-DenNyaKlassen.xml       TopeliusT-ISmaMaffarnasLand.xml
BenedictssonV-ITelefon.xml                          MeyersonG-EttPojkstreck.xml       TopeliusT-JeppeOchMurra.xml
BenedictssonV-RomeosJulia.xml                       MolanderH-Varflod.xml             TopeliusZ-Askungen.xml
BremerF-KonstnarnsFortviflan.xml                    StrindbergA-AnnoFyrtiatta.xml     TopeliusZ-JennysFormaningar.xml
GeijerstamG-LarsAndersOchJanAndersOchDerasBarn.xml  StrindbergA-Bandet.xml            TopeliusZ-Krypskyttarne.xml
HedbergF-Rospiggarna.xml                            StrindbergA-DenStarkare.xml       TopeliusZ-LuciasZiffror.xml
IndebetouH-IDetGrona.xml                            StrindbergA-Fadren.xml            TopeliusZ-Perdita.xml
IndebetouH-IFruntimmersveckan.xml                   StrindbergA-Fordringsegare.xml    TopeliusZ-Skogskonungen.xml
KullgrenL-Karlek.xml                                StrindbergA-ForstaVarningen.xml   TopeliusZ-Snurran.xml
KuylenstiernaE-NarNyarKom.xml                       StrindbergA-FrokenJulie.xml       TopeliusZ-StationSylvester.xml
KuylenstiernaE-NuArDetJulIgen.xml                   StrindbergA-GilletsHemlighet.xml  TopeliusZ-TidernasSpegel.xml
LefflerAC-Elfvan.xml                                StrindbergA-HerrBengtsHustru.xml  WahlenbergA-PaVakt.xml
LefflerAC-Familjelycka.xml                          StrindbergA-Hostslask.xml         WahlenbergA-TvaValsprak.xml""".split()

for file in play_files:
    # To convert a single play while debugging, override `file` here, e.g.
    # file = 'StrindbergA-MasterOlof.xml'
    with open('/content/drive/My Drive/swedracor/' + file, 'r', encoding='utf-8') as f:
        print(file)
        xml = preprocess_xml(f.read())
        root = get_root(xml)

        # Strip all namespaces: keep only the local part of every tag name.
        # Nodes whose tag is not a string (comments, processing instructions)
        # are skipped.
        for node in root.iter():
            if not isinstance(node.tag, str):
                continue
            if '}' in node.tag:
                node.tag = node.tag.split('}', 1)[1]

        # Add the 'xml-stylesheet' processing instruction in front of the root
        # element and mark the whole document as Swedish.
        stylesheet = etree.PI('xml-stylesheet', 'type="text/css" href="../css/tei.css"')
        root.addprevious(stylesheet)
        root.set(prefix + "lang", "sv")

        process_root(root)

AgrellA-Domd.xml
set() set() set()
set() set() set()
LefflerAC-HurManGorGodt.xml
set() set() set()
set() set() set()
StrindbergA-InforDoden.xml
set() set() set()
set() set() set()
AgrellA-EnHufvudsak.xml
set() set() set()
set() set() set()
LefflerAC-MosterMalvina.xml
set() set() set()
set() set() set()
StrindbergA-Kamraterna.xml
set() set() set()
set() set() set()
AgrellA-EnLektion.xml
set() set() set()
set() set() set()
LefflerAC-SannaKvinnor.xml
set() set() set()
set() set() set()
StrindbergA-Lycko-PersResa.xml
set() set() set()
set() set() set()
AgrellA-Ensam.xml
set() set() set()
set() set() set()
LefflerAC-Skadespelerskan.xml
set() set() set()
set() set() set()
StrindbergA-Marodorer.xml
set() set() set()
set() set() set()
AgrellA-Hvarfor.xml
set() set() set()
set() set() set()
LefflerAC-UnderToffeln.xml
set() set() set()
set() set() set()
StrindbergA-MasterOlof.xml
set() set() set()
set() set() set()
AgrellA-IngridEnDodsKarlekssaga.xml
set() set() set()
set() set() set()
LindheW-M