In [None]:
import glob
import itertools

import pandas as pd
from lxml import etree as ET

from config import TEI_DIR, PATTERNS
from teipy import TeiReader
from partials import TEI_NSMAP, tei_gen_header
from slugify import slugify

In [None]:
# Collect every *.xml file directly inside TEI_DIR. glob returns paths in
# arbitrary, filesystem-dependent order, so sort them to keep the order of
# everything derived from this list reproducible across runs and machines.
files = sorted(glob.glob("{}/*.xml".format(TEI_DIR)))

In [None]:
# Display the configured directory that is searched for *.xml files.
TEI_DIR

In [None]:
# Read each file with TeiReader and keep the metadata returned by
# extract_md(), one entry per input file, in the same order as `files`.
all_docs = [TeiReader(path).extract_md() for path in files]

In [None]:
# One row per parsed document, built from the collected metadata entries.
df = pd.DataFrame(data=all_docs)

In [None]:
# Every cell of the 'authors' column holds an iterable of author entries;
# flatten them into a single list covering all documents.
author_records = list(itertools.chain.from_iterable(df['authors'].values))

# Build one table of authors, remove exact duplicate rows and order by surname.
authors = (
    pd.DataFrame(author_records)
    .drop_duplicates()
    .sort_values(by='surname')
)

In [None]:
# Fill the two positional placeholders of the generic TEI header template,
# then parse the resulting markup into an element tree.
person_register_xml = tei_gen_header.format('Personenregister', 'DHD2018')
header = ET.fromstring(person_register_xml)

In [None]:
# Clark-notation prefix for TEI element names and the qualified xml:id name.
# The TEI URI is the one the original code already used for <person>.
TEI_TAG = "{http://www.tei-c.org/ns/1.0}"
XML_ID = "{http://www.w3.org/XML/1998/namespace}id"


def _tei_child(parent, tag, text=None):
    """Append a TEI-namespaced child named `tag` to `parent` and return it.

    `text` becomes the element's text unless it is missing (None or NaN).
    pandas stores absent values as float NaN, which lxml does not accept as
    element text, so such values leave the element empty instead of raising.
    """
    child = ET.SubElement(parent, TEI_TAG + tag)
    if text is not None and pd.notna(text):
        child.text = text
    return child


# Build <listPerson> inside the template's <body>, one <person> per author row.
# Every element is created in the TEI namespace (previously only <person> was),
# so the in-memory tree is consistently namespaced.
# NOTE(review): assumes the header template uses the same TEI namespace URI
# that TEI_NSMAP maps to 'tei' -- confirm against partials.
body = header.xpath('.//tei:body', namespaces=TEI_NSMAP)[0]
listperson = _tei_child(body, "listPerson")
for _, row in authors.iterrows():
    person = _tei_child(listperson, "person")
    # NOTE(review): the id is derived from the e-mail only; two rows sharing
    # an e-mail but differing elsewhere would yield duplicate xml:id values.
    person.attrib[XML_ID] = "person__{}".format(slugify(row['email']))
    persName = _tei_child(person, "persName")
    surname = _tei_child(persName, "surname", row['surname'])
    forename = _tei_child(persName, "forename", row['forename'])
    affil = _tei_child(person, "affiliation", row['affiliation'])

In [None]:
# Serialize the tree as pretty-printed UTF-8 bytes and write them to disk.
file = "tmp.xml"
with open(file, 'wb') as out:
    serialized = ET.tostring(header, pretty_print=True, encoding='UTF-8')
    out.write(serialized)

In [None]:
# Export the distinct affiliation strings, sorted, as a UTF-8 CSV.
# NOTE(review): the file name is spelled 'affilitions.csv'; it is kept as-is
# because other tooling may read the file under this name.
unique_affiliations = authors['affiliation'].drop_duplicates().sort_values()
unique_affiliations.to_csv('affilitions.csv', encoding='UTF-8')

In [None]:
# Clark-notation prefix for TEI element names and the qualified xml:id name.
# The TEI URI is the one the original code already used for <org>.
TEI_TAG = "{http://www.tei-c.org/ns/1.0}"
XML_ID = "{http://www.w3.org/XML/1998/namespace}id"

# Start from a fresh header so re-running this cell does not stack a second
# <listOrg> onto an earlier tree. The first template argument previously read
# 'Organisationsregsiter' (typo).
header = ET.fromstring(tei_gen_header.format('Organisationsregister', 'DHD2018'))
body = header.xpath('.//tei:body', namespaces=TEI_NSMAP)[0]

# Create every element in the TEI namespace (previously only <org> was), so
# the in-memory tree is consistently namespaced.
# NOTE(review): assumes the header template uses the same TEI namespace URI
# that TEI_NSMAP maps to 'tei' -- confirm against partials.
listorg = ET.SubElement(body, TEI_TAG + "listOrg")

# Distinct affiliations in sorted order. Missing values are dropped because
# lxml does not accept a float NaN as element text.
affiliations = authors['affiliation'].dropna().drop_duplicates().sort_values()

# Ids are running numbers starting at 1, following the sorted order.
for counter, x in enumerate(affiliations, start=1):
    org = ET.SubElement(listorg, TEI_TAG + "org")
    org.attrib[XML_ID] = "org__{}".format(counter)
    orgName = ET.SubElement(org, TEI_TAG + "orgName")
    orgName.text = x

In [None]:
# Serialize the tree as pretty-printed UTF-8 bytes and write them to disk.
file = "tmp_org.xml"
with open(file, 'wb') as out:
    serialized = ET.tostring(header, pretty_print=True, encoding='UTF-8')
    out.write(serialized)