In [None]:
import glob
import os
import itertools

import pandas as pd
from lxml import etree as ET

from config import TEI_DIR, PATTERNS
from teipy import TeiReader
from partials import TEI_NSMAP, tei_gen_header
from slugify import slugify

In [None]:
# Make sure the output directory for the generated index files exists.
# exist_ok=True keeps the cell safe to re-run; unlike swallowing
# FileExistsError, it still raises when '../indices' exists but is a regular
# file rather than a directory.
os.makedirs('../indices', exist_ok=True)

In [None]:
# Collect every TEI file below the sibling dhd_* directories.
# glob returns matches in arbitrary (filesystem-dependent) order; sorting makes
# the processing order -- and with it the row order of every table built from
# these files -- reproducible across machines and runs.
files = sorted(glob.glob("../dhd_*/TEI/*.xml"))

In [None]:
# Display how many TEI files the glob matched (quick sanity check).
len(files)

In [None]:
# Read every TEI file and keep the metadata returned by extract_md(),
# one entry per file, in the same order as `files`.
all_docs = [TeiReader(tei_path).extract_md() for tei_path in files]

In [None]:
# Show only a short preview: displaying the full list floods the notebook
# output and stores every extracted record (including author e-mail
# addresses) in the saved .ipynb file.
all_docs[:3]

In [None]:
# Build a DataFrame from the collected per-document metadata.
# NOTE(review): presumably each entry of `all_docs` is a dict with at least an
# 'authors' key (that column is read in a later cell) -- confirm against
# TeiReader.extract_md().
df = pd.DataFrame(all_docs)

In [None]:
# Flatten the per-document author lists into a single list of author records,
# then build one table from it, ordered by surname.
author_rows = [
    author
    for doc_authors in df['authors']
    for author in doc_authors
]
authors = pd.DataFrame(author_rows).sort_values(by='surname')

In [None]:
# Add a lower-cased copy of the e-mail address so that addresses differing
# only in letter case can be matched against each other.
authors = authors.assign(email_lower=authors['email'].str.lower())

In [None]:
# Keep only the first row for each lower-cased e-mail address.
is_repeat = authors.duplicated(subset=['email_lower'])
no_dub = authors[~is_repeat]

In [None]:
# Uncomment to inspect the de-duplicated author table:
# no_dub.head()

In [None]:
# `years` is needed for the document title but is not defined anywhere else in
# this notebook, so derive it from the directories the TEI files were read
# from: '../dhd_<suffix>/TEI/<file>.xml' -> '<suffix>'.
# NOTE(review): assumes the part after 'dhd_' is the conference year -- confirm.
years = sorted({
    os.path.basename(os.path.dirname(os.path.dirname(tei_path))).split('_', 1)[-1]
    for tei_path in files
})

# Fill the generic TEI header template with the register title and the
# description of the source volumes, then parse it into an element tree.
title = f"DHd Book of Abstracts {', '.join(years)}"
header = ET.fromstring(tei_gen_header.format('Personenregister', title))

In [None]:
# Build the <listPerson> register inside the <body> of the generated TEI
# document: one <person> per de-duplicated author, holding
# <persName>/<surname>, <persName>/<forename> and <affiliation>.
#
# Every element is created in the TEI namespace. Previously only <person> was
# namespaced while <listPerson>, <persName>, <surname>, <forename> and
# <affiliation> were created without a namespace, so namespace-aware queries
# (e.g. './/tei:persName') on the in-memory tree could not find them.
TEI_NS = "http://www.tei-c.org/ns/1.0"
XML_NS = "http://www.w3.org/XML/1998/namespace"

body = header.xpath('.//tei:body', namespaces=TEI_NSMAP)[0]
listperson = ET.SubElement(body, f"{{{TEI_NS}}}listPerson")

# The DataFrame index is not needed, only the row values.
for _, row in no_dub.iterrows():
    person = ET.SubElement(listperson, f"{{{TEI_NS}}}person")
    # xml:id is derived from the author's e-mail address.
    person.set(f"{{{XML_NS}}}id", "person__{}".format(slugify(row['email'])))

    pers_name = ET.SubElement(person, f"{{{TEI_NS}}}persName")
    ET.SubElement(pers_name, f"{{{TEI_NS}}}surname").text = row['surname']
    ET.SubElement(pers_name, f"{{{TEI_NS}}}forename").text = row['forename']

    ET.SubElement(person, f"{{{TEI_NS}}}affiliation").text = row['affiliation']

In [None]:
# Serialize the finished TEI document (pretty-printed, UTF-8 encoded bytes)
# and write it to the person register file.
file = "../indices/listperson.xml"
with open(file, mode='wb') as out:
    xml_bytes = ET.tostring(header, encoding='UTF-8', pretty_print=True)
    out.write(xml_bytes)