# deduplicate ~~bibliographic~~ institution references

The DH community is not able to follow citation guides, therefore bibliographic references are quite messy. This script

* extracts all `.//tei:titleStmt//tei:affiliation` elements and writes them to a `.csv` file in the current folder
* this file is fed into the `csvdedupe` command line interface, which returns `org_output.csv` with deduplicated entries

In [None]:
import glob
import os
import itertools
import pandas as pd

from lxml import etree as ET
from slugify import slugify

from config import TEI_DIR, PATTERNS, YEARS
from teipy import TeiReader
from partials import TEI_NSMAP, tei_gen_header

In [None]:
# Create the output folder for the generated index files. Only the case
# "the folder is already there" is tolerated; any other OSError (for example a
# permission problem) propagates instead of being reported as "exists".
try:
    os.makedirs('../indices')
except FileExistsError:
    print('../indices alredy exists')

In [None]:
# Gather the paths of all TEI files below the sibling "dhd_*" folders.
tei_glob = "../dhd_*/TEI/*.xml"
files = glob.glob(tei_glob)
# Notebook cell output: number of matched files.
len(files)

## note

Extraction and disambiguation of institutions is tricky because:
* usage of different names for the same Institution
* person -> affiliation is a 1:n relation
* but there is no dedicated separator to indicate that one affiliation tag comprises several affiliations
  * `;` is treated as a separator, but not
  * `|` or `/`, although these are sometimes
    * used as separators, e.g. "forschungsverbund marbach weimar wolfenbüttel / herzog august bibliothek wolfenbüttel"
    * not only used as separators but also as part of the name, e.g. "Akademie der Wissenschaften und der Literatur | Mainz", or to indicate a part of an institution, as in "Georg-August-Universität Göttingen, Deutschland - GCDH/Archäologisches Institut"

therefore no automatic splitting is done (apart from splitting on `;`)!
disambiguation is done in a very generous manner

this means that `eberhard karls universität tübingen, deutschland` and `eberhard karls universität tübingen, deutschland; humboldt universität zu berlin` are treated as one institution

In [None]:
def yield_items(files):
    """Yield one record per institution named in the TEI files' affiliations.

    For every path in ``files`` the document is opened with ``TeiReader`` and
    each ``tei:affiliation`` below ``tei:titleStmt`` is read. The text of an
    affiliation is whitespace-normalised, split on ``;`` and every fragment
    longer than 5 characters (after stripping) becomes one record.

    Args:
        files: iterable of paths to TEI documents.

    Yields:
        dict with the keys ``"title"`` (the document's ``extract_md()['title']``),
        ``"org"`` (the stripped fragment) and ``"id"`` (``"<path>__<n>"``, where
        ``n`` counts the records of that document starting at 0).
    """
    for x in files:
        doc = TeiReader(x)
        titel = doc.extract_md()['title']
        counter = 0
        for rs in doc.tree.xpath('.//tei:titleStmt//tei:affiliation', namespaces=doc.ns_tei):
            # Collapse line breaks and runs of blanks coming from the XML layout.
            rs_text = " ".join("".join(rs.itertext()).split())
            for y in rs_text.split(';'):
                org = y.strip()
                # Very short fragments are ignored.
                if len(org) > 5:
                    item = {
                        "title": titel,
                        "org": org,
                        "id": f"{x}__{counter}"
                    }
                    counter += 1
                    yield item

In [None]:
# Materialise the extracted institution mentions: one row per yielded record.
df = pd.DataFrame(data=yield_items(files))

In [None]:
# Dump the institution table to the current folder; csvdedupe reads this file.
df.to_csv(path_or_buf='orgs.csv')

## run csvdedupe cli-tool

```shell
csvdedupe orgs.csv --field_names org --output_file org_output.csv --skip_training true
```

* use the result (saved as `org_output.csv`) for any further processing
* read `org_output.csv` into a `pandas.DataFrame`
* group rows (i.e. institution entries) by `Cluster ID` (created by dedupe)

In [None]:
# Load the clustered table that csvdedupe wrote.
deduped = pd.read_csv(filepath_or_buffer='org_output.csv')

## extra work to circumvent a strange behaviour in dedupe

* as reported in https://github.com/dedupeio/csvdedupe/issues/88 dedupe does not group exact string matches into the same cluster, so some extra work needs to be done

In [None]:
# Map every distinct institution string to one cluster reference. All rows
# sharing exactly the same "org" text receive the "Cluster ID" of the first of
# these rows, so identical strings always resolve to the same "#org__<id>".
org_lookup = {
    name: f"#org__{rows['Cluster ID'].iloc[0]}"
    for name, rows in deduped.groupby('org')
}

## write org_id as ref attributes into affiliation

In [None]:
# Write the cluster references back into the TEI sources: every
# tei:affiliation below tei:titleStmt gets a @ref that lists the "#org__<id>"
# pointers of the institutions named in its text. Each file is saved in place.
for x in files:
    doc = TeiReader(x)
    for rs in doc.tree.xpath('.//tei:titleStmt//tei:affiliation', namespaces=doc.ns_tei):
        # Same normalisation and filter as in `yield_items` (collapse
        # whitespace, split on ";", ignore fragments of 5 characters or fewer),
        # so every remaining fragment is expected to be a key of `org_lookup`;
        # a missing key still raises KeyError.
        rs_text = " ".join("".join(rs.itertext()).split())
        fragments = (y.strip() for y in rs_text.split(';'))
        orgs = [org_lookup[y] for y in fragments if len(y) > 5]
        # Remove a @ref that may be left over from an earlier run ...
        rs.attrib.pop("ref", None)
        # ... and only write a new one when there is something to point to:
        # an empty ref="" is not a valid list of TEI pointers.
        if orgs:
            rs.attrib['ref'] = " ".join(orgs)
    doc.tree_to_file(x)

# create a listOrg

In [None]:
from collections import defaultdict

In [None]:
# Invert the lookup: cluster xml:id -> every institution string of that cluster.
orgs = defaultdict(list)
for name, ref in org_lookup.items():
    # The references start with "#"; the xml:id is the reference without it.
    orgs[ref[1:]].append(name)

In [None]:
# Build the register of organisations: one <org> per dedupe cluster, collected
# in a <listOrg> that is appended to the body of a freshly generated TEI
# document.
# NOTE(review): 'Organisationsregsiter' looks like a typo for
# 'Organisationsregister'; left untouched because the string is passed into
# the generated header.
TEI_NS = "{http://www.tei-c.org/ns/1.0}"
XML_ID = "{http://www.w3.org/XML/1998/namespace}id"

header = ET.fromstring(tei_gen_header.format('Organisationsregsiter', f"DHd Book of Abstracts {', '.join(YEARS)}"))
body = header.xpath('.//tei:body', namespaces=TEI_NSMAP)[0]
# All elements are created in the TEI namespace. Elements without a namespace
# presumably only ended up in it after serialisation because they inherit the
# header's default namespace declaration - TODO confirm against the template.
listorg = ET.Element(f"{TEI_NS}listOrg")
for key, value in orgs.items():
    # The longest spelling of a cluster serves as its main name, all other
    # spellings are kept as alternative names (the slice is simply empty for a
    # cluster with a single spelling).
    sorted_names = sorted(value, key=len, reverse=True)
    title_name = sorted_names[0]
    alt_names = sorted_names[1:]

    # Heuristic: whatever follows the last comma of the main name is taken as
    # the country. Stripping before the truth test keeps a whitespace-only
    # tail from producing an empty <country/>.
    name_parts = title_name.split(',')
    country = name_parts[-1].strip() if len(name_parts) > 1 else ''

    org = ET.SubElement(listorg, f"{TEI_NS}org")
    org.attrib[XML_ID] = key
    main_name = ET.SubElement(org, f"{TEI_NS}orgName")
    main_name.text = title_name
    for x in alt_names:
        alt_name = ET.SubElement(org, f"{TEI_NS}orgName")
        alt_name.text = x
        alt_name.attrib['type'] = 'alt'
    if country:
        c_el = ET.SubElement(org, f"{TEI_NS}country")
        c_el.text = country
body.append(listorg)


In [None]:
# Target path of the organisation register inside the index folder.
index_dir = '../indices'
file = os.path.join(index_dir, 'listorg.xml')

In [None]:
# Serialise the TEI document held in `header` as pretty-printed UTF-8 bytes
# and store it under the path in `file`.
with open(file, 'wb') as out:
    serialized = ET.tostring(header, pretty_print=True, encoding='utf-8')
    out.write(serialized)

# create listperson.xml

In [None]:
# Collect the metadata record (`extract_md()`) of every TEI file.
all_docs = [TeiReader(path).extract_md() for path in files]

In [None]:
# One row per document metadata record.
df = pd.DataFrame(data=all_docs)

In [None]:
# Flatten the per-document author lists into one table, ordered by surname.
author_rows = itertools.chain.from_iterable(df['authors'].values)
authors = pd.DataFrame(list(author_rows)).sort_values(by='surname')
# Keep a single row per person, identified by the lower-cased e-mail address.
# NOTE(review): only the first row per address (in surname order) survives, so
# further affiliations of the same person are dropped here, and rows without
# an e-mail presumably collapse into one - confirm that this is intended.
authors['email_lower'] = authors['email'].str.lower()
authors = authors.drop_duplicates(subset=['email_lower'])

In [None]:
# authors

In [None]:
# Fill the `tei_gen_header` template for the person register and parse it;
# the resulting tree is stored in `header`.
register_scope = f"DHd Book of Abstracts {', '.join(YEARS)}"
header = ET.fromstring(tei_gen_header.format('Personenregister', register_scope))

In [None]:
# Build the register of persons: one <person> per e-mail address, with the
# name taken from the first row of that address and one <affiliation> per
# institution named in any of its rows.
TEI_NS = "{http://www.tei-c.org/ns/1.0}"
XML_ID = "{http://www.w3.org/XML/1998/namespace}id"

body = header.xpath('.//tei:body', namespaces=TEI_NSMAP)[0]
# All elements are created in the TEI namespace. Elements without a namespace
# presumably only ended up in it after serialisation because they inherit the
# header's default namespace declaration - TODO confirm against the template.
listperson = ET.SubElement(body, f"{TEI_NS}listPerson")
# The group frame gets its own name so the notebook-level `df` stays intact.
for _email, group in authors.groupby('email'):
    first = group.iloc[0]
    person = ET.SubElement(listperson, f"{TEI_NS}person")
    # The xml:id is derived from the lower-cased e-mail address.
    person.attrib[XML_ID] = "person__{}".format(slugify(first['email'].lower()))
    persName = ET.SubElement(person, f"{TEI_NS}persName")
    surname = ET.SubElement(persName, f"{TEI_NS}surname")
    surname.text = first['surname']
    forename = ET.SubElement(persName, f"{TEI_NS}forename")
    forename.text = first['forename']
    for _, row in group.iterrows():
        morg_name = row['affiliation']
        # A missing affiliation may show up as None or as NaN; NaN is truthy
        # and has no `.split`, so test for an actual string.
        if not isinstance(morg_name, str):
            continue
        for y in morg_name.split(';'):
            org_name = y.strip()
            # A trailing or doubled ";" yields empty fragments - no element
            # is written for them.
            if not org_name:
                continue
            # Names unknown to the organisation lookup get a placeholder ref.
            org_id = org_lookup.get(org_name, '#org__99999')
            print(org_id)
            affil = ET.SubElement(person, f"{TEI_NS}affiliation")
            affil.text = org_name
            affil.attrib['ref'] = org_id

In [None]:
# Serialise the TEI document held in `header` as pretty-printed UTF-8 bytes
# and store it as the person register in the index folder.
file = "../indices/listperson.xml"
with open(file, 'wb') as out:
    serialized = ET.tostring(header, pretty_print=True, encoding='UTF-8')
    out.write(serialized)