# deduplicate bibliographic references

The DH-Community is not able to follow citation guides, therefore bibliographic references are quite messy. This script:

* extracts all `tei:listBibl//tei:bibl` elements and writes them to a `.csv` file in the current folder
* this file is fed into the `csvdedupe` command line interface, which returns `output.csv` with the deduplicated entries

In [1]:
import glob
import os
import pandas as pd
from lxml import etree as ET

from config import TEI_DIR, YEAR_DIR
from teipy import TeiReader

In [None]:
# Collect every .xml file that sits directly inside TEI_DIR.
files = glob.glob(f"{TEI_DIR}/*.xml")

In [None]:
def yield_items(files):
    """Yield one record per ``tei:bibl`` nested in a ``tei:listBibl``.

    Each path in *files* is opened with ``TeiReader``. For every matching
    ``tei:bibl`` element a dict with three keys is produced:

    * ``"title"`` -- the ``'title'`` entry of the document's ``extract_md()``
      result
    * ``"book"``  -- the element's complete text content with all runs of
      whitespace collapsed to single spaces
    * ``"id"``    -- ``"<doc_id>__<n>"``, where ``n`` is the zero-based
      position of the element within its document
    """
    for path in files:
        reader = TeiReader(path)
        # Drop the TEI_DIR prefix and the single character that follows it
        # (presumably the path separator) to obtain a document identifier.
        doc_id = path.replace(TEI_DIR, '')[1:]
        titel = reader.extract_md()['title']
        bibl_nodes = reader.tree.xpath(
            './/tei:listBibl//tei:bibl', namespaces=reader.ns_tei
        )
        for counter, bibl in enumerate(bibl_nodes):
            raw_text = "".join(bibl.itertext())
            yield {
                "title": titel,
                "book": " ".join(raw_text.split()),
                "id": f"{doc_id}__{counter}",
            }

In [None]:
# Materialise every record yielded for the collected files as one DataFrame row.
df = pd.DataFrame(yield_items(files))

In [None]:
# Write the rows to bibls.csv (relative path, i.e. the current working directory).
# NOTE: pandas writes the DataFrame index as an extra first column by default.
df.to_csv('bibls.csv')

## run csvdedupe cmd-tool

* use the result (saved as output.csv) for any further processing
* read output.csv into a `pandas.DataFrame`
* group rows (i.e. bibl entries) by `Cluster ID` (created by dedupe)
* for each group write first row as `tei:bibl` node into a `tei:listBibl`

In [2]:
# Load the csvdedupe result; the cells below rely on its 'Cluster ID' and 'book' columns.
deduped = pd.read_csv('output.csv')

In [3]:
# Request a TEI skeleton from TeiReader with the given title and source description.
# The result is passed to ET.fromstring below, so it is presumably serialized
# XML containing a tei:body element -- TODO confirm against teipy.
tei_stump = TeiReader.tei_stump(
    title="Literaturverzeichnis",
    source_desc="Generiert mit dedupe_bibls"
)

In [4]:
# Build the deduplicated bibliography: parse the TEI skeleton, attach an empty
# tei:listBibl to its tei:body, then add one tei:bibl per dedupe cluster.
root = ET.fromstring(tei_stump)
bibl_root = root.find('.//tei:body', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
if bibl_root is None:
    # Fail with a clear message instead of an AttributeError on `.append`.
    raise ValueError("TEI skeleton contains no tei:body element")
list_bibl = ET.Element("{http://www.tei-c.org/ns/1.0}listBibl")
# Attach the list exactly once (the earlier version repeated this call; lxml
# only moves an already-attached element, so the repeat had no effect).
bibl_root.append(list_bibl)
for cluster_id, gr in deduped.groupby('Cluster ID'):
    # The first row of each cluster serves as the representative entry.
    first_row = gr.iloc[0]
    bibl = ET.SubElement(list_bibl, "{http://www.tei-c.org/ns/1.0}bibl")
    # xml:id is derived from the cluster id so every entry is addressable.
    bibl.attrib["{http://www.w3.org/XML/1998/namespace}id"] = f"bibl__{cluster_id}"
    bibl.text = str(first_row['book'])
    

Save the generated XML as `listbibl.xml`

In [6]:
# Target path of the generated bibliography: <YEAR_DIR>/indices/listbibl.xml
file = os.path.join(YEAR_DIR, 'indices', 'listbibl.xml')

In [7]:
# ET.tostring with an explicit encoding returns bytes, hence the binary ('wb') mode.
with open(file, 'wb') as f:
    f.write(ET.tostring(root, pretty_print=True, encoding='utf-8'))