In [None]:
# ! pip install spacy
# ! pip install spacy-langdetect
# ! python -m spacy download de_core_news_sm

# deduplicate bibliographic references

The DH community does not follow citation guides consistently, so its bibliographic references are quite messy. This script:

* extracts all `tei:listBibl//tei:bibl` elements and writes them to a `.csv` file in the current folder
* this file is fed into the `csvdedupe` command line interface, which returns `output.csv` containing the clustered (deduplicated) entries

In [None]:
import glob
import os
import pandas as pd
import spacy

from spacy_langdetect import LanguageDetector
from lxml import etree as ET

from teipy import TeiReader

In [None]:
# Make sure the output folder for the generated index files exists.
# exist_ok=True makes re-running the cell a no-op, while real problems
# (missing permissions, invalid path, ...) still surface as an OSError
# instead of being reported as "already exists".
os.makedirs('../indices', exist_ok=True)

In [None]:
# Collect all TEI documents of every dhd_* folder.
# glob returns matches in arbitrary order; sorting keeps the row order of the
# generated CSV stable between runs and machines.
files = sorted(glob.glob("../dhd_*/TEI/*.xml"))
len(files)

In [None]:
def yield_items(files):
    """Generate one record per bibliographic entry found in the given TEI files.

    :param files: iterable of paths to TEI documents; each one is opened with `TeiReader`
    :return: generator of dicts with the keys `title` (title taken from the
        document's metadata), `book` (text content of the `tei:bibl` with all
        whitespace runs collapsed to single blanks) and `id`
        (`<path>__<position of the bibl within its document>`)
    """
    bibl_xpath = './/tei:listBibl//tei:bibl'
    for path in files:
        tei_doc = TeiReader(path)
        doc_title = tei_doc.extract_md()['title']
        bibl_nodes = tei_doc.tree.xpath(bibl_xpath, namespaces=tei_doc.ns_tei)
        for position, bibl_node in enumerate(bibl_nodes):
            raw_text = "".join(bibl_node.itertext())
            yield {
                "title": doc_title,
                # split()/join collapses line breaks and indentation inside the entry
                "book": " ".join(raw_text.split()),
                "id": f"{path}__{position}",
            }

In [None]:
# Materialise all extracted bibl records into one table (columns: title, book, id).
df = pd.DataFrame(yield_items(files))

In [None]:
# Write the input file for csvdedupe.
# index=False: the positional index carries no information and would otherwise
# travel through csvdedupe and come back as an "Unnamed: 0" column.
df.to_csv('bibls.csv', index=False)

## run csvdedupe cmd-tool

```shell
csvdedupe bibls.csv --field_names book --output_file output.csv
```

* use the result (saved as output.csv) for any further processing
* read output.csv into a `pandas.DataFrame`
* group rows (i.e. bibl entries) by `Cluster ID` (created by dedupe)
* for each group write first row as `tei:bibl` node into a `tei:listBibl`

In [None]:
# Load the csvdedupe result; the following cells read its 'book', 'id' and 'Cluster ID' columns.
deduped = pd.read_csv('output.csv')

In [None]:
# Load the small German spaCy model and register the language detector as the
# last pipeline component; get_lang() reads its result from doc._.language.
# NOTE(review): passing a component instance together with name= looks like the
# spaCy 2.x add_pipe API - confirm that the installed spaCy version accepts it.
nlp = spacy.load('de_core_news_sm')
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

In [None]:
def get_lang(text):
    """Return the language code detected for `text`.

    The text is run through the module-level `nlp` pipeline and the
    'language' entry of `doc._.language` is returned.

    :param text: the string to analyse
    :return: the detected language code, or '' if detection failed for any
        reason (the error is printed, not raised)
    """
    try:
        return nlp(text)._.language['language']
    except Exception as err:
        # best effort: one undetectable entry must not abort the whole run
        print(err)
        return ''

In [None]:
# Take the first four-digit number starting with 1 or 2 as the publication year.
# - [12] instead of [1|2]: inside a character class '|' is a literal, so the old
#   pattern also accepted e.g. "|999"
# - raw string: '\d' in a normal string literal is an invalid escape sequence
# - expand=False: a single capture group then yields a Series, which is what a
#   column assignment expects
deduped['date'] = deduped['book'].str.extract(r'([12]\d{3})', expand=False)

In [None]:
# Detect the language of every bibliographic entry.
# Only the 'book' column is needed, so map over that column directly instead of
# building a row object for every record via DataFrame.apply(axis=1).
deduped['lang'] = deduped['book'].map(get_lang)

In [None]:
# Persist the enriched table (adds 'date' and 'lang' to the csvdedupe output).
# index=False keeps the positional index out of the file, so reading it back
# does not add another "Unnamed: 0" column.
deduped.to_csv('bibls_deduped_enriched.csv', index=False)

In [None]:
# Create the TEI skeleton that will hold the generated bibliography.
# The result is parsed with ET.fromstring() further down, so it is presumably
# a serialised XML document - confirm against teipy.
tei_stump = TeiReader.tei_stump(
    title="Literaturverzeichnis",
    source_desc="Generiert mit dedupe_bibls"
)

In [None]:
# Reload the enriched table from disk so the cells below can be run without
# repeating the (slow) language detection.
# NOTE(review): read_csv infers dtypes again, so 'date' may come back numeric
# and empty 'lang' values as NaN - confirm the consuming code copes with that.
deduped = pd.read_csv('bibls_deduped_enriched.csv')

In [None]:
# Visual sanity check: show the last rows of the reloaded table.
deduped.tail()

In [None]:
# Build one tei:bibl per dedupe cluster and collect them in a tei:listBibl that
# is appended to the tei:body of the stump document.
# Module-level results: `root` (the whole document) and `list_bibl`.
TEI_NS = "http://www.tei-c.org/ns/1.0"
XML_NS = "http://www.w3.org/XML/1998/namespace"

root = ET.fromstring(tei_stump)
bibl_root = root.find('.//tei:body', namespaces={'tei': TEI_NS})
list_bibl = ET.Element(f"{{{TEI_NS}}}listBibl")
bibl_root.append(list_bibl)
for _, cluster_rows in deduped.groupby('Cluster ID'):
    # the first row of a cluster represents the whole cluster
    first = cluster_rows.iloc[0]

    bibl = ET.Element(f"{{{TEI_NS}}}bibl")
    bibl.attrib[f"{{{XML_NS}}}id"] = f"bibl__{first['Cluster ID']}"
    # An undetected language is stored as '' but is read back from CSV as NaN;
    # lxml only accepts strings as attribute values, so map it back to ''.
    lang = first['lang']
    bibl.attrib[f"{{{XML_NS}}}lang"] = lang if isinstance(lang, str) else ''
    bibl.text = str(first['book'])

    # After the CSV round trip the year may be a str ('1999'), an int, a float
    # (1999.0) or NaN; normalise it to a plain int, or None when there is none.
    try:
        year = int(float(first['date']))
    except (TypeError, ValueError):
        year = None
    if year:
        date_node = ET.Element(f"{{{TEI_NS}}}date")
        # lxml requires str for text and attribute values
        date_node.text = str(year)
        date_node.attrib["when"] = f"{year}-01-01"
        bibl.append(date_node)
    list_bibl.append(bibl)
    

Save the generated XML as `listbibl.xml` in the `../indices` folder.

In [None]:
# Target path of the generated bibliography inside the ../indices folder.
file = os.path.join('../indices', 'listbibl.xml')

In [None]:
# Serialise the document to pretty-printed UTF-8 bytes, then write them to the target file.
xml_bytes = ET.tostring(root, pretty_print=True, encoding='utf-8')
with open(file, 'wb') as out:
    out.write(xml_bytes)

## write @ref attributes into bibl items

In [None]:
# For every row of the deduplicated table, open the TEI document the entry came
# from, move the child elements of the matching tei:bibl into a new
# <rs type="bibl" ref="#bibl__<Cluster ID>"> and write the document back.
# NOTE(review): only child elements are moved; the leading text of the tei:bibl
# stays where it is. Running this cell again adds another tei:rs layer.
TEI_RS = "{http://www.tei-c.org/ns/1.0}rs"
for _, row in deduped.iterrows():
    # ids look like "<path to TEI file>__<position of the bibl in that file>";
    # split at the last "__" so the path itself may contain any characters
    tei_path, sep, node_idx = str(row['id']).rpartition('__')
    if not sep or not node_idx.isdigit():
        print(f"skipping row with unexpected id {row['id']!r}")
        continue
    item_id = f"#bibl__{row['Cluster ID']}"
    # tei_path (not `file`) so the path of listbibl.xml is not overwritten
    try:
        tei_doc = TeiReader(tei_path)
    except Exception as err:
        # keep going, but say which document was skipped and why
        print(f"skipping {tei_path}: {err}")
        continue
    bibl_nodes = tei_doc.tree.xpath('.//tei:listBibl//tei:bibl', namespaces=tei_doc.ns_tei)
    if int(node_idx) >= len(bibl_nodes):
        print(f"skipping {row['id']!r}: {tei_path} has only {len(bibl_nodes)} bibl elements")
        continue
    bibl_node = bibl_nodes[int(node_idx)]
    rs_node = ET.Element(TEI_RS)
    rs_node.attrib["type"] = 'bibl'
    rs_node.attrib["ref"] = item_id
    # iterate over a snapshot: the children are detached from bibl_node in the loop
    for child in list(bibl_node):
        bibl_node.remove(child)
        rs_node.append(child)
    bibl_node.append(rs_node)
    # the reference lives on the tei:rs, drop any ref left on the tei:bibl
    bibl_node.attrib.pop("ref", None)
    tei_doc.tree_to_file(tei_path)