# Extract NER training samples from TEI files fetched from a GitHub repository

In [1]:
from github import Github
import json
import os
import acdh_tei_pyutils
from acdh_tei_pyutils.tei import TeiReader

In [2]:
# Path to the clean_markup.xsl stylesheet located in the "files" folder of the
# installed acdh_tei_pyutils package; it is passed to TeiReader as `xsl` below.
clean_markup = os.path.join(
    acdh_tei_pyutils.__path__[0],
    'files',
    'clean_markup.xsl',
)

In [3]:
# GitHub API client created without a token or any other credentials.
g = Github()

In [4]:
# Repository that holds the annotated TEI source files.
repo = g.get_repo('bleierr/NERDPool')

In [5]:
# Entries of the "RTA_1576" folder; iterated file by file in the next cell.
contents = repo.get_contents("RTA_1576")

In [6]:
# Fetch every TEI file of the repository folder, clean its markup with the
# XSLT stylesheet and write one JSON object per line to sample.jsonl:
#     {"text": ..., "entities": ...}
# The cell that reads sample.jsonl unpacks each entity as (start, end, label).
# NOTE: the file is opened with the platform's default text encoding, the same
# way the reading cell opens it; keep both in sync if an encoding is ever set.
with open("sample.jsonl", "w") as a_file:
    for x in contents:
        # Public PyGithub attribute instead of the private `_rawData` dict.
        dl_url = x.download_url
        if not dl_url:
            # Entries without a download URL (e.g. sub-directories) cannot be
            # parsed as TEI documents.
            continue
        print(dl_url)
        doc = TeiReader(xml=dl_url, xsl=clean_markup)
        # One result per <p> inside <body>; entities are all elements whose
        # name contains "Name", plus <date> and <time>.
        ne_list = doc.extract_ne_offsets(
            parent_nodes='.//tei:body//tei:p',
            ne_xpath=".//*[contains(name(), 'Name') or name()='date' or name()='time']"
        )
        for y in ne_list:
            item = {
                "text": y[0],
                "entities": y[1]['entities']
            }
            # Terminate (rather than prefix) each record with a newline so the
            # file does not start with an empty line.
            a_file.write(json.dumps(item, ensure_ascii=False))
            a_file.write("\n")

https://raw.githubusercontent.com/bleierr/NERDPool/main/RTA_1576/FR-Protokoll-Baden.xml
https://raw.githubusercontent.com/bleierr/NERDPool/main/RTA_1576/FR-Protokoll-Stra%C3%9Fburg.xml
https://raw.githubusercontent.com/bleierr/NERDPool/main/RTA_1576/FR-Protokoll-%C3%96sterreich-Juli.xml
https://raw.githubusercontent.com/bleierr/NERDPool/main/RTA_1576/Geheimer_Rat-Protokoll-Wien-HHStA.xml
https://raw.githubusercontent.com/bleierr/NERDPool/main/RTA_1576/HHStA%20Wien_AUR%201576_Oktober%2012_V2.xml
https://raw.githubusercontent.com/bleierr/NERDPool/main/RTA_1576/HStA_Dresden_Loc10199_4_fol11_1576-05-21.xml
https://raw.githubusercontent.com/bleierr/NERDPool/main/RTA_1576/HStA_Dresden_Loc10199_4_fol13_1576-05-25.xml
https://raw.githubusercontent.com/bleierr/NERDPool/main/RTA_1576/HStA_Dresden_Loc10199_4_fol159_1576-06-30.xml
https://raw.githubusercontent.com/bleierr/NERDPool/main/RTA_1576/HStA_Dresden_Loc10199_4_fol16_1576-05-23.xml
https://raw.githubusercontent.com/bleierr/NERDPool/main/RTA

In [10]:
import warnings
import spacy
from spacy.tokens import DocBin

In [15]:
# Convert sample.jsonl into spaCy's binary training format.
# Every parsed record becomes one Doc. Records are dealt out in a repeating
# cycle of SPLIT_CYCLE: the record at position EVAL_SLOT goes to dev.spacy,
# all others go to train.spacy.
SPLIT_CYCLE = 5  # length of the repeating train/eval pattern
EVAL_SLOT = 4    # 1-based position inside the cycle that is used for evaluation

nlp = spacy.blank('de')
db_train = DocBin()
db_eval = DocBin()
fails = 0               # entities whose character span matched no token span
skipped_docs = 0        # documents whose entity list was rejected by spaCy
train_eval = 0          # position of the current record inside the cycle
train_sample_count = 0
eval_sample_count = 0
with open('sample.jsonl') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines carry no record.
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            # Best effort: a malformed line is skipped, anything else
            # (including KeyboardInterrupt) is allowed to propagate.
            continue
        text = data['text']
        entities = data['entities']
        doc = nlp.make_doc(text)
        ents = []
        # Advance the cycle position: 1, 2, ..., SPLIT_CYCLE, 1, 2, ...
        train_eval = train_eval % SPLIT_CYCLE + 1
        for start, end, label in entities:
            span = doc.char_span(start, end, label=label)
            if span is None:
                # char_span() yields None when the offsets do not line up with
                # token boundaries; such entities are only counted.
                fails += 1
            else:
                ents.append(span)
        try:
            doc.ents = ents
        except ValueError:
            # spaCy refused the span list for this document; drop the whole
            # document but keep track of how often that happens.
            skipped_docs += 1
            continue
        if train_eval == EVAL_SLOT:
            db_eval.add(doc)
            eval_sample_count += 1
        else:
            db_train.add(doc)
            train_sample_count += 1
# Serialise only after the input file has been closed.
db_train.to_disk('train.spacy')
db_eval.to_disk('dev.spacy')
print(
    f"fails: {fails}; train: {train_sample_count}; eval: {eval_sample_count}"
)
print(f"skipped docs: {skipped_docs}")
        

fails: 189; train: 8330; eval: 2083


In [16]:
# Train a spaCy pipeline on train.spacy, evaluate on dev.spacy and store the results in ./output.
# NOTE(review): config.cfg is not created in any cell shown here - presumably it must already exist in the working directory; confirm.
! python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy

[38;5;4mℹ Using CPU[0m
[1m
[2021-03-17 12:51:04,146] [INFO] Set up nlp object from config
[2021-03-17 12:51:04,156] [INFO] Pipeline: ['tok2vec', 'ner']
[2021-03-17 12:51:04,160] [INFO] Created vocabulary
[2021-03-17 12:51:04,160] [INFO] Finished initializing nlp object
[2021-03-17 12:51:16,694] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     75.50    0.00    0.00    0.00    0.00
  0     200        348.11   1451.85   54.46   60.81   49.32    0.54
  0     400        122.47    735.54   62.56   70.94   55.95    0.63
  0     600      34278.71    767.23   68.69   82.81   58.69    0.69
  0     800       2756.13    800.79   72.25   73.86   70.71    0.72
  0    1000      48938.94   1015.72   75