In [6]:
from datasets import Dataset
import rubrix as rb
import pandas as pd
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

from sklearn.model_selection import train_test_split

# Preview the validated annotations stored in rubrix. The text input and the
# annotated label are the parts used later to build the training set, see
# https://github.com/recognai/rubrix#3-load-and-create-a-training-set
RUBRIX_DATASET = "chronik_annotations"
VALIDATED_QUERY = "status:Validated"

dataset_rb = rb.load(RUBRIX_DATASET, query=VALIDATED_QUERY)
dataset_rb.head()

Unnamed: 0,text,tokens,prediction,prediction_agent,annotation,annotation_agent,id,metadata,status,event_timestamp,metrics,search_keywords
0,Teestube - In der Pestalozzistraße 20 Rgb. erö...,"[Teestube, -, In, der, Pestalozzistraße, 20, R...",,,"[(LOC, 0, 8), (ADR, 18, 37), (ADR, 104, 121), ...",rubrix,00f2ae40-e155-4f98-adaa-61d1db5a08ff,"{'date': '22. Juni 1974', 'year': 1974, 'id': ...",Validated,,"{'text_length': 350, 'tokens': [{'idx': 0, 'va...",
1,"Adelheid Lissmann - Adelheid Lissmann, geb. 19...","[Adelheid, Lissmann, -, Adelheid, Lissmann, ,,...",,,"[(PER, 0, 17), (ORG, 74, 95)]",rubrix,01a4910f-1088-413a-8879-55f06ded5d20,"{'date': '1946', 'year': 1946, 'id': 12}",Validated,,"{'text_length': 192, 'tokens': [{'idx': 0, 'va...",
2,Beschlagnahmen - Beschlagnahme der Blätter für...,"[Beschlagnahmen, -, Beschlagnahme, der, Blätte...",,,"[(PUBLICATION, 35, 60), (PUBLICATION, 71, 90)]",rubrix,029c88d4-bda1-4e3c-af72-85e516588301,"{'date': '1925', 'year': 1925, 'id': 28}",Validated,,"{'text_length': 266, 'tokens': [{'idx': 0, 'va...",
3,Lesbenfrühlings­treffen - Das Lesbenfrühlingst...,"[Lesbenfrühlings­treffen, -, Das, Lesbenfrühli...",,,"[(EVENT, 0, 23), (EVENT, 30, 52), (EVENT, 61, ...",rubrix,02c3041b-6122-4f61-aa1e-26001b2cc865,"{'date': '24. – 27. Mai 1996', 'year': 1996, '...",Validated,,"{'text_length': 210, 'tokens': [{'idx': 0, 'va...",
4,Moby Dyke - Die Kunstaktion „Moby Dyke Lesbian...,"[Moby, Dyke, -, Die, Kunstaktion, „, Moby, Dyk...",,,"[(PER, 56, 70), (PER, 75, 93), (LOC, 118, 138)...",rubrix,03bf56b1-0857-47fc-9588-5caf5929847e,"{'date': '21. – 22. August 2015', 'year': 2015...",Validated,,"{'text_length': 193, 'tokens': [{'idx': 0, 'va...",


## Preprocessing

The data needs two steps of preprocessing:

1. Convert `rubrix` to `spacy` format for annotations, because `rubrix` exports the annotated information as `(label, start, end)`, but `spacy` needs `(start, end, label)`. See: `convert_rubrix_to_spacy()`.
2. Convert annotations to a `DocBin` and save file to disk. See `create_doc_bin()`


[More information in the official spaCy documentation](https://spacy.io/usage/training#training-data):

> For example, if you’re creating an NER pipeline, loading your annotations and setting them as the .ents property on a Doc is all you need to worry about.

In [7]:
def convert_rubrix_to_spacy(rubrix_name:str, query:str):
    """
    Import annotated data from rubrix and transform it to spaCy flavour.

    rubrix exports each annotation as ``(label, start, end)``; the result
    uses the ``(start, end, label)`` order instead.

    @rubrix_name name of rubrix dataset to import
    @query query rubrix data, typically "status:Validated"
    @return list with one entry per record; every entry is a one-element
            list ``[(text, [(start, end, label), ...])]``
    """
    # NOTE(review): assumes rb.load returns a table-like object exposing
    # `text` and `annotation` columns (as the DataFrame preview suggests).
    labeled_data = rb.load(rubrix_name, query=query)

    training_data = []

    for text, annotations in zip(labeled_data.text, labeled_data.annotation):
        # A record without annotations may carry None (or NaN once it went
        # through pandas); treat it as "no entities" instead of crashing.
        if annotations is None or isinstance(annotations, float):
            annotations = []

        # switch position: (label, start, end) -> (start, end, label)
        entities = [
            (annotation[1], annotation[2], annotation[0])
            for annotation in annotations
        ]

        # Each record stays wrapped in a one-element list because
        # create_doc_bin() expects exactly this nesting.
        training_data.append([(text, entities)])
    return training_data


def create_doc_bin(data: list, lang: str):
    """
    Turn converted annotation records into a spaCy ``DocBin``.

    @data list of records as produced by convert_rubrix_to_spacy(): every
          record is a list of ``(text, [(start, end, label), ...])`` tuples
    @lang language code passed to ``spacy.blank`` (e.g. "de")
    @return DocBin holding one Doc per record with its entities set
    """
    import warnings

    nlp = spacy.blank(lang)

    # the DocBin will store the documents
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])

    skipped = 0
    for record in tqdm(data):

        # a record is a list of (text, entities) tuples; join the texts into
        # one string for the tokenizer
        text = " ".join(str(item[0]) for item in record)
        doc = nlp(text)

        # only the entities of the first tuple are used; an empty record
        # simply yields a Doc without entities
        annotations = record[0][1] if record else []
        ents = []

        for annotation in annotations:
            # add start, end and label as spans
            start = annotation[0]
            end = annotation[1]
            label = annotation[2]
            span = doc.char_span(start, end, label=label)
            # char_span returns None when the character offsets do not line
            # up with token boundaries; assigning None to doc.ents raises,
            # so such annotations are dropped and reported below.
            if span is None:
                skipped += 1
                continue
            ents.append(span)
        doc.ents = ents
        doc_bin.add(doc)

    if skipped:
        warnings.warn(
            f"Skipped {skipped} annotation(s) whose character offsets "
            "do not align with token boundaries."
        )
    return doc_bin

Run the preprocessing and split the data into a training set and an evaluation set (the latter is called `dev` here).

In [8]:
# Fixed seed so the train/dev split is the same on every run of the notebook;
# without it the evaluation scores are not comparable between runs.
SPLIT_SEED = 42

labeled_data = convert_rubrix_to_spacy(rubrix_name="chronik_annotations", query="status:Validated")

# 80 % of the records go to training, 20 % to evaluation ("dev")
train, dev = train_test_split(labeled_data, test_size=0.2, random_state=SPLIT_SEED)
create_doc_bin(train, "de").to_disk("data/train.spacy")
create_doc_bin(dev, "de").to_disk("data/dev.spacy")

100%|██████████| 105/105 [00:00<00:00, 1765.29it/s]
100%|██████████| 27/27 [00:00<00:00, 1622.11it/s]


## Use a spacy config file
- https://ner.pythonhumanities.com/03_02_train_spacy_ner_model.html#what-is-the-spacy-config-cfg-file-and-how-do-i-create-it
- base config file from: https://github.com/wjbmattingly/holocaust_ner_lessons/blob/main/data/config.cfg
- spaCy blog post about the v3 project and config systems: https://explosion.ai/blog/spacy-v3-project-config-systems

In [13]:
# create base config
!python3 -m spacy init config --pipeline ner data/base-config.cfg

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
data/base-config.cfg
You can now add your data and train your pipeline:
python -m spacy train base-config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [14]:
# Fill any settings missing from the base config with default values and
# write the result to data/config.cfg, the file used by the following cells.
!python3 -m spacy init fill-config data/base-config.cfg data/config.cfg

[38;5;3m⚠ Nothing to auto-fill: base config is already complete[0m
[38;5;2m✔ Saved config[0m
data/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


Validate training data based on config file:

In [16]:
# check data 
!python3 -m spacy debug data data/config.cfg

[1m
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.9/3.9.10/Frameworks/Python.framework/Versions/3.9/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/homebrew/Cellar/python@3.9/3.9.10/Frameworks/Python.framework/Versions/3.9/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/Users/kabr/code/remove-na-lgbtiq-queer-knowledge-graph/venv/lib/python3.9/site-packages/spacy/__main__.py", line 4, in <module>
    setup_cli()
  File "/Users/kabr/code/remove-na-lgbtiq-queer-knowledge-graph/venv/lib/python3.9/site-packages/spacy/cli/_util.py", line 71, in setup_cli
    command(prog_name=COMMAND)
  File "/Users/kabr/code/remove-na-lgbtiq-queer-knowledge-graph/venv/lib/python3.9/site-packages/click/core.py", line 1128, in __call__
    return self.main(*args, **kwargs)
  File "/Users/kabr/code/remove-na-lgbtiq-queer-knowledge-graph/venv/lib/python3.9/site-packages/click/cor

## Train model

In [None]:
!python3 -m spacy train data/config.cfg --output ./models/output

In [None]:
# Load the best checkpoint written by the training run and try it on a
# sample text.
trained_nlp = spacy.load("models/output/model-best")
text = 'Die Rosa Liste öffnet sich den Lesben: „Rosa Liste – jetzt lesbisch-schwul?“, eine Veranstaltung organisiert vom AK Uferlos. In der folgenden Stadtratswahl 1994 treten sowohl schwule als auch lesbische KandidatInnen an; Marion Hölczl war bereits ab 1992 Rosa-Liste-Bezirksrätin in Altstadt-Lehel.'
doc = trained_nlp(text)

# Print every recognised entity with its label, or a note when there is none.
if doc.ents:
    for entity in doc.ents:
        print(entity.text, entity.label_)
else:
    print("No entities found.")