In [1]:
import warnings

import pandas as pd
import rubrix as rb
import spacy
from datasets import Dataset
from sklearn.model_selection import train_test_split
from spacy.tokens import DocBin
from tqdm import tqdm


# Fetch the records of the annotation dataset that match the "Validated" status
# query from Rubrix and preview the first rows.
# Loading reference: https://github.com/recognai/rubrix#3-load-and-create-a-training-set
validated_query = "status:Validated"
dataset_rb = rb.load('chronik_annotations', query=validated_query)
dataset_rb.head()



Unnamed: 0,text,tokens,prediction,prediction_agent,annotation,annotation_agent,id,metadata,status,event_timestamp,metrics,search_keywords
0,Teestube - In der Pestalozzistraße 20 Rgb. erö...,"[Teestube, -, In, der, Pestalozzistraße, 20, R...",,,"[(LOC, 0, 8), (ADR, 18, 37), (ADR, 104, 121), ...",rubrix,00f2ae40-e155-4f98-adaa-61d1db5a08ff,"{'date': '22. Juni 1974', 'year': 1974, 'id': ...",Validated,,"{'text_length': 350, 'tokens': [{'idx': 0, 'va...",
1,"Adelheid Lissmann - Adelheid Lissmann, geb. 19...","[Adelheid, Lissmann, -, Adelheid, Lissmann, ,,...",,,"[(PER, 0, 17), (ORG, 74, 95)]",rubrix,01a4910f-1088-413a-8879-55f06ded5d20,"{'date': '1946', 'year': 1946, 'id': 12}",Validated,,"{'text_length': 192, 'tokens': [{'idx': 0, 'va...",
2,Beschlagnahmen - Beschlagnahme der Blätter für...,"[Beschlagnahmen, -, Beschlagnahme, der, Blätte...",,,"[(PUBLICATION, 35, 60), (PUBLICATION, 71, 90)]",rubrix,029c88d4-bda1-4e3c-af72-85e516588301,"{'date': '1925', 'year': 1925, 'id': 28}",Validated,,"{'text_length': 266, 'tokens': [{'idx': 0, 'va...",
3,Lesbenfrühlings­treffen - Das Lesbenfrühlingst...,"[Lesbenfrühlings­treffen, -, Das, Lesbenfrühli...",,,"[(EVENT, 0, 23), (EVENT, 30, 52), (EVENT, 61, ...",rubrix,02c3041b-6122-4f61-aa1e-26001b2cc865,"{'date': '24. – 27. Mai 1996', 'year': 1996, '...",Validated,,"{'text_length': 210, 'tokens': [{'idx': 0, 'va...",
4,Moby Dyke - Die Kunstaktion „Moby Dyke Lesbian...,"[Moby, Dyke, -, Die, Kunstaktion, „, Moby, Dyk...",,,"[(PER, 56, 70), (PER, 75, 93), (LOC, 118, 138)...",rubrix,03bf56b1-0857-47fc-9588-5caf5929847e,"{'date': '21. – 22. August 2015', 'year': 2015...",Validated,,"{'text_length': 193, 'tokens': [{'idx': 0, 'va...",


## Preprocessing

The data needs two steps of preprocessing:

1. Convert `rubrix` to `spacy` format for annotations, because `rubrix` exports the annotated information as `(label, start, end)`, but `spacy` needs `(start, end, label)`. See: `convert_rubrix_to_spacy()`.
2. Convert the annotations to a `DocBin` (see `create_doc_bin()`) and save it to disk.


[More information in the official documentation](https://spacy.io/usage/training#training-data):

> For example, if you’re creating an NER pipeline, loading your annotations and setting them as the .ents property on a Doc is all you need to worry about.

In [2]:
def convert_rubrix_to_spacy(rubrix_name:str, query:str, filter_labels:bool, translate_labels:bool, keep_label=None, label_dict=None):
    """Load annotated records from Rubrix and convert them to spaCy's entity format.

    Rubrix exports every annotated span as ``(label, start, end)``; spaCy expects
    ``(start, end, label)``. Labels are translated first (if requested) and
    filtered afterwards, so ``keep_label`` is matched against the *translated*
    label names.

    Args:
        rubrix_name (str): name of the dataset in the Rubrix web app
        query (str): Rubrix query, typically `"status:Validated"` for manually labeled data
        filter_labels (bool): if True, keep only entities whose label is in ``keep_label``
        translate_labels (bool): if True, rename labels according to ``label_dict``,
            e.g. to merge rare labels and get more items per label
        keep_label (list, optional): labels to keep, e.g. ``["PER", "LOC", "EVENT"]``.
            If omitted, a module-level variable called ``keep_label`` is used.
        label_dict (dict, optional): mapping from original to new label, e.g.
            ``{"CLUB": "ORG", "PARTY": "EVENT"}``. Labels that are not a key stay
            unchanged. If omitted, a module-level variable called ``label_dict`` is used.

    Returns:
        list: one item per record, each of the form
            ``[(text, [(start, end, label), ...])]`` (the shape ``create_doc_bin()`` reads).

    Raises:
        ValueError: if filtering/translation is requested but neither the argument
            nor the module-level variable is available.
    """
    # Backward compatibility: earlier callers defined `keep_label` / `label_dict`
    # as notebook globals instead of passing them in.
    if filter_labels and keep_label is None:
        keep_label = globals().get("keep_label")
        if keep_label is None:
            raise ValueError("filter_labels=True requires `keep_label` (argument or module-level list)")
    if translate_labels and label_dict is None:
        label_dict = globals().get("label_dict")
        if label_dict is None:
            raise ValueError("translate_labels=True requires `label_dict` (argument or module-level dict)")

    # load rubrix dataset
    labeled_data = rb.load(rubrix_name, query=query)

    training_data = []
    for text, spans in zip(labeled_data.text, labeled_data.annotation):
        entities = []
        # A missing annotation is treated as "no entities".
        # NOTE(review): assumes missing annotations arrive as None - confirm against rubrix.
        for span in (spans if spans is not None else []):
            label, start, end = span[0], span[1], span[2]

            if translate_labels:
                # Exact lookup: a substring replace would also rewrite labels that
                # merely contain a key (e.g. "CLUBHOUSE") and would chain mappings.
                label = label_dict.get(label, label)

            if filter_labels and label not in keep_label:
                continue

            # switch position: (label, start, end) -> (start, end, label)
            entities.append((start, end, label))

        # Each record is wrapped in a single-element list; create_doc_bin() relies on it.
        training_data.append([(text, entities)])
    return training_data


def create_doc_bin(data: list, lang: str):
    """Create a spaCy DocBin from training/labeled data.

    Args:
        data (list): labeled records as returned by ``convert_rubrix_to_spacy()``.
            Each record is a list of ``(text, entities)`` pairs (normally exactly
            one), where ``entities`` is a list of ``(start, end, label)`` character
            offsets into ``text``. Only the entities of the first pair are used.
        lang (str): spaCy language code, "en" for English, "de" for German

    Returns:
        DocBin: one ``Doc`` per record with its entity spans set, ready for
            ``.to_disk(...)``.
    """

    nlp = spacy.blank(lang)

    # the DocBin will store the documents
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])

    misaligned = []

    for record in tqdm(data):

        # a record is a list of (text, entities) pairs; join the text parts into one string
        text = " ".join(str(pair[0]) for pair in record)
        doc = nlp(text)

        ents = []
        for annotation in record[0][1]:
            start, end, label = annotation[0], annotation[1], annotation[2]
            span = doc.char_span(start, end, label=label)
            if span is None:
                # char_span() returns None when the offsets do not line up with
                # token boundaries; assigning None to doc.ents would fail, so the
                # span is skipped and reported once the loop is done.
                misaligned.append((text[start:end], start, end, label))
                continue
            ents.append(span)
        doc.ents = ents
        doc_bin.add(doc)

    if misaligned:
        warnings.warn(
            f"Skipped {len(misaligned)} entity span(s) that do not align with token "
            f"boundaries, e.g. {misaligned[:5]}"
        )
    return doc_bin

Run the preprocessing and split the data into a training set and a test set (called `dev` here).

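The manually annotated data has too many labels for training: `"PER", "LOC", "EVENT", "LAW", "ORG", "CLUB", "CITY", "COUNTRY", "MOVEMENT", "DATE", "ADR", "SLOGAN", "PUBLICATION", "PARTY", "AWARD"`.
So I merge some rare labels into broader ones (see `label_dict`) and keep only a subset of the labels (see `keep_label`); entities with any other label, such as `DATE`, `ADR`, `SLOGAN` and `PUBLICATION`, are dropped.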

In [3]:
# Labels that survive filtering. convert_rubrix_to_spacy() translates labels
# before it filters, so entries that are also keys of label_dict are only
# compared against already-translated names.
keep_label = [
    "PER", "LOC", "EVENT", "LAW", "ORG", "CLUB",
    "CITY", "COUNTRY", "MOVEMENT", "PARTY", "AWARD",
]

# Merge rare labels into broader ones to get more examples per label.
label_dict = {
    "CLUB": "ORG",
    "PARTY": "ORG",
    "MOVEMENT": "EVENT",
    "AWARD": "EVENT",
}

labeled_data = convert_rubrix_to_spacy(
    rubrix_name="chronik_annotations",
    query="status:Validated",
    filter_labels=True,
    translate_labels=True,
)

Split data and save in binary format:

In [4]:
# Hold out 20 % of the records as the dev set. The fixed random_state makes the
# split - and with it the saved files and all later scores - reproducible
# across kernel restarts.
train, dev = train_test_split(labeled_data, test_size=0.2, random_state=42)
create_doc_bin(train, "de").to_disk("data/train.spacy")
create_doc_bin(dev, "de").to_disk("data/dev.spacy")

100%|██████████| 186/186 [00:00<00:00, 1893.86it/s]
100%|██████████| 47/47 [00:00<00:00, 1619.24it/s]


## Use a spacy config file
- https://ner.pythonhumanities.com/03_02_train_spacy_ner_model.html#what-is-the-spacy-config-cfg-file-and-how-do-i-create-it
- base config file from: https://github.com/wjbmattingly/holocaust_ner_lessons/blob/main/data/config.cfg
- Spacy Documentation about projects https://explosion.ai/blog/spacy-v3-project-config-systems

In [5]:
# create base config
!python3 -m spacy init config --pipeline ner data/base-config.cfg


[38;5;1m✘ The provided output file already exists. To force overwriting the
config file, set the --force or -F flag.[0m



In [6]:
# fill the base config with default values for all settings it leaves out
# and save the complete config as data/config.cfg
!python3 -m spacy init fill-config data/base-config.cfg data/config.cfg

[38;5;3m⚠ Nothing to auto-fill: base config is already complete[0m
[38;5;2m✔ Saved config[0m
data/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


## Validate training data 

based on config file

In [7]:
# check that the pipeline can be initialized with the corpus referenced by the
# config, and print corpus and label statistics
!python3 -m spacy debug data data/config.cfg

[1m
[38;5;2m✔ Pipeline can be initialized with data[0m
[38;5;2m✔ Corpus is loadable[0m
[1m
Language: de
Training pipeline: tok2vec, ner
186 training docs
47 evaluation docs
[38;5;2m✔ No overlap between training and evaluation data[0m
[38;5;3m⚠ Low number of examples to train a new pipeline (186)[0m
[1m
[38;5;4mℹ 8124 total word(s) in the data (2554 unique)[0m
[38;5;4mℹ No word vectors present in the package[0m
[1m
[38;5;4mℹ 7 label(s)[0m
0 missing value(s) (tokens with '-' label)
[38;5;3m⚠ Low number of examples for label 'LAW' (42)[0m
[2K[38;5;3m⚠ Low number of examples for label 'CITY' (35)[0m
[2K[38;5;3m⚠ Low number of examples for label 'COUNTRY' (25)[0m
[2Kalyzing label distribution...[38;5;2m✔ Examples without occurrences available for all labels[0m
[38;5;2m✔ No entities consisting of or starting/ending with whitespace[0m
[38;5;2m✔ No entities crossing sentence boundaries[0m
[1m
[38;5;2m✔ 6 checks passed[0m


## Train model

In [8]:
# train the pipeline described in data/config.cfg; results are saved to ./models/output
!python3 -m spacy train data/config.cfg --output ./models/output

[38;5;4mℹ Saving to output directory: models/output[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-04-14 12:00:24,105] [INFO] Set up nlp object from config
[2022-04-14 12:00:24,110] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-04-14 12:00:24,112] [INFO] Created vocabulary
[2022-04-14 12:00:24,112] [INFO] Finished initializing nlp object
[2022-04-14 12:00:24,487] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     28.67    0.00    0.00    0.00    0.00
  2     200        233.66   2644.35   26.83   41.25   19.88    0.27


In [None]:
# Smoke test: run the best checkpoint on a single chronicle entry and list
# every entity it finds together with its label.
trained_nlp = spacy.load("models/output/model-best")
text = 'Die Rosa Liste öffnet sich den Lesben: „Rosa Liste – jetzt lesbisch-schwul?“, eine Veranstaltung organisiert vom AK Uferlos. In der folgenden Stadtratswahl 1994 treten sowohl schwule als auch lesbische KandidatInnen an; Marion Hölczl war bereits ab 1992 Rosa-Liste-Bezirksrätin in Altstadt-Lehel.'
doc = trained_nlp(text)

if doc.ents:
    for ent in doc.ents:
        print(ent.text, ent.label_)
else:
    print("No entities found.")

Rosa Liste PARTY
