# Train NER with manually annotated data

## Import data from rubrix

I manually annotated data in rubrix, `status:Validated`:

In [1]:
from datasets import Dataset
import rubrix as rb
import pandas as pd
import spacy
from tqdm import tqdm

# load rubrix dataset
dataset_rb = rb.load('chronik_annotations', query="status:Validated")
dataset_rb.head()




Unnamed: 0,text,tokens,prediction,prediction_agent,annotation,annotation_agent,id,metadata,status,event_timestamp,metrics,search_keywords
0,Teestube - In der Pestalozzistraße 20 Rgb. erö...,"[Teestube, -, In, der, Pestalozzistraße, 20, R...",,,"[(LOC, 0, 8), (ADR, 18, 37), (ADR, 104, 121), ...",rubrix,00f2ae40-e155-4f98-adaa-61d1db5a08ff,"{'date': '22. Juni 1974', 'year': 1974, 'id': ...",Validated,,"{'text_length': 350, 'tokens': [{'idx': 0, 'va...",
1,"Adelheid Lissmann - Adelheid Lissmann, geb. 19...","[Adelheid, Lissmann, -, Adelheid, Lissmann, ,,...",,,"[(PER, 0, 17), (ORG, 74, 95)]",rubrix,01a4910f-1088-413a-8879-55f06ded5d20,"{'date': '1946', 'year': 1946, 'id': 12}",Validated,,"{'text_length': 192, 'tokens': [{'idx': 0, 'va...",
2,Beschlagnahmen - Beschlagnahme der Blätter für...,"[Beschlagnahmen, -, Beschlagnahme, der, Blätte...",,,"[(PUBLICATION, 35, 60), (PUBLICATION, 71, 90)]",rubrix,029c88d4-bda1-4e3c-af72-85e516588301,"{'date': '1925', 'year': 1925, 'id': 28}",Validated,,"{'text_length': 266, 'tokens': [{'idx': 0, 'va...",
3,Lesbenfrühlings­treffen - Das Lesbenfrühlingst...,"[Lesbenfrühlings­treffen, -, Das, Lesbenfrühli...",,,"[(EVENT, 0, 23), (EVENT, 30, 52), (EVENT, 61, ...",rubrix,02c3041b-6122-4f61-aa1e-26001b2cc865,"{'date': '24. – 27. Mai 1996', 'year': 1996, '...",Validated,,"{'text_length': 210, 'tokens': [{'idx': 0, 'va...",
4,Moby Dyke - Die Kunstaktion „Moby Dyke Lesbian...,"[Moby, Dyke, -, Die, Kunstaktion, „, Moby, Dyk...",,,"[(PER, 56, 70), (PER, 75, 93), (LOC, 118, 138)...",rubrix,03bf56b1-0857-47fc-9588-5caf5929847e,"{'date': '21. – 22. August 2015', 'year': 2015...",Validated,,"{'text_length': 193, 'tokens': [{'idx': 0, 'va...",


In [2]:
# select text input and the annotated label
# https://github.com/recognai/rubrix#3-load-and-create-a-training-set
train_df = pd.DataFrame({
    "text": dataset_rb.text,
    "label": dataset_rb.annotation,
})
train_df.head()

Unnamed: 0,text,label
0,Teestube - In der Pestalozzistraße 20 Rgb. erö...,"[(LOC, 0, 8), (ADR, 18, 37), (ADR, 104, 121), ..."
1,"Adelheid Lissmann - Adelheid Lissmann, geb. 19...","[(PER, 0, 17), (ORG, 74, 95)]"
2,Beschlagnahmen - Beschlagnahme der Blätter für...,"[(PUBLICATION, 35, 60), (PUBLICATION, 71, 90)]"
3,Lesbenfrühlings­treffen - Das Lesbenfrühlingst...,"[(EVENT, 0, 23), (EVENT, 30, 52), (EVENT, 61, ..."
4,Moby Dyke - Die Kunstaktion „Moby Dyke Lesbian...,"[(PER, 56, 70), (PER, 75, 93), (LOC, 118, 138)..."


## Preparing Training data

### Transform rubrix format to spacy format

I need to do some minor data transformation, because rubrix exports the annotated information as `(label, start, end)`, but Spacy needs `(start, end, label)`.

So iterate over the data frame and change the order and save the data as TRAIN_DATA.

In [3]:

TRAIN_DATA = []

for record in train_df.index:
    entities = []
    text = train_df["text"][record]
    labels = train_df["label"][record]
    
    for label in labels:
        
        start = label[1]
        end = label[2]
        label = label[0]
        # switch position
        entity = text, label, start, end
        entities.append((start, end, label))

    TRAIN_DATA.append([(text, entities)])

# print(TRAIN_DATA)

### Create spacy Doc object

Prepare the data for training following the official tutorial: https://spacy.io/usage/training#training-data

> For example, if you’re creating an NER pipeline, loading your annotations and setting them as the .ents property on a Doc is all you need to worry about.

In [13]:
from spacy.tokens import DocBin

nlp = spacy.blank("de")

# the DocBin will store the example documents
doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])  # we're just concerned with NER

for record in tqdm(TRAIN_DATA):
    
    # text are class list, need to be transformed to character
    text = " ".join(map(str,[el[0] for el in record]))
    doc = nlp(text)

    annotations = [item[1] for item in record]
    # print("annotations:")
    # print(annotations)
    ents = []
    
    for annotation in annotations[0]:
        # add start, end and label as spans
        start = annotation[0]
        end = annotation[1]
        label = annotation[2]
        span = doc.char_span(start, end, label=label)
        
        ents.append(span)
    doc.ents = ents
    #print(doc.ents)
    doc_bin.add(doc)

# no worries, i know this is dumb... 
doc_bin.to_disk("data/train.spacy")
doc_bin.to_disk("data/valid.spacy")

100%|██████████| 132/132 [00:00<00:00, 1810.73it/s]


In [None]:
python -m spacy debug-data de ./train.spacy -p ner -b de_core_news_md

python3 -m spacy debug data data-gathering/named-entity-recognition/train.spacy
python3 -m spacy debug data ./train.spacy

## Use a spacy config file
- https://ner.pythonhumanities.com/03_02_train_spacy_ner_model.html#what-is-the-spacy-config-cfg-file-and-how-do-i-create-it
- base config file from: https://github.com/wjbmattingly/holocaust_ner_lessons/blob/main/data/config.cfg

In [9]:
!python3 -m spacy init fill-config data/spacy-base-config.cfg data/config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
data/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


Validate training data based on config file:

In [14]:
# check data 
!python3 -m spacy debug-data data/config.cfg

[38;5;3m⚠ The debug-data command is now available via the 'debug data'
subcommand (without the hyphen). You can run python -m spacy debug --help for an
overview of the other available debugging commands.[0m
[1m
[38;5;2m✔ Pipeline can be initialized with data[0m
[38;5;2m✔ Corpus is loadable[0m
[1m
Language: de
Training pipeline: tok2vec, ner
132 training docs
132 evaluation docs
[38;5;3m⚠ 132 training examples also in evaluation data[0m
[38;5;3m⚠ Low number of examples to train a new pipeline (132)[0m
[1m
[38;5;4mℹ 5900 total word(s) in the data (2019 unique)[0m
[38;5;4mℹ No word vectors present in the package[0m
[1m
[38;5;4mℹ 15 label(s)[0m
0 missing value(s) (tokens with '-' label)
[38;5;3m⚠ Low number of examples for label 'Slogan' (7)[0m
[2K[38;5;3m⚠ Low number of examples for label 'EVENT' (43)[0m
[2K[38;5;3m⚠ Low number of examples for label 'CITY' (25)[0m
[2K[38;5;3m⚠ Low number of examples for label 'DATE' (22)[0m
[2K[38;5;3m⚠ Low number of examp

## Train model

In [15]:
!python3 -m spacy train data/config.cfg --output ./models/output

[38;5;2m✔ Created output directory: models/output[0m
[38;5;4mℹ Saving to output directory: models/output[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-04-13 17:37:32,419] [INFO] Set up nlp object from config
[2022-04-13 17:37:32,424] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-04-13 17:37:32,426] [INFO] Created vocabulary
[2022-04-13 17:37:32,426] [INFO] Finished initializing nlp object
[2022-04-13 17:37:32,744] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     27.74    0.81    0.44    5.06    0.01
  2     200        764.60   2841.59   40.31   45.79   35.99    0.40
  6     400       2359.37   2282.67   76.03   78.47   73.74    0.76
 11     600       1550.19   1398.56   94.04   94.50   93.58    0

In [20]:
trained_nlp = spacy.load("models/output/model-best")
text = 'Die Rosa Liste öffnet sich den Lesben: „Rosa Liste – jetzt lesbisch-schwul?“, eine Veranstaltung organisiert vom AK Uferlos. In der folgenden Stadtratswahl 1994 treten sowohl schwule als auch lesbische KandidatInnen an; Marion Hölczl war bereits ab 1992 Rosa-Liste-Bezirksrätin in Altstadt-Lehel.'
doc = trained_nlp(text)

for ent in doc.ents:
    print (ent.text, ent.label_)
if len(doc.ents) == 0:
    print ("No entities found.")

Rosa Liste PARTY
AK Uferlos ORG
Marion Hölczl EVENT
Rosa-Liste-Bezirksrätin EVENT
Altstadt-Lehel. ORG
