In [1]:
# Run this once
import sys
import os

sys.path.append('/app/src')
sys.path.append('/app/src/data_access')

os.chdir('..')

%load_ext autoreload

# Annotation Pipeline
this notebook functions as pipeline to automatically annotate new letters. It uses a previously trained Flair model. The pipeline consists of the following steps:
- Configure pipeline, i.e. specify locations of text files, model and output
- Preprocess texts, i.e. split files in page files and remove hyphenation
- Automatically annotate named entities and store it in WebAnno format
- Map output to the Bookviewer JSON format

## Step 1: Configure pipeline
Specify locations, load necessry modules and iterate thorugh files

In [56]:
%autoreload 2
from webanno_tsv import Document, Token, Sentence as WebAnno_Sentence, Annotation, NO_LABEL_ID, webanno_tsv_write
from match_webanno_ocr import clean_ocr, OUTPUT_LAYERS, TARGET_LAYER, PAGE_SEP, TARGET_FIELD, sentence_tokenizer

from flair.models import SequenceTagger
from flair.data import Sentence as Flair_Sentence
from nltk.data import load as nltk_load
from nltk.tokenize import word_tokenize

model_path = 'resources/ner_models/ner-experiment_06_coarse/final-model.pt'
source_path = 'data/texts/test'
output_path = 'resources/'

IOB_INSIDE = 'I-'
IOB_OUTSIDE = 'B-'
IOB_NULL = 'O'

# load the NER tagger
tagger = SequenceTagger.load(model_path)    

ImportError: cannot import name '_anno_type' from 'webanno_tsv' (/app/src/data_access/webanno_tsv.py)

## Step 2-3: Preprocess and annotate 
Each file consists of multiple pages in the corpus. Before the automatic annotation preceeds the This step contains multiple sub steps:

In [62]:
for root, dirs, files in os.walk(os.path.abspath(source_path)):
    for file in files:
        with open(os.path.join(root, file), mode='r', encoding='utf-8') as f:
            pages = f.read().split(PAGE_SEP)
            for page_number, page_text in enumerate(pages):
                next_token_idx = 0
                next_label_idx = 1
                last_label_prefix = None
                doc = Document(OUTPUT_LAYERS)
                sentences = sentence_tokenizer.tokenize(clean_ocr(page_text), realign_boundaries=True)
                last_annotation = None
                for i, sentence_text in enumerate(sentences):
                    text = sentence_text.replace('\n', ' ')
                    flair_sentence = Flair_Sentence(text)
                    tagger.predict(flair_sentence)
                    webanno_sentence = WebAnno_Sentence(doc, idx=i+1, text=text)
                    for token_idx, flair_token in enumerate(flair_sentence, start=1):
                        token_utf16_length = int(len(flair_token.text.encode('utf-16-le')) / 2)
                        end = next_token_idx + token_utf16_length
                        webanno_token = Token(sentence=webanno_sentence, idx=token_idx, start=next_token_idx, end=end, text=flair_token.text)
                        next_token_idx = end + 1
                        webanno_sentence.add_token(webanno_token)
                        
                        label_id = NO_LABEL_ID
                        label = flair_token.get_tag('ner').value.replace(IOB_INSIDE, '').replace(IOB_OUTSIDE, '')
                        
                        if label != IOB_NULL:
                            if IOB_INSIDE in flair_token.get_tag('ner').value:
                                label_id = next_label_idx
                                last_label_prefix = IOB_INSIDE

                                if last_annotation.label == label:
                                    last_annotation.label_id = label_id

                            if IOB_OUTSIDE in flair_token.get_tag('ner').value:
                                if last_label_prefix == IOB_INSIDE:
                                    next_label_idx += 1
                                last_label_prefix = IOB_OUTSIDE

                            last_annotation = Annotation(
                                tokens=[webanno_token],
                                label=label,
                                layer_name=TARGET_LAYER,
                                field_name=TARGET_FIELD,
                                label_id=label_id,
                            )
                            doc._annotations[doc._anno_type(last_annotation.layer_name, last_annotation.field_name)].append(last_annotation)
                    doc.add_sentence(webanno_sentence)
                content = webanno_tsv_write(doc)
                print(content)

#FORMAT=WebAnno TSV 3.1
#T_SP=webanno.custom.LetterEntity|entity_id|value


#Text=, u Braun an Gerhard Dresden, 10, März 7183: Dresden 10 März 1832 Mein freundlichster Herr Professor \|!
1-1	0-1	,	_	_
1-2	2-3	u	_	_
1-3	4-9	Braun	*	PER
1-4	10-12	an	_	_
1-5	13-20	Gerhard	*	PER
1-6	21-28	Dresden	*	PLACE
1-7	29-30	,	_	_
1-8	31-33	10	*[1]	DATE[1]
1-9	34-35	,	*[1]	DATE[1]
1-10	36-40	März	*[1]	DATE[1]
1-11	41-45	7183	*[1]	DATE[1]
1-12	46-47	:	_	_
1-13	48-55	Dresden	*	PLACE
1-14	56-58	10	*[2]	DATE[2]
1-15	59-63	März	*[2]	DATE[2]
1-16	64-68	1832	*[2]	DATE[2]
1-17	69-73	Mein	_	_
1-18	74-88	freundlichster	_	_
1-19	89-93	Herr	_	_
1-20	94-103	Professor	_	_
1-21	104-105	\|	_	_
1-22	106-107	!	_	_

#Text=Die Wohlthat und Annehmlichkeit in Ihrer nächsten Nähe zu logiren ist mir zu werth und lieb, als daß ich nicht die Worte Ihres lieben Briefes "daher Sie vielleicht besser einige Tage zögerten" besonders hervorheben und Sie nun bitte das besagte Logis für mich zu miethen, Ich werde dann gerade 8 Tage s

#FORMAT=WebAnno TSV 3.1
#T_SP=webanno.custom.LetterEntity|entity_id|value


#Text=Braun an Gerhard Dresden, 10. März 1832 (Zettel an den Brief geklebt) Die Intelligenzblätter habe ich zurückgehalten und sende nur den Brief.
1-1	0-5	Braun	*	PER
1-2	6-8	an	_	_
1-3	9-16	Gerhard	*	PER
1-4	17-24	Dresden	*	PLACE
1-5	25-26	,	_	_
1-6	27-30	10.	*[1]	DATE[1]
1-7	31-35	März	*[1]	DATE[1]
1-8	36-40	1832	*[1]	DATE[1]
1-9	41-42	(	_	_
1-10	43-49	Zettel	_	_
1-11	50-52	an	_	_
1-12	53-56	den	_	_
1-13	57-62	Brief	_	_
1-14	63-70	geklebt	_	_
1-15	71-72	)	_	_
1-16	73-76	Die	_	_
1-17	77-95	Intelligenzblätter	*	LIT
1-18	96-100	habe	_	_
1-19	101-104	ich	_	_
1-20	105-119	zurückgehalten	_	_
1-21	120-123	und	_	_
1-22	124-129	sende	_	_
1-23	130-133	nur	_	_
1-24	134-137	den	_	_
1-25	138-143	Brief	_	_
1-26	144-145	.	_	_

#Text=- Den Intelligenzblättern ist auf eine Anzeige der bald erscheinenden Bernhardischen Suidas sowie der Scriptores hist., August, beigelegt.
2-1	146-147	-	_	_
2-2	148-151	Den	_	_
2-3	152-171	Inte

#FORMAT=WebAnno TSV 3.1
#T_SP=webanno.custom.LetterEntity|entity_id|value


#Text=Braun an Gerhard Dresden, 30.
1-1	0-5	Braun	*	PER
1-2	6-8	an	_	_
1-3	9-16	Gerhard	*	PER
1-4	17-24	Dresden	*	PLACE
1-5	25-26	,	_	_
1-6	27-29	30	*[1]	DATE[1]
1-7	30-31	.	*[1]	DATE[1]

#Text=Oktober 71832 zuzuwenden, wie Sie es in München thaten.
2-1	32-39	Oktober	*[1]	DATE[1]
2-2	40-45	71832	*[1]	DATE[1]
2-3	46-56	zuzuwenden	_	_
2-4	57-58	,	_	_
2-5	59-62	wie	_	_
2-6	63-66	Sie	_	_
2-7	67-69	es	_	_
2-8	70-72	in	_	_
2-9	73-80	München	*	PLACE
2-10	81-87	thaten	_	_
2-11	88-89	.	_	_

#Text=Vielleicht lerne ich dereinst noch so viel, daß ich thätig dankbar vor Ihnen erscheinen kann.
3-1	90-100	Vielleicht	_	_
3-2	101-106	lerne	_	_
3-3	107-110	ich	_	_
3-4	111-119	dereinst	_	_
3-5	120-124	noch	_	_
3-6	125-127	so	_	_
3-7	128-132	viel	_	_
3-8	133-134	,	_	_
3-9	135-138	daß	_	_
3-10	139-142	ich	_	_
3-11	143-149	thätig	_	_
3-12	150-157	dankbar	_	_
3-13	158-161	vor	_	_
3-14	162-167	Ihnen	_	_
3-15	168-178	erscheinen	_	_
3-1