## Run spaCy on the NorNE test set


In [4]:

import re
# from nltk.tokenize.simple import SpaceTokenizer
from helpers import *

import spacy
from spacy.tokenizer import Tokenizer


In [5]:
# Path of the NorNE test file that every later cell reads from.
testpath = "data/norne2/no_bokmaal-ud-test.bmes"
# dataset_w_tags is not defined in this notebook (presumably it comes from the
# star-import of `helpers` — TODO confirm). The same path is passed for both of
# its arguments and only element [2] of the result is kept; judging by the cell
# output this is the list of distinct tags found in the file.
test_tags  = dataset_w_tags(testpath, testpath)[2]
# Bare expression so the notebook displays the tag inventory.
test_tags

['B-LOC',
 'B-EVT',
 'I-LOC',
 'B-GPE_ORG',
 'B-DRV',
 'I-EVT',
 'B-ORG',
 'I-GPE_LOC',
 'I-PER',
 'I-DRV',
 'I-PROD',
 'O',
 'B-PER',
 'I-ORG',
 'B-GPE_LOC',
 'B-PROD',
 'I-GPE_ORG']

In [6]:
# Read the whole test file. The encoding is given explicitly so that Norwegian
# characters (æ, ø, å) are decoded the same way on every platform instead of
# depending on the locale's default encoding.
with open(testpath, encoding="utf-8") as rf:
    test_text = rf.read().strip()

# Parallel per-sentence structures: tokens[i][j] is the j-th token of sentence i
# and tags[i][j] is its gold tag.
tokens, tags = [], []  # List of lists
# Sentences are separated by an empty line; inside a sentence each line holds
# whitespace-separated columns, token first and tag second.
for sent in test_text.split("\n\n"):
    sent = sent.strip()
    if not sent:
        # Extra blank lines between sentences produce empty blocks; skip them
        # rather than failing on a missing column.
        continue
    # Split every line once and reuse the columns for both lists.
    columns = [line.split() for line in sent.split("\n")]
    tokens.append([cols[0] for cols in columns])
    tags.append([cols[1] for cols in columns])
    assert len(tokens[-1]) == len(tags[-1])

# Bare expression so the notebook displays one example sentence with its tags.
tokens[5], tags[5]

(['Det',
  'nye',
  'slagordet',
  'er',
  '"',
  'tilpasning',
  'gjennom',
  'kontrast',
  '"',
  '.'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])

In [7]:
# Load the pretrained pipeline with the listed components switched off; the
# entity annotations read further down are the only output used here.
nlp = spacy.load(
    "nb_core_news_lg",
    disable=["tok2vec", "morphologizer", "parser", "lemmatizer", "attribute_ruler"],
)
# Swap in a bare tokenizer: with no prefix/suffix/infix rules it should split on
# whitespace only, so the pre-tokenized text keeps its token boundaries.
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r'\S+').match)

# One list of predicted tags per sentence, aligned with `tokens` and `tags`.
spacy_pred = []
for tks, tgs in zip(tokens, tags):
    doc = nlp(" ".join(tks))
    # Predictions are only comparable to the gold tags if the token counts agree.
    assert len(doc) == len(tgs)
    # Build "<IOB>-<TYPE>" for tokens with an entity type, otherwise the bare IOB code.
    spacy_pred.append(
        [
            tok.ent_iob_ + "-" + tok.ent_type_ if tok.ent_type_ else tok.ent_iob_
            for tok in doc
        ]
    )

# Bare expression so the notebook displays the predictions for one sentence.
spacy_pred[300]

['B-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [8]:
from collections import Counter
def tagsset(tagslists):
    """Return the set of distinct tags occurring in any of the given tag lists."""
    distinct = set()
    for sentence_tags in tagslists:
        distinct.update(sentence_tags)
    return distinct

# Show which labels the spaCy predictions use versus the gold annotation.
print("Spacy", tagsset(spacy_pred))
print("NorNE", tagsset(tags))

# Tally every token-level pair, keyed as "<gold>><predicted>".
gold_pred = Counter()
for gold_sent, pred_sent in zip(tags, spacy_pred):
    for gold_tag, pred_tag in zip(gold_sent, pred_sent):
        gold_pred[gold_tag + ">" + pred_tag] += 1

# Collapse the pairs into entity-vs-"O" agreement counts; the entity type and
# the B/I prefix are ignored here, only "is it an entity token" matters.
ent_ent = ent_o = o_ent = 0
for pair, count in gold_pred.items():
    gold_tag, pred_tag = pair.split(">")
    gold_is_o = gold_tag == "O"
    pred_is_o = pred_tag == "O"
    if gold_is_o and pred_is_o:
        # Both sides agree on "O"; not part of any of the three totals.
        continue
    if gold_is_o:
        o_ent += count
    elif pred_is_o:
        ent_o += count
    else:
        ent_ent += count

print("Ent-token pred as ent", ent_ent)
print('Ent-token pred as "O"', ent_o)
print('"O"-token pred as ent', o_ent)
print("Sentences", len(tokens))
print("Tokens", sum(len(sent_tokens) for sent_tokens in tokens))

Spacy {'B-LOC', 'B-EVT', 'I-LOC', 'B-GPE_ORG', 'B-DRV', 'B-ORG', 'I-GPE_LOC', 'I-PER', 'B-PROD', 'I-DRV', 'I-PROD', 'B-MISC', 'I-ORG', 'B-PER', 'B-GPE_LOC', 'O', 'I-GPE_ORG'}
NorNE {'B-LOC', 'B-EVT', 'I-LOC', 'B-GPE_ORG', 'B-DRV', 'I-EVT', 'B-ORG', 'I-GPE_LOC', 'I-PER', 'I-DRV', 'I-PROD', 'O', 'B-PER', 'I-ORG', 'B-GPE_LOC', 'B-PROD', 'I-GPE_ORG'}
Ent-token pred as ent 1742
Ent-token pred as "O" 177
"O"-token pred as ent 87
Sentences 1939
Tokens 29966


In [9]:
# gold_pred keys have the form "<gold>><predicted>", so this displays the number
# of tokens tagged "O" in both the gold annotation and the spaCy predictions.
gold_pred["O>O"]

27960