In [2]:
from main import make_tagger

# Build one tagger per backend name; taggers are constructed left to right and
# bound to the matching module-level names.
stub_ner, corenlp_ner, spacy_ner = (
    make_tagger(backend) for backend in ('stub', 'corenlp', 'spacy')
)

In [3]:
import spacy

# Replace the tokenizer's prefix/suffix/infix rules with whitespace-only
# patterns, so punctuation attached to a word is no longer split off into its
# own token (presumably to keep spaCy's tokens aligned with pre-tokenized,
# space-joined input -- confirm against the evaluation code).
#
# Raw strings are required here: `\s` is not a valid *string* escape, so the
# non-raw form triggers an invalid-escape-sequence warning even though the
# resulting regex text is the same.
_prefixes = [r'\s+']
_suffixes = [r'\s+']
_infixes = [r'\s+']

spacy_ner.nlp.tokenizer.infix_finditer = spacy.util.compile_infix_regex(_infixes).finditer
spacy_ner.nlp.tokenizer.prefix_search = spacy.util.compile_prefix_regex(_prefixes).search
spacy_ner.nlp.tokenizer.suffix_search = spacy.util.compile_suffix_regex(_suffixes).search

# Register 'cannot' as a special case made of a single token, so it is emitted
# as one token rather than being broken up by the tokenizer's exceptions.
# NOTE(review): which attributes a special case may set (LEMMA, POS) depends on
# the installed spaCy version -- confirm before upgrading.
special_case = [{spacy.symbols.ORTH: u'cannot', spacy.symbols.LEMMA: u'cannot', spacy.symbols.POS: u'VERB'}]
spacy_ner.nlp.tokenizer.add_special_case(u'cannot', special_case)

In [4]:
def load_conll(conll_path):
    """Read a CoNLL-style column file into parallel token and tag lists.

    Each non-blank line is split on whitespace; column 0 is taken as the word
    and column 3 as its tag. Any prefix before the last '-' in the tag is
    dropped (e.g. 'I-ORG' and 'B-ORG' both become 'ORG'; 'O' stays 'O').
    Blank lines separate sentences. Lines whose word is '-DOCSTART-' are
    ignored, and sentences left empty after that are not emitted.

    Args:
        conll_path: Path of the file to read.

    Returns:
        A tuple ``(sentences, tags)`` where ``sentences[i]`` is the list of
        words of the i-th sentence and ``tags[i]`` is the list of tags of the
        same length.

    Raises:
        IndexError: If a non-blank, non-DOCSTART line has fewer than four
            columns.
    """
    sentences = []
    tags = []
    sentence = []
    sentence_tags = []

    with open(conll_path, 'r') as conll_file:
        # Parse line by line instead of splitting on the exact string '\n\n':
        # that way a trailing single newline, runs of several blank lines, or
        # whitespace-only separator lines no longer produce an empty "line"
        # that blows up on columns[3].
        for raw_line in conll_file:
            columns = raw_line.split()

            if not columns:
                # Blank line: close the current sentence, if it has any tokens.
                if sentence:
                    sentences.append(sentence)
                    tags.append(sentence_tags)
                    sentence = []
                    sentence_tags = []
                continue

            word = columns[0]
            if word == '-DOCSTART-':
                continue

            sentence.append(word)
            sentence_tags.append(columns[3].split('-')[-1])

    # The file may end without a final blank line; keep the last sentence.
    if sentence:
        sentences.append(sentence)
        tags.append(sentence_tags)

    return sentences, tags

In [5]:
# Parse the CoNLL-2003 `eng.testa` file into two parallel lists: the words of
# each sentence and the tag of each word.
sentences, tags = load_conll('data/conll2003/eng.testa')

In [9]:
# Tag every gold sentence with the spaCy tagger and collect flat, token-level
# label sequences (y_true / y_pred) for scoring.
nlp = spacy_ner
y_true = []
y_pred = []
for gold_tokens, gold_tags in zip(sentences, tags):
    sentence = ' '.join(gold_tokens)
    pred = nlp.predict(sentence)
    # The comprehension variables deliberately use names different from the
    # loop variables: where comprehension variables leak into the enclosing
    # scope (Python 2), reusing the gold-tag name here would overwrite the gold
    # tags before they are appended to y_true.
    tokens = [token for token, label in pred]
    tags_pred = [label for token, label in pred]
    if len(tokens) != len(gold_tokens):
        # Token counts diverged, so labels can no longer be aligned one to one.
        # Report the offending sentence and stop; y_true / y_pred then hold
        # only the sentences processed before this one.
        print(sentence)
        print('ERROR: Tokens do not match')
        print('Expected:')
        print(gold_tokens)
        print('Actual:')
        print(tokens)
        break

    y_true.extend(gold_tags)
    y_pred.extend(tags_pred)

In [7]:
from sklearn.metrics import f1_score

In [10]:
# Token-level F1 over the flat label lists: with average='weighted', the
# per-label F1 scores are averaged using each label's support in y_true.
# NOTE(review): every label found in y_true / y_pred is scored, which
# presumably includes the non-entity label, so this is not the entity-level
# CoNLL score -- confirm this is the intended metric.
f1_score(y_true, y_pred, average='weighted')

0.8563504398999212