## Tool3: Spacy

In [1]:
import os
import subprocess
import sys

import spacy
from spacy.pipeline import EntityRecognizer

### For spaCy to recognize the model, you need to put it under `spacy/data`. (For details, please read the README of the repo.)

In [2]:
# Name of the pretrained spaCy model to evaluate.
# Alternative: 'en_core_web_sm'
model = 'xx_ent_wiki_sm'
# Version suffix of the model directory placed under spacy/data.
model_version = '2.0.0'
# Locate the model's NER component relative to the installed spaCy package
# instead of hard-coding one user's site-packages directory, so the notebook
# also runs on other machines and environments.
nerpath = os.path.join(
    os.path.dirname(spacy.__file__),
    'data',
    model,
    model + '-' + model_version,
    'ner',
)


### Configure input and output file:

In [14]:
# CoNLL-formatted test set to read, and the file that receives the conlleval report.
inputfile, outfile = 'conll-testb.txt', 'spacy_eval.txt'

### Load the pretrained spacy model:

In [4]:
# Load the spaCy pipeline named by `model`; only its vocab is used below.
nlp = spacy.load(model)
# Create a standalone entity recognizer that shares the pipeline's vocab ...
ner = EntityRecognizer(nlp.vocab)
# ... and restore its state from the directory given by `nerpath`.
ner.from_disk(nerpath)

<spacy.pipeline.EntityRecognizer at 0x7f633603e308>

### Read input to sentences, tokens, and tags:

In [5]:
def conll2sentences(testfile):
    """Read a CoNLL-style file into sentences, tokens, and tags.

    Every non-blank line is split on whitespace: the first column is taken as
    the token and the last column as its tag. Blank (or whitespace-only) lines
    mark sentence boundaries.

    Args:
        testfile: Path of the CoNLL-formatted file to read.

    Returns:
        A list ``[sentences, all_tokens, actual_tags]`` where ``sentences`` is
        a list of non-empty token lists (one per sentence), ``all_tokens`` is
        the flat list of all tokens, and ``actual_tags`` is the flat list of
        last-column tags, aligned index-by-index with ``all_tokens``.
    """
    sentences = []
    all_tokens = []
    actual_tags = []
    current = []
    with open(testfile, 'r') as f:
        for line in f:
            # Split once and reuse the columns for both the token and the tag.
            columns = line.split()
            if not columns:
                # Sentence boundary. Whitespace-only lines are treated like
                # empty ones (they used to raise IndexError), and repeated or
                # trailing blank lines no longer yield empty sentences.
                if current:
                    sentences.append(current)
                    current = []
                continue
            current.append(columns[0])
            all_tokens.append(columns[0])
            actual_tags.append(columns[-1])
    # Flush the last sentence when the file does not end with a blank line.
    if current:
        sentences.append(current)
    return [sentences, all_tokens, actual_tags]

In [6]:
# Parse the test set into per-sentence token lists plus flat token/tag lists.
sentences, all_tokens, actual_tags = conll2sentences(inputfile)

### spaCy has its own tokenizer, but you do not have to use it: you can give spaCy already-tokenized input and then let it apply the processing step you want (here, NER):

In [7]:
# Collect one [token text, predicted entity type] pair per token, in corpus order.
result = []
for sentence in sentences:
    # Build the Doc directly from the given words so spaCy's tokenizer is bypassed.
    doc = spacy.tokens.doc.Doc(nlp.vocab, words=sentence)

    # Run the entity recognizer on this sentence and record each token's prediction.
    result.extend([tok.text, tok.ent_type_] for tok in ner(doc))

print('NER operation ended.')

NER operation ended.


### spaCy's output tags are in raw CoNLL format, i.e. without the chunk prefix (`ORG` rather than `B-ORG`). Convert the actual tags to that format as well:

In [8]:
def conll2raw(tags):
    """Strip the chunk prefix from CoNLL tags, keeping only the entity type.

    ``'B-ORG'`` and ``'I-ORG'`` both become ``'ORG'``; tags without a hyphen
    (such as ``'O'``) are returned unchanged.

    Args:
        tags: Iterable of tag strings.

    Returns:
        A new list with one raw tag per input tag, in the same order.
    """
    # Split on the FIRST hyphen only, so an entity type that itself contains a
    # hyphen is kept whole instead of being truncated at its first segment.
    return [tag.split('-', 1)[-1] for tag in tags]

In [9]:
# The second element of every [text, entity_type] pair is the predicted type.
pred_tags = [entity_type for _text, entity_type in result]
# spaCy reports "no entity" as an empty string; conlleval expects 'O' instead.
pred_tags_edited = [tag if tag != '' else 'O' for tag in pred_tags]
# Bring the gold tags into the same prefix-free format as the predictions.
actual_tags_edited = conll2raw(actual_tags)

### Create the input file for the conlleval script, then run the script. The input is a file in which each line contains **token actual-tag predicted-tag**.

In [10]:
def createconllevalinputfile(sentences, actual_tags, pred_tags):
    """Write the conlleval input file and return its name.

    The file ``conlleval_input`` is created (or overwritten) in the current
    working directory. Each token produces one line of the form
    ``token actual-tag predicted-tag`` and each sentence is followed by an
    empty line.

    Args:
        sentences: List of sentences, each a list of token strings.
        actual_tags: Flat list of gold tags, one per token across all sentences.
        pred_tags: Flat list of predicted tags, aligned with ``actual_tags``.

    Returns:
        The name of the file that was written.

    Raises:
        IndexError: If either tag list is shorter than the number of tokens.
    """
    conlleval_inputfile_name = 'conlleval_input'
    # Running position in the flat tag lists across all sentences.
    idx = 0
    # The context manager closes the file even when a write or lookup fails.
    with open(conlleval_inputfile_name, 'w') as result_file:
        for sentence in sentences:
            for word in sentence:
                result_file.write(' '.join((word, actual_tags[idx], pred_tags[idx])) + '\n')
                idx += 1
            # An empty line marks the end of the sentence.
            result_file.write('\n')
    return conlleval_inputfile_name

In [11]:
import sys
import os

def runconlleval(infile, outfile):
    """Run ``conlleval.py -r`` with ``infile`` as stdin and ``outfile`` as stdout.

    The script is looked up relative to the current working directory and is
    executed with the same Python interpreter that runs this notebook.

    Args:
        infile: Path of the conlleval input file.
        outfile: Path of the file that receives the script's report.

    Raises:
        OSError: If ``infile`` cannot be opened for reading or ``outfile``
            cannot be opened for writing.
    """
    # Pass the arguments as a list and wire the files up directly instead of
    # building a shell command by concatenation: paths containing spaces or
    # shell metacharacters are then handled correctly and cannot inject commands.
    command = [sys.executable, 'conlleval.py', '-r']
    with open(infile, 'rb') as source, open(outfile, 'wb') as report:
        completed = subprocess.run(command, stdin=source, stdout=report)
    if completed.returncode != 0:
        # Do not claim success when the evaluation script failed.
        print('conlleval.py exited with status ' + str(completed.returncode)
              + '; the scores in ' + outfile + ' may be missing or incomplete.\n')
        return
    print('Please see the scores wrt conlleval script in the file: ' + outfile + '\n')

In [15]:
# Write the token / gold-tag / predicted-tag file, then score it with conlleval.
conlleval_inputfile_name = createconllevalinputfile(
    sentences=sentences,
    actual_tags=actual_tags_edited,
    pred_tags=pred_tags_edited,
)
runconlleval(infile=conlleval_inputfile_name, outfile=outfile)

Please see the scores wrt conlleval script in the file: spacy_eval.txt

