## Tool3: Spacy

In [1]:
import os
import subprocess

import spacy
from spacy.pipeline import EntityRecognizer

### For spaCy to recognize the model, you need to place it under `spacy/data`. (For details, please read the README of the repo.)

In [2]:
# Name of the spaCy model to evaluate (alternative kept for quick switching).
# model = 'en_core_web_sm'
model = 'xx_ent_wiki_sm'
# Version suffix of the model directory nested inside the model folder.
model_version = '2.0.0'
# Resolve the NER component relative to the installed spaCy package instead of
# hard-coding one machine's site-packages location; the layout below is
# <spacy package dir>/data/<model>/<model>-<version>/ner.
nerpath = os.path.join(os.path.dirname(spacy.__file__), 'data', model,
                       model + '-' + model_version, 'ner')


### Configure input and output file:

In [3]:
# File that receives the conlleval report.
outfile = 'spacy_eval.txt'

# Evaluation data passed to the loader cell below (alternative kept for
# quick switching).
# inputfile = 'conll-testa.txt'
inputfile = './foliadocs/alladjudicated'

### Load the pretrained spacy model:

In [4]:
# Load the configured model; only its vocab is reused by the cells below.
nlp = spacy.load(model)
# Create a standalone entity recognizer that shares the model's vocab.
ner = EntityRecognizer(nlp.vocab)
# Populate the recognizer from the serialized component stored at `nerpath`.
ner.from_disk(nerpath)

<spacy.pipeline.EntityRecognizer at 0x7fe3c9707678>

### Read the input into sentences, tokens, and tags:

In [5]:
import re
def foliaclass2rawtag(e):
    """Map a FoLiA entity to a raw NER tag ('PER', 'LOC', 'ORG' or 'O').

    Args:
        e: Object exposing ``cls`` (the entity class) and ``set`` (the name of
            the annotation set the entity belongs to).

    Returns:
        'PER' for a 'name' entity in a set containing 'Target', 'ORG' for a
        'name' entity in a set containing 'Organizer'; otherwise the tag is
        chosen from the class alone ('loc'/'place'/'place_pub' -> 'LOC',
        'pname' -> 'PER', 'fname' -> 'ORG'), and 'O' when nothing applies.
    """
    entity_class = e.cls
    is_name = entity_class == 'name'

    # The set name decides how a generic 'name' entity is interpreted.
    if re.match('^.*Target.*$', e.set):
        if is_name:
            return 'PER'
    elif re.match('^.*Organizer.*$', e.set):
        if is_name:
            return 'ORG'

    # Remaining classes map to a tag regardless of the set they come from.
    class_groups = (
        (('loc', 'place', 'place_pub'), 'LOC'),
        (('pname',), 'PER'),
        (('fname',), 'ORG'),
    )
    for classes, raw_tag in class_groups:
        if entity_class in classes:
            return raw_tag
    return 'O'

In [12]:
import os
from pynlpl.formats import folia

def folia2sentences(path, tagFormat):
    """Read a folder of FoLiA documents into sentences, tokens and gold tags.

    Every file in ``path`` is parsed with ``folia.Document`` (files are
    visited in sorted name order so the output is reproducible). Words whose
    text is ``'<P>'`` and words whose id was already seen are skipped. Each
    kept word starts with the tag ``'O'``; words referenced by an entity are
    then re-tagged with the entity's converted class.

    Args:
        path: Directory containing the FoLiA files. For anything that is not
            a directory a TODO message is printed and empty results returned.
        tagFormat: ``'raw'`` (converts with ``foliaclass2rawtag``),
            ``'stanford'`` (converts with ``foliaclass2stanfordtag``) or
            ``'conll'`` (not implemented yet: a TODO message is printed once
            and every tag stays ``'O'``).

    Returns:
        ``[sentences_as_tokens, all_tokens, actual_tags]``: the token texts
        grouped per sentence, the same token texts as one flat list, and one
        tag per entry of the flat list.

    Raises:
        ValueError: If ``path`` is a directory and ``tagFormat`` is not one
            of the values listed above.
    """
    sentences_as_tokens = []
    all_tokens = []
    actual_tags = []
    # Word id -> position in all_tokens/actual_tags; also serves as the
    # "already seen" set, so membership checks are constant time.
    id2idx = {}

    if not os.path.isdir(path):
        print("TODO: Handling of a single Folia file instead of a folder of Folia files.")
        return [sentences_as_tokens, all_tokens, actual_tags]

    # Pick the entity-to-tag converter once, so an unsupported format can
    # never leave a stale or unbound tag behind inside the loops.
    if tagFormat == 'stanford':
        convert = foliaclass2stanfordtag
    elif tagFormat == 'raw':
        convert = foliaclass2rawtag
    elif tagFormat == 'conll':
        print('TODO: reuse codes that output files to output objects instead.')
        convert = None
    else:
        raise ValueError("Unsupported tagFormat: %r (expected 'raw', 'stanford' or 'conll')"
                         % (tagFormat,))

    for filename in sorted(os.listdir(path)):
        doc = folia.Document(file=os.path.join(path, filename))
        for sentence in doc.sentences():
            sentence_tokens = []
            for word in sentence.select(folia.Word):
                w_id = word.id
                w_text = word.text()
                # Skip repeated words and the '<P>' paragraph markers.
                if w_id in id2idx or w_text == '<P>':
                    continue
                id2idx[w_id] = len(all_tokens)
                actual_tags.append('O')
                sentence_tokens.append(w_text)
                all_tokens.append(w_text)
            sentences_as_tokens.append(sentence_tokens)

            if convert is None:
                continue
            for layer in sentence.select(folia.EntitiesLayer):
                for entity in layer.select(folia.Entity):
                    tag = convert(entity)
                    for word in entity.wrefs():
                        _idx = id2idx.get(word.id)
                        # Entities may reference words that were skipped
                        # above (e.g. '<P>'); those have no tag slot.
                        if _idx is None:
                            continue
                        actual_tags[_idx] = tag
    return [sentences_as_tokens, all_tokens, actual_tags]


In [10]:
def conll2sentences(testfile):
    """Read a CoNLL-style file into sentences, tokens and gold tags.

    Each non-blank line is split on whitespace: the first field is the token
    and the last field is its tag. A blank line (empty or whitespace-only)
    ends the current sentence and starts a new one, so consecutive or
    trailing blank lines produce empty sentences.

    Args:
        testfile: Path of the file to read.

    Returns:
        ``[sentences, all_tokens, actual_tags]``: tokens grouped per
        sentence, all tokens as one flat list, and one tag per token.
    """
    sentences = [[]]
    all_tokens = []
    actual_tags = []
    with open(testfile, 'r') as f:
        for line in f:
            fields = line.split()
            # Whitespace-only lines have no fields; treating them as
            # sentence breaks avoids indexing into an empty split result.
            if not fields:
                sentences.append([])
                continue
            sentences[-1].append(fields[0])
            all_tokens.append(fields[0])
            actual_tags.append(fields[-1])
    return [sentences, all_tokens, actual_tags]

In [13]:
# Swap the two loader calls to evaluate on a CoNLL file instead of FoLiA docs.
# sentences, all_tokens, actual_tags = conll2sentences(inputfile)
sentences, all_tokens, actual_tags = folia2sentences(inputfile, 'raw')

### spaCy has its own tokenizer, but you do not have to use it: you can give spaCy already-tokenized input and then let it apply only the processing step you want (here, NER):

In [14]:
# One [token text, predicted entity type] pair per token, in corpus order.
result = []
for sentence in sentences:
    # Build the Doc straight from the given words, i.e. without tokenizing.
    doc = spacy.tokens.doc.Doc(nlp.vocab, words=sentence)

    # Apply only the entity recognizer to this sentence.
    processed = ner(doc)
    result.extend([token.text, token.ent_type_] for token in processed)

print('NER operation ended.')

NER operation ended.


### spaCy's output tags are in the raw CoNLL tag format (the entity type without a `B-`/`I-` prefix). Convert the actual tags to that format as well:

In [15]:
def conll2raw(tags):
    """Reduce prefixed tags to their raw form.

    A tag containing '-' is replaced by the piece that follows its first '-'
    (and precedes any second one); a tag without '-' is kept unchanged.

    Args:
        tags: Iterable of tag strings, e.g. ``['B-PER', 'O']``.

    Returns:
        A new list with one raw tag per input tag, e.g. ``['PER', 'O']``.
    """
    def strip_prefix(label):
        pieces = label.split('-')
        return pieces[1] if len(pieces) > 1 else label

    return [strip_prefix(label) for label in tags]

In [16]:
# Second element of every result pair is the predicted entity type.
pred_tags = [pair[1] for pair in result]
# An empty prediction is written as the outside tag 'O'.
pred_tags_edited = [tag if tag != '' else 'O' for tag in pred_tags]
# Keep the line below commented out when the input is in FoLiA format.
# actual_tags = conll2raw(actual_tags)

### Create the input file for the conlleval script, then run the script. The input is a file in which each line contains **token actual-tag predicted-tag**.

In [17]:
def createconllevalinputfile(sentences, actual_tags, pred_tags):
    """Write the conlleval input file and return its name.

    The file ``conlleval_input`` is (re)created in the current working
    directory. It holds one ``token actual-tag predicted-tag`` line per
    token and an empty line after every sentence.

    Args:
        sentences: List of sentences, each a list of token strings.
        actual_tags: Flat list of gold tags, aligned with the tokens of
            ``sentences`` taken in order.
        pred_tags: Flat list of predicted tags with the same alignment.

    Returns:
        The name of the written file (``'conlleval_input'``).

    Raises:
        IndexError: If either tag list is shorter than the number of tokens.
    """
    conlleval_inputfile_name = 'conlleval_input'
    idx = 0
    # The context manager closes the file even if a tag lookup fails midway.
    with open(conlleval_inputfile_name, 'w') as result_file:
        for sentence in sentences:
            for word in sentence:
                result_file.write(word + ' ' + actual_tags[idx] + ' ' + pred_tags[idx] + '\n')
                idx += 1
            result_file.write('\n')
    return conlleval_inputfile_name

In [18]:
import sys
import os

def runconlleval(infile, outfile):
    """Run ``conlleval.py -r`` on ``infile`` and store its output in ``outfile``.

    The script is started with the interpreter running this notebook and is
    looked up relative to the current working directory. ``infile`` is fed to
    its standard input and its standard output is written to ``outfile``.
    No shell is involved, so file names containing spaces or shell
    metacharacters are handled safely.

    Args:
        infile: Path of the conlleval input file.
        outfile: Path of the file that receives the report (overwritten).

    Raises:
        OSError: If ``infile`` cannot be opened for reading or ``outfile``
            cannot be opened for writing.
    """
    with open(infile, 'rb') as source, open(outfile, 'wb') as report:
        completed = subprocess.run([sys.executable, 'conlleval.py', '-r'],
                                   stdin=source, stdout=report)
    # Surface a failing script instead of silently dropping its exit status.
    if completed.returncode != 0:
        print('conlleval.py exited with status ' + str(completed.returncode))
    print('Please see the scores wrt conlleval script in the file: ' + outfile + '\n')

In [19]:
# Write the token / gold tag / predicted tag file, then score it.
conlleval_inputfile_name = createconllevalinputfile(
    sentences=sentences,
    actual_tags=actual_tags,
    pred_tags=pred_tags_edited,
)
runconlleval(infile=conlleval_inputfile_name, outfile=outfile)

Please see the scores wrt conlleval script in the file: spacy_eval.txt

