Credits go to Emiel van Miltenburg for the script that converts spaCy output to NAF:

https://github.com/evanmiltenburg/SpaCy-to-NAF 

In [1]:
from spacy.en import English
import json
from lxml import etree
import pickle

In [2]:
import spacy_to_naf
import semeval_classes 

In [3]:
# Instantiate the spaCy English pipeline once; this same object is handed to
# spacy_to_naf.text_to_NAF for every article processed in this notebook.
nlp = English()

In [4]:
def process_first_x_files(size, path='signalmedia-1m.jsonl'):
    """Lazily yield the first `size` articles of a JSON Lines file.

    Every line of the file is expected to hold one JSON document; each is
    parsed with ``json.loads`` and yielded as soon as it has been read, so
    the file is never loaded into memory as a whole.

    :param size: maximum number of articles to yield. Values of 0 or less
        yield nothing. If the file holds fewer lines, all of them are yielded.
    :param path: location of the JSON Lines file. Defaults to the
        ``signalmedia-1m.jsonl`` file in the current working directory.
    :return: generator over the parsed articles, in file order.
    """
    # JSON text is UTF-8 by specification; do not rely on the platform default.
    with open(path, encoding='utf-8') as infile:
        for counter, line in enumerate(infile, 1):
            # counter is 1-based, so stop only once we are *past* `size`.
            # (Testing `counter == size` dropped the last requested article
            # and never triggered at all for size == 0.)
            if counter > size:
                break
            yield json.loads(line)


In [5]:
the_news_items = {}
the_collection = 'SignalMedia'

# Build one NewsItem per article (keyed by the article id) and attach every
# entity mention found in the article's NAF representation.
for article in process_first_x_files(100):
    identifier = article['id']
    a_news_item = semeval_classes.NewsItem(identifier=identifier,
                                           collection=the_collection,
                                           dct=article['published'],
                                           publisher=article['source'])

    # Convert the article text to NAF using the shared spaCy pipeline.
    naf = spacy_to_naf.text_to_NAF(article['content'], nlp)

    # Index the <wf> elements by the integer that follows the first
    # character of their 'id' attribute.
    iden2wf_el = dict((int(wf.get('id')[1:]), wf)
                      for wf in naf.iterfind('text/wf'))

    for entity_el in naf.iterfind('entities/entity'):
        # The span targets are matched to word forms through the same
        # "drop the first character, parse as int" key.
        idens = [int(target.get('id')[1:])
                 for target in entity_el.iterfind('references/span/target')]
        wf_els = [iden2wf_el[iden] for iden in idens]

        # Surface form: the word-form texts joined by single spaces.
        mention = ' '.join(wf.text for wf in wf_els)

        # All word forms of one entity must share a single sentence id.
        sent_ids = [wf.get('sent') for wf in wf_els]
        assert len(set(sent_ids)) == 1, 'in %s, entity in multiple sentences' % article['content']
        sent_id = sent_ids[0]

        # The span runs from the offset of the first word form to the
        # offset + length of the last one; for a single-token entity the
        # first and last word form are the same element.
        first_wf, last_wf = wf_els[0], wf_els[-1]
        begin_index = int(first_wf.get('offset'))
        end_index = int(last_wf.get('offset')) + int(last_wf.get('length'))

        a_news_item.entity_mentions.add(
            semeval_classes.EntityMention(sentence=sent_id,
                                          mention=mention,
                                          the_type=entity_el.get('type'),
                                          begin_index=begin_index,
                                          end_index=end_index))

    the_news_items[identifier] = a_news_item

In [6]:
# Persist the collected news items (article id -> NewsItem) as a pickle file.
with open('news_items.bin', 'wb') as handle:
    pickle.Pickler(handle).dump(the_news_items)

## Running entire datasets using multiple threads

In [7]:
#%time for text in process_first_x_files(1000): nlp(text)

In [8]:
#texts = process_first_x_files(1000)
#%time for doc in nlp.pipe(texts, n_threads=16, batch_size=1000): pass