In [1]:
import os
import pickle
from glob import glob
from collections import defaultdict
from lxml import etree
from spacy.en import English
# Shared spaCy English pipeline instance; the cells below replace its
# tokenizer and call it to obtain lemmas and part-of-speech tags.
nlp = English()

In [2]:
# magic to force own tokenization
# https://github.com/explosion/spaCy/issues/182
def my_split_function(string):
    """
    split a string into tokens on runs of whitespace

    :param str string: text to split

    :rtype: list
    :return: the whitespace-delimited pieces of the input (empty list for
    an empty or whitespace-only string)
    """
    pieces = string.split()
    return pieces

# Install the whitespace-only tokenizer exactly once. Without the guard, running
# this cell a second time would store the replacement function itself in
# `old_tokenizer`, and every later nlp(...) call would fail because a plain
# function has no `tokens_from_list` attribute.
if not getattr(nlp.tokenizer, 'forces_own_tokenization', False):
    old_tokenizer = nlp.tokenizer

    def _tokenize_on_whitespace(string):
        # Hand spaCy the pre-split tokens so that it keeps our token boundaries.
        return old_tokenizer.tokens_from_list(my_split_function(string))

    # Marker that lets a re-run of this cell recognise its own wrapper.
    _tokenize_on_whitespace.forces_own_tokenization = True
    nlp.tokenizer = _tokenize_on_whitespace

In [3]:
import ecb_parser_classes

In [4]:
def load_tid_and_sent_dict(doc):
    """
    collect the tokens of a loaded ECB(+) xml file in two lookup tables:
    1. t_id -> ecb_parser_classes.Token instance
    2. sentence identifier -> tokens of that sentence (list of strings),
       in the order in which the token elements are found

    :param lxml.etree._ElementTree doc: loaded ECB(+) xml file

    :rtype: tuple
    :return: (tid2token_objs, sentence_dict)
    """
    # wrap every <token> element directly below the root
    token_objs = [ecb_parser_classes.Token(token_el)
                  for token_el in doc.iterfind('token')]

    tid2token_objs = {}
    sentence_dict = defaultdict(list)
    for token_obj in token_objs:
        tid2token_objs[token_obj.t_id] = token_obj
        sentence_dict[token_obj.sentence].append(token_obj.token)

    return tid2token_objs, sentence_dict

In [5]:
def load_mid_dict(doc, tid2token_objs=None):
    """
    given a loaded ECB(+) xml file, extract one dictionary:
    1. mapping m_id -> list of ecb_parser_classes.Token instances,
       ordered by t_id

    Only markables that have at least one token_anchor child are included.

    :param lxml.etree._ElementTree doc: loaded ECB(+) xml file
    :param dict tid2token_objs: mapping t_id (int) -> ecb_parser_classes.Token
    instance (output from load_tid_and_sent_dict). If omitted, the notebook-level
    variable with the same name is used, which keeps existing
    load_mid_dict(doc) calls working.

    :rtype: dict
    :return: mid2tids

    :raises NameError: if tid2token_objs is omitted and no notebook-level
    variable with that name exists
    :raises KeyError: if a token_anchor refers to a t_id that is missing
    from tid2token_objs
    """
    if tid2token_objs is None:
        # Backward compatibility: earlier versions silently read this global.
        try:
            tid2token_objs = globals()['tid2token_objs']
        except KeyError:
            raise NameError("name 'tid2token_objs' is not defined; pass it to "
                            "load_mid_dict or run load_tid_and_sent_dict first") from None

    mid2tids = dict()

    for event_mention_el in doc.iterfind('Markables/*[token_anchor]'):
        m_id = event_mention_el.get('m_id')
        # a set removes repeated anchors; sorting restores token order
        t_ids = {int(token_anchor_el.get('t_id'))
                 for token_anchor_el in event_mention_el.iterfind('token_anchor')}
        mid2tids[m_id] = [tid2token_objs[t_id] for t_id in sorted(t_ids)]

    return mid2tids

In [6]:
def update_tokens_with_lemma(tid2token_objs, pos_mapping, debug=False, parser=None):
    """
    given the tid2token_objs dictionary (output from load_tid_and_sent_dict)
    this function adds the lemma and the mapped part of speech to all
    ecb_parser_classes.Token instances

    The tokens are joined with single spaces in t_id order, parsed in one
    go, and the parsed tokens are paired with the Token instances by
    position in that order. The t_ids therefore do not have to be the
    consecutive numbers 1..N.

    :param dict tid2token_objs: mapping t_id -> ecb_parser_classes.Token instance
    :param dict pos_mapping: mapping from spacy pos tagset to wordnet pos tagset
    :param bool debug: if True, print (t_id, lemma, pos) for every token and
    wait for user input at the end
    :param callable parser: takes the joined text and returns a sized sequence
    of objects with `lemma_` and `pos_` attributes. Defaults to the
    notebook-level `nlp` pipeline.

    :raises ValueError: if the parser returns a different number of tokens
    than there are Token instances (the positional pairing would be wrong)
    """
    if parser is None:
        parser = nlp

    ordered_tokens = sorted(tid2token_objs.items())
    text = ' '.join(token_el_obj.token for t_id, token_el_obj in ordered_tokens)
    parsed_text = parser(text)

    # A token that is empty or contains whitespace would shift every following
    # lemma onto the wrong Token, so refuse to continue on a count mismatch.
    if len(parsed_text) != len(ordered_tokens):
        raise ValueError('parser returned %d tokens for %d input tokens'
                         % (len(parsed_text), len(ordered_tokens)))

    for (t_id, token_el_obj), token in zip(ordered_tokens, parsed_text):
        if debug:
            print(t_id, token.lemma_, token.pos_)
        token_el_obj.set_lemma(token.lemma_)
        token_el_obj.set_pos(pos_mapping[token.pos_])

    if debug:
        input('continue?')

In [7]:
def create_path_generator(ecb_folder, ecb=False, ecbplus=False):
    """
    function to create an iterable over the ecb(+) corpus

    The topic folders 1 up to and including 45 below ecb_folder are
    visited in numeric order; topic folders that do not exist are skipped.
    Within a topic the files are visited in sorted path order, so that
    the iteration order does not depend on the file system.

    :param str ecb_folder: path to ecb_folder
    :param bool ecb: if True, ecb files are included
    :param bool ecbplus: if True, ecbplus files are included

    :rtype: generator
    :return: (lxml.etree._ElementTree of xml file, document name, document identifier).
    If neither ecb nor ecbplus is True, nothing is yielded.
    """
    if ecb and ecbplus:
        pattern = '*.xml'
    elif ecb:
        pattern = '*ecb.xml'
    elif ecbplus:
        pattern = '*ecbplus.xml'
    else:
        # Nothing selected (this is what the default arguments ask for):
        # yield nothing instead of failing on an unbound glob pattern.
        return

    for topic_number in range(1, 46):
        folder_path = os.path.join(ecb_folder, str(topic_number))
        for xml_path in sorted(glob(os.path.join(folder_path, pattern))):
            doc = etree.parse(xml_path)
            root = doc.getroot()
            doc_name = root.get('doc_name')
            doc_id = root.get('doc_id')
            yield doc, doc_name, doc_id

### Process files

In [8]:
# corpus selection
path_ecb_folder = 'ECB+'
include_ecb = True
include_ecbplus = True

# spacy pos tag -> wordnet pos tag; every tag that is not listed maps to 'reste'
pos_mapping = defaultdict(lambda: 'reste', NOUN='n', VERB='v')

In [9]:
# result containers, filled by the processing cells below
ev_instances = {}  # instance_id -> ecb_parser_classes.EventInstance
ev_triggers = {}   # event trigger -> ecb_parser_classes.EventTrigger

In [10]:
# First pass over the corpus: register every ACTION event instance under its
# instance_id. Only the Markables elements are read here; the token tables and
# the spaCy lemmatisation are not needed for this and are computed in the pass
# that builds the event mentions.
for doc, doc_name, doc_id in create_path_generator(path_ecb_folder,
                                                   ecb=include_ecb,
                                                   ecbplus=include_ecbplus):
    for an_event_instance_el in doc.iterfind('Markables/*[@instance_id]'):
        event_instance_obj = ecb_parser_classes.EventInstance(event_instance_el=an_event_instance_el)
        # keep the instance only if its xml tag contains 'ACTION'
        if 'ACTION' in event_instance_obj.xml_tag:
            ev_instances[event_instance_obj.instance_id] = event_instance_obj

In [11]:
# Second pass over the corpus: attach every mention listed in a cross-document
# coreference relation to the event instance named in the relation's `note`
# attribute. The configured corpus folder is used so that both passes always
# read the same files.
for doc, doc_name, doc_id in create_path_generator(path_ecb_folder,
                                                   ecb=include_ecb,
                                                   ecbplus=include_ecbplus):

    tid2token_objs, sentence_dict = load_tid_and_sent_dict(doc)
    mid2tids = load_mid_dict(doc)
    # adds lemma and pos to the Token instances that mid2tids also refers to
    update_tokens_with_lemma(tid2token_objs, pos_mapping)

    for a_cross_doc_coref_el in doc.iterfind('Relations/CROSS_DOC_COREF'):
        event_instance_id = a_cross_doc_coref_el.get('note')

        for source_el in a_cross_doc_coref_el.iterfind('source'):
            m_id = source_el.get('m_id')
            token_objs = mid2tids[m_id]
            # the sentence of the first token is used as the mention's sentence
            sentence_number = token_objs[0].sentence
            sentence = ' '.join(sentence_dict[sentence_number])

            event_mention_obj = ecb_parser_classes.EventMention(event_instance_id,
                                                                token_objs,
                                                                doc_name,
                                                                doc_id,
                                                                m_id,
                                                                sentence)

            # mentions of instances that were not registered in the first
            # pass (e.g. non-ACTION instances) are dropped
            if event_instance_id in ev_instances:
                event_instance_obj = ev_instances[event_instance_id]
                event_instance_obj.event_mentions.add(event_mention_obj)

In [12]:
# Index the event instances by the trigger of each of their mentions:
# one EventTrigger per distinct trigger, created the first time it is seen.
for event_instance_id, event_instance_obj in ev_instances.items():
    for event_mention_obj in event_instance_obj.event_mentions:
        # event_trigger is a (trigger, pos) pair; only the trigger is used as key
        trigger, pos = event_mention_obj.event_trigger

        try:
            event_trigger_obj = ev_triggers[trigger]
        except KeyError:
            event_trigger_obj = ecb_parser_classes.EventTrigger(event_trigger=trigger)
            ev_triggers[trigger] = event_trigger_obj

        event_trigger_obj.event_instances.add(event_instance_obj)

In [13]:
# Persist both lookup tables in a single pickle. The target folder is created
# first so that a fresh checkout without `cache/` does not fail with
# FileNotFoundError.
output_path = 'cache/ecbANDecbplus.bin'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'wb') as outfile:
    pickle.dump((ev_instances, ev_triggers), outfile)