In [1]:
import re

# Lookup table from an event subtype label to its parent event type
# (the name suggests the ACE event ontology). read_and_output() indexes this
# table with the label found on the first line of each event to fill the
# event's "type" field, so an unknown label raises KeyError there.
ace_types = {
    "Elect": "Personnel",
    "Declare-Bankruptcy": "Business",
    "Fine": "Justice",
    "Pardon": "Justice",
    "Start-Org": "Business",
    "Execute": "Justice",
    "Convict": "Justice",
    "Transfer-Money": "Transaction",
    "Start-Position": "Personnel",
    "Extradite": "Justice",
    "Transfer-Ownership": "Transaction",
    "Transport": "Movement",
    "Attack": "Conflict",
    "End-Org": "Business",
    "Phone-Write": "Contact",
    "Arrest-Jail": "Justice",
    "Meeting": "Contact",
    "Merge-Org": "Business",
    "Acquit": "Justice",
    "Sue": "Justice",
    "Charge-Indict": "Justice",
    "End-Position": "Personnel",
    "Sentence": "Justice",
    "Die": "Life",
    "Appeal": "Justice",
    "Nominate": "Personnel",
    "Marry": "Life",
    "Trial-Hearing": "Justice",
    "Divorce": "Life",
    "Demonstrate": "Conflict",
    "Be-Born": "Life",
    "Release-Parole": "Justice",
    "Injure": "Life"
}


In [3]:
from typing import Tuple

def get_tok_sent(char_b: int, char_e: int, xml_file: str) -> Tuple['Element', 'Element', 'Element']:
    """Find the tokens and the sentence covering a character span in a CoreNLP XML file.

    Args:
        char_b: character offset of the first character of the span. The
            begin token is the first token with
            ``CharacterOffsetBegin <= char_b < CharacterOffsetEnd``.
        char_e: exclusive end offset of the span. The end token is the first
            token with ``CharacterOffsetBegin < char_e <= CharacterOffsetEnd``.
        xml_file: path to the CoreNLP XML output.

    Returns:
        ``(token_b, token_e, sentence)`` as lxml elements, where ``sentence``
        is the grandparent element of both tokens.

    Raises:
        IndexError: no token covers ``char_b`` or ``char_e``.
        AssertionError: the two offsets fall into different sentences.

    >>> token_b, token_e, sentence = get_tok_sent(33, 34, 'test.xml')
    >>> tid_b = token_b.attrib.get('id')
    >>> tid_e = token_e.attrib.get('id')
    >>> sid = sentence.attrib.get('id')
    >>> print(tid_b, tid_e, sid)
    21 21 1
    >>> print(''.join([word.text for word in sentence.iter('word')]))
    新北市新庄区陈姓男子常因细故与邻居争吵，今年8月在家门口遇到隔壁庄姓8旬老翁又发生口角，竟持酒瓶砸向老翁，导致对方跌倒伤及脑部，昏迷数日后中枢神经休克死亡，今天被新北地检署依杀人罪起诉。
    """

    from lxml import etree

    # Binary mode lets the XML parser decode the bytes according to the
    # document's own encoding declaration instead of the locale default.
    with open(xml_file, 'rb') as f:
        tree = etree.parse(f)

    begin_hits = tree.xpath(
        ".//token[CharacterOffsetBegin<={0} and CharacterOffsetEnd>{0}]".format(char_b))
    if not begin_hits:
        raise IndexError('no token in {} covers char_b {}'.format(xml_file, char_b))

    end_hits = tree.xpath(
        ".//token[CharacterOffsetBegin<{0} and CharacterOffsetEnd>={0}]".format(char_e))
    if not end_hits:
        raise IndexError('no token in {} covers char_e {}'.format(xml_file, char_e))

    token_b, token_e = begin_hits[0], end_hits[0]

    # Two levels up from a token is its sentence (same node the former
    # "/../.." XPath suffix selected), so no second pair of queries is needed.
    sentence = token_b.getparent().getparent()
    sentence2 = token_e.getparent().getparent()
    assert sentence == sentence2, 'char_b {} is in sentence {} and char_e {} is in sentence {}'.format(str(char_b), sentence.attrib.get('id'), str(char_e), sentence2.attrib.get('id') )

    return token_b, token_e, sentence


def parse_line(line):
    """Split one annotation line of the form ``"<begin>,<end> <label> <text>"``.

    Returns a five-item list: the matched text, the two offsets as ints,
    the label (word characters and hyphens) and the remaining text.

    Raises:
        ValueError: the line does not start with the expected pattern.
    """
    import re

    found = re.match(r'(\d+),(\d+) ([\w-]+) (.*)', line)
    if found is None:
        raise ValueError('The input file might not have a correct format, which should be like "59,63 Injure 清理伤口"')

    begin, end, label, text = found.groups()
    return [found.group(0), int(begin), int(end), label, text]


def bold(text):
    """Wrap ``text`` in the ANSI "bold on" (ESC[1m) and "reset" (ESC[0m) sequences.

    For example, ``bold('Y') + 'N'`` is ESC[1m, ``Y``, ESC[0m, ``N``.
    """
    bold_on, reset = '\033[1m', '\033[0m'
    return ''.join((bold_on, text, reset))


def show_event(s, t, idx_list, d):
    """Return the text around an event with its spans highlighted for a terminal.

    Args:
        s: index of the first character of the event in ``d``.
        t: index of the last character of the event in ``d`` (inclusive).
        idx_list: iterable of ``[begin, end]`` inclusive index pairs; each
            span is wrapped in the ANSI red + bold sequence and a reset.
        d: the document text.

    Returns:
        ``d[s..t]`` with highlighting, widened by 10 characters on each side
        when the event is shorter than 20 characters (clipped at the text
        boundaries). Returns ``None`` after printing a message when
        ``s``/``t`` fall outside the text.
    """
    chars = list(d)
    win = 10 if t - s < 20 else 0

    for span in idx_list:  # make bold ('\033[1m') and red ('\033[91m')
        chars[span[0]] = '\033[91m' + '\033[1m' + chars[span[0]]
        chars[span[1]] = chars[span[1]] + '\033[0m'

    if s < 0 or t >= len(chars):
        print("invalid range (should be 0 ~ {})".format(len(chars)-1))
        return None

    # Clamp the left edge explicitly (a negative start would wrap around);
    # slicing already clips the right edge. This one expression covers every
    # case of the former if/elif chain, including the branch that crashed on
    # a two-argument str.join when the window overran both ends of the text.
    return ''.join(chars[max(0, s - win): t + 1 + win])

# import doctest
# doctest.testmod()

In [4]:


def _token_span(char_b, char_e, corenlp_xml_path):
    """Resolve a half-open character span to a token slice via get_tok_sent.

    Returns ``(token_b, token_e, in_tokens, sentence)`` where ``token_b`` /
    ``token_e`` are slice bounds into the sentence's token list and
    ``in_tokens`` holds the ``word`` text nodes of the tokens in that slice.
    """
    tok_b_el, tok_e_el, sentence = get_tok_sent(char_b, char_e, corenlp_xml_path)
    # Token ids are treated as 1-based: the begin id shifts down by one and
    # the end id is used unchanged as the exclusive slice bound.
    token_b = int(tok_b_el.attrib['id']) - 1
    token_e = int(tok_e_el.attrib['id'])
    in_tokens = [tok.xpath('word/text()') for tok in tok_b_el.xpath('../token')[token_b: token_e]]
    return token_b, token_e, in_tokens, sentence


def read_and_output(fname, input_folder):
    """Convert the event extractor output for one document into a dictionary.

    Reads ``<input_folder>/<fname>.arg`` and looks every span up in
    ``<input_folder>/<fname>.xml`` (CoreNLP XML). In the ``.arg`` file events
    are separated by lines of ``==================``; the first line of an
    event describes the trigger and the following lines its arguments, each
    in the ``"<begin>,<end> <label> <text>"`` form accepted by parse_line,
    with ``<end>`` being the index of the last character (inclusive).

    Args:
        fname: document file name; its last path component becomes ``did``.
        input_folder: folder holding the ``.arg`` and ``.xml`` files.

    Returns:
        OrderedDict mapping ``"D<did>-S<sid>-EVM<id>"`` to an OrderedDict
        with the keys ``id, sid, did, type, subtype, s_text, trigger, args``,
        where ``id`` numbers the events of one sentence starting from 0.

    Raises:
        ValueError: a line does not have the expected format.
        KeyError: a trigger label is missing from ``ace_types``.
    """
    import os
    from collections import OrderedDict

    # os.path.join tolerates an input_folder with or without a trailing slash.
    corenlp_xml_path = os.path.join(input_folder, fname + '.xml')
    ee_out_fpath = os.path.join(input_folder, fname + '.arg')
    did = os.path.basename(os.path.join(input_folder, fname))

    with open(ee_out_fpath, 'r') as f:
        doc_arg = f.read().splitlines()

    # Group the lines into events. Empty groups (leading, doubled or trailing
    # separators, empty file) are dropped so they cannot become key-less events.
    event_list = []
    event = []
    for line in doc_arg:
        if line == "==================":
            if event:
                event_list.append(event)
            event = []
        else:
            event.append(line)
    if event:
        event_list.append(event)

    order = ['id', 'sid', 'did', 'cid', 'type', 'subtype', 's_text', 'trigger', 'args']
    # Next free event number per sentence. A per-sentence counter keeps the
    # ids unique even when events of one sentence are not listed consecutively;
    # otherwise two events would share a fullid and overwrite each other.
    next_id_by_sid = {}
    output = OrderedDict()

    for evid, event in enumerate(event_list):
        event_dict = OrderedDict({'abs_id': evid, 'trigger': OrderedDict(), 'args': []})

        for idx, arg in enumerate(event):
            _, s, t, type_, cn_word = parse_line(arg)
            char_e = t + 1  # inclusive last index -> exclusive end offset

            if idx == 0:
                # First line of an event: the trigger.
                event_dict['did'] = did
                event_dict['type'] = ace_types[type_]
                event_dict['subtype'] = type_

                token_b, token_e, in_tokens, sentence = _token_span(s, char_e, corenlp_xml_path)
                trigger = event_dict['trigger']
                trigger['text'] = cn_word
                trigger['char_b'] = s
                trigger['char_e'] = char_e
                trigger['token_b'] = token_b
                trigger['token_e'] = token_e
                trigger['in_tokens'] = in_tokens

                sid = int(sentence.get('id'))
                event_dict['sid'] = sid
                event_dict['s_text'] = ''.join([word.text for word in sentence.iter('word')])

                event_dict['id'] = next_id_by_sid.get(sid, 0)
                next_id_by_sid[sid] = event_dict['id'] + 1
            else:
                # Remaining lines: the arguments of the event.
                token_b, token_e, in_tokens, _ = _token_span(s, char_e, corenlp_xml_path)
                arg_dict = OrderedDict()
                arg_dict['role'] = type_
                arg_dict['text'] = cn_word
                arg_dict['char_b'] = s
                arg_dict['char_e'] = char_e
                arg_dict['token_b'] = token_b
                arg_dict['token_e'] = token_e
                arg_dict['in_tokens'] = in_tokens
                event_dict['args'].append(arg_dict)

        # Emit the keys in a fixed order; keys that were never set are skipped.
        ordered_event_dict = OrderedDict(
            (key, event_dict[key]) for key in order if key in event_dict)

        fullid = 'D' + str(event_dict['did']) + '-S' + str(event_dict['sid']) + '-EVM' + str(event_dict['id'])
        output[fullid] = ordered_event_dict

    return output


In [7]:
import os
# Show the kernel's current working directory; the cells below use paths
# relative to it.
os.getcwd()

'/workspace/EEtask/Chinese'

In [88]:
fname = '0'

from opencc import OpenCC
import subprocess
import os
import shutil

# Stage the input document in the two folders the pipeline is given it in.
# shutil.copy returns only once the copy is complete and involves no shell,
# whereas os.popen('cp ...') returned immediately and could race with run.sh.
shutil.copy(fname, os.path.join('SinoCoreferencer', 'stanford-corenlp-full-2014-08-27', fname))
shutil.copy(fname, os.path.join('SinoCoreferencer', 'data', fname))

# Run the pipeline from inside its folder via `cwd=` so the kernel's own
# working directory never changes, even if this cell fails halfway.
pipes = subprocess.Popen(['bash', 'run.sh', 'test'], cwd='SinoCoreferencer',
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = pipes.communicate()
if pipes.returncode != 0:
    # Surface the failure instead of discarding it; the moves below will
    # still raise if the expected output files were not produced.
    print('run.sh exited with status {}:\n{}'.format(
        pipes.returncode, stderr.decode('utf-8', errors='replace')))

# Collect the pipeline results under outputs/, renamed after the input file.
for suffix in ('coref.entities', 'coref.events', 'arg', 'xml', 'time', 'value'):
    shutil.move(os.path.join('SinoCoreferencer', 'data', 'doc.' + suffix),
                os.path.join('outputs', fname + '.' + suffix))

In [49]:
# Build the event dictionary for `fname` from the .arg/.xml files under
# SinoCoreferencer/data/ and pretty-print it as JSON; ensure_ascii=False
# keeps the Chinese text readable instead of \u escapes.
output = read_and_output(fname, "SinoCoreferencer/data/")
import json
# print(json.dumps(event_dict_list, indent=4, ensure_ascii=False))
print(json.dumps(output, indent=4, ensure_ascii=False))

{
    "Ddoc-S4-EVM0": {
        "id": 0,
        "sid": 4,
        "did": "doc",
        "type": "Conflict",
        "subtype": "Attack",
        "s_text": "侦讯时，陈男坦承持酒瓶打人，但辩称「只是想给对方教训」、「轻轻打一下」等语，否认杀人犯意。",
        "trigger": {
            "text": "打",
            "char_b": 214,
            "char_e": 215,
            "token_b": 8,
            "token_e": 9,
            "in_tokens": [
                [
                    "打人"
                ]
            ]
        },
        "args": [
            {
                "role": "Target",
                "text": "人",
                "char_b": 215,
                "char_e": 216,
                "token_b": 8,
                "token_e": 9,
                "in_tokens": [
                    [
                        "打人"
                    ]
                ]
            }
        ]
    },
    "Ddoc-S2-EVM0": {
        "id": 0,
        "sid": 2,
        "did": "doc",
        "type": "Movement",
        "subtype": "Transport",
        "s_text": "检方

In [1]:
from event_zh.main import extract_to_json
# Run the packaged extractor on input '1'; 'outputs' is presumably the
# destination folder -- TODO confirm against event_zh.main.extract_to_json.
extract_to_json('1', 'outputs')

# Batch

In [10]:
import sys
import os
from opencc import OpenCC


def gen_doc_index(infolder, out_fname=None, write_to_text=False, write_to_json=False):
    """Number the regular files of a folder and optionally save the index.

    Args:
        infolder: folder to scan; sub-directories are ignored.
        out_fname: output path without extension. Required when either
            write flag is set.
        write_to_text: also write ``<out_fname>.txt`` with one
            ``<id><TAB><file name>`` row per document.
        write_to_json: also write ``<out_fname>.json`` with the same mapping.

    Returns:
        OrderedDict mapping the 0-based document id to the file name, in
        sorted file-name order.

    Raises:
        ValueError: a write flag is set but ``out_fname`` is missing.
    """
    from collections import OrderedDict
    import json
    import csv

    # Fail early with a clear message rather than with "None + str" later on.
    if (write_to_text or write_to_json) and out_fname is None:
        raise ValueError('out_fname is required when write_to_text or write_to_json is set')

    files = [name for name in sorted(os.listdir(infolder))
             if os.path.isfile(os.path.join(infolder, name))]  # get only files
    doc_id_dict = OrderedDict(enumerate(files))

    if write_to_json:
        with open(out_fname + '.json', 'w') as f:
            json.dump(doc_id_dict, f)

    if write_to_text:
        # newline='' is what the csv module asks for, so the writer's own
        # line terminator is not translated a second time by the file object.
        with open(out_fname + '.txt', 'w', newline='') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerows(doc_id_dict.items())

    return doc_id_dict

        

In [21]:
input_folder = 'chinese_1990'
doc_id_dict = gen_doc_index(infolder=input_folder, out_fname='did', write_to_text=True, write_to_json=True)


# t2s
cc = OpenCC('t2s')

# Walk the index itself: it already contains only regular files of
# input_folder and pairs every file with its id. The former loop tested
# os.path.isdir(fname) against the working directory instead of
# input_folder and searched the whole index again for every file.
fnames = list(doc_id_dict.values())

os.makedirs('inputs', exist_ok=True)  # the target folder may not exist yet

for did, fname in doc_id_dict.items():
    with open(os.path.join(input_folder, fname), 'r') as f:
        text2conv = [line for line in f]
    # Each converted document is stored under its numeric id.
    with open(os.path.join('inputs', str(did)), 'w') as f:
        for line in text2conv:
            f.write(cc.convert(line))

In [34]:
# NOTE(review): this moves the kernel's working directory two levels up, so
# every relative path in later cells ('inputs', 'outputs', ...) resolves
# against the new location, and re-running the cell moves up again.
os.chdir('../..')

In [35]:
from event_zh.main import extract_to_json
import os

# Run the extractor on every indexed document (stored under inputs/<id>).
for fname in doc_id_dict.keys():
    # Report progress without blocking: the former input(fname) call paused
    # for a key press on every document, so the batch could not run unattended.
    print(fname)
    fpath = os.path.join('inputs', str(fname))
    if not os.path.isdir(fpath):
        extract_to_json(fpath, 'outputs')




0 


current dir /workspace/EEtask/Chinese
ls:  ['did.txt', 'venv', '.ipynb_checkpoints', 'outputs', 'configure', 'inputs', 'batch.py', 'main.py', 'chinese_1990', 'main.ipynb', 'did.json', 'event_zh']


 


FileNotFoundError: [Errno 2] No such file or directory: 'inputs'