In [34]:
import json
import os

import numpy as np
import pandas as pd

In [35]:
def get_data(datapath):
    """Read a JSON Lines file and return its records as a list.

    Parameters
    ----------
    datapath : str or path-like
        Path of a text file holding one JSON document per line.

    Returns
    -------
    list
        The decoded value of every non-blank line, in file order.
        An empty file yields an empty list.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding (the outputs of this notebook are also
    # written as UTF-8).
    with open(datapath, encoding='utf-8') as f:
        # Whitespace-only lines (e.g. a trailing empty line) carry no record
        # and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]

In [36]:
# Incident types that get their own output bucket; templates of any other
# type are dropped.
_EVENT_TYPES = ('attack', 'kidnapping', 'bombing')

# (key read from the input template, key written to the output role dict),
# listed in the order the roles appear in the output.
_ROLE_KEYS = (
    ('PerpInd', 'perp_individual_id'),
    ('PerpOrg', 'perp_organization_id'),
    ('Target', 'phys_tgt_id'),
    ('Victim', 'hum_tgt_name'),
    ('Weapon', 'incident_instrument_id'),
)


def _first_elements(groups):
    """Return, for each group in ``groups``, the list of every member's element 0."""
    return [[member[0] for member in group] for group in groups]


def prepare_event_data(data):
    """Regroup document templates into per-incident-type event dictionaries.

    Parameters
    ----------
    data : iterable of dict
        Documents, each providing ``'doctext'`` and ``'templates'``. Every
        template provides ``'incident_type'`` plus the role keys
        ``'PerpInd'``, ``'PerpOrg'``, ``'Target'``, ``'Victim'`` and
        ``'Weapon'``. Each role value is a list of groups and each group is a
        list of indexable members, of which only element 0 is kept
        (presumably the mention text -- confirm against the input format).

    Returns
    -------
    dict
        ``{'attack': {...}, 'kidnapping': {...}, 'bombing': {...}}``. Each
        inner dict maps a generated id ``'TST-MUC3-NNNN'`` to
        ``{'doc': <doctext>, 'roles': {<role name>: [[...], ...]}}``.

    Raises
    ------
    KeyError
        If a document or a kept template lacks one of the keys listed above.

    Notes
    -----
    The generated id is a 1-based running counter within each incident type,
    zero-padded to four digits; it is not taken from the input document. A
    document with several kept templates produces one entry per template,
    each carrying the same ``'doc'`` text.
    """
    output_dict = {event_type: {} for event_type in _EVENT_TYPES}

    for doc in data:
        for template in doc['templates']:
            incident_type = template['incident_type']
            if incident_type not in _EVENT_TYPES:
                continue

            # An empty role list simply yields an empty list here, so no
            # separate length guard is needed.
            roles = {
                role_key: _first_elements(template[template_key])
                for template_key, role_key in _ROLE_KEYS
            }

            bucket = output_dict[incident_type]
            docid = "TST-MUC3-" + str(len(bucket) + 1).zfill(4)
            bucket[docid] = {'doc': doc['doctext'], 'roles': roles}

    return output_dict

In [37]:
# Load each split from its JSON Lines file in the working directory and
# regroup its templates by incident type. The cells below read the
# resulting `train`, `dev` and `test` dictionaries.
train = prepare_event_data(data=get_data(datapath='train.json'))
dev = prepare_event_data(data=get_data(datapath='dev.json'))
test = prepare_event_data(data=get_data(datapath='test.json'))

In [38]:
# Write the 'attack' events of every split to event_output/attack/<split>.json.
# The directory is created first so that a fresh checkout does not fail with
# FileNotFoundError when the output folder is missing.
os.makedirs('event_output/attack', exist_ok=True)
for split_name, split_events in (('train', train), ('dev', dev), ('test', test)):
    with open('event_output/attack/' + split_name + '.json', 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(split_events['attack'], f, ensure_ascii=False, indent=4)

In [39]:
# Write the 'kidnapping' events of every split to
# event_output/kidnapping/<split>.json. The directory is created first so
# that a fresh checkout does not fail with FileNotFoundError when the output
# folder is missing.
os.makedirs('event_output/kidnapping', exist_ok=True)
for split_name, split_events in (('train', train), ('dev', dev), ('test', test)):
    with open('event_output/kidnapping/' + split_name + '.json', 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(split_events['kidnapping'], f, ensure_ascii=False, indent=4)

In [40]:
# Write the 'bombing' events of every split to event_output/bombing/<split>.json.
# The directory is created first so that a fresh checkout does not fail with
# FileNotFoundError when the output folder is missing.
os.makedirs('event_output/bombing', exist_ok=True)
for split_name, split_events in (('train', train), ('dev', dev), ('test', test)):
    with open('event_output/bombing/' + split_name + '.json', 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(split_events['bombing'], f, ensure_ascii=False, indent=4)