In [1]:
import copy
import json
import random

import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset
import torch.nn.functional as F
from torch.optim import Adam

from collections import Counter

from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tqdm import tqdm_notebook as tqdm

from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForSequenceClassification

from crf_layer import CRFLayer

In [2]:
tokenizer_model = "hfl/chinese-roberta-wwm-ext-large"

shema_path = './dictionary/event_schema.json'
enerm_dict_path = './dictionary/enum_tag.dict'
trigger_dict_path = './dictionary/trigger_tag.dict'
role_dict_path = './dictionary/role_tag.dict'

enerm_model_path = './models/DuEE_fin/roberta-chinese-large/enum.bin'
tigger_model_path = './models/DuEE_fin/roberta-chinese-large/trigger.bin'
role_model_path = './models/DuEE_fin/roberta-chinese-large/role.bin'

duee_fin_dev_path = './resources/duee_fin_dev.json'
duee_fin_dev_preprocess_path = './resources/duee_fin_dev_preprocess.json'

enum_role = "环节"
enum_event_type = "公司上市"
max_seq_len = 512

In [3]:
def data_process(dataset, model="trigger"):
    """data_process"""
    
    def label_data(data, start, l, _type):
        """label_data"""
        for i in range(start, start + l):
            suffix = "B-" if i == start else "I-"
            data[i] = "{}{}".format(suffix, _type)
        return data

    output = []
    for d_json in dataset:
        _id = d_json["id"]
        text_a = [
            "，" if t == " " or t == "\n" or t == "\t" else t
            for t in list(d_json["text"].lower())
        ]
        if model == "trigger":
            labels = ["O"] * len(text_a)
#             if len(d_json.get("event_list", [])) == 0:
#                 continue
            for event in d_json.get("event_list", []):
                event_type = event["event_type"]
                start = event["trigger_start_index"]
                trigger = event["trigger"]
                labels = label_data(labels, start, len(trigger), event_type)
            output.append({
                "id": d_json["id"],
                "sent_id": d_json["sent_id"],
                "text": d_json["text"],
                "tokens": text_a,
                "labels": labels
            })
        elif model == "role":
            labels = ["O"] * len(text_a)
            for event in d_json.get("event_list", []):
                for arg in event["arguments"]:
                    role_type = arg["role"]
                    if role_type == enum_role:
                        continue
                    argument = arg["argument"]
                    start = arg["argument_start_index"]
                    labels = label_data(labels, start, len(argument), role_type)
            output.append({
                "id": d_json["id"],
                "sent_id": d_json["sent_id"],
                "text": d_json["text"],
                "tokens": text_a,
                "labels": labels
            })
    return output

In [4]:
def enum_data_process(dataset):
    """enum_data_process"""
    output = []
    for d_json in dataset:
        text = d_json["text"].lower().replace("\t", " ")
#         if len(d_json.get("event_list", [])) == 0:
#             continue
        label = 'ABS'
        for event in d_json.get("event_list", []):
            if event["event_type"] != "公司上市":
                continue
            for argument in event["arguments"]:
                role_type = argument["role"]
                if role_type == enum_role:
                    label = argument["argument"]
        output.append({
            "id": d_json["id"],
            "sent_id": d_json["sent_id"],
            "text": text,
            "label": label
        })
    return output

In [5]:
def load_dict(dict_path):
    """load_dict"""
    vocab = {}
    for line in open(dict_path, 'r', encoding='utf-8'):
        value, key = line.strip('\n').split('\t')
        vocab[key] = int(value)
    return vocab

In [6]:
label_enum_vocab = load_dict(dict_path=enerm_dict_path)
id2enumlabel = {val: key for key, val in label_enum_vocab.items()}
label_trigger_vocab = load_dict(dict_path=trigger_dict_path)
id2triggerlabel = {val: key for key, val in label_trigger_vocab.items()}
label_role_vocab = load_dict(dict_path=role_dict_path)
id2rolelabel = {val: key for key, val in label_role_vocab.items()}

In [7]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)

In [8]:
PADDING = tokenizer.vocab[tokenizer.pad_token]
SEP = tokenizer.vocab[tokenizer.sep_token]

In [9]:
class BaiduEnermDataset(Dataset):
    
    def __init__(self, dataset_path, label_dict_path):
        self.label_vocab = load_dict(label_dict_path)
        self.label_num = max(self.label_vocab.values()) + 1
        self.examples = []
        with open(dataset_path, 'r', encoding='utf-8') as f:
            dataset = json.loads(f.read())
            preprocess_dataset = enum_data_process(dataset)
            for d_json in preprocess_dataset:
                text = d_json['text']
                input_ids = tokenizer(text, is_split_into_words=False, add_special_tokens=True, max_length=max_seq_len, truncation=True)['input_ids']
                tokens_input = input_ids + [PADDING] * (max_seq_len - len(input_ids))
                attention_masks = self._get_attention_mask(input_ids, max_seq_len)
                token_type_ids = self._get_token_type_id(input_ids, max_seq_len)
                example = {
                    "input_ids": tokens_input, "attention_masks": attention_masks, "token_type_ids": token_type_ids,
                    "seq_lens": len(input_ids)
                }
                example.update(d_json)
                if 'label' in d_json:
                    label = d_json['label']
                    example.update({"encoded_label": self.label_vocab.get(label, -1)})
                self.examples.append(example)
                
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, item_idx):
        example = {
            "id": self.examples[item_idx]["id"],
            "sent_id": self.examples[item_idx]["sent_id"],
            "text": self.examples[item_idx]["text"],
            "input_ids": torch.tensor(self.examples[item_idx]["input_ids"]).long(),
            "attention_masks": torch.tensor(self.examples[item_idx]["attention_masks"]),
            "token_type_ids": torch.tensor(self.examples[item_idx]["token_type_ids"]),
            "seq_lens": self.examples[item_idx]["seq_lens"]
        }
        if "encoded_label" in self.examples[item_idx]:
            example.update({"encoded_label": torch.tensor(self.examples[item_idx]["encoded_label"], dtype=torch.long)})
        return example

    def _get_attention_mask(self, input_ids, max_seq_len):
        """Mask for padding."""
        if len(input_ids) > max_seq_len:
            raise IndexError("Token length more than max seq length!")
        return [1] * len(input_ids) + [0] * (max_seq_len - len(input_ids))

    def _get_token_type_id(self, input_ids, max_seq_len):
        """Segments: 0 for the first sequence, 1 for the second."""
        if len(input_ids) > max_seq_len:
            raise IndexError("Token length more than max seq length!")
        segments = []
        current_segment_id = 0
        for input_id in input_ids:
            segments.append(current_segment_id)
            if input_id == SEP:
                current_segment_id = 1
        return segments + [0] * (max_seq_len - len(input_ids)) 

In [10]:
class BaiduEventDataset(Dataset):
    
    def __init__(self, dataset_path, label_dict_path, model="trigger", ignore_index=-100):
        self.label_vocab = load_dict(label_dict_path)
        self.label_num = max(self.label_vocab.values()) + 1
        self.examples = []
        with open(dataset_path, 'r', encoding='utf-8') as f:
            dataset = json.loads(f.read())
            preprocess_dataset = data_process(dataset, model=model)
            for d_json in preprocess_dataset:
                tokens = d_json['tokens']
                input_ids = tokenizer(tokens, is_split_into_words=True, add_special_tokens=True, max_length=max_seq_len, truncation=True)['input_ids']
                tokens_input = input_ids + [PADDING] * (max_seq_len - len(input_ids))
                attention_masks = self._get_attention_mask(input_ids, max_seq_len)
                token_type_ids = self._get_token_type_id(input_ids, max_seq_len)
                example = {
                    "input_ids": tokens_input, "attention_masks": attention_masks, "token_type_ids": token_type_ids,
                    "seq_lens": len(input_ids)
                }
                example.update(d_json)
                if 'labels' in d_json:
                    labels = d_json['labels']
                    labels = labels[:(max_seq_len - 2)]
                    encoded_label = ["O"] + labels + ["O"]
                    encoded_label = [self.label_vocab[x] for x in encoded_label] + [ignore_index] * (max_seq_len - 2 - len(labels))
                    example.update({"encoded_label": encoded_label})
                self.examples.append(example)

    def _get_attention_mask(self, input_ids, max_seq_len):
        """Mask for padding."""
        if len(input_ids) > max_seq_len:
            raise IndexError("Token length more than max seq length!")
        return [1] * len(input_ids) + [0] * (max_seq_len - len(input_ids))

    def _get_token_type_id(self, input_ids, max_seq_len):
        """Segments: 0 for the first sequence, 1 for the second."""
        if len(input_ids) > max_seq_len:
            raise IndexError("Token length more than max seq length!")
        segments = []
        current_segment_id = 0
        for input_id in input_ids:
            segments.append(current_segment_id)
            if input_id == SEP:
                current_segment_id = 1
        return segments + [0] * (max_seq_len - len(input_ids))        
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, item_idx):
        example = {
            "id": self.examples[item_idx]["id"],
            "sent_id": self.examples[item_idx]["sent_id"],
            "text": self.examples[item_idx]["text"],
            "input_ids": torch.tensor(self.examples[item_idx]["input_ids"]).long(),
            "attention_masks": torch.tensor(self.examples[item_idx]["attention_masks"]),
            "token_type_ids": torch.tensor(self.examples[item_idx]["token_type_ids"]),
            "seq_lens": self.examples[item_idx]["seq_lens"]
        }
        if "encoded_label" in self.examples[item_idx]:
            example.update({"encoded_label": torch.tensor(self.examples[item_idx]["encoded_label"], dtype=torch.long)})
        return example

In [11]:
dev_enerm_dataset = BaiduEnermDataset(dataset_path=duee_fin_dev_preprocess_path, label_dict_path=enerm_dict_path)
dev_trigger_dataset = BaiduEventDataset(dataset_path=duee_fin_dev_preprocess_path, label_dict_path=trigger_dict_path, model="trigger")
dev_role_dataset = BaiduEventDataset(dataset_path=duee_fin_dev_preprocess_path, label_dict_path=role_dict_path, model="role")

In [12]:
len(dev_enerm_dataset)

4647

In [13]:
len(dev_trigger_dataset)

4647

In [14]:
len(dev_role_dataset)

4647

In [15]:
def set_seed(seed = 42):
    """Set the seed for generating random numbers on all GPUs.

    It's safe to call this function if CUDA is not available; in that case, it is silently ignored.

    Args:
        seed (int, optional): random numbers on all GPUs. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

In [16]:
# setting device on GPU if available, else CPU
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    n_gpu = torch.cuda.device_count()
    print(torch.cuda.get_device_name(0))
    
    print('Memory Usage:')
    print('Allocated:', torch.cuda.memory_allocated(0)/1024**3, 'GB')
    print('Cached:   ', torch.cuda.memory_reserved(0)/1024**3, 'GB')
    
    print('CUDA Device Count:', n_gpu)
    
set_seed(seed=42)

Using device: cuda:1

Tesla V100-PCIE-32GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
CUDA Device Count: 3


In [17]:
@torch.no_grad()
def test_enerm(model, test_dataloader):
    from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
    
    model.eval()
    step = 0
    eval_acc = 0.0
    eval_f1 = 0.0
    eval_precision = 0.0
    eval_recall = 0.0
    results = []
    test_iterator = tqdm(test_dataloader)
    for batch in test_iterator:
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_masks'].to(device),
            token_type_ids=batch['token_type_ids'].to(device)
        )
        logits = outputs.logits

        probs = F.softmax(logits, dim=1).cpu()
        probs_ids = torch.argmax(logits, dim=-1).cpu().numpy()
        true_label = batch.get("encoded_label", None).cpu().numpy()
        pred_label = copy.deepcopy(probs_ids)
        ignore_indices = np.argwhere(true_label == -1)
        pred_label[ignore_indices] = -1
        probs = probs.numpy()
        eval_acc += accuracy_score(true_label.flatten(), pred_label.flatten())
        eval_precision += precision_score(true_label.flatten(), pred_label.flatten(), average="macro", zero_division=1)
        eval_recall += recall_score(true_label.flatten(), pred_label.flatten(), average="macro", zero_division=1)
        eval_f1 += f1_score(true_label.flatten(), pred_label.flatten(), average="macro")
        for id_, sent_id, text, prob_one, p_id in zip(batch['id'], batch['sent_id'], batch['text'], probs.tolist(), probs_ids.tolist()):
            label_probs = {}
            for idx, p in enumerate(prob_one):
                label_probs[id2enumlabel[idx]] = p
            results.append({"id": id_, "sent_id": sent_id, "text": text, "pred":{"probs": label_probs, "label": id2enumlabel[p_id]}})
        step += 1
    print({"Avg eval acc": f"{eval_acc/step:.2f}", "Avg eval precision": f"{eval_precision/step:.2f}", "Avg eval recall": f"{eval_recall/step:.2f}", "Avg eval f1": f"{eval_f1/step:.2f}"})
    return results

In [18]:
enum_model = torch.load(enerm_model_path).to(device)

test_enerm_sampler = SequentialSampler(dev_enerm_dataset)
test_enerm_dataloader = DataLoader(dev_enerm_dataset, sampler=test_enerm_sampler, batch_size = 512)
    
sentences_enum_data = test_enerm(enum_model, test_enerm_dataloader)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


{'Avg eval acc': '1.00', 'Avg eval precision': '0.91', 'Avg eval recall': '0.87', 'Avg eval f1': '0.83'}


In [19]:
@torch.no_grad()
def test_trigger(model, test_dataloader):
    from seqeval.metrics import f1_score, recall_score, precision_score, accuracy_score

    model.eval()
    step = 0
    eval_acc = 0.0
    eval_f1 = 0.0
    eval_precision = 0.0
    eval_recall = 0.0
    results = []
    test_iterator = tqdm(test_dataloader)
    for batch in test_iterator:
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_masks'].to(device),
            token_type_ids=batch['token_type_ids'].to(device)
        )
        logits = outputs.logits

        probs = F.softmax(logits, dim=1).cpu()
        probs_ids = torch.argmax(logits, dim=-1).cpu().numpy()
        probs = probs.numpy()
        pred_Y, true_Y = [], []
        for t_list, p_list, seq_len in zip(batch['encoded_label'].cpu().tolist(), probs_ids.tolist(), batch['seq_lens']):
            if not all([id2triggerlabel.get(tid, "O") == "O" for tid in t_list[1: seq_len - 1]]):
                pred_Y.append([id2triggerlabel.get(pid, "O") for pid in p_list[1: seq_len - 1]])
                true_Y.append([id2triggerlabel.get(tid, "O") for tid in t_list[1: seq_len - 1]])
        eval_acc += accuracy_score(pred_Y, true_Y)
        eval_precision += precision_score(pred_Y, true_Y, zero_division=1)
        eval_recall += recall_score(pred_Y, true_Y, zero_division=1)
        eval_f1 += f1_score(pred_Y, true_Y)
        for id_, sent_id, text, p_list, p_ids, seq_len in zip(batch['id'], batch['sent_id'], batch['text'], probs.tolist(), probs_ids.tolist(), batch['seq_lens']):
            prob_one = [p_list[index][pid] for index, pid in enumerate(p_ids[1: seq_len - 1])]
            label_one = [id2triggerlabel[pid] for pid in p_ids[1: seq_len - 1]]
            results.append({"id": id_, "sent_id":sent_id, "text": text, "pred": {"probs": prob_one, "labels": label_one}})
        step += 1
    print({"Avg eval acc": f"{eval_acc/step:.2f}", "Avg eval precision": f"{eval_precision/step:.2f}", "Avg eval recall": f"{eval_recall/step:.2f}", "Avg eval f1": f"{eval_f1/step:.2f}"})
    return results

In [20]:
tigger_model = torch.load(tigger_model_path).to(device)

test_trigger_sampler = SequentialSampler(dev_trigger_dataset)
test_trigger_dataloader = DataLoader(dev_trigger_dataset, sampler=test_trigger_sampler, batch_size = 512)
    
sentences_tigger_data = test_trigger(tigger_model, test_trigger_dataloader)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


{'Avg eval acc': '1.00', 'Avg eval precision': '0.89', 'Avg eval recall': '0.86', 'Avg eval f1': '0.87'}


In [21]:
@torch.no_grad()
def test_role(model, test_dataloader):
    from seqeval.metrics import f1_score, recall_score, precision_score, accuracy_score

    model.eval()
    step = 0
    eval_acc = 0.0
    eval_f1 = 0.0
    eval_precision = 0.0
    eval_recall = 0.0
    results = []
    test_iterator = tqdm(test_dataloader)
    for batch in test_iterator:
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_masks'].to(device),
            token_type_ids=batch['token_type_ids'].to(device)
        )
        logits = outputs.logits

        probs = F.softmax(logits, dim=1).cpu()
        probs_ids = torch.argmax(logits, dim=-1).cpu().numpy()
        probs = probs.numpy()
        pred_Y, true_Y = [], []
        for t_list, p_list, seq_len in zip(batch['encoded_label'].cpu().tolist(), probs_ids.tolist(), batch['seq_lens']):
            if not all([id2rolelabel.get(tid, "O") == "O" for tid in t_list[1: seq_len - 1]]):
                pred_Y.append([id2rolelabel.get(pid, "O") for pid in p_list[1: seq_len - 1]])
                true_Y.append([id2rolelabel.get(tid, "O") for tid in t_list[1: seq_len - 1]])
        eval_acc += accuracy_score(pred_Y, true_Y)
        eval_precision += precision_score(pred_Y, true_Y, zero_division=1)
        eval_recall += recall_score(pred_Y, true_Y, zero_division=1)
        eval_f1 += f1_score(pred_Y, true_Y)
        for id_, sent_id, text, p_list, p_ids, seq_len in zip(batch['id'], batch['sent_id'], batch['text'], probs.tolist(), probs_ids.tolist(), batch['seq_lens']):
            prob_one = [p_list[index][pid] for index, pid in enumerate(p_ids[1: seq_len - 1])]
            label_one = [id2rolelabel[pid] for pid in p_ids[1: seq_len - 1]]
            results.append({"id": id_, "sent_id": sent_id, "text": text, "pred":{"probs": prob_one, "labels": label_one}})
        step += 1
    print({"Avg eval acc": f"{eval_acc/step:.2f}", "Avg eval precision": f"{eval_precision/step:.2f}", "Avg eval recall": f"{eval_recall/step:.2f}", "Avg eval f1": f"{eval_f1/step:.2f}"})
    return results

In [22]:
role_model = torch.load(role_model_path).to(device)

test_role_sampler = SequentialSampler(dev_role_dataset)
test_role_dataloader = DataLoader(dev_role_dataset, sampler=test_role_sampler, batch_size = 512)
    
sentences_role_data = test_role(role_model, test_role_dataloader)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


{'Avg eval acc': '0.92', 'Avg eval precision': '0.65', 'Avg eval recall': '0.72', 'Avg eval f1': '0.68'}


In [23]:
from utils import read_by_lines, extract_result

In [24]:
def event_normalization(doc):
    """event_merge"""
    for event in doc.get("event_list", []):
        argument_list = []
        argument_set = set()
        for arg in event["arguments"]:
            arg_str = "{}-{}".format(arg["role"], arg["argument"])
            if arg_str not in argument_set:
                argument_list.append(arg)
            argument_set.add(arg_str)
        event["arguments"] = argument_list

    event_list = sorted(
        doc.get("event_list", []),
        key=lambda x: len(x["arguments"]),
        reverse=True)
    new_event_list = []
    for event in event_list:
        event_type = event["event_type"]
        event_argument_set = set()
        for arg in event["arguments"]:
            event_argument_set.add("{}-{}".format(arg["role"], arg["argument"]))
        flag = True
        for new_event in new_event_list:
            if event_type != new_event["event_type"]:
                continue
            new_event_argument_set = set()
            for arg in new_event["arguments"]:
                new_event_argument_set.add("{}-{}".format(arg["role"], arg[
                    "argument"]))
            if len(event_argument_set & new_event_argument_set) == len(
                    new_event_argument_set):
                flag = False
        if flag:
            new_event_list.append(event)
    doc["event_list"] = new_event_list
    return doc

def predict_data_process(trigger_data, role_data, enum_data, schema_file):
    """predict_data_process"""
    pred_ret = []
    schema_data = read_by_lines(schema_file)
    print("trigger predict {} load.".format(len(trigger_data)))
    print("role predict {} load".format(len(role_data)))
    print("enum predict {} load".format(len(enum_data)))
    print("schema {} load from {}".format(len(schema_data), schema_file))

    schema, sent_role_mapping, sent_enum_mapping = {}, {}, {}
    for s in schema_data:
        d_json = json.loads(s)
        schema[d_json["event_type"]] = [r["role"] for r in d_json["role_list"]]

    # role depends on id and sent_id 
    for d_json in role_data:
        r_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        role_ret = {}
        for r in r_ret:
            role_type = r["type"]
            if role_type not in role_ret:
                role_ret[role_type] = []
            role_ret[role_type].append("".join(r["text"]))
        _id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
        sent_role_mapping[_id] = role_ret

    # process the enum_role data
    for d_json in enum_data:
        _id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
        label = d_json["pred"]["label"]
        sent_enum_mapping[_id] = label

    # process trigger data
    for d_json in trigger_data:
        t_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
        pred_event_types = list(set([t["type"] for t in t_ret]))
        event_list = []
        _id = "{}\t{}".format(d_json["id"], d_json["sent_id"])
        for event_type in pred_event_types:
            role_list = schema[event_type]
            arguments = []
            for role_type, ags in sent_role_mapping[_id].items():
                if role_type not in role_list:
                    continue
                for arg in ags:
                    arguments.append({"role": role_type, "argument": arg})
            # 特殊处理环节
            if event_type == enum_event_type:
                arguments.append({
                    "role": enum_role,
                    "argument": sent_enum_mapping[_id]
                })
            event = {
                "event_type": event_type,
                "arguments": arguments,
                "text": d_json["text"],
                "label": d_json["pred"]["labels"]
            }
            event_list.append(event)
        pred_ret.append({
            "id": d_json["id"],
            "sent_id": d_json["sent_id"],
            "text": d_json["text"],
            "event_list": event_list
        })
    doc_pred = {}
    for d in pred_ret:
        if d["id"] not in doc_pred:
            doc_pred[d["id"]] = {"id": d["id"], "event_list": []}
        doc_pred[d["id"]]["event_list"].extend(d["event_list"])

    # unfiy the all prediction results and save them
    doc_pred = [
        event_normalization(r)
        for r in doc_pred.values()
    ]
    print("submit data {} save".format(len(doc_pred)))
    return doc_pred

In [25]:
doc_pred = predict_data_process(sentences_tigger_data, sentences_role_data, sentences_enum_data, shema_path)

trigger predict 4647 load.
role predict 4647 load
enum predict 4647 load
schema 13 load from ./dictionary/event_schema.json
submit data 1023 save


In [26]:
def evaluate_mergedata(predict_doc, true_merge_dataset_path):
    true_data_list = []
    with open(true_merge_dataset_path, 'r', encoding='utf-8') as f:
        for line in f:
            json_data = json.loads(line)
            true_data_list.append(json_data)
    predict_mapping_dict = {}
    for doc in predict_doc:
        predict_mapping_dict[doc['id']] = doc
    count_predict = 0
    count_true = 0
    count_correct = 0
    for true_data in true_data_list:
        if true_data['id'] not in predict_mapping_dict:
            if 'event_list' in true_data:
                print('error: ', true_data)
        else:
            predict_doc = predict_mapping_dict[true_data['id']]
            for pred_event in predict_doc.get('event_list', []):
                pred_event_type = pred_event['event_type']
                count_predict += len(pred_event.get('arguments', []))
            for true_event in true_data.get('event_list', []):
                true_event_type = true_event['event_type']
                count_true += len(true_event.get('arguments', []))
                predict_arguments = []
                for pred_event in predict_doc.get('event_list', []):
                    pred_event_type = pred_event['event_type']
                    if true_event_type == pred_event_type:
                        for pre_arg in pred_event.get('arguments', []):
                            if pre_arg not in predict_arguments:
                                predict_arguments.append(pre_arg)
                for true_argument in true_event.get('arguments', []):
                    for predict_argument in predict_arguments:
                        if predict_argument['role'] == true_argument['role'] and predict_argument['argument'] == true_argument['argument']:
                            count_correct += 1
    p = count_correct / max(1, count_predict)  # precision
    r = count_correct / max(1, count_true)  # recall
    f1 = 2 * r * p / max(1e-9, r + p) # f1 score
    s = count_true  # support

    print("{:>10}{:>10}{:>10}{:>10}\n".format("precision", "recall", "f1-score", "support"))
    formatter = "{:>10.3f}{:>10.3f}{:>10.3f}{:>10d}".format
    print(formatter(p, r, f1, s))
    print("")

In [27]:
evaluate_mergedata(doc_pred, duee_fin_dev_path)

 precision    recall  f1-score   support

     0.463     0.668     0.547      7946

