In [1]:
import json
import random

import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset
import torch.nn.functional as F
from sklearn.metrics import f1_score, accuracy_score
from torch.optim import Adam

from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tqdm import tqdm_notebook as tqdm

from transformers import AutoTokenizer, AutoConfig, AutoModel

from crf_layer import CRFLayer
from multiLabelTokenClassfication import MultiLabelTokenClassification

from utils import read_by_lines, extract_result_multilabel

In [2]:
folder_name = "roberta-chinese-base"
tokenizer_model = "hfl/chinese-roberta-wwm-ext"

shema_path = './dictionary/event_schema.json'
enerm_dict_path = './dictionary/enum_tag.dict'
trigger_dict_path = './dictionary/trigger_tag.dict'
role_dict_path = './dictionary/role_tag.dict'

enerm_model_path = f'./models/DuEE_fin/{folder_name}/enum.bin'
tigger_model_path = f'./models/DuEE_fin/{folder_name}/trigger-multilabel.bin'
role_model_path = f'./models/DuEE_fin/{folder_name}/role-multilabel-trick1.bin'

duee_fin_test_preprocess_path = './resources/duee_fin_test_preprocess.json'

enum_role = "环节"
enum_event_type = "公司上市"
max_seq_len = 512

In [3]:
def load_dict(dict_path):
    """load_dict"""
    vocab = {}
    for line in open(dict_path, 'r', encoding='utf-8'):
        value, key = line.strip('\n').split('\t')
        vocab[key] = int(value)
    return vocab

In [4]:
label_enum_vocab = load_dict(dict_path=enerm_dict_path)
id2enumlabel = {val: key for key, val in label_enum_vocab.items()}
label_trigger_vocab = load_dict(dict_path=trigger_dict_path)
id2triggerlabel = {val: key for key, val in label_trigger_vocab.items()}
label_role_vocab = load_dict(dict_path=role_dict_path)
id2rolelabel = {val: key for key, val in label_role_vocab.items()}

In [5]:
def enum_data_process(dataset):
    """enum_data_process"""
    output = []
    for d_json in dataset:
        text = d_json["text"].lower().replace("\t", " ")
        output.append({
            "id": d_json["id"],
            "sent_id": d_json["sent_id"],
            "text": text
        })
    return output

In [6]:
def trigger_data_process(dataset):
    """data_process"""
    
    def replace_control_chars(str):
        if str == '\u200b' or str == '\ufeff' or str == '\ue601' or str == '\u3000':
            return '[UNK]'
        else:
            return str

    output = []
    for d_json in dataset:
        _id = d_json["id"]
        text_a = [
            "，" if t == " " or t == "\n" or t == "\t" else replace_control_chars(t)
            for t in list(d_json["text"].lower())
        ]
        output.append({
            "id": d_json["id"],
            "sent_id": d_json["sent_id"],
            "text": d_json["text"],
            "tokens": text_a
        })
    return output

In [7]:
def role_data_process(dataset, trigger_file_path):
    """data_process"""
    
    def replace_control_chars(str):
        if str == '\u200b' or str == '\ufeff' or str == '\ue601' or str == '\u3000':
            return '[UNK]'
        else:
            return str
    
    trigger_data = read_by_lines(trigger_file_path)
    # process trigger data
    sent_trigger_mapping = {}
    for d in tqdm(trigger_data, total=len(trigger_data)):
        d_json = json.loads(d)
        t_ret = extract_result_multilabel(d_json["text"], d_json["pred"]["labels"])
        pred_event_types = list(set([(t["type"], ''.join(t["text"])) for t in t_ret]))
        if t_ret:
            print(pred_event_types)
            break
        if d_json["id"] not in sent_trigger_mapping:
            sent_trigger_mapping[d_json["id"]] = []
        for pred_event_type in pred_event_types:
            if pred_event_type not in sent_trigger_mapping[d_json["id"]]:
                sent_trigger_mapping[d_json["id"]].append(pred_event_type)

    output = []
    for d_json in dataset:
        _id = d_json["id"]
        text_a = [
            "，" if t == " " or t == "\n" or t == "\t" else replace_control_chars(t)
            for t in list(d_json["text"].lower())
        ]
        for pred_event_type in sent_trigger_mapping[d_json["id"]]:
            trigger_text = pred_event_type[0] + f"({pred_event_type[1]})："
            text_trigger = [
                "，" if t == " " or t == "\n" or t == "\t" else t
                for t in list(trigger_text.lower())
            ]
            output.append({
                "id": d_json["id"],
                "sent_id": d_json["sent_id"],
                "org_text": d_json["text"],
                "text": trigger_text + d_json["text"],
                "event_type": pred_event_type[0],
                "trigger": pred_event_type[1],
                "tokens": text_trigger+text_a
            })
    return output

In [8]:
with open(duee_fin_test_preprocess_path, 'r', encoding='utf-8') as f:
    dataset = json.loads(f.read())
    preprocess_dataset = role_data_process(dataset, trigger_file_path='./predict/DuEE_fin/trigger/test_pred.json')

HBox(children=(IntProgress(value=0, max=30000), HTML(value='')))




In [8]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)

In [9]:
PADDING = tokenizer.vocab[tokenizer.pad_token]
SEP = tokenizer.vocab[tokenizer.sep_token]

In [10]:
class BaiduEnermDataset(Dataset):
    
    def __init__(self, dataset_path):
        self.examples = []
        with open(dataset_path, 'r', encoding='utf-8') as f:
            dataset = json.loads(f.read())
            preprocess_dataset = enum_data_process(dataset)
            for d_json in preprocess_dataset:
                text = d_json['text']
                input_ids = tokenizer(text, is_split_into_words=False, add_special_tokens=True, max_length=max_seq_len, truncation=True)['input_ids']
                tokens_input = input_ids + [PADDING] * (max_seq_len - len(input_ids))
                attention_masks = self._get_attention_mask(input_ids, max_seq_len)
                token_type_ids = self._get_token_type_id(input_ids, max_seq_len)
                example = {
                    "input_ids": tokens_input, "attention_masks": attention_masks, "token_type_ids": token_type_ids,
                }
                example.update(d_json)
                self.examples.append(example)
                
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, item_idx):
        example = {
            "id": self.examples[item_idx]["id"],
            "sent_id": self.examples[item_idx]["sent_id"],
            "text": self.examples[item_idx]["text"],
            "input_ids": torch.tensor(self.examples[item_idx]["input_ids"]).long(),
            "attention_masks": torch.tensor(self.examples[item_idx]["attention_masks"]),
            "token_type_ids": torch.tensor(self.examples[item_idx]["token_type_ids"]),
        }
        return example

    def _get_attention_mask(self, input_ids, max_seq_len):
        """Mask for padding."""
        if len(input_ids) > max_seq_len:
            raise IndexError("Token length more than max seq length!")
        return [1] * len(input_ids) + [0] * (max_seq_len - len(input_ids))

    def _get_token_type_id(self, input_ids, max_seq_len):
        """Segments: 0 for the first sequence, 1 for the second."""
        if len(input_ids) > max_seq_len:
            raise IndexError("Token length more than max seq length!")
        segments = []
        current_segment_id = 0
        for input_id in input_ids:
            segments.append(current_segment_id)
            if input_id == SEP:
                current_segment_id = 1
        return segments + [0] * (max_seq_len - len(input_ids)) 

In [11]:
class BaiduTriggerDataset(Dataset):
    
    def __init__(self, dataset_path):
        self.examples = []
        with open(dataset_path, 'r', encoding='utf-8') as f:
            dataset = json.loads(f.read())
            preprocess_dataset = trigger_data_process(dataset)
            for d_json in preprocess_dataset:
                tokens = d_json['tokens']
                input_ids = tokenizer(tokens, is_split_into_words=True, add_special_tokens=True, max_length=max_seq_len, truncation=True)['input_ids']
                tokens_input = input_ids + [PADDING] * (max_seq_len - len(input_ids))
                attention_masks = self._get_attention_mask(input_ids, max_seq_len)
                token_type_ids = self._get_token_type_id(input_ids, max_seq_len)
                example = {
                    "input_ids": tokens_input,
                    "attention_masks": attention_masks,
                    "token_type_ids": token_type_ids,
                    "seq_lens": len(tokens)
                }
                example.update(d_json)
                self.examples.append(example)

    def _get_attention_mask(self, input_ids, max_seq_len):
        """Mask for padding."""
        if len(input_ids) > max_seq_len:
            raise IndexError("Token length more than max seq length!")
        return [1] * len(input_ids) + [0] * (max_seq_len - len(input_ids))

    def _get_token_type_id(self, input_ids, max_seq_len):
        """Segments: 0 for the first sequence, 1 for the second."""
        if len(input_ids) > max_seq_len:
            raise IndexError("Token length more than max seq length!")
        segments = []
        current_segment_id = 0
        for input_id in input_ids:
            segments.append(current_segment_id)
            if input_id == SEP:
                current_segment_id = 1
        return segments + [0] * (max_seq_len - len(input_ids))        
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, item_idx):
        example = {
            "id": self.examples[item_idx]["id"],
            "sent_id": self.examples[item_idx]["sent_id"],
            "text": self.examples[item_idx]["text"],
            "input_ids": torch.tensor(self.examples[item_idx]["input_ids"]).long(),
            "attention_masks": torch.tensor(self.examples[item_idx]["attention_masks"]),
            "token_type_ids": torch.tensor(self.examples[item_idx]["token_type_ids"]),
            "seq_lens": self.examples[item_idx]["seq_lens"],
        }
        return example

In [12]:
class BaiduRoleDataset(Dataset):
    
    def __init__(self, dataset_path, trigger_file_path):
        self.examples = []
        with open(dataset_path, 'r', encoding='utf-8') as f:
            dataset = json.loads(f.read())
            preprocess_dataset = role_data_process(dataset, trigger_file_path=trigger_file_path)
            for d_json in preprocess_dataset:
                tokens = d_json['tokens']
                input_ids = tokenizer(tokens, is_split_into_words=True, add_special_tokens=True, max_length=max_seq_len, truncation=True)['input_ids']
                tokens_input = input_ids + [PADDING] * (max_seq_len - len(input_ids))
                attention_masks = self._get_attention_mask(input_ids, max_seq_len)
                token_type_ids = self._get_token_type_id(input_ids, max_seq_len)
                example = {
                    "input_ids": tokens_input,
                    "attention_masks": attention_masks,
                    "token_type_ids": token_type_ids,
                    "seq_lens": len(tokens)
                }
                example.update(d_json)
                self.examples.append(example)

    def _get_attention_mask(self, input_ids, max_seq_len):
        """Mask for padding."""
        if len(input_ids) > max_seq_len:
            raise IndexError("Token length more than max seq length!")
        return [1] * len(input_ids) + [0] * (max_seq_len - len(input_ids))

    def _get_token_type_id(self, input_ids, max_seq_len):
        """Segments: 0 for the first sequence, 1 for the second."""
        if len(input_ids) > max_seq_len:
            raise IndexError("Token length more than max seq length!")
        segments = []
        current_segment_id = 0
        for input_id in input_ids:
            segments.append(current_segment_id)
            if input_id == SEP:
                current_segment_id = 1
        return segments + [0] * (max_seq_len - len(input_ids))        
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, item_idx):
        example = {
            "id": self.examples[item_idx]["id"],
            "sent_id": self.examples[item_idx]["sent_id"],
            "event_type": self.examples[item_idx]["event_type"],
            "trigger": self.examples[item_idx]["trigger"],
            "text": self.examples[item_idx]["text"],
            "input_ids": torch.tensor(self.examples[item_idx]["input_ids"]).long(),
            "attention_masks": torch.tensor(self.examples[item_idx]["attention_masks"]),
            "token_type_ids": torch.tensor(self.examples[item_idx]["token_type_ids"]),
            "seq_lens": self.examples[item_idx]["seq_lens"],
        }
        return example

In [13]:
test_enerm_dataset = BaiduEnermDataset(dataset_path=duee_fin_test_preprocess_path)
test_trigger_dataset = BaiduTriggerDataset(dataset_path=duee_fin_test_preprocess_path)

In [14]:
len(test_enerm_dataset)

30000

In [15]:
len(test_trigger_dataset)

30000

In [16]:
def set_seed(seed = 42):
    """Set the seed for generating random numbers on all GPUs.

    It's safe to call this function if CUDA is not available; in that case, it is silently ignored.

    Args:
        seed (int, optional): random numbers on all GPUs. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

In [17]:
# setting device on GPU if available, else CPU
device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    n_gpu = torch.cuda.device_count()
    print(torch.cuda.get_device_name(0))
    
    print('Memory Usage:')
    print('Allocated:', torch.cuda.memory_allocated(0)/1024**3, 'GB')
    print('Cached:   ', torch.cuda.memory_reserved(0)/1024**3, 'GB')
    
    print('CUDA Device Count:', n_gpu)
    
set_seed(seed=42)

Using device: cuda:2

Tesla V100-PCIE-32GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
CUDA Device Count: 3


In [18]:
def write_by_lines(path, data):
    """write the data"""
    with open(path, "w") as outfile:
        [outfile.write(d + "\n") for d in data]

In [19]:
@torch.no_grad()
def test_enerm(model, test_dataloader):
    model.eval()
    step = 0
    eval_acc = 0.0
    eval_f1 = 0.0
    eval_loss = 0.0
    results = []
    test_iterator = tqdm(test_dataloader)
    for batch in test_iterator:
        _, logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_masks'].to(device),
            token_type_ids=batch['token_type_ids'].to(device)
        )

        probs = torch.sigmoid(logits).data.cpu()
        probs_ids = (probs > 0.5).numpy()
        probs = probs.numpy()
        for id_, sent_id, text, label_probs, p_id in zip(batch['id'], batch['sent_id'], batch['text'], probs.tolist(), probs_ids.tolist()):
            true_indices = np.argwhere(p_id).flatten()
            labels = [id2enumlabel[true_index] for true_index in true_indices]
            results.append({"id": id_, "sent_id": sent_id, "text": text, "pred":{"probs": label_probs, "label": labels}})
    return results

In [20]:
@torch.no_grad()
def test_trigger(model, test_dataloader):
    model.eval()
    step = 0
    eval_acc = 0.0
    eval_f1 = 0.0
    eval_loss = 0.0
    results = []
    test_iterator = tqdm(test_dataloader)
    for batch in test_iterator:
        loss, logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_masks'].to(device),
            token_type_ids=batch['token_type_ids'].to(device)
        )

        probs = torch.sigmoid(logits).data.cpu()
        probs_ids = (probs > 0.5).numpy()
        probs = probs.numpy()
        for id_, sent_id, text, p_list, p_ids, seq_len in zip(batch['id'], batch['sent_id'], batch['text'], probs.tolist(), probs_ids.tolist(), batch['seq_lens']):
            prob_multi, label_multi = [], []
            for index, pid in enumerate(p_ids[1: seq_len - 1]):
                true_indices = np.argwhere(pid).flatten()
                prob_multi.append(p_list[index])
                label_multi.append([id2triggerlabel[true_index] for true_index in true_indices])
            results.append({"id": id_, "sent_id":sent_id, "text": text, "pred": {"probs": prob_multi, "labels": label_multi}})
    return results

In [21]:
@torch.no_grad()
def test_role(model, test_dataloader):
    model.eval()
    step = 0
    eval_acc = 0.0
    eval_f1 = 0.0
    eval_loss = 0.0
    results = []
    test_iterator = tqdm(test_dataloader)
    for batch in test_iterator:
        loss, logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_masks'].to(device),
            token_type_ids=batch['token_type_ids'].to(device)
        )

        probs = torch.sigmoid(logits).data.cpu()
        probs_ids = (probs > 0.5).numpy()
        probs = probs.numpy()
        for id_, sent_id, text, event_type, trigger, p_list, p_ids, seq_len in zip(batch['id'], batch['sent_id'], batch['text'], batch['event_type'], batch['trigger'], probs.tolist(), probs_ids.tolist(), batch['seq_lens']):
            prob_multi, label_multi = [], []
            for index, pid in enumerate(p_ids[1: seq_len - 1]):
                true_indices = np.argwhere(pid).flatten()
                prob_multi.append(p_list[index])
                label_multi.append([id2rolelabel[true_index] for true_index in true_indices])
            results.append({"id": id_, "sent_id":sent_id, "event_type": event_type, "trigger": trigger, "text": text, "pred": {"probs": prob_multi, "labels": label_multi}})
    return results

In [22]:
test_enerm_sampler = SequentialSampler(test_enerm_dataset)
test_enerm_dataloader = DataLoader(test_enerm_dataset, sampler=test_enerm_sampler, batch_size = 512)

test_trigger_sampler = SequentialSampler(test_trigger_dataset)
test_trigger_dataloader = DataLoader(test_trigger_dataset, sampler=test_trigger_sampler, batch_size = 512)

In [23]:
enum_model = torch.load(enerm_model_path).to(device)
tigger_model = torch.load(tigger_model_path).to(device)
role_model = torch.load(role_model_path).to(device)

In [24]:
sentences_enerm = test_enerm(enum_model, test_enerm_dataloader)
sentences_enerm = [json.dumps(sent_enerm, ensure_ascii=False) for sent_enerm in sentences_enerm]
write_by_lines('./predict/DuEE_fin/enerm/test_pred.json', sentences_enerm)

HBox(children=(IntProgress(value=0, max=59), HTML(value='')))




In [25]:
sentences_tigger = test_trigger(tigger_model, test_trigger_dataloader)
sentences_tigger = [json.dumps(sent_tigger, ensure_ascii=False) for sent_tigger in sentences_tigger]
write_by_lines('./predict/DuEE_fin/trigger/test_pred.json', sentences_tigger)

HBox(children=(IntProgress(value=0, max=59), HTML(value='')))




In [27]:
test_role_dataset = BaiduRoleDataset(dataset_path=duee_fin_test_preprocess_path, trigger_file_path='./predict/DuEE_fin/trigger/test_pred.json')

test_role_sampler = SequentialSampler(test_role_dataset)
test_role_dataloader = DataLoader(test_role_dataset, sampler=test_role_sampler, batch_size = 512)

HBox(children=(IntProgress(value=0, max=30000), HTML(value='')))




In [29]:
test_role_dataset[0]

IndexError: list index out of range

In [28]:
sentences_role = test_role(role_model, test_role_dataloader)
sentences_role = [json.dumps(sent_role, ensure_ascii=False) for sent_role in sentences_role]
write_by_lines('./predict/DuEE_fin/role/test_pred.json', sentences_role)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [29]:
### multilabel classfication with header trick use 

!python duee_fin_postprocess_trick1.py --trigger_file ./predict/DuEE_fin/trigger/test_pred.json \
    --role_file ./predict/DuEE_fin/role/test_pred.json \
    --enum_file ./predict/DuEE_fin/enerm/test_pred.json --schema_file ./dictionary/event_schema.json \
    --save_path ./submit/DuEE_fin/test_duee_fin_erine_multilabel-trick1-fix.json --multilabel

trigger predict 30000 load from ./predict/DuEE_fin/trigger/test_pred.json
role predict 0 load from ./predict/DuEE_fin/role/test_pred.json
enum predict 30000 load from ./predict/DuEE_fin/enerm/test_pred.json
schema 13 load from ./dictionary/event_schema.json
submit data 30000 save to ./submit/DuEE_fin/test_duee_fin_erine_multilabel-trick1-fix.json
