In [1]:
import json
import random

import torch
import torch.nn as nn
from torch.optim import AdamW
import numpy as np
from torch.utils.data import Dataset
import torch.nn.functional as F
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from torch.optim import Adam

from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tqdm import tqdm

from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForTokenClassification

from crf_layer import CRFLayer
from multiLabelTokenClassfication import MultiLabelTokenClassification

from utils import extract_result_multilabel

import warnings
warnings.filterwarnings(action='ignore',category=UserWarning,module='torch')

In [2]:
model_name = "hfl/chinese-roberta-wwm-ext"

save_model_path = "roberta-chinese-base"

use_n_gpu = False

enum_role = "环节"
max_seq_len = 512

In [3]:
def data_process(dataset):
    """data_process"""

    def label_data(data, start, l, _type):
        """label_data"""
        for i in range(start, start + l):
            suffix = "B-" if i == start else "I-"
            if isinstance(data[i], str):
                data[i] = []
            solt = "{}{}".format(suffix, _type)
            if solt not in data[i]:
                data[i].append(solt)
        return data
    
    def replace_control_chars(str):
        if str == '\u200b' or str == '\ufeff' or str == '\ue601' or str == '\u3000':
            return '[UNK]'
        else:
            return str

    output = []
    for d_json in dataset:
        _id = d_json["id"]
        text_a = [
            "，" if t == " " or t == "\n" or t == "\t" else replace_control_chars(t)
            for t in list(d_json["text"].lower())
        ]
        if len(d_json.get("event_list", [])) == 0:
            continue
        ### combine same event type
        event_type_mapping = {}
        for event in d_json.get("event_list", []):
            event_type = event['event_type']
            trigger = event['trigger']
            type_tuple = (event_type, trigger)
            if type_tuple not in event_type_mapping:
                event_type_mapping[type_tuple] = []
            for argument in event["arguments"]:
                if argument not in event_type_mapping[type_tuple]:
                    event_type_mapping[type_tuple].append(argument)

        for type_tuple, arguments in event_type_mapping.items():
            event_type = type_tuple[0]
            trigger = type_tuple[1]
            labels = ["O"] * len(text_a)
            for arg in arguments:
                role_type = arg["role"]
                if role_type == enum_role:
                    continue
                argument = arg["argument"]
                start = arg["argument_start_index"]
                labels = label_data(labels, start, len(argument), role_type)
            text_trigger = [
                "，" if t == " " or t == "\n" or t == "\t" else t
                for t in list(event_type+f"({trigger})：".lower())
            ]
            trigger_label = ["O"] * len(text_trigger)
            output.append({
               "text": event_type + f"({trigger})：" + d_json["text"], "tokens": text_trigger+text_a, "labels": trigger_label+labels
            })
    return output

In [4]:
def load_dict(dict_path):
    """load_dict"""
    vocab = {}
    for line in open(dict_path, 'r', encoding='utf-8'):
        value, key = line.strip('\n').split('\t')
        vocab[key] = int(value)
    return vocab

In [5]:
label_vocab = load_dict(dict_path='./dictionary/role_tag.dict')
id2label = {val: key for key, val in label_vocab.items()}

In [6]:
config = AutoConfig.from_pretrained(model_name)
config.num_labels = len(label_vocab)

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
PADDING = tokenizer.vocab[tokenizer.pad_token]
SEP = tokenizer.vocab[tokenizer.sep_token]

In [9]:
class BaiduEventDataset(Dataset):
    
    def __init__(self, dataset_path, label_dict_path, ignore_index=-100):
        self.label_vocab = load_dict(label_dict_path)
        self.label_num = max(self.label_vocab.values()) + 1
        self.examples = []
        with open(dataset_path, 'r', encoding='utf-8') as f:
            dataset = json.loads(f.read())
            preprocess_dataset = data_process(dataset)
            for d_json in preprocess_dataset:
                tokens = d_json['tokens']
                input_ids = tokenizer(tokens, is_split_into_words=True, add_special_tokens=True, max_length=max_seq_len, truncation=True)['input_ids']
                tokens_input = input_ids + [PADDING] * (max_seq_len - len(input_ids))
                attention_masks = self._get_attention_mask(input_ids, max_seq_len)
                token_type_ids = self._get_token_type_id(input_ids, max_seq_len)
                example = {
                    "text": d_json['text'], 
                    "input_ids": tokens_input, "attention_masks": attention_masks,
                    "token_type_ids": token_type_ids, "seq_lens": len(tokens)
                }
                if 'labels' in d_json:
                    labels = d_json['labels']
                    labels = labels[:(max_seq_len - 2)]
                    encoded_label = ["O"] + labels + ["O"]
                    encoded_label = self.to_one_hot_vector(encoded_label, max_seq_len - 2 - len(labels))
                    example.update({"encoded_label": encoded_label})
                self.examples.append(example)

    def to_one_hot_vector(self, labels, zero_padding_len = 0):
        """Convert seq to one hot."""
        one_hot_vectors = []
        for label in labels:
            one_hot_vector = np.zeros(self.label_num)
            if isinstance(label, str):
                one_hot_vector[self.label_vocab.get(label, 0)] = 1
            elif isinstance(label, list):
                for l in label:
                    one_hot_vector[self.label_vocab.get(l, 0)] = 1
            one_hot_vectors.append(one_hot_vector)
        for _ in range(zero_padding_len):
            one_hot_vector = np.zeros(self.label_num)
            one_hot_vectors.append(one_hot_vector)
        return np.array(one_hot_vectors)

    def _get_attention_mask(self, input_ids, max_seq_len):
        """Mask for padding."""
        if len(input_ids) > max_seq_len:
            raise IndexError("Token length more than max seq length!")
        return [1] * len(input_ids) + [0] * (max_seq_len - len(input_ids))

    def _get_token_type_id(self, input_ids, max_seq_len):
        """Segments: 0 for the first sequence, 1 for the second."""
        if len(input_ids) > max_seq_len:
            raise IndexError("Token length more than max seq length!")
        segments = []
        current_segment_id = 0
        for input_id in input_ids:
            segments.append(current_segment_id)
            if input_id == SEP:
                current_segment_id = 1
        return segments + [0] * (max_seq_len - len(input_ids))        
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, item_idx):
        example = {
            "text": self.examples[item_idx]["text"],
            "input_ids": torch.tensor(self.examples[item_idx]["input_ids"]).long(),
            "attention_masks": torch.tensor(self.examples[item_idx]["attention_masks"]),
            "token_type_ids": torch.tensor(self.examples[item_idx]["token_type_ids"]),
            "seq_lens": self.examples[item_idx]["seq_lens"]
        }
        if "encoded_label" in self.examples[item_idx]:
            example.update({"encoded_label": torch.tensor(self.examples[item_idx]["encoded_label"], dtype=torch.float)})
        return example

In [10]:
train_dataset = BaiduEventDataset(dataset_path='./resources/duee_fin_train_preprocess.json', label_dict_path='./dictionary/role_tag.dict')
dev_dataset = BaiduEventDataset(dataset_path='./resources/duee_fin_dev_preprocess.json', label_dict_path='./dictionary/role_tag.dict')

In [11]:
dev_dataset[0]

{'text': '公司上市(IPO)：理想汽车拟将美国IPO定价在招股区间顶端或更高水平据腾讯美股30日消息，据知情人士透露，理想汽车告诉潜在投资者，计划把美国首次公开募股（IPO）发行价定在招股区间顶端，甚至更高水平。该公司正以每股8-10美元发行9500万股股票。',
 'input_ids': tensor([ 101, 1062, 1385,  677, 2356,  113,  151,  158,  157,  114, 8038, 4415,
         2682, 3749, 6756, 2877, 2199, 5401, 1744,  151,  158,  157, 2137,  817,
         1762, 2875, 5500, 1277, 7313, 7553, 4999, 2772, 3291, 7770, 3717, 2398,
         2945, 5596, 6380, 5401, 5500,  124,  121, 3189, 3867, 2622, 8024, 2945,
         4761, 2658,  782, 1894, 6851, 7463, 8024, 4415, 2682, 3749, 6756, 1440,
         6401, 4052, 1762, 2832, 6598, 5442, 8024, 6369, 1153, 2828, 5401, 1744,
         7674, 3613, 1062, 2458, 1247, 5500, 8020,  151,  158,  157, 8021, 1355,
         6121,  817, 2137, 1762, 2875, 5500, 1277, 7313, 7553, 4999, 8024, 4493,
         5635, 3291, 7770, 3717, 2398,  511, 6421, 1062, 1385, 3633,  809, 3680,
         5500,  129,  118,  122,  121, 5401, 1039, 1355, 6121,  130,  126,  121,
          121,  674, 5500, 5500, 4873, 

In [12]:
model = MultiLabelTokenClassification(model_name, config)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
def set_seed(seed = 42):
    """Set the seed for generating random numbers on all GPUs.

    It's safe to call this function if CUDA is not available; in that case, it is silently ignored.

    Args:
        seed (int, optional): random numbers on all GPUs. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

In [14]:
# setting device on GPU if available, else CPU
if use_n_gpu:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    n_gpu = torch.cuda.device_count()
    print(torch.cuda.get_device_name(0))
    
    print('Memory Usage:')
    print('Allocated:', torch.cuda.memory_allocated(0)/1024**3, 'GB')
    print('Cached:   ', torch.cuda.memory_reserved(0)/1024**3, 'GB')
    
    print('CUDA Device Count:', n_gpu)
    
set_seed(seed=42)

Using device: cuda:2

Tesla V100-PCIE-32GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
CUDA Device Count: 3


In [15]:
@torch.no_grad()
def evaluate(model, eval_dataloader):
    model.eval()
    step = 0
    eval_acc = 0.0
    eval_f1 = 0.0
    eval_precision = 0.0
    eval_recall = 0.0
    eval_loss = 0.0
    for batch in eval_dataloader:
        loss, logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_masks'].to(device),
            token_type_ids=batch['token_type_ids'].to(device),
            labels=batch['encoded_label'].to(device)
        )
        
        if use_n_gpu and n_gpu > 1:
            loss = loss.mean()
        
        eval_loss += loss.item()
        pred_Y = (torch.sigmoid(logits).data > 0.5).cpu().numpy()
        true_Y = batch['encoded_label'].cpu().numpy()
        batch_size = true_Y.shape[0]
        batch_precision, batch_recall, batch_f1 = 0.0, 0.0, 0.0
        for text, t_ids, p_ids, seq_len in zip(batch["text"], true_Y, pred_Y, batch['seq_lens']):
            true_label, pred_label = [], []
            for pid in p_ids[1: seq_len - 1]:
                true_indices = np.argwhere(pid).flatten()
                pred_label.append([id2label[true_index] for true_index in true_indices])
            p_ret = extract_result_multilabel(text, pred_label)
            pred_event_types = set([(p["type"], ''.join(p["text"]), p["start"]) for p in p_ret])
            for tid in t_ids[1: seq_len - 1]:
                true_indices = np.argwhere(tid).flatten()
                true_label.append([id2label[true_index] for true_index in true_indices])
            t_ret = extract_result_multilabel(text, true_label)
            true_event_types = set([(t["type"], ''.join(t["text"]), t["start"]) for t in t_ret])
            count_predict = len(list(pred_event_types))
            count_true = len(list(true_event_types))
            count_correct = len(list(pred_event_types & true_event_types))
            p = count_correct / max(1, count_predict)  # precision
            r = count_correct / max(1, count_true)  # recall
            batch_precision += p
            batch_recall += r
            batch_f1 += 2 * r * p / max(1e-9, r + p) # f1 score
        eval_acc += accuracy_score(pred_Y.flatten(), true_Y.flatten())
        eval_precision += batch_precision / batch_size
        eval_recall += batch_recall / batch_size
        eval_f1 += batch_f1 / batch_size
        step += 1
    model.train()
    return eval_loss/step, eval_acc/step, eval_precision/step, eval_recall/step, eval_f1/step

In [16]:
### train model

def train(model, ds_train, ds_dev = None, n_epochs = 100, learning_rate = 5e-5, weight_decay = 0.01, batch_size = 1, eval_per_epoch = 2):
    model = model.to(device)
    
    train_sampler = RandomSampler(ds_train)
    train_dataloader = DataLoader(ds_train, sampler=train_sampler, batch_size=batch_size)
    
    eval_sampler = SequentialSampler(ds_dev)
    eval_dataloader = DataLoader(ds_dev, sampler=eval_sampler, batch_size=batch_size)
    
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]

    if use_n_gpu and n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=[0, 1])

    optimizer_grouped_parameters = [{
        "params": model.parameters(),
        "lr": learning_rate, 
        "weight_decay": weight_decay,
        "apply_decay_param_fun": lambda x: x in decay_params
    }]
    optimizer = AdamW(optimizer_grouped_parameters)
#     scheduler = ReduceLROnPlateau(optimizer, "min")
    
    f1 = 0.0
    acc = 0.0
    precision = 0.0
    recall = 0.0
    tr_loss = 0.0
    global_step = 0
    model.train()
    model.zero_grad()
    postfix = {}
    for epoch in range(0, n_epochs):
        eval_flag = False
        train_iterator = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{n_epochs}")
        for batch in train_iterator:
            loss, logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_masks'].to(device),
                token_type_ids=batch['token_type_ids'].to(device),
                labels=batch['encoded_label'].to(device)
            )
            
            if use_n_gpu and n_gpu > 1:
                loss = loss.mean()
            
            loss.backward()
            optimizer.step()
            # scheduler.step(loss)

            tr_loss += loss.item()
            pred_Y = (torch.sigmoid(logits).data > 0.5).cpu().numpy()
            true_Y = batch['encoded_label'].cpu().numpy()
            batch_size = true_Y.shape[0]
            batch_precision, batch_recall, batch_f1 = 0.0, 0.0, 0.0
            for text, t_ids, p_ids, seq_len in zip(batch["text"], true_Y, pred_Y, batch['seq_lens']):
                true_label, pred_label = [], []
                for pid in p_ids[1: seq_len - 1]:
                    true_indices = np.argwhere(pid).flatten()
                    pred_label.append([id2label[true_index] for true_index in true_indices])
                p_ret = extract_result_multilabel(text, pred_label)
                pred_event_types = set([(p["type"], ''.join(p["text"]), p["start"]) for p in p_ret])
                for tid in t_ids[1: seq_len - 1]:
                    true_indices = np.argwhere(tid).flatten()
                    true_label.append([id2label[true_index] for true_index in true_indices])
                t_ret = extract_result_multilabel(text, true_label)
                true_event_types = set([(t["type"], ''.join(t["text"]), t["start"]) for t in t_ret])
                count_predict = len(list(pred_event_types))
                count_true = len(list(true_event_types))
                count_correct = len(list(pred_event_types & true_event_types))
                p = count_correct / max(1, count_predict)  # precision
                r = count_correct / max(1, count_true)  # recall
                batch_precision += p
                batch_recall += r
                batch_f1 += 2 * r * p / max(1e-9, r + p) # f1 score
            acc += accuracy_score(pred_Y.flatten(), true_Y.flatten())
            precision += batch_precision / batch_size
            recall += batch_recall / batch_size
            f1 += batch_f1 / batch_size
            model.zero_grad()

            postfix.update({"Avg loss": f"{tr_loss / (global_step + 1):.5f}", "Avg acc score": f"{acc / (global_step + 1):.5f}", "Avg precision score": f"{precision / (global_step + 1):.5f}", "Avg recall score": f"{recall / (global_step + 1):.5f}", "Avg f1 score": f"{f1 / (global_step + 1):.5f}"})
            if (
                not eval_flag
                and (global_step + 1) % len(train_dataloader) == 0
                and (epoch % eval_per_epoch) == 0
            ):
                if ds_dev is not None:
                    eval_loss, eval_acc, eval_precision, eval_recall, eval_f1 = evaluate(model, eval_dataloader)
                postfix.update({"Avg eval loss": f"{eval_loss:.5f}", "Avg eval acc": f"{eval_acc:.5f}", "Avg eval precision": f"{eval_precision:.5f}", "Avg eval recall": f"{eval_recall:.5f}", "Avg eval f1": f"{eval_f1:.5f}"})
                eval_flag = True
            train_iterator.set_postfix(postfix)
            global_step += 1

In [17]:
if use_n_gpu:
    train(model, train_dataset, ds_dev=dev_dataset, n_epochs=20, batch_size=12*2)
else:
    train(model, train_dataset, ds_dev=dev_dataset, n_epochs=20, batch_size=32)

Epoch 1/20: 100%|██████████| 352/352 [23:06<00:00,  3.94s/it, Avg loss=0.07204, Avg acc score=0.99358, Avg precision score=0.00000, Avg recall score=0.00000, Avg f1 score=0.00000, Avg eval loss=0.01239, Avg eval acc=0.99905, Avg eval precision=0.00000, Avg eval recall=0.00000, Avg eval f1=0.00000]
Epoch 2/20: 100%|██████████| 352/352 [08:46<00:00,  1.50s/it, Avg loss=0.04042, Avg acc score=0.99638, Avg precision score=0.00000, Avg recall score=0.00000, Avg f1 score=0.00000, Avg eval loss=0.01239, Avg eval acc=0.99905, Avg eval precision=0.00000, Avg eval recall=0.00000, Avg eval f1=0.00000]
Epoch 3/20: 100%|██████████| 352/352 [09:30<00:00,  1.62s/it, Avg loss=0.02857, Avg acc score=0.99734, Avg precision score=0.00000, Avg recall score=0.00000, Avg f1 score=0.00000, Avg eval loss=0.00375, Avg eval acc=0.99932, Avg eval precision=0.00000, Avg eval recall=0.00000, Avg eval f1=0.00000]
Epoch 4/20: 100%|██████████| 352/352 [08:49<00:00,  1.50s/it, Avg loss=0.02221, Avg acc score=0.99786, 

In [18]:
torch.save(model.cpu(), f'./models/DuEE_fin/{save_model_path}/role-multilabel-trick1.bin')

In [20]:
torch.cuda.empty_cache()

RuntimeError: CUDA error: out of memory