In [1]:
import json
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from seqeval.metrics import f1_score, recall_score, precision_score, accuracy_score
from torch.optim import Adam
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tqdm import tqdm_notebook as tqdm
from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForTokenClassification

from crf_layer import CRFLayer

In [2]:
# Role name whose arguments data_process() skips when building BIO labels.
enum_role = "环节"
# Fixed example length: BaiduEventDataset truncates the tokenizer output to this
# many ids (special tokens included) and pads shorter examples up to it.
max_seq_len = 300

In [3]:
def data_process(dataset):
    """Convert raw event records into character-level BIO tagging examples.

    Args:
        dataset: iterable of dicts. Each dict needs a "text" string and may
            carry an "event_list". Every event needs an "arguments" list whose
            items provide "role", "argument" and "argument_start_index".

    Returns:
        A list with one ``{"tokens": [...], "labels": [...]}`` dict per event.
        "tokens" is the lower-cased text split into characters, with space,
        newline and tab replaced by "，". "labels" has the same length and
        holds "O" or "B-<role>" / "I-<role>" tags. Arguments whose role equals
        the module-level ``enum_role`` are not labelled. Records without an
        "event_list" produce no examples.

    Raises:
        KeyError: if a required key is missing.
        IndexError: if an argument span runs past the end of the text.
    """

    def label_data(data, start, l, _type):
        """Tag ``data[start:start + l]`` in place: "B-" first, "I-" after."""
        for i in range(start, start + l):
            prefix = "B-" if i == start else "I-"
            data[i] = "{}{}".format(prefix, _type)
        return data

    whitespace = {" ", "\n", "\t"}
    output = []
    for d_json in dataset:
        # NOTE(review): str.lower() can change the length of a few Unicode
        # characters, which would shift argument_start_index; confirm the
        # corpus contains no such characters.
        text_a = ["，" if t in whitespace else t for t in d_json["text"].lower()]
        for event in d_json.get("event_list", []):
            labels = ["O"] * len(text_a)
            for arg in event["arguments"]:
                role_type = arg["role"]
                if role_type == enum_role:
                    continue
                labels = label_data(
                    labels,
                    arg["argument_start_index"],
                    len(arg["argument"]),
                    role_type,
                )
            # All events of one record share the same token list object.
            output.append({"tokens": text_a, "labels": labels})
    return output

In [4]:
def load_dict(dict_path):
    """Load a label dictionary from a UTF-8 text file.

    Each non-blank line must hold an integer id and a label separated by a
    single tab, in that order.

    Args:
        dict_path: path of the dictionary file.

    Returns:
        dict mapping label (str) to id (int).

    Raises:
        ValueError: if a non-blank line does not have exactly two
            tab-separated fields or its first field is not an integer.
    """
    vocab = {}
    # Context manager so the handle is closed even if a line fails to parse.
    with open(dict_path, 'r', encoding='utf-8') as dict_file:
        for line in dict_file:
            line = line.rstrip('\n')
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            value, key = line.split('\t')
            vocab[key] = int(value)
    return vocab

In [5]:
# Role-tag vocabulary (label -> id) and its inverse (id -> label) used to turn
# predicted ids back into tag strings.
label_vocab = load_dict(dict_path='./dictionary/role_tag.dict')
id2label = dict(zip(label_vocab.values(), label_vocab.keys()))

In [6]:
# Load the "nghuyong/ernie-1.0" configuration, size the token-classification
# head to the number of role tags, then instantiate the model from that checkpoint.
config = AutoConfig.from_pretrained("nghuyong/ernie-1.0")
config.num_labels = len(label_vocab)
model = AutoModelForTokenClassification.from_pretrained("nghuyong/ernie-1.0", config=config)

Some weights of the model checkpoint at nghuyong/ernie-1.0 were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at nghuyong/ernie-1.0 and are newly initi

In [7]:
# Tokenizer from the same checkpoint as the model; BaiduEventDataset reads it
# as a module-level global.
tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0")

In [8]:
# Vocabulary ids of the pad and separator tokens. BaiduEventDataset pads
# input_ids with PADDING and switches the segment id after the first SEP.
PADDING = tokenizer.vocab[tokenizer.pad_token]
SEP = tokenizer.vocab[tokenizer.sep_token]

In [9]:
class BaiduEventDataset(Dataset):
    """Map-style dataset of fixed-length role-tagging examples.

    The JSON file is expanded by ``data_process`` into one example per event.
    Each example is encoded with the module-level ``tokenizer`` and padded to
    the module-level ``max_seq_len``.

    NOTE(review): labels are produced per character while ``input_ids`` come
    from the tokenizer. This presumably relies on the tokenizer emitting
    exactly one id per character; confirm, otherwise labels and ids drift.
    """

    def __init__(self, dataset_path, label_dict_path, ignore_index=-100):
        """Read, encode and cache every example.

        Args:
            dataset_path: JSON file holding the records fed to ``data_process``.
            label_dict_path: tab-separated label dictionary for ``load_dict``.
            ignore_index: label id written at padded positions.
        """
        self.label_vocab = load_dict(label_dict_path)
        self.label_num = 1 + max(self.label_vocab.values())
        self.examples = []
        with open(dataset_path, 'r', encoding='utf-8') as f:
            for record in data_process(json.load(f)):
                encoding = tokenizer(
                    record['tokens'],
                    is_split_into_words=True,
                    add_special_tokens=True,
                    max_length=max_seq_len,
                    truncation=True,
                )
                input_ids = encoding['input_ids']
                pad_count = max_seq_len - len(input_ids)
                example = {
                    "input_ids": input_ids + [PADDING] * pad_count,
                    "attention_masks": self._get_attention_mask(input_ids, max_seq_len),
                    "token_type_ids": self._get_token_type_id(input_ids, max_seq_len),
                    "seq_lens": len(input_ids),
                }
                if 'labels' in record:
                    # Leave room for the two special tokens, which are tagged "O".
                    tags = record['labels'][:max_seq_len - 2]
                    label_ids = [self.label_vocab[tag] for tag in ["O", *tags, "O"]]
                    label_ids += [ignore_index] * (max_seq_len - 2 - len(tags))
                    example["encoded_label"] = label_ids
                self.examples.append(example)

    def _get_attention_mask(self, input_ids, max_seq_len):
        """Return 1 for each real id and 0 for each padded position."""
        real_count = len(input_ids)
        if real_count > max_seq_len:
            raise IndexError("Token length more than max seq length!")
        mask = [1] * real_count
        mask.extend([0] * (max_seq_len - real_count))
        return mask

    def _get_token_type_id(self, input_ids, max_seq_len):
        """Return segment ids: 0 up to and including the first SEP, 1 after; padding is 0."""
        if len(input_ids) > max_seq_len:
            raise IndexError("Token length more than max seq length!")
        segment_ids = []
        segment = 0
        for token_id in input_ids:
            segment_ids.append(segment)
            if token_id == SEP:
                segment = 1
        padding = [0] * (max_seq_len - len(input_ids))
        return segment_ids + padding

    def __len__(self):
        """Number of cached examples."""
        return len(self.examples)

    def __getitem__(self, item_idx):
        """Return the example at ``item_idx`` with its list fields converted to tensors."""
        record = self.examples[item_idx]
        sample = {
            "input_ids": torch.tensor(record["input_ids"]).long(),
            "attention_masks": torch.tensor(record["attention_masks"]),
            "token_type_ids": torch.tensor(record["token_type_ids"]),
            "seq_lens": record["seq_lens"],
        }
        if "encoded_label" in record:
            sample["encoded_label"] = torch.tensor(record["encoded_label"], dtype=torch.long)
        return sample

In [10]:
def set_seed(seed = 42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed (int, optional): value passed to ``random.seed``,
            ``np.random.seed`` and ``torch.manual_seed``. Defaults to 42.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

In [11]:
# Build the training and development datasets; both use the same role-tag dictionary.
train_dataset = BaiduEventDataset(dataset_path='./resources/duee_fin_train_preprocess.json', label_dict_path='./dictionary/role_tag.dict')
dev_dataset = BaiduEventDataset(dataset_path='./resources/duee_fin_dev_preprocess.json', label_dict_path='./dictionary/role_tag.dict')

In [12]:
# Display the first encoded training example as a sanity check.
train_dataset[0]

{'input_ids': tensor([    1,   250,   275,   281,    74,   211,  1452,    67,   586, 17963,
           589,    42,   397,   701,   540,    30, 17963,   540,    42,   208,
           540,    30, 17963,   208,    42,   284,   249, 17963, 17963,    74,
          2124,  1336,  1947,   273,   599,    64,    59,   837,   793,   207,
          2091,     6,  1236,   159,   207,  2091,   277,   656,    13,   284,
           701,    42,   317,   317, 17963,     4,     4,     4,     4,    61,
           362,    74,   341,   139,    60,   316,   102,   877,     4,   341,
            60,  1545,  1675,   532,  1452,     4,   211,  1452,    67,   586,
            78,  1671,  3409,     4,   284,   540,   540,   208,   208,   249,
             4,   185,   966,   463,    74,   589,    42,   397,   701,   183,
            77,   515,   136,   284,   139,    86,  1598,    34,   460,    53,
           612,   351,     4,    53,   230,   293,    45,   837,   242,  2124,
          1336,  1947,     5,   124,   

In [13]:
# setting device on GPU if available, else CPU
if torch.cuda.is_available():
    n_gpu = torch.cuda.device_count()
    # Prefer GPU index 2, but never request an index beyond the visible
    # devices so the notebook also runs on hosts with fewer GPUs.
    device = torch.device(f'cuda:{min(2, n_gpu - 1)}')
else:
    n_gpu = 0
    device = torch.device('cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    # Report on the device that was actually selected, not always device 0.
    print(torch.cuda.get_device_name(device))

    print('Memory Usage:')
    print('Allocated:', torch.cuda.memory_allocated(device)/1024**3, 'GB')
    print('Cached:   ', torch.cuda.memory_reserved(device)/1024**3, 'GB')

    print('CUDA Device Count:', n_gpu)

set_seed(seed=42)

Using device: cuda:2

Tesla V100-PCIE-32GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
CUDA Device Count: 3


In [14]:
@torch.no_grad()
def evaluate(model, eval_dataloader):
    """Evaluate ``model`` on ``eval_dataloader`` without gradient tracking.

    Batches are moved to the module-level ``device`` and predicted ids are
    mapped to tag strings through the module-level ``id2label``. The first and
    last position of each sequence (``[1:seq_len - 1]``) are excluded from
    scoring. Entity-level scores are computed once over all collected
    sequences rather than averaged per batch, so the batch composition does
    not influence the result.

    Args:
        model: token-classification model returning ``loss`` and ``logits``.
        eval_dataloader: iterable of batches with "input_ids",
            "token_type_ids", "encoded_label" and "seq_lens".

    Returns:
        Tuple ``(loss, accuracy, precision, recall, f1)``. ``loss`` is the mean
        of the per-batch losses. All five values are NaN when the dataloader
        yields no batch.

    Side effects:
        Puts the model in eval mode while running and in train mode afterwards.
    """
    model.eval()
    step = 0
    eval_loss = 0.0
    pred_Y, true_Y = [], []
    for batch in eval_dataloader:
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            # NOTE(review): the attention mask is disabled here, so padded
            # positions are attended to. Confirm whether that is intended, and
            # keep it consistent with the training loop if it is re-enabled.
            # attention_mask=batch['attention_masks'].to(device),
            token_type_ids=batch['token_type_ids'].to(device),
            labels=batch['encoded_label'].to(device)
        )
        eval_loss += outputs.loss.item()
        pred_list = torch.argmax(outputs.logits, dim=-1)
        for t_list, p_list, seq_len in zip(batch['encoded_label'].cpu().tolist(), pred_list.cpu().tolist(), batch['seq_lens']):
            seq_len = int(seq_len)
            pred_Y.append([id2label.get(pid, "O") for pid in p_list[1: seq_len - 1]])
            true_Y.append([id2label.get(tid, "O") for tid in t_list[1: seq_len - 1]])
        step += 1
    model.train()

    if step == 0:
        # Nothing was evaluated; metrics are undefined rather than zero.
        nan = float("nan")
        return nan, nan, nan, nan, nan

    # seqeval expects (y_true, y_pred); swapping them exchanges precision and recall.
    return (
        eval_loss / step,
        accuracy_score(true_Y, pred_Y),
        precision_score(true_Y, pred_Y, zero_division=1),
        recall_score(true_Y, pred_Y, zero_division=1),
        f1_score(true_Y, pred_Y),
    )

In [15]:
### train model

def train(model, ds_train, ds_dev = None, n_epochs = 100, learning_rate = 2e-5, weight_decay = 0.01, batch_size = 1, eval_per_epoch = 1):
    """Fine-tune ``model`` for token classification with AdamW.

    The model is moved to the module-level ``device``. After every step the
    progress bar shows running averages of the per-batch loss and seqeval
    scores. On the last batch of every ``eval_per_epoch``-th epoch the model
    is also scored on ``ds_dev`` through ``evaluate`` when a dev set is given.

    Args:
        model: token-classification model returning ``loss`` and ``logits``.
        ds_train: training dataset yielding "input_ids", "token_type_ids",
            "encoded_label" and "seq_lens".
        ds_dev: optional development dataset with the same fields; evaluation
            is skipped when it is None.
        n_epochs: number of passes over ``ds_train``.
        learning_rate: AdamW learning rate.
        weight_decay: decay applied to every parameter whose name contains
            neither "bias" nor "norm" (case-insensitive); those two kinds are
            not decayed.
        batch_size: batch size for both dataloaders.
        eval_per_epoch: evaluate when ``epoch % eval_per_epoch == 0``.

    Returns:
        None. The model is updated in place.
    """
    model = model.to(device)

    train_dataloader = DataLoader(ds_train, sampler=RandomSampler(ds_train), batch_size=batch_size)
    eval_dataloader = None
    if ds_dev is not None:
        eval_dataloader = DataLoader(ds_dev, sampler=SequentialSampler(ds_dev), batch_size=batch_size)
    steps_per_epoch = len(train_dataloader)

    # Split parameters by name into decayed / non-decayed groups. PyTorch has
    # no "apply_decay_param_fun" hook, so the exclusion must be expressed as
    # separate parameter groups. Names are lower-cased so "LayerNorm" matches.
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if any(marker in name.lower() for marker in ("bias", "norm")):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    param_groups = [
        {"params": decay_params, "weight_decay": weight_decay},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]
    optimizer = AdamW([group for group in param_groups if group["params"]], lr=learning_rate)

    f1 = 0.0
    acc = 0.0
    precision = 0.0
    recall = 0.0
    tr_loss = 0.0
    global_step = 0
    model.train()
    model.zero_grad()
    postfix = {}
    for epoch in range(0, n_epochs):
        train_iterator = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{n_epochs}")
        for batch in train_iterator:
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                # NOTE(review): the attention mask is disabled here, so padded
                # positions are attended to. Confirm whether that is intended,
                # and keep evaluation consistent if it is re-enabled.
                # attention_mask=batch['attention_masks'].to(device),
                token_type_ids=batch['token_type_ids'].to(device),
                labels=batch['encoded_label'].to(device)
            )
            loss = outputs.loss
            logits = outputs.logits

            loss.backward()
            optimizer.step()
            model.zero_grad()
            global_step += 1

            tr_loss += loss.item()
            pred_list = torch.argmax(logits, dim=-1)
            pred_Y, true_Y = [], []
            for t_list, p_list, seq_len in zip(batch['encoded_label'].cpu().tolist(), pred_list.cpu().tolist(), batch['seq_lens']):
                seq_len = int(seq_len)
                pred_Y.append([id2label.get(pid, "O") for pid in p_list[1: seq_len - 1]])
                true_Y.append([id2label.get(tid, "O") for tid in t_list[1: seq_len - 1]])
            # seqeval expects (y_true, y_pred); swapping them exchanges precision and recall.
            acc += accuracy_score(true_Y, pred_Y)
            precision += precision_score(true_Y, pred_Y, zero_division=1)
            recall += recall_score(true_Y, pred_Y, zero_division=1)
            f1 += f1_score(true_Y, pred_Y)

            postfix.update({"Avg loss": f"{tr_loss / global_step:.2f}", "Avg acc score": f"{acc / global_step:.2f}", "Avg precision score": f"{precision / global_step:.2f}", "Avg recall score": f"{recall / global_step:.2f}", "Avg f1 score": f"{f1 / global_step:.2f}"})

            # Evaluate on the last batch of the epoch so the results appear in
            # the progress bar before it closes. Skipped without a dev set.
            is_last_batch = global_step % steps_per_epoch == 0
            if eval_dataloader is not None and is_last_batch and (epoch % eval_per_epoch) == 0:
                eval_loss, eval_acc, eval_precision, eval_recall, eval_f1 = evaluate(model, eval_dataloader)
                postfix.update({"Avg eval loss": f"{eval_loss:.2f}", "Avg eval acc": f"{eval_acc:.2f}", "Avg eval precision": f"{eval_precision:.2f}", "Avg eval recall": f"{eval_recall:.2f}", "Avg eval f1": f"{eval_f1:.2f}"})
            train_iterator.set_postfix(postfix)

In [None]:
# Fine-tune for 20 epochs with batches of 16, evaluating on the dev set.
train(model, train_dataset, ds_dev=dev_dataset, n_epochs=20, batch_size=16)

HBox(children=(IntProgress(value=0, description='Epoch 1/20', max=590, style=ProgressStyle(description_width='…

In [16]:
# Persist the fine-tuned model. model.cpu() moves it to the CPU first, and the
# whole module object is serialized.
# NOTE(review): the folder is named "roberta-chinese-large" although the
# checkpoint loaded above is "nghuyong/ernie-1.0"; confirm the intended path.
model_path = './models/DuEE_fin/roberta-chinese-large/role.bin'
# Create the target folder up front so a missing directory cannot make the
# save fail after the training run has finished.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.cpu(), model_path)

In [17]:
# Release cached GPU memory that PyTorch no longer uses.
torch.cuda.empty_cache()