In [1]:

#下载数据集
!wget https://dataset-bj.cdn.bcebos.com/qianyan/COTE-BD.zip
!wget https://dataset-bj.cdn.bcebos.com/qianyan/COTE-MFW.zip
!wget https://dataset-bj.cdn.bcebos.com/qianyan/COTE-DP.zip

# 解压数据集到 ./data 目录
!unzip ./COTE-BD.zip -d ./data/
!unzip ./COTE-MFW.zip -d ./data/
!unzip ./COTE-DP.zip -d ./data/

# 删除压缩包
!rm COTE-BD.zip
!rm COTE-MFW.zip
!rm COTE-DP.zip

--2021-06-24 21:13:03--  https://dataset-bj.cdn.bcebos.com/qianyan/COTE-BD.zip
正在解析主机 dataset-bj.cdn.bcebos.com (dataset-bj.cdn.bcebos.com)... 114.232.92.35, 114.80.30.35
正在连接 dataset-bj.cdn.bcebos.com (dataset-bj.cdn.bcebos.com)|114.232.92.35|:443... 已连接。
已发出 HTTP 请求，正在等待回应... 200 OK
长度： 1182741 (1.1M) [application/zip]
正在保存至: “COTE-BD.zip”


2021-06-24 21:13:03 (6.01 MB/s) - 已保存 “COTE-BD.zip” [1182741/1182741])

--2021-06-24 21:13:03--  https://dataset-bj.cdn.bcebos.com/qianyan/COTE-MFW.zip
正在解析主机 dataset-bj.cdn.bcebos.com (dataset-bj.cdn.bcebos.com)... 114.232.92.35, 114.80.30.35
正在连接 dataset-bj.cdn.bcebos.com (dataset-bj.cdn.bcebos.com)|114.232.92.35|:443... 已连接。
已发出 HTTP 请求，正在等待回应... 200 OK
长度： 4872264 (4.6M) [application/zip]
正在保存至: “COTE-MFW.zip”


2021-06-24 21:13:04 (9.41 MB/s) - 已保存 “COTE-MFW.zip” [4872264/4872264])

--2021-06-24 21:13:04--  https://dataset-bj.cdn.bcebos.com/qianyan/COTE-DP.zip
正在解析主机 dataset-bj.cdn.bcebos.com (dataset-bj.cdn.bcebos.com)... 114.232.92.35, 114

In [1]:
# 得到数据集字典
def open_func(file_path):
    return [line.strip() for line in open(file_path, 'r', encoding='utf8').readlines()[1:] if len(line.strip().split('\t')) >= 2]

data_dict = {'cotebd': {'test': open_func('data/COTE-BD/test.tsv'),
                        'train': open_func('data/COTE-BD/train.tsv')},
             'cotedp': {'test': open_func('data/COTE-DP/test.tsv'),
                        'train': open_func('data/COTE-DP/train.tsv')},
             'cotemfw': {'test': open_func('data/COTE-MFW/test.tsv'),
                        'train': open_func('data/COTE-MFW/train.tsv')}}

In [2]:

# 定义数据集
from paddle.io import Dataset, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
import numpy as np
label_list = {'B': 0, 'I': 1, 'O': 2}
index2label = {0: 'B', 1: 'I', 2: 'O'}

# 考虑token_type_id
class MyDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=512, for_test=False):
        super().__init__()
        self._data = data
        self._tokenizer = tokenizer
        self._max_len = max_len
        self._for_test = for_test
    
    def __len__(self):
        return len(self._data)
    
    def __getitem__(self, idx):
        samples = self._data[idx].split('\t')
        label = samples[-2]
        text = samples[-1]
        if self._for_test:
            origin_enc = self._tokenizer.encode(text, max_seq_len=self._max_len)['input_ids']
            return np.array(origin_enc, dtype='int64')
        else:
            
            # 由于并不是每个字都是一个token，这里采用一种简单的处理方法，先编码label，再编码text中除了label以外的词，最后合到一起
            texts = text.split(label)
            label_enc = self._tokenizer.encode(label)['input_ids']
            cls_enc = label_enc[0]
            sep_enc = label_enc[-1]
            label_enc = label_enc[1:-1]
            
            # 合并
            origin_enc = []
            label_ids = []
            for index, text in enumerate(texts):
                text_enc = self._tokenizer.encode(text)['input_ids']
                text_enc = text_enc[1:-1]
                origin_enc += text_enc
                label_ids += [label_list['O']] * len(text_enc)
                if index != len(texts) - 1:
                    origin_enc += label_enc
                    label_ids += [label_list['B']] + [label_list['I']] * (len(label_enc) - 1)

            origin_enc = [cls_enc] + origin_enc + [sep_enc]
            label_ids = [label_list['O']] + label_ids + [label_list['O']]
            
            # 截断
            if len(origin_enc) > self._max_len:
                origin_enc = origin_enc[:self._max_len-1] + origin_enc[-1:]
                label_ids = label_ids[:self._max_len-1] + label_ids[-1:]
            return np.array(origin_enc, dtype='int64'), np.array(label_ids, dtype='int64')


def batchify_fn(for_test=False):
    if for_test:
        return lambda samples, fn=Pad(axis=0, pad_val=tokenizer.pad_token_id): np.row_stack([data for data in fn(samples)])
    else:
        return lambda samples, fn=Tuple(Pad(axis=0, pad_val=tokenizer.pad_token_id),
                                        Pad(axis=0, pad_val=label_list['O'])): [data for data in fn(samples)]


def get_data_loader(data, tokenizer, batch_size=32, max_len=512, for_test=False):
    dataset = MyDataset(data, tokenizer, max_len, for_test)
    shuffle = True if not for_test else False
    data_loader = DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=batchify_fn(for_test), shuffle=shuffle)
    return data_loader

In [4]:
# 模型搭建

# 载入模型和Tokenizer
import paddlenlp
from paddlenlp.transformers import SkepForTokenClassification, SkepTokenizer
import paddle
from paddle.static import InputSpec
from paddlenlp.metrics import Perplexity

# 模型和分词
model = SkepForTokenClassification.from_pretrained('skep_ernie_1.0_large_ch', num_classes=3)
tokenizer = SkepTokenizer.from_pretrained('skep_ernie_1.0_large_ch')

# 参数设置
data_name = 'cotemfw'  # 更改此选项改变数据集

## 训练相关
epochs = 1
learning_rate = 2e-5
batch_size = 24
max_len = 512

## 数据相关
train_dataloader = get_data_loader(data_dict[data_name]['train'], tokenizer, batch_size, max_len, for_test=False)

input = InputSpec((-1, -1), dtype='int64', name='input')
label = InputSpec((-1, -1, 3), dtype='int64', name='label')
model = paddle.Model(model, [input], [label])

# 模型准备

optimizer = paddle.optimizer.Adam(learning_rate=learning_rate, parameters=model.parameters())
model.prepare(optimizer, loss=paddle.nn.CrossEntropyLoss(), metrics=[Perplexity()])

[32m[2021-06-24 21:18:59,558] [    INFO][0m - Already cached /home/gaojing/.paddlenlp/models/skep_ernie_1.0_large_ch/skep_ernie_1.0_large_ch.pdparams[0m
[32m[2021-06-24 21:19:04,192] [    INFO][0m - Found /home/gaojing/.paddlenlp/models/skep_ernie_1.0_large_ch/skep_ernie_1.0_large_ch.vocab.txt[0m


In [5]:
model.fit(train_dataloader, batch_size=batch_size, epochs=epochs, save_freq=5, save_dir='./checkpoints', log_freq=200)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/1


SystemError: (Fatal) Operator elementwise_add raises an paddle::memory::allocation::BadAlloc exception.
The exception content is
:ResourceExhaustedError: 

Out of memory error on GPU 0. Cannot allocate 79.500244MB memory on GPU 0, 10.702026GB memory has been allocated and available memory is only 60.562500MB.

Please check whether there is any other process using GPU 0.
1. If yes, please stop them, or start PaddlePaddle on another GPU.
2. If no, please decrease the batch size of your model. 

 (at /paddle/paddle/fluid/memory/allocation/cuda_allocator.cc:79)
. (at /paddle/paddle/fluid/imperative/tracer.cc:192)
