In [1]:
import pandas as pd
import json
import numpy as np
import random
import torch
from transformers import AutoTokenizer, AutoModel
import copy
from sklearn.metrics import accuracy_score
import time

from model import Model

In [2]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds NumPy's global RNG, Python's ``random`` module and PyTorch's CPU
    generator. When CUDA is available it additionally seeds the current GPU
    and restricts cuDNN to deterministic convolution algorithms.

    Args:
        seed: Integer seed handed unchanged to each library.
    """
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)  # seeds the current GPU
    # If True, cuDNN only uses deterministic convolution algorithms.
    torch.backends.cudnn.deterministic = True


seed = 2022
set_seed(seed)

In [3]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [4]:
# Label names of the classification task, listed in id order:
# a label's position in the list is its integer class id (0..10).
label_to_id = {
    name: index
    for index, name in enumerate([
        '病情诊断', '病因分析', '治疗方案', '就医建议',
        '指标解读', '疾病表述', '后果表述', '注意事项',
        '功效作用', '医疗费用', '其他',
    ])
}

In [5]:
def _load_records(path):
    """Parse one dataset split from a UTF-8 encoded JSON file."""
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def _to_samples(records):
    """Convert parsed records into rows whose label is replaced by its id.

    The first column of the frame is dropped positionally, as before, and the
    remaining columns are returned as a list of rows.

    Raises:
        ValueError: if any label is absent from ``label_to_id``. ``Series.map``
            would otherwise turn such labels into NaN without any warning and
            the problem would only surface later, inside the training loop.
    """
    frame = pd.DataFrame(records).iloc[:, 1:]
    label_ids = frame['label'].map(label_to_id)

    unmapped = frame.loc[label_ids.isna(), 'label'].unique().tolist()
    if unmapped:
        raise ValueError('labels missing from label_to_id: {}'.format(unmapped))

    # assign() builds a new frame instead of writing into the iloc slice.
    return frame.assign(label=label_ids.astype('int64')).values.tolist()


data_train = _load_records('KUAKE-QIC/KUAKE-QIC_train.json')
data_valid = _load_records('KUAKE-QIC/KUAKE-QIC_dev.json')

# NOTE: a later cell defines a function that is also called `train`, which
# rebinds this name; the DataLoader over this list must be built before that.
train = _to_samples(data_train)
valid = _to_samples(data_valid)
valid[:5]

[['心肌缺血如何治疗与调养呢？', 2],
 ['19号来的月经，25号服用了紧急避孕药本月5号，怎么办？', 2],
 ['什么叫痔核脱出？什么叫外痔？', 5],
 ['您好，请问一岁三个月的孩子可以服用复方锌布颗粒吗？', 10],
 ['多发乳腺结节中药能治愈吗', 5]]

In [6]:
# Checkpoint identifier handed to both `from_pretrained` calls below.
model_ckpt = "nghuyong/ernie-health-zh"

# Tokenizer matching the checkpoint; print the input names it produces.
token = AutoTokenizer.from_pretrained(model_ckpt)
print(token.model_input_names)
# Pretrained encoder; later cells deep-copy it instead of using it directly.
# NOTE(review): the recorded output of this cell warns that the pooler weights
# are newly initialized — relevant if the classifier reads the pooled output;
# confirm against model.py.
pretrained = AutoModel.from_pretrained(model_ckpt)
print(pretrained.num_parameters())

['input_ids', 'token_type_ids', 'attention_mask']


Some weights of ErnieModel were not initialized from the model checkpoint at nghuyong/ernie-health-zh and are newly initialized: ['ernie.pooler.dense.weight', 'ernie.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


103404288


In [7]:
def get_collate_fn(tokenizer, max_len=512):
    """Build a DataLoader ``collate_fn`` bound to ``tokenizer`` and ``max_len``.

    The closure is used because a ``collate_fn`` receives only the batch, so
    the tokenizer and the length limit have to be captured from outside.

    Args:
        tokenizer: Callable invoked with the batch texts and the encoding
            options below; its result is indexed by ``'input_ids'``,
            ``'attention_mask'`` and ``'token_type_ids'``.
        max_len: Value forwarded as ``max_length`` (truncation is enabled).

    Returns:
        A function mapping a batch of ``(text, label)`` items to the tuple
        ``(input_ids, attention_mask, token_type_ids, labels)``.
    """

    def collate_fn(data):
        # Split the batch into parallel lists of texts and labels.
        texts, targets = [], []
        for sample in data:
            texts.append(sample[0])
            targets.append(sample[1])

        # Encode the whole batch at once, padding within the batch.
        encoded = tokenizer(
            text=texts,
            truncation=True,
            padding=True,
            max_length=max_len,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return (
            encoded['input_ids'],
            encoded['attention_mask'],
            encoded['token_type_ids'],
            torch.LongTensor(targets),
        )

    return collate_fn


# Data pipeline: a single collate closure serves both loaders.
collate_batch = get_collate_fn(token)

# Validation batches keep the dataset order.
dataloader_valid = torch.utils.data.DataLoader(
    dataset=valid,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_batch,
)

# Training batches are reshuffled on every pass.
dataloader_train = torch.utils.data.DataLoader(
    dataset=train,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_batch,
)

# Smoke test: pull one training batch and show its tensor shapes.
for i, batch in enumerate(dataloader_train):
    input_ids, attention_mask, token_type_ids, labels = batch
    print(input_ids.shape)
    print(labels.shape)
    break


torch.Size([32, 152])
torch.Size([32])


In [8]:
# Classifier built on a private deep copy of the encoder: the wrapped network
# takes part in gradient updates, so `pretrained` itself must stay untouched.
# The model is moved to the target device right away.
model_bert_base = Model(copy.deepcopy(pretrained)).to(device)

# Loss function.
# NOTE(review): the training log recorded further down levels off around 1.7
# even when batch accuracy reaches ~0.84. CrossEntropyLoss expects raw,
# unnormalized scores — confirm that Model.forward does not already apply a
# softmax.
criterion_cross_entropy = torch.nn.CrossEntropyLoss()

# Optimizer; it has to be created after the model was moved to its device.
optimizer_adamw = torch.optim.AdamW(model_bert_base.parameters(), lr=2e-5)

In [9]:
# Model training
def train(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Every batch is moved to ``device``, pushed through ``model``, scored with
    ``criterion`` and followed by one optimizer step. On steps 20, 40, 60, ...
    the loss and accuracy of that single batch are printed.

    NOTE: this function shares its name with the `train` sample list created
    in the data-loading cell and rebinds that name once this cell has run.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments.
        dataloader: Iterable of ``(input_ids, attention_mask,
            token_type_ids, labels)`` batches.
        criterion: Callable returning the loss for ``(model output, labels)``.
        optimizer: Optimizer over the model parameters.
        device: Device the batch tensors are moved to.
    """
    model.train()

    for step, batch in enumerate(dataloader):
        # Move the whole batch onto the target device.
        input_ids, attention_mask, token_type_ids, labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        scores = model(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids)
        loss = criterion(scores, labels)  # loss of this step only
        loss.backward()
        optimizer.step()

        # Only report on every 20th step, never on step 0.
        if step == 0 or step % 20 != 0:
            continue

        predicted = scores.argmax(dim=1).cpu().numpy()
        batch_accuracy = accuracy_score(labels.cpu().numpy(), predicted)  # metric
        print('| step {:5d} | loss {:8.5f} | accuracy {:8.5f} |'.format(step, loss.item(), batch_accuracy))


# Model validation
def evaluate(model, dataloader, device):
    """Return the accuracy of ``model`` over every batch of ``dataloader``.

    The model is put into eval mode and run without gradient tracking. The
    outputs of all batches are concatenated on the CPU, the arg-max over
    dimension 1 is taken as the prediction and compared with the labels.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments.
        dataloader: Iterable of ``(input_ids, attention_mask,
            token_type_ids, labels)`` batches.
        device: Device the input tensors are moved to.

    Returns:
        The value computed by ``accuracy_score`` for the whole dataloader.
    """
    model.eval()

    scores_per_batch, gold_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, token_type_ids, labels = batch
            # Only the inputs go to the device; labels are compared on the CPU.
            scores = model(input_ids=input_ids.to(device),
                           attention_mask=attention_mask.to(device),
                           token_type_ids=token_type_ids.to(device))
            scores_per_batch.append(scores.cpu())
            gold_labels += labels.tolist()

    # Stitch the batches together before taking the arg-max.
    predicted = torch.cat(scores_per_batch, dim=0).argmax(dim=1).numpy()
    expected = torch.tensor(gold_labels).numpy()
    return accuracy_score(expected, predicted)  # accuracy on the whole set

In [10]:
best_valid_acc = 0.0  # highest validation accuracy seen so far
separator = '-' * 62

# Five epochs: train, validate, checkpoint on improvement, then report.
for epoch in range(1, 6):
    epoch_start_time = time.time()

    train(model_bert_base, dataloader_train, criterion_cross_entropy, optimizer_adamw, device)
    valid_acc = evaluate(model_bert_base, dataloader_valid, device)

    # Keep the weights only when validation accuracy strictly improves.
    improved = valid_acc > best_valid_acc
    if improved:
        best_valid_acc = valid_acc
        torch.save(model_bert_base.state_dict(), 'torch_model.bin')

    print(separator)
    # The reported time spans training, validation and checkpointing.
    elapsed = time.time() - epoch_start_time
    print('| end of epoch {:5d} | time: {:5.2f}s | valid accuracy {:8.5f} |'.format(epoch, elapsed, valid_acc))
    print(separator)

| step    20 | loss  2.26329 | accuracy  0.31250 |
| step    40 | loss  2.16239 | accuracy  0.50000 |
| step    60 | loss  2.05094 | accuracy  0.53125 |
| step    80 | loss  1.95836 | accuracy  0.59375 |
| step   100 | loss  1.85118 | accuracy  0.71875 |
| step   120 | loss  2.09436 | accuracy  0.46875 |
| step   140 | loss  2.09912 | accuracy  0.43750 |
| step   160 | loss  2.00383 | accuracy  0.53125 |
| step   180 | loss  1.90342 | accuracy  0.65625 |
| step   200 | loss  1.94786 | accuracy  0.59375 |
--------------------------------------------------------------
| end of epoch     1 | time: 19.35s | valid accuracy  0.71202 |
--------------------------------------------------------------
| step    20 | loss  1.73350 | accuracy  0.84375 |
| step    40 | loss  1.80547 | accuracy  0.75000 |
| step    60 | loss  1.85299 | accuracy  0.71875 |
| step    80 | loss  1.76644 | accuracy  0.78125 |
| step   100 | loss  1.69388 | accuracy  0.84375 |
| step   120 | loss  1.87045 | accuracy  0.65

In [14]:
# Rebuild the classifier around a fresh deep copy of the encoder (so that
# `pretrained` is not overwritten) and restore the best checkpoint into it.
model_best = Model(copy.deepcopy(pretrained))
# map_location='cpu' lets a checkpoint that was saved from CUDA tensors load
# on a machine without a GPU; the model is moved to `device` right after.
model_best.load_state_dict(torch.load('torch_model.bin', map_location='cpu'))
model_best = model_best.to(device)
# Re-score the restored weights on the validation set and display the result.
best_valid_acc = evaluate(model_best, dataloader_valid, device)
best_valid_acc

0.8040920716112532