In [None]:
from functools import partial
import numpy as np
import paddle
import pandas as pd
from paddle.utils import run_check
from paddle import nn
import paddlenlp
import paddle.nn.functional as F
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.datasets import load_dataset
from paddlenlp import datasets, transformers
from visualdl import LogWriter

# Run Paddle's installation self-check before doing any real work.
run_check()
# Print the names exported by paddle.text (the label reads "NLP-related datasets").
print('自然语言相关数据集：', paddle.text.__all__)

In [None]:
# Load the ChnSentiCorp dataset, one dataset object per requested split.
train_ds, dev_ds, test_ds = load_dataset(
    "chnsenticorp",
    splits=["train", "dev", "test"],
)

In [None]:
# Name of the pretrained checkpoint shared by the classifier and its tokenizer.
MODEL_NAME = "ernie-3.0-medium-zh"

# Sequence classifier sized to the number of labels declared by the training split.
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_classes=len(train_ds.label_list),
)
# Tokenizer matching the same checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Data processing: turn one raw text sample into model-ready fields.
def convert_example(example, tokenizer):
    """Convert one raw sample into an ``(input_ids, label)`` pair.

    Args:
        example: Mapping providing a ``"text"`` entry to tokenize and a
            ``"label"`` entry that is passed through unchanged.
        tokenizer: Callable invoked as ``tokenizer(text=...)``; its result
            is indexed with ``"input_ids"``.

    Returns:
        A 2-tuple ``(input_ids, label)``. Only ``input_ids`` is taken from
        the tokenizer output; any other fields it produces are dropped.
    """
    token_ids = tokenizer(text=example["text"])["input_ids"]
    label = example["label"]
    return token_ids, label

def create_train_dataloader(dataset, tokenizer, batch_size):
    """Build a shuffling ``paddle.io.DataLoader`` over ``dataset``.

    Args:
        dataset: Dataset whose samples are ``(input_ids, label)`` pairs.
        tokenizer: Object exposing ``pad_token_id``, used as the padding value.
        batch_size: Number of samples per batch.

    Returns:
        A ``paddle.io.DataLoader`` created with ``return_list=True`` that
        draws batches from a shuffling ``DistributedBatchSampler``.
    """
    # Field-wise collator: the first field goes through Pad (pad value is the
    # tokenizer's pad id), the second through Stack as int64.
    # NOTE(review): presumably Pad extends each input_ids list to the longest
    # one in the batch -- confirm against the paddlenlp.data documentation.
    collate_fields = Tuple([
        Pad(axis=0, pad_val=tokenizer.pad_token_id),
        Stack(dtype="int64"),
    ])

    def batchify_fn(samples):
        return collate_fields(samples)

    sampler = paddle.io.DistributedBatchSampler(
        dataset,
        batch_size=batch_size,
        shuffle=True,
    )
    return paddle.io.DataLoader(
        dataset=dataset,
        batch_sampler=sampler,
        collate_fn=batchify_fn,
        return_list=True,
    )
    

# Number of samples per training batch.
BATCH_SIZE = 8

# Bind the tokenizer once so the dataset only has to supply each example.
trans_func = partial(convert_example, tokenizer=tokenizer)
# The return value is discarded, so this relies on map() updating train_ds itself.
# NOTE(review): presumably re-running this cell on an already-mapped dataset would
# apply the conversion a second time -- reload the dataset first if you re-run it.
train_ds.map(trans_func)
train_data_loader = create_train_dataloader(
    dataset=train_ds,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
)

In [None]:
# Optimizer: AdamW over every model parameter.
optimizer = paddle.optimizer.AdamW(learning_rate=5e-5, parameters=model.parameters())
# Loss: cross-entropy between the classifier logits and the integer labels.
criterion = paddle.nn.loss.CrossEntropyLoss()

# Accuracy metric; it is never reset below, so `acc` is a running value over
# every batch seen since training started (across epochs).
metric = paddle.metric.Accuracy()
# Training
epochs = 3
global_step = 0
# Make training mode explicit so layers such as dropout behave as intended,
# whatever mode the model was left in after loading.
model.train()
with LogWriter(logdir="./logs") as writer:
    for epoch in range(1, epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, labels = batch
            logits = model(input_ids)
            loss = criterion(logits, labels)
            # Accuracy is computed straight from the logits; no softmax is
            # applied because its result was never used.
            correct = metric.compute(logits, labels)
            metric.update(correct)
            acc = metric.accumulate()
            # Convert once to a plain Python float so the logger and the
            # progress message receive a number rather than a Tensor.
            loss_value = float(loss)

            # Record the running accuracy and the batch loss under the tags
            # `acc` and `loss`; the first batch is logged at step 0.
            writer.add_scalar(tag="acc", step=global_step, value=acc)
            writer.add_scalar(tag="loss", step=global_step, value=loss_value)
            global_step += 1
            if global_step % 10 == 0:
                print("epoch %d, step %d: loss:%.5f, acc:%.5f" % (epoch, step, loss_value, acc))
            # Backpropagate, apply the update, then clear gradients for the next batch.
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()

In [None]:
# Save the fine-tuned model together with its tokenizer so that the directory
# can be reloaded on its own later, without going back to the original checkpoint name.
model.save_pretrained('./trained_model/')
tokenizer.save_pretrained('./trained_model/')