In [1]:
import os
import paddle
import paddlenlp
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# Load Datasets

In [2]:
from paddlenlp.datasets import load_dataset

def read(data_path):
    """Yield one ``{'text': ..., 'labels': ...}`` example per non-blank line.

    Each line is parsed positionally: after the line terminator is removed,
    the last character is the label, the character before it is a
    one-character separator, and everything before the separator is the text.

    Args:
        data_path: Path of a UTF-8 encoded text file with one example per line.

    Yields:
        dict: ``{'text': <str>, 'labels': <one-character str>}``.
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Strip only the line terminator, so a final line that lacks a
            # trailing newline is sliced exactly like every other line.
            record = line.rstrip('\r\n')
            if not record.strip():
                # Blank lines carry no example; indexing them would fail.
                continue
            yield {'text': record[:-2], 'labels': record[-1]}

In [3]:
# Eagerly (lazy=False) load the train / dev / test splits through `read`,
# in that order.
train_ds, dev_ds, test_ds = (
    load_dataset(read, data_path=split_path, lazy=False)
    for split_path in ('train_clean.txt', 'dev_clean.txt', 'test_clean.txt')
)

In [4]:
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer

# Name of the pretrained checkpoint shared by the classifier and its tokenizer.
model_name = "ernie-3.0-medium-zh"

# Sequence classifier with a two-class output head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_classes=2,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

[32m[2022-12-02 11:40:08,224] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.modeling.ErnieForSequenceClassification'> to load 'ernie-3.0-medium-zh'.[0m
[32m[2022-12-02 11:40:08,228] [    INFO][0m - Already cached /home/emilygong/.paddlenlp/models/ernie-3.0-medium-zh/ernie_3.0_medium_zh.pdparams[0m
[32m[2022-12-02 11:40:16,140] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.tokenizer.ErnieTokenizer'> to load 'ernie-3.0-medium-zh'.[0m
[32m[2022-12-02 11:40:16,144] [    INFO][0m - Already cached /home/emilygong/.paddlenlp/models/ernie-3.0-medium-zh/ernie_3.0_medium_zh_vocab.txt[0m
[32m[2022-12-02 11:40:16,177] [    INFO][0m - tokenizer config file saved in /home/emilygong/.paddlenlp/models/ernie-3.0-medium-zh/tokenizer_config.json[0m
[32m[2022-12-02 11:40:16,179] [    INFO][0m - Special tokens file saved in /home/emilygong/.paddlenlp/models/ernie-3.0-medium-zh/special_tokens_map.json[0m


In [5]:
def convert_example(example, tokenizer, max_seq_length=128):
    """Tokenize one example and attach its integer class label.

    Args:
        example: Mapping with a ``'text'`` entry (handed to the tokenizer) and
            a ``'labels'`` entry that ``int()`` can convert.
        tokenizer: Callable invoked as
            ``tokenizer(text=..., max_seq_length=...)``; its return value must
            support item assignment.
        max_seq_length: Value forwarded to the tokenizer. Defaults to 128,
            the value that used to be hard-coded here.

    Returns:
        The tokenizer's output with an extra ``'label'`` entry holding the
        one-element list ``[int(example['labels'])]``.
    """
    tokenized_example = tokenizer(text=example['text'], max_seq_length=max_seq_length)
    # Attach the label so it is available for training / evaluation.
    tokenized_example['label'] = [int(example['labels'])]
    return tokenized_example

from functools import partial

# Bind the tokenizer once so the datasets' map() can call the converter with
# a single example argument.
trans_func = partial(convert_example, tokenizer=tokenizer)

# Apply the conversion to the train, dev and test splits, in that order.
train_ds, dev_ds, test_ds = (
    split.map(trans_func) for split in (train_ds, dev_ds, test_ds)
)

In [6]:
from paddle.io import DataLoader, BatchSampler
from paddlenlp.data import DataCollatorWithPadding

# A single collator, built from the tokenizer, is shared by all three loaders.
collate_fn = DataCollatorWithPadding(tokenizer)

# Training split: batches of 32, shuffled.
train_batch_sampler = BatchSampler(train_ds, batch_size=32, shuffle=True)
train_data_loader = DataLoader(
    dataset=train_ds,
    batch_sampler=train_batch_sampler,
    collate_fn=collate_fn,
)

# Dev split: batches of 64, original order.
dev_batch_sampler = BatchSampler(dev_ds, batch_size=64, shuffle=False)
dev_data_loader = DataLoader(
    dataset=dev_ds,
    batch_sampler=dev_batch_sampler,
    collate_fn=collate_fn,
)

# Test split: batches of 32, original order.
test_batch_sampler = BatchSampler(test_ds, batch_size=32, shuffle=False)
test_data_loader = DataLoader(
    dataset=test_ds,
    batch_sampler=test_batch_sampler,
    collate_fn=collate_fn,
)

In [7]:
# Accuracy metric, cross-entropy loss, and an AdamW optimizer
# (learning rate 2e-5) over all of the model's parameters.
metric = paddle.metric.Accuracy()
criterion = paddle.nn.loss.CrossEntropyLoss()
optimizer = paddle.optimizer.AdamW(
    parameters=model.parameters(),
    learning_rate=2e-5,
)

In [8]:
import paddle.nn.functional as F
def evaluate(model, metric, data_loader):
    """Compute and print the accuracy of ``model`` over ``data_loader``.

    Args:
        model: Called as ``model(input_ids, token_type_ids)`` to obtain
            logits; must provide ``eval()`` and ``train()``.
        metric: Object providing ``reset()``, ``compute(pred, label)``,
            ``update(...)`` and ``accumulate()``.
        data_loader: Iterable of batches indexable by ``'input_ids'``,
            ``'token_type_ids'`` and ``'labels'``.

    Returns:
        The value of ``metric.accumulate()`` after every batch has been seen.

    Side effects:
        Resets ``metric``, prints the score, and leaves ``model`` in training
        mode on return (unchanged from the previous behavior).
    """
    model.eval()
    metric.reset()
    # Evaluation needs no gradients; disabling autograd avoids building the
    # backward graph for every forward pass.
    with paddle.no_grad():
        for batch in data_loader:
            input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']
            # Forward pass -> class probabilities -> per-sample correctness.
            logits = model(input_ids, token_type_ids)
            probs = F.softmax(logits, axis=1)
            correct = metric.compute(probs, labels)
            metric.update(correct)
    # Read the running accuracy once, after the loop. This also keeps `acc`
    # defined when the loader yields no batches; the value is then whatever
    # the metric reports with no updates.
    acc = metric.accumulate()
    print("eval accu: %.5f" % (acc))
    model.train()
    return acc

# Loading the Model

In [9]:
# Load the best ERNIE 3.0 model parameters from disk into `model`.
# NOTE(review): this cell reads 'ernie_ckpt_cleaner/...' while a later cell
# reads 'ernie_ckpt_clean/...' — confirm that both directories are intended.
params_path = 'ernie_ckpt_cleaner/model_state.pdparams'
state_dict = paddle.load(params_path)
model.set_dict(state_dict)

In [10]:
# Report accuracy on the dev split. end=' ' keeps the score printed by
# evaluate() on the same output line as this label.
print('ERNIE 3.0-Medium 在weibo_senti的dev集表现', end=' ')
eval_acc = evaluate(model, metric, dev_data_loader)

ERNIE 3.0-Medium 在weibo_senti的dev集表现 eval accu: 0.77066


In [11]:
# Report accuracy on the test split. end=' ' keeps the score printed by
# evaluate() on the same output line as this label.
# Note: this rebinds `eval_acc`, replacing the dev-split value.
print('ERNIE 3.0-Medium 在weibo_senti的test集表现', end=' ')
eval_acc = evaluate(model, metric, test_data_loader)

ERNIE 3.0-Medium 在weibo_senti的test集表现 eval accu: 0.77453


In [None]:
# 模型预测分类结果
import paddle.nn.functional as F

# Human-readable sentiment name for each predicted class id.
label_map = {0: '负面', 1: '正面'}
results = []
model.eval()
# Pure inference: disable autograd so no backward graph is built per batch.
with paddle.no_grad():
    for batch in test_data_loader:
        input_ids, token_type_ids = batch['input_ids'], batch['token_type_ids']
        # Use the unpacked tensors instead of indexing `batch` a second time.
        logits = model(input_ids, token_type_ids)
        probs = F.softmax(logits, axis=-1)
        # Most probable class id per sample, as plain Python ints.
        idx = paddle.argmax(probs, axis=1).numpy().tolist()
        preds = [label_map[i] for i in idx]
        results.extend(preds)

# Evaluation on My Own Weibo Dataset

In [12]:
# Eagerly (lazy=False) load the custom test file through the same `read`
# generator used for the other splits.
actual_test_ds = load_dataset(read, data_path='actual_test_clean.txt',lazy=False)

In [13]:
from functools import partial

def convert_example(example, tokenizer, max_seq_length=128):
    """Tokenize one example and attach its integer class label.

    Args:
        example: Mapping with a ``'text'`` entry (handed to the tokenizer) and
            a ``'labels'`` entry that ``int()`` can convert.
        tokenizer: Callable invoked as
            ``tokenizer(text=..., max_seq_length=...)``; its return value must
            support item assignment.
        max_seq_length: Value forwarded to the tokenizer. Defaults to 128,
            the value that used to be hard-coded here.

    Returns:
        The tokenizer's output with an extra ``'label'`` entry holding the
        one-element list ``[int(example['labels'])]``.
    """
    tokenized_example = tokenizer(text=example['text'], max_seq_length=max_seq_length)
    # Attach the label so it is available for training / evaluation.
    tokenized_example['label'] = [int(example['labels'])]
    return tokenized_example

# Bind the tokenizer so map() can call the converter with one argument.
trans_func = partial(convert_example, tokenizer=tokenizer)

actual_test_ds = actual_test_ds.map(trans_func)

# Sample the custom test set into fixed-order batches of 32.
# Note: this rebinds `test_batch_sampler`, which an earlier cell bound to the
# sampler over `test_ds`.
collate_fn_test = DataCollatorWithPadding(tokenizer)
test_batch_sampler = BatchSampler(actual_test_ds, batch_size=32, shuffle=False)
actual_test_data_loader = DataLoader(
    dataset=actual_test_ds,
    batch_sampler=test_batch_sampler,
    collate_fn=collate_fn_test,
)

In [14]:
# Load the best ERNIE 3.0 model parameters from disk into `model`.
# NOTE(review): this cell reads 'ernie_ckpt_clean/...' while an earlier cell
# reads 'ernie_ckpt_cleaner/...' — confirm which checkpoint is intended here.
params_path = 'ernie_ckpt_clean/model_state.pdparams'
state_dict = paddle.load(params_path)
model.set_dict(state_dict)

# Report accuracy on the custom test set. end=' ' keeps the score printed by
# evaluate() on the same output line as this label.
print('ERNIE 3.0-Medium 在my weibo datatest的表现', end=' ')
actual_eval_acc = evaluate(model, metric, actual_test_data_loader)

ERNIE 3.0-Medium 在my weibo datatest的表现 eval accu: 0.72222
