In [2]:
import os
import paddle
import paddlenlp
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# Chinese NLP Corpus
* https://github.com/SophonPlus/ChineseNlpCorpus

# Preprocess my training data

In [3]:
#read in the original dataset
weibo_senti = pd.read_csv('weibo_senti_100k.csv')

weibo_senti = weibo_senti.rename(columns={'review': 'text'})

#define function to swap columns
def swap_columns(df, col1, col2):
    col_list = list(df.columns)
    x, y = col_list.index(col1), col_list.index(col2)
    col_list[y], col_list[x] = col_list[x], col_list[y]
    df = df[col_list]
    return df

#swap points and rebounds columns
df = swap_columns(weibo_senti, 'label', 'text')

In [4]:
# split the df into 3 df's - 80% for train, 10% for dev, 10% for test
train = df.sample(frac = 0.8, random_state=1)
rest = pd.concat([df, train, train]).drop_duplicates(keep=False)
dev = rest.sample(frac = 0.5, random_state=1)
test = pd.concat([rest, dev, dev]).drop_duplicates(keep=False)
# test['label'] = ''

In [27]:
test[test.label == 1]

Unnamed: 0,text,label
9,【霍思燕剖腹产下“小江江” 老公落泪】今晨9时霍思燕产下一名男婴，宝宝重8斤3两，母子平安。...,1
59,顶风作案!明知有雨，但实在舍不得半个月前就定好的票!40大洋一张呢，3个人120大洋呢![嘻...,1
60,偶爱土豆[哈哈],1
78,//@爱旅游爱赣州: 中央四套出品，[good]展示赣州客家人文历史，将拍十二集，九月左右播...,1
80,挺好玩儿的，过年更充实了。[太开心] //@母其弥雅:初一我?一起吃素吧~?得不要吃稀?，洗...,1
...,...,...
59950,我刚认真想了想，主要是因为市场部4个家伙全部热恋中。。。难怪这么抽疯。。。产品童鞋真善解人意...,1
59960,我靠！[鼓掌] //@必须叫我姐姐:巴萨、巴萨、巴萨，我在巴萨的主场包箱里看过球。有香槟、...,1
59961,大家应该大部分都回到自己的工作岗位或开始新学年了对吧？有没有感觉到假期后感觉闷闷不乐[失望]...,1
59963,对@云卷云舒的胡言乱语 说[话筒]：祝愿帅哥图图：生日快乐！[蛋糕][礼物][鼓掌]早日找到...,1


In [5]:
np.savetxt(r'train.txt', train.values, fmt='%s')
np.savetxt(r'dev.txt', dev.values, fmt='%s')
np.savetxt(r'test.txt', test.values, fmt='%s')

In [6]:
from paddlenlp.datasets import load_dataset

def read(data_path):
    with open(data_path, 'r', encoding='utf-8') as f:
        # 跳过列名
        next(f)
        for line in f:
            words = line[:-3]
            labels = line[-2]
            yield {'text': words, 'labels': labels}

In [7]:
train_ds = load_dataset(read, data_path='train.txt',lazy=False)
dev_ds = load_dataset(read, data_path='dev.txt',lazy=False)
test_ds = load_dataset(read, data_path='test.txt',lazy=False)

In [8]:
print("Train example:", train_ds[10])
print("Dev example:", dev_ds[10])
print("Test example:", test_ds[10])

Train example: {'text': '回来晚忘记给酒店打电话了[泪]只能加钱住“ 行政大床房 ”[汗]@--怡然自乐 @小尤爸爸 @猪露头 @生于静默 @滴沥波罗 @羊和猪的生活意见 @雏菊花开的围脖 @最左边那只', 'labels': '0'}
Dev example: {'text': '结果我没下来，你下来了[衰]', 'labels': '0'}
Test example: {'text': '晚饭开始啦！[抱抱] 这是什么菜？竞猜竞猜！！', 'labels': '1'}


In [9]:
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "ernie-3.0-medium-zh"
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_classes=len(train_ds.label_list))
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_classes=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

[32m[2022-11-15 09:54:39,143] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.modeling.ErnieForSequenceClassification'> to load 'ernie-3.0-medium-zh'.[0m
[32m[2022-11-15 09:54:39,147] [    INFO][0m - Already cached /home/emilygong/.paddlenlp/models/ernie-3.0-medium-zh/ernie_3.0_medium_zh.pdparams[0m
[32m[2022-11-15 09:54:46,975] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.tokenizer.ErnieTokenizer'> to load 'ernie-3.0-medium-zh'.[0m
[32m[2022-11-15 09:54:46,980] [    INFO][0m - Already cached /home/emilygong/.paddlenlp/models/ernie-3.0-medium-zh/ernie_3.0_medium_zh_vocab.txt[0m
[32m[2022-11-15 09:54:47,011] [    INFO][0m - tokenizer config file saved in /home/emilygong/.paddlenlp/models/ernie-3.0-medium-zh/tokenizer_config.json[0m
[32m[2022-11-15 09:54:47,013] [    INFO][0m - Special tokens file saved in /home/emilygong/.paddlenlp/models/ernie-3.0-medium-zh/special_tokens_map.json[0m


In [10]:
def convert_example(example, tokenizer):
    tokenized_example = tokenizer(text=example['text'], max_seq_length=128)
    # 加上label用于训练
    tokenized_example['label'] = [int(example['labels'])]
    return tokenized_example

from functools import partial

trans_func = partial(convert_example, tokenizer=tokenizer)

train_ds = train_ds.map(trans_func)
dev_ds = dev_ds.map(trans_func)

In [11]:
# from paddlenlp.datasets import MapDataset

# train_list = convert(train_ds, tokenizer)
# train_ds = MapDataset(train_list)
# dev_list = convert(dev_ds, tokenizer)
# dev_ds = MapDataset(dev_list)

In [12]:
from paddle.io import DataLoader, BatchSampler
from paddlenlp.data import DataCollatorWithPadding

collate_fn = DataCollatorWithPadding(tokenizer)

# 定义BatchSampler，选择批大小和是否随机乱序，进行DataLoader
train_batch_sampler = BatchSampler(train_ds, batch_size=32, shuffle=True)
dev_batch_sampler = BatchSampler(dev_ds, batch_size=64, shuffle=False)
train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=collate_fn)
dev_data_loader = DataLoader(dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=collate_fn)

In [29]:
'''
import functools
import numpy as np

from paddle.io import DataLoader, BatchSampler
from paddlenlp.data import DataCollatorWithPadding

# 数据预处理函数，利用分词器将文本转化为整数序列
def preprocess_function(examples, tokenizer, max_seq_length, is_test=False):
    result = tokenizer(text=examples["text"], max_seq_len=max_seq_length)
    if not is_test:
        result["labels"] = examples["label"]
    return result

trans_func = functools.partial(preprocess_function, tokenizer=tokenizer, max_seq_length=128)
train_ds = train_ds.map(trans_func)
dev_ds = dev_ds.map(trans_func)

# collate_fn函数构造，将不同长度序列充到批中数据的最大长度，再将数据堆叠
collate_fn = DataCollatorWithPadding(tokenizer)

# 定义BatchSampler，选择批大小和是否随机乱序，进行DataLoader
train_batch_sampler = BatchSampler(train_ds, batch_size=32, shuffle=True)
dev_batch_sampler = BatchSampler(dev_ds, batch_size=64, shuffle=False)
train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=collate_fn)
dev_data_loader = DataLoader(dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=collate_fn)
'''

'\nimport functools\nimport numpy as np\n\nfrom paddle.io import DataLoader, BatchSampler\nfrom paddlenlp.data import DataCollatorWithPadding\n\n# 数据预处理函数，利用分词器将文本转化为整数序列\ndef preprocess_function(examples, tokenizer, max_seq_length, is_test=False):\n    result = tokenizer(text=examples["text"], max_seq_len=max_seq_length)\n    if not is_test:\n        result["labels"] = examples["label"]\n    return result\n\ntrans_func = functools.partial(preprocess_function, tokenizer=tokenizer, max_seq_length=128)\ntrain_ds = train_ds.map(trans_func)\ndev_ds = dev_ds.map(trans_func)\n\n# collate_fn函数构造，将不同长度序列充到批中数据的最大长度，再将数据堆叠\ncollate_fn = DataCollatorWithPadding(tokenizer)\n\n# 定义BatchSampler，选择批大小和是否随机乱序，进行DataLoader\ntrain_batch_sampler = BatchSampler(train_ds, batch_size=32, shuffle=True)\ndev_batch_sampler = BatchSampler(dev_ds, batch_size=64, shuffle=False)\ntrain_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=collate_fn)\ndev_data_loader = DataLoade

In [13]:
# Adam优化器、交叉熵损失函数、accuracy评价指标
optimizer = paddle.optimizer.AdamW(learning_rate=2e-5, parameters=model.parameters())
criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()

In [14]:
import paddle.nn.functional as F
def evaluate(model, metric, data_loader):
    model.eval()
    metric.reset()
    losses = []
    for step, batch in enumerate(data_loader, start=1):
        input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']

        # 计算模型输出、损失函数值、分类概率值、准确率
        logits = model(input_ids, token_type_ids)
        probs = F.softmax(logits, axis=1)
        correct = metric.compute(probs, labels)
        metric.update(correct)
        acc = metric.accumulate()
    print("eval accu: %.5f" % (acc))
    model.train()
    return acc

In [None]:
# 开始训练
import time
import paddle.nn.functional as F

# from eval import evaluate
# from PaddleNLP.model_zoo.uie.evaluate import evaluate

epochs = 5 # 训练轮次
ckpt_dir = "ernie_ckpt" #训练过程中保存模型参数的文件夹
best_acc = 0
best_step = 0
global_step = 0 #迭代次数
tic_train = time.time()
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']

        # 计算模型输出、损失函数值、分类概率值、准确率
        logits = model(input_ids, token_type_ids)
        loss = criterion(logits, labels)
        probs = F.softmax(logits, axis=1)
        correct = metric.compute(probs, labels)
        metric.update(correct)
        acc = metric.accumulate()

        # 每迭代10次，打印损失函数值、准确率、计算速度
        global_step += 1
        if global_step % 10 == 0:
            print(
                "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                % (global_step, epoch, step, loss, acc,
                    10 / (time.time() - tic_train)))
            tic_train = time.time()
        
        # 反向梯度回传，更新参数
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        # 每迭代100次，评估当前训练的模型、保存当前模型参数和分词器的词表等
        if global_step % 10 == 0:
            save_dir = ckpt_dir
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            print(global_step, end=' ')
            acc_eval = evaluate(model, metric, dev_data_loader)
            if acc_eval > best_acc:
                best_acc = acc_eval
                best_step = global_step

                model.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)

global step 10, epoch: 1, batch: 10, loss: 0.63461, accu: 0.58437, speed: 0.08 step/s
10 eval accu: 0.73779


[32m[2022-11-14 10:52:35,948] [    INFO][0m - tokenizer config file saved in ernie_ckpt/tokenizer_config.json[0m
[32m[2022-11-14 10:52:35,950] [    INFO][0m - Special tokens file saved in ernie_ckpt/special_tokens_map.json[0m


global step 20, epoch: 1, batch: 20, loss: 0.18690, accu: 0.74006, speed: 0.01 step/s
20 

In [32]:
# 加载ERNIR 3.0最佳模型参数
params_path = 'ernie_ckpt/model_state.pdparams'
state_dict = paddle.load(params_path)
model.set_dict(state_dict)

# 也可以选择加载预先训练好的模型参数结果查看模型训练结果
# model.set_dict(paddle.load('ernie_ckpt_trained/model_state.pdparams'))

print('ERNIE 3.0-Medium 在weibo_senti的dev集表现', end=' ')
eval_acc = evaluate(model, metric, dev_data_loader)

ERNIE 3.0-Medium 在weibo_senti的dev集表现 eval accu: 0.98291


In [15]:
from functools import partial

def convert_example(example, tokenizer):
    tokenized_example = tokenizer(text=example['text'], max_seq_length=128)
    # 加上label用于训练
    tokenized_example['label'] = [int(example['labels'])]
    return tokenized_example

trans_func = partial(convert_example, tokenizer=tokenizer)

test_ds = test_ds.map(trans_func)

# 进行采样组batch
collate_fn_test = DataCollatorWithPadding(tokenizer)
test_batch_sampler = BatchSampler(test_ds, batch_size=32, shuffle=False)
test_data_loader = DataLoader(dataset=test_ds, batch_sampler=test_batch_sampler, collate_fn=collate_fn_test)

In [16]:
# 模型预测分类结果
import paddle.nn.functional as F

label_map = {0: '负面', 1: '正面'}
results = []
model.eval()
for batch in test_data_loader:
    input_ids, token_type_ids = batch['input_ids'], batch['token_type_ids']
    logits = model(batch['input_ids'], batch['token_type_ids'])
    probs = F.softmax(logits, axis=-1)
    idx = paddle.argmax(probs, axis=1).numpy()
    idx = idx.tolist()
    preds = [label_map[i] for i in idx]
    results.extend(preds)

In [17]:
# 存储ChnSentiCorp预测结果  
test_ds = load_dataset("chnsenticorp", splits=["test"]) 

res_dir = "./results"
if not os.path.exists(res_dir):
    os.makedirs(res_dir)
with open(os.path.join(res_dir, "ChnSentiCorp.tsv"), 'w', encoding="utf8") as f:
    f.write("qid\ttext\tprediction\n")
    for i, pred in enumerate(results):
        f.write(test_ds[i]['qid']+"\t"+test_ds[i]['text']+"\t"+pred+"\n")

INFO 2022-11-15 10:16:05,592 download.py:117] unique_endpoints {''}


IndexError: list index out of range

In [23]:
results.count('正面')

11998

In [25]:
len(results)

11998