In [1]:
import os
import paddle
import paddlenlp
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# Chinese NLP Corpus
* https://github.com/SophonPlus/ChineseNlpCorpus

# Preprocess my training data

In [2]:
#read in the original dataset
weibo_senti = pd.read_csv('weibo_senti_100k.csv')

weibo_senti = weibo_senti.rename(columns={'review': 'text'})

#define function to swap columns
def swap_columns(df, col1, col2):
    col_list = list(df.columns)
    x, y = col_list.index(col1), col_list.index(col2)
    col_list[y], col_list[x] = col_list[x], col_list[y]
    df = df[col_list]
    return df

#swap points and rebounds columns
df = swap_columns(weibo_senti, 'label', 'text')

In [3]:
# split the df into 3 df's - 80% for train, 10% for dev, 10% for test
train = df.sample(frac = 0.8, random_state=1)
rest = pd.concat([df, train, train]).drop_duplicates(keep=False)
dev = rest.sample(frac = 0.5, random_state=1)
test = pd.concat([rest, dev, dev]).drop_duplicates(keep=False)
# test['label'] = ''

In [4]:
np.savetxt(r'train.txt', train.values, fmt='%s')
np.savetxt(r'dev.txt', dev.values, fmt='%s')
np.savetxt(r'test.txt', test.values, fmt='%s')

In [5]:
from paddlenlp.datasets import load_dataset

def read(data_path):
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            words = line[:-3]
            labels = line[-2]
            yield {'text': words, 'labels': labels}

In [6]:
train_ds = load_dataset(read, data_path='train.txt',lazy=False)
dev_ds = load_dataset(read, data_path='dev.txt',lazy=False)
test_ds = load_dataset(read, data_path='test.txt',lazy=False)

In [7]:
print("Train example:", train_ds[10])
print("Dev example:", dev_ds[10])
print("Test example:", test_ds[10])

Train example: {'text': '必须严惩！[怒]//@乐妈雒乐: 同意，这个纯属故意杀人！ //@chaton-桐妈:这种人就应该按故意杀人罪判！！', 'labels': '0'}
Dev example: {'text': '#风尚志七周年#颁奖盛典暨#LUX?风尚SPARK燃情行动# 期待你的红毯表现！[爱你]//@张伦硕:丝滑芬芳，点燃火花，尽享亲密接触。11月21日在北京，和我一起祝风尚志7周岁生日快乐，见证一场燃情行动，你，准备好了吗？[噢耶][噢耶][噢耶]@风尚志', 'labels': '1'}
Test example: {'text': '真的有很多相同之处啊！结拜为兄弟吧！[哈哈]//@厚子林:亲们，视频发啦。回味下泉州的味道。客串主持人好美@南京-老李 @嘉美猫 @温和行者 @苏世独立啦 @繁星满天飞扬 @石泡泡大仙 @Domo-YoYo @大脚丫丫跳芭蕾 @香大菜 @郝浩', 'labels': '1'}


In [8]:
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "ernie-3.0-medium-zh"
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_classes=len(train_ds.label_list))
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_classes=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

[32m[2022-11-21 00:53:25,714] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.modeling.ErnieForSequenceClassification'> to load 'ernie-3.0-medium-zh'.[0m
[32m[2022-11-21 00:53:25,716] [    INFO][0m - Already cached /home/emilygong/.paddlenlp/models/ernie-3.0-medium-zh/ernie_3.0_medium_zh.pdparams[0m
[32m[2022-11-21 00:53:33,493] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.tokenizer.ErnieTokenizer'> to load 'ernie-3.0-medium-zh'.[0m
[32m[2022-11-21 00:53:33,497] [    INFO][0m - Already cached /home/emilygong/.paddlenlp/models/ernie-3.0-medium-zh/ernie_3.0_medium_zh_vocab.txt[0m
[32m[2022-11-21 00:53:33,530] [    INFO][0m - tokenizer config file saved in /home/emilygong/.paddlenlp/models/ernie-3.0-medium-zh/tokenizer_config.json[0m
[32m[2022-11-21 00:53:33,531] [    INFO][0m - Special tokens file saved in /home/emilygong/.paddlenlp/models/ernie-3.0-medium-zh/special_tokens_map.json[0m


In [9]:
def convert_example(example, tokenizer):
    tokenized_example = tokenizer(text=example['text'], max_seq_length=128)
    # 加上label用于训练
    tokenized_example['label'] = [int(example['labels'])]
    return tokenized_example

from functools import partial

trans_func = partial(convert_example, tokenizer=tokenizer)

train_ds = train_ds.map(trans_func)
dev_ds = dev_ds.map(trans_func)

In [10]:
from paddle.io import DataLoader, BatchSampler
from paddlenlp.data import DataCollatorWithPadding

collate_fn = DataCollatorWithPadding(tokenizer)

# 定义BatchSampler，选择批大小和是否随机乱序，进行DataLoader
train_batch_sampler = BatchSampler(train_ds, batch_size=32, shuffle=True)
dev_batch_sampler = BatchSampler(dev_ds, batch_size=64, shuffle=False)
train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=collate_fn)
dev_data_loader = DataLoader(dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=collate_fn)

In [11]:
# Adam优化器、交叉熵损失函数、accuracy评价指标
optimizer = paddle.optimizer.AdamW(learning_rate=2e-5, parameters=model.parameters())
criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()

In [31]:
import paddle.nn.functional as F
def evaluate(model, metric, data_loader):
    model.eval()
    metric.reset()
    losses = []
    for step, batch in enumerate(data_loader, start=1):
        input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']
        # 计算模型输出、损失函数值、分类概率值、准确率
        logits = model(input_ids, token_type_ids)
        probs = F.softmax(logits, axis=1)
        correct = metric.compute(probs, labels)
        metric.update(correct)
        acc = metric.accumulate()
    print("eval accu: %.5f" % (acc))
    model.train()
    return acc

In [None]:
# 开始训练
import time
import paddle.nn.functional as F

# from eval import evaluate
# from PaddleNLP.model_zoo.uie.evaluate import evaluate

epochs = 5 # 训练轮次
ckpt_dir = "ernie_ckpt" #训练过程中保存模型参数的文件夹
best_acc = 0
best_step = 0
global_step = 0 #迭代次数
tic_train = time.time()
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']

        # 计算模型输出、损失函数值、分类概率值、准确率
        logits = model(input_ids, token_type_ids)
        loss = criterion(logits, labels)
        probs = F.softmax(logits, axis=1)
        correct = metric.compute(probs, labels)
        metric.update(correct)
        acc = metric.accumulate()

        # 每迭代10次，打印损失函数值、准确率、计算速度
        global_step += 1
        if global_step % 10 == 0:
            print(
                "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                % (global_step, epoch, step, loss, acc,
                    10 / (time.time() - tic_train)))
            tic_train = time.time()
        
        # 反向梯度回传，更新参数
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        # 每迭代100次，评估当前训练的模型、保存当前模型参数和分词器的词表等
        if global_step % 10 == 0:
            save_dir = ckpt_dir
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            print(global_step, end=' ')
            acc_eval = evaluate(model, metric, dev_data_loader)
            if acc_eval > best_acc:
                best_acc = acc_eval
                best_step = global_step

                model.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)

global step 10, epoch: 1, batch: 10, loss: 0.63461, accu: 0.58437, speed: 0.08 step/s
10 eval accu: 0.73779


[32m[2022-11-14 10:52:35,948] [    INFO][0m - tokenizer config file saved in ernie_ckpt/tokenizer_config.json[0m
[32m[2022-11-14 10:52:35,950] [    INFO][0m - Special tokens file saved in ernie_ckpt/special_tokens_map.json[0m


global step 20, epoch: 1, batch: 20, loss: 0.18690, accu: 0.74006, speed: 0.01 step/s
20 

In [17]:
# 加载ERNIR 3.0最佳模型参数
params_path = 'ernie_ckpt/model_state.pdparams'
state_dict = paddle.load(params_path)
model.set_dict(state_dict)

# 也可以选择加载预先训练好的模型参数结果查看模型训练结果
# model.set_dict(paddle.load('ernie_ckpt_trained/model_state.pdparams'))

print('ERNIE 3.0-Medium 在weibo_senti的dev集表现', end=' ')
eval_acc = evaluate(model, metric, dev_data_loader)

ERNIE 3.0-Medium 在weibo_senti的dev集表现 

In [16]:
from functools import partial

def convert_example(example, tokenizer):
    tokenized_example = tokenizer(text=example['text'], max_seq_length=128)
    # 加上label用于训练
    tokenized_example['label'] = [int(example['labels'])]
    return tokenized_example

trans_func = partial(convert_example, tokenizer=tokenizer)

test_ds = test_ds.map(trans_func)

# 进行采样组batch
collate_fn_test = DataCollatorWithPadding(tokenizer)
test_batch_sampler = BatchSampler(test_ds, batch_size=32, shuffle=False)
test_data_loader = DataLoader(dataset=test_ds, batch_sampler=test_batch_sampler, collate_fn=collate_fn_test)

In [17]:
# 模型预测分类结果
import paddle.nn.functional as F

label_map = {0: '负面', 1: '正面'}
results = []
model.eval()
for batch in test_data_loader:
    input_ids, token_type_ids = batch['input_ids'], batch['token_type_ids']
    logits = model(batch['input_ids'], batch['token_type_ids'])
    probs = F.softmax(logits, axis=-1)
    idx = paddle.argmax(probs, axis=1).numpy()
    idx = idx.tolist()
    preds = [label_map[i] for i in idx]
    results.extend(preds)

In [19]:
results.count('正面') #positive

5712

In [20]:
results.count('负面') #negative

6286

In [23]:
print('ERNIE 3.0-Medium 在weibo_senti的test集表现', end=' ')
eval_acc = evaluate(model, metric, test_data_loader)

ERNIE 3.0-Medium 在weibo_senti的test集表现 eval accu: 0.98225


# My dataset

In [13]:
marked = pd.read_csv('new_manual_sa_marked.csv')
marked = marked.drop(marked.columns[[0, 1, 2]], axis = 1)
marked['sentiment_score'].replace(['negative', 'positive'], [0, 1], inplace=True)
marked = marked.rename(columns={'post_text': 'text', 'sentiment_score': 'label'})
swap_columns(marked, 'label', 'text')
marked

Unnamed: 0,text,label
0,我是真服了#合肥疫情#,0
1,啥也不说了，时间真的证明了，武汉真是英雄的城市。,1
2,如果这个女孩朋友圈是造谣，周处长可以选择报警了；如果她所说属实，周处长真是一位好婆婆。这一邮...,0
3,【昨日#上海新增355例确诊5298例无症状#】上海市卫健委今早（31日）通报：2022年3...,0
4,#100万癌症患者的上海生存#过去两个月，在上海这座医疗资源被剧烈争夺的超级大都市，癌症患者...,1
...,...,...
195,“上海封上一周不行吗？”不行！因为上海承载全国乃至全球重要功能#上海为什么不能封城#L中事的...,0
196,本来打了很多字，但后来全部删掉了，因为说太多反而会牵扯很多话题，也不想评论里大家吵起来。现在...,1
197,#全国已有上万名医护人员驰援上海#2020至今，最拉的城市！没有之一！！！！关键这种时候某些...,0
198,没转吉林市丹东市等一系列三四线封城求助是因为我坚信转了也没用，一二线省会城市还会稍微注意以下...,0


In [14]:
# 3 is too long for the model to predict
# 18, 40, 45, 59, 151, 168, 177, 184 contains emoji -> needs some cleaning first
# drop_list = [3, 18, 40, 45, 59, 151, 168, 177, 184]

In [15]:
import demoji
def clean(context):
    try:
        context = demoji.replace(context, repl="")
    except:
        context = context
    return context

In [16]:
marked['text'] = marked['text'].apply(clean)

In [17]:
marked = marked.drop(3).reset_index(drop=True)
marked

Unnamed: 0,text,label
0,我是真服了#合肥疫情#,0
1,啥也不说了，时间真的证明了，武汉真是英雄的城市。,1
2,如果这个女孩朋友圈是造谣，周处长可以选择报警了；如果她所说属实，周处长真是一位好婆婆。这一邮...,0
3,#100万癌症患者的上海生存#过去两个月，在上海这座医疗资源被剧烈争夺的超级大都市，癌症患者...,1
4,你的领导（上级）有口臭吗,0
...,...,...
194,“上海封上一周不行吗？”不行！因为上海承载全国乃至全球重要功能#上海为什么不能封城#L中事的...,0
195,本来打了很多字，但后来全部删掉了，因为说太多反而会牵扯很多话题，也不想评论里大家吵起来。现在...,1
196,#全国已有上万名医护人员驰援上海#2020至今，最拉的城市！没有之一！！！！关键这种时候某些...,0
197,没转吉林市丹东市等一系列三四线封城求助是因为我坚信转了也没用，一二线省会城市还会稍微注意以下...,0


In [24]:
marked.iat[6, 0]

'【#吉林新增本土确诊1730例##吉林新增本土无症状1244例#】4月1日0-24时，吉林全省新增本地确诊病例1730例(轻型1720例、普通型10例），其中长春市1544例（含19例无症状感染者转为确诊病例）、吉林市178例、四平市6例（含1例无症状感染者转为确诊病例）、白城市2例；新增本地无症状感染者1244例，其中长春市894例、吉林市349例、四平市1例。以上感染者均已转运至定点医疗机构隔离治疗，对以上人员的密切接触者、次密切接触者均已开展追踪排查，并落实管控措施，对其生活和工作场所进行了终末消毒。'

In [26]:
np.savetxt(r'actual_test.txt', marked.values, fmt='%s')

In [27]:
actual_test_ds = load_dataset(read, data_path='actual_test.txt',lazy=False)

In [28]:
from functools import partial

def convert_example(example, tokenizer):
    tokenized_example = tokenizer(text=example['text'], max_seq_length=128)
    # 加上label用于训练
    tokenized_example['label'] = [int(example['labels'])]
    return tokenized_example

trans_func = partial(convert_example, tokenizer=tokenizer)

actual_test_ds = actual_test_ds.map(trans_func)

# 进行采样组batch
collate_fn_test = DataCollatorWithPadding(tokenizer)
test_batch_sampler = BatchSampler(test_ds, batch_size=32, shuffle=False)
actual_test_data_loader = DataLoader(dataset=actual_test_ds, batch_sampler=test_batch_sampler, collate_fn=collate_fn_test)

In [None]:
# Padding

In [32]:
max_val =[]
for i in actual_test_ds:
    max_val.append(len(i['input_ids']))
max_val
# for i in range(len(max_val)):
#     if max_val[i] == 39979:
#         print(i)

[13,
 26,
 76,
 197,
 14,
 129,
 235,
 99,
 192,
 142,
 248,
 14,
 165,
 130,
 36,
 414,
 34,
 364,
 12,
 37,
 619,
 147,
 25,
 460,
 245,
 51,
 510,
 22,
 32,
 304,
 97,
 127,
 75,
 43,
 174,
 148,
 251,
 233,
 50,
 69,
 170,
 684,
 19,
 43,
 87,
 423,
 92,
 19,
 473,
 3,
 29,
 173,
 127,
 31,
 881,
 192,
 50,
 177,
 149,
 375,
 10,
 25,
 512,
 290,
 184,
 359,
 359,
 435,
 15,
 569,
 826,
 233,
 12,
 66,
 10,
 25,
 45,
 35,
 20,
 881,
 328,
 62,
 358,
 272,
 52,
 113,
 94,
 345,
 190,
 144,
 108,
 322,
 42,
 43,
 247,
 122,
 156,
 39,
 8,
 20,
 193,
 44,
 169,
 143,
 38,
 122,
 451,
 39,
 571,
 338,
 238,
 96,
 37,
 119,
 33,
 125,
 41,
 62,
 914,
 104,
 30,
 376,
 31,
 21,
 573,
 42,
 315,
 22,
 51,
 229,
 62,
 262,
 158,
 103,
 159,
 433,
 117,
 210,
 74,
 442,
 97,
 12,
 382,
 529,
 322,
 137,
 272,
 491,
 629,
 655,
 9,
 213,
 419,
 93,
 150,
 8,
 259,
 205,
 53,
 382,
 130,
 163,
 51,
 313,
 390,
 540,
 122,
 35,
 77,
 50,
 54,
 19,
 18,
 310,
 17,
 13,
 55,
 61,
 110,
 157,
 60

In [33]:
tokenizer

PretrainedTokenizer(name_or_path='', vocab_size=39979, model_max_len=2048, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [None]:
# 加载ERNIR 3.0最佳模型参数
params_path = 'ernie_ckpt/model_state.pdparams'
state_dict = paddle.load(params_path)
model.set_dict(state_dict)

print('ERNIE 3.0-Medium 在my weibo datatest的表现', end=' ')
actual_eval_acc = evaluate(model, metric, actual_test_data_loader)

ERNIE 3.0-Medium 在my weibo datatest的表现 

Exception in thread Thread-6:
Traceback (most recent call last):
  File "/home/emilygong/miniconda3/envs/paddle_env/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/home/emilygong/miniconda3/envs/paddle_env/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/home/emilygong/miniconda3/envs/paddle_env/lib/python3.9/site-packages/paddle/fluid/dataloader/dataloader_iter.py", line 217, in _thread_loop
    batch = self._dataset_fetcher.fetch(indices,
  File "/home/emilygong/miniconda3/envs/paddle_env/lib/python3.9/site-packages/paddle/fluid/dataloader/fetcher.py", line 121, in fetch
    data.append(self.dataset[idx])
  File "/home/emilygong/miniconda3/envs/paddle_env/lib/python3.9/site-packages/paddlenlp/datasets/dataset.py", line 276, in __getitem__
    self.new_data[idx]
IndexError: list index out of range
