In [None]:
# Sample rows of the raw data file, kept here for reference only (the string
# below is a bare expression and is never assigned or parsed).
# The original note labelled this "iFlytek data".
# NOTE(review): the file actually loaded later is toutiao_cat_data.txt - confirm the source.
# Each row appears to be: <id>_!_<category code>_!_<category name>_!_<title>_!_<keywords>,
# where the keywords field may be empty.
'''
6551700932705387022_!_101_!_news_culture_!_京城最值得你来场文化之旅的博物馆_!_保利集团,马未都,中国科学技术馆,博物馆,新中国
6552368441838272771_!_101_!_news_culture_!_发酵床的垫料种类有哪些？哪种更好？_!_
6552407965343678723_!_101_!_news_culture_!_上联：黄山黄河黄皮肤黄土高原。怎么对下联？_!_
6552332417753940238_!_101_!_news_culture_!_林徽因什么理由拒绝了徐志摩而选择梁思成为终身伴侣？_!_
6552475601595269390_!_101_!_news_culture_!_黄杨木是什么树？_!_
6552387648126714125_!_101_!_news_culture_!_上联：草根登上星光道，怎么对下联？_!_
6552271725814350087_!_101_!_news_culture_!_什么是超写实绘画？_!_
6552452982015787268_!_101_!_news_culture_!_松涛听雨莺婉转，下联？_!_
6552400379030536455_!_101_!_news_culture_!_上联：老子骑牛读书，下联怎么对？_!_
6552339283632455939_!_101_!_news_culture_!_上联：山水醉人何须酒。如何对下联？_!_
'''

In [1]:
import pandas as pd
import codecs

In [2]:
# Read the raw file once, inside a `with` block, so the handle is closed as
# soon as the lines are in memory (previously the file was opened twice and
# neither handle was ever closed).
with codecs.open('toutiao_cat_data.txt', encoding='utf8') as _raw_file:
    _raw_lines = [raw_line.strip() for raw_line in _raw_file]

# Labels: the second `_!_`-separated field is the numeric category code;
# subtracting 100 turns codes such as 101 into small integer class ids.
news_label = [int(row.split('_!_')[1]) - 100 for row in _raw_lines]

# Text, for rows shaped like
#   6554468413316530436_!_101_!_news_culture_!_上联：巧逢春景添月色，求下联？_!_
# If the row ends with the separator (last field empty) take the
# second-to-last field; otherwise take the last field.
# NOTE(review): for rows whose last field is non-empty this uses that last
# field (the keywords in the sample rows) rather than the title - confirm
# this is the intended model input.
news_text = [row.split('_!_')[-2] if row.endswith('_!_') else row.split('_!_')[-1]
             for row in _raw_lines]

In [3]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import random
import re

# Split the first 50,000 samples into a training part and a validation part
# (80% / 20%).
# stratify=<labels> samples per label, so both parts follow the same label
# distribution.
# random_state pins the shuffle so that Restart Kernel -> Run All reproduces
# the same split (and therefore comparable accuracy numbers) every time.
_subset_text = news_text[:50000]
_subset_label = news_label[:50000]
x_train, x_test, train_label, test_label = train_test_split(_subset_text,
                                                            _subset_label,
                                                            test_size=0.2,
                                                            stratify=_subset_label,
                                                            random_state=42)

In [4]:
# Requires the `transformers` package (pip install transformers).
from transformers import BertTokenizer

# Original author's note: obtaining the tokenizer does not download the model.
# Alternative checkpoint that was tried: 'bert-base-chinese'.
tokenizer = BertTokenizer.from_pretrained('E:/models/chinese-roberta-wwm-ext')

# Shared tokenizer settings: cut sequences longer than 64 tokens and pad the
# rest, so every split is encoded the same way.
_encode_options = dict(truncation=True, padding=True, max_length=64)

train_encoding = tokenizer(x_train, **_encode_options)
test_encoding = tokenizer(x_test, **_encode_options)

In [5]:
# Dataset wrapper around the tokenizer output
class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: mapping from field name to a per-sample indexable sequence
            (anything exposing ``.items()``).
        labels: indexable sequence of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(int(self.labels[idx]))
        return sample

# Pair each split's encodings with its labels.
train_dataset = NewsDataset(encodings=train_encoding, labels=train_label)
test_dataset = NewsDataset(encodings=test_encoding, labels=test_label)

In [8]:
# NewsDataset serves the text after it has been processed by the tokenizer.
# The fields of one item mean:
# input_ids: ids of the tokens in the vocabulary
# token_type_ids: segment ids that tell two sentences apart (first sentence all 0, second all 1)
# attention_mask: the padding mask (marks which positions self-attention operates on)
train_dataset[1]

{'input_ids': tensor([ 101,  711,  784,  720, 2255, 7553,  677, 4638, 3946, 2428, 3683, 2255,
         5558,  678, 4638, 3946, 2428,  856, 8043,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor(12)}

In [6]:
# Accuracy computation
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D score array; the predicted class is the argmax along axis 1.
        labels: NumPy array of true class ids; it is flattened before comparing.

    Returns:
        Number of matching positions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    hits = (predicted == expected).sum()
    return hits / expected.size

由于是文本分类任务，直接使用 BertForSequenceClassification 加载预训练模型（本例为 chinese-roberta-wwm-ext，也可以换成 bert-base-chinese），这里需要指定对应的类别数量（num_labels）。

In [7]:
# AdamW comes from torch: the copy that used to live in `transformers` is
# deprecated and missing from newer releases.
from torch.optim import AdamW
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup

# Load the pretrained encoder together with a new 17-way classification head.
# Original author's note: this is the point where the model is fetched.
# num_labels must be larger than the biggest class id (labels are category
# code minus 100).
model = BertForSequenceClassification.from_pretrained('E:/models/chinese-roberta-wwm-ext', num_labels=17)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Go from single-sample access to mini-batches.
# Only the training data needs shuffling; evaluation metrics do not depend on
# the order of the batches.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Number of epochs the schedule is planned for. Keep this equal to the epoch
# count of the training loop (`for epoch in range(3)`): with a shorter plan
# the linear schedule reaches a learning rate of 0 after the first epoch and
# the remaining epochs would not update the weights at all.
num_epochs = 3

# Optimizer with initial learning rate 2e-5. eps and weight_decay are set
# explicitly to the defaults of the former transformers.AdamW so the switch
# to torch.optim.AdamW does not silently change the update rule.
optim = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optim,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

Some weights of the model checkpoint at E:/models/chinese-roberta-wwm-ext were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the mod

In [8]:
# Training function
def train(epoch=None):
    """Run one pass over `train_loader`, updating `model` in place.

    Args:
        epoch: epoch number used only in the progress messages. When omitted,
            the module-level `epoch` set by the driver loop is used, and 0 if
            no such variable exists, so calling `train()` on its own no
            longer raises NameError.

    Prints the current loss every 100 iterations and the average training
    loss at the end of the pass.
    """
    if epoch is None:
        # Backward compatible with the driver loop, which calls train()
        # without arguments and exposes the loop variable as a global.
        epoch = globals().get('epoch', 0)

    model.train()
    total_train_loss = 0
    total_iter = len(train_loader)
    for iter_num, batch in enumerate(train_loader, start=1):
        # Forward pass
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        total_train_loss += loss.item()

        # Backward pass; clip the gradient norm to 1.0 before the update.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Parameter update, then advance the learning-rate schedule.
        optim.step()
        scheduler.step()

        if iter_num % 100 == 0:
            print("epoch: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss.item(), iter_num/total_iter*100))

    print("Epoch: %d, Average training loss: %.4f"%(epoch, total_train_loss/total_iter))
    
def validation():
    """Evaluate `model` on `test_dataloader` and print accuracy and mean loss.

    Both metrics are weighted by batch size. Averaging the per-batch values
    directly would over-weight a final batch that is smaller than the others.
    """
    model.eval()
    total_correct = 0.0
    total_eval_loss = 0.0
    total_samples = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in test_dataloader:
            # Forward pass
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            loss = outputs[0]
            logits = outputs[1]
            batch_size = labels.size(0)

            # The loss is treated as a per-batch mean (assumed from the model's
            # default loss reduction); scale it back to a per-sample sum.
            total_eval_loss += loss.item() * batch_size

            logits = logits.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()
            # flat_accuracy returns a fraction; convert it to a count of hits.
            total_correct += flat_accuracy(logits, label_ids) * batch_size
            total_samples += batch_size

    avg_val_accuracy = total_correct / total_samples
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f"%(total_eval_loss/total_samples))
    print("-------------------------------")
    

# Driver loop: three epochs, each one a full training pass followed by an
# evaluation pass.
# The loop variable must stay named `epoch`: train() picks it up from module
# scope for its progress messages.
for epoch in range(3):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    validation()

------------Epoch: 0 ----------------
epoth: 0, iter_num: 100, loss: 1.2159, 4.00%
epoth: 0, iter_num: 200, loss: 0.9938, 8.00%
epoth: 0, iter_num: 300, loss: 0.9240, 12.00%
epoth: 0, iter_num: 400, loss: 0.8493, 16.00%
epoth: 0, iter_num: 500, loss: 0.4449, 20.00%
epoth: 0, iter_num: 600, loss: 0.2735, 24.00%
epoth: 0, iter_num: 700, loss: 0.5147, 28.00%
epoth: 0, iter_num: 800, loss: 0.4732, 32.00%
epoth: 0, iter_num: 900, loss: 0.4290, 36.00%
epoth: 0, iter_num: 1000, loss: 0.4736, 40.00%
epoth: 0, iter_num: 1100, loss: 0.9289, 44.00%
epoth: 0, iter_num: 1200, loss: 1.0946, 48.00%
epoth: 0, iter_num: 1300, loss: 0.5912, 52.00%
epoth: 0, iter_num: 1400, loss: 0.0931, 56.00%
epoth: 0, iter_num: 1500, loss: 0.7967, 60.00%
epoth: 0, iter_num: 1600, loss: 0.7024, 64.00%
epoth: 0, iter_num: 1700, loss: 0.5830, 68.00%
epoth: 0, iter_num: 1800, loss: 0.2056, 72.00%
epoth: 0, iter_num: 1900, loss: 0.8426, 76.00%
epoth: 0, iter_num: 2000, loss: 0.3800, 80.00%
epoth: 0, iter_num: 2100, loss: 0

bert-base-chinese：训练一个 Epoch 后，验证集精度已经达到 87.11%  
chinese-roberta-wwm-ext：训练一个 Epoch 后，验证集精度已经达到 87.61%