In [214]:
import torch
import numpy as np
import random
from tqdm import tqdm
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataloader import default_collate
from sklearn.metrics import accuracy_score
import time
from torch.optim.lr_scheduler import LambdaLR

In [215]:
def set_seed(seed):
    """PyTorch随机数种子设置大全"""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)  # CPU上设置随机种子
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)  # 当前GPU上设置随机种子
        # torch.cuda.manual_seed_all(seed) # 所有GPU上设置随机种子


RANDOM_SEED = 42
set_seed(RANDOM_SEED)

In [216]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

#### 数据分析与处理

In [217]:
with open('datasets/train.txt', 'r', encoding='utf-8') as f:
    tmp = []
    cnt = 1
    for line in tqdm(f.read().split('\n')):  # 句子与句子之间通过'\n'分隔
        sentence_id = f'sentence_{cnt}'
        if line != '\n' and len(line.strip()) > 0:
            word_tags = line.split(' ')
            if len(word_tags) == 2:  # 如:'枪 B-16'、'黑 I-16'、'色 I-16'
                tmp.append([sentence_id] + word_tags)
            elif len(word_tags) == 3:  # 如:'  O'
                word = '[SEP]'  # 这里使用Bert模型的sep_token('[SEP]')表示空格(' ')
                tag = word_tags[-1]
                tmp.append([sentence_id, word, tag])
        else:
            cnt += 1

100%|██████████| 2288791/2288791 [00:07<00:00, 299008.34it/s]


In [218]:
tmp[:5]

[['sentence_1', '手', 'B-40'],
 ['sentence_1', '机', 'I-40'],
 ['sentence_1', '三', 'B-4'],
 ['sentence_1', '脚', 'I-4'],
 ['sentence_1', '架', 'I-4']]

In [219]:
data_tmp = pd.DataFrame(tmp, columns=['sentence_id', 'words', 'tags'])  # 转换为DataFrame
data_tmp.head()

Unnamed: 0,sentence_id,words,tags
0,sentence_1,手,B-40
1,sentence_1,机,I-40
2,sentence_1,三,B-4
3,sentence_1,脚,I-4
4,sentence_1,架,I-4


In [220]:
labels_to_ids = {k: v for v, k in enumerate(data_tmp.tags.unique())}  # 标签到id的字典映射
ids_to_labels = {v: k for v, k in enumerate(data_tmp.tags.unique())}  # id到标签的字典映射
labels_to_ids

{'B-40': 0,
 'I-40': 1,
 'B-4': 2,
 'I-4': 3,
 'B-14': 4,
 'I-14': 5,
 'B-5': 6,
 'I-5': 7,
 'B-7': 8,
 'I-7': 9,
 'B-11': 10,
 'I-11': 11,
 'B-13': 12,
 'I-13': 13,
 'B-8': 14,
 'I-8': 15,
 'O': 16,
 'B-16': 17,
 'I-16': 18,
 'B-29': 19,
 'I-29': 20,
 'B-9': 21,
 'I-9': 22,
 'B-12': 23,
 'I-12': 24,
 'B-18': 25,
 'I-18': 26,
 'B-1': 27,
 'I-1': 28,
 'B-3': 29,
 'I-3': 30,
 'B-22': 31,
 'I-22': 32,
 'B-37': 33,
 'I-37': 34,
 'B-39': 35,
 'I-39': 36,
 'B-10': 37,
 'I-10': 38,
 'B-36': 39,
 'I-36': 40,
 'B-34': 41,
 'I-34': 42,
 'B-31': 43,
 'I-31': 44,
 'B-38': 45,
 'I-38': 46,
 'B-54': 47,
 'I-54': 48,
 'B-6': 49,
 'I-6': 50,
 'B-30': 51,
 'I-30': 52,
 'B-15': 53,
 'I-15': 54,
 'B-2': 55,
 'I-2': 56,
 'B-49': 57,
 'I-49': 58,
 'B-21': 59,
 'I-21': 60,
 'B-47': 61,
 'I-47': 62,
 'B-23': 63,
 'I-23': 64,
 'B-20': 65,
 'I-20': 66,
 'B-50': 67,
 'I-50': 68,
 'B-46': 69,
 'I-46': 70,
 'B-41': 71,
 'I-41': 72,
 'B-43': 73,
 'I-43': 74,
 'B-48': 75,
 'I-48': 76,
 'B-19': 77,
 'I-19': 78,
 'B-

In [221]:
data = pd.concat([data_tmp.groupby(['sentence_id'])['words'].apply(lambda x: ' '.join(x)),
                  data_tmp.groupby(['sentence_id'])['tags'].apply(lambda x: ','.join(x))], axis=1)
data.columns = ['sentence', 'word_labels']
data = data.drop_duplicates()
data.head()

Unnamed: 0_level_0,sentence,word_labels
sentence_id,Unnamed: 1_level_1,Unnamed: 2_level_1
sentence_1,手 机 三 脚 架 网 红 直 播 支 架 桌 面 自 拍 杆 蓝 牙 遥 控 三 脚 架 ...,"B-40,I-40,B-4,I-4,I-4,B-14,I-14,B-5,I-5,B-4,I-..."
sentence_10,亚 信 作 废 章 财 务 现 金 付 讫 章 收 讫 受 控 文 件 银 行 付 讫 章 ...,"O,O,B-4,I-4,I-4,B-5,I-5,B-9,I-9,B-4,I-4,I-4,B-..."
sentence_100,乐 创 （ l e c o n ） 奶 茶 店 设 备 全 套 冷 藏 冷 冻 水 吧 台 ...,"B-1,I-1,O,B-1,I-1,I-1,I-1,I-1,O,B-7,I-7,I-7,O,..."
sentence_1000,京 贺 [SEP] 暗 黑 游 戏 鼠 标 垫 超 大 电 竞 加 厚 锁 边 键 盘 垫 ...,"O,O,O,B-14,I-14,B-5,I-5,B-4,I-4,I-4,B-13,I-13,..."
sentence_10000,英 特 尔 （ I n t e l ） 7 8 2 0 7 9 0 0 X i 9 7 9 ...,"B-1,I-1,I-1,O,B-1,I-1,I-1,I-1,I-1,O,B-4,I-4,I-..."


In [222]:
# 句子最大长度为101
# 中文按字分词;英文情况下,wordPiece或BPE分词可以用更小的词片段来组成更大的词
data['sentence'].apply(lambda x: len(x.split(' '))).describe()  #

count    39995.000000
mean        56.220828
std         13.473300
min          7.000000
25%         46.000000
50%         56.000000
75%         65.000000
max        101.000000
Name: sentence, dtype: float64

In [223]:
train_dataset, val_dataset = train_test_split(data, test_size=0.1, random_state=RANDOM_SEED)  # 划分训练数据集验证数据集 

In [224]:
with open('datasets/sample_per_line_preliminary_A.txt', 'r', encoding='utf-8') as f:
    tmp_test = []
    cnt_test = 1
    for line in tqdm(f.read().split('\n')):
        sentence_id = f'test_{cnt_test}'
        for word in line:
            if word.strip():
                # 若不为空白字符
                tmp_test.append([sentence_id, word])
            else:
                tmp_test.append([sentence_id, '[SEP]'])  # 同理使用Bert模型的sep_token('[SEP]')表示空格(' ')      
        cnt_test += 1

data_tmp_test = pd.DataFrame(tmp_test, columns=['sentence_id', 'words'])  # 转换为DataFrame
test_dataset = data_tmp_test.groupby(['sentence_id'])['words'].agg([lambda x: ' '.join(x)])  # 返回值类型为DataFrame
test_dataset.columns = ['sentence']
test_dataset.head()

100%|██████████| 10000/10000 [00:01<00:00, 7487.50it/s]


Unnamed: 0_level_0,sentence
sentence_id,Unnamed: 1_level_1
test_1,O P P O 闪 充 充 电 器 [SEP] X 9 0 7 0 [SEP] X 9 0 ...
test_10,听 雨 轩 8 0 支 装 中 性 笔 芯 0 . 5 m m 全 针 管 水 笔 芯 0 ...
test_100,联 想 I d e a p a d 7 2 0 S - 1 3 I K B 电 脑 炫 彩 ...
test_1000,A 4 纸 包 胶 y o 3 : 1 双 线 圈 3 4 孔 笔 记 本 台 挂 历 菜 ...
test_10000,六 品 堂 半 生 半 熟 宣 纸 1 0 0 张 书 法 专 用 纸 作 品 纸 国 画 ...


In [225]:
MODEL_NAME = 'hfl/chinese-roberta-wwm-ext'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
pretrained = BertModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [226]:
class MyDataset(Dataset):
    """自定义Dataset"""

    def __init__(self, df, have_label=True):
        self.texts = df['sentence'].values
        self.have_label = have_label
        if have_label:
            self.labels = df['word_labels'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        if self.have_label:
            label = self.labels[item]
            return text, label  # 自定义索引方式
        else:
            return text,  # 测试数据集不含标签

In [227]:
def find_label(index, sen_list):
    """递归往前找到第一个不是以'##'开头的单词"""
    index -= 1
    if not sen_list[index].startswith('##'):
        return index
    else:
        return find_label(index, sen_list)

In [228]:
def create_data_loader(df, tokenizer, batch_size, shuffle=True, have_label=True):
    mda = MyDataset(df, have_label=have_label)

    def collate_func(data):
        new_data = default_collate(data)
        encoding = tokenizer(text=list(new_data[0]),
                             return_token_type_ids=True,
                             padding=True,  # Pad to the longest sequence in the batch
                             return_attention_mask=True,
                             return_tensors='pt')
        result_dict = {'input_ids': encoding['input_ids'],
                       'token_type_ids': encoding['token_type_ids'],
                       'attention_mask': encoding['attention_mask']}
        if len(new_data) == 1:  # 测试数据集
            return result_dict  # 测试数据集不含标签
        else:
            want_list = []
            # 可通过指定tokenizer return_offsets_mapping=True或通过encoding.word_ids()简化过程
            for i in range(encoding['input_ids'].shape[0]):
                tokenize_sentence = tokenizer.convert_ids_to_tokens(encoding['input_ids'][i])
                label_list = new_data[1][i].split(',')
                label_list.insert(0, 'O')  # 对应Bert模型句子开头标签'[CLS]'
                label_list.append('O')  # 对应Bert模型句子结束标签'[SEP]'
                for i in range(len(tokenize_sentence)):
                    # Bert使用wordPiece分词
                    # For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`
                    # 如:"unaffable"的标签为:"O",output:`["un", "##aff", "##able"]`的标签应为:`["O", "O", "O"]`
                    if tokenize_sentence[i].startswith('##'):
                        insert_index = find_label(i, tokenize_sentence)
                        insert_label = label_list[insert_index]
                        label_list.insert(i, insert_label)
                    if tokenize_sentence[i] == '[PAD]':
                        label_list.append('O')  # 对应填充字符'[PAD]'
                label_list_ids = [labels_to_ids[label] for label in label_list]
                want_list.append(label_list_ids)
            result_dict['labels'] = torch.tensor(want_list)
            return result_dict

    return DataLoader(mda, batch_size=batch_size, collate_fn=collate_func, shuffle=shuffle)

In [229]:
train_data_loader = create_data_loader(train_dataset, tokenizer, 32, shuffle=True)  # 训练数据集
val_data_loader = create_data_loader(val_dataset, tokenizer, 32, shuffle=False)  # 验证数据集
test_data_loader = create_data_loader(test_dataset, tokenizer, 32, shuffle=False, have_label=False)  # 测试数据集

In [230]:
for i in val_data_loader:
    print(i['input_ids'])
    print(i['input_ids'].shape)
    print(i['labels'])
    print(i['labels'].shape)
    break

tensor([[ 101,  523,  524,  ...,    0,    0,    0],
        [ 101, 1912, 1297,  ...,    0,    0,    0],
        [ 101,  155,  143,  ...,    0,    0,    0],
        ...,
        [ 101, 4035, 3635,  ...,    0,    0,    0],
        [ 101, 6844, 4500,  ...,    0,    0,    0],
        [ 101, 2157, 4500,  ...,    0,    0,    0]])
torch.Size([32, 89])
tensor([[16, 16, 16,  ..., 16, 16, 16],
        [16,  2,  3,  ..., 16, 16, 16],
        [16, 45, 46,  ..., 16, 16, 16],
        ...,
        [16, 27, 28,  ..., 16, 16, 16],
        [16, 16, 16,  ..., 16, 16, 16],
        [16,  8,  9,  ..., 16, 16, 16]])
torch.Size([32, 89])


#### 模型构建

In [231]:
class BaseNER(nn.Module):
    """Bert + Linear基础模型进行命名实体识别"""

    def __init__(self, pretrained_model, num_labels, classifier_dropout=0.3):
        super().__init__()
        self.num_labels = num_labels
        self.pretrained = pretrained_model
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(self.pretrained.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.pretrained(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  token_type_ids=token_type_ids)

        # sequence_output.shape=[batch_size, sequence_length, hidden_size]
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        # logits.shape=[batch_size, sequence_length, num_labels]
        logits = self.classifier(sequence_output)
        return logits

In [232]:
model = BaseNER(pretrained, len(labels_to_ids), 0.3)
model = model.to(device)

### 模型训练与评估

In [233]:
# 损失函数
criterion_cross_entropy = torch.nn.CrossEntropyLoss()

# 优化器
optimizer_adamw = optim.AdamW(model.parameters(), lr=5e-5)  # 模型设备切换必须在优化器定义前执行

In [234]:
def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    """
    Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
    a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            # 学习率预热(线性增加)
            return float(current_step) / float(max(1, num_warmup_steps))
        # 学习率线性衰减(最小为0)
        # num_training_steps后学习率恒为0
        return max(
            0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
        )

    return LambdaLR(optimizer, lr_lambda)


scheduler_lr = get_linear_schedule_with_warmup(optimizer_adamw, 0, len(train_data_loader) * 5)

In [235]:
# 模型训练
def train(model, dataloader, criterion, optimizer, device):
    model.train()

    for idx, i in enumerate(dataloader):
        # 数据设备切换
        input_ids = i['input_ids'].to(device)
        attention_mask = i['attention_mask'].to(device)
        token_type_ids = i['token_type_ids'].to(device)
        labels = i['labels'].to(device)

        optimizer.zero_grad()
        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)
        # out.shape=[batch_size * sequence_length, model.num_labels]
        out = out.reshape(-1, model.num_labels)
        # labels.shape=[batch_size * sequence_length, ]
        labels = labels.reshape(-1)
        loss = criterion(out, labels)  # 损失值

        loss.backward()
        nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=5)  # 梯度裁剪
        optimizer.step()
        scheduler_lr.step()

        if idx % 500 == 0 and idx > 0:
            attention_mask_flatten_bool = attention_mask.reshape(-1) == 1  # reshape顺序与out、labels reshape顺序相同
            labels_mask = torch.masked_select(labels, attention_mask_flatten_bool)  # 布尔索引非填充区域标签(使之不参与评估准确率)
            predict_labels_mask = torch.masked_select(torch.argmax(out, dim=1), attention_mask_flatten_bool)
            acc = accuracy_score(labels_mask.cpu().numpy(), predict_labels_mask.cpu().numpy())
            print('| step {:5d} | loss {:8.5f} | accuracy {:8.5f} |'.format(idx, loss.item(), acc))

In [237]:
# 模型验证
def evaluate(model, dataloader, device):
    model.eval()

    predict_list = []
    y_true_list = []
    with torch.no_grad():
        for idx, i in enumerate(dataloader):
            # 数据设备切换
            input_ids = i['input_ids'].to(device)
            attention_mask = i['attention_mask'].to(device)
            token_type_ids = i['token_type_ids'].to(device)
            labels = i['labels']

            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)
            # out.shape=[batch_size * sequence_length, model.num_labels]
            out = out.reshape(-1, model.num_labels)
            # labels.shape=[batch_size * sequence_length, ]
            labels = labels.reshape(-1)

            attention_mask_flatten_bool = attention_mask.cpu().reshape(-1) == 1  # reshape顺序与out、labels reshape顺序相同
            labels_mask = torch.masked_select(labels, attention_mask_flatten_bool)  # 布尔索引非填充区域标签(使之不参与评估准确率)
            predict_labels_mask = torch.masked_select(torch.argmax(out.cpu(), dim=1), attention_mask_flatten_bool)

            predict_list.append(predict_labels_mask)
            y_true_list.extend(labels_mask.tolist())

    predict_all = torch.cat(predict_list, dim=0)  # 合并所有批次的预测结果
    y_true_all = torch.tensor(y_true_list)
    accuracy = accuracy_score(y_true_all.numpy(), predict_all.numpy())  # 评估指标
    return accuracy

In [238]:
EPOCHS = 5

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(model, train_data_loader, criterion_cross_entropy, optimizer_adamw, device)
    acc_val = evaluate(model, val_data_loader, device)
    print('-' * 63)
    print('| end of epoch {:5d} | time: {:5.2f}s | '
          'valid accuracy {:8.5f} |'.format(epoch, time.time() - epoch_start_time, acc_val))
    print('-' * 63)

| step   500 | loss  0.44088 | accuracy  0.77423 |
| step  1000 | loss  0.44723 | accuracy  0.78127 |
---------------------------------------------------------------
| end of epoch     1 | time: 156.08s | valid accuracy  0.81043 |
---------------------------------------------------------------
| step   500 | loss  0.35923 | accuracy  0.82469 |
| step  1000 | loss  0.36174 | accuracy  0.82898 |
---------------------------------------------------------------
| end of epoch     2 | time: 156.56s | valid accuracy  0.81976 |
---------------------------------------------------------------
| step   500 | loss  0.37173 | accuracy  0.81074 |
| step  1000 | loss  0.41952 | accuracy  0.80341 |
---------------------------------------------------------------
| end of epoch     3 | time: 157.14s | valid accuracy  0.82326 |
---------------------------------------------------------------
| step   500 | loss  0.26962 | accuracy  0.87680 |
| step  1000 | loss  0.32972 | accuracy  0.84546 |
-------------

### 模型预测

In [239]:
# 模型预测
def predict(model, dataloader, device):
    model.eval()

    predict_list = []
    with torch.no_grad():
        for idx, i in enumerate(dataloader):
            # 数据设备切换
            input_ids = i['input_ids'].to(device)
            attention_mask = i['attention_mask'].to(device)
            token_type_ids = i['token_type_ids'].to(device)

            # out.shape=[batch_size, sequence_length, model.num_labels]
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)
            predict = torch.argmax(out, dim=-1)

            # 将id重新映射回标签
            predict_label = pd.DataFrame(predict.cpu().tolist()).applymap(lambda x: ids_to_labels[x])

            predict_list.extend(predict_label.values.tolist())

    return predict_list

In [240]:
predict_result = predict(model, test_data_loader, device)
predict_result[0]  # 模型预测标签(包含冗余部分)

['O',
 'B-37',
 'I-37',
 'I-37',
 'I-37',
 'B-11',
 'I-11',
 'B-4',
 'I-4',
 'I-4',
 'O',
 'B-38',
 'I-38',
 'I-38',
 'I-38',
 'I-38',
 'O',
 'B-38',
 'I-38',
 'I-38',
 'I-38',
 'I-38',
 'O',
 'B-38',
 'I-38',
 'O',
 'B-4',
 'I-4',
 'I-4',
 'B-11',
 'I-11',
 'B-40',
 'I-40',
 'B-4',
 'I-4',
 'I-4',
 'O',
 'O',
 'O',
 'O',
 'B-18',
 'I-18',
 'I-18',
 'B-4',
 'I-4',
 'I-4',
 'O',
 'B-4',
 'I-4',
 'I-4',
 'O',
 'O',
 'O',
 'B-37',
 'I-37',
 'O',
 'B-18',
 'I-18',
 'I-18',
 'I-18',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [242]:
tokenize_list = []
for i in test_data_loader:
    for j in i['input_ids'].tolist():
        tokenize_list.append(tokenizer.convert_ids_to_tokens(j))
tokenize_list[0]  # 分词器分词结果(包括'[CLS]', '[PAD]', Bert模型句子结束标签'[SEP]', 组成英文单词的更小的词片段)

['[CLS]',
 'o',
 'p',
 'p',
 'o',
 '闪',
 '充',
 '充',
 '电',
 '器',
 '[SEP]',
 'x',
 '9',
 '0',
 '7',
 '0',
 '[SEP]',
 'x',
 '9',
 '0',
 '7',
 '7',
 '[SEP]',
 'r',
 '5',
 '[SEP]',
 '快',
 '充',
 '头',
 '通',
 '用',
 '手',
 '机',
 '数',
 '据',
 '线',
 '[SEP]',
 '套',
 '餐',
 '【',
 '2',
 '.',
 '4',
 '充',
 '电',
 '头',
 '+',
 '数',
 '据',
 '线',
 '[SEP]',
 '】',
 '[SEP]',
 '安',
 '卓',
 '[SEP]',
 '1',
 '.',
 '5',
 'm',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [243]:
predict_result_remove_redundancy = []

for num in range(10000):  # 共10000条句子
    predict_num = predict_result[num]
    tokenize_num = tokenize_list[num]
    temp_remove_redundancy_list = []

    for word_id in range(len(tokenize_num)):
        if (tokenize_num[word_id].startswith("##")) or (tokenize_num[word_id] in ['[CLS]', '[PAD]']):
            # 去除'[CLS]'和'[PAD]'的和"##"开头的字的预测结果
            # 例:英文单词"unaffable"的分词结果为:`["un", "##aff", "##able"]`,这里只考虑"un"的预测结果
            continue
        else:
            temp_remove_redundancy_list.append(predict_num[word_id])
    temp_remove_redundancy_str = ' '.join(temp_remove_redundancy_list[:-1])  # 去除Bert模型句子结束标签'[SEP]'的预测结果
    predict_result_remove_redundancy.append(temp_remove_redundancy_str)

In [244]:
# example:
print(test_dataset.iloc[0][0])
print(predict_result_remove_redundancy[0])


O P P O 闪 充 充 电 器 [SEP] X 9 0 7 0 [SEP] X 9 0 7 7 [SEP] R 5 [SEP] 快 充 头 通 用 手 机 数 据 线 [SEP] 套 餐 【 2 . 4 充 电 头 + 数 据 线 [SEP] 】 [SEP] 安 卓 [SEP] 1 . 5 m
B-37 I-37 I-37 I-37 B-11 I-11 B-4 I-4 I-4 O B-38 I-38 I-38 I-38 I-38 O B-38 I-38 I-38 I-38 I-38 O B-38 I-38 O B-4 I-4 I-4 B-11 I-11 B-40 I-40 B-4 I-4 I-4 O O O O B-18 I-18 I-18 B-4 I-4 I-4 O B-4 I-4 I-4 O O O B-37 I-37 O B-18 I-18 I-18 I-18
