# NER是一种序列标注问题

In [1]:
# NER (named entity recognition): imports, dataset loading, and label vocabulary
import pandas as pd
import torch
import numpy as np
import torch.nn as nn
from transformers import BertForTokenClassification
from transformers import BertTokenizerFast
from torch.utils.data import DataLoader
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
# 读取数据
import os
# NOTE(review): setting CURL_CA_BUNDLE to an empty string is a workaround that,
# with some versions of `requests`, disables TLS certificate verification for
# downloads (e.g. fetching the pretrained tokenizer/model below). It is kept to
# preserve existing behavior -- confirm it is still needed, and prefer pointing
# the variable at a valid CA bundle instead.
os.environ['CURL_CA_BUNDLE'] = ''

# Load the NER dataset; the code below reads its 'text' and 'labels' columns.
df = pd.read_csv('ner.csv')

# Each 'labels' cell is a whitespace-separated tag string; split it into a list.
labels = [i.split() for i in df['labels'].values.tolist()]

# Collect every distinct tag that appears anywhere in the dataset.
unique_labels = set()
for lb in labels:
    unique_labels.update(lb)

# Map each tag to an integer id; sorting makes the mapping deterministic.
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}

# Before BERT can classify entities at token level the data needs two
# preprocessing steps: tokenization, and re-aligning the labels so that they
# match the tokenized sequence.
text = df['text'].values.tolist()
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')



In [2]:
def align_label(texts, labels, label_all_tokens=True):
    """Build one label id per token of ``texts`` after tokenization.

    The text is tokenized with the module-level ``tokenizer`` (padded/truncated
    to 512 tokens) and every token is mapped back to the word it came from via
    ``word_ids()``.

    Args:
        texts: The raw text to tokenize.
        labels: Sequence of tag strings, indexed by word position.
        label_all_tokens: When True, every token of a word receives that word's
            label id. When False, only the first token of each word does and
            the remaining tokens of the word get -100.

    Returns:
        A list of ints, one per token. -100 marks tokens that have no word
        (``word_ids()`` yields None for them), tokens whose word index has no
        entry in ``labels``, and tokens whose tag is missing from the
        module-level ``labels_to_ids`` mapping.

    NOTE(review): ``labels`` is indexed by the tokenizer's own word
    segmentation. If the tokenizer splits words differently from how the label
    string was split (the dataset class splits on whitespace), labels can shift
    relative to the tokens -- confirm against the data.
    """
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)
    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        is_first_token_of_word = word_idx != previous_word_idx
        previous_word_idx = word_idx

        # Tokens without a word, and continuation tokens when only the first
        # token of each word should be labelled, are marked with -100.
        if word_idx is None or not (is_first_token_of_word or label_all_tokens):
            label_ids.append(-100)
            continue

        try:
            label_ids.append(labels_to_ids[labels[word_idx]])
        except (IndexError, KeyError):
            # IndexError: more words than labels; KeyError: unknown tag.
            # Only these two lookup failures are tolerated, so unrelated
            # errors are no longer silently swallowed.
            label_ids.append(-100)

    return label_ids



# 构建自己的数据集类
class DataSequence(Dataset):
    """Dataset pairing each tokenized text with its aligned label ids.

    Everything is tokenized eagerly in ``__init__`` using the module-level
    ``tokenizer`` and ``align_label``.
    """

    def __init__(self, df):
        """Tokenize ``df['text']`` and align ``df['labels']`` to the tokens.

        Args:
            df: DataFrame with a 'text' column and a 'labels' column holding
                whitespace-separated tag strings.
        """
        # Split each whitespace-separated label string into a list of tags.
        lb = [i.split() for i in df['labels'].values.tolist()]
        # Cast to str once so that the inputs and the label alignment are both
        # computed from exactly the same text (previously only the inputs were
        # cast, so a non-string cell would fail during label alignment).
        txt = [str(i) for i in df['text'].values.tolist()]
        self.texts = [tokenizer(i,
                                padding='max_length', max_length=512,
                                truncation=True, return_tensors="pt") for i in txt]
        # One list of label ids per text, aligned to its tokens.
        self.labels = [align_label(i, j) for i, j in zip(txt, lb)]

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def get_batch_data(self, idx):
        """Return the tokenizer output stored for example ``idx``."""
        return self.texts[idx]

    def get_batch_labels(self, idx):
        """Return the label ids of example ``idx`` as a ``torch.LongTensor``."""
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        """Return ``(tokenizer_output, label_tensor)`` for example ``idx``."""
        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)
        return batch_data, batch_labels

# Keep only the first 1000 rows.
df = df[0:1000]

# Shuffle with a fixed seed, then cut at 80% / 90% of the row count:
# 0.8 train, 0.1 validation, 0.1 test.
# Positional slicing replaces np.split(), which goes through
# DataFrame.swapaxes and is deprecated for DataFrames in recent pandas.
_df_shuffled = df.sample(frac=1, random_state=42)
_train_end = int(.8 * len(df))
_val_end = int(.9 * len(df))
df_train = _df_shuffled.iloc[:_train_end]
df_val = _df_shuffled.iloc[_train_end:_val_end]
df_test = _df_shuffled.iloc[_val_end:]

In [3]:
## 建模
class BertModel(nn.Module):
    """Wrapper module around ``BertForTokenClassification``.

    The wrapped model is loaded from the 'bert-base-cased' checkpoint with one
    output class per entry of the module-level ``unique_labels`` set.
    """

    def __init__(self):
        super().__init__()
        num_tags = len(unique_labels)
        self.bert = BertForTokenClassification.from_pretrained(
            'bert-base-cased', num_labels=num_tags)

    def forward(self, input_id, mask, label):
        """Run the wrapped model and return its output unchanged.

        Args:
            input_id: Passed to the wrapped model as ``input_ids``.
            mask: Passed as ``attention_mask``.
            label: Passed as ``labels``.

        Returns:
            Whatever the wrapped model returns with ``return_dict=False``; the
            training loop in this notebook unpacks it as ``(loss, logits)``.
        """
        return self.bert(
            input_ids=input_id,
            attention_mask=mask,
            labels=label,
            return_dict=False,
        )

In [4]:
# Use the GPU when PyTorch reports one is available (it speeds up training),
# otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Wrap the training and validation splits in datasets.
# NOTE(review): train_loop() builds its own datasets, loaders and device from
# the DataFrames it is given, so the objects created in this cell are not used
# by it -- confirm whether this cell is still needed.
train_dataset = DataSequence(df_train)
val_dataset = DataSequence(df_val)

# Batched loaders over both datasets; one example per batch.
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=1, num_workers=4)


In [5]:
from tqdm import tqdm 
def _masked_accuracy(logits, labels):
    """Return accuracy as a float over positions whose label is not -100."""
    keep = labels != -100
    predictions = logits[0][keep].argmax(dim=1)
    return (predictions == labels[keep]).float().mean().item()


def train_loop(model, df_train, df_val):
    """Train ``model`` with SGD and print per-epoch train/validation metrics.

    Reads the module-level ``LEARNING_RATE`` and ``EPOCHS`` at call time, so
    both must be defined before this function is called.

    Args:
        model: Module called as ``model(input_id, mask, label)`` and returning
            ``(loss, logits)``.
        df_train: DataFrame used to build the training ``DataSequence``.
        df_val: DataFrame used to build the validation ``DataSequence``.

    Returns:
        None. The model is trained in place; metrics are printed once per epoch.

    Note:
        Both loaders use ``batch_size=1`` and the ``[0]`` indexing below relies
        on that; larger batches are not supported by this loop.
    """
    # Build the datasets and the loaders over them.
    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)
    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=1, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=1)

    # Use the GPU when one is available.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    model = model.to(device)

    for epoch_num in range(EPOCHS):
        # ---- training ----
        total_acc_train = 0.0
        total_loss_train = 0.0
        model.train()
        for train_data, train_label in tqdm(train_dataloader):
            # Drop the batch dimension added by the loader (batch_size=1).
            train_label = train_label[0].to(device)
            mask = train_data['attention_mask'][0].to(device)
            input_id = train_data['input_ids'][0].to(device)

            # Gradients accumulate by default, so reset them for every step.
            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            # Accumulate plain floats so no tensors are kept alive across the
            # epoch; positions labelled -100 are excluded from the accuracy.
            total_acc_train += _masked_accuracy(logits, train_label)
            total_loss_train += loss.item()

            loss.backward()
            optimizer.step()

        # ---- validation ----
        total_acc_val = 0.0
        total_loss_val = 0.0
        model.eval()
        # No gradients are needed for evaluation; skipping them avoids building
        # an autograd graph for every validation example.
        with torch.no_grad():
            for val_data, val_label in val_dataloader:
                val_label = val_label[0].to(device)
                mask = val_data['attention_mask'][0].to(device)
                input_id = val_data['input_ids'][0].to(device)

                loss, logits = model(input_id, mask, val_label)

                total_acc_val += _masked_accuracy(logits, val_label)
                total_loss_val += loss.item()

        print(
            f'''Epochs: {epoch_num + 1} |
                Loss: {total_loss_train / len(df_train): .3f} |
                Accuracy: {total_acc_train / len(df_train): .3f} |
                Val_Loss: {total_loss_val / len(df_val): .3f} |
                Accuracy: {total_acc_val / len(df_val): .3f}''')

In [None]:

# Training hyperparameters. train_loop() reads these module-level names when it
# is called, so they must be defined before the call below.
LEARNING_RATE = 1e-2
EPOCHS = 5
# Build the token-classification model and train it on the train/validation splits.
model = BertModel()
train_loop(model, df_train, df_val)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/800 [00:00<?, ?it/s]