In [None]:
import os
import time
import numpy as np
from tqdm import tqdm, trange
import codecs
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig
import transformers
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from seqeval.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report

设置数据路径

In [None]:
data_path = './CONLL2003/'

# Sort the directory listing so files are always visited in the same order;
# later cells map files to corpus splits purely by their position in this list.
fnames = sorted(os.listdir(data_path))

# Full path of every entry in the data directory, in sorted order.
data_files = [os.path.join(data_path, name) for name in fnames]

In [None]:
# Frequency tables filled in by the counting cell.
word_count, tag_count = {}, {}

# Flat (not sentence-segmented) word / tag lists, one pair per split.
word_train, train_tag = [], []
word_val, val_tag = [], []
word_test, test_tag = [], []

# Name -> list lookup so the reader loop can pick a target list by key.
corpus_dic = dict(
    word_test=word_test, test_tag=test_tag,
    word_val=word_val, val_tag=val_tag,
    word_train=word_train, train_tag=train_tag,
)

# Keys in file order: entries 2*i and 2*i+1 are the word / tag lists
# filled from the i-th data file.
corpus_list = [
    'word_test', 'test_tag',
    'word_train', 'train_tag',
    'word_val', 'val_tag',
]

数据处理

In [None]:
# Fill the flat word / tag lists: the i-th file feeds the lists named by
# corpus_list[2*i] (first column) and corpus_list[2*i+1] (second column).
for file_idx, path in enumerate(data_files):
    with codecs.open(path, encoding='utf-8') as fid:
        for line in fid:
            # Empty lines carry no token; skip them.
            if line in ('\r\n', '\n'):
                continue

            fields = line.replace('\r\n', '').split()
            corpus_dic[corpus_list[2 * file_idx]].append(fields[0])
            corpus_dic[corpus_list[2 * file_idx + 1]].append(fields[1])

In [None]:
# Accumulate occurrence counts of every word and every tag over all files.
for file_idx in range(len(data_files)):
    for word in corpus_dic[corpus_list[2 * file_idx]]:
        word_count[word] = word_count.get(word, 0) + 1

    for label in corpus_dic[corpus_list[2 * file_idx + 1]]:
        tag_count[label] = tag_count.get(label, 0) + 1

In [None]:
# (word, count) pairs, most frequent first; ties keep their insertion order.
wcounts = sorted(word_count.items(), key=lambda pair: pair[1], reverse=True)
word_sorted = [word for word, _ in wcounts]

# Ids start at 1 because 0 is reserved for padding.
word_index = {word: rank for rank, word in enumerate(word_sorted, start=1)}
word_index['PAD'] = 0
# The unknown-word id is the next free id after PAD has been added.
word_index['UNK'] = len(word_index)

保存标签与数值标签的映射

In [None]:
# Tags are numbered in the order they were first seen while counting.
tag_index = {tag_name: idx for idx, tag_name in enumerate(tag_count)}
# Inverse mapping, used to turn predicted ids back into tag strings.
index_tag = {idx: tag_name for tag_name, idx in tag_index.items()}

增加PAD这一Label

In [None]:
# The PAD label takes the first id after all real tags (ids 0..len(tag_count)-1).
tag_index.update({'PAD': len(tag_count)})
index_tag.update({len(tag_count): 'PAD'})

In [None]:
# Sentence-segmented containers: each entry is the list of words (or tags)
# of one sentence. Note that train_tag / val_tag / test_tag are rebound here
# to fresh lists.
train_sent, train_tag = [], []
val_sent, val_tag = [], []
test_sent, test_tag = [], []

# Name -> list lookup so the reader loop can pick a target list by key.
sentences_corpus = dict(
    test_sent=test_sent, test_tag=test_tag,
    val_sent=val_sent, val_tag=val_tag,
    train_sent=train_sent, train_tag=train_tag,
)

# Keys in file order: entries 2*i and 2*i+1 are the sentence / tag lists
# filled from the i-th data file.
sent_corpus_list = [
    'test_sent', 'test_tag',
    'train_sent', 'train_tag',
    'val_sent', 'val_tag',
]

读取数据

In [None]:
# Read every data file into sentence-level lists: the i-th file feeds the
# lists named by sent_corpus_list[2*i] (words) and sent_corpus_list[2*i+1] (tags).
for file_idx, path in enumerate(data_files):
    sent_list = sentences_corpus[sent_corpus_list[2 * file_idx]]
    tag_list = sentences_corpus[sent_corpus_list[2 * file_idx + 1]]
    sent, tag = [], []

    # `with` closes the handle deterministically; utf-8 matches the encoding
    # used by the word-level reader earlier in the notebook.
    with open(path, encoding='utf-8') as fid:
        for line in fid:
            fields = line.split()
            if fields:
                # First column is the word, second column is its tag.
                sent.append(fields[0])
                tag.append(fields[1])
                continue

            # A blank (or whitespace-only) line ends the current sentence.
            # Consecutive blank lines must not produce empty sentences, which
            # would become zero-token examples downstream.
            if sent:
                sent_list.append(sent)
                tag_list.append(tag)
                sent, tag = [], []

    # Keep the last sentence when the file does not end with a blank line.
    if sent:
        sent_list.append(sent)
        tag_list.append(tag)

In [None]:
# Use the first CUDA GPU when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Load the tokenizer for the cased checkpoint with lower-casing disabled.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-cased',
    do_lower_case=False,
)

In [None]:
# Token-classification head on top of bert-base-cased, with one output class
# per entry of tag_index (including the PAD label). Attention maps and hidden
# states are not requested in the outputs.
ner_model = BertForTokenClassification.from_pretrained(
    "bert-base-cased", num_labels=len(tag_index),
    output_attentions=False, output_hidden_states=False,
)

对输入数据进行分词

In [None]:
def tokenize_and_preserve_tags(sentence, sent_tags):
    """Split each word into sub-word tokens and repeat its tag once per token.

    Args:
        sentence: iterable of word strings.
        sent_tags: iterable of tags, paired position-by-position with `sentence`.

    Returns:
        A tuple `(tokenized_sentence, tags, n_tokens)` where
        `tokenized_sentence` is the flat list of sub-word tokens,
        `tags` holds one tag per sub-word token (the tag of the word the token
        came from), and `n_tokens` holds, for every word, the number of
        sub-word tokens it was split into.
    """
    tokenized_sentence, tags, n_tokens = [], [], []

    for word, word_tag in zip(sentence, sent_tags):
        # Sub-word pieces produced by the module-level tokenizer for this word.
        pieces = tokenizer.tokenize(word)
        piece_count = len(pieces)

        n_tokens.append(piece_count)
        tokenized_sentence += pieces
        # Every piece inherits the tag of its source word.
        tags += [word_tag] * piece_count

    return tokenized_sentence, tags, n_tokens

In [None]:
# One (tokens, tags, tokens-per-word) triple per training sentence.
train_tokenized_sents_and_tags = list(
    map(
        tokenize_and_preserve_tags,
        sentences_corpus['train_sent'],
        sentences_corpus['train_tag'],
    )
)

将数据集分为数据，标签，分词数量

In [None]:
# Unzip the per-sentence triples into three parallel lists.
train_tokenized_texts = [tokens for tokens, _, _ in train_tokenized_sents_and_tags]
train_labels = [labels for _, labels, _ in train_tokenized_sents_and_tags]
num_tokens = [counts for _, _, counts in train_tokenized_sents_and_tags]

In [None]:
train_input_ids = []
train_attention_masks = []
train_tags = []
for i in range(len(train_tokenized_texts)):
    temp = tokenizer.encode_plus(train_tokenized_texts[i],add_special_tokens=True,max_length=128, pad_to_max_length=True)
    train_input_ids.append(temp['input_ids'])
    train_attention_masks.append(temp['attention_mask'])
    temp_tag = [tag_index.get(l) for l in train_labels[i]]
    for i in range(len(temp_tag),128):
        temp_tag.append(tag_index['PAD'])
    if len(temp_tag)>128:
        temp_tag = temp_tag[:128]
    train_tags.append(temp_tag)

In [None]:
val_tokenized_sents_and_tags = [
    tokenize_and_preserve_tags(sent, tag)
    for sent, tag in zip(sentences_corpus['val_sent'], sentences_corpus['val_tag'])
]

In [None]:
val_tokenized_texts = [token_label_pair[0] for token_label_pair in val_tokenized_sents_and_tags]
val_labels = [token_label_pair[1] for token_label_pair in val_tokenized_sents_and_tags]

In [None]:
val_input_ids = []
val_attention_masks = []
val_tags = []
for i in range(len(val_tokenized_texts)):
    temp = tokenizer.encode_plus(val_tokenized_texts[i],add_special_tokens=True,max_length=128, pad_to_max_length=True)
    val_input_ids.append(temp['input_ids'])
    val_attention_masks.append(temp['attention_mask'])
    temp_tag = [tag_index.get(l) for l in val_labels[i]]
    for i in range(len(temp_tag),128):
        temp_tag.append(tag_index['PAD'])
    if len(temp_tag)>128:
        temp_tag = temp_tag[:128]
    val_tags.append(temp_tag)

构建DataSet与DataLoader

In [None]:
# Convert the per-split id / mask / tag rows into tensors.
# train_tags and val_tags are rebound from lists of rows to tensors here.
train_inputs, train_masks, train_tags = [
    torch.tensor(rows)
    for rows in (train_input_ids, train_attention_masks, train_tags)
]
val_inputs, val_masks, val_tags = [
    torch.tensor(rows)
    for rows in (val_input_ids, val_attention_masks, val_tags)
]

In [None]:
batch_size = 4

# Each dataset item is an (input_ids, attention_mask, tags) triple.
training_data = TensorDataset(train_inputs, train_masks, train_tags)
valid_data = TensorDataset(val_inputs, val_masks, val_tags)

# Shuffle the training examples; keep validation in its original order.
training_sampler = RandomSampler(training_data)
valid_sampler = SequentialSampler(valid_data)

training_dataloader = DataLoader(training_data, batch_size=batch_size, sampler=training_sampler)
valid_dataloader = DataLoader(valid_data, batch_size=batch_size, sampler=valid_sampler)

In [None]:
# Place the model on the same device the training loop moves each batch to,
# so the notebook also runs on machines without a CUDA GPU.
ner_model.to(device);

对预训练模型进行finetune，设置优化器

In [None]:
# True: update every weight of the model. False: train only the classifier head.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(ner_model.named_parameters())
    # Parameters whose name contains one of these substrings get no weight decay.
    # 'LayerNorm.weight' is the PyTorch name of the layer-norm scale; 'gamma' and
    # 'beta' are kept for checkpoints that still use the legacy names.
    no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
    # The per-group option read by AdamW is `weight_decay`; any other key
    # (such as `weight_decay_rate`) is ignored and the default decay is used.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(ner_model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [None]:
epochs = 4
# Upper bound on the gradient norm, applied by the clipping call in the training loop.
max_grad_norm = 1.0

# The scheduler is stepped once per batch, so the schedule has to cover
# (batches per epoch) * (number of epochs) steps.
total_steps = epochs * len(training_dataloader)

# Linear learning-rate schedule with zero warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps,
)

In [None]:
# Fine-tuning loop: every epoch runs one pass over training_dataloader, then
# evaluates on valid_dataloader and prints seqeval metrics.
# loss_values / validation_loss_values collect one average loss per epoch.
loss_values, validation_loss_values = [], []

for _ in range(epochs):
    # Put the model in training mode.
    ner_model.train()
    # Running sum of the per-batch training losses for this epoch.
    total_loss = 0

    # Training loop
    for batch in tqdm(training_dataloader):
        # add batch to device (gpu or cpu)
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # reset gradients before performing a backward pass.
        ner_model.zero_grad()
        # forward pass; passing `labels` makes the model return the loss as outputs[0]
        outputs = ner_model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the training loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        total_loss += loss.item()
        # Clip the norm of the gradient to prevent from the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=ner_model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(training_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    # Switch to evaluation mode for the validation pass.
    ner_model.eval()
    # Reset the validation loss
    # NOTE(review): eval_accuracy, nb_eval_steps and nb_eval_examples are
    # assigned here but never read in this cell.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Per-sentence rows of predicted / gold tag ids, gathered over all batches.
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for evaluation.
        with torch.no_grad():
            # Forward pass, calculate the predicted logits
            outputs = ner_model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accumulate the batch loss and keep the highest-scoring tag id per position.
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    # Average validation loss per batch.
    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    
    # Convert ids back to tag strings, dropping every position whose gold
    # label is PAD so only labelled tokens are scored.
    pred_tags = []
    valid_tags = []
    for p, l in zip(predictions, true_labels):
        preds = []
        labs = []
        for p_i, l_i in zip(p, l):
            if index_tag[l_i] != "PAD":
                preds.append(index_tag[p_i])
                labs.append(index_tag[l_i])
        pred_tags.append(preds)
        valid_tags.append(labs)
    
    # seqeval metrics take (gold, predicted) lists of per-sentence tag sequences.
    print("Validation Precision: {}".format(precision_score(valid_tags, pred_tags)))
    print("Validation Recall: {}".format(recall_score(valid_tags, pred_tags)))
    print("Validation Accuracy: {}".format(accuracy_score(valid_tags, pred_tags)))
    print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags)))
    print("Report: {}".format(classification_report(valid_tags, pred_tags)))
    print()