## 1. 任务介绍 
序列标注问题，是自然语言中最常见的问题，包括分词，词性标注，命名实体识别，关键词抽取，词义角色标注等。具体而言即给定一个序列:
$$
X = (x_1,x_2,x_3,\dots, x_n)
$$
为其每个元素打上标签集合中的某个标签，得到标签序列：
$$
Y = (y_1,y_2,y_3,\dots,y_n)
$$
命名实体识别任务是识别句子中出现的实体，通常识别人名、地名、机构名这三类实体


## 2. 环境准备

In [1]:
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch
import pickle
import re
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## 3. 定义参数

In [2]:
data_dir = "../data/CONLL2003/"

tag_scheme = "BIOES"  # 标记策略，BIO 或者 BIOES
lower = True  # 是否将单词转成小写
zeros = True  # 是否将所有数字都转成0

max_seq_length = 50
max_word_length = 50

kernel_size = 3  # 1维卷积核大小
char_embed_size = 50
num_filters = 30  # 卷积个数，即输出的通道数

word_embed_size = 100 # 嵌入词向量维度
hidden_size = 200  # LSTM的隐状态维度
bidrection = True  # 是否使用双向的LSTM
dropout_rate = 0.5  # dropout 参数
gradient_clip = 5.0

lr = 0.015  # 初始学习率
decay_rate = 0.05  # 衰减率
batch_size = 10
shuffle = True  # 是否打乱数据加载的次序
epoch = 50

START_TAG = '<START>'
STOP_TAG = '<STOP>'

embedding_path = "../data/pretrained/glove.6B.100d.txt"
models_path = "../checkpoints/"  # 模型保存路径
mapping_file = "../data/mapping.pkl"  # 保存数据预处理后的文件路径


device = torch.device(("cuda:0" if torch.cuda.is_available() else "cpu"))

## 4. 数据处理
主要包括以下几个步骤：
1. 将句子中的所有数字都替换为0：对于NER任务，数值对于识别实体并无帮助，数值替换为0有利于模型更加关注于重要的单词
2. 更新标注策略：NER可能会使用不同的标注策略，如BIOES，BIO，论文中使用的时BIOES
3. 创建单词映射表，字符映射表和标签映射表

### 4.1 加载数据

In [30]:
def zero_digits(s):
    """
    替换单词中的数字为0
    """
    return re.sub('\d', '0', s)


def load_corpus(path, zeros):
    sentences = []
    tags = []
    sent = []
    tag = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            line = zero_digits(line.strip()) if zeros else line.strip()
            if len(line) > 0:
                if 'DOCSTART' not in line:
                    word = line.split()
                    sent.append(word[0])
                    tag.append(word[-1])
            else:
                if len(sent) > 0:
                    sentences.append(sent)
                    tags.append(tag)
                    sent = []
                    tag = []
    return sentences, tags


train_sentences, train_tags = load_corpus(data_dir+"eng.train", zeros)
valid_sentences, valid_tags = load_corpus(data_dir+"eng.testa", zeros)
test_sentences, test_tags = load_corpus(data_dir+"eng.testb", zeros)

print(len(train_sentences), len(train_tags))
train_tags

14041 14041


[['I-ORG', 'O', 'I-MISC', 'O', 'O', 'O', 'I-MISC', 'O', 'O'],
 ['I-PER', 'I-PER'],
 ['I-LOC', 'O'],
 ['O',
  'I-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'I-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'I-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['I-LOC',
  'O',
  'O',
  'O',
  'O',
  'I-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'I-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'I-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'I-ORG',
  'O',
  'O',
  'O',
  'I-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'I-ORG',
  'I-ORG',
 

### 4.2 更新标记策略
+ B - 单词为该类型实体短语的开始
+ I - 单词为该类型实体短语的中间
+ O - 单词不是一个实体短语
+ E - 单词为该类型实体短语的截止
+ S - 单个实体类型单词

In [31]:
def iob2(tags):
    """
    检查标签是否是合法的BIO格式，然后从BIO1转换为BIO2格式
    """
    for i, tag in enumerate(tags):
        if tag == 'O':
            continue
        split = tag.split('-')
        if len(split) != 2 or split[0] not in ['I', 'B']:  # 说明不是BIO格式
            return False
        if split[0] == 'B':
            continue
        elif i == 0 or tags[i-1] == 'O':
            tags[i] = 'B-' + split[1]
        elif tags[i-1].split('-')[1] == split[1]:
            continue
        else:
            tags[i] = 'B-' + split[1]
    return True


def iob_to_iobes(tags):
    """
    将BIO格式转换成BIOES格式
    """
    new_tags = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            new_tags.append(tag)
        elif tag.split('-')[0] == 'B':
            if i+1 != len(tags) and tags[i+1].split('-')[0] == 'I':
                new_tags.append(tag)
            else:
                new_tags.append(tag.replace('B-', 'S-'))
        elif tag.split('-')[0] == 'I':
            if i+1 < len(tags) and tags[i+1].split('-')[0] == 'I':
                new_tags.append(tag)
            else:
                new_tags.append(tag.replace('I-', 'E-'))
        else:
            raise Exception('Invalid IOB format!')
    return new_tags


def update_tag_scheme(tags_list, tag_scheme):
    """
    更新tag标记为BIOES
    """
    new_tags_list = []
    for i, tags in enumerate(tags_list):
        if not iob2(tags):
            raise Exception('Not BIO format!')

        if tag_scheme == 'BIOES':
            new_tags = iob_to_iobes(tags)
        else:
            raise Exception('Wrong tagging scheme!')

        new_tags_list.append(new_tags)

    return new_tags_list


train_tags = update_tag_scheme(train_tags, tag_scheme)
valid_tags = update_tag_scheme(valid_tags, tag_scheme)
test_tags = update_tag_scheme(test_tags, tag_scheme)

print(len(train_tags))
train_tags

14041


[['S-ORG', 'O', 'S-MISC', 'O', 'O', 'O', 'S-MISC', 'O', 'O'],
 ['B-PER', 'E-PER'],
 ['S-LOC', 'O'],
 ['O',
  'B-ORG',
  'E-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'S-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'S-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['S-LOC',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'E-ORG',
  'O',
  'O',
  'O',
  'B-PER',
  'E-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'S-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'S-ORG',
  'O',
  'O',
  'O',
  'B-PER',
  'I-PER',
  'I-PER',
  'E-PER',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'E-ORG',
 

### 4.3 创建映射表

In [32]:
def word_mapping(sentences, lower):
    words = [w.lower() if lower else w for s in sentences for w in s]
    words_all = ['<UNK>']
    words_all.extend(list(set(words)))
    word2id = {w: i for i, w in enumerate(words_all)}
    id2word = {v: k for k, v in word2id.items()}
    print("Found {0} unique words".format(len(words_all)))
    return words_all, word2id, id2word


def char_mapping(sentences):
    chars = "".join(["".join(s) for s in sentences])
    chars_all = list(set(chars))
    char2id = {c: i for i, c in enumerate(chars_all)}
    id2char = {v: k for k, v in char2id.items()}
    print("Found {0} unique chars".format(len(chars_all)))
    return chars_all, char2id, id2char


def tag_mapping(tags_list):
    tags = [tag for tags in tags_list for tag in tags]
    tags_all = [START_TAG, STOP_TAG]
    tags_all.extend(list(set(tags)))
    tag2id = {t: i for i, t in enumerate(tags_all)}
    id2tag = {v: k for k, v in tag2id.items()}
    print("Found {0} unique tags".format(len(tags_all)))
    return tags_all, tag2id, id2tag


words_all, word2id, id2word = word_mapping(train_sentences, lower)
chars_all, char2id, id2char = char_mapping(train_sentences)
tags_all, tag2id, id2tag = tag_mapping(train_tags)
print(tags_all)

Found 17493 unique words
Found 75 unique chars
Found 19 unique tags
['<START>', '<STOP>', 'I-PER', 'O', 'S-PER', 'B-PER', 'B-ORG', 'B-MISC', 'S-ORG', 'E-MISC', 'E-ORG', 'B-LOC', 'E-LOC', 'I-ORG', 'I-LOC', 'S-MISC', 'E-PER', 'S-LOC', 'I-MISC']


## 5. 构建 Dataset和 DataLoader

In [33]:
class ConllDataset(Dataset):
    def __init__(self, sentences, tags, lower, word2id, char2id, tag2id):
        self.word2id = word2id
        self.char2id = char2id
        self.tag2id = tag2id
        self.lower = lower

        self.init_data(sentences, tags)

    def __getitem__(self, index):
        return {
            'words': self.sent_ids[index],
            'chars': self.char_ids[index],
            'tags': self.tag_ids[index]
        }

    def __len__(self):
        return len(self.sent_ids)

    def lower_case(self, x):
        if self.lower:
            return x.lower()
        else:
            return x

    def init_data(self, sentences, tags):
        self.sent_ids = []
        self.char_ids = []
        self.tag_ids = []
        for i, (sent, tag) in enumerate(zip(sentences, tags)):
            self.sent_ids.append([self.word2id[self.lower_case(w) if self.lower_case(
                w) in self.word2id else '<UNK>'] for w in sent])

            self.char_ids.append(
                [[self.char2id[c] for c in w if c in self.char2id] for w in sent])

            self.tag_ids.append([self.tag2id[t] for t in tag])


train_dataset = ConllDataset(
    train_sentences, train_tags, lower, word2id, char2id, tag2id)
valid_dataset = ConllDataset(
    valid_sentences, valid_tags, lower, word2id, char2id, tag2id)
test_dataset = ConllDataset(
    test_sentences, test_tags, lower, word2id, char2id, tag2id)
print("{} / {} / {} sentences in train / valid / test.".format(len(train_dataset),
                                                               len(valid_dataset), len(test_dataset)))

14041 / 3250 / 3453 sentences in train / valid / test.


In [34]:
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=shuffle)
valid_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=shuffle)
test_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=shuffle)

##  6. 加载预训练词向量

In [35]:
all_word_embeds = {}
with open(embedding_path,'r',encoding='utf-8') as f:
    for i,line in enumerate(f.readlines()):
        s = line.strip().split()
        if len(s) == word_embed_size +1:
            all_word_embeds[s[0]] = np.array([float(i) for i in s[1:]])
            
#初始化词向量矩阵
word_embeds = np.random.uniform(-np.sqrt(0.03), np.sqrt(0.03), (len(word2id), word_embed_sized))

for w in word2id:
    if w in all_word_embeds:
        word_embeds[word2id[w]] = all_word_embeds[w]
    elif w.lower() in all_word_embeds:
        word_embeds[word2id[w]] = all_word_embeds[w.lower()]

print('Loaded %i pretrained embeddings.' % len(all_word_embeds))

NameError: name 'word_embed_sized' is not defined

## 7. 保存预处理后的数据以便复用

In [None]:
with open(mapping_file,'wb') as f:
    mappings = {
        'word2id':word2id,
        'char2id':char2id,
        'tag2id':tag2id,
        'word_embeds':word_embeds
    }
    pickle.dump(mappings,f)

## 8. 导入CRF

In [36]:
from typing import List, Optional

import torch
import torch.nn as nn


class CRF(nn.Module):
    """Conditional random field.
    This module implements a conditional random field [LMP01]_. The forward computation
    of this class computes the log likelihood of the given sequence of tags and
    emission score tensor. This class also has `~CRF.decode` method which finds
    the best tag sequence given an emission score tensor using `Viterbi algorithm`_.
    Args:
        num_tags: Number of tags.
        batch_first: Whether the first dimension corresponds to the size of a minibatch.
    Attributes:
        start_transitions (`~torch.nn.Parameter`): Start transition score tensor of size
            ``(num_tags,)``.
        end_transitions (`~torch.nn.Parameter`): End transition score tensor of size
            ``(num_tags,)``.
        transitions (`~torch.nn.Parameter`): Transition score tensor of size
            ``(num_tags, num_tags)``.
    .. [LMP01] Lafferty, J., McCallum, A., Pereira, F. (2001).
       "Conditional random fields: Probabilistic models for segmenting and
       labeling sequence data". *Proc. 18th International Conf. on Machine
       Learning*. Morgan Kaufmann. pp. 282–289.
    .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm
    """

    def __init__(self, num_tags: int, batch_first: bool = False) -> None:
        if num_tags <= 0:
            raise ValueError(f'invalid number of tags: {num_tags}')
        super().__init__()
        self.num_tags = num_tags
        self.batch_first = batch_first
        self.start_transitions = nn.Parameter(torch.empty(num_tags))
        self.end_transitions = nn.Parameter(torch.empty(num_tags))
        self.transitions = nn.Parameter(torch.empty(num_tags, num_tags))

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Initialize the transition parameters.
        The parameters will be initialized randomly from a uniform distribution
        between -0.1 and 0.1.
        """
        nn.init.uniform_(self.start_transitions, -0.1, 0.1)
        nn.init.uniform_(self.end_transitions, -0.1, 0.1)
        nn.init.uniform_(self.transitions, -0.1, 0.1)

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}(num_tags={self.num_tags})'

    def forward(
            self,
            emissions: torch.Tensor,
            tags: torch.LongTensor,
            mask: Optional[torch.ByteTensor] = None,
            reduction: str = 'sum',
    ) -> torch.Tensor:
        """Compute the conditional log likelihood of a sequence of tags given emission scores.
        Args:
            emissions (`~torch.Tensor`): Emission score tensor of size
                ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length, num_tags)`` otherwise.
            tags (`~torch.LongTensor`): Sequence of tags tensor of size
                ``(seq_length, batch_size)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length)`` otherwise.
            mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
                if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
            reduction: Specifies  the reduction to apply to the output:
                ``none|sum|mean|token_mean``. ``none``: no reduction will be applied.
                ``sum``: the output will be summed over batches. ``mean``: the output will be
                averaged over batches. ``token_mean``: the output will be averaged over tokens.
        Returns:
            `~torch.Tensor`: The log likelihood. This will have size ``(batch_size,)`` if
            reduction is ``none``, ``()`` otherwise.
        """
        self._validate(emissions, tags=tags, mask=mask)
        if reduction not in ('none', 'sum', 'mean', 'token_mean'):
            raise ValueError(f'invalid reduction: {reduction}')
        if mask is None:
            mask = torch.ones_like(tags, dtype=torch.uint8)

        if self.batch_first:
            emissions = emissions.transpose(0, 1)
            tags = tags.transpose(0, 1)
            mask = mask.transpose(0, 1)

        # shape: (batch_size,)
        numerator = self._compute_score(emissions, tags, mask)
        # shape: (batch_size,)
        denominator = self._compute_normalizer(emissions, mask)
        # shape: (batch_size,)
        llh = numerator - denominator

        if reduction == 'none':
            return llh
        if reduction == 'sum':
            return llh.sum()
        if reduction == 'mean':
            return llh.mean()
        assert reduction == 'token_mean'
        return llh.sum() / mask.type_as(emissions).sum()

    def decode(self, emissions: torch.Tensor,
               mask: Optional[torch.ByteTensor] = None) -> List[List[int]]:
        """Find the most likely tag sequence using Viterbi algorithm.
        Args:
            emissions (`~torch.Tensor`): Emission score tensor of size
                ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length, num_tags)`` otherwise.
            mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
                if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
        Returns:
            List of list containing the best tag sequence for each batch.
        """
        self._validate(emissions, mask=mask)
        if mask is None:
            mask = emissions.new_ones(emissions.shape[:2], dtype=torch.uint8)

        if self.batch_first:
            emissions = emissions.transpose(0, 1)
            mask = mask.transpose(0, 1)

        return self._viterbi_decode(emissions, mask)

    def _validate(
            self,
            emissions: torch.Tensor,
            tags: Optional[torch.LongTensor] = None,
            mask: Optional[torch.ByteTensor] = None) -> None:
        if emissions.dim() != 3:
            raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}')
        if emissions.size(2) != self.num_tags:
            raise ValueError(
                f'expected last dimension of emissions is {self.num_tags}, '
                f'got {emissions.size(2)}')

        if tags is not None:
            if emissions.shape[:2] != tags.shape:
                raise ValueError(
                    'the first two dimensions of emissions and tags must match, '
                    f'got {tuple(emissions.shape[:2])} and {tuple(tags.shape)}')

        if mask is not None:
            if emissions.shape[:2] != mask.shape:
                raise ValueError(
                    'the first two dimensions of emissions and mask must match, '
                    f'got {tuple(emissions.shape[:2])} and {tuple(mask.shape)}')
            no_empty_seq = not self.batch_first and mask[0].all()
            no_empty_seq_bf = self.batch_first and mask[:, 0].all()
            if not no_empty_seq and not no_empty_seq_bf:
                raise ValueError('mask of the first timestep must all be on')

    def _compute_score(
            self, emissions: torch.Tensor, tags: torch.LongTensor,
            mask: torch.ByteTensor) -> torch.Tensor:
        # emissions: (seq_length, batch_size, num_tags)
        # tags: (seq_length, batch_size)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and tags.dim() == 2
        assert emissions.shape[:2] == tags.shape
        assert emissions.size(2) == self.num_tags
        assert mask.shape == tags.shape
        assert mask[0].all()

        seq_length, batch_size = tags.shape
        mask = mask.type_as(emissions)

        # Start transition score and first emission
        # shape: (batch_size,)
        score = self.start_transitions[tags[0]]
        score += emissions[0, torch.arange(batch_size), tags[0]]

        for i in range(1, seq_length):
            # Transition score to next tag, only added if next timestep is valid (mask == 1)
            # shape: (batch_size,)
            score += self.transitions[tags[i - 1], tags[i]] * mask[i]

            # Emission score for next tag, only added if next timestep is valid (mask == 1)
            # shape: (batch_size,)
            score += emissions[i, torch.arange(batch_size), tags[i]] * mask[i]

        # End transition score
        # shape: (batch_size,)
        seq_ends = mask.long().sum(dim=0) - 1
        # shape: (batch_size,)
        last_tags = tags[seq_ends, torch.arange(batch_size)]
        # shape: (batch_size,)
        score += self.end_transitions[last_tags]

        return score

    def _compute_normalizer(
            self, emissions: torch.Tensor, mask: torch.ByteTensor) -> torch.Tensor:
        # emissions: (seq_length, batch_size, num_tags)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and mask.dim() == 2
        assert emissions.shape[:2] == mask.shape
        assert emissions.size(2) == self.num_tags
        assert mask[0].all()

        seq_length = emissions.size(0)

        # Start transition score and first emission; score has size of
        # (batch_size, num_tags) where for each batch, the j-th column stores
        # the score that the first timestep has tag j
        # shape: (batch_size, num_tags)
        score = self.start_transitions + emissions[0]

        for i in range(1, seq_length):
            # Broadcast score for every possible next tag
            # shape: (batch_size, num_tags, 1)
            broadcast_score = score.unsqueeze(2)

            # Broadcast emission score for every possible current tag
            # shape: (batch_size, 1, num_tags)
            broadcast_emissions = emissions[i].unsqueeze(1)

            # Compute the score tensor of size (batch_size, num_tags, num_tags) where
            # for each sample, entry at row i and column j stores the sum of scores of all
            # possible tag sequences so far that end with transitioning from tag i to tag j
            # and emitting
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + self.transitions + broadcast_emissions

            # Sum over all possible current tags, but we're in score space, so a sum
            # becomes a log-sum-exp: for each sample, entry i stores the sum of scores of
            # all possible tag sequences so far, that end in tag i
            # shape: (batch_size, num_tags)
            next_score = torch.logsumexp(next_score, dim=1)

            # Set score to the next score if this timestep is valid (mask == 1)
            # shape: (batch_size, num_tags)
            score = torch.where(mask[i].unsqueeze(1), next_score, score)

        # End transition score
        # shape: (batch_size, num_tags)
        score += self.end_transitions

        # Sum (log-sum-exp) over all possible tags
        # shape: (batch_size,)
        return torch.logsumexp(score, dim=1)

    def _viterbi_decode(self, emissions: torch.FloatTensor,
                        mask: torch.ByteTensor) -> List[List[int]]:
        # emissions: (seq_length, batch_size, num_tags)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and mask.dim() == 2
        assert emissions.shape[:2] == mask.shape
        assert emissions.size(2) == self.num_tags
        assert mask[0].all()

        seq_length, batch_size = mask.shape

        # Start transition and first emission
        # shape: (batch_size, num_tags)
        score = self.start_transitions + emissions[0]
        history = []

        # score is a tensor of size (batch_size, num_tags) where for every batch,
        # value at column j stores the score of the best tag sequence so far that ends
        # with tag j
        # history saves where the best tags candidate transitioned from; this is used
        # when we trace back the best tag sequence

        # Viterbi algorithm recursive case: we compute the score of the best tag sequence
        # for every possible next tag
        for i in range(1, seq_length):
            # Broadcast viterbi score for every possible next tag
            # shape: (batch_size, num_tags, 1)
            broadcast_score = score.unsqueeze(2)

            # Broadcast emission score for every possible current tag
            # shape: (batch_size, 1, num_tags)
            broadcast_emission = emissions[i].unsqueeze(1)

            # Compute the score tensor of size (batch_size, num_tags, num_tags) where
            # for each sample, entry at row i and column j stores the score of the best
            # tag sequence so far that ends with transitioning from tag i to tag j and emitting
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + self.transitions + broadcast_emission

            # Find the maximum score over all possible current tag
            # shape: (batch_size, num_tags)
            next_score, indices = next_score.max(dim=1)

            # Set score to the next score if this timestep is valid (mask == 1)
            # and save the index that produces the next score
            # shape: (batch_size, num_tags)
            score = torch.where(mask[i].unsqueeze(1), next_score, score)
            history.append(indices)

        # End transition score
        # shape: (batch_size, num_tags)
        score += self.end_transitions

        # Now, compute the best path for each sample

        # shape: (batch_size,)
        seq_ends = mask.long().sum(dim=0) - 1
        best_tags_list = []

        for idx in range(batch_size):
            # Find the tag which maximizes the score at the last timestep; this is our best tag
            # for the last timestep
            _, best_last_tag = score[idx].max(dim=0)
            best_tags = [best_last_tag.item()]

            # We trace back where the best last tag comes from, append that to our best tag
            # sequence, and trace it back again, and so on
            for hist in reversed(history[:seq_ends[idx]]):
                best_last_tag = hist[idx][best_tags[-1]]
                best_tags.append(best_last_tag.item())

            # Reverse the order because we start from the last timestep
            best_tags.reverse()
            best_tags_list.append(best_tags)

        return best_tags_list

## 9. 模型构建
![char embedding](http://ww1.sinaimg.cn/large/005XIOOugy1ghoydgglkrj30ez0c8weq.jpg)
![bilstm-cnn-crf model](http://ww1.sinaimg.cn/large/005XIOOugy1ghoydgg0cjj30fz0js752.jpg)

In [39]:
class CNN(nn.Module):
    def __init__(
            self,
            chars_vocab_size,
            char_embed_size,
            kernel_size,
            num_filters,
            max_word_length,
            dropout_rate,
            batch_first=False):
        super(CNN, self).__init__()
        self.char_embed_size = char_embed_size
        self.num_filters = num_filters

        self.batch_first = batch_first
        self.char_embedding = nn.Embedding(chars_vocab_size, char_embed_size)
        bias = np.sqrt(3.0 / char_embed_size)
        self.char_embedding.weight.data.uniform_(-bias, bias)  # 初始化嵌入层权重

        self.dropout = nn.Dropout(dropout_rate)

        self.conv = nn.Conv1d(
            in_channels=char_embed_size,
            out_channels=num_filters,
            kernel_size=kernel_size)
        self.maxpool = nn.MaxPool1d(kernel_size=max_word_length - kernel_size + 1)

    def forward(self, x):
        # x: [batch_size,max_seq_length,max_word_length]
        # [batch_size,max_seq_length,max_word_length,char_embed_size]
        x = self.char_embedding(x)
        x = self.dropout(x)
        batch_size, max_seq_len,max_word_length,_ = x.shape
        view_shape = (batch_size * max_seq_len, max_word_length, self.char_embed_size)
        x = x.view(view_shape).transpose(1, 2) #[batch_size*max_seq_len,char_embed_size,max_word_length]

        x = self.conv(x)  # [batch_size*max_seq_length,num_filters]
        x = self.maxpool(x)
        output = x.view(batch_size,max_seq_len,self.num_filters) #[batch_size,max_seq_length,num_filters]
        return output

In [None]:
class BiLstmCrf(nn.Module):
    def __init__(
            self,
            input_size,
            hidden_size,
            bidrection,
            dropout_rate,
            num_tags):
        super(BiLstmCrf, self).__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            bidirectional=bidrection,
            num_layers=2,
            dropout=dropout_rate,
            batch_first=True)
        self.hidden2tag = nn.Linear(hidden_size * 2, num_tags)
        self.crf = CRF(num_tags, batch_first=True)

    def forward(self, x):
        # [batch_size,max_seq_len,word_embed_size]
        output, _ = self.lstm(x)  # [batch_size,max_seq_len,hidden_size*2]
        output = self.hidden2tag(output)
        output = self.crf.decode(output)
        return torch.tensor(output) #[batch_size,max_seq_len]

In [None]:
class Model(nn.Module):
    def __init__(self, pretrained_weight, vocab_size, word_embed_size, cnn_module, lstm_crf_module):
        super(Model, self).__init__()
        self.word_embedding = nn.Embedding(vocab_size, word_embed_size)
        self.char_embedding = cnn_module
        self.lstm_crf_module = lstm_crf_module

    def forward(self, words, chars):
        #words: [batch_size,max_seq_length]
        # chars:[batch_size,max_seq_length,max_word_length]
        # [batch_size,max_seq_length,num_filters]
        char_embed = self.char_embedding(chars)
        # [batch_size,max_seq_length,word_embed_size]
        word_embed = self.word_embedding(words)

        input_x = torch.cat((word_embed, char_embed), 2)
        output = self.lstm_crf_module(input_x)

        return output


cnn_modulel = CNN(len(char2id),
                  char_embed_size,
                  kernel_size,
                  num_filters,
                  max_word_length,
                  dropout_rate,
                  batch_first=False)
lstm_crf_module = BiLstmCrf(num_filters+word_embed_size,
                            hidden_size,
                            bidrection,
                            dropout_rate,
                            len(tag2id))
weight = np.random.randn(len(word2id), word_embed_size)
model = Model(weight, len(word2id), word_embed_size, cnn_modulel, lstm_crf_module)
words = torch.rand(10, max_seq_length).type(torch.LongTensor)
chars = torch.rand(10, max_seq_length, max_word_length).type(torch.LongTensor)
output = model(words, chars)
print(output.shape)

## 10. 训练 

In [3]:
def train(epochs,
          model,
          train_dataloader,
          valid_dataloader,
          do_validation,
          device,
          lr,
          momentum,
          weight_decay,
          gradient_clip):
    criterion = nn.SGD(model.parameters(),
                       lr=lr,
                       momentum=momentum,
                       weight_decay=weight_decay)
    optimizer = optim.SGD()
    clip_gradient = nn.utils.clip_grad_norm(model.parameters(),gradient_clip)
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for batch_idx, data in enumerate(train_dataloader):
            words = data['words'].to(device)
            chars = data['chars'].to(device)
            tags = data['tags'].to(device)
            
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            train_loss += loss.item()
            loss.backward()
            nn.utils.clip_grad_norm(model.parameters(),gradient_clip)
            optimizer.step()
            if batch_idx % 20 == 0:
                print('Train Epoch:{}[{}/{}({:.0f}%)]\tLoss:{:.6f}'.format(
                    epoch, batch_idx *
                    len(data), len(train_dataloader.dataset),
                    100. * batch_idx / len(train_dataloader), loss.item()))

        train_loss /= len(train_dataloader)

        if do_validation:
            model.eval()
            valid_loss = 0.0
            valid_correct = 0
            with torch.no_grad():
                for batch_idx, (data, target) in enumerate(valid_dataloader):
                    data, target = data.to(device), target.to(device)
                    output = model(data)
                    loss = criterion(output, target)
                    valid_loss += loss.item()
                    pred = output.argmax(dim=1, keepdim=True)
                    valid_correct += pred.eq(target.view_as(pred)).sum().item()

            valid_loss /= len(valid_dataloader)

            print('\nValid set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
                valid_loss, valid_correct, len(valid_dataloader.dataset),
                100. * valid_correct / len(valid_dataloader.dataset)))

## 11. 测试

## 12. 结果可视化