In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import random
import math

In [2]:
# Network architecture settings.
model_config = {
    'embedding_dim': 32,
    'hidden_dim': 64,
    'vocab_size': 10000,
    'target_size': 7,
}

# Data-processing settings (sequence length, batching, reserved token ids).
data_config = {
    'seq_len': 64,
    'batch_size': 32,
    'padding_idx': 0,
    'unknown_idx': 1,
}

# Optimisation settings.
train_config = {
    'epochs': 10,
    'lr': 1e-3,
    'loss_type': 'softmax',
}

# Names of the special start / stop tags.
START_TAG, STOP_TAG = "<START>", "<STOP>"

# Echo the configuration so it is recorded next to the results.
print('model params:', model_config)
print('data params:', data_config)
print('train parmas:', train_config)

model params: {'embedding_dim': 32, 'hidden_dim': 128, 'vocab_size': 10000, 'target_size': 7}
data params: {'seq_len': 64, 'batch_size': 32, 'padding_idx': 0, 'unknown_idx': 1}
train parmas: {'epochs': 10, 'lr': 0.001, 'loss_type': 'softmax'}


In [3]:
def argmax(vec):
    """Return the column index of the largest entry as a plain Python int.

    ``vec`` is reduced along dimension 1, so it must have exactly one row
    for the resulting index tensor to be convertible with ``.item()``.
    """
    best_column = vec.max(1)[1]
    return best_column.item()

def log_sum_exp(vec):
    """Numerically stable log-sum-exp of a single-row tensor.

    Used by the CRF forward algorithm. The largest entry is subtracted
    before exponentiating and added back afterwards, so large scores do not
    overflow. ``vec`` must have exactly one row (it is indexed as
    ``vec[0, k]``); the result is a 0-dim tensor.
    """
    peak = vec[0, argmax(vec)]
    # The 0-dim peak broadcasts across the whole row.
    shifted = vec - peak
    return peak + torch.exp(shifted).sum().log()

In [4]:
# lstm+crf: baseline model
class lstm_crf_ner(nn.Module):
    """Bidirectional LSTM encoder followed by a CRF layer for sequence tagging.

    The LSTM turns a batch of token-index sequences into per-position tag
    scores (emissions); a learned transition matrix scores tag-to-tag moves.
    ``neg_log_likelihood`` gives the training loss and ``forward`` returns the
    Viterbi-decoded tag indices.

    Notes:
        * Every time step of the input is scored; no padding mask is applied,
          so padded positions contribute to the loss and are decoded like
          real tokens.
        * The LSTM's initial state is drawn from ``torch.randn`` on every
          call, so two calls on the same input are not bit-identical.
        * The batch size is always taken from the input tensor; the
          ``batch_size`` constructor argument only sets the default used by
          ``init_hidden()`` when it is called without an argument.

    Args:
        vocab_size: number of rows of the embedding table.
        tag_to_ix: mapping from tag name to index; must contain ``START_TAG``
            and ``STOP_TAG``.
        embedding_dim: size of the token embeddings.
        hidden_dim: size of the concatenated BiLSTM output (each direction
            uses ``hidden_dim // 2`` units).
        batch_size: default batch size for ``init_hidden()``.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, batch_size):
        super(lstm_crf_ner, self).__init__()
        # params
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.batch_size = batch_size

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True, batch_first=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self, batch_size=None):
        """Return a fresh random ``(h_0, c_0)`` pair for the BiLSTM.

        Args:
            batch_size: number of sequences in the batch; defaults to the
                ``batch_size`` given to the constructor.

        Returns:
            Two tensors of shape ``(2, batch_size, hidden_dim // 2)``
            (one slice per direction).
        """
        if batch_size is None:
            batch_size = self.batch_size
        return (torch.randn(2, batch_size, self.hidden_dim // 2),
                torch.randn(2, batch_size, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        """Compute the log partition function of one sentence.

        Args:
            feats: emission scores of shape ``(seq_len, tagset_size)``.

        Returns:
            0-dim tensor: log-sum-exp of the scores of all tag paths.
        """
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            # scores[next_tag, prev_tag] =
            #     alpha[prev_tag] + transition(prev_tag -> next_tag) + emission[next_tag]
            # forward_var (1, T) broadcasts over rows, feat (T, 1) over columns.
            scores = forward_var + self.transitions + feat.view(-1, 1)
            # Log-sum-exp over the previous tag gives the new alpha per tag.
            forward_var = torch.logsumexp(scores, dim=1).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = torch.logsumexp(terminal_var.view(-1), dim=0)
        return alpha

    def _get_lstm_features(self, batch_sentence):
        """Run embedding -> BiLSTM -> linear and return the emission scores.

        Args:
            batch_sentence: token indices of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, seq_len, tagset_size)``.
        """
        # Size the initial state from the actual input so that a batch of any
        # size (e.g. a short final batch) works for training and decoding.
        self.hidden = self.init_hidden(batch_sentence.size(0))
        embeds = self.word_embeds(batch_sentence) #(batch_size, seq_len, embedding_dim)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden) #(batch_size, seq_len, hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out) #(batch_size, seq_len, tagset_size)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Score one given tag sequence.

        Args:
            feats: emission scores of shape ``(seq_len, tagset_size)``.
            tags: 1-D long tensor of tag indices, one per row of ``feats``.

        Returns:
            Tensor of shape ``(1,)``: the sum of transition and emission
            scores along the path START -> tags -> STOP.
        """
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Find the highest-scoring tag path for one sentence.

        Args:
            feats: emission scores of shape ``(seq_len, tagset_size)``.

        Returns:
            ``(path_score, best_path)`` where ``best_path`` is a list of
            ``seq_len`` Python ints (the START tag is stripped).
        """
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            # scores[next_tag, prev_tag] = viterbi[prev_tag] + transition(prev -> next).
            # Emission scores are left out of the max because they do not
            # depend on the previous tag; they are added right after.
            scores = forward_var + self.transitions
            best_scores, best_prev = torch.max(scores, 1)
            backpointers.append(best_prev.tolist())
            forward_var = (best_scores + feat).view(1, -1)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, batch_sentence, batch_tags):
        """Mean CRF negative log-likelihood over the batch.

        Args:
            batch_sentence: token indices of shape ``(batch, seq_len)``.
            batch_tags: gold tag indices of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(1,)`` holding the mean of
            ``log Z - score(gold path)`` over the sentences of the batch.
        """
        batch_feats = self._get_lstm_features(batch_sentence)
        # Use the size of this batch, not a cached value, so a differently
        # sized batch is averaged correctly.
        n_sentences = batch_feats.size(0)
        batch_score = 0
        for feats, tags in zip(batch_feats, batch_tags):
            forward_score = self._forward_alg(feats)
            gold_score = self._score_sentence(feats, tags)
            batch_score = batch_score + (forward_score - gold_score)
        return batch_score / n_sentences

    def forward(self, batch_sentence):  # dont confuse this with _forward_alg above.
        """Decode the best tag sequence for every sentence of the batch.

        Args:
            batch_sentence: token indices of shape ``(batch, seq_len)``.

        Returns:
            NumPy array of shape ``(batch, seq_len)`` with the predicted tag
            indices.
        """
        # Only a NumPy array leaves this method, so no autograd graph is needed.
        with torch.no_grad():
            # Get the emission scores from the BiLSTM
            batch_lstm_feats = self._get_lstm_features(batch_sentence)

            batch_tag_seq = []
            # Find the best path, given the features. Iterate over the rows of
            # the actual input rather than a cached batch size.
            for lstm_feats in batch_lstm_feats:
                score, tag_seq = self._viterbi_decode(lstm_feats)
                batch_tag_seq.append(tag_seq)
        return np.array(batch_tag_seq)

In [5]:
# get tokens from bert vocab
# Vocabulary entries are numbered from 2 upwards; the two special tokens
# added below take their indices from data_config.
token_idx = {}
# The vocabulary holds non-ASCII tokens, so do not rely on the platform
# default encoding.
with open('vocab.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # setdefault keeps the first index of an entry that repeats after
        # stripping; plain assignment would re-issue an index and make two
        # different tokens share it.
        token_idx.setdefault(line.strip(), len(token_idx) + 2)

# add special token
token_idx['<PAD>'] = data_config['padding_idx']
token_idx['<UNK>'] = data_config['unknown_idx']

# tags map
tag_idx = {'O': 0,
           'B-ORG': 1, 'I-ORG': 2,
           'B-LOC': 3, 'I-LOC': 4,
           'B-PER': 5, 'I-PER': 6}
# The start / stop tags take the next two free indices.
tag_idx[START_TAG] = len(tag_idx)
tag_idx[STOP_TAG] = len(tag_idx)

def sentence_padding(x):
    """Turn whitespace-separated sentences into fixed-length index lists.

    Each token is looked up in ``token_idx`` (missing tokens map to
    ``data_config['unknown_idx']``); the result is cut to
    ``data_config['seq_len']`` entries or right-padded with
    ``data_config['padding_idx']`` up to that length.

    Args:
        x: iterable of sentence strings.

    Returns:
        List of lists of ints, each of length ``seq_len``.
    """
    seq_len = data_config['seq_len']
    pad_id = data_config['padding_idx']
    unk_id = data_config['unknown_idx']

    padded = []
    for sent in x:
        ids = [token_idx.get(token, unk_id) for token in sent.split()]
        filler = [pad_id] * max(0, seq_len - len(ids))
        padded.append(ids[:seq_len] + filler)
    return padded

def target_padding(y):
    """Turn whitespace-separated tag strings into fixed-length index lists.

    Each tag name is looked up in ``tag_idx`` (an unknown name raises
    ``KeyError``); the result is cut to ``data_config['seq_len']`` entries or
    right-padded with 0 up to that length.

    Args:
        y: iterable of tag-sequence strings.

    Returns:
        List of lists of ints, each of length ``seq_len``.
    """
    seq_len = data_config['seq_len']
    outside_tag = 0  # 0 stands for 'O'

    # Map every sequence first, so a bad tag name fails before any padding.
    encoded = [[tag_idx[target] for target in targets.split()] for targets in y]

    padded = []
    for ids in encoded:
        filler = [outside_tag] * max(0, seq_len - len(ids))
        padded.append(ids[:seq_len] + filler)
    return padded

# prepare data
def train_data_iter(path_x='msra/train/sentences.txt',path_y='msra/train/tags.txt',data_config=data_config,shuffle=True):
    """Yield ``(sentences, tags)`` batches as long tensors.

    Both files are read line by line (one sentence / tag sequence per line),
    converted with ``sentence_padding`` / ``target_padding`` and cut into
    consecutive batches. Only full batches are yielded: a trailing remainder
    smaller than ``batch_size`` is dropped.

    Args:
        path_x: path of the sentence file.
        path_y: path of the tag file; must have as many lines as ``path_x``.
        data_config: mapping; only ``'batch_size'`` is read from it here.
        shuffle: when true, sentences and tags are shuffled with one shared
            random permutation before batching.

    Yields:
        Pairs of ``torch.int64`` tensors whose first dimension is
        ``batch_size``.

    Raises:
        AssertionError: if the two files have a different number of lines.
    """

    def _read_lines(path):
        # One entry per line, without the phantom empty entry that splitting
        # on '\n' produces for a file ending in a newline; the handle is
        # closed on exit.
        with open(path, 'r', encoding='utf-8') as f:
            return [line.rstrip('\n') for line in f]

    # read x and y
    x = _read_lines(path_x)
    y = _read_lines(path_y)
    assert len(x) == len(y), 'data error!'
    n = len(x)

    # transform sentences and targets to index arrays; the dtype is pinned so
    # the tensors are int64 on every platform and for both shuffle settings
    x = np.array(sentence_padding(x), dtype=np.int64)
    y = np.array(target_padding(y), dtype=np.int64)

    # shuffle x and y with the same permutation so pairs stay aligned
    if shuffle:
        order = np.random.permutation(n)
        x, y = x[order], y[order]

    # get batch data (full batches only)
    batch_size = data_config['batch_size']
    for i in range(n // batch_size):
        start = i * batch_size
        end = start + batch_size
        yield torch.tensor(x[start:end]), torch.tensor(y[start:end])

# Size the embedding table from the largest index actually assigned.
# token_idx already contains <PAD> and <UNK>, so adding 2 to its length would
# count the special tokens twice.
model_config['vocab_size'] = max(token_idx.values()) + 1

# Reverse maps (index -> token / tag) for printing.
token_idx_r = {v:k for k,v in token_idx.items()}
tag_idx_r = {v:k for k,v in tag_idx.items()}

# Smoke test: show the shapes and the first sample of the first batch.
I = train_data_iter()
for x,y in I:
    print(x.size(), y.size())
    print([token_idx_r.get(i,'') for i in x[0].numpy()])
    print(y[0].numpy())
    break

torch.Size([32, 64]) torch.Size([32, 64])
['他', '说', '，', '在', '今', '年', '的', '单', '边', '行', '动', '计', '划', '改', '进', '计', '划', '中', '，', '中', '国', '承', '诺', '到', '２', '０', '０', '５', '年', '将', '工', '业', '品', '平', '均', '关', '税', '降', '至', '１', '０', '·', '８', '％', '，', '增', '添', '了', '环', '保', '和', '能', '源', '服', '务', '的', '开', '放', '计', '划', '，', '并', '恢', '复']
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [6]:
# evaluate trained model on some cases
def evaluation_case(model, path_x='msra/val/sentences.txt', path_y='msra/val/tags.txt', n=1):
    """Print ``token:tag`` predictions for the first sentences of the first batch.

    The first unshuffled batch produced by ``train_data_iter`` for the given
    files is run through ``model`` and the first ``n`` sentences are printed,
    one list of ``token:predicted_tag`` strings per sentence.

    Args:
        model: callable mapping a batch of token indices to an array of tag
            indices.
        path_x: path of the sentence file.
        path_y: path of the tag file.
        n: number of sentences to print.
    """
    batches = train_data_iter(path_x, path_y, data_config, shuffle=False)
    x, _ = next(batches)
    tag_pred = model(x)
    for sent, tag in zip(x[:n].numpy(), tag_pred[:n]):
        pairs = []
        for i, j in zip(sent, tag):
            pairs.append(token_idx_r.get(i, '') + ':' + tag_idx_r[j])
        print('\t', pairs)

# train model and evaluation
def train(model_config, data_config, train_config):
    """Build an ``lstm_crf_ner`` model, train it, and return it.

    For every epoch the training batches from ``train_data_iter`` are run
    once; the mean batch loss is printed, followed by predictions for two
    validation sentences (via ``evaluation_case``).

    Args:
        model_config: mapping with ``'embedding_dim'``, ``'hidden_dim'`` and
            ``'vocab_size'``.
        data_config: mapping with ``'batch_size'``; also passed on to
            ``train_data_iter``.
        train_config: mapping with ``'epochs'`` and ``'lr'``.

    Returns:
        The trained model, so the caller can keep using it.
    """

    print('training start...')
    print('[params]:')
    print('\tmodel params:', model_config)
    print('\tdata params:', data_config)
    print('\ttrain parmas:', train_config)

    # build model
    model = lstm_crf_ner(embedding_dim=model_config['embedding_dim'],
                         hidden_dim=model_config['hidden_dim'],
                         vocab_size=model_config['vocab_size'],
                         tag_to_ix=tag_idx,
                         batch_size=data_config['batch_size'])
    print('[build model]:')
    print(model)

    # opt
    optimizer = optim.Adam(model.parameters(), lr=train_config['lr'])

    # train
    for epoch in range(train_config['epochs']):

        # record loss every epoch
        loss_value = []

        # get data flow (a fresh generator, reshuffled, for every epoch)
        Iter = train_data_iter(data_config=data_config)

        for x, y in Iter:

            # clear gradients left over from the previous step
            model.zero_grad()

            loss = model.neg_log_likelihood(x, y)

            # record
            loss_value.append(loss.item())

            # weight update
            loss.backward()
            optimizer.step()

        print('[epoch %d]\tloss=%s' % (epoch, np.mean(loss_value)))
        print('[evaluation]:')
        evaluation_case(model, n=2)

    # Hand the trained model back instead of discarding it.
    return model
            
#train(model_config, data_config, train_config)