In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import random
import math

In [28]:
# params for model
model_config = dict(
        embedding_dim=32,
        hidden_dim=128,
        vocab_size=10000,
        target_size=7
    )

# params for data processing
data_config = dict(
        seq_len=64,
        batch_size=32,
        padding_idx=0,
        unknown_idx=1
    )

# params for model training
train_config = dict(
        epochs=10,
        lr=1e-3,
        loss_type='softmax'
    )

print('model params:', model_config)
print('data params:', data_config)
print('train parmas:', train_config)

model params: {'embedding_dim': 32, 'hidden_dim': 128, 'vocab_size': 10000, 'target_size': 7}
data params: {'seq_len': 64, 'batch_size': 32, 'padding_idx': 0, 'unknown_idx': 1}
train parmas: {'epochs': 10, 'lr': 0.001, 'loss_type': 'softmax'}


In [29]:
# lstm: baseline model
class lstm_ner(nn.Module):
    
    def __init__(self,embedding_dim=32, hidden_dim=32, vocab_size=10000, target_size=7, batch_size=32):
        super(lstm_ner, self).__init__()
        
        # params
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.target_size = target_size
        
        # layers
        self.emb_layer = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.out_layer = nn.Linear(hidden_dim, target_size)
    
    def _init_state(self):
        return torch.randn(1, self.batch_size, self.hidden_dim)
    
    def forward(self, x):
        emb = self.emb_layer(x) #(batch_size, seq_len, embedding_dim)
        lstm_out, _ = self.lstm(emb, self._init_state()) #(batch_size, seq_len, hidden_dim)
        tag_score = self.out_layer(lstm_out).view(self.batch_size, self.target_size, -1) #(batch_size, target_size, seq_len)
        return tag_score

model = lstm_ner(embedding_dim=model_config['embedding_dim'],
                 hidden_dim=model_config['hidden_dim'],
                 vocab_size=model_config['vocab_size'], 
                 target_size=model_config['target_size'],
                 batch_size=data_config['batch_size'])
print(model)
# x_test = torch.tensor([10,35,999,457])
# print(x_test)
# y = model(x_test)
# print(y)

lstm_ner(
  (emb_layer): Embedding(10000, 32)
  (lstm): GRU(32, 128, batch_first=True)
  (out_layer): Linear(in_features=128, out_features=7, bias=True)
)


In [30]:
# get tokens from bert vocab
token_idx = {}
with open('vocab.txt','r') as f:
    for line in f:
        token_idx[line.strip()] = len(token_idx)+2

# add special token
token_idx['<PAD>'] = data_config['padding_idx']
token_idx['<UNK>'] = data_config['unknown_idx']

# tags map
tag_idx = {'O':0, 
           'B-ORG':1, 'I-ORG':2, 
           'B-LOC':3, 'I-LOC':4, 
           'B-PER':5, 'I-PER':6}

def sentence_padding(x):
    x = [[token_idx.get(token,data_config['unknown_idx']) for token in sent.split()] for sent in x]
    x = [sent[:data_config['seq_len']]+[data_config['padding_idx']]*(max(0,data_config['seq_len']-len(sent))) for sent in x]
    return x

def target_padding(y):
    y = [[tag_idx[target] for target in targets.split()] for targets in y]
    y = [targets[:data_config['seq_len']]+[0]*(max(0,data_config['seq_len']-len(targets))) for targets in y] # 0 stands for 'O'
    return y

# prepare data 
def train_data_iter(path_x='msra/train/sentences.txt',path_y='msra/train/tags.txt',data_config=data_config,shuffle=True):
    
    # read x and y
    x = open(path_x,'r').read().split('\n')
    y = open(path_y,'r').read().split('\n')
    assert len(x) == len(y),print('data error!')
    n = len(x)
    
    # transform sentence to array
    x = sentence_padding(x)
    
    # transform target to array
    y = target_padding(y)
    
    # shuffle x and y
    if shuffle:
        x, y = np.array(x), np.array(y)
        state = np.random.get_state()
        np.random.shuffle(x)
        np.random.set_state(state)
        np.random.shuffle(y)
    
    # get batch data
    for i in range(math.ceil(n/data_config['batch_size'])-1):
        start = i*data_config['batch_size']
        end = (i+1)*data_config['batch_size']
        yield torch.tensor(x[start:end]), torch.tensor(y[start:end])

model_config['vocab_size'] = len(token_idx)+2
token_idx_r = {v:k for k,v in token_idx.items()}
tag_idx_r = {v:k for k,v in tag_idx.items()}
I = train_data_iter()
for x,y in I:
    print(x.size(), y.size())
    print([token_idx_r.get(i,'') for i in x[0].numpy()])
    print(y[0].numpy())
    break

torch.Size([32, 64]) torch.Size([32, 64])
['该', '市', '把', '强', '化', '基', '层', '基', '础', '作', '为', '加', '强', '干', '部', '作', '风', '建', '设', '的', '重', '点', '，', '全', '市', '６', '０', '７', '个', '行', '政', '村', '，', '村', '村', '建', '立', '健', '全', '了', '村', '民', '代', '表', '议', '事', '会', '和', '民', '主', '理', '财', '小', '组', '，', '凡', '村', '里', '的', '大', '事', '要', '事', '、']
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [31]:
# loss: softmax loss/crf loss
def cal_softmax_loss(tag_score, target):
    return nn.CrossEntropyLoss()(tag_score, target)

def cal_crf_loss(tag_score, target):
    pass

# evaluate trained model on some cases
def evaluation_case(model, path_x='msra/val/sentences.txt', path_y='msra/val/tags.txt', n=1):
    Iter_val = train_data_iter(path_x, path_y, data_config, shuffle=False)
    x,y = next(Iter_val)
    tag_prob = model(x).detach().numpy()
    
    #print(tag_prob)
    tag_pred = np.argmax(tag_prob, axis=1)
    for sent, tag in zip(x[:n].numpy(), tag_pred[:n]):
        print('\t',[token_idx_r.get(i,'')+':'+tag_idx_r[j] for i,j in zip(sent, tag)])

# train model and evaluation
def train(model_config, data_config, train_config):
    
    print('training start...')
    print('[params]:')
    print('\tmodel params:', model_config)
    print('\tdata params:', data_config)
    print('\ttrain parmas:', train_config)
    
    # build model
    model = lstm_ner(embedding_dim=model_config['embedding_dim'],
                     hidden_dim=model_config['hidden_dim'],
                     vocab_size=model_config['vocab_size'], 
                     target_size=model_config['target_size'],
                     batch_size=data_config['batch_size'])
    print('[build model]:')
    print(model)
    
    # opt
    optimizer = optim.Adam(model.parameters(), lr=train_config['lr'])
    
    # train
    for i in range(train_config['epochs']):
        
        # record loss every epoch
        loss_value = []
        
        # get data flow
        Iter = train_data_iter(data_config=data_config)
        
        for x,y in Iter:
            
            # model init
            model.zero_grad()
            #model._init_lstm_state()
            
            y_ = model(x)
            if train_config['loss_type'] == 'softmax':
                loss = cal_softmax_loss(y_, y)
            
            # record
            loss_value.append(loss.item())
            
            # weight update
            loss.backward()
            optimizer.step()
            
        print('[epoch %d]\tloss=%s' % (i, np.mean(loss_value)))
        #print('all loss:', loss_value)
        print('[evaluation]:')
        evaluation_case(model, n=2)
            
train(model_config, data_config, train_config)

training start...
[params]:
	model params: {'embedding_dim': 32, 'hidden_dim': 128, 'vocab_size': 21132, 'target_size': 7}
	data params: {'seq_len': 64, 'batch_size': 32, 'padding_idx': 0, 'unknown_idx': 1}
	train parmas: {'epochs': 10, 'lr': 0.001, 'loss_type': 'softmax'}
[build model]:
lstm_ner(
  (emb_layer): Embedding(21132, 32)
  (lstm): GRU(32, 128, batch_first=True)
  (out_layer): Linear(in_features=128, out_features=7, bias=True)
)
[epoch 0]	loss=0.4421678973793438
[evaluation]:
	 ['近:O', '日:O', '在:O', '江:O', '苏:O', '如:O', '皋:O', '市:O', '城:O', '西:O', '乡:O', '，:O', '１:O', '０:O', '０:O', '０:O', '多:O', '个:O', '品:O', '种:O', '的:O', '花:O', '木:O', '盆:O', '景:O', '，:O', '千:O', '姿:O', '百:O', '态:O', '，:O', '新:O', '颖:O', '别:O', '致:O', '，:O', '吸:O', '引:O', '了:O', '成:O', '千:O', '上:O', '万:O', '的:O', '游:O', '客:O', '、:O', '顾:O', '客:O', '驻:O', '足:O', '观:O', '赏:O', '、:O', '选:O', '购:O', '。:O', '<PAD>:O', '<PAD>:O', '<PAD>:O', '<PAD>:O', '<PAD>:O', '<PAD>:O', '<PAD>:O']
	 ['大:O', '家:O', '认:O', '为:O'

[epoch 8]	loss=0.2687076972683937
[evaluation]:
	 ['近:O', '日:O', '在:O', '江:O', '苏:I-LOC', '如:I-LOC', '皋:I-LOC', '市:O', '城:O', '西:O', '乡:O', '，:O', '１:O', '０:O', '０:O', '０:O', '多:O', '个:O', '品:O', '种:O', '的:O', '花:O', '木:O', '盆:O', '景:O', '，:O', '千:O', '姿:O', '百:O', '态:O', '，:O', '新:O', '颖:O', '别:O', '致:O', '，:O', '吸:O', '引:O', '了:O', '成:O', '千:O', '上:O', '万:O', '的:O', '游:O', '客:O', '、:O', '顾:O', '客:O', '驻:O', '足:O', '观:O', '赏:O', '、:O', '选:O', '购:O', '。:O', '<PAD>:O', '<PAD>:O', '<PAD>:O', '<PAD>:O', '<PAD>:O', '<PAD>:O', '<PAD>:O']
	 ['大:O', '家:O', '认:O', '为:O', '，:O', '该:O', '片:O', '较:O', '全:O', '面:O', '、:O', '系:O', '统:O', '、:O', '形:O', '象:O', '地:O', '反:O', '映:O', '了:O', '新:O', '四:O', '军:O', '的:O', '战:O', '斗:O', '历:O', '史:O', '，:O', '揭:O', '示:O', '了:O', '中:O', '国:O', '共:O', '产:O', '党:O', '在:O', '抗:O', '日:O', '战:O', '争:O', '中:O', '的:O', '历:O', '史:O', '地:O', '位:O', '和:O', '作:O', '用:O', '，:O', '是:O', '一:O', '部:O', '进:O', '行:O', '爱:O', '国:O', '主:O', '义:O', '和:O', '革:O', '命:O']
[epoch 9]	