In [1]:
!pip install transformers
!pip install tensorflow==2.1.0

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForTokenClassification, AdamW

import numpy as np
import random
import math
from tqdm import tqdm_notebook



In [2]:
!/opt/bin/nvidia-smi

Sun Mar  8 11:52:26 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
# load data
# from google.colab import drive
# drive.mount('/content/drive')

try:
  from google_drive_downloader import GoogleDriveDownloader as gdd

  gdd.download_file_from_google_drive(
          file_id='1RFZmH6cLFbivA0VeDc6s56bRTgc2NhL1',
          dest_path='NER_Data/vocab.txt',
      )

  gdd.download_file_from_google_drive(
          file_id='1JqV332A6ZWZEHv64vzCFmSNVpC1Mox8J',
          dest_path='NER_Data/msra.zip',
          unzip=True
      )
  print('running in colab!')
  MAIN_PATH = '/content/NER_Data/'
except:
  print('running in local environment!')
  MAIN_PATH = './'

running in colab!


In [5]:
# params for model
model_config = dict(
        target_size=7
    )

# params for data processing
data_config = dict(
        seq_len=128,
        batch_size=32,
    )

# params for model training
train_config = dict(
        epochs=3,
        lr=5e-5,
        adam_eps=1e-8
    )

tag2idx = {'O':0, 
           'B-ORG':1, 'I-ORG':2, 
           'B-LOC':3, 'I-LOC':4, 
           'B-PER':5, 'I-PER':6}

train_pathx = MAIN_PATH+'msra/train/sentences.txt'
train_pathy = MAIN_PATH+'msra/train/tags.txt'
val_pathx = MAIN_PATH+'msra/val/sentences.txt'
val_pathy = MAIN_PATH+'msra/val/tags.txt'
test_pathx = MAIN_PATH+'msra/test/sentences.txt'
test_pathy = MAIN_PATH+'msra/test/tags.txt'

print('model params:', model_config)
print('data params:', data_config)
print('train parmas:', train_config)
print('tag2idx:', tag2idx)

model params: {'target_size': 7}
data params: {'seq_len': 128, 'batch_size': 16}
train parmas: {'epochs': 3, 'lr': 5e-05, 'adam_eps': 1e-08}
tag2idx: {'O': 0, 'B-ORG': 1, 'I-ORG': 2, 'B-LOC': 3, 'I-LOC': 4, 'B-PER': 5, 'I-PER': 6}


In [0]:
# bert fine-tuning for ner task

# get bert tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
#model = BertForTokenClassification.from_pretrained('bert-base-chinese')

In [7]:
# prepare train data
train_token = open(train_pathx, 'r').read().split('\n')[:-1]
train_target = open(train_pathy, 'r').read().split('\n')[:-1]

print(tokenizer.tokenize(train_token[10]))
print(train_target[10].split(' '))
token_0 = tokenizer.encode_plus(train_token[0], max_length=10, pad_to_max_length=True)
print(len(token_0['input_ids']))

for key in token_0:
    print(key, ':')
    print(token_0[key])
print("Tokens (str)      : {}".format([tokenizer.convert_ids_to_tokens(s) for s in token_0['input_ids']]))
#train_token = tokenizer.batch_encode_plus(train_token, pad_to_max_length=True)

['当', '有', '了', '一', '定', '的', '实', '力', '后', '，', '他', '就', '成', '立', '了', '武', '义', '县', '重', '点', '实', '用', '菌', '公', '司', '，', '不', '仅', '负', '责', '为', '菇', '农', '提', '供', '技', '术', '指', '导', '和', '菌', '种', '，', '而', '且', '负', '责', '原', '料', '代', '购', '，', '产', '品', '回', '收', '，', '经', '自', '己', '加', '工', '，', '或', '出', '口', '、', '或', '内', '销', '，', '从', '而', '使', '高', '温', '香', '菇', '栽', '培', '技', '术', '迅', '速', '扩', '散', '到', '浙', '西', '南', '山', '区', '的', '１', '０', '多', '个', '县', '市', '，', '１', '０', '０', '多', '个', '乡', '镇', '，', '栽', '培', '规', '模', '由', '１', '９', '９', '１', '年', '的', '２', '３', '万', '袋', '增', '加', '到', '１', '９', '９', '５', '年', '的', '３', '０', '０', '０', '万', '袋', '，', '仅', '此', '一', '项', '就', '使', '当', '地', '农', '民', '增', '加', '收', '入', '１', '亿', '多', '元', '。']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [0]:
# 构建数据集
class MSRA(Dataset):
    
    def __init__(self, seq_len, train_token_path, train_target_path, device=device):
        
        # load raw data
        train_token = open(train_token_path, 'r').read().split('\n')[:-1]
        train_target = open(train_target_path, 'r').read().split('\n')[:-1]
        
        # tokenize
        self.train_token = list(map(lambda x:['[CLS]']+x.split(' ')[:seq_len-2]+['[SEP]'], train_token))
        self.train_target = list(map(lambda x:['O']+x.split(' ')[:seq_len-2]+['O'], train_target))
        
        # check
        for token, target in zip(self.train_token, self.train_target):
            if len(token) != len(target):
                print(idx, token, target)
                print('-'*100)
        
        # transform to id list
        self.train_token = list(map(lambda x:tokenizer.convert_tokens_to_ids(x), self.train_token))
        self.train_target = list(map(lambda x:[tag2idx[i] for i in x], self.train_target))
        
        # pad and mask
        pad_lens = [seq_len-len(x) for x in self.train_token]
        self.train_token = [token+[0]*pad_len for token, pad_len in zip(self.train_token, pad_lens)]
        self.mask = [[1]*(seq_len-pad_len)+[0]*pad_len for pad_len in pad_lens]
        self.train_target = [target+[0]*pad_len for target, pad_len in zip(self.train_target, pad_lens)]
        
        # to tensor
        self.train_token = torch.LongTensor(self.train_token).to(device)
        self.mask = torch.LongTensor(self.mask).to(device)
        self.train_target = torch.LongTensor(self.train_target).to(device)
        
    def __getitem__(self, idx):
        return self.train_token[idx], self.mask[idx], self.train_target[idx]
    
    def __len__(self):
        return len(self.train_token)
            

In [9]:
train_data = MSRA(data_config['seq_len'], train_pathx, train_pathy)
print(train_data[10])
train_data_loader = DataLoader(train_data, batch_size=data_config['batch_size'])

val_data = MSRA(data_config['seq_len'], val_pathx, val_pathy)
val_data_loader = DataLoader(val_data, batch_size=data_config['batch_size'])

test_data = MSRA(data_config['seq_len'], test_pathx, test_pathy)
test_data_loader = DataLoader(test_data, batch_size=data_config['batch_size'])

(tensor([ 101, 2496, 3300,  749,  671, 2137, 4638, 2141, 1213, 1400, 8024,  800,
        2218, 2768, 4989,  749, 3636,  721, 1344, 7028, 4157, 2141, 4500, 5826,
        1062, 1385, 8024,  679,  788, 6566, 6569,  711, 5823, 1093, 2990,  897,
        2825, 3318, 2900, 2193, 1469, 5826, 4905, 8024, 5445,  684, 6566, 6569,
        1333, 3160,  807, 6579, 8024,  772, 1501, 1726, 3119, 8024, 5307, 5632,
        2346, 1217, 2339, 8024, 2772, 1139, 1366,  510, 2772, 1079, 7218, 8024,
         794, 5445,  886, 7770, 3946, 7676, 5823, 3420, 1824, 2825, 3318, 6813,
        6862, 2810, 3141, 1168, 3851, 6205, 1298, 2255, 1277, 4638, 8029, 8028,
        1914,  702, 1344, 2356, 8024, 8029, 8028, 8028, 1914,  702,  740, 7252,
        8024, 3420, 1824, 6226, 3563, 4507, 8029, 8037, 8037, 8029, 2399, 4638,
        8030, 8031,  674, 6150, 1872, 1217, 1168,  102], device='cuda:0'), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [10]:
# evaluate trained model on some cases
tag2idx_r = {v:k for k,v in tag2idx.items()}

def evaluation_case(model, n=1):
    for x, mask, y in val_data_loader:
      break
    model.eval()
    with torch.no_grad():
      tag_prob = model(input_ids=x)[0] # score
      tag_pred = np.argmax(tag_prob.cpu().numpy(), -1)
      for sent_id, tag in zip(x[:n].cpu().numpy(), tag_pred[:n]):
        sent_token = tokenizer.convert_ids_to_tokens(sent_id)
        print('\t',[i+':'+tag2idx_r[j] for i,j in zip(sent_token, tag)])

# train model and evaluation
def train(model_config, train_config, device=device):
    
    print('training start...')
    print('[params]:')
    print('\tmodel params:', model_config)
    print('\ttrain parmas:', train_config)

    # build model
    bertmodel = BertForTokenClassification.from_pretrained('bert-base-chinese', num_labels=model_config['target_size']).to(device)
    print('[build model]:')
    print(bertmodel.parameters())
    
    # opt
    param_optimizer = list(bertmodel.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=train_config['lr'], eps=train_config['adam_eps'])
    
    # train
    for epoch in range(train_config['epochs']):
        
        print('[evaluation]:')
        evaluation_case(bertmodel, n=2)

        # record loss every epoch
        loss_value = []

        # model init
        bertmodel.train()
        
        for token, mask, target in tqdm_notebook(train_data_loader):

            bertmodel.zero_grad()
            
            loss = bertmodel(input_ids=token, attention_mask=mask, labels=target)[0]
            
            # record
            loss_value.append(loss.item())
            
            # weight update
            loss.backward()
            optimizer.step()
            
        print('[epoch %d]\tloss=%s' % (epoch, np.mean(loss_value)))

    print('[evaluation]:')
    evaluation_case(bertmodel, n=2)
        
    return bertmodel

model = train(model_config, train_config)

training start...
[params]:
	model params: {'target_size': 7}
	train parmas: {'epochs': 3, 'lr': 5e-05, 'adam_eps': 1e-08}
[build model]:
<generator object Module.parameters at 0x7ff205cc5308>
[evaluation]:
	 ['[CLS]:I-LOC', '近:I-LOC', '日:B-ORG', '在:B-ORG', '江:B-ORG', '苏:I-LOC', '如:B-PER', '皋:I-ORG', '市:B-LOC', '城:I-LOC', '西:I-LOC', '乡:O', '，:B-ORG', '１:I-LOC', '０:I-LOC', '０:I-LOC', '０:I-LOC', '多:I-LOC', '个:B-ORG', '品:I-LOC', '种:I-LOC', '的:I-LOC', '花:I-LOC', '木:I-LOC', '盆:I-LOC', '景:I-LOC', '，:I-LOC', '千:I-LOC', '姿:I-LOC', '百:I-LOC', '态:I-LOC', '，:B-ORG', '新:I-LOC', '颖:I-LOC', '别:I-LOC', '致:I-LOC', '，:I-LOC', '吸:B-ORG', '引:I-LOC', '了:B-LOC', '成:I-LOC', '千:I-LOC', '上:I-LOC', '万:I-LOC', '的:B-LOC', '游:I-LOC', '客:I-LOC', '、:B-ORG', '顾:I-LOC', '客:I-LOC', '驻:I-LOC', '足:I-LOC', '观:I-LOC', '赏:I-LOC', '、:B-ORG', '选:I-LOC', '购:I-LOC', '。:I-LOC', '[SEP]:I-LOC', '[PAD]:I-LOC', '[PAD]:I-LOC', '[PAD]:I-LOC', '[PAD]:I-LOC', '[PAD]:I-LOC', '[PAD]:I-LOC', '[PAD]:I-LOC', '[PAD]:I-LOC', '[PAD]:I-LOC', '[

HBox(children=(IntProgress(value=0, max=2625), HTML(value='')))


[epoch 0]	loss=0.3757697166772116
[evaluation]:
	 ['[CLS]:O', '近:O', '日:O', '在:O', '江:B-LOC', '苏:I-LOC', '如:O', '皋:I-LOC', '市:O', '城:O', '西:I-LOC', '乡:O', '，:O', '１:O', '０:O', '０:O', '０:O', '多:O', '个:O', '品:O', '种:O', '的:O', '花:O', '木:O', '盆:O', '景:O', '，:O', '千:O', '姿:O', '百:O', '态:O', '，:O', '新:O', '颖:O', '别:O', '致:O', '，:O', '吸:O', '引:O', '了:O', '成:O', '千:O', '上:O', '万:O', '的:O', '游:O', '客:O', '、:O', '顾:O', '客:O', '驻:O', '足:O', '观:O', '赏:O', '、:O', '选:O', '购:O', '。:O', '[SEP]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD

HBox(children=(IntProgress(value=0, max=2625), HTML(value='')))


[epoch 1]	loss=0.1426406116953918
[evaluation]:
	 ['[CLS]:O', '近:O', '日:O', '在:O', '江:B-LOC', '苏:I-LOC', '如:B-LOC', '皋:I-LOC', '市:I-LOC', '城:I-LOC', '西:I-LOC', '乡:O', '，:O', '１:O', '０:O', '０:O', '０:O', '多:O', '个:O', '品:O', '种:O', '的:O', '花:O', '木:O', '盆:O', '景:O', '，:O', '千:O', '姿:O', '百:O', '态:O', '，:O', '新:O', '颖:O', '别:O', '致:O', '，:O', '吸:O', '引:O', '了:O', '成:O', '千:O', '上:O', '万:O', '的:O', '游:O', '客:O', '、:O', '顾:O', '客:O', '驻:O', '足:O', '观:O', '赏:O', '、:O', '选:O', '购:O', '。:O', '[SEP]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PA

HBox(children=(IntProgress(value=0, max=2625), HTML(value='')))


[epoch 2]	loss=0.11094573866824309
[evaluation]:
	 ['[CLS]:O', '近:O', '日:O', '在:O', '江:B-LOC', '苏:I-LOC', '如:B-LOC', '皋:I-LOC', '市:I-LOC', '城:I-LOC', '西:I-LOC', '乡:I-LOC', '，:O', '１:O', '０:O', '０:O', '０:O', '多:O', '个:O', '品:O', '种:O', '的:O', '花:O', '木:O', '盆:O', '景:O', '，:O', '千:O', '姿:O', '百:O', '态:O', '，:O', '新:O', '颖:O', '别:O', '致:O', '，:O', '吸:O', '引:O', '了:O', '成:O', '千:O', '上:O', '万:O', '的:O', '游:O', '客:O', '、:O', '顾:O', '客:O', '驻:O', '足:O', '观:O', '赏:O', '、:O', '选:O', '购:O', '。:O', '[SEP]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O', '[PAD]:O',

In [11]:
#test

def test(model, out_pathx, out_pathy, path_x=MAIN_PATH+'msra/test/sentences.txt', path_y=MAIN_PATH+'msra/test/tags.txt'):
    Iter_val = train_data_iter(path_x, path_y, data_config, shuffle=False)
    ox = open(out_pathx, 'w')
    oy = open(out_pathy, 'w')
    
    model.eval()
    for x,y in tqdm_notebook(Iter_val):
        with torch.no_grad():
            tag_pred = model(x)
            for sent, tag in zip(x.numpy(), tag_pred):
                sent_decode = ' '.join([token_idx_r[i] for i in sent])
                tag_decode = ' '.join([tag_idx_r[i] for i in tag])
                ox.write(sent_decode + '\n')
                oy.write(tag_decode + '\n')
    ox.close()
    oy.close()
    
test(model, out_pathx='result/lstm_crf_sentences.txt', out_pathy='result/lstm_crf_tags.txt')

NameError: ignored

In [0]:
# evaluation
from utils.evaluation import f1_score_from_path

test_x = 'msra/test/sentences.txt'
test_y = 'msra/test/tags.txt'
pred_y = 'result/lstm_crf_tags.txt'
pred_x = 'result/lstm_crf_sentences.txt' # Because of padding, the length of prediction may be shorter than true label

micro_score = f1_score_from_path(test_x, test_y, pred_y, pred_x, f1_type='mirco')
macro_score = f1_score_from_path(test_x, test_y, pred_y, pred_x, f1_type='marco')
print('micro : %s \t macro : %s' % (micro_score, macro_score))