In [1]:
!pip install transformers
!pip install tensorflow==2.1.0

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForTokenClassification, AdamW

import numpy as np
import random
import math
from tqdm import tqdm_notebook



KeyboardInterrupt: ignored

In [0]:
# Shell escape: run nvidia-smi to show the GPU(s) visible to this runtime.
# NOTE(review): the absolute path /opt/bin is presumably specific to the hosted
# runtime this notebook was written on — confirm before running elsewhere.
!/opt/bin/nvidia-smi

In [0]:
# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise.
# The bare expression on the last line makes the notebook display the choice.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

In [0]:
# load data
# from google.colab import drive
# drive.mount('/content/drive')

# Try to fetch the vocabulary and the zipped MSRA corpus from Google Drive.
# If the downloader package is missing or a download fails, fall back to data
# that is expected to already sit next to the notebook.
try:
  from google_drive_downloader import GoogleDriveDownloader as gdd

  gdd.download_file_from_google_drive(
          file_id='1RFZmH6cLFbivA0VeDc6s56bRTgc2NhL1',
          dest_path='NER_Data/vocab.txt',
      )

  gdd.download_file_from_google_drive(
          file_id='1JqV332A6ZWZEHv64vzCFmSNVpC1Mox8J',
          dest_path='NER_Data/msra.zip',
          unzip=True
      )
  print('running in colab!')
  MAIN_PATH = '/content/NER_Data/'
except Exception as err:
  # `except Exception` (not a bare `except`) so that interrupting the cell
  # (KeyboardInterrupt) or exiting the kernel is no longer silently turned
  # into "use local data".
  print('running in local environment!')
  print('\treason:', repr(err))
  MAIN_PATH = './'

In [0]:
# hyper-parameters of the model (number of output labels)
model_config = {'target_size': 7}

# settings used when turning raw text into fixed-size batches
data_config = {'seq_len': 128, 'batch_size': 16}

# optimisation settings for the training loop
train_config = {'epochs': 3, 'lr': 5e-5, 'adam_eps': 1e-8}

# label -> id mapping; ids follow the order of this list
tag2idx = {
    tag: idx
    for idx, tag in enumerate(
        ['O', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER']
    )
}

# sentence / tag files of each split, all rooted at MAIN_PATH
train_pathx, train_pathy = MAIN_PATH+'msra/train/sentences.txt', MAIN_PATH+'msra/train/tags.txt'
val_pathx, val_pathy = MAIN_PATH+'msra/val/sentences.txt', MAIN_PATH+'msra/val/tags.txt'
test_pathx, test_pathy = MAIN_PATH+'msra/test/sentences.txt', MAIN_PATH+'msra/test/tags.txt'

# echo the configuration so it is recorded in the notebook output
print('model params:', model_config)
print('data params:', data_config)
print('train parmas:', train_config)
print('tag2idx:', tag2idx)

In [0]:
# bert fine-tuning for ner task

# get bert tokenizer
# Loads the pretrained tokenizer registered under 'bert-base-chinese'.
# `tokenizer` is a notebook-level global: the dataset class and the
# evaluation helper defined in later cells read it directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
#model = BertForTokenClassification.from_pretrained('bert-base-chinese')

In [0]:
# prepare train data
# Read both files inside `with` blocks so the handles are closed, and with an
# explicit encoding so the Chinese text is decoded the same way on every
# platform (assumes the corpus is UTF-8 — TODO confirm).
# `[:-1]` drops the empty string produced by the trailing newline.
with open(train_pathx, 'r', encoding='utf-8') as fh:
    train_token = fh.read().split('\n')[:-1]
with open(train_pathy, 'r', encoding='utf-8') as fh:
    train_target = fh.read().split('\n')[:-1]

# Peek at one example: tokenizer output next to its gold tags.
print(tokenizer.tokenize(train_token[10]))
print(train_target[10].split(' '))

# Encode the first sentence to a fixed length of 10 to inspect the fields
# returned by the tokenizer.
token_0 = tokenizer.encode_plus(train_token[0], max_length=10, pad_to_max_length=True)
print(len(token_0['input_ids']))

for key in token_0:
    print(key, ':')
    print(token_0[key])
print("Tokens (str)      : {}".format([tokenizer.convert_ids_to_tokens(s) for s in token_0['input_ids']]))
#train_token = tokenizer.batch_encode_plus(train_token, pad_to_max_length=True)

In [0]:
# build the dataset
class MSRA(Dataset):
    """Token-classification dataset read from a sentence file and a tag file.

    Each line of the sentence file is split on single spaces into tokens and
    each line of the tag file into labels. Every example is truncated to
    ``seq_len - 2`` items, wrapped in ``[CLS]`` / ``[SEP]`` (labelled ``'O'``),
    converted to ids with the notebook-level ``tokenizer`` and ``tag2idx``, and
    right-padded with 0 to exactly ``seq_len``.

    Args:
        seq_len: fixed length of every returned sequence.
        train_token_path: path of the sentence file (one example per line).
        train_target_path: path of the tag file (one example per line).
        device: device on which the three tensors are stored.

    Indexing returns ``(token_ids, attention_mask, tag_ids)``.
    """

    def __init__(self, seq_len, train_token_path, train_target_path, device=device):

        # load raw data; `with` closes the handles and the explicit encoding
        # keeps decoding platform-independent (assumes UTF-8 files — TODO confirm).
        # `[:-1]` drops the empty string left by the trailing newline.
        with open(train_token_path, 'r', encoding='utf-8') as fh:
            train_token = fh.read().split('\n')[:-1]
        with open(train_target_path, 'r', encoding='utf-8') as fh:
            train_target = fh.read().split('\n')[:-1]

        # tokenize: truncate to leave room for the two special tokens
        self.train_token = [['[CLS]'] + line.split(' ')[:seq_len-2] + ['[SEP]'] for line in train_token]
        self.train_target = [['O'] + line.split(' ')[:seq_len-2] + ['O'] for line in train_target]

        # check: report every example whose token and tag counts disagree.
        # `idx` comes from enumerate; it was previously undefined, so the
        # report itself crashed with NameError.
        for idx, (token, target) in enumerate(zip(self.train_token, self.train_target)):
            if len(token) != len(target):
                print(idx, token, target)
                print('-'*100)

        # transform to id list
        self.train_token = [tokenizer.convert_tokens_to_ids(tokens) for tokens in self.train_token]
        self.train_target = [[tag2idx[tag] for tag in tags] for tags in self.train_target]

        # pad and mask: pad length is derived from the token sequence; the
        # mask is 1 on real positions and 0 on padding. Tags are padded with
        # 0, which is the id of 'O' in tag2idx.
        pad_lens = [seq_len-len(x) for x in self.train_token]
        self.train_token = [token+[0]*pad_len for token, pad_len in zip(self.train_token, pad_lens)]
        self.mask = [[1]*(seq_len-pad_len)+[0]*pad_len for pad_len in pad_lens]
        self.train_target = [target+[0]*pad_len for target, pad_len in zip(self.train_target, pad_lens)]

        # to tensor (the whole split is placed on `device` up front)
        self.train_token = torch.LongTensor(self.train_token).to(device)
        self.mask = torch.LongTensor(self.mask).to(device)
        self.train_target = torch.LongTensor(self.train_target).to(device)

    def __getitem__(self, idx):
        """Return ``(token_ids, attention_mask, tag_ids)`` for example ``idx``."""
        return self.train_token[idx], self.mask[idx], self.train_target[idx]

    def __len__(self):
        """Number of examples in the split."""
        return len(self.train_token)
            

In [0]:
# Build one dataset + loader per split.
train_data = MSRA(data_config['seq_len'], train_pathx, train_pathy)
print(train_data[10])
# Shuffle the training split so each epoch sees the batches in a new order;
# without it the model is always updated on the same fixed sequence of batches.
train_data_loader = DataLoader(train_data, batch_size=data_config['batch_size'], shuffle=True)

# Validation / test keep file order so predictions stay aligned with the inputs.
val_data = MSRA(data_config['seq_len'], val_pathx, val_pathy)
val_data_loader = DataLoader(val_data, batch_size=data_config['batch_size'])

test_data = MSRA(data_config['seq_len'], test_pathx, test_pathy)
test_data_loader = DataLoader(test_data, batch_size=data_config['batch_size'])

In [0]:
# evaluate trained model on some cases
tag2idx_r = {v:k for k,v in tag2idx.items()}

def evaluation_case(model, n=1):
    """Print token:tag predictions for the first ``n`` validation sentences.

    Takes the first batch of ``val_data_loader``, runs ``model`` in eval mode
    without gradients and prints, per sentence, a list of ``token:TAG``
    strings covering every position of the sequence (padding included).

    Args:
        model: token-classification model whose first output is the per-token
            score tensor when called without labels.
        n: number of sentences of the first batch to show.

    Note: leaves ``model`` in eval mode.
    """
    x, mask, _ = next(iter(val_data_loader))
    # Only the rows that will be printed need a forward pass.
    x, mask = x[:n], mask[:n]

    model.eval()
    with torch.no_grad():
        # Pass the attention mask, as training does, so padded positions do
        # not influence the predictions for real tokens.
        tag_prob = model(input_ids=x, attention_mask=mask)[0] # score
    tag_pred = np.argmax(tag_prob.cpu().numpy(), -1)

    for sent_id, tag in zip(x.cpu().numpy(), tag_pred):
        sent_token = tokenizer.convert_ids_to_tokens(sent_id)
        print('\t',[i+':'+tag2idx_r[j] for i,j in zip(sent_token, tag)])

# train model and evaluation
def train(model_config, train_config, device=device):
    """Train a BERT token classifier on ``train_data_loader`` and return it.

    Args:
        model_config: dict with ``'target_size'`` (number of labels).
        train_config: dict with ``'epochs'``, ``'lr'`` and ``'adam_eps'``.
        device: device the model is moved to.

    Returns:
        The trained ``BertForTokenClassification`` instance.

    Before every epoch, and once after the last one, a couple of validation
    sentences are printed with their predicted tags via ``evaluation_case``.
    """

    print('training start...')
    print('[params]:')
    print('\tmodel params:', model_config)
    print('\ttrain parmas:', train_config)

    # build model
    bertmodel = BertForTokenClassification.from_pretrained('bert-base-chinese', num_labels=model_config['target_size']).to(device)

    # opt
    # NOTE(review): only the classification head is handed to the optimizer;
    # the BERT encoder weights are never updated. Confirm this is intended,
    # given the notebook describes the task as BERT fine-tuning.
    trained_params = list(bertmodel.classifier.parameters())
    optimizer = AdamW([{'params': trained_params}], lr=train_config['lr'], eps=train_config['adam_eps'])

    # Report parameter counts; printing `bertmodel.parameters()` directly only
    # showed the repr of a generator object.
    print('[build model]:')
    print('\ttotal parameters: %d' % sum(p.numel() for p in bertmodel.parameters()))
    print('\toptimized parameters: %d' % sum(p.numel() for p in trained_params))

    # train
    for epoch in range(train_config['epochs']):

        print('[evaluation]:')
        evaluation_case(bertmodel, n=2)

        # record loss every epoch
        loss_value = []

        # evaluation_case switched the model to eval mode; switch back
        bertmodel.train()

        for token, mask, target in tqdm_notebook(train_data_loader):

            bertmodel.zero_grad()

            # with `labels` given, the first element of the output is the loss
            loss = bertmodel(input_ids=token, attention_mask=mask, labels=target)[0]

            # record
            loss_value.append(loss.item())

            # weight update
            loss.backward()
            optimizer.step()

        print('[epoch %d]\tloss=%s' % (epoch, np.mean(loss_value)))

    print('[evaluation]:')
    evaluation_case(bertmodel, n=2)

    return bertmodel

model = train(model_config, train_config)

In [0]:
#test

def test(model, out_pathx, out_pathy, path_x=MAIN_PATH+'msra/test/sentences.txt', path_y=MAIN_PATH+'msra/test/tags.txt'):
    """Run ``model`` over a split and write sentences and predicted tags to disk.

    The previous body called helpers that do not exist in this notebook
    (``train_data_iter``, ``token_idx_r``, ``tag_idx_r``) and treated the raw
    model output as tag ids. It now reuses ``MSRA`` / ``DataLoader`` /
    ``tokenizer`` / ``tag2idx`` like the rest of the notebook.

    Args:
        model: trained token-classification model (first output = per-token scores).
        out_pathx: file receiving one space-separated token sequence per line.
        out_pathy: file receiving one space-separated predicted tag sequence per line.
        path_x: sentence file of the split to predict.
        path_y: tag file of the split (needed to build the dataset; the gold
            tags themselves are not written).

    Each written line covers only the real tokens of the example: ``[CLS]``,
    ``[SEP]`` and padding are removed, so a line may be shorter than the input
    when the sentence was truncated to ``seq_len``. Tokens are decoded from
    their ids, so out-of-vocabulary tokens appear as the tokenizer's unknown token.
    NOTE(review): assumes the downstream scorer expects this layout — confirm.
    """
    import os

    dataset = MSRA(data_config['seq_len'], path_x, path_y)
    loader = DataLoader(dataset, batch_size=data_config['batch_size'])
    idx2tag = {idx: tag for tag, idx in tag2idx.items()}

    # make sure the output directories exist before opening the files
    for out_path in (out_pathx, out_pathy):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    model.eval()
    with open(out_pathx, 'w', encoding='utf-8') as ox, open(out_pathy, 'w', encoding='utf-8') as oy:
        for token, mask, _ in tqdm_notebook(loader):
            with torch.no_grad():
                scores = model(input_ids=token, attention_mask=mask)[0]
            tag_pred = scores.argmax(dim=-1).cpu().numpy()
            lengths = mask.sum(dim=-1).cpu().numpy()

            for sent, tag, length in zip(token.cpu().numpy(), tag_pred, lengths):
                # positions [1, length-1) are the real tokens between [CLS] and [SEP]
                last = int(length) - 1
                sent_decode = ' '.join(tokenizer.convert_ids_to_tokens(sent[1:last].tolist()))
                tag_decode = ' '.join(idx2tag[int(i)] for i in tag[1:last])
                ox.write(sent_decode + '\n')
                oy.write(tag_decode + '\n')

test(model, out_pathx='result/lstm_crf_sentences.txt', out_pathy='result/lstm_crf_tags.txt')

In [0]:
# evaluation
from utils.evaluation import f1_score_from_path

# Gold files: reuse the MAIN_PATH-rooted paths from the config cell so the
# scorer reads the same files the predictions were produced from (the bare
# relative paths used before only resolved when MAIN_PATH was './').
test_x = test_pathx
test_y = test_pathy
pred_y = 'result/lstm_crf_tags.txt'
pred_x = 'result/lstm_crf_sentences.txt' # Because of padding, the length of prediction may be shorter than true label

# NOTE(review): 'mirco' / 'marco' look like misspellings of micro / macro but
# may be the exact values f1_score_from_path expects — confirm before changing.
micro_score = f1_score_from_path(test_x, test_y, pred_y, pred_x, f1_type='mirco')
macro_score = f1_score_from_path(test_x, test_y, pred_y, pred_x, f1_type='marco')
print('micro : %s \t macro : %s' % (micro_score, macro_score))