In [91]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForTokenClassification, AdamW

import numpy as np
import random
import math
from tqdm import tqdm_notebook

In [11]:
!/opt/bin/nvidia-smi

/bin/sh: /opt/bin/nvidia-smi: No such file or directory


In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [13]:
# load data
# from google.colab import drive
# drive.mount('/content/drive')

try:
  from google_drive_downloader import GoogleDriveDownloader as gdd

  gdd.download_file_from_google_drive(
          file_id='1RFZmH6cLFbivA0VeDc6s56bRTgc2NhL1',
          dest_path='NER_Data/vocab.txt',
      )

  gdd.download_file_from_google_drive(
          file_id='1JqV332A6ZWZEHv64vzCFmSNVpC1Mox8J',
          dest_path='NER_Data/msra.zip',
          unzip=True
      )
  print('running in colab!')
  MAIN_PATH = '/content/NER_Data/'
except:
  print('running in local environment!')
  MAIN_PATH = './'

running in local environment!


In [85]:
# params for model
model_config = dict(
        target_size=7
    )

# params for data processing
data_config = dict(
        seq_len=200,
        batch_size=64,
    )

# params for model training
train_config = dict(
        epochs=10,
        lr=1e-3,
    )

tag2idx = {'O':0, 
           'B-ORG':1, 'I-ORG':2, 
           'B-LOC':3, 'I-LOC':4, 
           'B-PER':5, 'I-PER':6}

train_pathx = MAIN_PATH+'msra/train/sentences.txt'
train_pathy = MAIN_PATH+'msra/train/tags.txt'
val_pathx = MAIN_PATH+'msra/val/sentences.txt'
val_pathy = MAIN_PATH+'msra/val/tags.txt'
test_pathx = MAIN_PATH+'msra/test/sentences.txt'
test_pathy = MAIN_PATH+'msra/test/tags.txt'

print('model params:', model_config)
print('data params:', data_config)
print('train parmas:', train_config)
print('tag2idx:', tag2idx)

model params: {'target_size': 7}
data params: {'seq_len': 200, 'batch_size': 64}
train parmas: {'epochs': 10, 'lr': 0.001}
tag2idx: {'O': 0, 'B-ORG': 1, 'I-ORG': 2, 'B-LOC': 3, 'I-LOC': 4, 'B-PER': 5, 'I-PER': 6}


In [8]:
# bert fine-tuning for ner task

# get bert tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
#model = BertForTokenClassification.from_pretrained('bert-base-chinese')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=568.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411577189.0, style=ProgressStyle(descri…




In [86]:
# prepare train data
train_token = open(train_pathx, 'r').read().split('\n')[:-1]
train_target = open(train_pathy, 'r').read().split('\n')[:-1]

print(tokenizer.tokenize(train_token[10]))
print(train_target[10].split(' '))
token_0 = tokenizer.encode_plus(train_token[0], max_length=10, pad_to_max_length=True)
print(len(token_0['input_ids']))

for key in token_0:
    print(key, ':')
    print(token_0[key])
print("Tokens (str)      : {}".format([tokenizer.convert_ids_to_tokens(s) for s in token_0['input_ids']]))
#train_token = tokenizer.batch_encode_plus(train_token, pad_to_max_length=True)

['当', '有', '了', '一', '定', '的', '实', '力', '后', '，', '他', '就', '成', '立', '了', '武', '义', '县', '重', '点', '实', '用', '菌', '公', '司', '，', '不', '仅', '负', '责', '为', '菇', '农', '提', '供', '技', '术', '指', '导', '和', '菌', '种', '，', '而', '且', '负', '责', '原', '料', '代', '购', '，', '产', '品', '回', '收', '，', '经', '自', '己', '加', '工', '，', '或', '出', '口', '、', '或', '内', '销', '，', '从', '而', '使', '高', '温', '香', '菇', '栽', '培', '技', '术', '迅', '速', '扩', '散', '到', '浙', '西', '南', '山', '区', '的', '１', '０', '多', '个', '县', '市', '，', '１', '０', '０', '多', '个', '乡', '镇', '，', '栽', '培', '规', '模', '由', '１', '９', '９', '１', '年', '的', '２', '３', '万', '袋', '增', '加', '到', '１', '９', '９', '５', '年', '的', '３', '０', '０', '０', '万', '袋', '，', '仅', '此', '一', '项', '就', '使', '当', '地', '农', '民', '增', '加', '收', '入', '１', '亿', '多', '元', '。']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [93]:
# 构建数据集
class MSRA(Dataset):
    
    def __init__(self, seq_len, train_token_path, train_target_path, device=device):
        
        # load raw data
        train_token = open(train_token_path, 'r').read().split('\n')[:-1]
        train_target = open(train_target_path, 'r').read().split('\n')[:-1]
        
        # tokenize
        self.train_token = list(map(lambda x:['[CLS]']+x.split(' ')[:seq_len-2]+['[SEP]'], train_token))
        self.train_target = list(map(lambda x:['O']+x.split(' ')[:seq_len-2]+['O'], train_target))
        
        # check
        for token, target in zip(self.train_token, self.train_target):
            if len(token) != len(target):
                print(idx, token, target)
                print('-'*100)
        
        # transform to id list
        self.train_token = list(map(lambda x:tokenizer.convert_tokens_to_ids(x), self.train_token))
        self.train_target = list(map(lambda x:[tag2idx[i] for i in x], self.train_target))
        
        # pad and mask
        pad_lens = [seq_len-len(x) for x in self.train_token]
        self.train_token = [token+[0]*pad_len for token, pad_len in zip(self.train_token, pad_lens)]
        self.mask = [[1]*(seq_len-pad_len)+[0]*pad_len for pad_len in pad_lens]
        self.train_target = [target+[0]*pad_len for target, pad_len in zip(self.train_target, pad_lens)]
        
        # to tensor
        self.train_token = torch.tensor(self.train_token).to(device)
        self.mask = torch.tensor(self.mask).to(device)
        self.train_target = torch.tensor(self.train_target).to(device)
        
    def __getitem__(self, idx):
        return self.train_token[idx], self.mask[idx], self.train_target[idx]
    
    def __len__(self):
        return len(self.train_token)
            

In [94]:
train_data = MSRA(data_config['seq_len'], train_pathx, train_pathy)
print(train_data[10])
train_data_loader = DataLoader(train_data, batch_size=data_config['batch_size'])

val_data = MSRA(data_config['seq_len'], val_pathx, val_pathy)
val_data_loader = DataLoader(val_data, batch_size=data_config['batch_size'])

test_data = MSRA(data_config['seq_len'], test_pathx, test_pathy)
test_data_loader = DataLoader(test_data, batch_size=data_config['batch_size'])

(tensor([ 101, 2496, 3300,  749,  671, 2137, 4638, 2141, 1213, 1400, 8024,  800,
        2218, 2768, 4989,  749, 3636,  721, 1344, 7028, 4157, 2141, 4500, 5826,
        1062, 1385, 8024,  679,  788, 6566, 6569,  711, 5823, 1093, 2990,  897,
        2825, 3318, 2900, 2193, 1469, 5826, 4905, 8024, 5445,  684, 6566, 6569,
        1333, 3160,  807, 6579, 8024,  772, 1501, 1726, 3119, 8024, 5307, 5632,
        2346, 1217, 2339, 8024, 2772, 1139, 1366,  510, 2772, 1079, 7218, 8024,
         794, 5445,  886, 7770, 3946, 7676, 5823, 3420, 1824, 2825, 3318, 6813,
        6862, 2810, 3141, 1168, 3851, 6205, 1298, 2255, 1277, 4638, 8029, 8028,
        1914,  702, 1344, 2356, 8024, 8029, 8028, 8028, 1914,  702,  740, 7252,
        8024, 3420, 1824, 6226, 3563, 4507, 8029, 8037, 8037, 8029, 2399, 4638,
        8030, 8031,  674, 6150, 1872, 1217, 1168, 8029, 8037, 8037, 8033, 2399,
        4638, 8031, 8028, 8028, 8028,  674, 6150, 8024,  788, 3634,  671, 7555,
        2218,  886, 2496, 1765, 1093, 3

In [None]:
# evaluate trained model on some cases
def evaluation_case(model, path_x=MAIN_PATH+'msra/val/sentences.txt', path_y=MAIN_PATH+'msra/val/tags.txt', n=1):
    Iter_val = train_data_iter(path_x, path_y, data_config, shuffle=False)
    x,y = next(Iter_val)
    tag_pred = model(x)
    for sent, tag in zip(x[:n].numpy(), tag_pred[:n]):
        print('\t',[token_idx_r.get(i,'')+':'+tag_idx_r[j] for i,j in zip(sent, tag)])

# train model and evaluation
def train(model_config, train_config, device=device):
    
    print('training start...')
    print('[params]:')
    print('\tmodel params:', model_config)
    print('\ttrain parmas:', train_config)

    # build model
    bertmodel = BertForTokenClassification.from_pretrained('bert-base-chinese', num_labels=model_config['target_size']).to(device)
    print('[build model]:')
    print(bertmodel.parameters())
    
    # opt
    optimizer = AdamW(bertmodel.parameters(), lr=train_config['lr'])
    
    # train
    for epoch in range(train_config['epochs']):
        
        # record loss every epoch
        loss_value = []
        
        for token, mask, target in train_data_loader:

            # model init
            optimizer.zero_grad()
            bertmodel.train()
            
            loss = bertmodel(input_ids=token, attention_mask=mask, labels=target)
            
            # record
            loss_value.append(loss.item())
            
            # weight update
            loss.backward()
            optimizer.step()
            
        print('[epoch %d]\tloss=%s' % (epoch, np.mean(loss_value)))
        #print('all loss:', loss_value)
        print('[evaluation]:')
        #evaluation_case(model, n=2)
        
    return model

model = train(model_config, train_config)

training start...
[params]:
	model params: {'target_size': 7}
	train parmas: {'epochs': 10, 'lr': 0.001}
[build model]:
<generator object Module.parameters at 0x1adcf1afc0>


In [23]:
#test

def test(model, out_pathx, out_pathy, path_x=MAIN_PATH+'msra/test/sentences.txt', path_y=MAIN_PATH+'msra/test/tags.txt'):
    Iter_val = train_data_iter(path_x, path_y, data_config, shuffle=False)
    ox = open(out_pathx, 'w')
    oy = open(out_pathy, 'w')
    
    model.eval()
    for x,y in tqdm_notebook(Iter_val):
        with torch.no_grad():
            tag_pred = model(x)
            for sent, tag in zip(x.numpy(), tag_pred):
                sent_decode = ' '.join([token_idx_r[i] for i in sent])
                tag_decode = ' '.join([tag_idx_r[i] for i in tag])
                ox.write(sent_decode + '\n')
                oy.write(tag_decode + '\n')
    ox.close()
    oy.close()
    
test(model, out_pathx='result/lstm_crf_sentences.txt', out_pathy='result/lstm_crf_tags.txt')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [25]:
# evaluation
from utils.evaluation import f1_score_from_path

test_x = 'msra/test/sentences.txt'
test_y = 'msra/test/tags.txt'
pred_y = 'result/lstm_crf_tags.txt'
pred_x = 'result/lstm_crf_sentences.txt' # Because of padding, the length of prediction may be shorter than true label

micro_score = f1_score_from_path(test_x, test_y, pred_y, pred_x, f1_type='mirco')
macro_score = f1_score_from_path(test_x, test_y, pred_y, pred_x, f1_type='marco')
print('micro : %s \t macro : %s' % (micro_score, macro_score))

micro : 0.8097027292120667 	 macro : 0.8097027292120667
