In [1]:
import time
import sys
import argparse
import random
import copy
import torch
import gc
# import cPickle as pickle
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

In [2]:
import json
from tqdm import tqdm
import os
from random import choice
from itertools import groupby

In [3]:
import jieba

In [4]:
# from ner import *
from ner_utils.metric import get_ner_fmeasure
from ner_utils.data import Data
from KB import KB

In [5]:
def load_data(data_directory):
    """Load the train/dev/test splits from a single JSON file.

    Args:
        data_directory: Path of the JSON file (despite the name, this is a
            file path, not a directory). The top-level JSON value must unpack
            into exactly three items: train, dev and test.

    Returns:
        Tuple ``(train_data, dev_data, test_data)`` as decoded by ``json.load``.

    Raises:
        ValueError: If the top-level JSON value does not hold exactly three items.
    """
    # Read explicitly as UTF-8 so that decoding the (Chinese) text does not
    # depend on the platform's default locale encoding.
    with open(data_directory, encoding='utf-8') as f:
        train_data, dev_data, test_data = json.load(f)
    print('traindata size:', len(train_data))
    print('devdata size:', len(dev_data))
    print('testdata size', len(test_data))
    return train_data, dev_data, test_data

In [6]:
def generate_ner_data(data):
    """Turn annotated samples into parallel character, segment and tag lists.

    Every sample is read through the keys ``'text'``, ``'mention_data'`` and
    ``'text_id'``; each mention unpacks into ``(mention, offset, kb_id)``.
    Characters start out tagged ``'O'``. A mention longer than one character
    is tagged ``'B'`` at its offset, ``'E'`` on its last character and ``'M'``
    in between; any other mention is tagged ``'S'`` at its offset. Mentions
    are applied in order, so a later mention overwrites earlier tags.

    Args:
        data: Sequence of sample dicts.

    Returns:
        Tuple ``(char, seg, label, ids, kb_ids)`` of lists, one entry per
        sample: the characters, the ``jieba.cut`` segmentation, the tag
        sequence, the text id and the list of mention kb ids.
    """
    all_chars, all_segments, all_tags, all_text_ids, all_kb_ids = [], [], [], [], []
    for sample in tqdm(data):
        chars = list(sample['text'])
        tags = ['O'] * len(chars)
        mention_kb_ids = []
        for mention, offset, kb_id in sample['mention_data']:
            mention_kb_ids.append(kb_id)
            span = len(mention)
            if span <= 1:
                tags[offset] = 'S'
                continue
            last = offset + span - 1
            tags[offset] = 'B'
            tags[last] = 'E'
            for inner in range(offset + 1, last):
                tags[inner] = 'M'
        all_segments.append(list(jieba.cut(sample['text'])))
        all_chars.append(chars)
        all_tags.append(tags)
        all_text_ids.append(sample['text_id'])
        all_kb_ids.append(mention_kb_ids)
    return all_chars, all_segments, all_tags, all_text_ids, all_kb_ids

In [7]:
def data_initialization(data, kb, train_file, dev_file, test_file):
    """Build the alphabets of ``data`` from all three splits and the KB.

    Args:
        data: Object exposing ``build_alphabet``, ``build_kb``,
            ``build_all_alphabet`` and ``fix_alphabet``.
        kb: Knowledge-base entries handed to ``data.build_kb``.
        train_file: Indexable whose items 0 and 1 are passed to the alphabet
            builders (the notebook passes ``[chars, labels]``).
        dev_file: Same layout as ``train_file``, for the dev split.
        test_file: Same layout as ``train_file``, for the test split.

    Returns:
        The same ``data`` object, after ``fix_alphabet`` has been called.
    """
    splits = (train_file, dev_file, test_file)
    for split in splits:
        data.build_alphabet(split[0], split[1])
    data.build_kb(kb)
    # Scan every split here as well. The previous code passed train_file three
    # times, so dev and test never contributed to these alphabets.
    for split in splits:
        data.build_all_alphabet(split[0], split[1])
    data.fix_alphabet()
    return data

def train(data, save_model_dir, seg=True):
    """Train the tagger, evaluating on the dev and test sets after every epoch.

    For each epoch the training instances are shuffled and fed one at a time.
    Per-instance losses are accumulated and back-propagated each time the
    running instance count is a multiple of ``data.HP_batch_size``. After the
    epoch the model is evaluated on "dev"; when the dev score beats the best
    score so far, the model's ``state_dict`` is saved to
    ``save_model_dir + '.' + str(epoch) + '.model'``. The model is then
    evaluated on "test" for reporting only.

    Args:
        data: Settings/instances object. Reads ``HP_lr``, ``HP_momentum``,
            ``HP_lr_decay``, ``HP_batch_size``, ``HP_gpu`` and ``train_Ids``.
            ``data.HP_iteration`` is overwritten with 100 and
            ``data.train_Ids`` is shuffled in place.
        save_model_dir: Path prefix for the saved settings file (``.dset``)
            and the per-epoch model files.
        seg: When true, the dev f-score selects the best model and p/r/f are
            printed; otherwise accuracy is used.

    Returns:
        None.

    NOTE(review): ``model`` is not created inside this function (the
    construction line is commented out below), so a module-level ``model``
    must exist before calling or the first use raises NameError. The helpers
    ``save_data_setting``, ``lr_decay``, ``batchify_with_label``,
    ``predict_check`` and ``evaluate`` are likewise expected at module level.
    """
    print("Training model...")
    data.show_data_summary()
    save_data_name = save_model_dir + ".dset"
    save_data_setting(data, save_data_name)
    # TODO: model construction is disabled; restore it once SeqModel is importable.
    #     model = SeqModel(data)
    print("finished built model.")
    # NOTE(review): loss_function is never used below; the loss comes from
    # model.neg_log_likelihood_loss.
    loss_function = nn.NLLLoss()
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(parameters, lr=data.HP_lr, momentum=data.HP_momentum)
    best_dev = -1
    data.HP_iteration = 100
    ## start training
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" % (idx, data.HP_iteration))
        optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        sample_loss = 0
        batch_loss = 0
        total_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
        ## set model in train mode
        model.train()
        model.zero_grad()
        # Only batch size 1 is supported for the forward pass; gradients are
        # accumulated until data.HP_batch_size instances have been seen.
        batch_size = 1
        train_num = len(data.train_Ids)
        total_batch = train_num // batch_size + 1
        for batch_id in range(total_batch):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            if not instance:
                # The "+ 1" in total_batch can yield an empty trailing slice.
                continue
            gaz_list, batch_word, batch_biword, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(instance, data.HP_gpu)
            loss, tag_seq = model.neg_log_likelihood_loss(gaz_list, batch_word, batch_biword, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask)
            right, whole = predict_check(tag_seq, batch_label, mask)
            right_token += right
            whole_token += whole
            # NOTE(review): `loss.data[0]` is the PyTorch 0.3 idiom (this
            # notebook reports 0.3.1); newer releases use `loss.item()`.
            sample_loss += loss.data[0]
            total_loss += loss.data[0]
            batch_loss += loss

            if end % 500 == 0:
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" % (end, temp_cost, sample_loss, right_token, whole_token, (right_token + 0.) / whole_token))
                sys.stdout.flush()
                sample_loss = 0
            if end % data.HP_batch_size == 0:
                batch_loss.backward()
                optimizer.step()
                model.zero_grad()
                batch_loss = 0
        temp_time = time.time()
        temp_cost = temp_time - temp_start
        print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" % (end, temp_cost, sample_loss, right_token, whole_token, (right_token + 0.) / whole_token))
        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s" % (idx, epoch_cost, train_num / epoch_cost, total_loss))
        speed, acc, p, r, f, _ = evaluate(data, model, "dev")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish

        if seg:
            current_score = f
            print("Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (dev_cost, speed, acc, p, r, f))
        else:
            current_score = acc
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" % (dev_cost, speed, acc))

        if current_score > best_dev:
            if seg:
                print("Exceed previous best f score:", best_dev)
            else:
                print("Exceed previous best acc score:", best_dev)
            model_name = save_model_dir + '.' + str(idx) + ".model"
            torch.save(model.state_dict(), model_name)
            best_dev = current_score
        ## decode test
        speed, acc, p, r, f, _ = evaluate(data, model, "test")
        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if seg:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (test_cost, speed, acc, p, r, f))
        else:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" % (test_cost, speed, acc))
        gc.collect()


In [8]:
# Seed Python's `random`, PyTorch and NumPy with the same fixed value, in that
# order, so that reruns draw the same random numbers.
seed_num = 100
for _seed_rng in (random.seed, torch.manual_seed, np.random.seed):
    _seed_rng(seed_num)

In [9]:
# Command-line style configuration. Options are registered in the order listed
# so the generated help text keeps its layout.
parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-CRF')
_ARGUMENT_SPECS = (
    ('--embedding', dict(help='Embedding for words', default='None')),
    ('--status', dict(choices=['train', 'test', 'decode'], help='update algorithm', default='train')),
    ('--savemodel', dict(default="data/model/saved_model.lstmcrf.")),
    ('--savedset', dict(help='Dir of saved data setting', default="data/save.dset")),
    ('--data', dict(default="data/all_data.json")),
    ('--seg', dict(default="True")),
    ('--extendalphabet', dict(default="True")),
    ('--loadmodel', dict()),
    ('--output', dict()),
)
for _flag, _options in _ARGUMENT_SPECS:
    parser.add_argument(_flag, **_options)
# Parse an empty argument list so every option takes its default
# (presumably because the notebook kernel's own sys.argv is not meant for this
# parser -- TODO confirm).
args = parser.parse_args([])

In [10]:
# Unpack the parsed options into the module-level settings used by later cells.
data_dir, model_dir, dset_dir = args.data, args.loadmodel, args.savedset
kb_file = None
output_file = args.output
# --seg arrives as text; only the (case-insensitive) string "true" enables it.
seg = args.seg.lower() == "true"
status = args.status.lower()
save_model_dir = args.savemodel
gpu = torch.cuda.is_available()


In [11]:
# Paths of the character / bi-character embedding files; None for both here.
# They are later passed to data.build_word_pretrain_emb and
# data.build_biword_pretrain_emb.
char_emb = bichar_emb = None

In [12]:
# Echo the effective run settings, one "label value" line each.
_run_settings = [
    ("CuDNN:", torch.backends.cudnn.enabled),
    ("GPU available:", gpu),
    ("Status:", status),
    ("Seg: ", seg),
    ("Data dir:", data_dir),
    ("Char emb:", char_emb),
    ("Bichar emb:", bichar_emb),
    ("KB file:", kb_file),
]
if status == 'train':
    # The save location is only relevant when a model is going to be trained.
    _run_settings.append(("Model saved to:", save_model_dir))
for _label, _value in _run_settings:
    print(_label, _value)
sys.stdout.flush()

CuDNN: True
GPU available: True
Status: train
Seg:  True
Data dir: data/all_data.json
Char emb: None
Bichar emb: None
KB file: None
Model saved to: data/model/saved_model.lstmcrf.


In [13]:
# Number of knowledge-base entries kept for this run; the full KB is truncated
# to this many leading entries (presumably to keep experiments fast -- TODO
# confirm before a full training run).
KB_ENTRY_LIMIT = 100

# Load the splits from the configured --data path (data_dir) instead of a
# hard-coded copy of the default, so the setting printed as "Data dir" is the
# one actually used.
train_data, dev_data, test_data = load_data(data_dir)
kb_data = KB('./ccks2019_el/kb_data')
kb = kb_data.kb[:KB_ENTRY_LIMIT]

2330it [00:00, 23283.54it/s]

traindata size: 84262
devdata size: 851
testdata size 852
start loading kb_data...
construct id2kb dict...


399252it [00:18, 21966.73it/s]


construct kb2id dict...
KB DATA INFORMATION
TOKEN SIZE:303375
ID SIZE:399233
TYPE SIZE:51
PREDICATE SIZE:41841


In [14]:
# Convert each split (train, dev, test -- in that order) into its
# character / segment / label / text-id / kb-id lists.
(
    (train_char, train_seg, train_label, train_ids, train_kb_ids),
    (dev_char, dev_seg, dev_label, dev_ids, dev_kb_ids),
    (test_char, test_seg, test_label, test_ids, test_kb_ids),
) = [generate_ner_data(_split) for _split in (train_data, dev_data, test_data)]

  0%|          | 0/84262 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.077 seconds.
Prefix dict has been built succesfully.
100%|██████████| 84262/84262 [00:18<00:00, 4574.64it/s]
100%|██████████| 851/851 [00:00<00:00, 4904.58it/s]
100%|██████████| 852/852 [00:00<00:00, 4665.20it/s]


In [15]:
if status == 'train':
    # Create the settings object and apply this run's overrides in a fixed order.
    data = Data()
    _overrides = (
        ('HP_gpu', gpu),
        ('HP_use_char', False),
        ('HP_batch_size', 1),
        ('use_bigram', False),
        ('gaz_dropout', 0.5),
        ('norm_gaz_emb', False),
        ('HP_fix_gaz_emb', False),
    )
    for _attribute, _setting in _overrides:
        setattr(data, _attribute, _setting)

    # Build the alphabets first, then the instances of each split.
    data_initialization(
        data,
        kb,
        [train_char, train_label],
        [dev_char, dev_label],
        [test_char, test_label],
    )
    _split_inputs = (
        (train_char, train_label, 'train'),
        (dev_char, dev_label, 'dev'),
        (test_char, test_label, 'test'),
    )
    for _chars, _labels, _split_name in _split_inputs:
        data.generate_instance_with_kb(_chars, _labels, _split_name)

    # Embedding sources are passed as configured earlier (all None in this notebook).
    data.build_word_pretrain_emb(char_emb)
    data.build_biword_pretrain_emb(bichar_emb)
    data.build_kb_pretrain_emb(kb_file)

KB total size: 100
gaz alphabet size: 2
kb alphabet size: 70
gaz alphabet size: 2
kb alphabet size: 70
gaz alphabet size: 2
kb alphabet size: 70
build word pretrain emb...
Embedding:
     pretrain word:0, prefect match:0, case_match:0, oov:6253, oov%:0.9998401023345059
build biword pretrain emb...
Embedding:
     pretrain word:0, prefect match:0, case_match:0, oov:302399, oov%:0.9999966931216931
build gaz pretrain emb...
Embedding:
     pretrain word:0, prefect match:0, case_match:0, oov:69, oov%:0.9857142857142858


In [16]:
# Record which PyTorch build produced the results in this notebook.
import torch as t
_installed_torch_version = t.__version__
print(_installed_torch_version)

0.3.1.post2
