<h1 style="text-align:center">基于 BiLSTM-Attention 的实体消歧系统示例</h1>

In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell is executed, so edits to the local modules under ./src are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# 导入模块
import sys

sys.path.append("./src")

import os
import re
import json
import numpy as np
import utils
from data_process import *
from model import Model

In [3]:
# Load the pretrained matrix and the vocabulary.
# `vocab` is a dict; vocab["w2i"] (word -> index) is what the encoder below
# consumes. The original note also mentions a vocab["i2w"] entry.
matrix = np.load('./data/pretrain_data/matrix.npy')
with open('./data/pretrain_data/vocab.json', 'r', encoding='utf8') as f:
    vocab = json.load(f)

# Candidate-entity dictionaries for the mentions.
# Both cached files must exist before we trust the cache: the original check
# only looked at cand.json and then crashed if entity.json was missing.
cand_path = './data/generated/cand.json'
ent_path = './data/generated/entity.json'
if os.path.exists(cand_path) and os.path.exists(ent_path):
    with open(cand_path, 'r', encoding='utf8') as f:
        cand_dic = json.load(f)
    with open(ent_path, 'r', encoding='utf8') as f:
        ent_dic = json.load(f)
else:
    # No (complete) cache: rebuild both dictionaries from the knowledge base.
    cand_dic, ent_dic = GenerateCand('kb.json')

# Instantiate the data encoder.
data_encoder = DataEncoder(vocab["w2i"], utils.type2label, user_word_dict="./data/generated/mention.txt")

# Instantiate the model and restore the trained weights.
model = Model(matrix, utils.param)
# map_location='cpu' lets a checkpoint saved on a GPU be read on a CPU-only
# machine; load_state_dict then copies the tensors into the model's parameters.
checkpoint = torch.load('./weights/ckpt_best_2.pth', map_location='cpu')
# This notebook only runs inference, so switch to evaluation mode.
# NOTE(review): harmless if Model.predict already does this itself.
model.eval()
# Kept as the last expression so the cell still displays the key-matching report.
model.load_state_dict(checkpoint['net'])

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Dumping model to file cache /tmp/jieba.cache
Dump cache file failed.
Traceback (most recent call last):
  File "/data/zhangruochi/miniconda3/envs/tryit/lib/python3.8/site-packages/jieba/__init__.py", line 152, in initialize
    _replace_file(fpath, cache_file)
PermissionError: [Errno 1] Operation not permitted: '/tmp/tmpva168foy' -> '/tmp/jieba.cache'
Loading model cost 0.895 seconds.
Prefix dict has been built succesfully.


<All keys matched successfully>

In [4]:
# One sample to disambiguate: the raw text plus the mentions found in it.
# Each "offset" is the 0-based character index (stored as a string) at which
# the mention starts inside "text". The last three offsets were previously off
# by one (the "·" in "托尼·利普" had not been counted).
predict_line = {
    "text_id": "3",
    "text": "《绿皮书》托尼·利普和唐博士，配上这首歌，网友：这种情愫有点嗲",
    "mention_data": [
        {
            "mention": "《绿皮书》",
            "offset": "0"
        },
        {
            "mention": "托尼·利普",
            "offset": "5"
        },
        {
            "mention": "唐博士",
            "offset": "11"
        },
        {
            "mention": "歌",
            "offset": "19"
        },
        {
            "mention": "情愫",
            "offset": "26"
        }
    ]
}

# Sanity check: every offset must point at its mention in the text.
assert all(
    predict_line["text"].startswith(m["mention"], int(m["offset"]))
    for m in predict_line["mention_data"]
), "mention offset does not match the text"

In [5]:

# Write the sample to disk as JSON (non-ASCII characters kept as-is).
with open("./data/basic_data/predict.json", 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(predict_line, ensure_ascii=False))

# Generate the text samples used for prediction.
GeneratePairwaiseSample('predict.json', cand_dic, ent_dic, is_train=False)

# Encode the generated samples.
data_encoder.data_encode("./data/generated/predict_data.txt", is_train=False)

# Dataset wrapper plus a batched loader over it.
predict_set = DataSet("./data/generated/predict.csv", is_train=False)
test_loader = DATA.DataLoader(
    predict_set,
    batch_size=8,
    collate_fn=utils.collate_fn_test,
)

Encode ./data/generated/predict_data.txt: 28it [00:00, 1159.48it/s]


In [6]:
# Run the model over every batch and accumulate the predictions in `result`.
result = {}
# Inference only: disable gradient tracking so no autograd graph is built
# for the forward passes.
with torch.no_grad():
    for test_data in test_loader:
        id_list, query, offset, cand_desc, seq_len = test_data
        # Forward pass.
        pre_label, pre_type = model.predict(query, offset, cand_desc, seq_len)
        # Record the predictions; pre_label is normalised with a softmax
        # over its last dimension before being stored.
        result = utils.record(result, id_list, torch.softmax(pre_label, dim=-1), pre_type)

In [7]:
# Attach the predictions to each mention, then print a readable report.
data = [predict_line]

for sample in data:
    sample_result = result[sample['text_id']]
    # Mentions are updated in place; results are keyed by the mention's
    # position in the list, as a string.
    for mention_idx, mention in enumerate(sample["mention_data"]):
        mention_result = sample_result[str(mention_idx)]
        mention['pre_id'] = mention_result['pre_id']
        # Default to the type predicted by the model ...
        type_idx = mention_result['pre_type'].argmax().item()
        mention['pre_type'] = utils.lable2type[type_idx]
        # ... but when an entity was linked, use its description and type instead.
        if mention['pre_id'] != 'NIL':
            linked_entity = ent_dic[mention['pre_id']]
            mention["pre_desc"] = linked_entity['ent_desc']
            mention['pre_type'] = linked_entity['type']

# Print the report.
for sample in data:
    print("文本:\t", sample['text'], '\n')
    for mention in sample['mention_data']:
        print("实体:\t", mention['mention'])
        print("类型:\t", mention['pre_type'])
        if mention['pre_id'] != 'NIL':
            print('描述:\t', mention['pre_desc'])
        print('\n')

文本:	 《绿皮书》托尼·利普和唐博士，配上这首歌，网友：这种情愫有点嗲 

实体:	 《绿皮书》
类型:	 Other


实体:	 托尼·利普
类型:	 Other


实体:	 唐博士
类型:	 Other


实体:	 歌
类型:	 Other


实体:	 情愫
类型:	 Work
描述:	 摘要:《情愫》是由七月有尾巴创作的情诗，发表于中国诗歌网。;作品名称:情愫;作品出处:中国诗歌网;作者:七月有尾巴;义项描述:七月有尾巴创作的诗;文学体裁:七绝;创作年代:2017.11.24;


