<h1 style="text-align:center">基于 BiLSTM-Attention 的实体消歧系统示例</h1>

In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
# Import modules
import sys
import torch

# Extend the module search path with ./src; must run before the local imports
# below (utils, data_process, model are presumably located there — confirm).
sys.path.append("./src")
# Root directory of the entity-linking data files used by the cells below.
common_path = '../data/entity_link'
# Prefer the first CUDA device when one is available, otherwise use the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

import os
import re
import json
import numpy as np
import utils
# NOTE(review): wildcard import. Later cells use GenerateCand, DataEncoder,
# GeneratePairwaiseSample, DataSet and DATA, which presumably come from here —
# consider importing them explicitly once confirmed.
from data_process import *
from model import Model

In [3]:
# Load the pretrained embedding matrix and the vocabulary.
# vocab contains vocab["w2i"] (word -> index) and vocab["i2w"]
# (presumably index -> word; the original note was truncated — confirm).
matrix = np.load(os.path.join(common_path, 'pretrain_data/matrix.npy'))
with open(os.path.join(common_path, 'pretrain_data/vocab.json'), 'r', encoding='utf8') as f:
    vocab = json.load(f)

# Build the candidate-entity set for mentions.
# Both cache files are read together, so both must exist before taking the
# cached path; otherwise regenerate them from the knowledge base.
cand_path = os.path.join(common_path, 'generated/cand.json')
entity_path = os.path.join(common_path, 'generated/entity.json')
if os.path.exists(cand_path) and os.path.exists(entity_path):
    with open(cand_path, 'r', encoding='utf8') as f:
        cand_dic = json.load(f)
    with open(entity_path, 'r', encoding='utf8') as f:
        ent_dic = json.load(f)
else:
    cand_dic, ent_dic = GenerateCand('kb.json')

# Instantiate the data encoder
data_encoder = DataEncoder(vocab["w2i"],
                           utils.type2label,
                           user_word_dict=os.path.join(common_path, "generated/mention.txt"))

# Instantiate the model
model = Model(matrix, utils.param)
# This notebook only runs inference, so put the model in evaluation mode
# (assumes Model is a torch.nn.Module, as its load_state_dict method suggests).
# Called before load_state_dict so that call's result remains the cell output.
model.eval()
# Restore the trained weights stored under the checkpoint's 'net' key,
# remapping tensors onto the device selected earlier.
model.load_state_dict(
    torch.load('../ckpt/entity_link/weights/ckpt_best_2.pth',
               map_location=torch.device(device=device))['net'])

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/19/dgxwsbgd76728hx577833nx00000gn/T/jieba.cache
Loading model cost 0.623 seconds.
Prefix dict has been built successfully.


<All keys matched successfully>

In [4]:
# Single prediction request: a text plus the mentions to disambiguate.
# Each "offset" is the character index (as a string) at which the mention
# starts inside "text". The last three offsets were one position too small
# for this text (they matched the text without the "·" in "托尼·利普").
predict_line = {
    "text_id": "3",
    "text": "《绿皮书》托尼·利普和唐博士，配上这首歌，网友：这种情愫有点嗲",
    "mention_data": [
        {"mention": "《绿皮书》", "offset": "0"},
        {"mention": "托尼·利普", "offset": "5"},
        {"mention": "唐博士", "offset": "11"},
        {"mention": "歌", "offset": "19"},
        {"mention": "情愫", "offset": "26"},
    ],
}

# Sanity check: every offset must point at its mention, so that editing the
# text or a mention without updating the offsets fails here rather than
# silently feeding misaligned spans to the model.
for _mention in predict_line["mention_data"]:
    _start = int(_mention["offset"])
    _span = predict_line["text"][_start:_start + len(_mention["mention"])]
    assert _span == _mention["mention"], (
        "offset %s does not locate mention %r (found %r)"
        % (_mention["offset"], _mention["mention"], _span)
    )

In [5]:

# Persist the request as JSON (non-ASCII characters kept as-is) so the
# data-processing helpers can read it from disk.
predict_json_path = os.path.join(common_path, "basic_data/predict.json")
with open(predict_json_path, 'w', encoding='utf-8') as jsonfile:
    json.dump(predict_line, jsonfile, ensure_ascii=False)

# Generate the text samples for prediction
GeneratePairwaiseSample('predict.json', cand_dic, ent_dic, is_train=False)

# Encode the generated samples
predict_txt_path = os.path.join(common_path, "generated/predict_data.txt")
data_encoder.data_encode(predict_txt_path, is_train=False)

# Build the dataset interface over the prediction CSV
predict_csv_path = os.path.join(common_path, "generated/predict.csv")
predict_set = DataSet(predict_csv_path, is_train=False)

# Batch the dataset with the test-time collate function
test_loader = DATA.DataLoader(predict_set, batch_size=8, collate_fn=utils.collate_fn_test)

Encode ../data/entity_link/generated/predict_data.txt: 28it [00:00, 1064.97it/s]


In [6]:
# Prediction results, accumulated across batches by utils.record
result = {}
# Inference only: disable gradient tracking so no autograd graph is built
# or kept alive by the tensors stored in `result`.
with torch.no_grad():
    for id_list, query, offset, cand_desc, seq_len in test_loader:
        # Forward pass
        pre_label, pre_type = model.predict(query, offset, cand_desc, seq_len)
        # Record the predictions; label scores are normalised with softmax
        # over the last dimension before being stored.
        result = utils.record(result, id_list, torch.softmax(pre_label, dim=-1), pre_type)

In [7]:
# Post-process the predictions and attach the printable fields to each mention.
# The mention dicts are updated in place, so predict_line itself carries the
# pre_id / pre_type / pre_desc fields afterwards.
data = [predict_line]

for line in data:
    res_line = result[line['text_id']]
    for mid, item in enumerate(line["mention_data"]):
        mention_pred = res_line[str(mid)]
        item['pre_id'] = mention_pred['pre_id']
        # Default to the type predicted by the model (arg-max over type scores)
        item['pre_type'] = utils.lable2type[mention_pred['pre_type'].argmax().item()]
        if item['pre_id'] == 'NIL':
            # No linked entity: keep the model-predicted type, no description
            continue
        # Linked entity: take description and type from the entity dictionary
        linked_entity = ent_dic[item['pre_id']]
        item["pre_desc"] = linked_entity['ent_desc']
        item['pre_type'] = linked_entity['type']

# Print the results
for record in data:
    print("文本:\t", record['text'], '\n')
    for mention in record['mention_data']:
        print("实体:\t", mention['mention'])
        print("类型:\t", mention['pre_type'])
        if mention['pre_id'] != 'NIL':
            print('描述:\t', mention['pre_desc'])
        print('\n')

文本:	 《绿皮书》托尼·利普和唐博士，配上这首歌，网友：这种情愫有点嗲 

实体:	 《绿皮书》
类型:	 Other


实体:	 托尼·利普
类型:	 Other
描述:	 出生地:美国宾夕法尼亚州比弗福尔斯;外文名:Tony Lip;摘要:托尼·利普（Tony Lip，原名Frank Anthony Vallelonga，1930年7月30日-2013年1月4日），是电影《绿皮书》中白人司机的原型。;逝世日期:2013年1月4日;别名:Frank Anthony Vallelonga;义项描述:托尼·利普;中文名:托尼·利普;国籍:美国;出生日期:1930年7月30日;


实体:	 唐博士
类型:	 Other


实体:	 歌
类型:	 Work
描述:	 摘要:《歌》是郭晓春演唱的歌曲。;谱曲:郭晓春;歌曲原唱:郭晓春;填词:郭晓春;发行公司:杭州回声文化艺术策划有限公司;中文名称:歌;义项描述:《歌》是郭晓春演唱的歌曲;


实体:	 情愫
类型:	 Work
描述:	 摘要:《情愫》是歌手方季惟所演唱的一首歌曲，所属专辑：心痛不再有。;谱曲:蔡宣辉;歌曲原唱:方季惟;填词:蔡宣辉;编曲:丹尼;中文名称:情愫;义项描述:方季惟演唱歌曲;所属专辑:《心痛不再有》;标签:娱乐;


