In [1]:
import json
from tqdm import tqdm
import os
from random import choice
from itertools import groupby
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
class KB(object):
    def __init__(self,kb_directory):
        print("start loading kb_data...")
        self.kb_directory = kb_directory
        self.id2kb,self.types,self.predicate = self.get_id2kb()
        self.kb2id = self.get_kb2id()
        self.kb = list(self.kb2id.keys())
        self.id = list(self.id2kb.keys())
        print("KB DATA INFORMATION")
        print("TOKEN SIZE:{}".format(self.get_token_size()))
        print("ID SIZE:{}".format(len(self)))
        print("TYPE SIZE:{}".format(len(self.types)))
        print("PREDICATE SIZE:{}".format(len(self.predicate)))
    def get_id2kb(self):
        print("construct id2kb dict...")
        id2kb = {}
        kbtype = set()
        predicate = set()
        multi_type = []
        with open(self.kb_directory) as f:
            for l in tqdm(f):
                tmp = json.loads(l)
                subject_id = tmp['subject_id']
                subject_alias = list(set([tmp['subject']] + tmp.get('alias', [])))
                subject_alias = [alias.lower() for alias in subject_alias]
                subject_type = [i.lower() for i in tmp['type']]
                kbtype.update(subject_type)
                try:
                    assert(len(tmp['type'])==1)
                except AssertionError:
                    multi_type.append(tmp['type'])
                subject_data = {}
                for i in tmp['data']:
                    predicate.add(i['predicate'].lower())
                    subject_data[i['predicate'].lower()] = i['object'].lower()
                if subject_data:
                    id2kb[subject_id] = {'alias': subject_alias, 'data': subject_data,'type':subject_type}
#         print(multi_type)
        return id2kb,kbtype,predicate
    def get_kb2id(self):
        print("construct kb2id dict...")
        kb2id = {}
        for i,j in self.id2kb.items():
            for k in j['alias']:
                if k not in kb2id:
                    kb2id[k] = []
                kb2id[k].append(i)
        return kb2id
    def __len__(self):
        return len(self.id2kb)
    def get_token_size(self):
        return len(self.kb)
#     def save(self):
        


In [3]:
kb_data = KB('./ccks2019_el/kb_data')

4355it [00:00, 21006.03it/s]

start loading kb_data...
construct id2kb dict...


399252it [00:18, 21052.67it/s]


construct kb2id dict...
KB DATA INFORMATION
TOKEN SIZE:303375
ID SIZE:399233
TYPE SIZE:51
PREDICATE SIZE:41841


In [4]:
print(len(kb_data))

399233


type2label = {'PER':['athlete','entertainmentperson','fictionalhuman',\
                     'person','human','familyname','organization'],
              'LOC':['nation','place','realestate','medicaldepartmenttype','country'],
              'ORG':['collegeoruniversity','scientificorganization','organism',\
                     'communicationmedium','internationalorganization','building','organization'],
              'MISC':['medicalcondition','animal','formula',\
                      'tvshow','event','academicdiscipline',\
                     'currency','tool','astronomicalobject',\
                     'brand','tvplay','vocabulary','culturalheritage',\
                      'dynasty','zodiacsign','thing','food','language',\
                     'material','chemicalelement','theorem','game',\
                     'creativework','historicalperiod','awardeventseries','plant',\
                     'product','fictionalthing','familyname','movie','symbol','curriculum']}

label2type = {}
for label,types in type2label.items():
    for j in types:
        label2type[j] = label
print(label2type)

In [7]:
print(kb_data.types)

{'internationalorganization', 'theorem', 'entertainmentperson', 'communicationmedium', 'symbol', 'curriculum', 'realestate', 'product', 'fictionalhuman', 'food', 'awardeventseries', 'language', 'place', 'animal', 'familyname', 'chemicalelement', 'academicdiscipline', 'fictionalthing', 'tool', 'medicaldepartmenttype', 'organism', 'thing', 'plant', 'country', 'historicalperiod', 'formula', 'nation', 'scientificorganization', 'culturalheritage', 'building', 'athlete', 'organization', 'tvshow', 'brand', 'movie', 'educationmajor', 'zodiacsign', 'collegeoruniversity', 'tvplay', 'material', 'creativework', 'astronomicalobject', 'dynasty', 'historicalperson', 'event', 'currency', 'medicalcondition', 'human', 'game', 'person', 'vocabulary'}


In [8]:
print(kb_data.predicate)

{'界面语言', '背景美術', '版本大小', '制作组', '学位学历', '武器特色', '母带师', '连载情况', '优酷上映', '车型尺寸', '植物类别', '形容的事物', '胞兄', '化名', '地铁日均客流', '著名太后', '本之义', '大致内容', '工作院系', '休闲设施', '经典语录', '著名高校', '功效主治', 'appstore', '预购时间', '著称于世', '特定动画人物', '出品机构', '面额', '作者朝代', '粤语读音', '关于媒商', '公交地铁', '组织代号', '矫正方法', '武术顾问', '采用网络', '其中之一', '最近更新内容', '英雄单位', '影片性质', '受众人群', '现节目主持', '录音后期', '代省长', '行政区划级别', '兴起国家', '燃烧三要素', '学校语文书', '中央界', '团体中角色', '装备特性', '青岛校区', '部活', '品牌主张', '达到', '特约编辑', '成立人', '中文直译', '现任总队长', '城市口号', '器乐录音师', '美国出版社', '拥有球', '最难忘的人', '学校吉祥物', '航速', '取景地点', '等级分类', '标识', '觉醒等级', '限制范围', '上映', '游戏容量', '功能特点', '线路类型', '动画后期', '日语版', '含义vi.', '烧制温度', '根椐地', '运营环境', '工作性质', '碎纸能力', '从警经历', '平均运动', '剧情年代', '注册资产', '区花区树', '五个首脑', 'imdb平分', '三十五回', '适用对向', '游戏言语', '经典语句', '召唤时间', '常用声线', '教义', '纲英文名', '魂兽形态', '最爱的点心', '男性阿斯泰坦', '主要运用', '策划/剧情', '统一政权', '曲目顺序', '正式登场', '珍品', '市民广场', '发卡银行', '译制', '颁奖礼司仪', '文学风格', '人物善恶', '感光器件', '导演人', '承制', '公司经营', '剧幕', '必经之地', '曾用假名', '支持帐号', '雀形目鸟', '全美累计票房', '北周', '提升途径

In [9]:
multi_ids = []
for kbs,ids in kb.kb2id.items():
    if len(ids)>1:
        multi_ids.append(kbs)


In [10]:
print(len(multi_ids))
print(multi_ids[:10])

for kbs in multi_ids[:1]:
#     tmp = []
    for ids in kb_data.kb2id[kbs]:
        print(kb_data.id2kb[ids])
#         tmp.append(kb.id2kb[ids])
#     print(tmp)
    

89694
['胜利', '张三的歌', '七里香', '王平', '王平将军', '树大招风', '王超', '无尽武道', '滕王阁序', '莲宗寺']
{'alias': ['胜利'], 'data': {'摘要': '英雄联盟胜利系列皮肤是拳头公司制作的具有纪念意义限定系列皮肤之一。拳头公司制作的具有纪念意义限定系列皮肤还包括英雄联盟冠军系列皮肤、msi季中冠军赛征服者系列以及英雄联盟全球总决赛冠军系列皮肤。每到赛季结束时，拳头公司都会制作胜利系列皮肤作为赛季奖励来认可那些在排位赛中勇猛拼搏达到黄金段位的玩家。', '制作方': 'riot games', '外文名': 'victorious', '来源': '英雄联盟', '中文名': '胜利', '属性': '虚拟', '义项描述': '游戏《英雄联盟》胜利系列限定皮肤'}, 'type': ['thing']}
{'alias': ['胜利'], 'data': {'摘要': '胜利，汉语词汇。拼音：shèng lì胜利，指达到预期的目的。与“失败”相对。有“成功”的意思，古代打仗成功称胜利，比赛夺冠胜利称“成功”。其他寓意也很广泛(如：一件事坚持到了最后也称胜利)。胜利在英语中都为victory [victory ]', '外文名': 'win', '反义词': '失败', '拼音': 'shèng lì', '中文名': '胜利', '释义': '获得成功或达到目的', '义项描述': '汉语词语', '标签': '文化'}, 'type': ['vocabulary']}
{'alias': ['胜利'], 'data': {'摘要': '《胜利》是由[英] 约瑟夫·康拉德所著一部讽喻小说，新华出版社出版发行。', '作者': '[英] 约瑟夫·康拉德', 'isbn': '9787516620762', '书名': '胜利', '出版社': '新华出版社', '义项描述': '[英] 约瑟夫·康拉德所著小说'}, 'type': ['thing']}
{'alias': ['胜利'], 'data': {'摘要': '《胜利》是动漫原声的音乐作品,收录在《火影忍者疾风传ost》专辑中。', '发行时间': '2007-03', '音乐时长': '1分47秒', '所属专辑': '火影忍者疾风传

In [22]:
from topsim import TopSim

In [23]:
ts = TopSim(kb.kb)

In [36]:
print(kb_data.kb[258037])

地铁南京


In [37]:
print(ts.search('南京',k=50))

[(1.0, [216188]), (0.6, [159073]), (0.5, [159076]), (0.4, [262414, 81948, 140024, 164598]), (0.3333333333333333, [36210, 168069, 258037, 286950, 296174, 27034, 81947, 81949, 95281, 113955, 123409, 123556, 125630, 130826, 133979, 140029, 157474, 164600, 171633, 172232, 185119, 186896, 214670, 221729, 224545, 227151, 228484, 236183, 239889, 244184, 249849, 258038, 266560, 266667, 275771, 279219, 282965, 286949, 298008]), (0.2857142857142857, [670, 38997, 241398, 29364])]


In [12]:
data = []
with open('./ccks2019_el/train.json') as f:
    for l in tqdm(f):
        _ = json.loads(l)
        tmp = {
            'text_id':_['text_id'],
            'text': _['text'].lower(),
            'mention_data': [(x['mention'].lower(), int(x['offset']), x['kb_id'])
                for x in _['mention_data'] if x['kb_id'] != 'NIL'
            ]
        }
        if tmp['mention_data']:
            data.append(tmp)

90000it [00:01, 48306.78it/s]


In [13]:
print(len(data))

85965


mode = 0
random_order = np.arange(len(data))
np.random.shuffle(random_order)train_data,dev_data,test_data
test_data = [data[j] for i, j in enumerate(random_order) if i % 101 == mode]
dev_data = [data[j] for i, j in enumerate(random_order) if i % 100 == mode and i % 101 != mode]
train_data = [data[j] for i, j in enumerate(random_order) if i % 100 != mode and i % 101 != mode]

print(len(train_data))
print(len(test_data))
print(len(dev_data))
result = len(train_data)+len(test_data)+len(dev_data)
print(result)
assert(result==len(data))

json.dump([train_data,dev_data,test_data],open('./data/all_data.json','w'))

In [None]:
print(train_data[0])

In [18]:
import jieba

In [21]:
print(' '.join(jieba.cut('南京南站')))

南京 南站


In [72]:
cut_result = jieba.cut(train_data[0]['text'])
# print(' '.join(cut_result))
result = list(cut_result)
print(result)
# offset = [len((''.join(result[:i]))) for i in range(len(result))]
offset = [0]
for i in range(len(result)-1):
    offset.append(offset[-1]+len(result[i]))
for i in range(1,len(result)):
    result.append(''.join(result[i-3+1:i+1]))
    offset.append(offset[i-1])
#     offset.append(len((''.join(result[:i-3+1]))))
print(result)
print(offset)

['电影', '《', '淘金', '岁月', '》', '下载', ' ', '急求']
['电影', '《', '淘金', '岁月', '》', '下载', ' ', '急求', '', '电影《淘金', '《淘金岁月', '淘金岁月》', '岁月》下载', '》下载 ', '下载 急求']
[0, 2, 3, 5, 7, 8, 10, 11, 0, 2, 3, 5, 7, 8, 10]


In [76]:
ts_result = []
for i in result:
#     print(i)
    tmp = ts.search(i)
    if tmp and tmp[0][0] > 0.5:
        ts_result.append(tmp)
    else:
        ts_result.append([])
print(ts_result)

[[(1.0, [31107])], [], [], [(1.0, [7525])], [], [(1.0, [249645])], [], [], [], [], [(0.5714285714285714, [50472])], [(0.5714285714285714, [50472])], [], [], []]


In [196]:
kb_data = KB('./ccks2019_el/kb_data')

2295it [00:00, 22871.01it/s]

start loading kb_data...
construct id2kb dict...


399252it [00:21, 18435.18it/s]


construct kb2id dict...
KB DATA INFORMATION
TOKEN SIZE:303375
ID SIZE:399233
TYPE SIZE:51
PREDICATE SIZE:41841


In [217]:
class ngram_search(object):
    def __init__(self,data,kb,ngram = 2,similarity = 0.5):
        self.n = ngram
        self.similarity = similarity
        self.data = data
        self.kb = kb
        self.cut_data,self.offset = self.cut_words()
        self.ts = TopSim(self.kb)
        self.candidates = self.get_candidates(self.similarity)
        self.cand_name,self.cand_off = self.get_candidates_name()
    def cut_words(self):
        print('starting build ngram list')
        print('ngram',self.n)
        result = []
        offset = []
        for d in tqdm(self.data):
#             print(d)
#             print(' '.join(jieba.cut(d)))
            tmp = list(jieba.cut(d))
            n = len(tmp)
            tmp_off = [0]
#             tmp_off = [len(''.join(tmp[:i])) for i in range(len(tmp))]
            for i in range(len(tmp)-1):
                tmp_off.append(tmp_off[-1]+len(tmp[i]))
            for j in range(2,self.n+1):
                for i in range(j-1,n):
                    tmp.append(''.join(tmp[i-j+1:i+1]))
                    tmp_off.append(tmp_off[i-j+1])
#                     tmp_off.append(''.join(tmp[:i-n+1]))
            result.append(tmp)
            offset.append(tmp_off)
        return result,offset
    def get_candidates(self,similarity = 0.5):
        self.similarity = similarity
        print('starting build candidates list')
        print('similarity:',self.similarity)
        candidates = []
        for dt in tqdm(self.cut_data):
            ts_result = []
            for i in dt:
                tmp = ts.search(i)
                if tmp and tmp[0][0] > self.similarity:
                    ts_result.append(tmp)
                else:
                    ts_result.append([])
            candidates.append(ts_result)
        return candidates
    def get_candidates_name(self):
        print('starting get candidates name and offset')
        cand_name = []
        cand_offset = []
        for i in tqdm(range(len(self.candidates))):
            cand = []
            off = []
            for j in range(len(self.candidates[i])):
                if self.candidates[i][j]:
#                     print(self.candidates[i][j])
#                     print(self.candidates[i][j][0][1][0])
#                     print(self.kb[self.candidates[i][j][0][1][0]])
                    cand.append(self.kb[self.candidates[i][j][0][1][0]])
                    off.append(self.offset[i][j])
            cand_name.append(cand)
            cand_offset.append(off)
        return cand_name,cand_offset



In [218]:
dev_x = [i['text'] for i in dev_data]
print(type(dev_x[0]))

<class 'str'>


In [244]:
ns_test = ngram_search([dev_x[0]],kb_data.kb)
print(ns_test.candidates)
print(ns_test.cand_name)




  0%|          | 0/1 [00:00<?, ?it/s][A[A[A


100%|██████████| 1/1 [00:00<00:00, 925.08it/s][A[A[A

starting build ngram list
ngram 2





  0%|          | 0/1 [00:00<?, ?it/s][A[A[A


100%|██████████| 1/1 [00:00<00:00, 19.83it/s][A[A[A


  0%|          | 0/1 [00:00<?, ?it/s][A[A[A


100%|██████████| 1/1 [00:00<00:00, 1892.74it/s][A[A[A

starting build candidates list
starting get candidates name and offset
[[[], [(1.0, [44529])], [(1.0, [4657])], [], [], [(1.0, [10911])], [], [(1.0, [48458])], [], [], [(1.0, [35639])], [(1.0, [108397])], [], [], [], [], [(1.0, [44529])], [(1.0, [4657])], [], [(1.0, [4605])], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [(1.0, [4605])]]]
[['铁', '梨花', '结局', '人', '的', '失落', '铁', '梨花', '铁梨花', '铁梨花']]


In [243]:
print(dev[0])
print(en[0])
print(ns_test.cand_name)
print(ns_test.cand_off)

《铁梨花》,结局让人有着说不出的失落。 - 铁梨花
['铁梨花', '结局', '失落', '铁梨花']
[['铁', '梨花', '结局', '人', '的', '失落', '铁', '梨花', '铁梨花', '铁梨花']]
[[1, 2, 6, 9, 15, 16, 22, 23, 1, 22]]


In [240]:
a = kb_data.kb2id['铁']
print(a)
for i in a:
    print(kb_data.id2kb[i])

['44637', '76121', '157702', '209828', '213677', '216377', '300998', '307908', '349237', '354871', '370145', '373877', '375248']
{'alias': ['铁'], 'data': {'摘要': '铁，中药材名。本品为一种灰黑色的金属。主要由赤铁矿、褐铁矿、磁铁矿等炼出。由于含碳量的不同，可分为生铁(含碳量在1.7％以上)、熟铁(含碳量在0.2％以下)和钢铁(含碳量在0.2～1.7％之间)三种。功能主治为：镇心平肝，消痈解毒。治惊痫，癫狂，痈毒。1.《本经》：主坚肌耐痛。2.《别录》：主疗下部及脱肛。主金疮，烦满，热中，胸膈气塞，食不化。3.《本草拾遗》：主贼风，烧赤投酒中热服之。4.《纲目》：散瘀血，消丹毒。5.《本草汇言》：平肝气，安惊痫。6.《本草备要》：镇心平肝，定惊疗狂，消痈解毒。', '汉语拼音': 'tie', '中文名': '铁', '义项描述': '铁'}, 'type': ['thing']}
{'alias': ['铁'], 'data': {'摘要': '铁，中药材名。本品为一种灰黑色的金属。主要由赤铁矿、褐铁矿、磁铁矿等炼出。由于含碳量的不同，可分为生铁(含碳量在1.7％以上)、熟铁(含碳量在0.2％以下)和钢铁(含碳量在0.2～1.7％之间)三种。功能主治为：镇心平肝，消痈解毒。治惊痫，癫狂，痈毒。1.《本经》：主坚肌耐痛。2.《别录》：主疗下部及脱肛。主金疮，烦满，热中，胸膈气塞，食不化。3.《本草拾遗》：主贼风，烧赤投酒中热服之。4.《纲目》：散瘀血，消丹毒。5.《本草汇言》：平肝气，安惊痫。6.《本草备要》：镇心平肝，定惊疗狂，消痈解毒。', '汉语拼音': 'tie', '中文名': '铁', '义项描述': '铁'}, 'type': ['thing']}
{'alias': ['铁'], 'data': {'摘要': '从这幅图可以看出，sc(生存战争)的矿物分布不仅与层数有关，也与生态环境有关，也就是别想在土层一下找硝石，同时，并不是岩石层就没有泥土由图可得：煤炭的分部为由上到下从地表面向下逐渐减少。硝石(也可以是氮肥，因为既可以做火药又可以做肥料，但是原版英文为硝石)的分布为大量分布于沙子下方，沙石

In [220]:
for i in range(2,6):
    ns_test = ngram_search([dev_x[0]],kb_data.kb,i)
    print(ns_test.cut_data)
    print(ns_test.cand_name)

100%|██████████| 1/1 [00:00<00:00, 1589.96it/s]

starting build ngram list
ngram 2



100%|██████████| 1/1 [00:00<00:00, 23.68it/s]
100%|██████████| 1/1 [00:00<00:00, 503.70it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

starting build candidates list
starting get candidates name and offset
[['《', '铁', '梨花', '》', ',', '结局', '让', '人', '有着', '说不出', '的', '失落', '。', ' ', '-', ' ', '铁', '梨花', '《铁', '铁梨花', '梨花》', '》,', ',结局', '结局让', '让人', '人有着', '有着说不出', '说不出的', '的失落', '失落。', '。 ', ' -', '- ', ' 铁', '铁梨花']]
[['铁', '梨花', '结局', '人', '的', '失落', '铁', '梨花', '铁梨花', '铁梨花']]
starting build ngram list
ngram 3


100%|██████████| 1/1 [00:00<00:00, 1587.55it/s]
100%|██████████| 1/1 [00:00<00:00, 17.48it/s]
100%|██████████| 1/1 [00:00<00:00, 1383.80it/s]

starting build candidates list
starting get candidates name and offset
[['《', '铁', '梨花', '》', ',', '结局', '让', '人', '有着', '说不出', '的', '失落', '。', ' ', '-', ' ', '铁', '梨花', '《铁', '铁梨花', '梨花》', '》,', ',结局', '结局让', '让人', '人有着', '有着说不出', '说不出的', '的失落', '失落。', '。 ', ' -', '- ', ' 铁', '铁梨花', '《铁梨花', '铁梨花》', '梨花》,', '》,结局', ',结局让', '结局让人', '让人有着', '人有着说不出', '有着说不出的', '说不出的失落', '的失落。', '失落。 ', '。 -', ' - ', '- 铁', ' 铁梨花']]
[['铁', '梨花', '结局', '人', '的', '失落', '铁', '梨花', '铁梨花', '铁梨花']]
starting build ngram list
ngram 4



100%|██████████| 1/1 [00:00<00:00, 1447.31it/s]
100%|██████████| 1/1 [00:00<00:00, 13.82it/s]
100%|██████████| 1/1 [00:00<00:00, 4604.07it/s]

starting build candidates list
starting get candidates name and offset



100%|██████████| 1/1 [00:00<00:00, 1711.26it/s]
100%|██████████| 1/1 [00:00<00:00, 11.19it/s]
100%|██████████| 1/1 [00:00<00:00, 441.09it/s]

[['《', '铁', '梨花', '》', ',', '结局', '让', '人', '有着', '说不出', '的', '失落', '。', ' ', '-', ' ', '铁', '梨花', '《铁', '铁梨花', '梨花》', '》,', ',结局', '结局让', '让人', '人有着', '有着说不出', '说不出的', '的失落', '失落。', '。 ', ' -', '- ', ' 铁', '铁梨花', '《铁梨花', '铁梨花》', '梨花》,', '》,结局', ',结局让', '结局让人', '让人有着', '人有着说不出', '有着说不出的', '说不出的失落', '的失落。', '失落。 ', '。 -', ' - ', '- 铁', ' 铁梨花', '《铁梨花》', '铁梨花》,', '梨花》,结局', '》,结局让', ',结局让人', '结局让人有着', '让人有着说不出', '人有着说不出的', '有着说不出的失落', '说不出的失落。', '的失落。 ', '失落。 -', '。 - ', ' - 铁', '- 铁梨花']]
[['铁', '梨花', '结局', '人', '的', '失落', '铁', '梨花', '铁梨花', '铁梨花']]
starting build ngram list
ngram 5
starting build candidates list
starting get candidates name and offset
[['《', '铁', '梨花', '》', ',', '结局', '让', '人', '有着', '说不出', '的', '失落', '。', ' ', '-', ' ', '铁', '梨花', '《铁', '铁梨花', '梨花》', '》,', ',结局', '结局让', '让人', '人有着', '有着说不出', '说不出的', '的失落', '失落。', '。 ', ' -', '- ', ' 铁', '铁梨花', '《铁梨花', '铁梨花》', '梨花》,', '》,结局', ',结局让', '结局让人', '让人有着', '人有着说不出', '有着说不出的', '说不出的失落', '的失落。', '失落。 ', '。 -', ' - ', '- 铁', ' 铁梨花',




In [195]:
ns = ngram_search(dev_x,kb_data.kb)
candidates = ns.candidates
print(candidates[0])
offset = ns.offset
print(offset[0])
print(ns.cut_data[0])
cand = []
off = []
for i in range(len(candidates[0])):
    if candidates[0][i]:
#         tmp = 
        cand.append(kb_data.kb[candidates[0][i][0][1][0]])
        off.append(offset[0][i])
print(cand)
print(off)

100%|██████████| 851/851 [00:00<00:00, 4589.01it/s]

starting build ngram list
ngram 2



  0%|          | 3/851 [00:00<00:30, 28.22it/s]

starting build candidates list


100%|██████████| 851/851 [00:24<00:00, 37.84it/s]
  5%|▍         | 42/851 [00:00<00:02, 351.25it/s]

starting get candidates name and offset
[(1.0, [44529])]
44529
铁
[(1.0, [4657])]
4657
梨花
[(1.0, [10911])]
10911
结局
[(1.0, [48458])]
48458
人
[(1.0, [35639])]
35639
的
[(1.0, [108397])]
108397
失落
[(1.0, [44529])]
44529
铁
[(1.0, [4657])]
4657
梨花
[(1.0, [4605])]
4605
铁梨花
[(1.0, [4605])]
4605
铁梨花
[(1.0, [53643])]
53643
十一
[(1.0, [25920])]
25920
罗汉
[(1.0, [11952])]
11952
什么
[(1.0, [19002])]
19002
意思
[(1.0, [62943])]
62943
_
[(1.0, [2422])]
2422
英语
[(1.0, [53643])]
53643
十一
[(1.0, [25920])]
25920
罗汉
[(1.0, [35639])]
35639
的
[(1.0, [258084])]
258084
十一罗汉
[(1.0, [258084])]
258084
十一罗汉
[(1.0, [38787])]
38787
奇缘
[(1.0, [277337])]
277337
开机
[(1.0, [214968])]
214968
陈汉典
[(1.0, [672])]
672
笑
[(1.0, [85294])]
85294
称
[(1.0, [22040])]
22040
颜值
[(1.0, [78426])]
78426
比
[(1.0, [290121])]
290121
吴亦凡
[(1.0, [85939])]
85939
冰火奇缘
[(1.0, [11266])]
11266
我
[(1.0, [10627])]
10627
曾
[(1.0, [10058])]
10058
寂寞
[(1.0, [9832])]
9832
生活
[(1.0, [27911])]
27911
辛
[(1.0, [98129])]
98129
波斯
[(1.0, [29504])]
29504
卡
[(1.0

 14%|█▎        | 117/851 [00:00<00:02, 296.00it/s]

吧
[(1.0, [3219])]
3219
我们
[(1.0, [301008])]
301008
出发吧
[(1.0, [301008])]
301008
出发吧
[(1.0, [5513])]
5513
超人
[(1.0, [30596])]
30596
—
[(1.0, [295976])]
295976
在线播放
[(1.0, [30596])]
30596
—
[(1.0, [5513])]
5513
超人
[(1.0, [30596])]
30596
—
[(1.0, [31107])]
31107
电影
[(1.0, [30596])]
30596
—
[(1.0, [6610])]
6610
优
[(1.0, [37411])]
37411
...
[(1.0, [59712])]
59712
行运超人
[(0.5714285714285714, [295976])]
295976
在线播放
[(0.5714285714285714, [295976])]
295976
在线播放
[(1.0, [59712])]
59712
行运超人
[(1.0, [177937])]
177937
优酷网
[(1.0, [1843])]
1843
春风
[(1.0, [80161])]
80161
不如
[(1.0, [9445])]
9445
你
[(1.0, [19310])]
19310
秋水
[(1.0, [16611])]
16611
柳青
[(1.0, [1081])]
1081
医院
[(1.0, [94163])]
94163
里
[(1.0, [84587])]
84587
激情
[(1.0, [25725])]
25725
主任
[(1.0, [37411])]
37411
...
[(1.0, [57595])]
57595
春风十里
[(1.0, [88963])]
88963
不如你
[(1.0, [6803])]
6803
星星
[(1.0, [249255])]
249255
啤酒
[(1.0, [80755])]
80755
炸鸡
[(1.0, [116901])]
116901
成为
[(1.0, [221989])]
221989
热潮
[(1.0, [43510])]
43510
来自
[(1.0, [6803])]
680

 23%|██▎       | 194/851 [00:00<00:02, 288.74it/s]

45116
电视剧
[(0.5714285714285714, [41597])]
41597
直插金三角
[(1.0, [43029])]
43029
在线观看
[(1.0, [177937])]
177937
优酷网
[(1.0, [14908])]
14908
医神
[(1.0, [31107])]
31107
电影
[(1.0, [187655])]
187655
完整版
[(1.0, [88037])]
88037
奇谈
[(1.0, [140109])]
140109
系列
[(1.0, [202152])]
202152
专辑
[(1.0, [65759])]
65759
免费
[(1.0, [62943])]
62943
_
[(1.0, [7426])]
7426
都市妖奇谈
[(1.0, [30460])]
30460
原理
[(1.0, [278829])]
278829
工艺
[(1.0, [15168])]
15168
简介
[(1.0, [62943])]
62943
_
[(1.0, [20046])]
20046
书评
[(1.0, [63992])]
63992
2015
[(1.0, [52615])]
52615
台湾
[(1.0, [49563])]
49563
恐怖片
[(1.0, [120133])]
120133
尸忆
[(1.0, [103692])]
103692
冥婚
[(1.0, [121909])]
121909
国语
[(1.0, [33121])]
33121
大神
[(1.0, [44857])]
44857
来
[(1.0, [23055])]
23055
左京
[(1.0, [89834])]
89834
桑原
[(1.0, [1874])]
1874
姐姐
[(1.0, [35639])]
35639
的
[(1.0, [15857])]
15857
关系
[(0.6666666666666666, [39510])]
39510
幽幽
[(1.0, [37411])]
37411
...
[(1.0, [66077])]
66077
当选
[(1.0, [242703])]
242703
连云港市
[(1.0, [30258])]
30258
政协主席
[(1.0, [9716])]
9716
有

 32%|███▏      | 269/851 [00:00<00:02, 288.64it/s]


在线播放
[(1.0, [25825])]
25825
秦时明月
[(0.5714285714285714, [29352])]
29352
君临天下
[(0.5714285714285714, [29352])]
29352
君临天下
[(1.0, [72568])]
72568
苗翠花
[(1.0, [61162])]
61162
全集
[(1.0, [189722])]
189722
在线
[(1.0, [105483])]
105483
观看
[(1.0, [72568])]
72568
苗翠花
[(1.0, [45116])]
45116
电视剧
[(1.0, [37411])]
37411
...
[(0.6666666666666666, [90362])]
90362
苗翠花-国语版
[(1.0, [43029])]
43029
在线观看
[(0.6666666666666666, [90362])]
90362
苗翠花-国语版
[(1.0, [3219])]
3219
我们
[(1.0, [82255])]
82255
重新
[(1.0, [31700])]
31700
面对
[(1.0, [24138])]
24138
人生
[(1.0, [35639])]
35639
的
[(1.0, [8685])]
8685
故事
[(1.0, [228517])]
228517
沉睡
[(1.0, [35639])]
35639
的
[(1.0, [75671])]
75671
人鱼
[(1.0, [62888])]
62888
找
[(1.0, [36809])]
36809
小说
[(1.0, [58283])]
58283
找到
[(1.0, [62587])]
62587
雷文
[(1.0, [136257])]
136257
知否
[(1.0, [136257])]
136257
知否
[(1.0, [90048])]
90048
应
[(1.0, [58517])]
58517
是
[(1.0, [132])]
132
红
[(1.0, [37411])]
37411
...
[(1.0, [136256])]
136256
知否知否
[(1.0, [287903])]
287903
图片
[(1.0, [152932])]
152932


 41%|████      | 346/851 [00:01<00:01, 293.58it/s]

的
[(1.0, [1334])]
1334
再见
[(1.0, [62943])]
62943
_
[(1.0, [134443])]
134443
mad
[(1.0, [10071])]
10071
·
[(1.0, [72344])]
72344
amv
[(1.0, [62943])]
62943
_
[(1.0, [85360])]
85360
动画
[(1.0, [62943])]
62943
_
[(1.0, [223147])]
223147
bilibili
[(0.7272727272727273, [223147])]
223147
bilibili
[(1.0, [184770])]
184770
原版
[(1.0, [12774])]
12774
想
[(1.0, [9445])]
9445
你
[(1.0, [45649])]
45649
一起
[(1.0, [184169])]
184169
吹
[(1.0, [7743])]
7743
张学友
[(1.0, [223306])]
223306
原创
[(1.0, [79896])]
79896
嘿
[(1.0, [45649])]
45649
一起
[(1.0, [32936])]
32936
走
[(1.0, [44146])]
44146
吧
[(1.0, [223306])]
223306
原创
[(1.0, [234873])]
234873
小
[(1.0, [73740])]
73740
走吧
[(1.0, [26273])]
26273
听见
[(1.0, [43964])]
43964
凉山
[(1.0, [46653])]
46653
插曲
[(1.0, [31127])]
31127
蓝月亮
[(1.0, [38492])]
38492
听见凉山
[(1.0, [234873])]
234873
小
[(1.0, [167338])]
167338
高清
[(1.0, [31107])]
31107
电影
[(1.0, [189722])]
189722
在线
[(1.0, [105483])]
105483
观看
[(1.0, [105228])]
105228
小次郎
[(1.0, [282574])]
282574
高清电影
[(1.0, [43029])]

 49%|████▊     | 414/851 [00:01<00:01, 276.74it/s]


[(0.5714285714285714, [107996])]
107996
奇门遁甲
[(1.0, [43029])]
43029
在线观看
[(1.0, [148520])]
148520
新
[(1.0, [300384])]
300384
概念
[(1.0, [2422])]
2422
英语
[(1.0, [18165])]
18165
到底
[(1.0, [88175])]
88175
好
[(1.0, [57782])]
57782
在
[(1.0, [91117])]
91117
哪里
[(1.0, [288576])]
288576
适合
[(1.0, [27632])]
27632
小学生
[(1.0, [58660])]
58660
学习
[(1.0, [52876])]
52876
新概念
[(0.5714285714285714, [242910])]
242910
新概念英语
[(1.0, [43962])]
43962
好在
[(1.0, [216188])]
216188
南京
[(1.0, [287])]
287
爱情
[(1.0, [45116])]
45116
电视剧
[(1.0, [113955])]
113955
南京爱情
[(1.0, [103817])]
103817
浙江
[(1.0, [103389])]
103389
嘉兴
[(1.0, [44086])]
44086
海宁
[(1.0, [7711])]
7711
一
[(1.0, [120611])]
120611
餐馆
[(1.0, [5690])]
5690
海鲜
[(0.6666666666666666, [12514])]
12514
2
[(1.0, [174275])]
174275
浙江嘉兴
[(1.0, [177203])]
177203
祖宗
[(0.6666666666666666, [257582])]
257582
若若
[(1.0, [221831])]
221831
被
[(1.0, [22233])]
22233
骂
[(1.0, [612])]
612
他
[(1.0, [147104])]
147104
就
[(1.0, [166426])]
166426
彻底
[(1.0, [20452])]
20452
告别
[(1.0,

 58%|█████▊    | 490/851 [00:01<00:01, 279.71it/s]

[(1.0, [50766])]
50766
psv
[(1.0, [37411])]
37411
...
[(1.0, [64664])]
64664
银妆刀
[(1.0, [58517])]
58517
是
[(1.0, [11952])]
11952
什么
[(1.0, [49488])]
49488
东西
[(1.0, [31104])]
31104
花容月貌
[(1.0, [31107])]
31107
电影
[(1.0, [187655])]
187655
完整版
[(1.0, [189722])]
189722
在线
[(1.0, [105483])]
105483
观看
[(1.0, [167338])]
167338
高清
[(1.0, [57468])]
57468
迅雷
[(1.0, [249645])]
249645
下载
[(0.5714285714285714, [31104])]
31104
花容月貌
[(0.5714285714285714, [31104])]
31104
花容月貌
[(1.0, [43029])]
43029
在线观看
[(0.6, [66356])]
66356
10
[(1.0, [67689])]
67689
种
[(1.0, [21503])]
21503
死法
[(1.0, [1894])]
1894
第一季
[(1.0, [33158])]
33158
1
[(1.0, [208200])]
208200
集
[(1.0, [139394])]
139394
视频
[(1.0, [223306])]
223306
原创
[(1.0, [139394])]
139394
视频
[(0.5555555555555556, [105349])]
105349
1000种死法
[(1.0, [74340])]
74340
八阵图
[(1.0, [292587])]
292587
13
[(1.0, [208200])]
208200
集
[(1.0, [144233])]
144233
神鬼八阵图
[(0.5714285714285714, [74342])]
74342
《八阵图》
[(0.6666666666666666, [147026])]
147026
战战
[(1.0, [156981])]
156

 66%|██████▌   | 561/851 [00:01<00:01, 280.52it/s]


46653
插曲
[(1.0, [41160])]
41160
萨尔
[(0.6666666666666666, [302681])]
302681
曼曼
[(1.0, [10071])]
10071
·
[(1.0, [100763])]
100763
汗
[(1.0, [30596])]
30596
—
[(1.0, [295976])]
295976
在线播放
[(1.0, [30596])]
30596
—
[(1.0, [6610])]
6610
优
[(1.0, [139394])]
139394
视频
[(1.0, [37411])]
37411
...
[(1.0, [290016])]
290016
印度电影
[(0.5714285714285714, [295976])]
295976
在线播放
[(0.5714285714285714, [295976])]
295976
在线播放
[(1.0, [177937])]
177937
优酷网
[(1.0, [270767])]
270767
袁
[(1.0, [21718])]
21718
腾飞
[(1.0, [18725])]
18725
说
[(1.0, [34728])]
34728
宋耀武
[(1.0, [76078])]
76078
小姑娘
[(1.0, [175019])]
175019
亲手
[(1.0, [43518])]
43518
杀
[(0.6666666666666666, [287707])]
287707
了了
[(1.0, [85586])]
85586
个人
[(1.0, [58517])]
58517
是
[(1.0, [41048])]
41048
真
[(1.0, [126003])]
126003
袁腾飞
[(1.0, [55783])]
55783
喜剧
[(1.0, [80881])]
80881
短片
[(1.0, [35639])]
35639
的
[(1.0, [14716])]
14716
竖
[(1.0, [16272])]
16272
娃娃
[(1.0, [14716])]
14716
竖
[(1.0, [45649])]
45649
一起
[(1.0, [9832])]
9832
生活
[(1.0, [37411])]
37411
...

 75%|███████▍  | 636/851 [00:02<00:00, 296.05it/s]

36809
小说
[(1.0, [75934])]
75934
求
[(1.0, [33121])]
33121
大神
[(1.0, [213869])]
213869
绝望游戏
[(1.0, [10071])]
10071
·
[(1.0, [153425])]
153425
粉丝
[(1.0, [30348])]
30348
圈
[(1.0, [10071])]
10071
·
[(1.0, [292345])]
292345
粉丝圈
[(1.0, [15666])]
15666
野兽
[(1.0, [61162])]
61162
全集
[(1.0, [189722])]
189722
在线
[(1.0, [105483])]
105483
观看
[(1.0, [182962])]
182962
电影网
[(1.0, [31351])]
31351
花予野兽
[(1.0, [43029])]
43029
在线观看
[(1.0, [15054])]
15054
双
[(1.0, [2794])]
2794
11
[(1.0, [84536])]
84536
李荣浩
[(1.0, [86010])]
86010
陈奕迅
[(1.0, [9445])]
9445
你
[(1.0, [35639])]
35639
的
[(1.0, [124795])]
124795
背包
[(1.0, [199788])]
199788
湖南
[(0.625, [14692])]
14692
双11狂欢夜
[(1.0, [145509])]
145509
榆林
[(1.0, [34270])]
34270
新闻联播
[(1.0, [139394])]
139394
视频
[(1.0, [148185])]
148185
列表
[(1.0, [25949])]
25949
明珠
[(1.0, [56869])]
56869
网
[(1.0, [145509])]
145509
榆林
[(1.0, [132449])]
132449
广播
[(1.0, [222777])]
222777
电视台
[(1.0, [148520])]
148520
新
[(1.0, [37411])]
37411
...
[(0.5555555555555556, [60580])]
60580
吉林新闻联播

 84%|████████▍ | 715/851 [00:02<00:00, 310.42it/s]


可夫
[(1.0, [15906])]
15906
孔子
[(1.0, [101527])]
101527
76
[(1.0, [45123])]
45123
代
[(1.0, [126343])]
126343
孙女
[(1.0, [292546])]
292546
王叔铭
[(1.0, [249062])]
249062
王光美
[(1.0, [35639])]
35639
的
[(1.0, [144750])]
144750
“
[(1.0, [97288])]
97288
父女
[(1.0, [144752])]
144752
”
[(1.0, [62943])]
62943
_
[(1.0, [40897])]
40897
恐惧
[(1.0, [26174])]
26174
日
[(1.0, [36350])]
36350
王莽
[(1.0, [37411])]
37411
...
[(1.0, [227167])]
227167
公安
[(1.0, [93913])]
93913
传播
[(1.0, [115145])]
115145
彭耀春
[(1.0, [279462])]
279462
摘要
[(1.0, [20046])]
20046
书评
[(1.0, [165814])]
165814
试读
[(1.0, [46956])]
46956
金灿灿
[(1.0, [63548])]
63548
广场
[(1.0, [5495])]
5495
舞
[(1.0, [14045])]
14045
美丽
[(1.0, [35639])]
35639
的
[(1.0, [64190])]
64190
草原
[(1.0, [14045])]
14045
美丽
[(1.0, [35639])]
35639
的
[(1.0, [47072])]
47072
姑娘
[(1.0, [63764])]
63764
附
[(1.0, [50216])]
50216
背面
[(1.0, [94075])]
94075
演
[(1.0, [64941])]
64941
广场舞
[(1.0, [2138])]
2138
李建平
[(1.0, [57782])]
57782
在
[(1.0, [222245])]
222245
深圳
[(1.0, [62943])]
6294

 93%|█████████▎| 790/851 [00:02<00:00, 308.52it/s]

大
[(1.0, [142553])]
142553
2017年
[(1.0, [10071])]
10071
·
[(1.0, [287903])]
287903
图片
[(1.0, [145729])]
145729
价格
[(1.0, [161639])]
161639
品牌
[(1.0, [253145])]
253145
报价
[(1.0, [37031])]
37031
南渡北归
[(1.0, [122660])]
122660
房
[(1.0, [62943])]
62943
_
[(1.0, [186224])]
186224
珠海
[(1.0, [86617])]
86617
房产
[(1.0, [190175])]
190175
信息网
[(1.0, [10071])]
10071
·
[(1.0, [104119])]
104119
梧桐
[(1.0, [14019])]
14019
路
[(1.0, [186224])]
186224
珠海
[(1.0, [293565])]
293565
百姓网
[(0.625, [16126])]
16126
房地产信息网
[(0.6, [165626])]
165626
省委常委
[(1.0, [81050])]
81050
组长
[(1.0, [177158])]
177158
准则
[(1.0, [150114])]
150114
条例
[(1.0, [141779])]
141779
专题
[(1.0, [123067])]
123067
景观
[(1.0, [148077])]
148077
设计
[(1.0, [123067])]
123067
景观
[(1.0, [150936])]
150936
张亚萍
[(1.0, [123277])]
123277
梅洛
[(1.0, [279462])]
279462
摘要
[(1.0, [37411])]
37411
...
[(1.0, [200])]
200
母亲
[(1.0, [35639])]
35639
的
[(1.0, [31107])]
31107
电影
[(1.0, [167338])]
167338
高清
[(1.0, [189722])]
189722
在线
[(1.0, [105483])]
105483
观看
[(1.0, 

100%|██████████| 851/851 [00:02<00:00, 292.21it/s]


[(1.0, [63710])]
63710
学
[(1.0, [140109])]
140109
系列
[(1.0, [216769])]
216769
袁其刚
[(1.0, [279462])]
279462
摘要
[(1.0, [32444])]
32444
帮帮我
[(1.0, [32442])]
32442
爱神
[(1.0, [262813])]
262813
剧照
[(1.0, [1038])]
1038
曝光
[(1.0, [32441])]
32441
帮帮我爱神
[(1.0, [8052])]
8052
王恺
[(1.0, [30596])]
30596
—
[(1.0, [30596])]
30596
—
[(1.0, [30596])]
30596
—
[(1.0, [30596])]
30596
—
[(1.0, [69952])]
69952
武汉大学
[(1.0, [261353])]
261353
经济
[(1.0, [150224])]
150224
管理
[(1.0, [12370])]
12370
学院
[(1.0, [28542])]
28542
——
[(0.5555555555555556, [107817])]
107817
管理科学与工程
[(1.0, [28542])]
28542
——
[(0.5714285714285714, [69952])]
69952
武汉大学
[(0.6666666666666666, [78248])]
78248
武汉大学经济学
[(1.0, [44146])]
44146
吧
[(1.0, [159854])]
159854
肿瘤
[(0.6666666666666666, [37635])]
37635
君君
[(1.0, [94163])]
94163
里
[(1.0, [9716])]
9716
有
[(1.0, [55608])]
55608
一个
[(1.0, [144750])]
144750
“
[(1.0, [141616])]
141616
攻受
[(1.0, [144752])]
144752
”
[(1.0, [12031])]
12031
演员
[(1.0, [37411])]
37411
...
[(1.0, [32468])]
32468
知识
[(1




In [172]:
dev_x[0]

'《铁梨花》,结局让人有着说不出的失落。 - 铁梨花'

In [180]:
print(candidates[0])

[[], [(1.0, [44529])], [(1.0, [4657])], [], [], [(1.0, [10911])], [], [(1.0, [48458])], [], [], [(1.0, [35639])], [(1.0, [108397])], [], [], [], [], [(1.0, [44529])], [(1.0, [4657])], [], [(1.0, [4605])], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [(1.0, [4605])]]


In [None]:
cand_name = ns.cand_name

In [151]:
print(dev_data[0])

{'text_id': '59668', 'text': '《铁梨花》,结局让人有着说不出的失落。 - 铁梨花', 'mention_data': [('铁梨花', 1, '143416'), ('结局', 6, '335878'), ('失落', 16, '286921'), ('铁梨花', 22, '143416')]}


In [167]:
en = []
for i in dev_data:
    tmp = []
    for j in i['mention_data']:
        tmp.append(j[0])
    en.append(tmp)

In [168]:
print(en[0])

['铁梨花', '结局', '失落', '铁梨花']


recall = []
for i in range(len(en)):
    tp = 0
    fp = 0
    for j in en[i]:
        if j in cand_name[i]:
            tp += 1
        else:
            fp += 1
    recall.append(tp/(tp+fp))
print(sum(recall)/len(recall))

In [228]:
recall_all = []
for i in range(2,10):
    print('ngram:',i)
    ns = ngram_search(dev_x,kb_data.kb,i)
    recall = []
    cand_name = ns.cand_name
#     print(len)
    for i in tqdm(range(len(en))):
        tp = 0
        fp = 0
        for j in en[i]:
            if j in cand_name[i]:
                tp += 1
            else:
                fp += 1
        recall.append(tp/(tp+fp))
    print(sum(recall)/len(recall))
    recall_all.append(recall)

 44%|████▍     | 377/851 [00:00<00:00, 3763.14it/s]

ngram: 2
starting build ngram list
ngram 2


100%|██████████| 851/851 [00:00<00:00, 4100.00it/s]
  0%|          | 3/851 [00:00<00:31, 27.22it/s]

starting build candidates list


100%|██████████| 851/851 [00:24<00:00, 34.49it/s]
100%|██████████| 851/851 [00:00<00:00, 64154.30it/s]
  0%|          | 0/851 [00:00<?, ?it/s]

starting get candidates name and offset


100%|██████████| 851/851 [00:00<00:00, 155696.96it/s]
100%|██████████| 851/851 [00:00<00:00, 4560.76it/s]


0.8506313767183328
ngram: 3
starting build ngram list
ngram 3


  0%|          | 3/851 [00:00<00:34, 24.41it/s]

starting build candidates list


100%|██████████| 851/851 [00:35<00:00, 24.14it/s]
100%|██████████| 851/851 [00:00<00:00, 50102.51it/s]


starting get candidates name and offset


100%|██████████| 851/851 [00:00<00:00, 223601.62it/s]
100%|██████████| 851/851 [00:00<00:00, 4360.69it/s]


0.9253390035998725
ngram: 4
starting build ngram list
ngram 4


  0%|          | 2/851 [00:00<00:50, 16.65it/s]

starting build candidates list


100%|██████████| 851/851 [00:45<00:00, 18.89it/s]
100%|██████████| 851/851 [00:00<00:00, 45146.82it/s]


starting get candidates name and offset


100%|██████████| 851/851 [00:00<00:00, 197409.03it/s]
 45%|████▍     | 379/851 [00:00<00:00, 3786.82it/s]

0.936051890399716
ngram: 5
starting build ngram list
ngram 5


100%|██████████| 851/851 [00:00<00:00, 4126.03it/s]
  0%|          | 2/851 [00:00<01:01, 13.78it/s]

starting build candidates list


100%|██████████| 851/851 [00:54<00:00, 15.72it/s]
100%|██████████| 851/851 [00:00<00:00, 38864.90it/s]
  0%|          | 0/851 [00:00<?, ?it/s]

starting get candidates name and offset


100%|██████████| 851/851 [00:00<00:00, 273387.92it/s]
 41%|████      | 345/851 [00:00<00:00, 3446.35it/s]

0.9376186745751959
ngram: 6
starting build ngram list
ngram 6


100%|██████████| 851/851 [00:00<00:00, 3851.85it/s]
  0%|          | 2/851 [00:00<01:08, 12.42it/s]

starting build candidates list


100%|██████████| 851/851 [01:05<00:00, 12.48it/s]
100%|██████████| 851/851 [00:00<00:00, 40822.93it/s]


starting get candidates name and offset


100%|██████████| 851/851 [00:00<00:00, 112000.78it/s]
 92%|█████████▏| 780/851 [00:00<00:00, 3771.91it/s]

0.9387937627068058
ngram: 7
starting build ngram list
ngram 7


100%|██████████| 851/851 [00:00<00:00, 3874.72it/s]
  0%|          | 1/851 [00:00<01:27,  9.69it/s]

starting build candidates list


100%|██████████| 851/851 [01:14<00:00, 11.30it/s]
100%|██████████| 851/851 [00:00<00:00, 37079.95it/s]


starting get candidates name and offset


100%|██████████| 851/851 [00:00<00:00, 296113.55it/s]
 41%|████▏     | 352/851 [00:00<00:00, 3513.13it/s]

0.9387937627068058
ngram: 8
starting build ngram list
ngram 8


100%|██████████| 851/851 [00:00<00:00, 3823.04it/s]
  0%|          | 1/851 [00:00<01:37,  8.71it/s]

starting build candidates list


100%|██████████| 851/851 [01:25<00:00,  9.48it/s]
100%|██████████| 851/851 [00:00<00:00, 35079.98it/s]


starting get candidates name and offset


100%|██████████| 851/851 [00:00<00:00, 99369.51it/s]
 43%|████▎     | 365/851 [00:00<00:00, 3649.66it/s]

0.9387937627068058
ngram: 9
starting build ngram list
ngram 9


100%|██████████| 851/851 [00:00<00:00, 3768.14it/s]
  0%|          | 1/851 [00:00<01:46,  7.99it/s]

starting build candidates list


100%|██████████| 851/851 [01:35<00:00,  7.75it/s]
100%|██████████| 851/851 [00:00<00:00, 29826.38it/s]


starting get candidates name and offset


100%|██████████| 851/851 [00:00<00:00, 175631.19it/s]

0.9387937627068058





In [235]:
recall_all = []
ratio_all = []
for i in range(0,5):
#     print('ngram:',i)
    print('similarity',i/10)
    ns = ngram_search(dev_x,kb_data.kb,4,i/10)
    recall = []
    cand_name = ns.cand_name
    ratio = []
#     print(len)
    for i in tqdm(range(len(en))):
        tp = 0
        fp = 0
        for j in en[i]:
            if j in cand_name[i]:
                tp += 1
            else:
                fp += 1
        recall.append(tp/(tp+fp))
        ratio.append(len(cand_name[i])/len(en[i]))
    print(sum(recall)/len(recall))
    print(sum(ratio)/len(ratio))
    recall_all.append(recall)
    ratio_all.append(ratio)




  0%|          | 0/851 [00:00<?, ?it/s][A[A[A


 46%|████▌     | 388/851 [00:00<00:00, 3868.92it/s][A[A[A

similarity 0.0
starting build ngram list
ngram 4





 89%|████████▊ | 755/851 [00:00<00:00, 3802.02it/s][A[A[A


100%|██████████| 851/851 [00:00<00:00, 3616.15it/s][A[A[A


  0%|          | 0/851 [00:00<?, ?it/s][A[A[A


  0%|          | 2/851 [00:00<00:54, 15.46it/s][A[A[A

starting build candidates list





  0%|          | 4/851 [00:00<00:56, 14.99it/s][A[A[A


  1%|          | 6/851 [00:00<00:55, 15.31it/s][A[A[A


  1%|          | 9/851 [00:00<00:47, 17.91it/s][A[A[A


  2%|▏         | 13/851 [00:00<00:41, 20.29it/s][A[A[A


  2%|▏         | 15/851 [00:00<00:54, 15.38it/s][A[A[A


  2%|▏         | 17/851 [00:00<00:52, 15.78it/s][A[A[A


  2%|▏         | 19/851 [00:01<01:05, 12.71it/s][A[A[A

 93%|█████████▎| 795/851 [01:02<00:03, 14.74it/s][A[A


  2%|▏         | 21/851 [00:01<01:09, 11.97it/s][A[A[A


  3%|▎         | 23/851 [00:01<01:03, 12.97it/s][A[A[A


  3%|▎         | 27/851 [00:01<00:57, 14.30it/s][A[A[A


  4%|▎         | 30/851 [00:01<00:52, 15.68it/s][A[A[A


  4%|▍         | 32/851 [00:01<00:49, 16.61it/s][A[A[A


  4%|▍         | 34/851 [00:02<00:50, 16.15it/s][A[A[A


  4%|▍         | 37/851 [00:02<00:47, 17.12it/s][A[A[A


  5%|▍         | 41/851 [00:02<00:44, 18.32it/s][A[A[A


  5%|▌         | 44/851 [00:02<00:43, 18.51

 46%|████▌     | 392/851 [00:22<00:32, 14.13it/s][A[A[A


 46%|████▋     | 395/851 [00:23<00:28, 15.76it/s][A[A[A


 47%|████▋     | 398/851 [00:23<00:25, 17.95it/s][A[A[A


 47%|████▋     | 401/851 [00:23<00:27, 16.31it/s][A[A[A


 47%|████▋     | 403/851 [00:23<00:26, 16.95it/s][A[A[A


 48%|████▊     | 405/851 [00:23<00:25, 17.63it/s][A[A[A


 48%|████▊     | 408/851 [00:24<00:42, 10.49it/s][A[A[A


 48%|████▊     | 412/851 [00:24<00:32, 13.32it/s][A[A[A


 49%|████▉     | 415/851 [00:24<00:31, 14.06it/s][A[A[A


 49%|████▉     | 417/851 [00:24<00:28, 15.36it/s][A[A[A


 49%|████▉     | 421/851 [00:24<00:22, 18.73it/s][A[A[A


 50%|████▉     | 424/851 [00:24<00:21, 19.92it/s][A[A[A


 50%|█████     | 427/851 [00:24<00:22, 18.86it/s][A[A[A


 51%|█████     | 430/851 [00:25<00:24, 17.08it/s][A[A[A


 51%|█████     | 432/851 [00:25<00:27, 15.48it/s][A[A[A


 51%|█████     | 434/851 [00:25<00:25, 16.12it/s][A[A[A


 51%|█████▏    | 437/851

 91%|█████████ | 776/851 [00:45<00:05, 13.83it/s][A[A[A


 91%|█████████▏| 778/851 [00:45<00:04, 14.79it/s][A[A[A


 92%|█████████▏| 780/851 [00:46<00:10,  6.62it/s][A[A[A


 92%|█████████▏| 783/851 [00:46<00:08,  8.15it/s][A[A[A


 92%|█████████▏| 787/851 [00:46<00:06, 10.49it/s][A[A[A


 93%|█████████▎| 790/851 [00:46<00:04, 12.30it/s][A[A[A


 93%|█████████▎| 792/851 [00:46<00:04, 13.52it/s][A[A[A


 93%|█████████▎| 794/851 [00:46<00:04, 13.77it/s][A[A[A


 94%|█████████▎| 796/851 [00:46<00:03, 15.19it/s][A[A[A


 94%|█████████▍| 799/851 [00:47<00:02, 17.43it/s][A[A[A


 94%|█████████▍| 804/851 [00:47<00:02, 21.10it/s][A[A[A


 95%|█████████▍| 807/851 [00:47<00:02, 21.64it/s][A[A[A


 95%|█████████▌| 810/851 [00:47<00:02, 20.08it/s][A[A[A


 96%|█████████▌| 813/851 [00:47<00:01, 20.18it/s][A[A[A


 96%|█████████▌| 816/851 [00:47<00:01, 21.51it/s][A[A[A


 96%|█████████▌| 819/851 [00:47<00:01, 22.14it/s][A[A[A


 97%|█████████▋| 822/851

starting get candidates name and offset


  0%|          | 0/851 [00:00<?, ?it/s][A[A[A


100%|██████████| 851/851 [00:00<00:00, 114409.66it/s][A[A[A


  0%|          | 0/851 [00:00<?, ?it/s][A[A[A


 46%|████▌     | 390/851 [00:00<00:00, 3892.98it/s][A[A[A




similarity 0.1
starting build ngram list
ngram 4


 93%|█████████▎| 788/851 [00:00<00:00, 3915.42it/s][A[A[A


100%|██████████| 851/851 [00:00<00:00, 3818.17it/s][A[A[A


  0%|          | 0/851 [00:00<?, ?it/s][A[A[A


  0%|          | 2/851 [00:00<00:53, 15.86it/s][A[A[A

starting build candidates list





  0%|          | 4/851 [00:00<00:53, 15.76it/s][A[A[A


  1%|          | 6/851 [00:00<00:52, 16.16it/s][A[A[A


  1%|          | 10/851 [00:00<00:42, 19.60it/s][A[A[A


  2%|▏         | 13/851 [00:00<00:39, 21.33it/s][A[A[A


  2%|▏         | 16/851 [00:00<00:46, 18.05it/s][A[A[A


  2%|▏         | 18/851 [00:01<01:02, 13.39it/s][A[A[A


  2%|▏         | 20/851 [00:01<01:02, 13.37it/s][A[A[A


  3%|▎         | 22/851 [00:01<01:03, 13.05it/s][A[A[A


  3%|▎         | 25/851 [00:01<00:52, 15.70it/s][A[A[A


  3%|▎         | 27/851 [00:01<00:56, 14.48it/s][A[A[A


  4%|▎         | 30/851 [00:01<00:51, 15.85it/s][A[A[A


  4%|▍         | 33/851 [00:01<00:49, 16.61it/s][A[A[A


  4%|▍         | 36/851 [00:02<00:45, 18.03it/s][A[A[A


  5%|▍         | 40/851 [00:02<00:37, 21.46it/s][A[A[A


  5%|▌         | 43/851 [00:02<00:40, 20.03it/s][A[A[A


  5%|▌         | 46/851 [00:02<00:45, 17.83it/s][A[A[A


  6%|▌         | 49/851 [00:02<00:50, 1

 47%|████▋     | 401/851 [00:22<00:25, 17.37it/s][A[A[A


 47%|████▋     | 404/851 [00:22<00:24, 18.56it/s][A[A[A


 48%|████▊     | 407/851 [00:22<00:22, 19.59it/s][A[A[A


 48%|████▊     | 410/851 [00:23<00:38, 11.56it/s][A[A[A


 49%|████▊     | 413/851 [00:23<00:35, 12.24it/s][A[A[A


 49%|████▉     | 417/851 [00:23<00:29, 14.48it/s][A[A[A


 50%|████▉     | 422/851 [00:23<00:23, 18.16it/s][A[A[A


 50%|████▉     | 425/851 [00:24<00:26, 15.96it/s][A[A[A


 50%|█████     | 428/851 [00:24<00:26, 15.91it/s][A[A[A


 51%|█████     | 431/851 [00:24<00:23, 17.63it/s][A[A[A


 51%|█████     | 434/851 [00:24<00:24, 16.82it/s][A[A[A


 51%|█████▏    | 437/851 [00:24<00:22, 18.26it/s][A[A[A


 52%|█████▏    | 440/851 [00:24<00:20, 20.31it/s][A[A[A


 52%|█████▏    | 444/851 [00:25<00:17, 23.45it/s][A[A[A


 53%|█████▎    | 448/851 [00:25<00:15, 25.43it/s][A[A[A


 53%|█████▎    | 451/851 [00:25<00:15, 25.38it/s][A[A[A


 53%|█████▎    | 454/851

 94%|█████████▎| 796/851 [00:45<00:03, 15.68it/s][A[A[A


 94%|█████████▍| 799/851 [00:45<00:02, 18.07it/s][A[A[A


 94%|█████████▍| 803/851 [00:45<00:02, 21.43it/s][A[A[A


 95%|█████████▍| 806/851 [00:45<00:02, 22.05it/s][A[A[A


 95%|█████████▌| 809/851 [00:46<00:01, 21.04it/s][A[A[A


 95%|█████████▌| 812/851 [00:46<00:01, 22.30it/s][A[A[A


 96%|█████████▌| 815/851 [00:46<00:01, 22.34it/s][A[A[A


 96%|█████████▌| 819/851 [00:46<00:01, 23.66it/s][A[A[A


 97%|█████████▋| 822/851 [00:46<00:01, 21.43it/s][A[A[A


 97%|█████████▋| 825/851 [00:46<00:01, 19.22it/s][A[A[A


 97%|█████████▋| 828/851 [00:46<00:01, 19.05it/s][A[A[A


 98%|█████████▊| 830/851 [00:47<00:01, 18.98it/s][A[A[A


 98%|█████████▊| 833/851 [00:47<00:00, 19.43it/s][A[A[A


 98%|█████████▊| 837/851 [00:47<00:00, 20.77it/s][A[A[A


 99%|█████████▊| 840/851 [00:47<00:00, 21.79it/s][A[A[A


 99%|█████████▉| 843/851 [00:47<00:00, 22.26it/s][A[A[A


 99%|█████████▉| 846/851

starting get candidates name and offset





  0%|          | 0/851 [00:00<?, ?it/s][A[A[A


100%|██████████| 851/851 [00:00<00:00, 116920.62it/s][A[A[A


  0%|          | 0/851 [00:00<?, ?it/s][A[A[A


 42%|████▏     | 355/851 [00:00<00:00, 3549.37it/s][A[A[A

similarity 0.2
starting build ngram list
ngram 4





 86%|████████▋ | 736/851 [00:00<00:00, 3622.64it/s][A[A[A


100%|██████████| 851/851 [00:00<00:00, 3636.16it/s][A[A[A


  0%|          | 0/851 [00:00<?, ?it/s][A[A[A


  0%|          | 2/851 [00:00<00:50, 16.81it/s][A[A[A

starting build candidates list





  0%|          | 4/851 [00:00<00:51, 16.60it/s][A[A[A


  1%|          | 6/851 [00:00<00:49, 16.95it/s][A[A[A


  1%|          | 10/851 [00:00<00:41, 20.35it/s][A[A[A


  2%|▏         | 13/851 [00:00<00:37, 22.14it/s][A[A[A


  2%|▏         | 16/851 [00:00<00:43, 19.26it/s][A[A[A


  2%|▏         | 18/851 [00:00<00:57, 14.39it/s][A[A[A


  2%|▏         | 20/851 [00:01<00:58, 14.23it/s][A[A[A


  3%|▎         | 22/851 [00:01<01:01, 13.56it/s][A[A[A


  3%|▎         | 25/851 [00:01<00:51, 16.15it/s][A[A[A


  3%|▎         | 27/851 [00:01<00:56, 14.51it/s][A[A[A


  4%|▎         | 30/851 [00:01<00:52, 15.78it/s][A[A[A


  4%|▍         | 33/851 [00:01<00:50, 16.35it/s][A[A[A


  4%|▍         | 36/851 [00:02<00:45, 17.79it/s][A[A[A


  5%|▍         | 40/851 [00:02<00:38, 21.03it/s][A[A[A


  5%|▌         | 43/851 [00:02<00:39, 20.22it/s][A[A[A


  5%|▌         | 46/851 [00:02<00:45, 17.86it/s][A[A[A


  6%|▌         | 49/851 [00:02<00:49, 1

 46%|████▌     | 388/851 [00:22<00:31, 14.70it/s][A[A[A


 46%|████▌     | 390/851 [00:22<00:31, 14.48it/s][A[A[A


 46%|████▌     | 392/851 [00:22<00:31, 14.39it/s][A[A[A


 46%|████▋     | 395/851 [00:23<00:28, 16.05it/s][A[A[A


 47%|████▋     | 398/851 [00:23<00:24, 18.57it/s][A[A[A


 47%|████▋     | 401/851 [00:23<00:26, 17.20it/s][A[A[A


 47%|████▋     | 404/851 [00:23<00:24, 18.48it/s][A[A[A


 48%|████▊     | 407/851 [00:23<00:22, 19.56it/s][A[A[A


 48%|████▊     | 410/851 [00:24<00:38, 11.57it/s][A[A[A


 49%|████▊     | 413/851 [00:24<00:35, 12.20it/s][A[A[A


 49%|████▉     | 417/851 [00:24<00:30, 14.22it/s][A[A[A


 49%|████▉     | 421/851 [00:24<00:24, 17.47it/s][A[A[A


 50%|████▉     | 424/851 [00:24<00:22, 18.71it/s][A[A[A


 50%|█████     | 427/851 [00:24<00:23, 17.84it/s][A[A[A


 51%|█████     | 430/851 [00:25<00:25, 16.42it/s][A[A[A


 51%|█████     | 432/851 [00:25<00:27, 15.48it/s][A[A[A


 51%|█████     | 434/851

 92%|█████████▏| 783/851 [00:45<00:08,  8.31it/s][A[A[A


 92%|█████████▏| 787/851 [00:45<00:05, 10.68it/s][A[A[A


 93%|█████████▎| 789/851 [00:45<00:05, 12.26it/s][A[A[A


 93%|█████████▎| 791/851 [00:46<00:04, 12.92it/s][A[A[A


 93%|█████████▎| 793/851 [00:46<00:04, 13.55it/s][A[A[A


 93%|█████████▎| 795/851 [00:46<00:03, 14.07it/s][A[A[A


 94%|█████████▍| 799/851 [00:46<00:03, 17.12it/s][A[A[A


 94%|█████████▍| 804/851 [00:46<00:02, 20.78it/s][A[A[A


 95%|█████████▍| 807/851 [00:46<00:02, 21.15it/s][A[A[A


 95%|█████████▌| 810/851 [00:46<00:01, 21.32it/s][A[A[A


 96%|█████████▌| 813/851 [00:47<00:01, 21.45it/s][A[A[A


 96%|█████████▌| 816/851 [00:47<00:01, 22.30it/s][A[A[A


 96%|█████████▌| 819/851 [00:47<00:01, 23.50it/s][A[A[A


 97%|█████████▋| 822/851 [00:47<00:01, 22.00it/s][A[A[A


 97%|█████████▋| 825/851 [00:47<00:01, 20.11it/s][A[A[A


 97%|█████████▋| 828/851 [00:47<00:01, 20.58it/s][A[A[A


 98%|█████████▊| 831/851

starting get candidates name and offset


  0%|          | 0/851 [00:00<?, ?it/s][A[A[A


100%|██████████| 851/851 [00:00<00:00, 155310.80it/s][A[A[A


  0%|          | 0/851 [00:00<?, ?it/s][A[A[A


 51%|█████     | 430/851 [00:00<00:00, 4289.64it/s][A[A[A




similarity 0.3
starting build ngram list
ngram 4


100%|█████████▉| 847/851 [00:00<00:00, 4252.77it/s][A[A[A


100%|██████████| 851/851 [00:00<00:00, 4017.34it/s][A[A[A


  0%|          | 0/851 [00:00<?, ?it/s][A[A[A


  0%|          | 2/851 [00:00<00:51, 16.48it/s][A[A[A

starting build candidates list





  0%|          | 4/851 [00:00<00:52, 16.19it/s][A[A[A


  1%|          | 6/851 [00:00<00:50, 16.75it/s][A[A[A


  1%|          | 10/851 [00:00<00:41, 20.22it/s][A[A[A


  2%|▏         | 13/851 [00:00<00:38, 21.76it/s][A[A[A


  2%|▏         | 16/851 [00:00<00:44, 18.79it/s][A[A[A


  2%|▏         | 18/851 [00:01<00:58, 14.34it/s][A[A[A


  2%|▏         | 20/851 [00:01<00:58, 14.13it/s][A[A[A


  3%|▎         | 22/851 [00:01<01:01, 13.42it/s][A[A[A


  3%|▎         | 26/851 [00:01<00:50, 16.29it/s][A[A[A


  3%|▎         | 28/851 [00:01<00:55, 14.75it/s][A[A[A


  4%|▎         | 30/851 [00:01<00:52, 15.51it/s][A[A[A


  4%|▍         | 33/851 [00:01<00:50, 16.24it/s][A[A[A


  4%|▍         | 36/851 [00:02<00:45, 17.88it/s][A[A[A


  5%|▍         | 40/851 [00:02<00:38, 21.10it/s][A[A[A


  5%|▌         | 43/851 [00:02<00:39, 20.51it/s][A[A[A


  5%|▌         | 46/851 [00:02<00:43, 18.31it/s][A[A[A


  6%|▌         | 49/851 [00:02<00:49, 1

 47%|████▋     | 401/851 [00:22<00:25, 17.65it/s][A[A[A


 47%|████▋     | 403/851 [00:22<00:24, 17.97it/s][A[A[A


 48%|████▊     | 405/851 [00:22<00:25, 17.58it/s][A[A[A


 48%|████▊     | 408/851 [00:22<00:41, 10.74it/s][A[A[A


 48%|████▊     | 412/851 [00:23<00:32, 13.56it/s][A[A[A


 49%|████▉     | 415/851 [00:23<00:30, 14.34it/s][A[A[A


 49%|████▉     | 417/851 [00:23<00:28, 15.49it/s][A[A[A


 49%|████▉     | 421/851 [00:23<00:22, 18.88it/s][A[A[A


 50%|████▉     | 424/851 [00:23<00:21, 19.55it/s][A[A[A


 50%|█████     | 427/851 [00:23<00:22, 18.76it/s][A[A[A


 51%|█████     | 430/851 [00:23<00:24, 17.30it/s][A[A[A


 51%|█████     | 432/851 [00:24<00:26, 16.10it/s][A[A[A


 51%|█████     | 434/851 [00:24<00:24, 16.89it/s][A[A[A


 51%|█████▏    | 437/851 [00:24<00:22, 18.08it/s][A[A[A


 52%|█████▏    | 440/851 [00:24<00:20, 20.50it/s][A[A[A


 52%|█████▏    | 444/851 [00:24<00:17, 23.89it/s][A[A[A


 53%|█████▎    | 448/851

 94%|█████████▍| 804/851 [00:44<00:02, 20.47it/s][A[A[A


 95%|█████████▍| 807/851 [00:44<00:02, 21.38it/s][A[A[A


 95%|█████████▌| 810/851 [00:45<00:01, 21.11it/s][A[A[A


 96%|█████████▌| 813/851 [00:45<00:01, 21.56it/s][A[A[A


 96%|█████████▌| 816/851 [00:45<00:01, 22.42it/s][A[A[A


 96%|█████████▌| 819/851 [00:45<00:01, 23.48it/s][A[A[A


 97%|█████████▋| 822/851 [00:45<00:01, 21.95it/s][A[A[A


 97%|█████████▋| 825/851 [00:45<00:01, 20.12it/s][A[A[A


 97%|█████████▋| 828/851 [00:45<00:01, 20.57it/s][A[A[A


 98%|█████████▊| 831/851 [00:46<00:00, 22.12it/s][A[A[A


 98%|█████████▊| 834/851 [00:46<00:00, 21.56it/s][A[A[A


 98%|█████████▊| 837/851 [00:46<00:00, 21.14it/s][A[A[A


 99%|█████████▊| 840/851 [00:46<00:00, 22.18it/s][A[A[A


 99%|█████████▉| 843/851 [00:46<00:00, 22.82it/s][A[A[A


 99%|█████████▉| 846/851 [00:46<00:00, 18.30it/s][A[A[A


100%|█████████▉| 849/851 [00:46<00:00, 18.33it/s][A[A[A


100%|██████████| 851/851

starting get candidates name and offset


  0%|          | 0/851 [00:00<?, ?it/s][A[A[A


100%|██████████| 851/851 [00:00<00:00, 140897.36it/s][A[A[A


  0%|          | 0/851 [00:00<?, ?it/s][A[A[A


 42%|████▏     | 359/851 [00:00<00:00, 3587.95it/s][A[A[A

similarity 0.4
starting build ngram list
ngram 4





 88%|████████▊ | 749/851 [00:00<00:00, 3675.83it/s][A[A[A


100%|██████████| 851/851 [00:00<00:00, 3615.24it/s][A[A[A


  0%|          | 0/851 [00:00<?, ?it/s][A[A[A


  0%|          | 2/851 [00:00<00:49, 17.04it/s][A[A[A

starting build candidates list





  0%|          | 4/851 [00:00<00:50, 16.68it/s][A[A[A


  1%|          | 6/851 [00:00<00:49, 17.02it/s][A[A[A


  1%|          | 10/851 [00:00<00:40, 20.53it/s][A[A[A


  2%|▏         | 13/851 [00:00<00:37, 22.32it/s][A[A[A


  2%|▏         | 16/851 [00:00<00:43, 19.24it/s][A[A[A


  2%|▏         | 18/851 [00:00<00:58, 14.32it/s][A[A[A


  2%|▏         | 20/851 [00:01<00:58, 14.19it/s][A[A[A


  3%|▎         | 22/851 [00:01<01:02, 13.29it/s][A[A[A


  3%|▎         | 26/851 [00:01<00:51, 16.06it/s][A[A[A


  3%|▎         | 28/851 [00:01<00:56, 14.52it/s][A[A[A


  4%|▎         | 30/851 [00:01<00:54, 15.07it/s][A[A[A


  4%|▍         | 33/851 [00:01<00:51, 15.85it/s][A[A[A


  4%|▍         | 36/851 [00:02<00:46, 17.46it/s][A[A[A


  5%|▍         | 40/851 [00:02<00:38, 20.82it/s][A[A[A


  5%|▌         | 43/851 [00:02<00:40, 19.80it/s][A[A[A


  5%|▌         | 46/851 [00:02<00:44, 17.95it/s][A[A[A


  6%|▌         | 49/851 [00:02<00:49, 1

 47%|████▋     | 401/851 [00:22<00:25, 17.74it/s][A[A[A


 47%|████▋     | 404/851 [00:22<00:23, 19.04it/s][A[A[A


 48%|████▊     | 407/851 [00:22<00:21, 20.23it/s][A[A[A


 48%|████▊     | 410/851 [00:22<00:37, 11.87it/s][A[A[A


 49%|████▊     | 413/851 [00:23<00:35, 12.50it/s][A[A[A


 49%|████▉     | 417/851 [00:23<00:29, 14.90it/s][A[A[A


 50%|████▉     | 422/851 [00:23<00:22, 18.67it/s][A[A[A


 50%|████▉     | 425/851 [00:23<00:25, 16.79it/s][A[A[A


 50%|█████     | 428/851 [00:23<00:25, 16.47it/s][A[A[A


 51%|█████     | 431/851 [00:23<00:23, 18.12it/s][A[A[A


 51%|█████     | 434/851 [00:24<00:24, 17.06it/s][A[A[A


 51%|█████▏    | 437/851 [00:24<00:22, 18.54it/s][A[A[A


 52%|█████▏    | 440/851 [00:24<00:19, 20.81it/s][A[A[A


 52%|█████▏    | 444/851 [00:24<00:17, 23.93it/s][A[A[A


 53%|█████▎    | 448/851 [00:24<00:15, 25.75it/s][A[A[A


 53%|█████▎    | 451/851 [00:24<00:15, 25.78it/s][A[A[A


 53%|█████▎    | 454/851

 94%|█████████▍| 804/851 [00:44<00:02, 20.67it/s][A[A[A


 95%|█████████▍| 807/851 [00:44<00:02, 21.03it/s][A[A[A


 95%|█████████▌| 810/851 [00:45<00:01, 21.09it/s][A[A[A


 96%|█████████▌| 813/851 [00:45<00:01, 21.61it/s][A[A[A


 96%|█████████▌| 816/851 [00:45<00:01, 22.75it/s][A[A[A


 96%|█████████▌| 819/851 [00:45<00:01, 23.86it/s][A[A[A


 97%|█████████▋| 822/851 [00:45<00:01, 21.68it/s][A[A[A


 97%|█████████▋| 825/851 [00:45<00:01, 20.22it/s][A[A[A


 97%|█████████▋| 828/851 [00:45<00:01, 20.94it/s][A[A[A


 98%|█████████▊| 831/851 [00:45<00:00, 22.65it/s][A[A[A


 98%|█████████▊| 834/851 [00:46<00:00, 22.27it/s][A[A[A


 98%|█████████▊| 837/851 [00:46<00:00, 21.99it/s][A[A[A


 99%|█████████▊| 840/851 [00:46<00:00, 22.64it/s][A[A[A


 99%|█████████▉| 843/851 [00:46<00:00, 23.08it/s][A[A[A


 99%|█████████▉| 846/851 [00:46<00:00, 18.89it/s][A[A[A


100%|█████████▉| 849/851 [00:46<00:00, 18.97it/s][A[A[A


100%|██████████| 851/851

starting get candidates name and offset


  0%|          | 0/851 [00:00<?, ?it/s][A[A[A


100%|██████████| 851/851 [00:00<00:00, 196053.65it/s][A[A[A

In [237]:
for i in range(len(ratio_all)):
    print(sum(ratio_all[i])/len(ratio_all[i]))
    print(sum(recall_all[i])/len(recall_all[i]))

18.855327532501448
0.9521725141290355
18.590112939025982
0.9521725141290355
15.693984201592894
0.951447876447876
10.786663682315858
0.9491797698319432
6.31460793091228
0.9422406878928613


In [223]:
cut_data = ns.cut_data

for i in range(len(recall)):
    if recall[i] < 1:
#         print(cand_name[i])
#         print(dev_data[i])
        for j in en[i]:
            if j not in cand_name[i]:
                if j in cut_data[i]:
                    print(cand_name[i])
                    print(cut_data[i])
                    print(dev_data[i])
        
        

['绿洲', '影评', 'mtime', '时光', '网', 'mtime', 'mtime', '时光网', 'mtime']
['绿洲', ' ', '影评', '╟', ' ', 'mtime', '时光', '网', '绿洲 ', ' 影评', '影评╟', '╟ ', ' mtime', 'mtime时光', '时光网', '绿洲 影评', ' 影评╟', '影评╟ ', '╟ mtime', ' mtime时光', 'mtime时光网', '绿洲 影评╟', ' 影评╟ ', '影评╟ mtime', '╟ mtime时光', ' mtime时光网', '绿洲 影评╟ ', ' 影评╟ mtime', '影评╟ mtime时光', '╟ mtime时光网']
{'text_id': '9107', 'text': '绿洲 影评╟ mtime时光网', 'mention_data': [('绿洲', 0, '159221'), ('影评', 3, '22265'), ('mtime时光网', 7, '153474')]}
['守护者', '世纪', '精彩', '片段', '俄罗斯', '动作', '科幻片', '世纪战元', '世纪战元', '世纪战元', '守护者：世纪战元']
['《', '守护者', ':', '世纪', '战元', '》', '精彩', '片段', ' ', '俄罗斯', '动作', '科幻片', '《守护者', '守护者:', ':世纪', '世纪战元', '战元》', '》精彩', '精彩片段', '片段 ', ' 俄罗斯', '俄罗斯动作', '动作科幻片', '《守护者:', '守护者:世纪', ':世纪战元', '世纪战元》', '战元》精彩', '》精彩片段', '精彩片段 ', '片段 俄罗斯', ' 俄罗斯动作', '俄罗斯动作科幻片', '《守护者:世纪', '守护者:世纪战元', ':世纪战元》', '世纪战元》精彩', '战元》精彩片段', '》精彩片段 ', '精彩片段 俄罗斯', '片段 俄罗斯动作', ' 俄罗斯动作科幻片', '《守护者:世纪战元', '守护者:世纪战元》', ':世纪战元》精彩', '世纪战元》精彩片段', '战元》精彩片段 ', '》精彩片段 俄罗斯', '精彩片段 俄罗斯动作

In [224]:
ts = TopSim(kb_data.kb)

In [226]:
print(ts.search('神薙'))

[(1.0, [61124])]


In [158]:
kb.kb2id['的']

['36797', '60209', '70607', '343599']

In [159]:
for i in ['36797', '60209', '70607', '343599']:
    print(kb.id2kb[i])
    cand_name = ns.cand_name

{'alias': ['的', '马廷义'], 'data': {'摘要': '马廷义，男，1956年1月出生，汉族，河南明泰铝业股份有限公司董事长兼总经理，巩义市第三届人大常务委员会委员。1997年，他组建了河南明泰铝业有限公司。10年来，他依靠自有资金，实行滚动发展，使当年名不见经传的小厂成为拥有注册资金3.15亿元、总资产达到13亿元、占地面积24.6万平方米、职工2000余人的巩义30强企业，累计向国家上交税收1亿多元，安置下岗职工与社会劳动力1800余人。', '国籍': '中国', '出生日期': '1956年1月', '民族': '汉族', '别名': '的', '中文名': '马廷义', '义项描述': '马廷义', '标签': '行业人物'}, 'type': ['human']}
{'alias': ['喷喉爽', '的'], 'data': {'地点': '云南', '民族': '彝族', '中文名': '的', '属性': '药物', '义项描述': '喷喉爽', '标签': '食品'}, 'type': ['human']}
{'alias': ['的'], 'data': {'摘要': '的是一个汉语汉字，普通话读音是de,dí,dì,dī(粤语读音为dik1)，其居现代汉语使用量之首，有着漫长的演变史。用在定语的后面，也可以用在主谓短语中间，现代汉语56008个常用词 “的”字用最多的一个字。', '外文名': 'of', '词性': '名词、助词、代词', '笔顺编号': '32511354', '五笔86,98': 'r(一级简码)、rqyy(全码)', '普通话拼音': 'de,dí,dì,dī', '总笔画数': '8', '粤语拼音': 'dik1', '部首': '白', '仓颉': 'hapi', '中文名': '的', '郑码': 'nkrs', '二笔': 'd w t v(全码)、d(一级简码)', '造字法': '形声。本作“旳”，从日，勺声', 'unicode': 'cjk 统一汉字 u+7684', '结构': '左右结构', '四角号码': '27620', '义项描述': '汉语汉字', '标签': '语言'}, 'type': ['vocabulary']}
{'alias': ['的'], 'data

In [None]:
def build_vocabs(datasets):
    token_vocabs = set()
    for 

In [None]:
def build_char2id(token_vocabs):
    char2id = {}
    for 

In [None]:
import numpy as np
import os

In [None]:
UNK = "$UNK$"
NUM = "$NUM$"
NONE = "O"

In [None]:
# special error message
class MyIOError(Exception):
    def __init__(self, filename):
        # custom error message
        message = """
ERROR: Unable to locate file {}.

FIX: Have you tried running python build_data.py first?
This will build vocab file from your train, test and dev sets and
trimm your word vectors.
""".format(filename)
        super(MyIOError, self).__init__(message)

In [None]:
class Dataset(object):
    '''
    读文件，生成matrix
    '''
    def __init__(self,filename):
        
    def __len__(self,)
        
    