In [1]:
import json
from tqdm import tqdm
import os
from random import choice
from itertools import groupby
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
class KB(object):
    """In-memory index over a JSON-lines knowledge-base file.

    Each line of the file is parsed as a JSON object with the keys
    ``subject_id``, ``subject``, ``type``, ``data`` and (optionally)
    ``alias``.  All names, types, predicates and objects are lower-cased.

    Attributes set by the constructor:
        kb_directory: the path given by the caller (opened as a file).
        id2kb: dict ``subject_id -> {'alias': [...], 'data': {...}, 'type': [...]}``.
        types: set of every lower-cased type seen in the file.
        predicate: set of every lower-cased predicate seen in the file.
        kb2id: dict ``alias -> [subject_id, ...]``.
        kb: list of all aliases (the keys of ``kb2id``).
        id: list of all subject ids (the keys of ``id2kb``).
    """

    def __init__(self, kb_directory):
        """Load the file at ``kb_directory``, build both indexes and print a summary."""
        print("start loading kb_data...")
        self.kb_directory = kb_directory
        self.id2kb, self.types, self.predicate = self.get_id2kb()
        self.kb2id = self.get_kb2id()
        self.kb = list(self.kb2id.keys())
        self.id = list(self.id2kb.keys())
        print("KB DATA INFORMATION")
        print("TOKEN SIZE:{}".format(self.get_token_size()))
        print("ID SIZE:{}".format(len(self)))
        print("TYPE SIZE:{}".format(len(self.types)))
        print("PREDICATE SIZE:{}".format(len(self.predicate)))

    def get_id2kb(self):
        """Parse the knowledge-base file.

        Returns:
            A tuple ``(id2kb, types, predicates)``.  Subjects whose ``data``
            list is empty are left out of ``id2kb``, but their types still
            contribute to ``types``.
        """
        print("construct id2kb dict...")
        id2kb = {}
        kbtype = set()
        predicate = set()
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(self.kb_directory, encoding='utf-8') as f:
            for line in tqdm(f):
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                record = json.loads(line)
                subject_id = record['subject_id']
                names = [record['subject']] + record.get('alias', [])
                # Lower-case BEFORE de-duplicating: otherwise "Apple" and
                # "apple" both survive and the same id is later appended
                # twice to kb2id['apple'].
                subject_alias = list(dict.fromkeys(name.lower() for name in names))
                subject_type = [t.lower() for t in record['type']]
                kbtype.update(subject_type)
                subject_data = {}
                for fact in record['data']:
                    key = fact['predicate'].lower()
                    predicate.add(key)
                    # A repeated predicate keeps only its last object.
                    subject_data[key] = fact['object'].lower()
                if subject_data:
                    id2kb[subject_id] = {'alias': subject_alias, 'data': subject_data, 'type': subject_type}
        return id2kb, kbtype, predicate

    def get_kb2id(self):
        """Invert ``id2kb`` into a dict mapping each alias to the ids that carry it."""
        print("construct kb2id dict...")
        kb2id = {}
        for subject_id, entry in self.id2kb.items():
            for alias in entry['alias']:
                kb2id.setdefault(alias, []).append(subject_id)
        return kb2id

    def __len__(self):
        """Number of subjects kept in ``id2kb``."""
        return len(self.id2kb)

    def get_token_size(self):
        """Number of distinct aliases."""
        return len(self.kb)
#     def save(self):
        


In [3]:
# Build the knowledge-base index from ./ccks2019_el/kb_data.
# The KB constructor reads the whole file and prints progress plus size statistics.
kb_data = KB('./ccks2019_el/kb_data')

1532it [00:00, 15317.49it/s]

start loading kb_data...
construct id2kb dict...


399252it [00:20, 19298.34it/s]


construct kb2id dict...
KB DATA INFORMATION
TOKEN SIZE:303375
ID SIZE:399233
TYPE SIZE:51
PREDICATE SIZE:41841


In [None]:
class ngram_search(object):
    def __init__(self,data,kb,ngram = 2,similarity = 0.5):
        """Run the whole candidate-search pipeline and keep every stage on the instance.

        Args:
            data: iterable of texts; each item is handed to ``jieba.cut``.
            kb: passed to ``TopSim`` and indexed by ``get_candidates_name``
                (presumably a list of KB names -- confirm against the caller).
            ngram: largest number of consecutive words joined into one search key.
            similarity: a search result is kept only if its top score is
                strictly greater than this value.
        """
        self.n = ngram
        self.similarity = similarity
        self.data = data
        self.kb = kb
        self.cut_data,self.offset = self.cut_words()
        self.ts = TopSim(self.kb)
        self.candidates = self.get_candidates(self.similarity)
        self.cand_name,self.cand_off = self.get_candidates_name()
        # get_cand_with_off is declared as get_cand_with_off(self); passing
        # self.similarity here raised TypeError.
        self.cand_with_off = self.get_cand_with_off()
    def cut_words(self):
        """Segment every text in ``self.data`` and extend it with word n-grams.

        For each text the result holds the words produced by ``jieba.cut``
        followed by every run of 2..``self.n`` consecutive words joined
        together.  A parallel list holds, for each entry, the summed length
        of the words that precede it, i.e. its character offset provided the
        words concatenate back to the original text.

        Returns:
            ``(result, offset)``: two lists with one inner list per text;
            ``result[k][m]`` and ``offset[k][m]`` describe the same n-gram.
        """
        print('starting build ngram list')
        print('ngram',self.n)
        result = []
        offset = []
        for d in tqdm(self.data):
            words = list(jieba.cut(d))
            n_words = len(words)
            # Start position of each word.  Built per word so that an empty
            # segmentation yields an empty offset list (previously it was [0],
            # leaving the two parallel lists with different lengths).
            starts = []
            pos = 0
            for word in words:
                starts.append(pos)
                pos += len(word)
            grams = list(words)
            gram_off = list(starts)
            for size in range(2, self.n + 1):
                # Slide a window of `size` words over the original words only.
                for begin in range(n_words - size + 1):
                    grams.append(''.join(words[begin:begin + size]))
                    gram_off.append(starts[begin])
            result.append(grams)
            offset.append(gram_off)
        return result,offset
                                             
    def get_candidates(self,similarity = 0.5):
        """Look up every n-gram of every text with ``self.ts.search``.

        Args:
            similarity: threshold stored on ``self.similarity``; a search
                result is kept only when ``result[0][0]`` is strictly
                greater than it.

        Returns:
            A list with one inner list per text, aligned with
            ``self.cut_data``: each entry is the full search result for that
            n-gram, or ``[]`` when the search returned nothing or its first
            score did not exceed the threshold.
        """
        self.similarity = similarity
        print('starting build candidates list')
        print('similarity:',self.similarity)
        candidates = []
        for grams in tqdm(self.cut_data):
            row = []
            for gram in grams:
                # Use the instance's searcher; the bare name `ts` only worked
                # if a global of that name happened to exist in the kernel.
                hits = self.ts.search(gram)
                if hits and hits[0][0] > self.similarity:
                    row.append(hits)
                else:
                    # Keep a placeholder so positions stay aligned with self.offset.
                    row.append([])
            candidates.append(row)
        return candidates
    
    def get_candidates_name(self):
        """Collect the top KB entry and the offset for every n-gram that has a candidate.

        For each non-empty entry of ``self.candidates`` the value
        ``entry[0][1][0]`` is used to index ``self.kb``; the matching value
        from ``self.offset`` is collected alongside it.

        Returns:
            ``(cand_name, cand_offset)``: two lists with one inner list per
            text, holding the looked-up KB items and their offsets in the
            same order.
        """
        print('starting get candidates name and offset')
        cand_name, cand_offset = [], []
        for row in tqdm(range(len(self.candidates))):
            # Positions (and search results) of the n-grams that matched something.
            matched = [(col, found) for col, found in enumerate(self.candidates[row]) if found]
            cand_name.append([self.kb[found[0][1][0]] for _, found in matched])
            cand_offset.append([self.offset[row][col] for col, _ in matched])
        return cand_name, cand_offset
    
    def get_cand_with_off(self):
        print('starting get candidates with offset')
        cand_with_off = []
        for i in tqdm(range(len(self.cand_name))):
            c_o = {}
            for j in range(len(self.cand_name[i]))
                offset = self.cand_offset[i][j]
                cand = self.cand_name = 
                if offset not in c_o:
                    c_o[str(offset)] = []
