In [6]:
'''
__author__: Ellen Wu, Jiaming Shen, Dongming Lei
__description__: Map each entity surface name to an entity id and filter out entities that occur too rarely.
    Input: 1) entitylist (a list of raw entity mentions)
    Output: 1) a map from entity surface name to eid and 2) a list of eid
__latest_updates__: 08/23/2017
'''
from textblob import Word
from collections import defaultdict
import inflection
import editdistance
import sys

# Maximum edit distance at which resolution() maps a newly normalized name onto
# an already-known normalized name.
EDIT_DISTANCE_THRESHOLD = 1

def resolution(surfaceName, normalized_ename2eid):
    '''
    Map a raw entity surface name to its "normalized" entity name.

    input: surfaceName - a surface name of entity
           normalized_ename2eid - dict whose keys are the normalized names seen so far
    output: the "normalized" entity name; this is an existing key of
            normalized_ename2eid when an exact or near match is found,
            otherwise the freshly normalized string
    process: 1) lowercase
             2) delete '-' and turn '_' into ' '
             3) singularize (inflection.singularize)
             4) if the result is longer than 5 characters and not already a key,
                return the first key (in dict iteration order) whose edit distance
                to it is at most EDIT_DISTANCE_THRESHOLD
    '''
    normalized = surfaceName.lower().replace('-', '').replace('_', ' ')
    normalized = inflection.singularize(normalized)

    # Exact hits need no further work; names of 5 characters or fewer are never
    # fuzzy-matched.
    if normalized in normalized_ename2eid or len(normalized) <= 5:
        return normalized

    # Step 2: Use edit distance to resolve
    existing_keys = [
        ele for ele in normalized_ename2eid
        if editdistance.eval(normalized, ele) <= EDIT_DISTANCE_THRESHOLD
    ]
    if not existing_keys:
        return normalized

    # Log every fuzzy merge so that questionable ones can be inspected afterwards.
    print(surfaceName, normalized, existing_keys)
    return existing_keys[0]
    

def main(corpusName, min_sup = -1):
    '''
    Assign an entity id (eid) to every surface name of a corpus and write the
    resulting mappings to disk.

    input: corpusName - name of the corpus directory under ../../data/
           min_sup - minimum summed frequency a normalized entity needs in order
                     to be kept (converted with int()); -1 disables filtering
    output: None. Two files are written under ../../data/<corpusName>/intermediate/:
            entity2id.txt - "<surface name><TAB><eid>", sorted by surface name
            eidlist.txt   - "<normalized name><TAB><eid>", sorted by eid
            Entities whose summed frequency is below min_sup appear in neither file.
    '''
    min_sup = int(min_sup)
    intermediateDir = '../../data/'+corpusName+'/intermediate/'
    inputFileName = intermediateDir+'entitylist.txt'
    outputFileName = intermediateDir+'entity2id.txt'
    uniqueEntityNameFileOut = intermediateDir+'eidlist.txt'

    eid = 0
    ename2eid = {}
    normalized_ename2eid = {}
    normalized_ename2freq = defaultdict(int)
    with open(inputFileName,"r") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                # A blank line would otherwise fail on segs[1] with IndexError.
                continue
            segs = line.split("\t")
            ename = segs[0]
            freq = int(segs[1])

            normalized_ename = resolution(ename, normalized_ename2eid)
            if normalized_ename not in normalized_ename2eid: # a new entity
                normalized_ename2eid[normalized_ename] = eid
                eid += 1
            ename2eid[ename] = normalized_ename2eid[normalized_ename]
            normalized_ename2freq[normalized_ename] += freq

    print("Number of entities before (potential) filtering = %s" % eid)
    filtered_eid = set()
    if min_sup != -1:
        print("Filtering entities with too small occurrences")
        # Collect the eids whose summed frequency falls below the threshold.
        filtered_eid = {
            normalized_ename2eid[name]
            for name, total in normalized_ename2freq.items()
            if total < min_sup
        }
        print("Number of filtered entities = %s" % len(filtered_eid))

    with open(outputFileName,"w") as fout:
        for name, entity_id in sorted(ename2eid.items(), key = lambda x:x[0]):
            if entity_id not in filtered_eid:
                fout.write(name+"\t"+str(entity_id)+"\n")

    with open(uniqueEntityNameFileOut,"w") as fout:
        for name, entity_id in sorted(normalized_ename2eid.items(), key = lambda x:x[1]):
            if entity_id not in filtered_eid:
                fout.write(name+"\t"+str(entity_id)+"\n")

if __name__ == '__main__':
    # Notebook run: the corpus name and the support threshold are fixed here
    # rather than being read from sys.argv[1] and sys.argv[2].
    corpusName, min_sup = 'dblp', 15
    main(corpusName, min_sup)




10fold crossvalidation 10fold crossvalidation ['10fold cross validation']
2011 wiley periodicals 2011 wiley periodical ['2010 wiley periodical']
2012 wiley periodicals 2012 wiley periodical ['2010 wiley periodical']
adhoc network adhoc network ['ad hoc network']
adhoc networks adhoc network ['ad hoc network']
adhoc retrieval adhoc retrieval ['ad hoc retrieval']
aluminum aluminum ['aluminium']
approximation algorithm approximation algorithm ['2approximation algorithm']
approximation algorithms approximation algorithm ['2approximation algorithm']
artificial datasets artificial dataset ['artificial data set']
association rules mining association rules mining ['association rule mining']
benchmark dataset benchmark dataset ['benchmark data set']
benchmark datasets benchmark dataset ['benchmark data set']
biterror rate biterror rate ['bit error rate']
boolean formulae boolean formulae ['boolean formula']
boxing boxing ['boeing']
brain tumour brain tumour ['brain tumor']
braincomputer interfa

In [22]:
# generate the canonical form
from collections import defaultdict

data = 'dblpv2'

# entityCount: surface name -> frequency, read from "<name><TAB><count>" lines.
entityCountFileName = '../../data/'+data+'/intermediate/entitylist.txt'
entityCount = {}
with open(entityCountFileName) as fin:
    for line in fin:
        line = line.strip()
        if not line:
            # A blank line would otherwise fail on segs[1] with IndexError.
            continue
        segs = line.split("\t")
        entityCount[segs[0]] = int(segs[1])

# id2entities: eid (kept as the string read from the file) -> set of surface
# names, read from "<name><TAB><eid>" lines.
inputFileName = '../../data/'+data+'/intermediate/entity2id.txt'
id2entities = defaultdict(set)
with open(inputFileName) as fin:
    for line in fin:
        line = line.strip()
        if not line:
            continue
        segs = line.split("\t")
        id2entities[segs[1]].add(segs[0])
        
        
        
    

In [20]:
import numpy as np

# For every eid, write "<canonical form><TAB><comma-separated surface names>",
# where the canonical form is the surface name with the highest count
# (the first one in list order on ties).
canonicalFormOutputFile = '../../data/'+data+'/intermediate/canonicalMapping.txt'
with open(canonicalFormOutputFile, 'w') as fout:
    for key in id2entities:
        vals = list(id2entities[key])
        counts = [entityCount[ele] for ele in vals]
        canonical_form = vals[counts.index(max(counts))]
        fout.write('\t'.join([canonical_form, ','.join(vals)]) + '\n')
    

In [30]:
# Report which entity ids have no row in the embedding file.
embeddingFile = '../../data/'+data+'/intermediate/entity_word2vec.emb'

print(len(id2entities))

# The first space-separated token of each line is taken as the embedded id.
# NOTE(review): if the file begins with a "<count> <dim>" header line, its first
# token ends up in embedSet as well - confirm the file format.
with open(embeddingFile) as fin:
    embedSet = {line.strip().split(" ")[0] for line in fin}

print(set(id2entities).difference(embedSet))

17065
{'14102', '9502', '7180'}


1
