In [155]:
# Required packages:
import math
import numpy as np
import os
import random
import requests
import json
import bz2
import re
import nltk
from nltk.corpus import wordnet,stopwords
from nltk import word_tokenize, pos_tag
import gensim.downloader as api
from gensim.models import Word2Vec
from itertools import chain
from collections import Counter
from collections import defaultdict
from sklearn.ensemble import RandomForestRegressor
from elasticsearch import Elasticsearch
from elasticsearch import helpers

#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')


### Read query training dataset from Github

Download query from github and split into train set and test set

In [26]:
url = 'https://raw.githubusercontent.com/smart-task/smart-dataset/master/datasets/DBpedia/'
file = 'smarttask_dbpedia_train.json'
url = url + file
queries=requests.get(url).json()


random.seed(a=1234567)
TRAIN_SIZE = int(len(queries) * 0.8)

TRAIN_QUERY = queries[:TRAIN_SIZE]
TEST_QUERY = queries[TRAIN_SIZE:]

print(len(TRAIN_QUERY))



14056


In [5]:
#query stats
#query = Counter([q['category'] for q in TRAIN_QUERY])
#print(query)

#resource_query = [q['type'] for q in TRAIN_QUERY if q['category']=='resource']
#query_type = Counter(sum(resource_query,[]))

#print(resource_query)
#c0=0
#c1=0
 
#for q in resource_query:
#    if len(resource_query[q])>1: c1 += 1
#    else: c0 += 1



### Download file from DBPedia NLP dataset

Check whether dataset files (bz2) are available in data folder and download bz2 file if they are not

In [6]:
dburl= 'http://downloads.dbpedia.org/2016-10/core-i18n/en/'
dbfiles = ['long_abstracts_en.ttl.bz2','instance_types_transitive_en.ttl.bz2']
type_file = 'dbpedia_types.tsv'
dirname = 'data'

for dbfile in dbfiles:
    if not os.path.exists(dirname):
        os.mkdir(dirname) 
    
    # Download file
    if not os.path.exists(dirname + '/' + dbfile):
        url = dburl + dbfile
        r = requests.get(url)
        with open(dirname + '/'+ dbfile, 'wb') as f:
            f.write(r.content)
   
    if not os.path.exists(dirname + '/' + dbfile):

        r = requests.get(url)
        with open(dirname + '/'+ dbfile, 'wb') as f:
            f.write(r.content)

    #extract
    if not os.path.exists(dirname + '/' + dbfile[:-4]):
        filepath= dirname + '/'+ dbfile
        newfilepath = filepath[:-4] 
        with open(filepath, 'rb') as source, open(newfilepath, 'wb') as dest:
            dest.write(bz2.decompress(source.read()))
    else:
        continue
    
url = 'https://raw.githubusercontent.com/smart-task/smart-dataset/master/evaluation/dbpedia/dbpedia_types.tsv'
if not os.path.exists(dirname + '/' + type_file):
    r = requests.get(url)
    with open(dirname + '/'+ type_file, 'wb') as f:
        f.write(r.content)



### Index configuration 

For each of the fields, store the term vectors. These should be stored in the index. 

In [2]:
es = Elasticsearch()
INDEX_NAME_ENTITY = 'nlp_entity'

INDEX_SETTINGS_ENTITY = {
    'mappings': {
            'properties': {
                'description': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'english'
                },
                'types': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'english'
                },
            }
        }
    }

INDEX_NAME_TYPE = 'nlp_type'

INDEX_SETTINGS_TYPE = {
    'mappings': {
            'properties': {
                'description': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'english'
                },
                'question': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'english'
                },
            }
        }
    }


In [3]:
ABSTRACTS = '<http://dbpedia.org/ontology/abstract>'
COMMENTS = '<http://www.w3.org/2000/01/rdf-schema#comment>'
TYPES = '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>'
abstract_files = ['long_abstracts_en.ttl']
type_files = ['instance_types_transitive_en.ttl']


In [30]:
def resolve_abstract(text,delimiter):
    text = text.split(delimiter)
    text = text[1].replace('"', '')
    text = text.replace('@en .','')
    return text

def resolve_entity(text):
    name = text.split('>')[0].replace('_', ' ')
    name = name.split('/')[-1]
    return name

def resolve_types(text,delimiter):
    text = text.split(delimiter)
    types = text[1]    
    types = types[types.rfind('/')+1:types.rfind('>')]
    return ['dbo:'+types]

def populate_type(filename,entity_doc):
    #doc structure = {doc: {types: , abstract:}}
    with open(filename,'r',encoding ='utf-8') as bzfin:
        for i, line in enumerate(bzfin):
            #skip header
            
            if i == 1000000: break
            if i == 0: continue        
            #only consider dbpedia
            if 'dbpedia.org/ontology' in line.lower(): 
                line = line.rstrip()
                entity = resolve_entity(line)
                if entity is not None:
                    if TYPES in line:   
                        types=resolve_types(line,TYPES) 
                        #add value or create new id in dictionary
                        if entity in entity_doc:
                            entity_doc[entity]['types']+= types
                        else:
                            entity_doc[entity]={'types':types}
                        
            
            
    return entity_doc

def populate_abstract(filename, entity_doc):
    #doc structure = {doc: {types: , abstract:}}
    with open(filename, 'r', encoding ='utf-8') as bzfin:
        for i, line in enumerate(bzfin):
            
            if i == 1000000: break
            #skip header
            if i == 0: continue      
            #only consider dbpedia, in english
            if 'dbpedia.org/ontology' in line.lower() and 'en' in line.lower():            
                line = line.rstrip()
                entity = resolve_entity(line) 
                if entity is not None:
                    if ABSTRACTS in line:     
                        desc = resolve_abstract(line,ABSTRACTS)
                        if entity in entity_doc:
                            if entity_doc[entity]['types'] is not None:
                                entity_doc[entity]['description']= entity_doc[entity].get('description','') + desc
                            else:                             
                                continue
                        else:
                            continue
            
    return entity_doc

def prepare_doc(type_files,abstract_files):
    entity_doc={}
    type_doc = {}
    dirname ='data'
    for file in type_files:
        file = dirname + '/'+ file
        entity_doc = populate_type(file,entity_doc)
    print('type files are read')
    for file in abstract_files:
        file = dirname + '/'+ file
        entity_doc = populate_abstract( file,entity_doc)
    print('abstract files are read')
    print('done populating entity doc')
    #build type_doc from entity_doc
    for entity in entity_doc:
        types = entity_doc[entity]['types']
        if types is not None:
            for typ in types:
                if typ in type_doc:
                    type_doc[typ]['description']= type_doc[typ].get('description','') + entity_doc[entity].get('description','')
                else:
                    type_doc[typ] = {'description': entity_doc[entity].get('description','')}
    
    print('done populating type doc')
    
    #export to json
    jsn = json.dumps(entity_doc)
    f = open("entity_doc.json","w")
    f.write(jsn)
    f.close()
    
    jsn = json.dumps(type_doc)
    f = open("type_doc.json","w")
    f.write(jsn)
    f.close()
    
    
    return entity_doc, type_doc
        

def create_index_bulk(INDEX_NAME_ENTITY,INDEX_NAME_TYPE,type_files,abstract_files):   
    entity_docs, type_docs = prepare_doc(type_files,abstract_files)
    action_entity = [
    {
    '_id' : ids,
    "_source" : entity_docs[ids]
    }
    for ids in entity_docs
    ]
    bulk = helpers.bulk(es,action_entity, index = INDEX_NAME_ENTITY, chunk_size=1000, request_timeout=200)
        
    print('entity doc are indexed')
    action_type = [
    {
    '_id' : ids,
    "_source" : type_docs[ids]
    }
    for ids in type_docs
    ]
    bulk = helpers.bulk(es,action_type, index = INDEX_NAME_TYPE, chunk_size=1000, request_timeout=200)
    print('type doc are indexed')

def populate_question(queries, INDEX_NAME_TYPE):
    type_dict = defaultdict(str)
    for query in queries:
        if query['question'] == None:
            continue
        for t in query['type']:
            type_dict[t] += query['question']
    for t, q in type_dict.items():
        if es.exists(index=INDEX_NAME_TYPE, id=t):
            es.update(index=INDEX_NAME_TYPE, id=t, body={"doc":{"question": type_dict[t]}})
        else:
            es.index(index=INDEX_NAME_TYPE, id=t, body={"doc":{"question": type_dict[t]}})
    

## Populate index

Declare two new indexes, one for entity centric model and the other for type centric model

In [20]:
if es.indices.exists(INDEX_NAME_ENTITY):
    es.indices.delete(index=INDEX_NAME_ENTITY)
es.indices.create(index=INDEX_NAME_ENTITY, body=INDEX_SETTINGS_ENTITY)    

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'nlp_entity'}

In [21]:
if es.indices.exists(INDEX_NAME_TYPE):
    es.indices.delete(index=INDEX_NAME_TYPE)
es.indices.create(index=INDEX_NAME_TYPE, body=INDEX_SETTINGS_TYPE)    

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'nlp_type'}

In [24]:
create_index_bulk(INDEX_NAME_ENTITY,INDEX_NAME_TYPE,type_files,abstract_files)

type files are read
abstract files are read
done populating entity doc
done populating type doc
entity doc are indexed
type doc are indexed


In [31]:
populate_question(TRAIN_QUERY, INDEX_NAME_TYPE)

In [9]:
#check indexes
res = es.get(index='nlp_type', id='dbo:Place')['_source']['description']
print(len(res))
res = es.get(index='nlp_entity', id='Alabama')['_source']['types']
print(res)
count = es.cat.count('nlp_entity', params={"format": "json"})
print(count)
count = es.cat.count('nlp_type', params={"format": "json"})
print(count)

24848932
['dbo:Region', 'dbo:PopulatedPlace', 'dbo:Place', 'dbo:Location']
[{'epoch': '1604857588', 'timestamp': '17:46:28', 'count': '120104'}]
[{'epoch': '1604857588', 'timestamp': '17:46:28', 'count': '97'}]


## Predic types for queries


First score documents given the querie. Count the number of times a type occurs across the documents, dividing it by the sum of all type counts to get the type weight. For every type in a document, multiply the type weight with the document score and add to the types score.

In [10]:
def analyze_query(es, query):
    tokens = es.indices.analyze(index=INDEX_NAME_TYPE, body={'text': query})['tokens']
    query_terms = []
    for t in sorted(tokens, key=lambda x: x['position']):
        query_terms.append(t['token'])
    return query_terms

def entity_centric_scorer(index_name, query, field='description', k=100):
    es_query = {
        "query":{
            "query_string":{
                "query": query,
                "default_field": field
            }
        }
    }

    matches = es.search(index=index_name, body=es_query, _source=True, size=k)['hits']['hits']

    type_count = defaultdict(int)
    for match in matches:
        for doc_type in match['_source']['types']:
            type_count[doc_type] += 1

    type_weight = {}
    for t, c in type_count.items():
        type_weight[t] = 1/c

    type_score = defaultdict(int)
    for match in matches:
        doc_id = match['_id']
        doc_score = match['_score']
        for doc_type in match['_source']['types']:
            type_score[doc_type] += doc_score * type_weight[doc_type]

    scores = sorted([(t, s) for t, s in type_score.items()],key=lambda tup: tup[1], reverse=True)
    return scores


def type_centric_scorer(index_name, q_terms, field='description', k=100):
    type_scores = defaultdict(int)
    for term in q_terms:
        es_query = {
            "query":{
                "query_string":{
                    "query": term,
                    "default_field": field
                }
            }
        }

        matches = es.search(index=index_name, body=es_query, _source=False, size=k)['hits']['hits']
        for match in matches:
            type_scores[match['_id']] += match['_score']

    scores = sorted([(t, s) for t, s in type_scores.items()],key=lambda tup: tup[1], reverse=True)
    return scores



In [11]:
query_scores = {}
pattern = re.compile(r'[\W_]+')
for query in TRAIN_QUERY:
    if query['category'] != 'resource':
        continue
    if query['question'] == None:
        continue
    q = pattern.sub(' ', query['question'])
    ec_pred = entity_centric_scorer(INDEX_NAME_ENTITY, q)
    q_terms = analyze_query(es, q)
    tc_pred = type_centric_scorer(INDEX_NAME_TYPE, q_terms)
    query_scores[query['id']] = {'entity_centric': ec_pred, 'type_centric': tc_pred, 'actual': query['type']}

KeyboardInterrupt: 

In [14]:
for i, (key, val) in enumerate(query_scores.items()):
    print("ID: {}".format(key))
    print("Entity Centric: {}".format(val['entity_centric'][:10]))
    print("Type Centric: {}".format(val['type_centric'][:10]))
    print("Actual: {}\n".format(val['actual']))
    if i == 10:
        break

ID: dbpedia_14427
Entity Centric: [('dbo:Group', 15.700501499999998), ('dbo:Cartoon', 14.047559), ('dbo:Organisation', 13.0069865), ('dbo:RouteOfTransportation', 12.868805), ('dbo:Infrastructure', 12.868805), ('dbo:MusicalWork', 10.383891444444444), ('dbo:Agent', 10.352069909090911), ('dbo:Work', 10.320312022222225), ('dbo:WrittenWork', 10.244019), ('dbo:Software', 10.227817)]
Type Centric: [('dbo:MusicalWork', 6.932363639999999), ('dbo:Work', 6.85984862), ('dbo:WrittenWork', 6.844822239999999), ('dbo:Group', 6.74023693), ('dbo:Agent', 6.6516779), ('dbo:ArchitecturalStructure', 6.5886717), ('dbo:Wikidata:Q11424', 6.58418887), ('dbo:Person', 6.559901190000001), ('dbo:Artist', 6.54073076), ('dbo:Location', 6.474589539999999)]
Actual: ['dbo:Opera', 'dbo:MusicalWork', 'dbo:Work']

ID: dbpedia_3681
Entity Centric: [('dbo:Food', 13.540861), ('dbo:Artist', 12.90148), ('dbo:Company', 11.979504888888886), ('dbo:Agent', 11.927975352631583), ('dbo:Person', 11.912545999999999), ('dbo:Organisation'

In [15]:
def reciprocal_rank(predicted, ground_truth):
    for i, t in enumerate(predicted):
        if t in ground_truth:
            return 1/(i+1)
    return 0

def recall(predicted, ground_truth):
    if len(ground_truth) == 0: return 0
    correct = 0
    for t in ground_truth:
        if t in predicted:
            correct += 1
    return correct/len(ground_truth)


def avg_precision(predicted, ground_truth):
    precision = []
    correct = 0
    for i, t in enumerate(predicted):
        if t in ground_truth:
            correct += 1
            precision.append(correct/(i+1))

    if len(precision) == 0: return 0
    return sum(precision)/len(precision)

def mean(predictions, ground_truths, func):
    stats = []
    for i in range(len(predictions)):
        stats.append(func(predictions[i], ground_truths[i]))
    return sum(stats)/len(stats) if len(stats) > 0 else 0


In [16]:
ec_preds = []
tc_preds = []
ground_truths = []
for _, val in query_scores.items():
    ec_preds.append([x[0] for x in val['entity_centric']])
    tc_preds.append([x[0] for x in val['type_centric']])
    ground_truths.append(val['actual'])

m_recall_ec = mean(ec_preds, ground_truths ,recall)
mrr_ec = mean(ec_preds, ground_truths ,reciprocal_rank)
m_prec_ec = mean(ec_preds, ground_truths ,reciprocal_rank)

m_recall_tc = mean(tc_preds, ground_truths ,recall)
mrr_tc = mean(tc_preds, ground_truths ,reciprocal_rank)
m_prec_tc = mean(tc_preds, ground_truths ,reciprocal_rank)

print("Entity Centric")
print("Recall: {}".format(m_recall_ec))
print("MRR: {}".format(mrr_ec))
print("Mean Precision: {}\n".format(m_prec_ec))

print("Type Centric")
print("Recall: {}".format(m_recall_tc))
print("MRR: {}".format(mrr_tc))
print("Mean Precision: {}\n".format(m_prec_tc))


Entity Centric
Recall: 0.6713697248220691
MRR: 0.22355451968797388
Mean Precision: 0.22355451968797388

Type Centric
Recall: 0.722420660867124
MRR: 0.4080946288914935
Mean Precision: 0.4080946288914935



## Type label features

Functions used to populate type label features

In [178]:
def typelabel_terms (filename):
    '''function to obtain label of type t'''
    t_label ={}
    with open(filename, 'r') as tsv_file:
        next(tsv_file)  # Skip header row
        for line in tsv_file:
            fields = line.rstrip().split('\t')
            type_name= fields[0]
            if t_label.get(type_name,'') == '':
                t_label[type_name] = [t.lower() for t in re.findall('.[^A-Z]*',type_name)[1:]]

    return t_label

def idf_type_feature (t_terms,q_terms,t_features):
    idf=[]
    
    if t_terms is not None:
        num_doc = len(q_terms)
        t_features['length'] = sum(len(i) for i in t_terms)
        for t in t_terms:
            df = 0
            for qt in q_terms:
                if t in qt: df += 1   
            
            if df == 0: idf.append(0)
            else: idf.append(math.log(num_doc)/df)

        if len(idf)>0:
            t_features['sum_idf'] = sum(idf)
            t_features['avg_idf'] = sum(idf)/len(t_terms)
        else:
            t_features['sum_idf'] = 0
            t_features['avg_idf'] = 0
        
    else:
        t_features['sum_idf'] = 0
        t_features['avg_idf'] = 0
        t_features['length'] = 0
        
    return t_features

def jterms (t_terms,q_terms,t_features):
    if t_terms is not None:
        t_terms = set(t_terms)
        q_terms = set([item for sublist in q_terms for item in sublist])
        t_features['jsim'] = float(len(t_terms.intersection(q_terms))) / (len(t_terms) + len(q_terms) - len(t_terms.intersection(q_terms)))
    else:
        t_features['jsim']= 0

    return t_features

def similarity(t_terms, q_term, model, t_features):
    similarity = []
    
    #q = list(chain.from_iterable(term))
    for q in q_term:
        if q is not None:
            for t in t_terms:
                try:
                    similarity.append(model.wv.similarity(t,q))
                except:
                    similarity.append(0)
 
    t_features['avg_sim'] = sum(similarity)/len(similarity)
        
    return t_features

def gettypelabelterm(dbotype):
    type_file = 'dbpedia_types.tsv'
    filename = 'data/' + type_file
    label_terms = typelabel_terms (filename)
    t_terms =label_terms[dbotype]
    return t_terms

def getqueryterm(queries):
    q_terms = []
    stop_words = set(stopwords.words('english'))
    for query in queries:
        if query['question']:
            question = query['question'].lower()
            question = re.sub(r'[-()\"#/@;:<>{}`+=~|.!?,]', '', question).split(' ')
            question = [q for q in question if not q in stop_words]
            question =[q for q in question if len(q)>1]           
            q_terms.append(question)
    return q_terms

def typelabel_features(t_term,q_terms):
    '''input: dbotype and list of queries'''
    typefeature = {'sum_idf','avg_idf','length','jsim','avg_sim'}
    t_features={}
    feature_vect = []
    
    model = api.load('word2vec-google-news-300')
    #('city', 'bird')
    
    for q_term in q_terms:
        t_features = idf_type_feature (t_term, q_terms, t_features)
        t_features = jterms(t_term,q_term, t_features)
        t_features = similarity(t_term, q_term, model, t_features)
        #for feature in typefeature:
        feature_vect.append(list(t_features.values()))

    return feature_vect

In [182]:
typelabel = gettypelabelterm('dbo:BasketballLeague')
queries = getqueryterm(TRAIN_QUERY)
typelabel_features(typelabel,queries)



[[16, 0.4951013305656918, 0.2475506652828459, 0.0, 0.08842033124528825],
 [16, 0.4951013305656918, 0.2475506652828459, 0.0, 0.08356784950010479],
 [16, 0.4951013305656918, 0.2475506652828459, 0.0, 0.06844631603194608],
 [16, 0.4951013305656918, 0.2475506652828459, 0.0, 0.046855914406478405],
 [16, 0.4951013305656918, 0.2475506652828459, 0.0, 0.03124373822938651],
 [16, 0.4951013305656918, 0.2475506652828459, 0.0, 0.023289650771766902],
 [16, 0.4951013305656918, 0.2475506652828459, 0.0, 0.06003423519432545],
 [16, 0.4951013305656918, 0.2475506652828459, 0.0, 0.07086049634963273],
 [16, 0.4951013305656918, 0.2475506652828459, 0.0, 0.05412056380882859],
 [16, 0.4951013305656918, 0.2475506652828459, 0.0, 0.11704674921929836],
 [16, 0.4951013305656918, 0.2475506652828459, 0.0, 0.19486985448747873],
 [16, 0.4951013305656918, 0.2475506652828459, 0.0, 0.10588627471588552],
 [16, 0.4951013305656918, 0.2475506652828459, 0.0, 0.08061446986782055],
 [16, 0.4951013305656918, 0.2475506652828459, 0.0