In [1]:
# Required packages:
import elasticsearch
import math
import numpy as np
import os
import pytest
import random
import requests
import tarfile
import json
import bz2file
import bz2
import re

from collections import Counter
from collections import defaultdict
from sklearn.ensemble import RandomForestRegressor
from bz2 import BZ2File as bzopen
from elasticsearch import Elasticsearch


### Reading Query from Github

Download query from github and split into train set and test set

In [2]:
url = 'https://raw.githubusercontent.com/smart-task/smart-dataset/master/datasets/DBpedia/'
file = 'smarttask_dbpedia_train.json'
url = url + file
queries=requests.get(url).json()


random.seed(a=1234567)
TRAIN_SIZE = int(len(queries) * 0.8)

TRAIN_QUERY = queries[:TRAIN_SIZE]
TEST_QUERY = queries[TRAIN_SIZE:]

print(len(TRAIN_QUERY))


14056


### Download file from DBPedia NLP dataset

Check whether dataset files (bz2) are available in data folder and download bz2 file if they are not

In [3]:
dburl= 'http://downloads.dbpedia.org/2016-10/core-i18n/en/'
#dbfiles = ['long_abstracts_en.ttl.bz2','short_abstracts_en.ttl.bz2','instance_types_sdtyped_dbo_en.ttl.bz2','instance_types_en.ttl.bz2']
#cwd = os.getcwd()  
dbfiles = ['long_abstracts_en.ttl.bz2','instance_types_transitive_en.ttl.bz2']
dirname = 'data'

for dbfile in dbfiles:
    if not os.path.exists(dirname):
        os.mkdir(dirname) 
    
    # Download file
    if not os.path.exists(dirname + '/' + dbfile):
        url = dburl + dbfile
        r = requests.get(url)
        with open(dirname + '/'+ dbfile, 'wb') as f:
            f.write(r.content)
    
    #extract
    if not os.path.exists(dirname + '/' + dbfile[:-4]):
        filepath= dirname + '/'+ dbfile
        newfilepath = filepath[:-4] 
        with open(filepath, 'rb') as source, open(newfilepath, 'wb') as dest:
            dest.write(bz2.decompress(source.read()))
    else:
        continue



### Index configuration 

For each of the fields, store the term vectors. These should be stored in the index. 

In [4]:
es = Elasticsearch()
INDEX_NAME_ENTITY = 'nlp_entity'

INDEX_SETTINGS_ENTITY = {
    'mappings': {
            'properties': {
                'description': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'english'
                },
                'types': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'english'
                },
            }
        }
    }

INDEX_NAME_TYPE = 'nlp_type'

INDEX_SETTINGS_TYPE = {
    'mappings': {
            'properties': {
                'description': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'english'
                },
            }
        }
    }


In [5]:
ABSTRACTS = '<http://dbpedia.org/ontology/abstract>'
COMMENTS = '<http://www.w3.org/2000/01/rdf-schema#comment>'
TYPES = '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>'
#abstract_files = ['long_abstracts_en.ttl.bz2','short_abstracts_en.ttl.bz2']
#type_files = ['instance_types_sdtyped_dbo_en.ttl.bz2','instance_types_en.ttl.bz2']
abstract_files = ['long_abstracts_en.ttl']
type_files = ['instance_types_transitive_en.ttl']


In [6]:
def resolve_abstract(text,delimiter):
    text = text.split(delimiter)
    text = text[1].replace('"', '')
    text = text.replace('@en .','')
    return text

def resolve_entity(text):
    name = text.split('>')[0].replace('_', ' ')
    name = name.split('/')[-1]
    return name

def resolve_types(text,delimiter):
    text = text.split(delimiter)
    types = text[1]    
    types = types[types.rfind('/')+1:types.rfind('>')]
    if 'wikidata' in types.lower(): return None
    return 'dbo:'+types

def populate_type(idx, filename):
    doc = {}
    with open(filename,'r',encoding ='utf-8') as bzfin:
        for i, line in enumerate(bzfin):
            #skip header
            if i == 0: continue        
            #only consider dbpedia
            if 'dbpedia.org' in line.lower(): 
                line = line.rstrip()
                entity = resolve_entity(line)
                if entity is not None:
                    if TYPES in line:                                
                        types=resolve_types(line,TYPES) 
                        #update index if it already exist if not then create new one 
                        try:
                            if es.exists(index=idx, id=entity):
                                result = es.update(index=idx, id=entity, body={
                                    'script':{'source':'ctx._source.types.add(params.new_type)', 'params':
                                                  {'new_type' : types}}})['result']
                            else:
                                es.index(index=idx, id=entity, body={'types':[types],'description':''})
                        except:
                            pass

            if i == 100: break

def populate_abstract(idx, filename):
    doc = {}
    with open(filename, 'r', encoding ='utf-8') as bzfin:
        for i, line in enumerate(bzfin):
            #skip header
            if i == 0: continue      
            #only consider dbpedia
            if 'dbpedia.org' in line.lower():            
                line = line.rstrip()
                entity = resolve_entity(line) 
                if entity is not None:
                    if ABSTRACTS in line:     
                        desc = resolve_abstract(line,ABSTRACTS)
                    elif COMMENTS in line:
                        desc = resolve_abstract(line,COMMENTS)
                    else:
                        continue
                    
                    #only update index if document type already exist 
                    try:
                        if es.exists(index=idx, id=entity):                 
                            result = es.update(index=idx, id=entity, body={
                                    'script':{'source':'ctx._source.description += params.new_desc', 
                                              'params': {'new_desc' :desc + ' '}}})['result']
                            if result != 'updated': print('failed to update description for :', entity)
                    except:
                        pass

            if i == 1000: break

def populate_abstract_bytype(idx1,idx2, filename):
    doc = {}
    with open(filename, 'r', encoding ='utf-8') as bzfin:
        for i, line in enumerate(bzfin):
            if i > 1000: break
            if i % 1000 == 0: print(i)
            #skip header
            if i == 0: continue                                   
            #only consider dbpedia
            if 'dbpedia.org' in line.lower():            
                line = line.rstrip()
                entity = resolve_entity(line) 
                if entity is not None:
                    #read types from existing entity index
                    try:
                        types = es.get(index=idx1, id=entity)['_source']['types']
                        if types is not None:
                            desc = resolve_abstract(line,ABSTRACTS)
                            for typ in types:
                               
                                #update index or create index for each types found
                                if es.exists(index=idx2, id=typ): 
                                    result = es.update(index=idx2, id=typ, body={
                                        'script':{'source':'ctx._source.description += params.new_desc', 
                                            'params': {'new_desc' :desc + ' '}}})['result']
                                    if result != 'updated': print('failed to update description for :', entity)
                                else:
                                    es.index(index=idx2, id=typ, body={'description':desc})
                    except:
                        pass

            

In [8]:
#Create Entity based index
#if es.indices.exists(INDEX_NAME_ENTITY):
#    es.indices.delete(index=INDEX_NAME_ENTITY)
#es.indices.create(index=INDEX_NAME_ENTITY, body=INDEX_SETTINGS_ENTITY)    

#Create Type based index
if es.indices.exists(INDEX_NAME_TYPE):
    es.indices.delete(index=INDEX_NAME_TYPE)
es.indices.create(index=INDEX_NAME_TYPE, body=INDEX_SETTINGS_TYPE)    

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'nlp_type'}

## Populate index

There are 2 steps in populating index based on fields. This is due to fact that there are 3 different dataset files for 2 different fields, types and description.

In [None]:
#1. Create entity based index , with types populated
dirname = 'data'
for file in type_files:
    file = dirname + '/'+ file
    populate_type(INDEX_NAME_ENTITY,file)

In [11]:
#2. Create entity based index, with abstract populated
dirname = 'data'
for file in abstract_files:
    file = dirname + '/'+ file
    populate_abstract(INDEX_NAME_ENTITY, file)

In [9]:
#3. Create type based index, with abstract populated
dirname = 'data'
for file in abstract_files:
    file = dirname + '/'+ file
    populate_abstract_bytype(INDEX_NAME_ENTITY,INDEX_NAME_TYPE, file)


0
1000


In [7]:
query = 'Who is the governor of alabama'
#hits = es.search(index=INDEX_NAME_ENTITY, body={'query': {'match': {'description': query}}},
#                               _source=True, size=1)
#print(hits)
#tv_1 = es.termvectors(index=INDEX_NAME_ENTITY, id='Crowe Clark Whitehill', fields='types')
#print(tv_1)
#res = es.get(index=INDEX_NAME_ENTITY, id=query)['_source']['types']
#print(res)
#res = es.get(index=INDEX_NAME_TYPE, id='dbo:Person')['_source']
#print(res)
hits = es.search(index=INDEX_NAME_TYPE, body={'query': {'match': {'description': query}}}, _source=False, size=10)
print(hits['hits']['hits'])

[{'_index': 'nlp_type', '_type': '_doc', '_id': 'dbo:AdministrativeArea', '_score': 4.1482306}, {'_index': 'nlp_type', '_type': '_doc', '_id': 'dbo:Region', '_score': 4.1482306}, {'_index': 'nlp_type', '_type': '_doc', '_id': 'dbo:PopulatedPlace', '_score': 4.134841}, {'_index': 'nlp_type', '_type': '_doc', '_id': 'dbo:Place', '_score': 4.129408}, {'_index': 'nlp_type', '_type': '_doc', '_id': 'dbo:Location', '_score': 4.022934}, {'_index': 'nlp_type', '_type': '_doc', '_id': 'dbo:owl#Thing', '_score': 3.8660688}, {'_index': 'nlp_type', '_type': '_doc', '_id': 'dbo:BodyOfWater', '_score': 2.5585153}, {'_index': 'nlp_type', '_type': '_doc', '_id': 'dbo:RiverBodyOfWater', '_score': 2.3980913}, {'_index': 'nlp_type', '_type': '_doc', '_id': 'dbo:Stream', '_score': 2.3980913}, {'_index': 'nlp_type', '_type': '_doc', '_id': 'dbo:NaturalPlace', '_score': 2.370525}]


## Predic types for queries


First score documents given the querie. Count the number of times a type occurs across the documents, dividing it by the sum of all type counts to get the type weight. For every type in a document, multiply the type weight with the document score and add to the types score.

In [8]:
def analyze_query(es, query):
    tokens = es.indices.analyze(index=INDEX_NAME_TYPE, body={'text': query})['tokens']
    query_terms = []
    for t in sorted(tokens, key=lambda x: x['position']):
        query_terms.append(t['token'])
    return query_terms

def entity_centric_scorer(index_name, query, field='description', k=100):
    es_query = {
        "query":{
            "query_string":{
                "query": query,
                "default_field": field
            }
        }
    }

    matches = es.search(index=index_name, body=es_query, _source=True, size=k)['hits']['hits']

    type_count = defaultdict(int)
    for match in matches:
        for doc_type in match['_source']['types']:
            type_count[doc_type] += 1

    type_weight = {}
    for t, c in type_count.items():
        type_weight[t] = 1/c

    type_score = defaultdict(int)
    for match in matches:
        doc_id = match['_id']
        doc_score = match['_score']
        for doc_type in match['_source']['types']:
            type_score[doc_type] += doc_score * type_weight[doc_type]

    scores = sorted([(t, s) for t, s in type_score.items()],key=lambda tup: tup[1], reverse=True)
    return scores


def type_centric_scorer(index_name, q_terms, field='description', k=100):
    type_scores = defaultdict(int)
    for term in q_terms:
        es_query = {
            "query":{
                "query_string":{
                    "query": term,
                    "default_field": field
                }
            }
        }

        matches = es.search(index=index_name, body=es_query, _source=False, size=k)['hits']['hits']
        for match in matches:
            type_scores[match['_id']] += match['_score']

    scores = sorted([(t, s) for t, s in type_scores.items()],key=lambda tup: tup[1], reverse=True)
    return scores



In [9]:
query_scores = {}
pattern = re.compile(r'[\W_]+')
for query in TRAIN_QUERY:
    if query['category'] != 'resource':
        continue
    if query['question'] == None:
        continue
    q = pattern.sub(' ', query['question'])
    ec_pred = entity_centric_scorer(INDEX_NAME_ENTITY, q)
    q_terms = analyze_query(es, q)
    tc_pred = type_centric_scorer(INDEX_NAME_TYPE, q_terms)
    query_scores[query['id']] = {'entity_centric': ec_pred, 'type_centric': tc_pred, 'actual': query['type']}

In [10]:
for i, (key, val) in enumerate(query_scores.items()):
    print("ID: {}".format(key))
    print("Entity Centric: {}".format(val['entity_centric'][:10]))
    print("Type Centric: {}".format(val['type_centric'][:10]))
    print("Actual: {}\n".format(val['actual']))
    if i == 10:
        break

ID: dbpedia_14427
Entity Centric: [('dbo:MusicalWork', 9.542570920000001), ('dbo:City', 8.740549), ('dbo:Language', 8.3763765), ('dbo:Work', 7.916609217647058), ('dbo:CreativeWork', 7.916609217647058), ('dbo:MusicAlbum', 7.8533206), ('dbo:ArchitecturalStructure', 7.66714895), ('dbo:Group', 7.656701066666666), ('dbo:AnatomicalStructure', 7.499036), ('dbo:Software', 7.494758)]
Type Centric: [('dbo:Person', 2.08592492), ('dbo:DUL.owl#NaturalPerson', 2.0530576700000003), ('dbo:owl#Thing', 1.9950488100000001), ('dbo:Agent', 1.9898861600000002), ('dbo:DUL.owl#Agent', 1.9898861600000002), ('dbo:Place', 1.66913342), ('dbo:CelestialBody', 1.65529437), ('dbo:Location', 1.61243721), ('dbo:Book', 1.55039071), ('dbo:Satellite', 1.5280187399999998)]
Actual: ['dbo:Opera', 'dbo:MusicalWork', 'dbo:Work']

ID: dbpedia_3681
Entity Centric: [('dbo:Organisation', 9.667582681034476), ('dbo:Organization', 9.52073445081967), ('dbo:DUL.owl#SocialPerson', 9.52073445081967), ('dbo:Agent', 9.059211415555557), ('d

In [11]:
def reciprocal_rank(predicted, ground_truth):
    for i, t in enumerate(predicted):
        if t in ground_truth:
            return 1/(i+1)
    return 0

def recall(predicted, ground_truth):
    if len(ground_truth) == 0: return 0
    correct = 0
    for t in ground_truth:
        if t in predicted:
            correct += 1
    return correct/len(ground_truth)


def avg_precision(predicted, ground_truth):
    precision = []
    correct = 0
    for i, t in enumerate(predicted):
        if t in ground_truth:
            correct += 1
            precision.append(correct/(i+1))

    if len(precision) == 0: return 0
    return sum(precision)/len(precision)

def mean(predictions, ground_truths, func):
    stats = []
    for i in range(len(predictions)):
        stats.append(func(predictions[i], ground_truths[i]))
    return sum(stats)/len(stats) if len(stats) > 0 else 0


In [16]:
ec_preds = []
tc_preds = []
ground_truths = []
for _, val in query_scores.items():
    ec_preds.append([x[0] for x in val['entity_centric']])
    tc_preds.append([x[0] for x in val['type_centric']])
    ground_truths.append(val['actual'])

m_recall_ec = mean(ec_preds, ground_truths ,recall)
mrr_ec = mean(ec_preds, ground_truths ,reciprocal_rank)
m_prec_ec = mean(ec_preds, ground_truths ,reciprocal_rank)

m_recall_tc = mean(tc_preds, ground_truths ,recall)
mrr_tc = mean(tc_preds, ground_truths ,reciprocal_rank)
m_prec_tc = mean(tc_preds, ground_truths ,reciprocal_rank)

print("Entity Centric")
print("Recall: {}".format(m_recall_ec))
print("MRR: {}".format(mrr_ec))
print("Mean Precision: {}\n".format(m_prec_ec))

print("Type Centric")
print("Recall: {}".format(m_recall_tc))
print("MRR: {}".format(mrr_tc))
print("Mean Precision: {}\n".format(m_prec_tc))


Entity Centric
Recall: 0.7133303131635906
MRR: 0.18212586849916343
Mean Precision: 0.18212586849916343

Type Centric
Recall: 0.703204389728525
MRR: 0.4134076986356575
Mean Precision: 0.4134076986356575 

