In [1]:
# Required packages:
import elasticsearch
import math
import numpy as np
import os
import pytest
import random
import requests
import tarfile
import json
import bz2file
import bz2

from collections import Counter
from collections import defaultdict
from sklearn.ensemble import RandomForestRegressor
from bz2 import BZ2File as bzopen
from elasticsearch import Elasticsearch


### Reading Queries from GitHub

Download the queries from GitHub and split them into a training set and a test set.

In [43]:
# Location of the SMART-task DBpedia training queries on GitHub.
url = 'https://raw.githubusercontent.com/smart-task/smart-dataset/master/datasets/DBpedia/'
file = 'smarttask_dbpedia_train.json'
url = url + file

# Fail loudly on an HTTP error instead of trying to parse an error page as
# JSON, and do not wait forever on a stalled connection.
response = requests.get(url, timeout=60)
response.raise_for_status()
queries = response.json()

# Seed Python's RNG for any randomised step that follows.  Note that the
# split below is purely positional (first 80% / remaining 20%) and does not
# itself draw from the RNG.
random.seed(a=1234567)
TRAIN_SIZE = int(len(queries) * 0.8)

TRAIN_QUERY = queries[:TRAIN_SIZE]
TEST_QUERY = queries[TRAIN_SIZE:]

print(len(TRAIN_QUERY))


14056


### Download files from the DBpedia NLP dataset

Check whether the dataset files (bz2) are already present in the `data` folder, and download any that are missing.

In [44]:
# Base URL of the DBpedia 2016-10 English core dumps and the files needed
# from it: the long abstracts and the transitive instance types.
dburl = 'http://downloads.dbpedia.org/2016-10/core-i18n/en/'
dbfiles = ['long_abstracts_en.ttl.bz2', 'instance_types_transitive_en.ttl.bz2']
dirname = 'data'

# Create the target folder once, up front (no error if it already exists).
os.makedirs(dirname, exist_ok=True)

for dbfile in dbfiles:
    target = dirname + '/' + dbfile
    if os.path.exists(target):
        # Already downloaded on a previous run.
        continue

    url = dburl + dbfile
    # Download into a temporary name and rename only on success, so that an
    # interrupted or failed download is never mistaken for a complete file
    # by the existence check above.
    partial = target + '.part'
    with requests.get(url, stream=True, timeout=60) as r:
        # Do not save an HTTP error page as if it were the bz2 archive.
        r.raise_for_status()
        with open(partial, 'wb') as f:
            # Stream in 1 MiB chunks; the dumps are too large to hold
            # comfortably in memory via r.content.
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    os.replace(partial, target)



### Index configuration 

Each field (`description` and `types`) is indexed as English-analysed text, and its term vectors are stored in the index.

In [12]:
# Name of the index that holds one document per DBpedia entity.
INDEX_NAME = 'nlp_entity'

# Elasticsearch client built with no arguments, i.e. whatever connection
# settings the client library uses by default.
es = Elasticsearch()

# Every indexed field shares the same definition: full text, analysed with
# the built-in English analyzer, with term vectors stored in the index.
_TEXT_FIELD_WITH_TERM_VECTORS = {
    'type': 'text',
    'term_vector': 'yes',
    'analyzer': 'english',
}

# Only `description` and `types` are mapped; the `names` and `catch_all`
# fields that were considered earlier are currently disabled.
INDEX_SETTINGS = {
    'mappings': {
        'properties': {
            field_name: dict(_TEXT_FIELD_WITH_TERM_VECTORS)
            for field_name in ('description', 'types')
        }
    }
}


In [100]:
# RDF predicates (in N-Triples angle-bracket form) used to recognise the
# kind of statement a dump line carries.
ABSTRACTS = '<http://dbpedia.org/ontology/abstract>'               # long abstract
COMMENTS = '<http://www.w3.org/2000/01/rdf-schema#comment>'        # rdfs:comment
TYPES = '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>'        # rdf:type

# Dump files feeding each index field.  Earlier variants also listed
# 'short_abstracts_en.ttl.bz2' for the abstracts, and
# 'instance_types_sdtyped_dbo_en.ttl.bz2' / 'instance_types_en.ttl.bz2'
# for the types.
abstract_files = ['long_abstracts_en.ttl.bz2']
type_files = ['instance_types_transitive_en.ttl.bz2']


In [195]:
def resolve_abstract(text,delimiter):
    """Extract the literal text that follows `delimiter` in a dump line.

    The part of `text` after the first occurrence of `delimiter` is taken,
    every double quote in it is replaced by a space, and the ' @en .'
    suffix that results from the language tag is removed.

    Raises IndexError if `delimiter` does not occur in `text`.
    """
    literal = text.split(delimiter)[1]
    return literal.replace('"', ' ').replace(' @en .', '')

def resolve_entity(text, resource_prefix='<http://dbpedia.org/resource/'):
    """Return the human-readable entity name from the subject of a dump line.

    The subject is everything before the first '>' in `text`.  When it
    starts with `resource_prefix`, the whole remainder is the entity name,
    so names that themselves contain a slash (e.g. 'AC/DC') are kept intact
    rather than being cut down to their last path segment.  For any other
    subject the last '/'-separated segment is used.  Underscores are turned
    into spaces in both cases.

    Args:
        text: one decoded line of a dump file.
        resource_prefix: leading part of a subject (including the opening
            '<') that marks a resource URI.

    Returns:
        The entity name as a string; this function never returns None.
    """
    subject = text.split('>')[0]
    if subject.startswith(resource_prefix):
        name = subject[len(resource_prefix):]
    else:
        # Fallback for lines whose subject is not a resource URI.
        name = subject.split('/')[-1]
    return name.replace('_', ' ')

def resolve_types(text,delimiter):
    """Return the type named after `delimiter` in a dump line, as 'dbo: <name>'.

    The name is the slice of the post-delimiter part that lies between its
    last '/' and its last '>'.  The 'dbo: ' prefix is prepended
    unconditionally, whatever namespace the type URI belongs to.

    Raises IndexError if `delimiter` does not occur in `text`.
    """
    type_uri = text.split(delimiter)[1]
    start = type_uri.rfind('/') + 1
    end = type_uri.rfind('>')
    return 'dbo: ' + type_uri[start:end]

def populate_type(idx, filename, max_lines=100):
    """Index the entity types found in a bz2-compressed dump file.

    The file is read from the 'data' folder line by line.  The first line is
    skipped as a header.  For every line that contains the TYPES predicate,
    the entity document is created (with empty `types` and `description`) if
    it does not exist yet, and the resolved type is appended to its `types`.

    Args:
        idx: name of the Elasticsearch index to write to.
        filename: name of the dump file inside the 'data' folder.
        max_lines: number of lines after the header to process; None means
            the whole file.  The default keeps the former fixed limit.
    """
    myfile = 'data' + '/' + filename
    add_type_script = 'ctx._source.types.add(params.new_type)'

    with bzopen(myfile, "r") as bzfin:
        for i, line in enumerate(bzfin):
            if i == 0:
                continue  # skip header
            if max_lines is not None and i > max_lines:
                break

            line = line.rstrip().decode('utf-8')
            if TYPES not in line:
                continue

            entity = resolve_entity(line)
            # NOTE(review): the script parameter is a one-element list, so
            # each stored entry in `types` is itself a list (as the sample
            # search output shows).  Kept as-is because later cells may rely
            # on that shape -- confirm before flattening.
            types = [resolve_types(line, TYPES)]

            # Create an empty document first so the script has a list to
            # append to.
            if not es.exists(index=idx, id=entity):
                es.index(index=idx, id=entity, body={'types': [], 'description': ''})

            result = es.update(index=idx, id=entity, body={
                'script': {'source': add_type_script,
                           'params': {'new_type': types}}})['result']
            # Report a failed update for new and pre-existing documents alike.
            if result != 'updated':
                print('failed to update type for :', entity)

def populate_abstract(idx, filename, max_lines=100):
    """Append abstracts from a bz2-compressed dump file to indexed entities.

    The file is read from the 'data' folder line by line.  The first line is
    skipped as a header.  For every line that carries the ABSTRACTS or
    COMMENTS predicate, the resolved text (plus a trailing space) is
    appended to the `description` of the matching entity document.  Entities
    that are not in the index yet are left out; no new document is created.

    Args:
        idx: name of the Elasticsearch index to write to.
        filename: name of the dump file inside the 'data' folder.
        max_lines: number of lines after the header to process; None means
            the whole file.  The default keeps the former fixed limit.
    """
    myfile = 'data' + '/' + filename
    append_desc_script = 'ctx._source.description += params.new_desc'

    with bzopen(myfile, "r") as bzfin:
        for i, line in enumerate(bzfin):
            if i == 0:
                continue  # skip header
            # Checked before any other `continue`, so that a line without a
            # matching predicate cannot step over the limit.
            if max_lines is not None and i > max_lines:
                break

            line = line.rstrip().decode('utf-8')
            if ABSTRACTS in line:
                desc = resolve_abstract(line, ABSTRACTS)
            elif COMMENTS in line:
                desc = resolve_abstract(line, COMMENTS)
            else:
                continue

            entity = resolve_entity(line)
            # Only entities that already have a document are updated.
            if not es.exists(index=idx, id=entity):
                continue

            result = es.update(index=idx, id=entity, body={
                'script': {'source': append_desc_script,
                           'params': {'new_desc': desc + ' '}}})['result']
            if result != 'updated':
                print('failed to update description for :', entity)
            

In [196]:
# Start from a clean index on every run: drop any existing one, then
# recreate it with the mapping in INDEX_SETTINGS.  The index name is passed
# by keyword throughout, matching the delete/create calls; newer client
# versions no longer accept it positionally.
if es.indices.exists(index=INDEX_NAME):
    es.indices.delete(index=INDEX_NAME)
es.indices.create(index=INDEX_NAME, body=INDEX_SETTINGS)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'nlp_entity'}

## Populate index

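The index is populated in two steps, one per field, because the two fields come from different dataset files: `types` from the instance-types dump and `description` from the abstracts dump. The types are loaded first, since descriptions are only added to entities that already exist in the index.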

In [197]:
# Step 1: load the types from every instance-types dump.  This is the step
# that creates the entity documents.
for file in type_files:
    populate_type(INDEX_NAME,file)



In [198]:
# Step 2: append the abstracts from every abstracts dump to the entity
# documents that exist in the index.
for file in abstract_files:
    populate_abstract(INDEX_NAME, file)

In [199]:
# Sanity check: full-text match on `description`, then the stored term
# vectors of the `types` field for one known entity.
query = 'Alabama'
match_description = {'query': {'match': {'description': query}}}
hits = es.search(index=INDEX_NAME, body=match_description, _source=True, size=10)
print(hits)

tv_1 = es.termvectors(index=INDEX_NAME, id='Alabama', fields='types')
print(tv_1)

{'took': 1, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1, 'relation': 'eq'}, 'max_score': 4.172229, 'hits': [{'_index': 'nlp_entity', '_type': '_doc', '_id': 'Alabama', '_score': 4.172229, '_source': {'types': [['dbo: AdministrativeArea'], ['dbo: Q3455524'], ['dbo: Region'], ['dbo: PopulatedPlace'], ['dbo: Place'], ['dbo: Place'], ['dbo: Location'], ['dbo: owl#Thing']], 'description': "  Alabama (/ˌæləˈbæmə/) is a state in the southeastern region of the United States. It is bordered by Tennessee to the north, Georgia to the east, Florida and the Gulf of Mexico to the south, and Mississippi to the west. Alabama is the 30th-most extensive and the 24th-most populous of the 50 United States. At 1,300 miles (2,100 km), Alabama has one of the nation's longest navigable inland waterways. From the American Civil War until World War II, Alabama, like many states in the South, suffered economic hardship, in part because of