In [1]:
# Read all QALD-7 queries (multilingual training set)
# For each question
    # Extract all URIs
    # Find sameAs links to DE, FR, ES dbpedia
    # If links exist in the same language for all URIs, then:
        # generate sparql for all languages
        # write the sparql like: "query_<language-abbr>": { "sparql": <sparql> }
# Store the new QALD File

In [1]:
# Load the QALD-7 multilingual training questions

# import urllib library
from urllib.request import urlopen
  
# import json
import json
# Raw JSON of the QALD-7 multilingual training set hosted on GitHub
url = "https://raw.githubusercontent.com/ag-sc/QALD/master/7/data/qald-7-train-multilingual.json"

# Download and parse the dataset. The context manager closes the HTTP
# connection even when reading or JSON parsing raises.
with urlopen(url) as response:
    data_json = json.loads(response.read())

# Sanity check: print the id of the dataset that was loaded
print(data_json['dataset']['id'])

qald-7-train-multilingual


In [2]:
import time
from SPARQLWrapper import SPARQLWrapper, JSON
from string import Template

# Shared client for the main DBpedia endpoint; the lookup helpers in this
# notebook set their query on it and reuse it for every request.
sparql = SPARQLWrapper(endpoint="http://dbpedia.org/sparql")
# Query templates ($uri and $lang are filled in with string.Template) that look
# up the owl:sameAs links of a resource pointing into one language edition of
# DBpedia.
#
# STRSTARTS is used instead of an unanchored regex(): in a regex the dots of
# "http://$lang.dbpedia.org/" match any character and the pattern may match in
# the middle of an IRI, whereas the link must literally begin with that prefix.
#
# NOTE(review): the owl: prefix is not declared in the query; presumably the
# endpoint predefines it - confirm if the endpoint is ever changed.
#
# sparql_template  -> every matching link
# sparql_template2 -> at most one link (same query with LIMIT 1)
sparql_template = 'SELECT ?l WHERE { <$uri> owl:sameAs ?l . FILTER(STRSTARTS(STR(?l), "http://$lang.dbpedia.org/")) }'
sparql_template2 = sparql_template + ' LIMIT 1'
def fetch_links(uri, lang):
    """Return the set of all links found for ``uri`` in the ``lang`` edition.

    Fills ``sparql_template`` with ``uri`` and ``lang``, runs it on the shared
    ``sparql`` client with JSON results, and collects the ``l`` binding of
    every result row. An empty set means no row was returned.

    Sleeps 0.1 s after each request (presumably to throttle calls to the
    endpoint).
    """
    filled_query = Template(sparql_template).substitute(uri=uri, lang=lang)
    sparql.setQuery(filled_query)
    sparql.setReturnFormat(JSON)
    rows = sparql.query().convert()["results"]["bindings"]

    found = {row['l']['value'] for row in rows}

    time.sleep(0.1)
    return found

def fetch_single_link(uri, lang):
    """Return one link found for ``uri`` in the ``lang`` edition, or ``None``.

    Fills ``sparql_template2`` with ``uri`` and ``lang`` and runs it on the
    shared ``sparql`` client with JSON results. When several rows come back
    the value of the last one is returned; when none come back the result is
    ``None``.

    Sleeps 0.1 s after each request (presumably to throttle calls to the
    endpoint).
    """
    filled_query = Template(sparql_template2).substitute(uri=uri, lang=lang)
    sparql.setQuery(filled_query)
    sparql.setReturnFormat(JSON)
    rows = sparql.query().convert()["results"]["bindings"]

    # Read the binding of every row; the last one wins, None when there is none.
    values = [row['l']['value'] for row in rows]
    chosen = values[-1] if values else None

    time.sleep(0.1)
    return chosen

def check_links(uri_list, lang):
    """Look up the ``lang`` links of every URI and report whether any is missing.

    Returns a tuple ``(missing_links, link_map)`` where ``link_map`` maps each
    URI to the set returned by ``fetch_links`` and ``missing_links`` is
    ``True`` as soon as one of those sets is empty.
    """
    link_map = {}
    any_missing = False
    for uri in uri_list:
        found = fetch_links(uri, lang)
        link_map[uri] = found
        any_missing = any_missing or len(found) == 0
    return (any_missing, link_map)

In [3]:
# Smoke test: look up the French links of two sample resources. In the recorded
# output below, Colombo_Lighthouse has no French link and Donald_Trump has one.
sample_uris = {
    'http://dbpedia.org/resource/Colombo_Lighthouse',
    'http://dbpedia.org/resource/Donald_Trump',
}
print(check_links(sample_uris, 'fr'))

(True, {'http://dbpedia.org/resource/Colombo_Lighthouse': set(), 'http://dbpedia.org/resource/Donald_Trump': {'http://fr.dbpedia.org/resource/Donald_Trump'}})


In [4]:
# SPARQL endpoints of the language-specific DBpedia editions, keyed by language
# code. The keys also define which languages queries are translated into.
endpoint_dict = {
    'de': 'http://de.dbpedia.org/sparql',
    'es': 'https://es.dbpedia.org/sparql',
    'fr': 'http://fr.dbpedia.org/sparql',
}

In [5]:
def convert_sparql(sparql_str, prefix_mentions, uris, lang):
    """Rewrite a SPARQL query so it references resources of the ``lang`` edition.

    Parameters:
        sparql_str: the original SPARQL query text.
        prefix_mentions: dict mapping a prefixed name as written in the query
            (e.g. ``"res:Berlin"``) to the full IRI it expands to.
        uris: iterable of IRIs that appear in the query inside ``<...>``,
            given without the angle brackets.
        lang: language code passed on to ``fetch_single_link``.

    Returns:
        The query with every mention that has a link replaced by that link, or
        ``None`` when no mention at all could be linked. Mentions without a
        link are left untouched.
    """
    # Imported locally so the function works even when the cell that imports
    # ``re`` at notebook level has not been run yet.
    import re

    replacements = {}
    # A prefixed name is replaced by a full IRI, which is only valid SPARQL when
    # written as <IRI>; a bare "http://..." would be parsed as prefix "http".
    for abbr, uri in prefix_mentions.items():
        single_link = fetch_single_link(uri, lang)
        if single_link:
            replacements[abbr] = '<' + single_link + '>'
    # Direct IRI mentions already sit between angle brackets in the query, so
    # only the IRI text itself is swapped.
    for uri in uris:
        single_link = fetch_single_link(uri, lang)
        if single_link:
            replacements[uri] = single_link
    if not replacements:
        return None
    # Regex alternation takes the first alternative that matches, so try longer
    # keys first; otherwise "res:Berlin" would match inside "res:Berlin_Wall"
    # and corrupt it.
    ordered_keys = sorted(replacements, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in ordered_keys))
    # Each match is exactly one of the literal keys, so it can be looked up directly.
    return pattern.sub(lambda m: replacements[m.group(0)], sparql_str)
def get_all_sparql(sparql_str, prefix_mentions, uris):
    """Translate one query for every language listed in ``endpoint_dict``.

    Returns a dict mapping each language code to the result of
    ``convert_sparql`` for that language (which may be ``None``).
    """
    return {
        lang: convert_sparql(sparql_str, prefix_mentions, uris, lang)
        for lang in endpoint_dict
    }

In [6]:
# Testing replacement


In [8]:
import re
# Translate the SPARQL query of every question and store the result under
# question['query']['translated'] (language code -> translated query or None).
#
# prefix_map is deliberately kept across questions: a prefix declared in an
# earlier query stays resolvable in later ones.
prefix_map = {}
count = 0
total_len = len(data_json['questions'])
for count, question in enumerate(data_json['questions'], start=1):
    sparql_str = question['query']['sparql']
    # Collect the prefix declarations: PREFIX abbr: <namespace>
    prefix_map.update(
        re.findall(r'PREFIX\s+([a-zA-Z0-9]+):\s+<(.*?)>', sparql_str, re.IGNORECASE)
    )
    # Collect the prefixed names (abbr:local) mentioned in the query
    prefix_mentions = re.findall(r'{?[\s\t\n\r]+([\w\d]+):([^<\s]+)', sparql_str, re.IGNORECASE)
    # Expand each prefixed name to its full IRI, keyed by the text as written
    abbr_map = {}
    for prefix, local_name in prefix_mentions:
        namespace = prefix_map.get(prefix)
        if namespace is None:
            # Not a declared prefix (text that merely looks like abbr:local);
            # skip it instead of aborting the whole run with a KeyError.
            continue
        abbr_map[prefix + ':' + local_name] = namespace + local_name
    # Collect the IRIs written directly as <...>; duplicates are dropped (order
    # kept) so the same IRI is not looked up over the network more than once.
    extra_uris = list(dict.fromkeys(re.findall(r'<(.*?)>', sparql_str, re.IGNORECASE)))
    # Generate and save the language-specific SPARQL queries
    sparql_map = get_all_sparql(sparql_str, abbr_map, extra_uris)
    question['query']['translated'] = sparql_map
    # end='\r' rewrites the same output line instead of printing one per question
    print('Progress:', count, '/', total_len, end='\r')
# Finish the progress line
print()

Progress: 1 / 215 
Progress: 2 / 215 
Progress: 3 / 215 
Progress: 4 / 215 
Progress: 5 / 215 
Progress: 6 / 215 
Progress: 7 / 215 
Progress: 8 / 215 
Progress: 9 / 215 
Progress: 10 / 215 
Progress: 11 / 215 
Progress: 12 / 215 
Progress: 13 / 215 
Progress: 14 / 215 
Progress: 15 / 215 
Progress: 16 / 215 
Progress: 17 / 215 
Progress: 18 / 215 
Progress: 19 / 215 
Progress: 20 / 215 
Progress: 21 / 215 
Progress: 22 / 215 
Progress: 23 / 215 
Progress: 24 / 215 
Progress: 25 / 215 
Progress: 26 / 215 
Progress: 27 / 215 
Progress: 28 / 215 
Progress: 29 / 215 
Progress: 30 / 215 
Progress: 31 / 215 
Progress: 32 / 215 
Progress: 33 / 215 
Progress: 34 / 215 
Progress: 35 / 215 
Progress: 36 / 215 
Progress: 37 / 215 
Progress: 38 / 215 
Progress: 39 / 215 
Progress: 40 / 215 
Progress: 41 / 215 
Progress: 42 / 215 
Progress: 43 / 215 
Progress: 44 / 215 
Progress: 45 / 215 
Progress: 46 / 215 
Progress: 47 / 215 
Progress: 48 / 215 
Progress: 49 / 215 
Progress: 50 / 215 
Progress:

In [10]:
import json
# Save the dataset, now carrying the translated queries, as pretty-printed JSON
# with sorted keys.
output_path = 'porque-test/qald-7-train-multilingual_translated-sparql.json'
with open(output_path, 'w') as out:
    serialized = json.dumps(data_json, sort_keys=True, indent=4)
    out.write(serialized)
print('Modified QALD JSON written to file.')

Modified QALD JSON written to file.


In [None]:
# Compare the answers fetched by sparql queries of each question