In [1]:
# Load QALD8 questions
# Extract entities (with German sameAs) out of SPARQL in QALD8
# Extract German and English subgraph using entities in QALD8
# The steps below apply only to the German Subgraph
# Extract all the triples with string literals into a file (string-triples.nt)
# Writes two files:
    # 1. Triples with replaced string literal (<UNK>) (sub-str-triples.tsv)
    # 2. String literals corresponding to triples in file1 (str-literals.txt)
# Extract all non-literal triples into a file (non-literal-triples.tsv)
# Preprocess the triples and keep a map between preprocessed token and original uri
# Perform the triple translation
# Perform the text translation using KG-NMT

In [1]:
# Load QALD8 questions

# import urllib library
from urllib.request import urlopen
  
# import json
import json
# store the URL in url as 
# parameter for urlopen
url = "https://raw.githubusercontent.com/ag-sc/QALD/master/7/data/qald-7-train-multilingual.json"
  
# store the response of URL
response = urlopen(url)
  
# storing the JSON response 
# from url in data
data_json = json.loads(response.read())
  
# print the json response
print(data_json['dataset']['id'])

qald-7-train-multilingual


In [2]:
# Extract entities (with German sameAs) out of SPARQL in QALD8

import re
ent_list = set({})
spc_sep_ents = ''
sep_ents_arr = []
limit = 1
count = 0
for question in data_json['questions']:
    sparql = question['query']['sparql']
    temp_res1 = re.findall('res:[^\s\.]+',sparql)
    temp_res1 = [w.replace('res:', 'http://dbpedia.org/resource/') for w in temp_res1]
    temp_res2 = re.findall('http://dbpedia.org/resource/[^>]+', sparql)
    ent_list.update(temp_res1)
    ent_list.update(temp_res2)
    # print(temp_res1, temp_res2)

# print(ent_list)
    
spc_sep_ents = ''
for ent in ent_list:
    count+=1
    spc_sep_ents+='<'+ent+'> '
    if count%limit == 0 :
        sep_ents_arr.append(spc_sep_ents)
        spc_sep_ents = ''
sep_ents_arr.append(spc_sep_ents)

print('Number of entities found: ', len(ent_list))
print('Number of spaced strings: ', len(sep_ents_arr))

# print(sep_ents_arr)

Number of entities found:  213
Number of spaced strings:  214


In [3]:


import time
from SPARQLWrapper import SPARQLWrapper, JSON
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

en_ents = set({})
de_ents = set({})

for sep_ents_str in sep_ents_arr :
    sameas_query = """SELECT ?ent ?ent_de WHERE {
    ?ent owl:sameAs ?ent_de . 
    FILTER(regex(str(?ent_de ), "http://de.dbpedia.org/resource/" )) """
    sameas_query+='VALUES ?ent { '+ sep_ents_str +'}} LIMIT 2'

    # print('query: ', sameas_query)
    
    sparql.setQuery(sameas_query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    print('Result size:',len(results["results"]["bindings"]))

    for result in results["results"]["bindings"]:
        en_ents.add(result['ent']['value'])
        de_ents.add(result['ent_de']['value'])
        
    time.sleep(0.1)
    
print('Total number of English entries: ',len(en_ents))
print('Total number of German entries: ',len(de_ents))

Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 2
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 0
Result size: 0
Result size: 1
Result size: 1
Result size: 1
Result size: 0
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 0
Result size: 2
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 2
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 0
Result size: 1
Result size: 1
Result size: 1
Result size: 1
Result size: 2
Result size: 1
Result size: 1
Result size: 1
Result size: 2
Result size: 1
Result size: 1
Result siz

In [40]:
# Extract German and English subgraph using entities in QALD8

# English Sub-Graph
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sep_ents_arr = []
limit = 10
count = 0
spc_sep_ents = ''

for ent in en_ents:
    count+=1
    spc_sep_ents+='<'+ent+'> '
    if count%limit == 0 :
        sep_ents_arr.append(spc_sep_ents)
        spc_sep_ents = ''
if len(spc_sep_ents) > 0:
    sep_ents_arr.append(spc_sep_ents)

en_subgraph = Graph()

for sep_ents_str in sep_ents_arr :
    describe_query = 'DESCRIBE ?ent WHERE { VALUES ?ent { '+ sep_ents_str +' } }'
    sparql.setQuery(describe_query)
    sparql.setReturnFormat('rdf')
    results = sparql.query().convert()
    print('query response size: ',len(results))
    en_subgraph+=results
    time.sleep(1)
print('Size of English Sub-Graph: ', len(en_subgraph))

query response size:  140657
query response size:  77206
query response size:  307032
query response size:  199786
query response size:  48406
query response size:  266263
query response size:  34515
query response size:  356715
query response size:  114366
query response size:  195341
query response size:  56020
query response size:  68024
query response size:  111657
query response size:  155502
query response size:  257083
query response size:  217448
query response size:  512006
query response size:  516853
query response size:  787610
Size of English Sub-Graph:  4421499


In [42]:


# German Sub-Graph
sparql = SPARQLWrapper("http://de.dbpedia.org/sparql")
sep_ents_arr = []
limit = 10
count = 0
spc_sep_ents = ''

for ent in de_ents:
    count+=1
    spc_sep_ents+='<'+ent+'> '
    if count%limit == 0 :
        sep_ents_arr.append(spc_sep_ents)
        spc_sep_ents = ''
if len(spc_sep_ents) > 0:
    sep_ents_arr.append(spc_sep_ents)

de_subgraph = Graph()
for sep_ents_str in sep_ents_arr :
    describe_query = 'DESCRIBE ?ent WHERE { VALUES ?ent { '+ sep_ents_str +' } }'
    sparql.setQuery(describe_query)
    sparql.setReturnFormat('rdf')
    results = sparql.query().convert()
    print('query response size: ', len(results))
    de_subgraph+= results
    time.sleep(1)
print('Size of German Sub-Graph: ', len(de_subgraph))

query response size:  30419
query response size:  4602
query response size:  78518
query response size:  2014
query response size:  780
query response size:  898
query response size:  11396
query response size:  10187
query response size:  959
query response size:  8600
query response size:  29905
query response size:  1494
query response size:  8726
query response size:  2234
query response size:  5901
query response size:  4361
query response size:  891
query response size:  15245
query response size:  22581
query response size:  1069
query response size:  2286
Size of German Sub-Graph:  243001


In [43]:
# Write Sub-Graphs to a file
en_subgraph.serialize(destination="lf-translation/en-sub-graph.nt",format='nt')
de_subgraph.serialize(destination="lf-translation/de-sub-graph.nt",format='nt') 

### Code below can be used directly if the sub-graphs already exist.

In [3]:
# Load the sub-graph
from rdflib import Graph

en_subgraph = Graph()
en_subgraph.parse(source="lf-translation/en-sub-graph.nt", format='nt')

de_subgraph = Graph()
de_subgraph.parse(source="lf-translation/de-sub-graph.nt", format='nt')

print('Size of English Sub-Graph: ', len(en_subgraph))
print('Size of German Sub-Graph: ', len(de_subgraph))

Size of English Sub-Graph:  4421499
Size of German Sub-Graph:  243001


In [None]:
# The steps below apply only to the German Subgraph

In [77]:
# URI Preprocessing function

# from rdflib import Graph, URIRef
# nsg = Graph()
# nsg.bind("dbo", 'http://dbpedia.org/ontology/')
# nsg.bind("dbr_en", 'http://dbpedia.org/resource/')
# nsg.bind("dbp", 'http://dbpedia.org/property/')
# nsg.bind("dbr_de", 'http://de.dbpedia.org/resource/')
# nsg.bind("dbp_de", 'http://de.dbpedia.org/property/')
# # all_ns = [n for n in g.namespace_manager.namespaces()]
# # print(all_ns)
# uri_map = {}
# def process_uri(uri_ref):
#     # Check map if URI exists; if yes, then return corresponding token. Else:
#     uri_str = str(uri_ref)
#     if uri_str in uri_map:
#         return uri_map[str(uri_ref)]
#     # replace dbo, dbp, dbr namespace with prefixes & for new URIs, generate new prefix
#     frag = nsg.compute_qname(uri_ref)
#     retval = frag[0] + '_' + frag[2]
#     retval = retval.lower()
#     # Map the end result to its URI
#     uri_map[str(uri_ref)] = retval
#     return retval

In [14]:
import re
uri_map = {}
# uri_map = json.load(open("lf-translation/uri-map"))
def clean(uri_ref):
    # Check map if URI exists; if yes, then return corresponding token. Else:
    uri_str = str(uri_ref)
    if uri_str in uri_map:
        return uri_map[str(uri_ref)]
    string = uri_str.replace("http://dbpedia.org/ontology/", "dbo_")
    string = string.replace("http://dbpedia.org/property/", "dbp_")
    string = string.replace("http://dbpedia.org/resource/", "dbr_en_")
    string = string.replace("http://de.dbpedia.org/property/", "dbp_de_")
    string = string.replace("http://de.dbpedia.org/resource/", "dbr_de_")
    string = re.sub(r'\W+', '', string)
    string = string.lower()
    # Map the end result to its URI
    uri_map[str(uri_ref)] = string
    return string

In [15]:
# Extract all the triples with string literals into a file (string-triples.tsv)
# TODO: 
string_sparql = 'SELECT ?s ?p ?o WHERE {?s ?p ?o . FILTER ( datatype(?o) = <http://www.w3.org/1999/02/22-rdf-syntax-ns#langString> && lang(?o) = "de") } '
qres = de_subgraph.query(string_sparql)
# Writes two files:
    # 1. Triples with replaced string literal (<UNK>) (sub-str-triples.tsv)
    # 2. String literals corresponding to triples in file1 (str-literals.txt)
with open('lf-translation/de-str-triples.tsv', 'w') as str_out, \
    open('lf-translation/de-str-rep-triples-preprocessed.txt', 'w') as str_rep_out, \
    open('lf-translation/de-str-only.txt', 'w') as only_str_out:
    for row in qres:
        # string triples out
        str_out.write(str(row[0]) + '\t' + str(row[1]) + '\t"' + str(row[2]) + '"\n')
        # replaced/preprocessed triples
        str_rep_out.write(clean(row[0]) + ' ' + clean(row[1]) + ' <UNK>\n')
        # only literals
        only_str_out.write(str(row[2]) + '\n')
print('Pre-processed literal triples ready.')

Pre-processed literal triples ready.


In [16]:
# Extract all non-literal triples into a file (non-literal-triples.tsv)
nolit_sparql = 'SELECT ?s ?p ?o WHERE {?s ?p ?o . FILTER(!isLiteral(?o) ) }'
qres = de_subgraph.query(nolit_sparql)
with open('lf-translation/de-non-literal-triples.tsv', 'w') as nolit_out, \
     open('lf-translation/de-non-literal-triples-preprocessed.txt', 'w') as nolit_pp_out   :
     for row in qres:
        # triples out
        nolit_out.write(str(row[0]) + '\t' + str(row[1]) + '\t' + str(row[2]) + '\n')
        # preprocessed triples out
        nolit_pp_out.write(clean(row[0]) + ' ' + clean(row[1]) + ' ' + clean(row[2]) + '\n')
print('Pre-processed non-literal triples ready.')

Pre-processed non-literal triples ready.


In [17]:
# Preprocess the triples and keep a map between preprocessed token and original uri
# ^^ This step is done along with the extraction above.
# Save the Unique URI Map
import json
with open('lf-translation/uri-map', 'w') as out:
    out.write(json.dumps(uri_map, indent=4, sort_keys=True))
print('URI Map written to file.')

URI Map written to file.


In [45]:
# Perform the triple translation (Must be done in THOTH's python environment)
# python -m nmt.nmt  --vocab_prefix=../$1/vocab --model_dir=../$1_model  --inference_input_file=./to_ask.txt  --inference_output_file=./output.txt --out_dir=../$1_model --src=en --tgt=sparql > /dev/null 2>&1


In [57]:
# Perform the text translation using KG-NMT
