### Notebook to perform general utility actions

#### Update the answers in a QALD file from a remote endpoint

In [None]:
# imports
import sys
sys.path.append('../code/')
from utils.gen_util import update_qald_dataset

In [None]:
# Configuration for the update_qald_dataset call in the next cell.
# Input QALD file and output path handed to update_qald_dataset.
input_file_path = "../datasets/qald9plus/wikidata/qald_9_plus_test_wikidata.json"
output_file_path = "../datasets/qald9plus/wikidata/qald_9_plus_test_wikidata_updated.json"
# Alternative QALD-10 paths; uncomment to use them instead of the two lines above.
#input_file_path = "../datasets/qald10/qald_10.json"
#output_file_path = "../datasets/qald10/qald_10_latest.json"
# Knowledge graph name passed to update_qald_dataset.
kg = "Wikidata"
# Language codes passed to update_qald_dataset.
languages = ["en", "de", "ru", "fr", "lt", "ba", "be", "uk", "zh", "ja", "es"]
# Alternative, shorter language list.
#languages = ["en", "de", "ru", "zh", "ja"]

In [None]:
# Run the update with the paths, languages and knowledge graph set in the
# previous cell (per this section's heading, answers come from a remote endpoint).
update_qald_dataset(input_file_path, output_file_path, languages, kg)

#### Convert the LC-QuAD 2.0 dataset to QALD based format

In [None]:
# imports
import sys
sys.path.append('../code/')
from utils.gen_util import convert_lcquad2_to_qald

In [None]:
# Input and output paths passed to convert_lcquad2_to_qald in the next cell.
input_file_path = "../datasets/lcquad2/test.json"
output_file_path = "../datasets/qald_lcquad2/test_qald.json"

In [None]:
# Run the conversion with the input/output paths set in the previous cell.
convert_lcquad2_to_qald(input_file_path, output_file_path)

#### Convert Mintaka dataset to QALD based format

In [None]:
# imports
import sys
sys.path.append('../code/')
from utils.gen_util import convert_mintaka_to_qald

In [None]:
# Input path, output path and language codes passed to
# convert_mintaka_to_qald in the next cell.
input_file_path = "../datasets/mintaka/mintaka_test.json"
output_file_path = "../datasets/qald_mintaka/mintaka_test_qald.json"
languages = ["en", "de", "es", "fr"]

In [None]:
# Run the conversion with the paths and languages set in the previous cell.
convert_mintaka_to_qald(input_file_path, output_file_path, languages)

#### Format QALD based Mintaka results for Mintaka evaluation

In [None]:
# imports
import sys
sys.path.append('../code/')
from utils.gen_util import extract_mintaka_qald_results

In [None]:
# Languages to process and the path templates used by the loop in the next
# cell; "%s" in each template is replaced by the language code.
languages = ["en", "de", "es", "fr"]
input_file_path_tmpl = "../pred_files_mintaka2qald/qald9plus-finetune/%s.json"
output_file_path_tmpl = "../pred_files_mintaka2qald/qald9plus-finetune/%s_formatted_result.json"

In [None]:
# Call extract_mintaka_qald_results once per language, filling the language
# code into the input and output path templates.
for lang in languages:
    input_file_path = input_file_path_tmpl % lang
    output_file_path = output_file_path_tmpl % lang
    extract_mintaka_qald_results(input_file_path, output_file_path)

In [None]:
import re
import sys
from SPARQLWrapper import JSON
from SPARQLWrapper import SPARQLWrapper
from SPARQLWrapper.SPARQLExceptions import SPARQLWrapperException

In [None]:
# SPARQL PREFIX declarations for the Wikidata namespaces. ask_wikidata
# prepends this block to every query, so queries can use the short forms
# (wd:, wdt:, p:, ps:, pq:, ...) without declaring them.
QUERY_PREFIX = """
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX psn: <http://www.wikidata.org/prop/statement/value-normalized/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wds: <http://www.wikidata.org/entity/statement/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
PREFIX wdv: <http://www.wikidata.org/value/>
"""

def ask_wikidata(sparql_str, endpoint_url="https://skynet.coypu.org/wikidata/"):
    """Run a SPARQL query against a Wikidata endpoint and return the result.

    QUERY_PREFIX is prepended to the query, so ``sparql_str`` may use the
    prefixes declared there (``wd:``, ``wdt:``, ...) without declaring them.

    Args:
        sparql_str: The SPARQL query text, without PREFIX declarations.
        endpoint_url: URL of the SPARQL endpoint to query. Defaults to the
            skynet.coypu.org Wikidata endpoint; pass
            "https://query.wikidata.org/sparql" to use the public Wikidata
            query service instead.

    Returns:
        The endpoint's response as converted by SPARQLWrapper for the JSON
        return format.
    """
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery(QUERY_PREFIX + '\n' + sparql_str)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

In [None]:
# Example query: fetch the distinct wdt:P50 values of wd:Q761383 and print
# the raw result returned by ask_wikidata.
sparql_str = "SELECT DISTINCT  ?uri WHERE  {  wd:Q761383 wdt:P50  ?uri .  }  "
print(ask_wikidata(sparql_str))

#### Update train file for QALD10 with Chinese, Japanese and Spanish translations

In [None]:
import json

qald9plus_train_path = "../datasets/qald9plus/wikidata/qald_9_plus_train_wikidata.json"
qald9plus_test_path = "../datasets/qald9plus/wikidata/qald_9_plus_test_wikidata.json"

qald10_train_path = "../datasets/qald10/qald_9_plus_train_wikidata.json"

output_file = "../datasets/qald10/qald10_train.json"

# Load both the QALD-9-plus train and test splits. These files are the source
# of the zh/ja/es question strings, so read them as UTF-8 explicitly instead
# of relying on the platform default encoding (the output is UTF-8 as well).
with open(qald9plus_train_path, encoding='utf-8') as train_file, \
        open(qald9plus_test_path, encoding='utf-8') as test_file:
    qald9plus_train = json.load(train_file)
    qald9plus_test = json.load(test_file)
    qald_objects = [qald9plus_train, qald9plus_test]

# Map each English question string to the Chinese, Japanese and Spanish
# entries ({'language': ..., 'string': ...}) of the same question.
q_map = {}
lang_arr = ['zh', 'ja', 'es']
for qald_json in qald_objects:
    questions = qald_json['questions']
    for ques in questions:
        # Reset per question so a question without an English entry is never
        # filed under the previous question's key.
        cur_key = None
        translations = []
        for q_pair in ques['question']:
            lang = q_pair['language']
            if lang == 'en':
                cur_key = q_pair['string']
            elif lang in lang_arr:
                translations.append(q_pair)
        if cur_key is not None:
            q_map[cur_key] = translations

# Load the QALD-10 train file.
with open(qald10_train_path, encoding='utf-8') as train_file:
    qald10_train = json.load(train_file)

# For each question, look up its first English string and append the
# translations recorded for it. Questions without a match are left unchanged
# and counted, rather than aborting the whole run with a KeyError.
unmatched_count = 0
questions = qald10_train['questions']
for ques in questions:
    translations = None
    for q_pair in ques['question']:
        if q_pair['language'] == 'en':
            translations = q_map.get(q_pair['string'])
            break
    if translations is None:
        unmatched_count += 1
        continue
    ques['question'].extend(translations)

if unmatched_count:
    print("%d of %d questions had no matching English question and were left unchanged"
          % (unmatched_count, len(questions)))

# Export the updated dataset as JSON, keeping non-ASCII characters readable.
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(qald10_train, f, ensure_ascii=False, indent=4)