In [None]:
# Read the mintaka dataset

In [14]:
import json
from SPARQLWrapper import SPARQLWrapper, JSON
from string import Template
import re
from pathlib import Path
from tqdm import tqdm

In [15]:
with open('../eval/dataset/mintaka/mintaka_train.json') as mintaka_train_file:
    mintaka_train_json = json.load(mintaka_train_file)
pbar = tqdm(total=len(mintaka_train_json), position=0, leave=True)

 40%|████      | 5630/14000 [1:11:22<1:46:07,  1.31it/s]


In [16]:
# mintaka_train_json[42]

In [17]:
# accepted_langs = ['de', 'pt', 'es', 'fr']
accepted_langs = ['it']

In [18]:
def safeget(dct, *keys):
    for key in keys:
        try:
            dct = dct[key]
        except KeyError:
            return None
    return dct

In [19]:
sparql_wd = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql_wd.setReturnFormat(JSON)

# Temporary agent modifier
agent_header = {'User-Agent': 'wiki_parser_online/0.17.1 (https://deeppavlov.ai;'
                              ' info@deeppavlov.ai) deeppavlov/0.17.1'}
sparql_wd.agent = str(agent_header)

langmatches_clause = " langMatches( lang(?enlbl), \"en\") "
for lang in accepted_langs:
    langmatches_clause += " || langMatches( lang(?lbl), \"" + lang + "\") "

WD_QUERY_STR = Template('''
    SELECT ?lbl (lang(?lbl) as ?lang) WHERE {
        OPTIONAL {
            wd:$link rdfs:label ?lbl .
            FILTER ( %s )
        }
    }
''' % langmatches_clause)


def find_entity_labels(entity_id):
    ret_dict = {}
    f_sparql = WD_QUERY_STR.substitute(link=entity_id)
    # print('SPARQL: %s' % f_sparql)
    sparql_wd.setQuery(f_sparql)
    ret = sparql_wd.queryAndConvert()
    # print('SPARQL results: %s' % ret)
    for r in ret["results"]["bindings"]:
        cur_lang = safeget(r, 'lang', 'value')
        cur_lbl = safeget(r, 'lbl', 'value')
        # print('Current label: %s\nCurrent language: %s\n\n' % (cur_lbl, cur_lang))
        if cur_lang and cur_lbl and (cur_lang in accepted_langs):
            ret_dict[cur_lang] = cur_lbl
    return ret_dict

In [20]:
def fetch_placeholder_str(query, ent_links):
    query_plc = ''
    last_ind = 0
    for link in ent_links:
        if 'link' not in link:
            continue
        plchldr = link['placeholder']
        # forming the placeholder query
        if plchldr:
            query_plc += query[last_ind:link['start']] + plchldr
            last_ind = link['end']
    query_plc += query[last_ind:]
    return query_plc

In [21]:
# For each entry in mintaka dataset
    # For each entity in the entry
        # Extract other language labels
        # For each entry check if label exists in the query of that language
    # For each language that has matching number of entities with original English text
        # Write the entry to reference and prediction files

In [22]:
def extract_ent_mentions(entity_info_array, plc):
    ent_mentions = []
    arr_ind = 1
    entity_info_array.sort(key=lambda d: d['span'][0])
    for info in entity_info_array:
        enttype = info.get('entityType')
        if enttype == 'entity':
            mention = {'start': info['span'][0], 'end': info['span'][1],
                       'surfaceform': info['mention'], 'link': info['name'],
                       'placeholder': '[%s%d]' % (plc, arr_ind)}
            ent_mentions.append(mention)
            arr_ind += 1
    # ent_mentions.sort(key=lambda d: d['start'])
    return ent_mentions

In [23]:
# mintaka_train_json = mintaka_train_json[0:3]
# print(len(mintaka_train_json))
# print(mintaka_train_json[2])

In [24]:
# open the output files
output_dir = 'data/'
Path(output_dir).mkdir(parents=True, exist_ok=True)
op_files = {}
for key in accepted_langs:
    plc_en_file = '%s%s_en_placeholder.txt' % (output_dir, key)
    plc_lang_file = '%s%s_placeholder.txt' % (output_dir, key)
    norm_en_file = '%s%s_en.txt' % (output_dir, key)
    norm_lang_file = '%s%s.txt' % (output_dir, key)
    en_file_obj = open(plc_en_file, 'w')
    lang_file_obj = open(plc_lang_file, 'w')
    norm_en_file_obj = open(norm_en_file, 'w')
    norm_lang_file_obj = open(norm_lang_file, 'w')
    op_files[key] = (en_file_obj, lang_file_obj, norm_en_file_obj, norm_lang_file_obj)

In [25]:
for entry in mintaka_train_json:
    eng_text = entry['question']
    ent_mentions = extract_ent_mentions(entry['questionEntity'], '00')
    # print('Entity mentions: %s' % ent_mentions)
    eng_plc_text = fetch_placeholder_str(eng_text, ent_mentions)
    ent_count = len(ent_mentions)
    label_count = {lang: 0 for lang in accepted_langs}
    lang_ent_mentions = {key: [] for key in accepted_langs}
    for mention in ent_mentions:
        entity_id = mention['link']
        label_dict = find_entity_labels(entity_id)
        # print('label dict: %s' % label_dict)
        for key in label_dict:
            lang_query = entry['translations'][key]
            cur_label = label_dict[key]
            if cur_label in lang_query:
                # Add lang mention
                for match in re.finditer(re.escape(cur_label), lang_query):
                    lang_mention = {'link': entity_id,
                                    'start': match.start(),
                                    'end': match.end(),
                                    'placeholder': mention['placeholder']}
                    lang_ent_mentions[key].append(lang_mention)
                label_count[key] += 1
    # print('Extracted language based entity mentions: %s' % lang_ent_mentions)
    # check label counts
    for key in label_count:
        if label_count[key] == ent_count:
            lang_query = entry['translations'][key]
            # sorting for placeholder logic
            lang_ent_mentions[key].sort(key=lambda d: d['start'])
            lang_ent_mentions[key]
            # Generate placeholder text
            lang_plc_text = fetch_placeholder_str(lang_query, lang_ent_mentions[key])
            # write to file
            # placeholder files
            plc_en_file = op_files[key][0]
            plc_lang_file = op_files[key][1]
            # normal files
            norm_en_file = op_files[key][2]
            norm_lang_file = op_files[key][3]
            # print(eng_plc_text, '\n', lang_plc_text, '\n\n\n')
            plc_en_file.write(eng_plc_text + '\n')
            plc_lang_file.write(lang_plc_text + '\n')
            norm_en_file.write(eng_text + '\n')
            norm_lang_file.write(lang_query + '\n')
    pbar.update(1)
print('Processing complete!')

100%|██████████| 14000/14000 [1:02:40<00:00,  5.00it/s]

Processing complete!


In [26]:
##### 

In [27]:
# Close the output files
for value in op_files.values():
    for file_obj in value:
        file_obj.close()
print('Files closed.')

Files closed.


In [None]:
# Convert to json
# This is kept separate from the previous logic to run independently without the need to run previous cells

In [28]:
import json
from pathlib import Path

In [30]:
#accepted_langs = ['de', 'pt', 'es', 'fr']
accepted_langs = ['it']
input_dir = 'data/'
output_dir = 'data/json/'
Path(output_dir).mkdir(parents=True, exist_ok=True)
for key in accepted_langs:
    plc_en_file = '%s%s_en_placeholder.txt' % (input_dir, key)
    plc_lang_file = '%s%s_placeholder.txt' % (input_dir, key)
    plc_out_file = '%s%s_en_placeholder.json' % (output_dir, key)
    with open(plc_en_file, 'r') as en_file_obj,  open(plc_lang_file, 'r') as lang_file_obj, open(plc_out_file, 'w') as out_file_obj:
        # Read the files and write json
        en_texts = en_file_obj.read().splitlines()
        lang_texts = lang_file_obj.read().splitlines()
        print('[Placeholder] English file(%s) lines: %d\n[Placeholder] Lang file(%s) lines: %d\n' % (plc_en_file, len(en_texts), plc_lang_file, len(lang_texts)))
        output_json = []
        for en_line, lang_line in zip(en_texts, lang_texts):
            output_json.append({'output': en_line, 'input': lang_line})
        json.dump(output_json, out_file_obj)

    norm_en_file = '%s%s_en.txt' % (input_dir, key)
    norm_lang_file = '%s%s.txt' % (input_dir, key)
    norm_out_file = '%s%s_en.json' % (output_dir, key)
    with open(norm_en_file, 'r') as en_file_obj,  open(norm_lang_file, 'r') as lang_file_obj, open(norm_out_file, 'w') as out_file_obj:
        # Read the files and write json
        en_texts = en_file_obj.read().splitlines()
        lang_texts = lang_file_obj.read().splitlines()
        print('[Normal] English file(%s) lines: %d\n[Normal] Lang file(%s) lines: %d\n\n' % (norm_en_file, len(en_texts), norm_lang_file, len(lang_texts)))
        output_json = []
        for en_line, lang_line in zip(en_texts, lang_texts):
            output_json.append({'output': en_line, 'input': lang_line})
        json.dump(output_json, out_file_obj)
print('json files created!')

[Placeholder] English file(data/it_en_placeholder.txt) lines: 6382
[Placeholder] Lang file(data/it_placeholder.txt) lines: 6383

[Normal] English file(data/it_en.txt) lines: 6382
[Normal] Lang file(data/it.txt) lines: 6383


json files created!


In [None]:
# Testing load dataset for huggingface

In [None]:
from datasets import load_dataset

In [None]:
dataset = load_dataset("json", data_files="data/json/de_en*.json")

In [None]:
dataset

In [None]:
for data in dataset['train']:
    print(data)
    break