In [None]:
# Read the Mintaka dataset

In [1]:
import json
from SPARQLWrapper import SPARQLWrapper, JSON
from string import Template
import re
from pathlib import Path
from tqdm import tqdm

In [2]:
# Load the Mintaka training split. JSON text is UTF-8 by specification, so the
# encoding is pinned instead of depending on the platform default.
with open('../eval/dataset/mintaka/mintaka_train.json', encoding='utf-8') as mintaka_train_file:
    mintaka_train_json = json.load(mintaka_train_file)
# Manually driven progress bar: one tick per dataset entry, advanced by the
# processing loop via pbar.update(1).
pbar = tqdm(total=len(mintaka_train_json))

  0%|          | 0/14000 [00:00<?, ?it/s]

In [3]:
# mintaka_train_json[42]

In [4]:
# Language codes for which Wikidata labels are requested and for which a pair of
# output files is produced; they are also used as keys into entry['translations'].
accepted_langs = ['de', 'pt', 'es', 'fr']

In [5]:
def safeget(dct, *keys):
    """Walk a nested container along ``keys`` and return the value found.

    Args:
        dct: The outermost container (typically a dict parsed from JSON).
        *keys: Successive keys/indices to follow, outermost first.

    Returns:
        The value at the end of the path, or ``None`` as soon as any step
        cannot be resolved. With no keys, ``dct`` itself is returned.
    """
    current = dct
    for key in keys:
        try:
            current = current[key]
        except (KeyError, IndexError, TypeError):
            # KeyError: missing dict key; IndexError: list index out of range;
            # TypeError: the intermediate value is None / a scalar / otherwise
            # not subscriptable with this key. All mean "path does not exist".
            return None
    return current

In [6]:
# Client for the public Wikidata SPARQL endpoint; results are requested as JSON.
sparql_wd = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql_wd.setReturnFormat(JSON)

# Temporary agent modifier
agent_header = {'User-Agent': 'wiki_parser_online/0.17.1 (https://deeppavlov.ai;'
                              ' info@deeppavlov.ai) deeppavlov/0.17.1'}
# `agent` is sent as the value of the User-Agent header, so it must be the plain
# agent string. str(agent_header) would send the repr of the whole dict
# ("{'User-Agent': '...'}") instead.
sparql_wd.agent = agent_header['User-Agent']

# One langMatches() test per accepted language, OR-ed together. The query only
# binds ?lbl, so every test must refer to ?lbl (a test on any other variable
# can never match). "false" keeps the FILTER syntactically valid if the
# language list is empty.
langmatches_clause = " || ".join(
    'langMatches( lang(?lbl), "%s")' % lang_code for lang_code in accepted_langs
) or "false"

# `$link` is filled in per entity with Template.substitute(); the FILTER body is
# fixed once here with %-formatting.
WD_QUERY_STR = Template('''
    SELECT ?lbl (lang(?lbl) as ?lang) WHERE {
        OPTIONAL {
            wd:$link rdfs:label ?lbl .
            FILTER ( %s )
        }
    }
''' % langmatches_clause)


def find_entity_labels(entity_id):
    """Fetch the labels of a Wikidata entity in the accepted languages.

    Args:
        entity_id: Wikidata identifier such as ``"Q42"``.

    Returns:
        dict mapping language code -> label, restricted to ``accepted_langs``.
        Empty when the entity has no such labels or when ``entity_id`` does not
        look like a Wikidata identifier.
    """
    # The id is spliced verbatim into the SPARQL text, so only accept something
    # shaped like a Wikidata id. Anything else (None, numbers, free text) cannot
    # name an entity; skipping it avoids a pointless or malformed request.
    if not isinstance(entity_id, str) or not re.fullmatch(r'[QPL]\d+', entity_id):
        return {}

    sparql_wd.setQuery(WD_QUERY_STR.substitute(link=entity_id))
    response = sparql_wd.queryAndConvert()

    labels = {}
    for binding in response["results"]["bindings"]:
        cur_lang = safeget(binding, 'lang', 'value')
        cur_lbl = safeget(binding, 'lbl', 'value')
        # The OPTIONAL in the query can yield a row with no label at all, so
        # both parts must be present before the row is used.
        if cur_lang and cur_lbl and (cur_lang in accepted_langs):
            labels[cur_lang] = cur_lbl
    return labels

In [7]:
def fetch_placeholder_str(query, ent_links):
    """Replace every linked mention in ``query`` with its placeholder token.

    Args:
        query: The original sentence.
        ent_links: Iterable of mention dicts with ``start``/``end`` character
            offsets into ``query`` and a ``placeholder`` string. Mentions
            without a ``link`` key or with an empty placeholder are ignored.
            The input is not modified and need not be sorted.

    Returns:
        ``query`` with each usable mention span replaced by its placeholder.
        When mentions overlap, the earliest (and, for equal starts, longest)
        one wins and the others are skipped.
    """
    usable = [m for m in ent_links if 'link' in m and m.get('placeholder')]
    # Process left to right regardless of input order; longest span first on ties.
    usable.sort(key=lambda m: (m['start'], -m['end']))

    pieces = []
    cursor = 0  # first character of `query` not yet copied to the output
    for mention in usable:
        if mention['start'] < cursor:
            # Overlaps a span that was already replaced. Emitting it would move
            # the cursor backwards and duplicate part of the sentence.
            continue
        pieces.append(query[cursor:mention['start']])
        pieces.append(mention['placeholder'])
        cursor = mention['end']
    pieces.append(query[cursor:])
    return ''.join(pieces)

In [8]:
# For each entry in the Mintaka dataset
    # For each entity in the entry
        # Fetch the entity's labels in the other languages
        # For each language, check whether the label occurs in that language's translation of the question
    # For each language whose number of matched entities equals the number of entities in the English question
        # Write the entry to the reference and prediction files

In [9]:
def extract_ent_mentions(entity_info_array, plc):
    """Turn a question's entity annotations into placeholder mentions.

    Args:
        entity_info_array: List of annotation dicts. Only those whose
            ``entityType`` is ``'entity'`` are used; each of these must provide
            ``span`` (start, end), ``mention`` and ``name``.
        plc: Prefix placed before the running number inside each placeholder.

    Returns:
        List of mention dicts (``start``, ``end``, ``surfaceform``, ``link``,
        ``placeholder``) ordered by start offset. Placeholders are numbered
        from 1 in that order, e.g. ``'[001]'`` for ``plc='00'``.
    """
    # Work on a filtered copy so the caller's list keeps its original order.
    entities = [info for info in entity_info_array
                if info.get('entityType') == 'entity']
    entities.sort(key=lambda info: info['span'][0])
    return [
        {'start': info['span'][0], 'end': info['span'][1],
         'surfaceform': info['mention'], 'link': info['name'],
         'placeholder': '[%s%d]' % (plc, number)}
        for number, info in enumerate(entities, start=1)
    ]

In [10]:
# mintaka_train_json = mintaka_train_json[0:3]
# print(len(mintaka_train_json))
# print(mintaka_train_json[2])

In [11]:
# open the output files
output_dir = 'data/'
Path(output_dir).mkdir(parents=True, exist_ok=True)
op_files = {}
for key in accepted_langs:
    en_file = '%s%s_en_placeholder.txt' % (output_dir, key)
    lang_file = '%s%s_placeholder.txt' % (output_dir, key)
    en_file_obj = open(en_file, 'w')
    lang_file_obj = open(lang_file, 'w')
    op_files[key] = (en_file_obj, lang_file_obj)

In [12]:
# Build aligned (English, translation) placeholder sentences for every entry.
for entry in mintaka_train_json:
    eng_text = entry['question']
    ent_mentions = extract_ent_mentions(entry['questionEntity'], '00')
    eng_plc_text = fetch_placeholder_str(eng_text, ent_mentions)
    ent_count = len(ent_mentions)

    # Per language: how many entities were found, and where they occur.
    label_count = dict.fromkeys(accepted_langs, 0)
    lang_ent_mentions = {code: [] for code in accepted_langs}

    for mention in ent_mentions:
        entity_id = mention['link']
        for code, label in find_entity_labels(entity_id).items():
            translated = entry['translations'][code]
            hits = list(re.finditer(re.escape(label), translated))
            if not hits:
                # Label does not occur in this translation.
                continue
            # Every occurrence of the label gets the entity's placeholder ...
            lang_ent_mentions[code].extend(
                {'link': entity_id,
                 'start': hit.start(),
                 'end': hit.end(),
                 'placeholder': mention['placeholder']}
                for hit in hits
            )
            # ... but the entity is counted once per language.
            label_count[code] += 1

    # Keep a translation only when all English entities were located in it.
    for code, found in label_count.items():
        if found != ent_count:
            continue
        # Placeholder substitution expects mentions ordered by start offset.
        ordered = sorted(lang_ent_mentions[code], key=lambda d: d['start'])
        lang_plc_text = fetch_placeholder_str(entry['translations'][code], ordered)
        en_out, lang_out = op_files[code]
        en_out.write(eng_plc_text + '\n')
        lang_out.write(lang_plc_text + '\n')
    pbar.update(1)
print('Processing complete!')

100%|██████████| 14000/14000 [1:03:54<00:00,  4.04it/s]

Processing complete!


In [13]:
##### 

In [14]:
# Close the output files
# Flatten the per-language handle tuples and close each handle in turn.
for file_obj in (handle for handles in op_files.values() for handle in handles):
    file_obj.close()
print('Files closed.')

Files closed.


In [None]:
# Convert the placeholder text files to JSON.
# This part is kept separate from the previous logic so that it can be run on its own, without re-running the cells above.

In [9]:
import json
from pathlib import Path

In [20]:
# Pair up the line-aligned English / translated placeholder files and store each
# language as a JSON list of {'gold': <English line>, 'pred': <translated line>}.
accepted_langs = ['de', 'pt', 'es', 'fr']
input_dir = 'data/'
output_dir = 'data/json/'
Path(output_dir).mkdir(parents=True, exist_ok=True)
for key in accepted_langs:
    en_file = '%s%s_en_placeholder.txt' % (input_dir, key)
    lang_file = '%s%s_placeholder.txt' % (input_dir, key)
    out_file = '%s%s_en_placeholder.json' % (output_dir, key)
    with open(en_file, 'r') as en_file_obj, open(lang_file, 'r') as lang_file_obj:
        # Iterate by file line instead of str.splitlines(): splitlines() also
        # breaks on characters such as U+2028 or form feed, which may occur
        # inside a sentence and would shift one side of the pairing.
        en_texts = [line.rstrip('\n') for line in en_file_obj]
        lang_texts = [line.rstrip('\n') for line in lang_file_obj]
    print('English file(%s) lines: %d\nLang file(%s) lines: %d\n\n' % (en_file, len(en_texts), lang_file, len(lang_texts)))
    # zip() would silently drop the surplus lines and hide a misalignment, so
    # fail loudly before anything is written.
    if len(en_texts) != len(lang_texts):
        raise ValueError('Line count mismatch between %s (%d) and %s (%d)'
                         % (en_file, len(en_texts), lang_file, len(lang_texts)))
    output_json = [{'gold': en_line, 'pred': lang_line}
                   for en_line, lang_line in zip(en_texts, lang_texts)]
    with open(out_file, 'w') as out_file_obj:
        json.dump(output_json, out_file_obj)
print('json files created!')

English file(data/de_en_placeholder.txt) lines: 6222
Lang file(data/de_placeholder.txt) lines: 6222


English file(data/pt_en_placeholder.txt) lines: 6373
Lang file(data/pt_placeholder.txt) lines: 6373


English file(data/es_en_placeholder.txt) lines: 6466
Lang file(data/es_placeholder.txt) lines: 6466


English file(data/fr_en_placeholder.txt) lines: 7171
Lang file(data/fr_placeholder.txt) lines: 7171


json files created!


In [15]:
# Test loading the generated JSON files with the Hugging Face `datasets` library

In [21]:
from datasets import load_dataset

In [22]:
# Load the generated JSON files from data/json/ with the generic "json" loader.
# The recorded run produced a single 'train' split of 26232 rows, which equals
# the sum of the four per-language line counts printed by the conversion cell.
dataset = load_dataset("json", data_dir="data/json/")

Downloading and preparing dataset json/default to /upb/users/n/nikit/profiles/unix/cs/.cache/huggingface/datasets/json/default-1e503d135908a822/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 1676.38it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 386.79it/s]
                                                        

Dataset json downloaded and prepared to /upb/users/n/nikit/profiles/unix/cs/.cache/huggingface/datasets/json/default-1e503d135908a822/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 644.88it/s]


In [23]:
# Show the summary (feature names and row count) of the 'train' split.
dataset['train']

Dataset({
    features: ['gold', 'pred'],
    num_rows: 26232
})

In [24]:
# Sanity check: print only the first record of the 'train' split, then stop.
for data in dataset['train']:
    print(data)
    break

{'gold': 'What is the seventh tallest mountain in [001]?', 'pred': 'Wie heißt der siebthöchste Berg [001]s?'}
