# Download and prepare the evaluation datasets

In [None]:
import sys
sys.path.append('../src')

import os
os.environ["LANGUAGE"] = 'ar'

from sqlalchemy.sql.expression import func
from wikidataDB import WikidataEntity, WikidataID, Session
from SPARQLWrapper import SPARQLWrapper, JSON

import json
import os
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import pickle
from datasets import load_dataset
import re
from requests.exceptions import HTTPError
import time
import numpy as np
from deep_translator import GoogleTranslator

def is_in_wikipedia(qid):
    """Report the ``in_wikipedia`` flag stored for a Wikidata ID.

    Args:
        qid: Identifier passed to ``WikidataID.get_id``.

    Returns:
        ``False`` when the lookup yields ``None``; otherwise the
        ``in_wikipedia`` attribute of the returned record.
    """
    record = WikidataID.get_id(qid)
    return False if record is None else record.in_wikipedia

#### Prepare the KGConv dataset

In [None]:
def process_file(file_path):
    """Flatten one KGConv JSON file into a list of row dicts.

    The file is read as a mapping whose values are lists of chunks. Every
    chunk contributes one row built from its ``triple`` (question entity,
    property, answer entity), the out-of-context form of its first question
    variant, and its answer.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        List of dicts, one per chunk, ready to be passed to ``pd.DataFrame``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed_rows = []
    for data_chunks in data.values():
        for data_chunk in data_chunks:
            triple = data_chunk['triple']
            question_in_wikipedia = is_in_wikipedia(triple[0])
            answer_in_wikipedia = is_in_wikipedia(triple[2])
            # Only the first question variant is kept.
            question = data_chunk['question variants'][0]
            processed_rows.append({
                'Question QID': triple[0],
                'Property PID': triple[1],
                'Answer QID': triple[2],
                'Question in Wikipedia': question_in_wikipedia,
                'Answer in Wikipedia': answer_in_wikipedia,
                'Question': question['out-of-context'],
                'Answer': data_chunk['answer']
            })
    return processed_rows

# Process every file of every sub-folder of the KGConv dump on a thread pool,
# then store the merged rows as a pickled DataFrame.
all_data = []

main_dir = "../data/Evaluation Data/KGConv/complete_version"

with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = []
    for folder in os.listdir(main_dir):
        current_dir = os.path.join(main_dir, folder)
        for file in tqdm(os.listdir(current_dir)):
            file_path = os.path.join(current_dir, file)
            futures.append(executor.submit(process_file, file_path))

    # Collect in submission order; result() re-raises any worker exception.
    for future in tqdm(futures):
        all_data.extend(future.result())

clean_data = pd.DataFrame(all_data)
# Context manager so the output file is flushed and closed deterministically.
with open("../data/Evaluation Data/KGConv/processed_dataframe.pkl", "wb") as f:
    pickle.dump(clean_data, f)

#### Prepare the Mintaka dataset

In [None]:
def process_file(file_path):
    """Flatten one Mintaka JSON file into a list of row dicts.

    Each chunk yields one row for the English question plus one row per
    entry of its ``translations`` mapping. All rows of a chunk share the same
    QID lists, Wikipedia flags, answer mention and answer type.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        List of dicts ready to be passed to ``pd.DataFrame``.

    Raises:
        Exception: Any error raised while processing a chunk is re-raised
            after the offending chunk has been printed.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed_rows = []
    for data_chunk in data:
        try:
            # Only question entities typed as 'entity' carry a QID in 'name'.
            question_qids = [d['name'] for d in data_chunk['questionEntity'] if d['entityType'] == 'entity']
            question_in_wikipedia = [is_in_wikipedia(qid) for qid in question_qids]

            answer_type = data_chunk['answer']['answerType']

            answer_qids = []
            answer_in_wikipedia = []
            if (answer_type == 'entity') and (data_chunk['answer']['answer'] is not None):
                answer_qids = [d['name'] for d in data_chunk['answer']['answer']]
                answer_in_wikipedia = [is_in_wikipedia(qid) for qid in answer_qids]

            # English first, then every translation, all built the same way.
            localized_questions = [('en', data_chunk['question'])]
            localized_questions.extend(data_chunk['translations'].items())

            for lang, question in localized_questions:
                processed_rows.append({
                    'Question QIDs': question_qids,
                    'Answer QIDs': answer_qids,
                    'Question in Wikipedia': question_in_wikipedia,
                    'Answer in Wikipedia': answer_in_wikipedia,
                    'Question': question,
                    'Answer': data_chunk['answer']['mention'],
                    'Answer Type': answer_type,
                    'Language': lang
                })

        except Exception:
            # Show which chunk failed, then propagate the original error.
            print(data_chunk)
            raise
    return processed_rows

# Process every Mintaka file whose name contains 'json' on a thread pool,
# then store the merged rows as a pickled DataFrame.
all_data = []

main_dir = "../data/Evaluation Data/Mintaka"

with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = []
    for file in tqdm(os.listdir(main_dir)):
        if 'json' in file:
            file_path = os.path.join(main_dir, file)
            futures.append(executor.submit(process_file, file_path))

    # Collect in submission order; result() re-raises any worker exception.
    for future in tqdm(futures):
        all_data.extend(future.result())

clean_data = pd.DataFrame(all_data)
# Context manager so the output file is flushed and closed deterministically.
with open("../data/Evaluation Data/Mintaka/processed_dataframe_full.pkl", "wb") as f:
    pickle.dump(clean_data, f)

#### Prepare the RuBQ dataset

In [None]:
def process_file(file_path):
    """Flatten one RuBQ JSON file into a list of row dicts.

    Chunks whose ``question_uris`` is ``None`` are skipped. For the others,
    QIDs are taken as the last path segment of the question URIs and of each
    answer's ``value``. An answer's text is its first English ``wd_names``
    entry when there is one, otherwise its ``label``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        List of dicts ready to be passed to ``pd.DataFrame``.

    Raises:
        Exception: Any error raised while processing a chunk is re-raised
            after the offending chunk has been printed.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed_rows = []
    for data_chunk in data:
        try:
            if data_chunk['question_uris'] is None:
                continue

            question_qids = [uri.split('/')[-1] for uri in data_chunk['question_uris']]
            question_in_wikipedia = [is_in_wikipedia(qid) for qid in question_qids]

            answers = [d['wd_names']['en'][0] if len(d['wd_names']['en']) > 0 else d['label'] for d in data_chunk['answers']]
            answer_qids = [d['value'].split('/')[-1] for d in data_chunk['answers']]
            answer_in_wikipedia = [is_in_wikipedia(qid) for qid in answer_qids]

            processed_rows.append({
                'Question QIDs': question_qids,
                'Answer QIDs': answer_qids,
                'Question in Wikipedia': question_in_wikipedia,
                'Answer in Wikipedia': answer_in_wikipedia,
                'Question': data_chunk['question_eng'],
                'Answer': answers,
            })
        except Exception:
            # Show which chunk failed, then propagate the original error.
            print(data_chunk)
            raise
    return processed_rows

# Process every file of the RuBQ folder on a thread pool, then store the
# merged rows as a pickled DataFrame.
all_data = []

main_dir = "../data/Evaluation Data/RuBQ"

with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = []
    for file in tqdm(os.listdir(main_dir)):
        file_path = os.path.join(main_dir, file)
        futures.append(executor.submit(process_file, file_path))

    # Collect in submission order; result() re-raises any worker exception.
    for future in tqdm(futures):
        all_data.extend(future.result())

clean_data = pd.DataFrame(all_data)
# Context manager so the output file is flushed and closed deterministically.
with open("../data/Evaluation Data/RuBQ/processed_dataframe.pkl", "wb") as f:
    pickle.dump(clean_data, f)

#### Prepare the LC_QuAD dataset

In [None]:
def run_sparql(query):
    """Run a SPARQL query on the Wikidata endpoint and collect returned QIDs.

    Queries that start with ``ask`` (case-insensitive, surrounding whitespace
    ignored) are not sent to the endpoint: they yield ``([], True)``. Any
    other query is attempted up to 5 times.

    Args:
        query: SPARQL query string.

    Returns:
        Tuple ``(answer_qids, ran_sparql)``. ``answer_qids`` holds the last
        path segment of every bound value containing ``www.wikidata.org``.
        ``ran_sparql`` is ``False`` only when all attempts failed.
    """
    # SPARQLWrapper sends its requests through urllib, so HTTP failures it
    # does not translate into its own exceptions surface as
    # urllib.error.HTTPError, not as the requests HTTPError imported above.
    import urllib.error

    wikidata_endpoint = "https://query.wikidata.org/sparql"
    sparql = SPARQLWrapper(wikidata_endpoint)
    sparql.setReturnFormat(JSON)

    if query.lower().strip().startswith('ask'):
        return [], True

    results = []
    ran_sparql = False
    for _ in range(5):  # Retry up to 5 times
        try:
            sparql.setQuery(query)
            results = sparql.query().convert()["results"]["bindings"]
        except (HTTPError, urllib.error.HTTPError) as e:
            # Transport-level HTTP error: back off briefly, retry unchanged.
            print(e)
            time.sleep(1)
        except Exception as e:
            print(e)
            # Tighten the limit if the query returns too many answers.
            # NOTE(review): this only rewrites an existing LIMIT clause; a
            # query without one is retried unchanged.
            query = re.sub(r'LIMIT \d+', 'LIMIT 5', query, flags=re.IGNORECASE)
        else:
            ran_sparql = True
            break

    answer_qids = []
    for result in results:
        for binding in result.values():
            value = binding["value"]
            if 'www.wikidata.org' in value:
                answer_qids.append(value.split('/')[-1])

    return answer_qids, ran_sparql

def process_file(file_path):
    """Flatten one LC_QuAD JSON file into a list of row dicts.

    Question QIDs are every ``wd:Q<digits>`` token found in the chunk's
    ``sparql_wikidata`` query. Answer QIDs come from running that query
    through ``run_sparql``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        List of dicts ready to be passed to ``pd.DataFrame``.

    Raises:
        Exception: Any error raised while processing a chunk is re-raised
            after the error and the offending chunk have been printed.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed_rows = []
    for data_chunk in data:
        try:
            matches = re.findall(r'wd:Q\d+', data_chunk['sparql_wikidata'])
            # Drop the 3-character 'wd:' prefix to keep the bare QID.
            question_qids = [match[3:] for match in matches]
            question_in_wikipedia = [is_in_wikipedia(qid) for qid in question_qids]

            answer_qids, ran_sparql = run_sparql(data_chunk['sparql_wikidata'])
            answer_in_wikipedia = [is_in_wikipedia(qid) for qid in answer_qids]

            processed_rows.append({
                'Question QIDs': question_qids,
                'Answer QIDs': answer_qids,
                'Question in Wikipedia': question_in_wikipedia,
                'Answer in Wikipedia': answer_in_wikipedia,
                'Question': data_chunk['question'],
                'SPARQL': data_chunk['sparql_wikidata'],
                'Ran SPARQL': ran_sparql
            })
        except Exception as e:
            # Show what failed and on which chunk, then propagate.
            print(e)
            print(data_chunk)
            raise
    return processed_rows

# Process every file of the LC_QuAD folder, then store the merged rows as a
# pickled DataFrame.
all_data = []

main_dir = "../data/Evaluation Data/LC_QuAD"

# A single worker is used here; presumably to avoid hammering the public
# SPARQL endpoint that process_file queries (TODO confirm).
with ThreadPoolExecutor(max_workers=1) as executor:
    futures = []
    for file in tqdm(os.listdir(main_dir)):
        file_path = os.path.join(main_dir, file)
        futures.append(executor.submit(process_file, file_path))

    # Collect in submission order; result() re-raises any worker exception.
    for future in tqdm(futures):
        all_data.extend(future.result())

clean_data = pd.DataFrame(all_data)
# Context manager so the output file is flushed and closed deterministically.
with open("../data/Evaluation Data/LC_QuAD/processed_dataframe.pkl", "wb") as f:
    pickle.dump(clean_data, f)

#### Prepare the Wikidata-Disamb dataset

In [None]:
def process_file(file_path):
    """Flatten one Wikidata-Disamb JSON file into a list of row dicts.

    Each chunk becomes one row holding the sentence, the entity string, the
    correct and wrong candidate QIDs, and the Wikipedia flag of each
    candidate.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        List of dicts ready to be passed to ``pd.DataFrame``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed_rows = []
    for data_chunk in data:
        correct_qid = data_chunk['correct_id']
        wrong_qid = data_chunk['wrong_id']
        correct_in_wikipedia = is_in_wikipedia(correct_qid)
        wrong_in_wikipedia = is_in_wikipedia(wrong_qid)
        processed_rows.append({
            'Sentence': data_chunk['text'],
            'Entity Name': data_chunk['string'],
            'Correct QID': correct_qid,
            'Wrong QID': wrong_qid,
            'Correct in Wikipedia': correct_in_wikipedia,
            'Wrong in Wikipedia': wrong_in_wikipedia,
        })
    return processed_rows

# Process every file of the Wikidata-Disamb folder sequentially, then store
# the merged rows as a pickled DataFrame.
main_dir = "../data/Evaluation Data/Wikidata-Disamb"

all_data = []

for file in tqdm(os.listdir(main_dir)):
    file_path = os.path.join(main_dir, file)
    processed_data = process_file(file_path)
    all_data.extend(processed_data)

clean_data = pd.DataFrame(all_data)
# Context manager so the output file is flushed and closed deterministically.
with open("../data/Evaluation Data/Wikidata-Disamb/processed_dataframe.pkl", "wb") as f:
    pickle.dump(clean_data, f)

#### Prepare the REDFM dataset

In [None]:
def process_file(data):
    """Turn an iterable of REDFM records into a list of row dicts.

    For each record, the row stores its text, title, language and ``uri``
    (used as the QID), the Wikipedia flag of that QID, and the
    ``(start, end)`` pairs of every entity whose ``uri`` equals the record's.

    Args:
        data: Iterable of record mappings; iterated under a tqdm progress bar.

    Returns:
        List of dicts ready to be passed to ``pd.DataFrame``.
    """
    def _to_row(record):
        # Build the output row for a single record.
        target_uri = record['uri']
        in_wikipedia = is_in_wikipedia(target_uri)
        spans = [
            (entity['start'], entity['end'])
            for entity in record['entities']
            if entity['uri'] == record['uri']
        ]
        return {
            'Sentence': record['text'],
            'Entity Name': record['title'],
            'Entity Span': spans,
            'Correct QID': target_uri,
            'Correct in Wikipedia': in_wikipedia,
            'Language': record['lan']
        }

    return [_to_row(record) for record in tqdm(data)]

# Stream every split of the REDFM dataset from the Hugging Face hub and
# gather the processed rows of all splits into a single DataFrame.
huggingface_ds = load_dataset(
    "Babelscape/REDFM",
    "all_languages",
    streaming=True,
    trust_remote_code=True,
)

all_data = []
for split, split_records in huggingface_ds.items():
    all_data.extend(process_file(split_records))

clean_data = pd.DataFrame(all_data)

def remove_spans(sentence, spans, replace_with='Entity'):
    """Replace character spans of a sentence with a placeholder.

    Spans may be given in any order. Spans that overlap (including exact
    duplicates) are merged first and replaced by a single placeholder;
    spans that merely touch are replaced separately.

    Args:
        sentence: Text to mask.
        spans: Iterable of ``(start, end)`` index pairs into ``sentence``,
            end exclusive.
        replace_with: Placeholder inserted in place of each span.

    Returns:
        The sentence with every (merged) span replaced by ``replace_with``.
    """
    # Merge overlapping spans so each region is replaced exactly once.
    merged = []
    for start, end in sorted(spans, key=lambda span: (span[0], span[1])):
        if merged and start < merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    # Stitch the untouched text between spans together with the placeholder;
    # working from original indices avoids any offset bookkeeping.
    pieces = []
    cursor = 0
    for start, end in merged:
        pieces.append(sentence[cursor:start])
        pieces.append(replace_with)
        cursor = end
    pieces.append(sentence[cursor:])

    return ''.join(pieces)

# Add a copy of each sentence in which the target entity's spans are masked.
clean_data['Sentence no entity'] = clean_data.apply(lambda x: remove_spans(x['Sentence'], x['Entity Span']), axis=1)

# Context manager so the output file is flushed and closed deterministically.
with open("../data/Evaluation Data/REDFM/processed_dataframe.pkl", "wb") as f:
    pickle.dump(clean_data, f)

#### Extract all QIDs found in the datasets

In [None]:
tqdm.pandas()


def _load_frame(path):
    """Unpickle one processed dataframe, closing the file afterwards.

    NOTE: pickle must only be used on trusted files, such as the ones written
    by the earlier cells of this notebook.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


# REDFM stores a single QID per row.
data = _load_frame("../data/Evaluation Data/REDFM/processed_dataframe.pkl")
unique_ids = data['Correct QID'].unique()

# These datasets store a list of QIDs per row: flatten, then deduplicate.
for dataset_name in ("Mintaka", "LC_QuAD", "RuBQ"):
    data = _load_frame(f"../data/Evaluation Data/{dataset_name}/processed_dataframe.pkl")
    for column in ('Question QIDs', 'Answer QIDs'):
        flattened = np.concatenate(data[column].to_numpy())
        unique_ids = np.concatenate([unique_ids, pd.Series(flattened).unique()])

data = _load_frame("../data/Evaluation Data/KGConv/processed_dataframe.pkl")
unique_ids = np.concatenate([unique_ids, data['Question QID'].unique()])
unique_ids = np.concatenate([unique_ids, data['Answer QID'].unique()])

data = _load_frame("../data/Evaluation Data/Wikidata-Disamb/processed_dataframe.pkl")
unique_ids = np.concatenate([unique_ids, data['Correct QID'].unique()])
unique_ids = np.concatenate([unique_ids, data['Wrong QID'].unique()])

# Deduplicate across datasets.
unique_ids = pd.Series(unique_ids).unique()
unique_ids = pd.DataFrame({'QID': unique_ids})

# is_in_wikipedia tolerates QIDs with no WikidataID record (returns False)
# instead of failing on a None lookup result.
unique_ids['In Wikipedia'] = unique_ids['QID'].progress_apply(lambda x: (WikidataEntity.get_entity(x) is not None) and is_in_wikipedia(x))

#### Create a sample QIDs dataset

In [None]:
# Start from the evaluation QIDs that have a Wikipedia page. Copy so the
# column assignment below does not write into a slice of unique_ids.
sample_ids = unique_ids[unique_ids['In Wikipedia']].copy()
if 'from Evaluation' not in sample_ids.columns:
    # Every QID gathered so far was extracted from the evaluation datasets.
    sample_ids['from Evaluation'] = True

# Number of extra QIDs needed so that non-evaluation rows number twice the
# evaluation rows.
sample_count = sample_ids['from Evaluation'].sum()*2 - (~sample_ids['from Evaluation']).sum()

# Set membership and a row buffer avoid rescanning and re-copying the
# DataFrame for every candidate entity.
known_qids = set(sample_ids['QID'])
new_rows = []

with tqdm(total=sample_count) as progressbar:
    with Session() as session:
        entities = (
            session.query(WikidataID)
            .filter(WikidataID.in_wikipedia == True)
            .order_by(func.random())  # Adds random ordering
            .yield_per(1000)
        )

        for entity in tqdm(entities):
            # Checked first so nothing is added when no sample is needed.
            if len(new_rows) >= sample_count:
                break
            if entity.id not in known_qids:
                known_qids.add(entity.id)
                new_rows.append({
                    'QID': entity.id,
                    'from Evaluation': False,
                    'In Wikipedia': True,
                })
                progressbar.update(1)

if new_rows:
    sample_ids = pd.concat([sample_ids, pd.DataFrame(new_rows)], ignore_index=True)

#### Translate questions with Google Translate

In [None]:
# Load the LC_QuAD rows; they are all tagged as English.
# NOTE: pickle must only be used on trusted files, such as the ones written
# by the earlier cells of this notebook.
with open("../data/Evaluation Data/LC_QuAD/processed_dataframe.pkl", "rb") as f:
    data = pickle.load(f)
data['Language'] = 'en'

target = 'de'
translator = GoogleTranslator(source='en', target=target)

batch_size = 20
translated_rows = []

# Keep the filtered frame so positions in `questions` and the rows copied
# below always refer to the same records.
english_rows = data[data['Language'] == 'en']
questions = english_rows['Question'].tolist()

# Process in batches, starting after the rows already translated
for i in tqdm(range(len(translated_rows), len(questions), batch_size)):
    batch = questions[i:i + batch_size]
    translated_batch = translator.translate_batch(batch)

    for idx, translated_question in enumerate(translated_batch):
        # Copy the source row, swapping in the translated question and language.
        original_row = english_rows.iloc[i + idx].copy()
        original_row['Question'] = translated_question
        original_row['Language'] = target
        translated_rows.append(original_row)

translated_df = pd.DataFrame(translated_rows)
combined_df = pd.concat([data, translated_df], ignore_index=True)
combined_df