In [1]:
import sys
sys.path.append('../src')

import os
os.environ["LANGUAGE"] = 'ar'

from sqlalchemy.sql.expression import func
from wikidataDB import WikidataEntity, WikidataID, Session
from wikidataRetriever import WikidataKeywordSearch, AstraDBConnect
from SPARQLWrapper import SPARQLWrapper, JSON

import json
import os
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import pickle
from datasets import load_dataset
import re
from requests.exceptions import HTTPError
import time
from tqdm import tqdm

def is_in_wikipedia(qid):
    item = WikidataID.get_id(qid)
    if item is None:
        return False
    return item.in_wikipedia

In [None]:
# Function to process a single file
def process_file(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)

    processed_rows = []
    for _, data_chunks in data.items():
        for data_chunk in data_chunks:
            triple = data_chunk['triple']
            question_in_wikipedia = is_in_wikipedia(triple[0])
            answer_in_wikipedia = is_in_wikipedia(triple[2])
            question = data_chunk['question variants'][0]
            processed_rows.append({
                'Question QID': triple[0],
                'Property PID': triple[1],
                'Answer QID': triple[2],
                'Question in Wikipedia': question_in_wikipedia,
                'Answer in Wikipedia': answer_in_wikipedia,
                'Question': question['out-of-context'],
                'Answer': data_chunk['answer']
            })
    return processed_rows

# Use a list to accumulate the rows
all_data = []

main_dir = "../data/Evaluation Data/KGConv/complete_version"

# Using ThreadPoolExecutor to parallelize file processing
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = []
    for folder in os.listdir(main_dir):
        current_dir = os.path.join(main_dir, folder)
        for file in tqdm(os.listdir(current_dir)):
            file_path = os.path.join(current_dir, file)
            futures.append(executor.submit(process_file, file_path))

    for future in tqdm(futures):
        all_data.extend(future.result())

# Convert to DataFrame all at once at the end
clean_data = pd.DataFrame(all_data)
pickle.dump(clean_data, open("../data/Evaluation Data/KGConv/processed_dataframe.pkl", "wb"))

In [None]:
# Function to process a single file
def process_file(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)

    processed_rows = []
    for data_chunk in data:
        try:
            question_qids = [d['name'] for d in data_chunk['questionEntity'] if d['entityType'] == 'entity']
            question_in_wikipedia = [is_in_wikipedia(id) for id in question_qids]

            answer_type = data_chunk['answer']['answerType']

            answer_qids = []
            answer_in_wikipedia = []
            if (answer_type == 'entity') and (data_chunk['answer']['answer'] is not None):
                answer_qids = [d['name'] for d in data_chunk['answer']['answer']]
                answer_in_wikipedia = [is_in_wikipedia(id) for id in answer_qids]

            processed_rows.append({
                'Question QIDs': question_qids,
                'Answer QIDs': answer_qids,
                'Question in Wikipedia': question_in_wikipedia,
                'Answer in Wikipedia': answer_in_wikipedia,
                'Question': data_chunk['question'],
                'Answer': data_chunk['answer']['mention'],
                'Answer Type': answer_type,
                'Language': 'en'
            })

            for lang in data_chunk['translations'].keys():
                processed_rows.append({
                    'Question QIDs': question_qids,
                    'Answer QIDs': answer_qids,
                    'Question in Wikipedia': question_in_wikipedia,
                    'Answer in Wikipedia': answer_in_wikipedia,
                    'Question': data_chunk['translations'][lang],
                    'Answer': data_chunk['answer']['mention'],
                    'Answer Type': answer_type,
                    'Language': lang
                })

        except Exception as e:
            print(data_chunk)
            raise e
    return processed_rows

# Use a list to accumulate the rows
all_data = []

main_dir = "../data/Evaluation Data/Mintaka"

# Using ThreadPoolExecutor to parallelize file processing
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = []
    for file in tqdm(os.listdir(main_dir)):
        if 'json' in file:
            file_path = os.path.join(main_dir, file)
            futures.append(executor.submit(process_file, file_path))

    for future in tqdm(futures):
        all_data.extend(future.result())

# Convert to DataFrame all at once at the end
clean_data = pd.DataFrame(all_data)
pickle.dump(clean_data, open("../data/Evaluation Data/Mintaka/processed_dataframe_full.pkl", "wb"))

In [None]:
# Function to process a single file
def process_file(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)

    processed_rows = []
    for data_chunk in data:
        try:
            if data_chunk['question_uris'] is not None:
                question_qids = [d.split('/')[-1] for d in data_chunk['question_uris']]
                question_in_wikipedia = [is_in_wikipedia(id) for id in question_qids]

                answers = [d['wd_names']['en'][0] if len(d['wd_names']['en']) > 0 else d['label'] for d in data_chunk['answers']]
                answer_qids = [d['value'].split('/')[-1] for d in data_chunk['answers']]
                answer_in_wikipedia = [is_in_wikipedia(id) for id in answer_qids]

                processed_rows.append({
                    'Question QIDs': question_qids,
                    'Answer QIDs': answer_qids,
                    'Question in Wikipedia': question_in_wikipedia,
                    'Answer in Wikipedia': answer_in_wikipedia,
                    'Question': data_chunk['question_eng'],
                    'Answer': answers,
                })
        except Exception as e:
            print(data_chunk)
            raise e
    return processed_rows

# Use a list to accumulate the rows
all_data = []

main_dir = "../data/Evaluation Data/RuBQ"

# Using ThreadPoolExecutor to parallelize file processing
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = []
    for file in tqdm(os.listdir(main_dir)):
        file_path = os.path.join(main_dir, file)
        futures.append(executor.submit(process_file, file_path))

    for future in tqdm(futures):
        all_data.extend(future.result())

# Convert to DataFrame all at once at the end
clean_data = pd.DataFrame(all_data)
pickle.dump(clean_data, open("../data/Evaluation Data/RuBQ/processed_dataframe.pkl", "wb"))

In [None]:
wikidata_endpoint = "https://query.wikidata.org/sparql"
sparql = SPARQLWrapper(wikidata_endpoint)
sparql.setReturnFormat(JSON)

sparql.setQuery("SELECT DISTINCT ?sbj ?sbj_label WHERE { ?sbj wdt:P31 wd:Q7187 . ?sbj rdfs:label ?sbj_label . FILTER(CONTAINS(lcase(?sbj_label), 'vgf')) . FILTER (lang(?sbj_label) = 'en') } LIMIT 25")
results = sparql.query().convert()

In [None]:
from requests.exceptions import HTTPError
import time

# Function to process a single file
def process_file(file_path):
    wikidata_endpoint = "https://query.wikidata.org/sparql"
    sparql = SPARQLWrapper(wikidata_endpoint)
    sparql.setReturnFormat(JSON)

    with open(file_path, 'r') as f:
        data = json.load(f)

    processed_rows = []
    for data_chunk in data:
        try:
            matches = re.findall(r'wd:Q\d+', data_chunk['sparql_wikidata'])
            question_qids = [match[3:] for match in matches]
            question_in_wikipedia = [is_in_wikipedia(id) for id in question_qids]

            answer_qids = []
            query = data_chunk['sparql_wikidata']
            ran_sparql = True
            if not data_chunk['sparql_wikidata'].lower().strip().startswith('ask'):
                results = []
                retry = 5
                ran_sparql = False
                while retry > 0:  # Retry up to 5 times
                    try:
                        sparql.setQuery(query)
                        results = sparql.query().convert()["results"]["bindings"]
                        retry = 0  # Exit loop if successful
                        ran_sparql = True
                    except HTTPError as e:
                        print(e)
                        retry -= 1
                        time.sleep(1)
                    except Exception as e:
                        print(e)
                        retry -= 1
                        query = re.sub(r'LIMIT \d+', 'LIMIT 5', query, flags=re.IGNORECASE)

                for result in results:
                    for key in result:
                        value = result[key]["value"]
                        if 'www.wikidata.org' in result[key]["value"]:
                            answer_qids.append(value.split('/')[-1])
                answer_in_wikipedia = [is_in_wikipedia(id) for id in answer_qids]

            processed_rows.append({
                'Question QIDs': question_qids,
                'Answer QIDs': answer_qids,
                'Question in Wikipedia': question_in_wikipedia,
                'Answer in Wikipedia': answer_in_wikipedia,
                'Question': data_chunk['question'],
                'SPARQL': data_chunk['sparql_wikidata'],
                'Ran SPARQL': ran_sparql
            })
        except Exception as e:
            print(e)
            print(data_chunk)
            raise e
    return processed_rows

# Use a list to accumulate the rows
all_data = []

main_dir = "../data/Evaluation Data/LC_QuAD"

# Using ThreadPoolExecutor to parallelize file processing
with ThreadPoolExecutor(max_workers=1) as executor:
    futures = []
    for file in tqdm(os.listdir(main_dir)):
        file_path = os.path.join(main_dir, file)
        futures.append(executor.submit(process_file, file_path))

    for future in tqdm(futures):
        all_data.extend(future.result())

# Convert to DataFrame all at once at the end
clean_data = pd.DataFrame(all_data)
pickle.dump(clean_data, open("../data/Evaluation Data/LC_QuAD/processed_dataframe.pkl", "wb"))

In [None]:
clean_data = pickle.load(open("../data/Evaluation Data/LC_QuAD/processed_dataframe.pkl", "rb"))

for i, row in tqdm(clean_data.iterrows()):
    answer_qids = []
    query = row['SPARQL']
    if (not query.lower().strip().startswith('ask')) and (not row['Ran SPARQL']):
        results = []
        retry = 5
        while retry > 0:  # Retry up to 5 times
            try:
                sparql.setQuery(query)
                results = sparql.query().convert()["results"]["bindings"]
                retry = 0  # Exit loop if successful
                clean_data.at[row.name, 'Ran SPARQL'] = True
            except HTTPError as e:
                print(e)
                retry -= 1
                time.sleep(1)
            except Exception as e:
                print(e)
                retry -= 1
                query = re.sub(r'LIMIT \d+', 'LIMIT 5', query, flags=re.IGNORECASE)

        for result in results:
            for key in result:
                value = result[key]["value"]
                if 'www.wikidata.org' in result[key]["value"]:
                    answer_qids.append(value.split('/')[-1])
        answer_in_wikipedia = [is_in_wikipedia(id) for id in answer_qids]

        clean_data.at[row.name, 'Answer QIDs'] = answer_qids
        clean_data.at[row.name, 'Answer in Wikipedia'] = answer_in_wikipedia

        if i%100 == 0:
            pickle.dump(clean_data, open("../data/Evaluation Data/LC_QuAD/processed_dataframe.pkl", "wb"))

In [None]:
# Function to process a single file
def process_file(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)

    processed_rows = []
    for data_chunk in data:
        correct_in_wikipedia = is_in_wikipedia(data_chunk['correct_id'])
        wrong_in_wikipedia = is_in_wikipedia(data_chunk['wrong_id'])
        processed_rows.append({
            'Sentence': data_chunk['text'],
            'Entity Name': data_chunk['string'],
            'Correct QID': data_chunk['correct_id'],
            'Wrong QID': data_chunk['wrong_id'],
            'Correct in Wikipedia': correct_in_wikipedia,
            'Wrong in Wikipedia': wrong_in_wikipedia,
        })
    return processed_rows

# Define main directory and path to save results
main_dir = "../data/Evaluation Data/Wikidata-Disamb"

# Use a list to accumulate the rows
all_data = []

# Iterate over each file, process it, and save in chunks
for file in tqdm(os.listdir(main_dir)):
    file_path = os.path.join(main_dir, file)
    processed_data = process_file(file_path)
    all_data.extend(processed_data)

# Convert to DataFrame all at once at the end
clean_data = pd.DataFrame(all_data)
pickle.dump(clean_data, open("../data/Evaluation Data/Wikidata-Disamb/processed_dataframe.pkl", "wb"))

In [None]:
# Function to process a single file
def process_file(data, lang='en'):
    processed_rows = []
    for data_chunk in tqdm(data):
        qid = data_chunk['uri']
        qid_in_wikipedia = is_in_wikipedia(qid)
        boundaries = [(e['start'], e['end']) for e in data_chunk['entities'] if e['uri'] == data_chunk['uri']]
        processed_rows.append({
            'Sentence': data_chunk['text'],
            'Entity Name': data_chunk['title'],
            'Entity Span': boundaries,
            'Correct QID': qid,
            'Correct in Wikipedia': qid_in_wikipedia,
            'Language': data_chunk['lan']
        })
    return processed_rows

# Use a list to accumulate the rows
all_data = []

huggingface_ds = load_dataset("Babelscape/REDFM", "all_languages", streaming=True, trust_remote_code=True)

for split in huggingface_ds:
    all_data.extend(process_file(huggingface_ds[split]))

# Convert to DataFrame all at once at the end
clean_data = pd.DataFrame(all_data)
pickle.dump(clean_data, open("../data/Evaluation Data/REDFM/processed_dataframe.pkl", "wb"))

In [None]:
data = pickle.load(open("../data/Evaluation Data/REDFM/processed_dataframe.pkl", "rb"))
unique_ids = data['Correct QID'].unique()

data = pickle.load(open("../data/Evaluation Data/Mintaka/processed_dataframe.pkl", "rb"))
unique_ids = np.concatenate([unique_ids, pd.Series(np.concatenate(data['Question QIDs'].to_numpy())).unique()])
unique_ids = np.concatenate([unique_ids, pd.Series(np.concatenate(data['Answer QIDs'].to_numpy())).unique()])

data = pickle.load(open("../data/Evaluation Data/KGConv/processed_dataframe.pkl", "rb"))
unique_ids = np.concatenate([unique_ids, data['Question QID'].unique()])
unique_ids = np.concatenate([unique_ids, data['Answer QID'].unique()])

data = pickle.load(open("../data/Evaluation Data/Wikidata-Disamb/processed_dataframe.pkl", "rb"))
unique_ids = np.concatenate([unique_ids, data['Correct QID'].unique()])
unique_ids = np.concatenate([unique_ids, data['Wrong QID'].unique()])

unique_ids = pd.Series(unique_ids).unique()
unique_ids = pd.DataFrame({'QID': unique_ids})

In [None]:
tqdm.pandas()

unique_ids['In Wikipedia'] = unique_ids['QID'].progress_apply(lambda x: (WikidataEntity.get_entity(x) is not None) and (WikidataID.get_id(x).in_wikipedia))

In [None]:
test_data  = pd.DataFrame(columns=['Query', 'Correct', 'Source'])

data = pickle.load(open("../data/Evaluation Data/REDFM/processed_dataframe.pkl", "rb"))
prep = data[data['Language'] == 'en'][data['Correct in Wikipedia']]
prep['Correct QID'] = prep['Correct QID'].apply(lambda x: [x])
prep = prep.rename({'Sentence': 'Query', 'Correct QID': 'Correct'}, axis=1)
prep['Source'] = 'REDFM'

test_data = pd.concat([test_data, prep[['Query', 'Correct', 'Source']]])

def mintaka_filter(x):
    in_wiki = x['Question in Wikipedia'] + x['Answer in Wikipedia']
    if len(in_wiki) == 0:
        return False
    return all(in_wiki)

data = pickle.load(open("../data/Evaluation Data/Mintaka/processed_dataframe.pkl", "rb"))
prep = data[data.apply(lambda x: all(x['Question in Wikipedia'] + x['Answer in Wikipedia']), axis=1)]
prep['Correct'] = prep.apply(lambda x: x['Question QIDs'] + x['Answer QIDs'], axis=1)
prep = prep.rename({'Question': 'Query'}, axis=1)
prep['Source'] = 'Mintaka'

test_data = pd.concat([test_data, prep[['Query', 'Correct', 'Source']]])

data = pickle.load(open("../data/Evaluation Data/KGConv/processed_dataframe.pkl", "rb"))
prep = data[data.apply(lambda x: x['Question in Wikipedia'] and x['Answer in Wikipedia'], axis=1)]
prep['Correct'] = prep.apply(lambda x: [x['Question QID'], x['Answer QID']], axis=1)
prep = prep.rename({'Question': 'Query'}, axis=1)
prep['Source'] = 'KGConv'

test_data = pd.concat([test_data, prep[['Query', 'Correct', 'Source']]])
test_datacombined_df

In [None]:
def remove_spans(sentence, spans, replace_with='Entity'):
    # Sort spans in ascending order to remove from left to right
    spans = sorted(spans, key=lambda x: x[0])
    offset = 0  # To track the shift in index after replacing each span

    for start, end in spans:
        sentence = sentence[:start - offset] + replace_with + sentence[end - offset:]
        offset += (end - start) - len(replace_with)  # Update offset to account for the replaced span length

    return sentence

data['Sentence no entity'] = data.apply(lambda x: remove_spans(x['Sentence'], x['Entity Span']), axis=1)

In [2]:
import pickle
import pandas as pd
from tqdm import tqdm
from deep_translator import GoogleTranslator

# prep = pickle.load(open(f"../data/Evaluation Data/LC_QuAD/processed_dataframe.pkl", "rb"))
# prep['Language'] = 'en'

target = 'de'
translator = GoogleTranslator(source='en', target=target)

batch_size = 20
# translated_rows = []

questions = prep[prep['Language'] == 'en']['Question'].tolist()

# Process in batches
for i in tqdm(range(len(translated_rows), len(questions), batch_size)):
    batch = questions[i:i + batch_size]
    translated_batch = translator.translate_batch(batch)

    for idx, translated_question in enumerate(translated_batch):
        original_row = prep.iloc[i + idx].copy()
        original_row['Question'] = translated_question
        original_row['Language'] = target
        translated_rows.append(original_row)

translated_df = pd.DataFrame(translated_rows)
combined_df = pd.concat([prep, translated_df], ignore_index=True)
combined_df

100%|██████████| 1351/1351 [2:42:37<00:00,  7.22s/it] 


Unnamed: 0,Question QIDs,Answer QIDs,Question in Wikipedia,Answer in Wikipedia,Question,SPARQL,Language
0,"[Q127998, Q6256]",[Q219060],"[True, True]",[True],Who is the {country} for {head of state} of {...,select distinct ?sbj where { ?sbj wdt:P35 wd:...,en
1,[Q1045],[],[True],[],What was the population of Somalia in 2009-0-0?,SELECT ?obj WHERE { wd:Q1045 p:P1082 ?s . ?s p...,en
2,"[Q124057, Q3915489]",[Q4790397],"[True, True]",[True],What is {nominated for} of {Dolores del Río} t...,SELECT ?obj WHERE { wd:Q124057 p:P1411 ?s . ?s...,en
3,[Q42168],[],[True],[],What was the population of Clermont-Ferrand on...,SELECT ?obj WHERE { wd:Q42168 p:P1082 ?s . ?s ...,en
4,[Q3272],[Q3292],[True],[True],On Lake Winnipeg what is the lakes on river?,select distinct ?answer where { ?answer wdt:P4...,en
...,...,...,...,...,...,...,...
81034,"[Q18123741, Q9368]","[Q6853, Q154869, Q157661]","[True, True]","[True, True, True]","Nennen Sie eine Infektionskrankheit, die die L...",SELECT DISTINCT ?sbj ?sbj_label WHERE { ?sbj w...,de
81035,[Q14982],[],[True],"[True, True, True]","Der Siedepunkt des Methanols beträgt 117,6",ASK WHERE { wd:Q14982 wdt:P2102 ?obj filter(?o...,de
81036,[Q295393],[],[True],[],Wie lautet der Professorenausweis (1909-1939) ...,select distinct ?answer where { wd:Q295393 wdt...,de
81037,"[Q42807, Q7118067]",[Q43301],"[True, True]",[True],"Welche Art von Menschen leben in Fresno, der P...",SELECT ?answer WHERE { wd:Q42807 wdt:P190 ?ans...,de
