In [1]:
import sys
sys.path.append('../src')

from sqlalchemy.sql.expression import func
from wikidataDB import WikidataEntity, WikidataID, Session
from wikidataRetriever import WikidataKeywordSearch, AstraDBConnect
from SPARQLWrapper import SPARQLWrapper, JSON

import json
import os
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import pickle
from datasets import load_dataset
import re
from requests.exceptions import HTTPError
import time
from tqdm import tqdm

def is_in_wikipedia(qid):
    """Report whether the ID record for ``qid`` is flagged as being in Wikipedia.

    The QID is looked up through ``WikidataID.get_id``. When no record is
    found (``None``) the answer is ``False``; otherwise the record's
    ``in_wikipedia`` attribute is returned unchanged.
    """
    record = WikidataID.get_id(qid)
    return False if record is None else record.in_wikipedia

In [137]:
# Read the Astra DB token file and connect to the 'qids_nvidia' collection.
# The file is opened in a `with` block so the handle is closed right after parsing.
with open("../API_tokens/datastax_wikidata_nvidia.json") as token_file:
    datastax_token = json.load(token_file)
graph_store = AstraDBConnect(datastax_token, 'qids_nvidia', model='nvidia', batch_size=10)

In [141]:
# Probe how far the vector store has been filled: starting at offset 2,000,000,
# look at every BATCH_SIZE-th in-Wikipedia entity and stop at the first one whose
# '<id>_1' document is not in the store. The progress bar then shows that offset.
# NOTE(review): 9203786 is presumably the number of in-Wikipedia entities - confirm.
# NOTE(review): the query has no ORDER BY, so OFFSET is only meaningful if the
# database returns rows in a stable order - confirm against the insertion order.
BATCH_SIZE = 1000
with tqdm(total=9203786) as progressbar:
    with Session() as session:
        # Build the filtered query once; only the offset changes per probe.
        # `== True` is intentional: it builds a SQL expression, not a Python test.
        in_wikipedia_entities = (
            session.query(WikidataEntity)
            .join(WikidataID, WikidataEntity.id == WikidataID.id)
            .filter(WikidataID.in_wikipedia == True)
        )
        for i in range(2000000, 9203786, BATCH_SIZE):
            entity = in_wikipedia_entities.offset(i).first()
            progressbar.n = i
            progressbar.refresh()
            # The offset ran past the last row: nothing left to probe.
            if entity is None:
                break
            if graph_store.graph_store.get_by_document_id(f'{entity.id}_1') is None:
                break

 34%|███▍      | 3142000/9203786 [24:22<47:01, 2148.43it/s]  


In [None]:
# Turn one KGConv JSON file into a list of flat row dicts
def process_file(file_path):
    """Flatten a KGConv JSON file into one row per question/answer chunk.

    The file is expected to hold a mapping whose values are lists of chunks.
    Each chunk contributes the three elements of its ``triple``, the
    Wikipedia flags of the first and third element (via ``is_in_wikipedia``),
    the ``out-of-context`` text of its first question variant, and its answer.

    Returns:
        list[dict]: the rows, in file order.
    """
    with open(file_path, 'r') as handle:
        conversations = json.load(handle)

    def build_row(chunk):
        triple = chunk['triple']
        subject_flag = is_in_wikipedia(triple[0])
        object_flag = is_in_wikipedia(triple[2])
        first_variant = chunk['question variants'][0]
        return {
            'Question QID': triple[0],
            'Property PID': triple[1],
            'Answer QID': triple[2],
            'Question in Wikipedia': subject_flag,
            'Answer in Wikipedia': object_flag,
            'Question': first_variant['out-of-context'],
            'Answer': chunk['answer']
        }

    return [
        build_row(chunk)
        for chunks in conversations.values()
        for chunk in chunks
    ]

# Use a list to accumulate the rows
all_data = []

main_dir = "../data/Evaluation Data/KGConv/complete_version"

# Submit every file of every sub-folder to a thread pool, then collect the rows
# in submission order.
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = []
    for folder in os.listdir(main_dir):
        current_dir = os.path.join(main_dir, folder)
        # Stray files next to the sub-folders would make os.listdir() raise.
        if not os.path.isdir(current_dir):
            continue
        for file in tqdm(os.listdir(current_dir)):
            file_path = os.path.join(current_dir, file)
            futures.append(executor.submit(process_file, file_path))

    for future in tqdm(futures):
        all_data.extend(future.result())

# Convert to DataFrame all at once at the end
clean_data = pd.DataFrame(all_data)
# `with` guarantees the pickle is flushed and closed before the next cell runs.
with open("../data/Evaluation Data/KGConv/processed_dataframe.pkl", "wb") as pickle_file:
    pickle.dump(clean_data, pickle_file)

In [None]:
# Turn one Mintaka JSON file into a list of flat row dicts
def process_file(file_path):
    """Flatten a Mintaka JSON file into one row per sample.

    For every sample the QIDs are taken from the ``questionEntity`` entries
    whose ``entityType`` is ``'entity'``. Answer QIDs are collected only when
    the answer type is ``'entity'`` and the answer payload is not ``None``;
    otherwise both answer lists stay empty. Wikipedia flags come from
    ``is_in_wikipedia``.

    Returns:
        list[dict]: the rows, in file order.

    Raises:
        Exception: any error hit while handling a sample is re-raised after
            the offending sample has been printed.
    """
    with open(file_path, 'r') as handle:
        samples = json.load(handle)

    rows = []
    for sample in samples:
        try:
            question_qids = [
                entity['name']
                for entity in sample['questionEntity']
                if entity['entityType'] == 'entity'
            ]
            question_flags = [is_in_wikipedia(qid) for qid in question_qids]

            answer_type = sample['answer']['answerType']

            if answer_type == 'entity' and sample['answer']['answer'] is not None:
                answer_qids = [entity['name'] for entity in sample['answer']['answer']]
                answer_flags = [is_in_wikipedia(qid) for qid in answer_qids]
            else:
                answer_qids = []
                answer_flags = []

            rows.append({
                'Question QIDs': question_qids,
                'Answer QIDs': answer_qids,
                'Question in Wikipedia': question_flags,
                'Answer in Wikipedia': answer_flags,
                'Question': sample['question'],
                'Answer': sample['answer']['mention'],
                'Answer Type': answer_type
            })
        except Exception as error:
            # Show which sample broke before propagating the error.
            print(sample)
            raise error
    return rows

# Use a list to accumulate the rows
all_data = []

main_dir = "../data/Evaluation Data/Mintaka"

# Submit every input file to a thread pool, then collect the rows in submission order.
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = []
    for file in tqdm(os.listdir(main_dir)):
        # The pickle written at the end of this cell lives in main_dir too;
        # skip it so a re-run does not hand it to json.load.
        if file.endswith('.pkl'):
            continue
        file_path = os.path.join(main_dir, file)
        futures.append(executor.submit(process_file, file_path))

    for future in tqdm(futures):
        all_data.extend(future.result())

# Convert to DataFrame all at once at the end
clean_data = pd.DataFrame(all_data)
# `with` guarantees the pickle is flushed and closed before the next cell runs.
with open("../data/Evaluation Data/Mintaka/processed_dataframe.pkl", "wb") as pickle_file:
    pickle.dump(clean_data, pickle_file)

In [None]:
# Turn one RuBQ JSON file into a list of flat row dicts
def process_file(file_path):
    """Flatten a RuBQ JSON file into one row per sample that has question URIs.

    Samples whose ``question_uris`` is ``None`` are skipped. QIDs are the last
    path segment of each URI. An answer's display text is its first English
    ``wd_names`` entry, falling back to ``label`` when that list is empty.
    Wikipedia flags come from ``is_in_wikipedia``.

    Returns:
        list[dict]: the rows, in file order.

    Raises:
        Exception: any error hit while handling a sample is re-raised after
            the offending sample has been printed.
    """
    with open(file_path, 'r') as handle:
        samples = json.load(handle)

    def answer_text(answer):
        english_names = answer['wd_names']['en']
        if len(english_names) > 0:
            return english_names[0]
        return answer['label']

    rows = []
    for sample in samples:
        try:
            if sample['question_uris'] is None:
                continue

            question_qids = [uri.split('/')[-1] for uri in sample['question_uris']]
            question_flags = [is_in_wikipedia(qid) for qid in question_qids]

            answers = [answer_text(answer) for answer in sample['answers']]
            answer_qids = [answer['value'].split('/')[-1] for answer in sample['answers']]
            answer_flags = [is_in_wikipedia(qid) for qid in answer_qids]

            rows.append({
                'Question QIDs': question_qids,
                'Answer QIDs': answer_qids,
                'Question in Wikipedia': question_flags,
                'Answer in Wikipedia': answer_flags,
                'Question': sample['question_eng'],
                'Answer': answers,
            })
        except Exception as error:
            # Show which sample broke before propagating the error.
            print(sample)
            raise error
    return rows

# Use a list to accumulate the rows
all_data = []

main_dir = "../data/Evaluation Data/RuBQ"

# Submit every input file to a thread pool, then collect the rows in submission order.
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = []
    for file in tqdm(os.listdir(main_dir)):
        # The pickle written at the end of this cell lives in main_dir too;
        # skip it so a re-run does not hand it to json.load.
        if file.endswith('.pkl'):
            continue
        file_path = os.path.join(main_dir, file)
        futures.append(executor.submit(process_file, file_path))

    for future in tqdm(futures):
        all_data.extend(future.result())

# Convert to DataFrame all at once at the end
clean_data = pd.DataFrame(all_data)
# `with` guarantees the pickle is flushed and closed before the next cell runs.
with open("../data/Evaluation Data/RuBQ/processed_dataframe.pkl", "wb") as pickle_file:
    pickle.dump(clean_data, pickle_file)

In [3]:
# Module-level SPARQL client for the public Wikidata query endpoint, configured
# to return JSON. The LC_QuAD retry cell further down reuses this `sparql` object.
wikidata_endpoint = "https://query.wikidata.org/sparql"
sparql = SPARQLWrapper(wikidata_endpoint)
sparql.setReturnFormat(JSON)

# One-off query against the endpoint; the parsed response is kept in `results`.
sparql.setQuery("SELECT DISTINCT ?sbj ?sbj_label WHERE { ?sbj wdt:P31 wd:Q7187 . ?sbj rdfs:label ?sbj_label . FILTER(CONTAINS(lcase(?sbj_label), 'vgf')) . FILTER (lang(?sbj_label) = 'en') } LIMIT 25")
results = sparql.query().convert()

In [10]:
from requests.exceptions import HTTPError
import time

# Turn one LC-QuAD JSON file into a list of flat row dicts
def process_file(file_path):
    """Flatten an LC-QuAD JSON file into one row per sample.

    Question QIDs are every ``wd:Q<digits>`` token in the sample's
    ``sparql_wikidata`` query. For queries that do not start with ``ask`` the
    query is sent to the Wikidata endpoint (up to 5 attempts) and every result
    value containing ``www.wikidata.org`` contributes its last path segment as
    an answer QID. ASK queries are not executed and get empty answer lists.

    Retry policy:
        * HTTP errors (e.g. 429) wait one second and retry the same query.
        * Any other error retries with every ``LIMIT n`` rewritten to
          ``LIMIT 5``.

    Returns:
        list[dict]: the rows, in file order. ``'Ran SPARQL'`` is ``False``
        only when a non-ASK query failed on every attempt.

    Raises:
        Exception: any error outside the retry loop is re-raised after the
            error and the offending sample have been printed.
    """
    # Local import: SPARQLWrapper talks HTTP through urllib, so the HTTP errors
    # it lets through are urllib.error.HTTPError, not the `requests` class.
    import urllib.error

    wikidata_endpoint = "https://query.wikidata.org/sparql"
    sparql = SPARQLWrapper(wikidata_endpoint)
    sparql.setReturnFormat(JSON)

    with open(file_path, 'r') as f:
        data = json.load(f)

    processed_rows = []
    for data_chunk in data:
        try:
            query = data_chunk['sparql_wikidata']
            matches = re.findall(r'wd:Q\d+', query)
            question_qids = [match[3:] for match in matches]  # drop the 'wd:' prefix
            question_in_wikipedia = [is_in_wikipedia(qid) for qid in question_qids]

            # Reset both answer lists for every sample so an ASK query never
            # inherits (or lacks) values from a previous iteration.
            answer_qids = []
            answer_in_wikipedia = []
            ran_sparql = True
            if not query.lower().strip().startswith('ask'):
                results = []
                retry = 5
                ran_sparql = False
                while retry > 0:  # Retry up to 5 times
                    try:
                        sparql.setQuery(query)
                        results = sparql.query().convert()["results"]["bindings"]
                        retry = 0  # Exit loop if successful
                        ran_sparql = True
                    except (HTTPError, urllib.error.HTTPError) as e:
                        # Rate limiting / gateway errors: back off, keep the query as is.
                        print(e)
                        retry -= 1
                        time.sleep(1)
                    except Exception as e:
                        # Anything else (timeouts, oversized responses, ...):
                        # retry with a smaller result set.
                        print(e)
                        retry -= 1
                        query = re.sub(r'LIMIT \d+', 'LIMIT 5', query, flags=re.IGNORECASE)

                for result in results:
                    for key in result:
                        value = result[key]["value"]
                        if 'www.wikidata.org' in value:
                            answer_qids.append(value.split('/')[-1])
                answer_in_wikipedia = [is_in_wikipedia(qid) for qid in answer_qids]

            processed_rows.append({
                'Question QIDs': question_qids,
                'Answer QIDs': answer_qids,
                'Question in Wikipedia': question_in_wikipedia,
                'Answer in Wikipedia': answer_in_wikipedia,
                'Question': data_chunk['question'],
                # Always store the dataset's query, not the LIMIT-rewritten retry variant.
                'SPARQL': data_chunk['sparql_wikidata'],
                'Ran SPARQL': ran_sparql
            })
        except Exception as e:
            print(e)
            print(data_chunk)
            raise
    return processed_rows

# Use a list to accumulate the rows
all_data = []

main_dir = "../data/Evaluation Data/LC_QuAD"

# A single worker: files are processed one after another, so the SPARQL
# endpoint only ever sees one request at a time from this cell.
with ThreadPoolExecutor(max_workers=1) as executor:
    futures = []
    for file in tqdm(os.listdir(main_dir)):
        # The pickle written at the end of this cell lives in main_dir too;
        # skip it so a re-run does not hand it to json.load.
        if file.endswith('.pkl'):
            continue
        file_path = os.path.join(main_dir, file)
        futures.append(executor.submit(process_file, file_path))

    for future in tqdm(futures):
        all_data.extend(future.result())

# Convert to DataFrame all at once at the end
clean_data = pd.DataFrame(all_data)
# `with` guarantees the pickle is flushed and closed before the next cell runs.
with open("../data/Evaluation Data/LC_QuAD/processed_dataframe.pkl", "wb") as pickle_file:
    pickle.dump(clean_data, pickle_file)

100%|██████████| 2/2 [00:00<00:00, 8338.58it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

EndPointInternalError: The endpoint returned the HTTP status code 500. 

Response:
b"SPARQL-QUERY: queryStr=SELECT DISTINCT ?sbj ?sbj_label WHERE { ?sbj wdt:P31 wd:Q7187 . ?sbj rdfs:label ?sbj_label . FILTER(CONTAINS(lcase(?sbj_label), 'vgf')) . FILTER (lang(?sbj_label) = 'en') } LIMIT 25 \njava.util.concurrent.TimeoutException\n\tat java.util.concurrent.FutureTask.get(FutureTask.java:205)\n\tat com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlQuery(QueryServlet.java:678)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doGet(QueryServlet.java:290)\n\tat com.bigdata.rdf.sail.webapp.RESTServlet.doGet(RESTServlet.java:240)\n\tat com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doGet(MultiTenancyServlet.java:273)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:687)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:790)\n\tat org.eclipse.jetty.servlet.ServletHolder.handle(ServletH

 50%|█████     | 1/2 [26:34<26:34, 1594.17s/it]

HTTP Error 429: Too Many RequestsSocket operation on non-socket
{'NNQT_question': 'What is the {video game developer} for {developer} of {Pong}', 'uid': 22882, 'subgraph': 'simple question right', 'template_index': 3943, 'question': 'Which video game company developed Pong?', 'sparql_wikidata': ' select distinct ?obj where { wd:Q216293 wdt:P178 ?obj . ?obj wdt:P31 wd:Q210167 } ', 'sparql_dbpedia18': 'select distinct ?obj where { ?statement <http://www.w3.org/1999/02/22-rdf-syntax-ns#subject> <http://wikidata.dbpedia.org/resource/Q216293> . ?statement <http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate> <http://www.wikidata.org/entity/P178> . ?statement <http://www.w3.org/1999/02/22-rdf-syntax-ns#object> ?obj . ?obj <http://www.wikidata.org/entity/P31> <http://wikidata.dbpedia.org/resource/Q210167> } ', 'template': ' <S P ?O ; ?O instanceOf Type>', 'answer': [], 'template_id': 1, 'paraphrased_question': 'Who is the president of Valencia?'}
HTTP Error 429: Too Many Requests
HTTP Error 

100%|██████████| 2/2 [2:16:21<00:00, 4090.63s/it]

HTTP Error 429: Too Many RequestsSocket operation on non-socket
{'NNQT_question': 'What is {date of birth} of {Antipater} that is {instance of} is {statement with Gregorian date earlier than 1584} ?', 'uid': 9458, 'subgraph': 'statement_property', 'template_index': 5903, 'question': 'What (in the Julian calendar) is the date of birth of Antipater?', 'sparql_wikidata': 'SELECT ?obj WHERE { wd:Q204760 p:P569 ?s . ?s ps:P569 ?obj . ?s pq:P31 wd:Q26961029 }', 'sparql_dbpedia18': 'select distinct ?obj  where {\n?statement <http://www.w3.org/1999/02/22-rdf-syntax-ns#subject> <http://wikidata.dbpedia.org/resource/Q204760> .\n?statement <http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate> <http://www.wikidata.org/entity/P569> .\n?statement <http://www.w3.org/1999/02/22-rdf-syntax-ns#object> ?obj .\n?statement <http://www.wikidata.org/entity/P31> <http://wikidata.dbpedia.org/resource/Q26961029>\n} \n', 'template': '(E pred F) prop ?value', 'answer': [], 'template_id': 'statement_property_2', 




In [8]:
# Recompute the 'Ran SPARQL' flag from the data: a row counts as run exactly
# when it has at least one answer QID.
# NOTE(review): this also marks queries that legitimately returned nothing as
# not run, so the retry cell will query them again - confirm that is intended.
clean_data['Ran SPARQL'] = clean_data['Answer QIDs'].apply(lambda qids: len(qids) > 0)

In [21]:
# Reload the LC-QuAD frame and count the rows whose question QIDs are all in
# Wikipedia (rows with no question QIDs count too, since all([]) is True).
# NOTE: only unpickle files this project wrote itself; pickle can execute code.
with open("../data/Evaluation Data/LC_QuAD/processed_dataframe.pkl", "rb") as pickle_file:
    clean_data = pickle.load(pickle_file)
clean_data.apply(lambda x: all(x['Question in Wikipedia']), axis=1).sum()

25934

In [24]:
# Reload the RuBQ frame and display it.
# NOTE: only unpickle files this project wrote itself; pickle can execute code.
with open("../data/Evaluation Data/RuBQ/processed_dataframe.pkl", "rb") as pickle_file:
    clean_data = pickle.load(pickle_file)
clean_data

Unnamed: 0,Question QIDs,Answer QIDs,Question in Wikipedia,Answer in Wikipedia,Question,Answer
0,[Q8070],"[Q7944, Q60186, Q167903, Q2580904, Q5975740, Q...",[True],"[True, True, True, True, True, True]",What can cause a tsunami?,"[seism, shooting star, Rock avalanche, tsunami..."
1,[Q2222],[Q102513],[True],[True],"Who wrote the novel ""uncle Tom's Cabin""?",[Christopher Crowfield]
2,[Q83186],[Q692],[True],[True],"Who is the author of the play ""Romeo and Juliet""?",[The Bard]
3,[Q218],[Q19660],[True],[True],What is the name of the capital of Romania?,[Paris of the East]
4,[Q5928],"[Q6607, Q483994, Q626035, Q2643890, Q17172850]",[True],"[True, True, True, True, False]",What instrument did Jimi Hendrix play?,"[guitar, kazoo, glockenspiel, vocalist, voice]"
...,...,...,...,...,...,...
2605,[Q362258],[],[True],[],What is the name of Vitas's father?,[]
2606,[Q862],[],[True],[],"What was the name of the poet's father, Joseph...",[]
2607,[Q2143],[],[True],[],which river is the city of Syktyvkar located on?,[]
2608,[Q612],[],[True],[],Which Bay is Dubai located on?,[]


In [4]:
# Local import: SPARQLWrapper talks HTTP through urllib, so the HTTP errors it
# lets through (e.g. 429) are urllib.error.HTTPError, not the `requests` class.
import urllib.error

# Second pass over LC-QuAD: re-run every non-ASK query that is not flagged as
# 'Ran SPARQL' and store the answer QIDs it returns. Uses the `sparql` client
# created in an earlier cell.
lc_quad_pickle = "../data/Evaluation Data/LC_QuAD/processed_dataframe.pkl"
with open(lc_quad_pickle, "rb") as pickle_file:
    clean_data = pickle.load(pickle_file)

retried = 0
for i, row in tqdm(clean_data.iterrows(), total=len(clean_data)):
    query = row['SPARQL']
    # ASK queries are never executed; rows already run need no retry.
    if query.lower().strip().startswith('ask') or row['Ran SPARQL']:
        continue

    answer_qids = []
    results = []
    retry = 5
    while retry > 0:  # Retry up to 5 times
        try:
            sparql.setQuery(query)
            results = sparql.query().convert()["results"]["bindings"]
            retry = 0  # Exit loop if successful
            clean_data.at[i, 'Ran SPARQL'] = True
        except (HTTPError, urllib.error.HTTPError) as e:
            # Rate limiting / gateway errors: back off, keep the query as is.
            print(e)
            retry -= 1
            time.sleep(1)
        except Exception as e:
            # Anything else: retry with a smaller result set.
            print(e)
            retry -= 1
            query = re.sub(r'LIMIT \d+', 'LIMIT 5', query, flags=re.IGNORECASE)

    for result in results:
        for key in result:
            value = result[key]["value"]
            if 'www.wikidata.org' in value:
                answer_qids.append(value.split('/')[-1])
    answer_in_wikipedia = [is_in_wikipedia(qid) for qid in answer_qids]

    clean_data.at[i, 'Answer QIDs'] = answer_qids
    clean_data.at[i, 'Answer in Wikipedia'] = answer_in_wikipedia

    # Checkpoint every 100 retried rows, independent of the row's index value.
    retried += 1
    if retried % 100 == 0:
        with open(lc_quad_pickle, "wb") as pickle_file:
            pickle.dump(clean_data, pickle_file)

# Final save so the rows handled after the last checkpoint are not lost.
with open(lc_quad_pickle, "wb") as pickle_file:
    pickle.dump(clean_data, pickle_file)

0it [00:00, ?it/s]

QueryBadFormed: A bad request has been sent to the endpoint: probably the SPARQL query is badly formed. 

Response:
b'SPARQL-QUERY: queryStr=SELECT ?obj WHERE { wd:Q25267 p:P144 ?s . ?s ps:P144 ?obj . ?s pq:P2534 ?x filter(contains(?x,\'^{\\circ}\\text{C} = \\text{K} - 273.15\')) }\njava.util.concurrent.ExecutionException: org.openrdf.query.MalformedQueryException: Lexical error at line 1, column 99.  Encountered: "c" (99), after : "\\\'^{\\\\"\n\tat java.util.concurrent.FutureTask.report(FutureTask.java:122)\n\tat java.util.concurrent.FutureTask.get(FutureTask.java:206)\n\tat com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlQuery(QueryServlet.java:678)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doGet(QueryServlet.java:290)\n\tat com.bigdata.rdf.sail.webapp.RESTServlet.doGet(RESTServlet.java:240)\n\tat com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doGet(MultiTenancyServlet.java:273)\n\tat java

2117it [00:10, 200.02it/s]

QueryBadFormed: A bad request has been sent to the endpoint: probably the SPARQL query is badly formed. 

Response:
b'SPARQL-QUERY: queryStr=SELECT ?obj WHERE { wd:Q25267 p:P144 ?s . ?s ps:P144 ?obj . ?s pq:P2534 ?x filter(contains(?x,\'^{\\circ}\\text{C} = \\text{K} - 273.15\')) }\njava.util.concurrent.ExecutionException: org.openrdf.query.MalformedQueryException: Lexical error at line 1, column 99.  Encountered: "c" (99), after : "\\\'^{\\\\"\n\tat java.util.concurrent.FutureTask.report(FutureTask.java:122)\n\tat java.util.concurrent.FutureTask.get(FutureTask.java:206)\n\tat com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlQuery(QueryServlet.java:678)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doGet(QueryServlet.java:290)\n\tat com.bigdata.rdf.sail.webapp.RESTServlet.doGet(RESTServlet.java:240)\n\tat com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doGet(MultiTenancyServlet.java:273)\n\tat java

3231it [00:19, 156.44it/s]

QueryBadFormed: A bad request has been sent to the endpoint: probably the SPARQL query is badly formed. 

Response:
b'SPARQL-QUERY: queryStr=SELECT ?answer WHERE { wd:Q761414 wdt:P1269 ?answer . ?answer wdt:P2534 ?x FILTER(contains(?x,\'\\phi: \\nabla\\phi = \\vec{A}\'))}\njava.util.concurrent.ExecutionException: org.openrdf.query.MalformedQueryException: Lexical error at line 1, column 97.  Encountered: "p" (112), after : "\\\'\\\\"\n\tat java.util.concurrent.FutureTask.report(FutureTask.java:122)\n\tat java.util.concurrent.FutureTask.get(FutureTask.java:206)\n\tat com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlQuery(QueryServlet.java:678)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doGet(QueryServlet.java:290)\n\tat com.bigdata.rdf.sail.webapp.RESTServlet.doGet(RESTServlet.java:240)\n\tat com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doGet(MultiTenancyServlet.java:273)\n\tat javax.servlet.h

3231it [00:31, 156.44it/s]

EndPointInternalError: The endpoint returned the HTTP status code 500. 

Response:
b'SPARQL-QUERY: queryStr=select ?ent where { ?ent wdt:P31 wd:Q523 . ?ent wdt:P2214 ?obj } ORDER BY DESC(?obj)LIMIT 5 \njava.util.concurrent.TimeoutException\n\tat java.util.concurrent.FutureTask.get(FutureTask.java:205)\n\tat com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlQuery(QueryServlet.java:678)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doGet(QueryServlet.java:290)\n\tat com.bigdata.rdf.sail.webapp.RESTServlet.doGet(RESTServlet.java:240)\n\tat com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doGet(MultiTenancyServlet.java:273)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:687)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:790)\n\tat org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:865)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(Servle

19556it [05:25, 91.04it/s]

EndPointInternalError: The endpoint returned the HTTP status code 500. 

Response:
b'SPARQL-QUERY: queryStr=select ?ent where { ?ent wdt:P31 wd:Q523 . ?ent wdt:P2214 ?obj } ORDER BY DESC(?obj)LIMIT 5 \njava.util.concurrent.TimeoutException\n\tat java.util.concurrent.FutureTask.get(FutureTask.java:205)\n\tat com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlQuery(QueryServlet.java:678)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doGet(QueryServlet.java:290)\n\tat com.bigdata.rdf.sail.webapp.RESTServlet.doGet(RESTServlet.java:240)\n\tat com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doGet(MultiTenancyServlet.java:273)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:687)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:790)\n\tat org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:865)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(Servle

19556it [05:41, 91.04it/s]

EndPointInternalError: The endpoint returned the HTTP status code 500. 

Response:
b"SPARQL-QUERY: queryStr=SELECT DISTINCT ?sbj ?sbj_label WHERE { ?sbj wdt:P31 wd:Q523 . ?sbj rdfs:label ?sbj_label . FILTER(CONTAINS(lcase(?sbj_label), 'vega')) . FILTER (lang(?sbj_label) = 'en') } LIMIT 25 \njava.util.concurrent.TimeoutException\n\tat java.util.concurrent.FutureTask.get(FutureTask.java:205)\n\tat com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlQuery(QueryServlet.java:678)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doGet(QueryServlet.java:290)\n\tat com.bigdata.rdf.sail.webapp.RESTServlet.doGet(RESTServlet.java:240)\n\tat com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doGet(MultiTenancyServlet.java:273)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:687)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:790)\n\tat org.eclipse.jetty.servlet.ServletHolder.handle(ServletH

21589it [10:27, 29.11it/s]

EndPointInternalError: The endpoint returned the HTTP status code 500. 

Response:
b"SPARQL-QUERY: queryStr=SELECT DISTINCT ?sbj ?sbj_label WHERE { ?sbj wdt:P31 wd:Q523 . ?sbj rdfs:label ?sbj_label . FILTER(CONTAINS(lcase(?sbj_label), 'vega')) . FILTER (lang(?sbj_label) = 'en') } LIMIT 5 \njava.util.concurrent.TimeoutException\n\tat java.util.concurrent.FutureTask.get(FutureTask.java:205)\n\tat com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlQuery(QueryServlet.java:678)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doGet(QueryServlet.java:290)\n\tat com.bigdata.rdf.sail.webapp.RESTServlet.doGet(RESTServlet.java:240)\n\tat com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doGet(MultiTenancyServlet.java:273)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:687)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:790)\n\tat org.eclipse.jetty.servlet.ServletHolder.handle(ServletHo

25742it [10:29, 43.20it/s]

QueryBadFormed: A bad request has been sent to the endpoint: probably the SPARQL query is badly formed. 

Response:
b'SPARQL-QUERY: queryStr=SELECT ?answer WHERE { wd:Q951500 wdt:P1366 ?answer . ?answer wdt:P1451 wd:Qui transtulit sustinet}\njava.util.concurrent.ExecutionException: org.openrdf.query.MalformedQueryException: Lexical error at line 1, column 90.  Encountered: " " (32), after : "transtulit"\n\tat java.util.concurrent.FutureTask.report(FutureTask.java:122)\n\tat java.util.concurrent.FutureTask.get(FutureTask.java:206)\n\tat com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlQuery(QueryServlet.java:678)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doGet(QueryServlet.java:290)\n\tat com.bigdata.rdf.sail.webapp.RESTServlet.doGet(RESTServlet.java:240)\n\tat com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doGet(MultiTenancyServlet.java:273)\n\tat javax.servlet.http.HttpServlet.service(HttpSer

25742it [10:41, 43.20it/s]

EndPointInternalError: The endpoint returned the HTTP status code 500. 

Response:
b"SPARQL-QUERY: queryStr=SELECT DISTINCT ?sbj ?sbj_label WHERE { ?sbj wdt:P31 wd:Q16521 . ?sbj rdfs:label ?sbj_label . FILTER(CONTAINS(lcase(?sbj_label), 'trigonotarbida')) . FILTER (lang(?sbj_label) = 'en') } LIMIT 25 \njava.util.concurrent.TimeoutException\n\tat java.util.concurrent.FutureTask.get(FutureTask.java:205)\n\tat com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlQuery(QueryServlet.java:678)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doGet(QueryServlet.java:290)\n\tat com.bigdata.rdf.sail.webapp.RESTServlet.doGet(RESTServlet.java:240)\n\tat com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doGet(MultiTenancyServlet.java:273)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:687)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:790)\n\tat org.eclipse.jetty.servlet.ServletHolder.han

28344it [15:09, 23.56it/s]

HTTP Error 502: Bad Gateway
Invalid control character at: line 219359 column 145 (char 5095527)
Invalid control character at: line 219359 column 145 (char 5095527)
Invalid control character at: line 219359 column 145 (char 5095527)
Invalid control character at: line 219359 column 145 (char 5095527)


30226it [16:20, 30.83it/s]

Invalid control character at: line 219359 column 145 (char 5095527)





In [None]:
# Turn one Wikidata-Disamb JSON file into a list of flat row dicts
def process_file(file_path):
    """Flatten a Wikidata-Disamb JSON file into one row per sample.

    Each sample contributes its sentence, the entity string, the correct and
    wrong QIDs, and the Wikipedia flag of each QID (via ``is_in_wikipedia``).

    Returns:
        list[dict]: the rows, in file order.
    """
    with open(file_path, 'r') as handle:
        samples = json.load(handle)

    def build_row(sample):
        correct_flag = is_in_wikipedia(sample['correct_id'])
        wrong_flag = is_in_wikipedia(sample['wrong_id'])
        return {
            'Sentence': sample['text'],
            'Entity Name': sample['string'],
            'Correct QID': sample['correct_id'],
            'Wrong QID': sample['wrong_id'],
            'Correct in Wikipedia': correct_flag,
            'Wrong in Wikipedia': wrong_flag,
        }

    return [build_row(sample) for sample in samples]

# Directory holding the Wikidata-Disamb input files (the output pickle is written here too)
main_dir = "../data/Evaluation Data/Wikidata-Disamb"

# Use a list to accumulate the rows
all_data = []

# Process the input files one after another and collect all rows in memory
for file in tqdm(os.listdir(main_dir)):
    # The pickle written at the end of this cell lives in main_dir too;
    # skip it so a re-run does not hand it to json.load.
    if file.endswith('.pkl'):
        continue
    file_path = os.path.join(main_dir, file)
    processed_data = process_file(file_path)
    all_data.extend(processed_data)

# Convert to DataFrame all at once at the end
clean_data = pd.DataFrame(all_data)
# `with` guarantees the pickle is flushed and closed before the next cell runs.
with open("../data/Evaluation Data/Wikidata-Disamb/processed_dataframe.pkl", "wb") as pickle_file:
    pickle.dump(clean_data, pickle_file)

In [None]:
# Turn one REDFM split (an iterable of samples) into a list of flat row dicts
def process_file(data, lang='en'):
    """Flatten an iterable of REDFM samples into one row per sample.

    For each sample the row holds the text, title, language, the sample's
    ``uri`` as the correct QID with its Wikipedia flag (via
    ``is_in_wikipedia``), and the ``(start, end)`` pairs of every entity whose
    ``uri`` equals the sample's ``uri``.

    Args:
        data: iterable of sample dicts; it is wrapped in ``tqdm``.
        lang: accepted for interface compatibility; not read by this function.

    Returns:
        list[dict]: the rows, in iteration order.
    """
    rows = []
    for sample in tqdm(data):
        target_uri = sample['uri']
        target_in_wikipedia = is_in_wikipedia(target_uri)

        spans = []
        for entity in sample['entities']:
            if entity['uri'] == sample['uri']:
                spans.append((entity['start'], entity['end']))

        rows.append({
            'Sentence': sample['text'],
            'Entity Name': sample['title'],
            'Entity Span': spans,
            'Correct QID': target_uri,
            'Correct in Wikipedia': target_in_wikipedia,
            'Language': sample['lan']
        })
    return rows

# Use a list to accumulate the rows
all_data = []

# Stream every language of REDFM from the Hugging Face hub
huggingface_ds = load_dataset("Babelscape/REDFM", "all_languages", streaming=True, trust_remote_code=True)

# Flatten each split and pool the rows
for split in huggingface_ds:
    all_data.extend(process_file(huggingface_ds[split]))

# Convert to DataFrame all at once at the end
clean_data = pd.DataFrame(all_data)
# `with` guarantees the pickle is flushed and closed before the next cell runs.
with open("../data/Evaluation Data/REDFM/processed_dataframe.pkl", "wb") as pickle_file:
    pickle.dump(clean_data, pickle_file)

In [90]:
# Collect every distinct QID referenced by the processed evaluation sets.
# Columns are visited in this order, so `unique_ids` keeps first-seen order:
# REDFM, Mintaka (questions, answers), KGConv (question, answer),
# Wikidata-Disamb (correct, wrong).
qid_columns = {
    'REDFM': ['Correct QID'],
    'Mintaka': ['Question QIDs', 'Answer QIDs'],
    'KGConv': ['Question QID', 'Answer QID'],
    'Wikidata-Disamb': ['Correct QID', 'Wrong QID'],
}

qid_series = []
for dataset, columns in qid_columns.items():
    # NOTE: only unpickle files this project wrote itself; pickle can execute code.
    with open(f"../data/Evaluation Data/{dataset}/processed_dataframe.pkl", "rb") as pickle_file:
        data = pickle.load(pickle_file)
    for column in columns:
        # explode() flattens list-valued cells (Mintaka) and leaves scalar cells
        # untouched; empty lists become NaN, which dropna() removes. This needs
        # no numpy, which the notebook never imports.
        qid_series.append(data[column].explode().dropna())

unique_ids = pd.Series(pd.concat(qid_series, ignore_index=True).unique())
unique_ids = pd.DataFrame({'QID': unique_ids})

In [None]:
# Register `progress_apply` on pandas objects
tqdm.pandas()

# A QID is flagged only when its entity exists AND its ID record is marked as
# being in Wikipedia. is_in_wikipedia() tolerates a missing ID record, where a
# direct `WikidataID.get_id(x).in_wikipedia` would raise AttributeError.
unique_ids['In Wikipedia'] = unique_ids['QID'].progress_apply(
    lambda qid: (WikidataEntity.get_entity(qid) is not None) and is_in_wikipedia(qid)
)

In [None]:
batch_size = 1000  # Rows streamed from the database per fetch

# Add as many random in-Wikipedia entities as there are flagged evaluation QIDs
add_count = int(unique_ids['In Wikipedia'].sum())

sampled_rows = []
with tqdm(total=add_count) as progressbar:
    with Session() as session:
        # One query, one random ordering. Paging with OFFSET over a fresh
        # ORDER BY random() per page reshuffles the table each time, so pages
        # can overlap and return the same entity more than once.
        # Only the id column is selected because nothing else is used.
        sampled = (session.query(WikidataEntity.id)
                   .join(WikidataID, WikidataEntity.id == WikidataID.id)
                   .filter(WikidataID.in_wikipedia == True)
                   .order_by(func.random())
                   .limit(add_count)
                   .yield_per(batch_size))

        for row in sampled:
            sampled_rows.append({
                'QID': row.id,
                'In Wikipedia': True,
                'from Evaluation': False
            })
            progressbar.update(1)

# Append all sampled rows in a single concat instead of one concat per batch.
# NOTE(review): re-running this cell appends another sample to `unique_ids`.
unique_ids = pd.concat([unique_ids, pd.DataFrame(sampled_rows)], ignore_index=True)

In [None]:
test_data  = pd.DataFrame(columns=['Query', 'Correct', 'Source'])

data = pickle.load(open("../data/Evaluation Data/REDFM/processed_dataframe.pkl", "rb"))
prep = data[data['Language'] == 'en'][data['Correct in Wikipedia']]
prep['Correct QID'] = prep['Correct QID'].apply(lambda x: [x])
prep = prep.rename({'Sentence': 'Query', 'Correct QID': 'Correct'}, axis=1)
prep['Source'] = 'REDFM'

test_data = pd.concat([test_data, prep[['Query', 'Correct', 'Source']]])

def mintaka_filter(x):
    """Return True when a row lists at least one entity and all of them are in Wikipedia.

    `x` is a row (or mapping) whose 'Question in Wikipedia' and 'Answer in Wikipedia'
    entries are lists of booleans. Rows where both lists are empty are rejected,
    because `all([])` alone would accept them vacuously.
    """
    wikipedia_flags = x['Question in Wikipedia'] + x['Answer in Wikipedia']
    return len(wikipedia_flags) != 0 and all(wikipedia_flags)

# --- Mintaka ---
with open("../data/Evaluation Data/Mintaka/processed_dataframe.pkl", "rb") as f:
    data = pickle.load(f)
# Use mintaka_filter rather than a bare all(): all([]) is True, so rows that list no
# question/answer entities at all would otherwise be kept with an empty 'Correct' list.
# .copy() so the column assignments below do not write into a slice of `data`.
prep = data[data.apply(mintaka_filter, axis=1)].copy()
# The ground truth for a query is every question entity plus every answer entity
prep['Correct'] = prep.apply(lambda x: x['Question QIDs'] + x['Answer QIDs'], axis=1)
prep = prep.rename({'Question': 'Query'}, axis=1)
prep['Source'] = 'Mintaka'

test_data = pd.concat([test_data, prep[['Query', 'Correct', 'Source']]])

# --- KGConv ---
with open("../data/Evaluation Data/KGConv/processed_dataframe.pkl", "rb") as f:
    data = pickle.load(f)
# Keep rows where both the question entity and the answer entity are in Wikipedia;
# .copy() so the column assignments below do not write into a slice of `data`.
both_in_wikipedia = data.apply(lambda x: x['Question in Wikipedia'] and x['Answer in Wikipedia'], axis=1)
prep = data[both_in_wikipedia].copy()
prep['Correct'] = prep.apply(lambda x: [x['Question QID'], x['Answer QID']], axis=1)
prep = prep.rename({'Question': 'Query'}, axis=1)
prep['Source'] = 'KGConv'

test_data = pd.concat([test_data, prep[['Query', 'Correct', 'Source']]])
test_data

In [None]:
import pickle

# Load the stored retrieval results for the Wikidata-Disamb dataset; the context
# manager closes the file handle once the object has been unpickled.
with open("../data/Evaluation Data/retrieval_results_Wikidata-Disamb-wikidata-en.pkl", "rb") as f:
    data = pickle.load(f)
data

In [None]:
import json
from langchain_astradb import AstraDBVectorStore
from astrapy.info import CollectionVectorServiceOptions
import asyncio

# Read the Astra DB credentials; the context manager closes the token file afterwards.
with open('../API_tokens/datastax_wikidata_nvidia.json') as token_file:
    datastax_token = json.load(token_file)
ASTRA_DB_DATABASE_ID = datastax_token['ASTRA_DB_DATABASE_ID']
ASTRA_DB_APPLICATION_TOKEN = datastax_token['ASTRA_DB_APPLICATION_TOKEN']
ASTRA_DB_API_ENDPOINT = datastax_token["ASTRA_DB_API_ENDPOINT"]
ASTRA_DB_KEYSPACE = datastax_token["ASTRA_DB_KEYSPACE"]
K = 50  # Number of results requested per similarity search
BATCH_SIZE = 50  # Number of queries sent concurrently per batch

# Set up CollectionVectorServiceOptions with NVIDIA model
collection_vector_service_options = CollectionVectorServiceOptions(
    provider="nvidia",
    model_name="NV-Embed-QA"
)

# Initialize the vector store (kept under the name `graph_store`, which the
# retrieval functions below look up)
graph_store = AstraDBVectorStore(
    collection_name="wikidata",
    collection_vector_service_options=collection_vector_service_options,
    token=ASTRA_DB_APPLICATION_TOKEN,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    namespace=ASTRA_DB_KEYSPACE,
)

In [None]:
# Display the current contents of `data` for inspection.
data

In [None]:
# Async retrieval of the QIDs most similar to a single query
async def get_similar_qids_async(query):
    """Search the vector store for `query` and return `(qids, scores)`.

    Requests the top `K` hits from `graph_store`; `qids` holds the 'QID' metadata
    of each hit and `scores` the matching relevance scores, in the same order.
    """
    hits = await graph_store.asimilarity_search_with_relevance_scores(query, k=K)
    qids = [document.metadata['QID'] for document, _ in hits]
    scores = [relevance for _, relevance in hits]
    return qids, scores

# Run the similarity search for a whole batch of queries concurrently
async def retrieve_qids_for_batch(queries_batch):
    """Return one `get_similar_qids_async` result per query, in input order."""
    pending = []
    for query in queries_batch:
        pending.append(get_similar_qids_async(query))
    return await asyncio.gather(*pending)

# Main function to process queries in batches with a single progress bar
async def main():
    """Retrieve similar QIDs for every query in `test_data` and store them on the frame.

    Queries are sent in chunks of `BATCH_SIZE`; each chunk is awaited before the next
    is started. Adds/overwrites the 'Retrieval QIDs' and 'Retrieval Score' columns of
    the global `test_data` in place, one list per row.
    """
    queries = test_data['Query'].tolist()
    all_results = []

    # Initialize tqdm progress bar with total length
    with tqdm(total=len(queries)) as pbar:
        for i in range(0, len(queries), BATCH_SIZE):
            batch = queries[i:i + BATCH_SIZE]
            batch_results = await retrieve_qids_for_batch(batch)
            all_results.extend(batch_results)
            pbar.update(len(batch))  # Update progress bar by the size of each batch

    # Assign each column explicitly. Handing the list of (qids, scores) pairs to a
    # two-column assignment makes pandas try to build an array out of the nested
    # lists, which fails or mis-shapes instead of storing one list per cell.
    test_data['Retrieval QIDs'] = [qids for qids, _ in all_results]
    test_data['Retrieval Score'] = [scores for _, scores in all_results]

# Run the main function using await; this fills the 'Retrieval QIDs' and
# 'Retrieval Score' columns of `test_data`.
await main()

In [37]:
import numpy as np
import pickle

def calculate_mrr_score(df, pred_col, true_cols):
    """Mean reciprocal rank of the first correct QID in each row's retrieval list.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per query.
    pred_col : str
        Column holding the ranked list of retrieved QIDs (best first).
    true_cols : str
        Column holding the list of correct QIDs for the query.

    Returns
    -------
    float
        Average of 1/rank of the first correct hit (0 for rows with no hit);
        NaN for an empty frame.

    Notes
    -----
    Works on the frame passed as `df` (not on a global) and does not modify it.
    Duplicates are removed from a local copy of the retrieved list, so the stored
    QID list stays aligned with any parallel score column.
    """
    def reciprocal_rank(row):
        # Remove duplicate QIDs while keeping the order
        unique_preds = list(dict.fromkeys(row[pred_col]))
        for rank, qid in enumerate(unique_preds, start=1):
            if qid in row[true_cols]:
                return 1 / rank
        return 0

    if len(df) == 0:
        return float('nan')
    return df.apply(reciprocal_rank, axis=1).mean()

def calculate_ndcg_score(df, pred_col, true_cols):
    """Mean NDCG (binary relevance) of each row's retrieval list.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per query.
    pred_col : str
        Column holding the ranked list of retrieved QIDs (best first).
    true_cols : str
        Column holding the list of correct QIDs for the query.

    Returns
    -------
    float
        Average over rows of DCG / ideal DCG. Rows whose ideal DCG is zero (no
        correct QIDs or nothing retrieved) are NaN and skipped by the mean;
        NaN for an empty frame.

    Notes
    -----
    Works on the frame passed as `df` (not on a global) and does not modify it.
    Duplicates are removed from a local copy of the retrieved list.
    """
    def row_ndcg(row):
        # Remove duplicate QIDs while keeping the order
        unique_preds = list(dict.fromkeys(row[pred_col]))
        relevant = row[true_cols]
        # DCG: each correct QID found at 1-based rank r contributes 1/log2(r+1)
        dcg = sum(
            1 / np.log2(rank + 1)
            for rank, qid in enumerate(unique_preds, start=1)
            if qid in relevant
        )
        # Ideal DCG: as many hits as possible packed into the top ranks
        ideal_hits = min(len(relevant), len(unique_preds))
        idcg = sum(1 / np.log2(rank + 1) for rank in range(1, ideal_hits + 1))
        return dcg / idcg if idcg > 0 else np.nan

    if len(df) == 0:
        return float('nan')
    return df.apply(row_ndcg, axis=1).mean()

# Load the stored retrieval results for RuBQ; the context manager closes the file.
with open("../data/Evaluation Data/retrieval_results_RuBQ-wikidata_test_v2-en.pkl", "rb") as f:
    prep = pickle.load(f)
# Keep rows where every listed question/answer entity is in Wikipedia.
# .copy() so the new column below is not written into a slice of the loaded frame.
prep = prep[prep.apply(lambda x: all(x['Question in Wikipedia'] + x['Answer in Wikipedia']), axis=1)].copy()
# Ground truth = question entities followed by answer entities
prep['Correct QIDs'] = prep.apply(lambda x: x['Question QIDs'] + x['Answer QIDs'], axis=1)
# Alternative preparation for result files that store a single 'Correct QID' per row:
# prep = prep[prep['Correct in Wikipedia']]
# prep['Correct QIDs'] = prep['Correct QID'].apply(lambda x: [x])
calculate_ndcg_score(prep, 'Retrieval QIDs', 'Correct QIDs')

0.29367046147124726

In [33]:
# Display the current `prep` frame for inspection.
prep

Unnamed: 0,Question QIDs,Answer QIDs,Question in Wikipedia,Answer in Wikipedia,Question,Answer,Retrieval QIDs,Retrieval Score,Correct QIDs
0,[Q8070],"[Q7944, Q60186, Q167903, Q2580904, Q5975740, Q...",[True],"[True, True, True, True, True, True]",What can cause a tsunami?,"[seism, shooting star, Rock avalanche, tsunami...","[Q7850148, Q25211995, Q7944, Q60061724, Q8065,...","[0.6873915, 0.68229353, 0.6689529, 0.6357784, ...","[Q8070, Q7944, Q60186, Q167903, Q2580904, Q597..."
1,[Q2222],[Q102513],[True],[True],"Who wrote the novel ""uncle Tom's Cabin""?",[Christopher Crowfield],"[Q2222, Q215410, Q326914, Q214371, Q41359, Q72...","[0.8654343, 0.7390483, 0.72999346, 0.72494113,...","[Q2222, Q102513]"
2,[Q83186],[Q692],[True],[True],"Who is the author of the play ""Romeo and Juliet""?",[The Bard],"[Q83186, Q463313, Q26833, Q1081181, Q3028771, ...","[0.81384814, 0.7692406, 0.76033306, 0.76020837...","[Q83186, Q692]"
3,[Q218],[Q19660],[True],[True],What is the name of the capital of Romania?,[Paris of the East],"[Q19660, Q218, Q21197, Q203493, Q100188, Q8332...","[0.9203162, 0.91148484, 0.90722334, 0.8896997,...","[Q218, Q19660]"
4,[Q5928],"[Q6607, Q483994, Q626035, Q2643890, Q17172850]",[True],"[True, True, True, True, False]",What instrument did Jimi Hendrix play?,"[guitar, kazoo, glockenspiel, vocalist, voice]","[Q5928, Q1407424, Q373360, Q1702407, Q769971, ...","[0.8064965, 0.70240027, 0.6845987, 0.6819242, ...","[Q5928, Q6607, Q483994, Q626035, Q2643890, Q17..."
...,...,...,...,...,...,...,...,...,...
2605,[Q362258],[],[True],[],What is the name of Vitas's father?,[],"[Q362258, Q143880, Q351491, Q3563395, Q1421, Q...","[0.7439842, 0.739173, 0.73597926, 0.7301621, 0...",[Q362258]
2606,[Q862],[],[True],[],"What was the name of the poet's father, Joseph...",[],"[Q144439, Q168728, Q28480, Q283121, Q82925, Q6...","[0.7449581, 0.7249292, 0.71994627, 0.7192839, ...",[Q862]
2607,[Q2143],[],[True],[],which river is the city of Syktyvkar located on?,[],"[Q2143, Q784682, Q6610, Q156713, Q40855, Q4831...","[0.8338574, 0.79417616, 0.78490466, 0.7840861,...",[Q2143]
2608,[Q612],[],[True],[],Which Bay is Dubai located on?,[],"[Q612, Q509588, Q15213940, Q4842734, Q57655, Q...","[0.754179, 0.7541323, 0.7497014, 0.7461058, 0....",[Q612]


In [None]:
import pickle
import numpy as np

# Load the stored Wikidata-Disamb retrieval results; the context manager closes the file.
with open("../data/Evaluation Data/retrieval_results_Wikidata-Disamb-wikidata-en.pkl", "rb") as f:
    prep = pickle.load(f)

def calculate_accuracy_score(df):
    """Fraction of rows whose highest-scoring retrieved QID equals 'Correct QID'.

    Reads the 'Retrieval QIDs', 'Retrieval Score' and 'Correct QID' columns of `df`.
    Rows with an empty retrieval list count as incorrect. The rows are walked
    positionally, so a non-unique index (e.g. after concatenating frames) is fine.
    Returns NaN for an empty frame.
    """
    hits = []
    for qids, scores, correct_qid in zip(df['Retrieval QIDs'], df['Retrieval Score'], df['Correct QID']):
        if len(scores) == 0:
            # Nothing was retrieved, so there is no top candidate to compare
            hits.append(False)
            continue
        top_qid = qids[int(np.argmax(scores))]
        hits.append(bool(top_qid == correct_qid))

    if not hits:
        return float('nan')
    return float(np.mean(hits))

def calculate_log_odds_ratio_score(df, eps=1e-6):
    """Mean log-odds difference between the correct and the wrong candidate QID.

    For every row, the best retrieval score of 'Correct QID' and of 'Wrong QID' is
    looked up in the parallel 'Retrieval QIDs' / 'Retrieval Score' lists, converted
    to log-odds log(p / (1 - p)), and the wrong log-odds are subtracted from the
    correct ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'Correct QID', 'Wrong QID', 'Retrieval QIDs' and
        'Retrieval Score'.
    eps : float, optional
        Scores are clipped to [eps, 1 - eps] before taking log-odds. A QID that was
        not retrieved gets score 0, and log(0) is -inf; without clipping a single
        such row turns the mean into +/-inf or NaN.

    Returns
    -------
    float
        The mean log-odds ratio; NaN for an empty frame.
    """
    def best_score(row, target_qid):
        # Highest score among all occurrences of target_qid; 0 if it was not retrieved
        matching = [
            score
            for qid, score in zip(row['Retrieval QIDs'], row['Retrieval Score'])
            if qid == target_qid
        ]
        return max(matching, default=0)

    def log_odds(score):
        probability = min(max(score, eps), 1 - eps)
        return np.log(probability / (1 - probability))

    def log_odds_ratio(row):
        correct_log_odds = log_odds(best_score(row, row['Correct QID']))
        wrong_log_odds = log_odds(best_score(row, row['Wrong QID']))
        return correct_log_odds - wrong_log_odds

    if len(df) == 0:
        return float('nan')
    # Apply the log odds ratio calculation to each row
    return df.apply(log_odds_ratio, axis=1).mean()

# Mean log-odds difference between the correct and the wrong QID over the loaded results
calculate_log_odds_ratio_score(prep)

In [120]:
# Display the current `sample_ids` frame.
# NOTE(review): within this part of the notebook `sample_ids` is only loaded in a later
# cell (In [2]), so this cell appears to depend on out-of-order execution. Confirm.
sample_ids

Unnamed: 0,QID,In Wikipedia,from Evaluation,Sample 2
0,Q1316866,True,True,False
1,Q10424479,True,True,False
2,Q1624295,True,True,False
3,Q2735436,True,True,False
4,Q26439,True,True,False
...,...,...,...,...
250696,Q925653,True,True,True
250697,Q191105,True,True,True
250698,Q487125,True,True,True
250699,Q2972611,True,True,True


In [134]:
from sqlalchemy.sql import func
from tqdm import tqdm

# Number of random, non-evaluation QIDs still missing so that they reach twice the
# number of evaluation QIDs; cast to a plain int for tqdm and the comparisons below.
sample_count = int(sample_ids['from Evaluation'].sum() * 2 - (~sample_ids['from Evaluation']).sum())

# Set for O(1) membership checks (scanning the QID column for every entity is linear),
# and a list of new rows that is concatenated once at the end.
known_qids = set(sample_ids['QID'])
new_rows = []

with Session() as session:
    # Stream Wikipedia-linked IDs in random order, 1000 rows at a time
    entities = (
        session.query(WikidataID)
        .filter(WikidataID.in_wikipedia == True)
        .order_by(func.random())  # Adds random ordering
        .yield_per(1000)
    )

    # A single progress bar that counts the rows actually added
    with tqdm(total=max(sample_count, 0)) as progressbar:
        for entity in entities:
            # Checked first so nothing is added when no rows are needed
            if len(new_rows) >= sample_count:
                break
            if entity.id in known_qids:
                continue
            known_qids.add(entity.id)
            new_rows.append({
                'QID': entity.id,
                'from Evaluation': False,
                'In Wikipedia': True,
                'Sample 2': True
            })
            progressbar.update(1)

if new_rows:
    sample_ids = pd.concat([sample_ids, pd.DataFrame(new_rows)], ignore_index=True)

175126it [20:21, 143.35it/s]09 [20:21<00:00, 109.31it/s]
100%|██████████| 170409/170409 [20:21<00:00, 139.49it/s]


In [2]:
import pickle

# Optional: load the KGConv frame to recompute its in-Wikipedia flags against the sample
# with open("../data/Evaluation Data/KGConv/processed_dataframe.pkl", "rb") as f:
#     prep = pickle.load(f)

# Load the sampled IDs; the context manager closes the file once it has been read.
with open("../data/Evaluation Data/Sample IDs (EN).pkl", "rb") as f:
    sample_ids = pickle.load(f)
# Keep only the QIDs flagged as being in Wikipedia
sample_ids = sample_ids[sample_ids['In Wikipedia']]

# Set of sampled QIDs for fast membership checks
sample_qids_set = set(sample_ids['QID'].values)

# Optional: flag question/answer QIDs by membership in the sample (vectorized)
# prep['Question in Wikipedia'] = prep['Question QID'].isin(sample_qids_set)
# prep['Answer in Wikipedia'] = prep['Answer QID'].isin(sample_qids_set)
# prep

In [7]:
# Show only the rows flagged in the 'Sample 2' column
sample_ids[sample_ids['Sample 2']]

Unnamed: 0,QID,In Wikipedia,from Evaluation,Sample 2
221069,Q4790397,True,True,True
221070,Q3292,True,True,True
221071,Q13626,True,True,True
221072,Q13910,True,True,True
221073,Q1072618,True,True,True
...,...,...,...,...
422479,Q16891057,True,False,True
422480,Q5552960,True,False,True
422481,Q21152211,True,False,True
422482,Q27973969,True,False,True


In [99]:
# Add every answer QID that is in Wikipedia but not yet part of the sample.
# New rows are collected in a list and appended with a single concat; the set is
# updated as we go so a QID that answers several questions is added only once
# (and re-running the cell does not add it again).
# Assumes 'Answer QIDs' and 'Answer in Wikipedia' are parallel lists of equal length.
new_rows = []
for answer_qids, answer_flags in tqdm(zip(prep['Answer QIDs'], prep['Answer in Wikipedia']), total=len(prep)):
    for qid, in_wikipedia in zip(answer_qids, answer_flags):
        if in_wikipedia and qid not in sample_qids_set:
            sample_qids_set.add(qid)
            new_rows.append({
                'QID': qid,
                'from Evaluation': True,
                'In Wikipedia': True,
                'from Evaluation 2': True
            })

if new_rows:
    sample_ids = pd.concat([sample_ids, pd.DataFrame(new_rows)], ignore_index=True)

2610it [00:02, 1112.58it/s]


In [86]:
def remove_spans(sentence, spans, replace_with='Entity'):
    """Replace each character span of `sentence` with `replace_with`.

    Parameters
    ----------
    sentence : str
        The original text.
    spans : iterable of (start, end)
        Character offsets into the ORIGINAL sentence, end-exclusive, in any order.
    replace_with : str, optional
        Text substituted for every span.

    Returns
    -------
    str
        The sentence with every span replaced. Overlapping or duplicated spans are
        merged and replaced once; touching spans (end == next start) stay separate.
    """
    # Sort left to right and merge spans that overlap, so that each stretch of text
    # is replaced exactly once.
    merged = []
    for start, end in sorted(spans, key=lambda span: (span[0], span[1])):
        if merged and start < merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    # Rebuild the sentence from the untouched pieces between spans; slicing the
    # original string means no running offset has to be tracked.
    pieces = []
    cursor = 0
    for start, end in merged:
        pieces.append(sentence[cursor:start])
        pieces.append(replace_with)
        cursor = end
    pieces.append(sentence[cursor:])
    return ''.join(pieces)

# Mask the entity mentions of every sentence with the default placeholder
data['Sentence no entity'] = [
    remove_spans(sentence_text, entity_spans)
    for sentence_text, entity_spans in zip(data['Sentence'], data['Entity Span'])
]