In [19]:
#!pip install spacy
!python -m spacy download en

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m97.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [27]:
#import librairies
import spacy
import csv
from SPARQLWrapper import SPARQLWrapper, JSON
from urllib.parse import quote
import json
import re


try:
    nlp = spacy.load("en_core_web_sm")  # Or "en_core_web_trf" or any other model
except OSError:
    print("Please download a spaCy model (e.g., 'python -m spacy download en_core_web_sm')")
    exit()


In [22]:
#function to extract entities from the dataset
def extract_entities(question):
    """
    Extracts entities from a question using spaCy.

    Args:
        question (str): The question string.

    Returns:
        list: A list of extracted entities (strings).
    """
    doc = nlp(question)
    entities = []
    for ent in doc.ents:
        entities.append(ent.text)
    return entities

In [23]:
#function to query the KG given an entity
def query_sparql_endpoint(endpoint, entity):
    """Queries a SPARQL endpoint for triples involving an entity."""

    sparql = SPARQLWrapper(endpoint)
    sparql.setReturnFormat(JSON)

    query = """SELECT?s?p?o 
WHERE {
  { <"""+entity+""">?p?o }
  UNION
  {?s?p <"""+entity+"""> }

} LIMIT 100"""


    query = """SELECT ?s ?p ?o 
WHERE {
  {
    { SELECT ?s ?p ?o WHERE { VALUES ?s { <"""+entity+"""> } ?s ?p ?o FILTER(?p != <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> && ?p!= <http://www.w3.org/2000/01/rdf-schema#label> && ?p!= <http://www.w3.org/2002/07/owl#sameAs> && ?p!= <http://dbpedia.org/property/wikiPageUsesTemplate> && ?p!= <http://dbpedia.org/ontology/almaMater> && ?p!= <http://dbpedia.org/ontology/wikiPageExternalLink> && ?p!=<http://dbpedia.org/ontology/wikiPageWikiLink> && ?p!= <http://www.w3.org/2000/01/rdf-schema#comment>) } LIMIT 100 }
  } UNION 
  { 
    { SELECT ?s ?p ?o WHERE { VALUES ?o { <"""+entity+"""> } ?s ?p ?o FILTER(?p != <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> && ?p!= <http://www.w3.org/2000/01/rdf-schema#label> && ?p!= <http://www.w3.org/2002/07/owl#sameAs> && ?p!= <http://dbpedia.org/property/wikiPageUsesTemplate> && ?p!= <http://dbpedia.org/ontology/almaMater> && ?p!= <http://dbpedia.org/ontology/wikiPageExternalLink> && ?p!=<http://dbpedia.org/ontology/wikiPageWikiLink> && ?p!= <http://www.w3.org/2000/01/rdf-schema#comment>) } LIMIT 100 }
  } 
} """

    #print(query)  # Debugging: Print the query

    sparql.setQuery(query)

    triples = []

    try:
        results = sparql.query().convert()
        for result in results["results"]["bindings"]:
            if "s" in result and "p" in result and "o" in result:
                triples.append(f'{result["s"]["value"]}\t{result["p"]["value"]}\t{result["o"]["value"]}')
        return triples

    except Exception as e:
        print(f"Error querying SPARQL endpoint: {e}")
        return []




In [24]:
#get the uri of a given entity
def find_dbpedia_uri(keyword):
    """
    Finds the URI of a DBpedia entity based on a keyword search.

    Args:
        keyword: The keyword to search for.

    Returns:
        The URI of the DBpedia entity, or None if no matching entity is found.
    """

    #endpoint = "https://dbpedia.org/sparql"
    endpoint = "https://dbpedia.data.dice-research.org/sparql"
    #query = """SELECT DISTINCT ?s WHERE { { ?s <http://www.w3.org/2000/01/rdf-schema#label> """+keyword+""" } UNION { ?s <http://www.w3.org/2000/01/rdf-schema#label> """+keyword+"""@en } FILTER (!CONTAINS(STR(?s), "Category:")) }"""
    query = f"""
        SELECT DISTINCT ?s 
        WHERE {{ 
            {{ ?s <http://www.w3.org/2000/01/rdf-schema#label> "{keyword}" }} 
            UNION 
            {{ ?s <http://www.w3.org/2000/01/rdf-schema#label> "{keyword}"@en }} 
        FILTER (!CONTAINS(STR(?s), "Category:")) 
        }}
    """
    sparql = SPARQLWrapper(endpoint)
    
    sparql.setReturnFormat(JSON)

    sparql.setQuery(query)

    try:
        results = sparql.query().convert()
        if results["results"]["bindings"]:  # Check if any results were found
            return results["results"]["bindings"][0]["s"]["value"]  # Return the URI of the first result
        else:
            return None  # No matching entity found
    except Exception as e:
        print(f"Error: {e}")
        return None


In [25]:
#Example Usage
#endpoint = "https://dbpedia.org/sparql"
endpoint = "https://dbpedia.data.dice-research.org/sparql"
#entity1 = "http://dbpedia.org/resource/Albert_Einstein"
#entity2 = "http://dbpedia.org/resource/Cornell_University"
import pandas as pd

df = pd.read_csv("MLaKE/dataset/single_hop/en_qa.csv")

all_triples = {} 

remove_rows = []

for i, keyword in enumerate(df["question"]):
    #for keyword2 in df["e_answer"]:
    #print(keyword1)
    #break
    if keyword not in all_triples:
        entity1 = extract_entities(keyword)
        for i in entity1:
            entities = find_dbpedia_uri(i)
            #print(i, entities)
        
            #entity2 = find_dbpedia_uri(keyword2)
        #print(entity1)
        #break
    
            if entities is None:
                remove_rows.append(i)
                continue
            
            if entities and keyword not in all_triples:
                result = query_sparql_endpoint(endpoint, entities)
                all_triples[keyword] = result

    
with open("./all_triples/"+"all_triples_dict1.json", "w") as f:
    json.dump(all_triples, f)

with open("./all_triples/"+"removed_rows1.json", "w") as f:
    json.dump(remove_rows, f)



Error: QueryBadFormed: A bad request has been sent to the endpoint: probably the SPARQL query is badly formed. 

Response:
b"An error occurred during query evaluation: Parse Error: mismatched input 'M' expecting '}'. At line: 4 and position: 64"


In [26]:
#save the triples
with open( "./all_triples/triples1.txt",'w', encoding='utf-8') as r:
    
    for keyword, triples in all_triples.items():
        for t in triples:
#         print(triple)
            r.write(t + "\n")

In [28]:
#process the triples
def process_triples_file(input_file, output_file):
    """
    Reads a file containing triples (s\tp\to), extracts the last part of each link,
    and writes the processed triples to a new file.

    Args:
        input_file (str): Path to the input file.
        output_file (str): Path to the output file.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as infile, \
             open(output_file, 'w', encoding='utf-8') as outfile:

            for line in infile:
                line = line.strip()  # Remove leading/trailing whitespace
                if not line:  # Skip empty lines
                    continue

                # Split by tab or space using regular expressions
                parts = re.split(r'\t| |  ', line)
                #parts = re.split(r'\t| ', line)

                if len(parts) != 3:
                    print(f"Warning: Invalid triple format in line: '{line}'")
                    continue  # Skip lines with incorrect number of parts

                s, p, o = parts

                # Extract the last part of each link
                s_last = s.split('/')[-1]
                p_last = p.split('/')[-1]
                o_last = o.split('/')[-1]

                # Write the processed triple to the output file
                outfile.write(f"{s_last}\t{p_last}\t{o_last}\n")

        print(f"Processed triples saved to '{output_file}'.")

        print(f"Processed triples saved to '{output_file}'.")

    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
    except ValueError:
        print(f"Error: Invalid triple format in '{input_file}'.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [18]:
input_file = './all_triples/triples1.txt'  # Replace with your input file path
output_file = './all_triples/processed_triples1.txt' # Replace with your desired output file name

process_triples_file(input_file, output_file)



IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

