In [12]:
import rdflib
from rdflib import Graph, URIRef, Literal
from rdflib import Namespace
import spacy



In [13]:
# Read the knowledge-graph dump from disk into a fresh rdflib graph.
# The file is handed to rdflib's Turtle parser (format='turtle').
graph = Graph().parse(
    'data/14_graph.nt',
    format='turtle',
)

In [31]:
import spacy
import re

# Build the small English spaCy pipeline; its NER component supplies doc.ents below.
nlp = spacy.load("en_core_web_sm")

# Run the sample question through the pipeline.
question = "who is the director of Batman-1989?"
doc = nlp(question)

# Keep the text of the last entity spaCy reports; stays "" when it reports none.
entity_texts = [span.text for span in doc.ents]
movie = entity_texts[-1] if entity_texts else ""

print("movie is", movie)
print("doc is", doc)

movie is Batman-1989
doc is who is the director of Batman-1989?


In [32]:
# Template the normalised question is matched against; group 1 captures the relation.
question_pattern = "who is the (.*) of ENTITY"

# Drop the trailing question mark before templating.
question = question.rstrip("?")

# Swap the recognised entity for the ENTITY placeholder. The substitution is
# skipped when no entity was found: an empty regex matches at every position,
# so re.sub would otherwise insert "ENTITY" between all characters.
if movie:
    # re.escape() keeps characters such as "-" or "." in the entity literal.
    question = re.sub(re.escape(movie), "ENTITY", question)

# Match the relation using the template (anchored at the start of the question).
relation_match = re.match(question_pattern, question)

# Always bind `relation`, so it is None (not undefined or stale from an earlier
# run) when the template does not match.
relation = relation_match.group(1) if relation_match else None

if relation is not None:
    print("recognized relation: {}\n".format(relation))
else:
    print("No relation recognized.")

recognized relation: director



In [10]:
import spacy
import re

# Build the small English spaCy pipeline used for entity recognition.
nlp = spacy.load("en_core_web_sm")

# Sample input: deliberately padded with spaces and capitalised differently.
question = '  Who is the director of Batman '

# Annotate the question; doc.ents holds the named entities spaCy detected.
doc = nlp(question)

# Use only the first detected entity; fall back to "" when there is none.
entity = next((span.text for span in doc.ents), "")

print("Extracted entity:", entity)

# Escape special characters in the entity so it is matched literally by the regex
escaped_entity = re.escape(entity)

# Trim surrounding whitespace BEFORE removing the question mark; otherwise a "?"
# that is followed by spaces survives rstrip("?") and leaks into the relation.
question_body = question.strip().rstrip("?")

# Replace the entity in the question with "ENTITY" (case-insensitively).
# Skipped when no entity was extracted: an empty regex matches at every
# position, so re.sub would insert "ENTITY" between all characters.
if entity:
    question_with_placeholder = re.sub(escaped_entity, "ENTITY", question_body, flags=re.IGNORECASE)
else:
    question_with_placeholder = question_body

# Normalize the question: collapse whitespace runs to one space and trim the ends
question_with_placeholder = re.sub(r'\s+', ' ', question_with_placeholder).strip()

print("\nModified question with placeholder:")
print(question_with_placeholder)

# Question templates (regex -> question kind). Whitespace between words is flexible
# and matching is case-insensitive; group 1 of each regex captures the relation text.
patterns = {
    r'\s*who\s+is\s+the\s+(.*)\s+of\s+"?ENTITY"?\s*': "who",  # e.g. "Who is the director of ENTITY?"
    r'\s*when\s+was\s+"?ENTITY"?\s+(.*)': "when",  # e.g. "When was ENTITY released?"
    r'\s*what\s+is\s+the\s+(.*)\s+of\s+"?ENTITY"?\s*': "what"  # e.g. "What is the release date of ENTITY?"
}

matched_relation = matched_pattern = None

# Try the templates in insertion order and keep the first one that matches.
for pattern, relation_type in patterns.items():
    relation_match = re.match(pattern, question_with_placeholder, flags=re.IGNORECASE)
    if relation_match is None:
        continue

    # A template containing the (.*) wildcard yields the captured text
    # (e.g. "director"); one without it falls back to its question kind.
    if "(.*)" in pattern:
        matched_relation = relation_match.group(1)
    else:
        matched_relation = relation_type
    matched_pattern = pattern
    break

# Report the outcome.
if not matched_pattern:
    print("\nNo matching pattern found.")
else:
    print(f"\nMatched Pattern: {matched_pattern}")
    print(f"Recognized relation: {matched_relation}")


Extracted entity: Batman

Modified question with placeholder:
Who is the director of ENTITY

Matched Pattern: \s*who\s+is\s+the\s+(.*)\s+of\s+"?ENTITY"?\s*
Recognized relation: director


In [11]:
import rdflib
from rdflib import URIRef
import re

# This cell needs `graph` to exist already (it raises NameError otherwise).
# It is created by parsing the data file:
# graph = rdflib.Graph().parse('data/14_graph.nt', format='turtle')

# Prefix of Wikidata entity URIs; removed from a URI when no label is available
wikidata_namespace = "http://www.wikidata.org/entity/"


def _readable_name(resource):
    """Return a display name for a URI found in ``graph``.

    Uses the resource's rdfs:label when the graph provides a non-empty one;
    otherwise returns the URI string with ``wikidata_namespace`` removed.
    """
    label = graph.value(resource, rdflib.RDFS.label)
    if label:
        return label.toPython()
    # Plain string replacement instead of re.sub: the namespace contains "."
    # characters, which a regex would treat as "any character", not a literal dot.
    return resource.toPython().replace(wikidata_namespace, "")


# Nodes are every subject plus every object that is a URI (literal objects are skipped)
entities = set(graph.subjects()) | {obj for obj in graph.objects() if isinstance(obj, URIRef)}
predicates = set(graph.predicates())  # Relations

# URI string -> display name; subjects that are not URIRefs are left out
nodes = {
    node.toPython(): _readable_name(node)
    for node in entities
    if isinstance(node, URIRef)
}

# Predicates get the same label-first treatment as nodes, so a predicate that
# carries an rdfs:label is represented by readable text rather than its URI.
# Predicates without a label keep the previous prefix-stripped URI value.
predicates_simplified = {
    predicate.toPython(): _readable_name(predicate)
    for predicate in predicates
    if isinstance(predicate, URIRef)
}

# Print the results
print("Labeled nodes: {}\n".format(nodes))
print("Simplified predicates: {}\n".format(predicates_simplified))


NameError: name 'graph' is not defined

In [57]:
# Carry the question-parsing results forward: the entity text taken from spaCy's
# doc.ents and the relation text produced by the question-pattern matching.
extracted_entity, extracted_relation = entity, matched_relation

# Use SpaCy for similarity
def find_most_similar(term, candidates):
    """Return the candidate whose spaCy similarity to ``term`` is highest.

    Args:
        term: Text to look up (e.g. an extracted entity or relation). May be
            ``None`` or blank when the extraction step found nothing.
        candidates: Iterable of candidate strings to compare against.

    Returns:
        A ``(best_candidate, similarity)`` tuple. ``(None, -1)`` is returned
        when there is nothing to compare: ``term`` is ``None``/blank or
        ``candidates`` is empty.

    Notes:
        Relies on the module-level ``nlp`` pipeline. On ties the first
        candidate encountered wins (strict ``>`` comparison).
        NOTE(review): the notebook loads "en_core_web_sm"; spaCy's docs say
        small pipelines ship without static word vectors, so these scores are
        only approximate. Consider a pipeline with vectors; confirm.
    """
    # Materialise once so generators / dict views can be checked for emptiness.
    candidates = list(candidates)

    # Nothing meaningful to score: a None term would crash inside nlp(), and a
    # blank term scores 0.0 against everything, which would make the first
    # candidate an arbitrary "best match".
    if term is None or not term.strip() or not candidates:
        return None, -1

    term_nlp = nlp(term)
    max_similarity = -1
    most_similar = None
    for candidate in candidates:
        similarity = term_nlp.similarity(nlp(candidate))
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar = candidate
    return most_similar, max_similarity

# Candidate texts: the display names gathered for graph nodes and for predicates.
node_names = nodes.values()
predicate_names = predicates_simplified.values()

# Best-scoring node for the entity, best-scoring predicate for the relation.
most_similar_node, node_similarity = find_most_similar(extracted_entity, node_names)
most_similar_predicate, predicate_similarity = find_most_similar(extracted_relation, predicate_names)

# Report both matches together with their similarity scores.
print(f"Most similar node to '{extracted_entity}': {most_similar_node} (Similarity: {node_similarity})")
print(f"Most similar predicate to '{extracted_relation}': {most_similar_predicate} (Similarity: {predicate_similarity})")

Labeled nodes: {'http://www.wikidata.org/entity/Q12339404': 'Q12339404', 'http://www.wikidata.org/entity/Q18154496': 'Q18154496', 'http://www.wikidata.org/entity/Q2555171': 'Q2555171', 'http://www.wikidata.org/entity/Q66424277': 'Q66424277', 'http://www.wikidata.org/entity/Q571480': 'Q571480', 'http://www.wikidata.org/entity/Q4271818': 'Q4271818', 'http://www.wikidata.org/entity/Q42315939': 'Q42315939', 'http://www.wikidata.org/entity/Q133622': 'Q133622', 'http://www.wikidata.org/entity/Q93437477': 'Q93437477', 'http://www.wikidata.org/entity/Q930746': 'Q930746', 'http://www.wikidata.org/entity/Q6066470': 'Q6066470', 'http://www.wikidata.org/entity/Q464964': 'Q464964', 'http://www.wikidata.org/entity/Q7358853': 'Q7358853', 'http://www.wikidata.org/entity/Q26869077': 'Q26869077', 'http://www.wikidata.org/entity/Q909602': 'Q909602', 'http://www.wikidata.org/entity/Q655296': 'Q655296', 'http://www.wikidata.org/entity/Q1036775': 'Q1036775', 'http://www.wikidata.org/entity/Q9156205': 'Q9156