In [304]:
from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import csv
import rdflib
import plotly.io as pio
pio.renderers.default = 'jupyterlab+svg'
import numpy as np
from sklearn.metrics import pairwise_distances
from speakeasypy import Speakeasy, Chatroom
from typing import List
import time
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch
from torch import nn
import re
from thefuzz import fuzz,process
import editdistance
import itertools

import jsonpickle
# NOTE: You might have to download a few things for nltk to work properly
import nltk
from nltk.corpus import wordnet as wn
from nltk import Tree

# NOTE: You might have to download the en_core_web_sm model for this to work
import spacy
nlp = spacy.load("en_core_web_sm")
from spacy import displacy

In [2]:
import os
import sys

# When the kernel is started from inside a `notebooks` folder, step up one
# level so the relative `data/...` paths used in the cells below resolve.
# `os.path.basename` replaces the manual split on '/' so the check also works
# with the platform's native path separator.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir(os.pardir)
os.getcwd()

'/home/claude/development/uzh__advanced_topics_in_ai'

In [3]:
# Create an empty graph and fill it from `data/14_graph.nt`.
# NOTE(review): the file has an `.nt` extension but is read with the Turtle
# parser (format='turtle') — presumably intentional, confirm.
g = rdflib.Graph()
g.parse('data/14_graph.nt', format='turtle')

<Graph identifier=Nf60bffe633ae49fd9f5d5ed050e3f115 (<class 'rdflib.graph.Graph'>)>

In [4]:
# load the embeddings
# NOTE(review): presumably one row per entity / relation, addressed by the
# integer ids read from the `.del` files in the following cell — confirm.
entity_emb = np.load('data/ddis-graph-embeddings/entity_embeds.npy')
relation_emb = np.load('data/ddis-graph-embeddings/relation_embeds.npy')

In [5]:
# Read the IRI <-> integer-id lookup tables that sit next to the embeddings.
# Every row of a `.del` file is tab-separated: the id first, the IRI second.
with open('data/ddis-graph-embeddings/entity_ids.del', 'r') as ifile:
    ent2id = dict(
        (str(rdflib.term.URIRef(ent)), int(idx))
        for idx, ent in csv.reader(ifile, delimiter='\t')
    )
# Reverse direction: id -> IRI.
id2ent = dict(zip(ent2id.values(), ent2id.keys()))

with open('data/ddis-graph-embeddings/relation_ids.del', 'r') as ifile:
    rel2id = dict(
        (str(rdflib.term.URIRef(rel)), int(idx))
        for idx, rel in csv.reader(ifile, delimiter='\t')
    )
id2rel = dict(zip(rel2id.values(), rel2id.keys()))

In [6]:
# Lookup tables built from every rdfs:label triple of the graph.
# A key that occurs more than once keeps the value seen last.
ent2lbl = dict((str(ent), str(lbl)) for ent, lbl in g.subject_objects(RDFS.label))
lbl2ent = dict(zip(ent2lbl.values(), ent2lbl.keys()))

In [7]:
# prefixes used in the graph
WD = Namespace('http://www.wikidata.org/entity/')
WDT = Namespace('http://www.wikidata.org/prop/direct/')
SCHEMA = Namespace('http://schema.org/')
DDIS = Namespace('http://ddis.ch/atai/')
# NOTE: this rebinds the name `RDFS` that the first cell imported from
# `rdflib.namespace`; from here on `RDFS` refers to this plain `Namespace`.
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")

In [8]:
def extract_nodes(g):
    """Map the English label of every IRI node in the graph to its IRI.

    Args:
        g: graph object providing ``subjects``, ``objects`` and ``query``
            (an ``rdflib.Graph`` in this notebook).

    Returns:
        dict: ``{label: iri}`` with both sides as plain strings. Nodes without
        an English ``rdfs:label`` are left out. When several nodes share a
        label, the one visited last wins (set iteration order, i.e. arbitrary).
    """
    query = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT ?lbl WHERE {{
        <{}> rdfs:label ?lbl .
        FILTER(LANG(?lbl) = "en").
    }}
    LIMIT 1
    """

    nodes = {}
    # All subjects plus those objects that are IRIs (literal objects are not nodes).
    graph_entities = set(g.subjects(unique=True)) | {o for o in g.objects(unique=True) if isinstance(o, URIRef)}
    for node in graph_entities:
        # Subjects may still contain non-IRI terms (presumably blank nodes); skip them.
        if not isinstance(node, URIRef):
            continue
        entity = node.toPython()

        # Reset the label for every node. Previously a node without an English
        # label silently inherited the label of the node handled before it
        # (or raised NameError when it was the very first node).
        label = None
        for row in g.query(query.format(entity)):
            label = row.lbl
        if label is None:
            continue

        nodes[str(label)] = entity
    return nodes

def extract_predicates(g):
    """Map the English label of every predicate used in the graph to its IRI.

    Args:
        g: graph object providing ``predicates`` and ``query``
            (an ``rdflib.Graph`` in this notebook).

    Returns:
        dict: ``{label: iri}`` with both sides as plain strings. Predicates
        without an English ``rdfs:label`` are left out. When several
        predicates share a label, the one visited last wins (set iteration
        order, i.e. arbitrary).
    """
    query = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT ?lbl WHERE {{
        <{}> rdfs:label ?lbl .
        FILTER(LANG(?lbl) = "en").
    }}
    LIMIT 1
    """
    predicates = {}

    graph_predicates = set(g.predicates(unique=True))
    for predicate in graph_predicates:
        predicate_ = predicate.toPython()

        # Reset the label for every predicate. Previously a predicate without
        # an English label silently inherited the label of the predicate
        # handled before it (or raised NameError when it was the first one).
        label = None
        for row in g.query(query.format(predicate_)):
            label = row.lbl
        if label is None:
            continue

        predicates[str(label)] = predicate_

    return predicates

# make variables for the nodes and predicates path
nodes_path = 'data/processed/nodes.json'
predicates_path = 'data/processed/predicates.json'


def _load_or_build(cache_path, build):
    """Return the object cached at ``cache_path``; build and cache it when missing.

    Args:
        cache_path: location of the jsonpickle-encoded cache file.
        build: zero-argument callable producing the object on a cache miss.
    """
    if os.path.exists(cache_path):
        # NOTE: jsonpickle may instantiate arbitrary objects while decoding,
        # so only point this at cache files written by this notebook.
        with open(cache_path, 'r') as ifile:
            return jsonpickle.decode(ifile.read())

    result = build()
    # Create the target folder on first use; previously a missing
    # `data/processed/` directory made the write fail after the extraction
    # had already run.
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(cache_path, 'w') as ofile:
        ofile.write(jsonpickle.encode(result))
    return result


# check individually if the files exist and if so load them
nodes = _load_or_build(nodes_path, lambda: extract_nodes(g))
predicates = _load_or_build(predicates_path, lambda: extract_predicates(g))

In [178]:
# Sample questions used by the cells below to exercise `parse_question` and
# `answer_factual`. The last two entries differ only in the "(2021)" suffix.
TEST_QUESTIONS = [
    "What is the genre of Good Neighbors?",
    'Who directed Apocalypse Now?',
    "Who is the director of Star Wars: Episode VI - Return of the Jedi?",
    "Who is the screenwriter of The Masked Gang: Cyprus?",
    'When was The Godfather released?',
    "Who is the producer of Inception?",
    "Who composed the soundtrack for Jurassic Park?",
    "When was Pulp Fiction released?",
    "Who played the lead role in The Matrix?",
    "Who directed Blade Runner 2049?",
    "What is the running time of The Shawshank Redemption?",
    "Who was the cinematographer for Mad Max: Fury Road?",
    "When did Titanic premiere?",
    "Who wrote the screenplay for The Social Network?",
    "What is the box office gross of Avatar?",
    "Who edited the movie Parasite?",
    "What is the budget of Halloween?",
    "Who starred as the main character in Forrest Gump?",
    "When was Interstellar first released?",
    "Who is the production designer of Dune (2021)?",
    "Who is the production designer of Dune?",
]

In [480]:
def print_tree(question):
    """Render the dependency parse of ``question`` inline in the notebook."""
    displacy.render(nlp(question), style='dep', jupyter=True)

# print_tree(TEST_QUESTIONS[2])
# print_tree("Who wrote the screenplay for The Social Network?")

In [384]:
def build_entity_phrase__rec(entity, preps_to_split, entities=None):
    """Collect ``entity`` and its descendants, stopping at split prepositions.

    Args:
        entity: token-like object exposing ``children`` and ``dep_``.
        preps_to_split: preposition tokens whose subtrees must NOT be followed.
        entities: accumulator list used by the recursion. A fresh list is
            created when omitted; a list passed in is extended in place.

    Returns:
        list: the accumulator, holding ``entity`` and every reachable
        descendant exactly once, in depth-first (parent before child) order.
    """
    # A mutable default (`entities=list()`) is created once and shared by every
    # call, so tokens from earlier calls leaked into later results and the list
    # grew forever. Allocate a new accumulator per top-level call instead.
    if entities is None:
        entities = []

    if entity not in entities:
        entities.append(entity)
    for child in entity.children:
        # A preposition marked for splitting starts a separate phrase.
        if child.dep_ == 'prep' and child in preps_to_split:
            continue
        entities = build_entity_phrase__rec(child, preps_to_split, entities)
    return entities

def build_entity_phrase(entity, preps_to_split):
    """Turn an entity token into the text of the phrase it heads.

    The tokens gathered by ``build_entity_phrase__rec`` are emitted in the
    order ``entity.subtree`` yields them. A leading lowercase ``'the'`` is
    dropped, the remaining token texts are joined with single spaces, and
    every ``' :'`` is collapsed to ``':'``.
    """
    kept = build_entity_phrase__rec(entity, preps_to_split)

    # Walk the subtree so the phrase follows subtree order rather than the
    # depth-first order in which the tokens were collected.
    ordered = [token for token in entity.subtree if token in kept]

    if ordered[0].text == 'the':
        del ordered[0]

    phrase = ' '.join(token.text for token in ordered)
    return phrase.replace(' :', ':')

def check_for_child_prep_pobj(child, entity_nodes, preps_to_split):
    """Follow the ``prep`` -> ``pobj`` chains hanging off ``child``.

    Every direct child of ``child`` with dependency ``'prep'`` is appended to
    ``preps_to_split``. Every ``'pobj'`` child of such a preposition is
    appended to ``entity_nodes`` and then explored the same way. Both lists
    are modified in place and also returned as ``(entity_nodes, preps_to_split)``.
    """
    prepositions = (token for token in child.children if token.dep_ == 'prep')
    for preposition in prepositions:
        preps_to_split.append(preposition)

        for candidate in preposition.children:
            if candidate.dep_ != 'pobj':
                continue
            entity_nodes.append(candidate)
            # The recursion extends the very same list objects.
            check_for_child_prep_pobj(candidate, entity_nodes, preps_to_split)

    return entity_nodes, preps_to_split

In [385]:
# question = "Who is the director of Star Wars: Episode VI - Return of the Jedi?"   #1  ***
# question = "Who directed Apocalypse Now?"                                         #2  ***
# question = "Who directed the movie Apocalypse Now?"                               #2b ***
# Active sample question; it is passed to `parse_question` by the print at the
# end of this cell. The commented-out lines around it are alternative inputs.
question = "When was The Godfather released?"                                     #3  ***
# question = "Who composed the soundtrack for Jurassic Park?"                       #4  ***
# question = "Who played the lead role in The Matrix?"                              #5  ***
# question = "What is the running time of The Lord of the Rings?"                   #6  ***
# question = "Who starred as the main character in Forrest Gump?"                   #7  ***
# question = "When was Interstellar first released?"                                #8  ***
# question = "What is the budget of Halloween?"                                     #9  ***
# question = "Who wrote the screenplay for The Social Network?"                     #10 ***


def parse_question(question):
    """Extract candidate entity / relation phrases from a question.

    Only the first sentence of ``question`` is analysed. Depending on the
    part of speech of the sentence root:

    * ``AUX`` root: every ``nsubj`` child is an entity head.
    * ``VERB`` root: ``dobj`` and ``nsubjpass`` children, non-pronoun
      ``nsubj`` children, and the ``pobj`` of every ``prep`` child are entity
      heads; the root verb itself is recorded with type ``'VERB'``.

    Prepositional objects below an entity head are collected as separate
    entities via ``check_for_child_prep_pobj``.

    Args:
        question: the question text.

    Returns:
        dict: ``{phrase: {'type': 'VERB' | None, 'matches': []}}``. Empty
        when the question contains no tokens or the root is neither AUX nor
        VERB with matching children.
    """
    doc = nlp(question)
    # An empty question has no sentence to take a root from; previously
    # `list(doc.sents)[0]` raised IndexError here.
    if len(doc) == 0:
        return {}
    sent = next(iter(doc.sents))

    root = sent.root
    root_type = root.pos_

    entity_nodes = []
    preps_to_split = []

    def collect(token):
        """Record ``token`` as an entity head and descend into its prep/pobj chain."""
        entity_nodes.append(token)
        # Extends `entity_nodes` / `preps_to_split` in place.
        check_for_child_prep_pobj(token, entity_nodes, preps_to_split)

    if root_type == 'AUX':
        for child in root.children:
            if child.dep_ == 'nsubj':
                collect(child)

    elif root_type == 'VERB':
        for child in root.children:
            dep = child.dep_
            if dep in ('dobj', 'nsubjpass'):
                collect(child)
            elif dep == 'nsubj':
                # Skip pronoun subjects such as the question word itself.
                if child.pos_ != 'PRON':
                    collect(child)
            elif dep == 'prep':
                preps_to_split.append(child)
                for subchild in child.children:
                    if subchild.dep_ == 'pobj':
                        collect(subchild)

    entities = dict()
    if root_type == 'VERB':
        entities[root.text] = { 'type': 'VERB', 'matches': [] }
    for node in entity_nodes:
        phrase = build_entity_phrase(node, preps_to_split)
        entities[phrase] = { 'type': None, 'matches': [] }

    return entities

# Show the phrases extracted from the question selected above.
print(parse_question(question))
# NOTE(review): the tree is rendered for TEST_QUESTIONS[1], not for `question`
# — confirm this is intended.
print_tree(TEST_QUESTIONS[1])

{'released': {'type': 'VERB', 'matches': []}, 'The Godfather': {'type': None, 'matches': []}}


In [479]:
# Run the parser over every sample question and print the extracted phrases.
# Note: the loop rebinds the module-level name `question`.
for question in TEST_QUESTIONS:
    print(question)
    print('\t', parse_question(question))

What is the genre of Good Neighbors?
	 {'genre': {'type': None, 'matches': []}, 'Good Neighbors': {'type': None, 'matches': []}}
Who directed Apocalypse Now?
	 {'directed': {'type': 'VERB', 'matches': []}, 'Apocalypse': {'type': None, 'matches': []}}
Who is the director of Star Wars: Episode VI - Return of the Jedi?
	 {'director': {'type': None, 'matches': []}, 'Star Wars: Episode VI - Return of the Jedi': {'type': None, 'matches': []}}
Who is the screenwriter of The Masked Gang: Cyprus?
	 {'screenwriter': {'type': None, 'matches': []}, 'The Masked Gang: Cyprus': {'type': None, 'matches': []}}
When was The Godfather released?
	 {'released': {'type': 'VERB', 'matches': []}, 'The Godfather': {'type': None, 'matches': []}}
Who is the producer of Inception?
	 {'producer': {'type': None, 'matches': []}, 'Inception': {'type': None, 'matches': []}}
Who composed the soundtrack for Jurassic Park?
	 {'composed': {'type': 'VERB', 'matches': []}, 'soundtrack': {'type': None, 'matches': []}, 'Juras

In [478]:
# Part-of-speech tags used as `from_pos` / `to_pos` arguments of `convert`
# below, which forwards `from_pos` to `wn.synsets(..., pos=...)` and compares
# both against the second dot-separated field of synset names.
WN_NOUN = 'n'
WN_VERB = 'v'
WN_ADJECTIVE = 'a'
WN_ADJECTIVE_SATELLITE = 's'
WN_ADVERB = 'r'


def convert(input, from_pos, to_pos):
    """Transform words given from/to POS tags.

    ``input`` is split on single spaces and each word is handled in turn:

    * A word without any ``from_pos`` synset is kept verbatim: it starts the
      result with probability 1.0, or is appended to every phrase built so far.
    * Otherwise the lemmas of its synsets are collected. When ``from_pos`` is
      the noun tag these lemmas themselves are the candidates; for any other
      tag the candidates are their derivationally related forms whose synset
      has POS ``to_pos``. The adjective and adjective-satellite tags are
      treated as equivalent. Each distinct candidate gets the probability
      ``occurrences / total candidates`` and is combined with every phrase
      built so far (probabilities multiplied).

    Args:
        input: one or more space-separated words.
        from_pos: WordNet POS tag of the input words.
        to_pos: WordNet POS tag of the desired output words.

    Returns:
        list[tuple[str, float]]: ``(phrase, probability)`` pairs sorted by
        descending probability, ties broken alphabetically.
    """
    adjective_tags = (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE)

    def has_pos(synset, pos):
        """True when ``synset`` carries ``pos`` (consider 'a' and 's' equivalent)."""
        tag = synset.name().split('.')[1]
        return tag == pos or (pos in adjective_tags and tag in adjective_tags)

    words = []
    for word in input.split(" "):
        synsets = wn.synsets(word, pos=from_pos)

        # Word not found
        if not synsets:
            if len(words) == 0:
                words.append((word, 1.0))
            else:
                words = [(w + " " + word, p) for w, p in words]
            continue

        # Get all lemmas of the word
        lemmas = [l for s in synsets if has_pos(s, from_pos) for l in s.lemmas()]

        if from_pos == WN_NOUN:
            # Noun input: the lemmas themselves are the candidates.
            related_lemmas = list(lemmas)
        else:
            # Get related forms and keep only those with the desired POS
            related_lemmas = [
                related
                for l in lemmas
                for related in l.derivationally_related_forms()
                if has_pos(related.synset(), to_pos)
            ]

        # Extract the words from the lemmas and weight them by frequency
        names = [l.name() for l in related_lemmas]
        candidates = [(w, float(names.count(w)) / len(names)) for w in set(names)]

        # Take all the combinations for synonyms of different words
        # Build the result in the form of a list containing tuples (word, probability)
        if len(words) == 0:
            words = candidates
        else:
            words = [(w_b + " " + w_t, p_b * p_t) for w_b, p_b in words for w_t, p_t in candidates]

    # Return all the possibilities sorted by probability. Sorting once at the
    # end also covers single-word input, which previously came back in set
    # iteration order and so could vary between runs.
    words.sort(key=lambda w: (-w[1], w[0]))
    return words

# sorted(convert('played', WN_VERB, WN_NOUN), key=lambda x: -x[1])

In [477]:
def label_query(item_iri):
    """Build the SPARQL query selecting one English ``rdfs:label`` of ``item_iri``.

    The label is bound to ``?lbl``.
    """
    return f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

    SELECT ?lbl WHERE {{
        <{item_iri}> rdfs:label ?lbl .
        FILTER(LANG(?lbl) = "en").
    }}
    LIMIT 1
    """

def who_query(item_iri, predicate_iri):
    """Build the SPARQL query answering a "who" question.

    Follows ``predicate_iri`` from ``item_iri`` to a resource and selects one
    English ``rdfs:label`` of that resource, bound to ``?query``.

    Args:
        item_iri: IRI of the item the question is about.
        predicate_iri: IRI of the relation linking the item to the answer.

    Returns:
        str: the SPARQL query text.
    """
    # The language filter mirrors `label_query` and `what_query__with_label`.
    # Without it, `LIMIT 1` could return the label in an arbitrary language
    # when a resource carries labels in several languages.
    return f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

    SELECT ?query WHERE {{
        <{item_iri}> <{predicate_iri}> ?person .
        ?person rdfs:label ?query .
        FILTER(LANG(?query) = "en").
    }}
    LIMIT 1
    """

def when_query(item_iri, predicate_iri):
    """Build the SPARQL query answering a "when" question.

    Selects one value reached from ``item_iri`` via ``predicate_iri``, bound
    to ``?query``.
    """
    return f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

    SELECT ?query WHERE {{
        <{item_iri}> <{predicate_iri}> ?query .
    }}
    LIMIT 1
    """

def what_query(item_iri, predicate_iri):
    """Build the SPARQL query answering a "what" question with a direct value.

    Selects one value reached from ``item_iri`` via ``predicate_iri``, bound
    to ``?query``.
    """
    return f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

    SELECT ?query WHERE {{
        <{item_iri}> <{predicate_iri}> ?query .
    }}
    LIMIT 1
    """

def what_query__with_label(item_iri, predicate_iri):
    """Build the SPARQL query answering a "what" question with a labelled resource.

    Follows ``predicate_iri`` from ``item_iri`` to a resource and selects one
    English ``rdfs:label`` of that resource, bound to ``?query``.
    """
    return f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

    SELECT ?query WHERE {{
        <{item_iri}> <{predicate_iri}> ?item .
        ?item rdfs:label ?query .
        FILTER(LANG(?query) = "en").
    }}
    LIMIT 1
    """

In [475]:
def answer_factual(question):
    """Try to answer a factual who/when/what question from the knowledge graph.

    Pipeline:
      1. Detect the question word (``who`` > ``when`` > ``what``).
      2. Extract candidate phrases with ``parse_question``.
      3. Look every phrase up in the ``nodes`` / ``predicates`` label tables;
         verbs are first turned into noun forms with ``convert``.
      4. Build one query per (item, predicate) pair for the question type.
      5. Run the queries until the first usable answer is found.

    The intermediate matches and the outcome are printed; nothing is returned.

    Args:
        question: the question text.
    """
    question_words = question.lower().split(' ')
    question_type = next(
        (word for word in ('who', 'when', 'what') if word in question_words),
        'unknown',
    )

    # get all possible entities from parsing the question
    # ============================================================
    #
    parsed_dict = parse_question(question)

    # look up any possible match in the predicates/nodes
    # ============================================================
    #
    def lookup_item(label, nodes, predicates):
        """Return the IRIs registered for ``label`` as a node and/or predicate."""
        matches = []
        if label in nodes:
            matches.append(nodes[label])
        if label in predicates:
            matches.append(predicates[label])
        return matches

    for entity, info in parsed_dict.items():
        # in case of verbs, we want to check for synonyms, e.g.
        # "played" -> "actor"
        #
        if info['type'] == 'VERB':
            # check synonyms based on noun form of the verb
            noun_forms = convert(entity, WN_VERB, WN_NOUN)

            if question_type == 'when':
                # also try "<noun> date" for every noun form
                noun_forms.extend([(f"{noun[0]} date", 0) for noun in noun_forms])

            candidate_synonyms = [form for form, _ in noun_forms if form in nodes]
            candidate_synonyms.append(entity)

            matches = []
            for candidate in candidate_synonyms:
                # special case: the noun form 'star' is also looked up as 'cast member'
                if candidate == 'star':
                    matches.extend(lookup_item('cast member', nodes, predicates))
                matches.extend(lookup_item(candidate, nodes, predicates))
        else:
            matches = info['matches'] + lookup_item(entity, nodes, predicates)

        # drop duplicate IRIs
        info['matches'] = list(set(matches))
    print(parsed_dict)

    # build query based on question word
    # ============================================================
    #
    possible_predicates = set()
    possible_items = set()

    for info in parsed_dict.values():
        for match in info['matches']:
            identifier = match.split('/')[-1]

            # identifiers starting with 'P' are treated as predicates,
            # those starting with 'Q' as items
            if identifier.startswith('P'):
                possible_predicates.add(f"http://www.wikidata.org/prop/direct/{identifier}")
            elif identifier.startswith('Q'):
                possible_items.add(f"http://www.wikidata.org/entity/{identifier}")
    print(f"Identified Items: {possible_items}")
    print(f"Identified Predicates: {possible_predicates}")

    # Build possible queries
    # ============================================================
    #
    builders_by_type = {
        'who': (who_query,),
        'when': (when_query,),
        'what': (what_query, what_query__with_label),
    }
    builders = builders_by_type.get(question_type)
    if builders is None:
        print('UNKNOWN QUESTION TYPE')
        # Previously `queries` stayed unbound here and the loop below raised
        # NameError; with no builders the query list is simply empty.
        builders = ()

    queries = [
        build(item, predicate)
        for item in possible_items
        for predicate in possible_predicates
        for build in builders
    ]

    # Execute queries
    # ============================================================
    #
    query_answered = False
    for query in queries:
        for row in g.query(query):
            result = row.query

            # when/what answers must be literal values, not resources
            if question_type in ('when', 'what') and not isinstance(result, Literal):
                continue

            print(f"Answer: {result} (from Graph)")
            query_answered = True
            break

        # Stop at the first answer. The inner `break` alone only left the row
        # loop, so the remaining queries were still executed and could print
        # additional answers.
        if query_answered:
            break

    if not query_answered:
        print("Could not find answer in graph")

In [476]:
# Run the full answering pipeline on every sample question; `answer_factual`
# prints its intermediate matches and the answer itself.
# Note: the loop rebinds the module-level name `question`.
for question in TEST_QUESTIONS:
    print(question)
    answer_factual(question)
    print('')

What is the genre of Good Neighbors?
{'genre': {'type': None, 'matches': ['http://www.wikidata.org/prop/direct/P136', 'http://www.wikidata.org/entity/Q483394']}, 'Good Neighbors': {'type': None, 'matches': ['http://www.wikidata.org/entity/Q3110682']}}
Identified Items: {'http://www.wikidata.org/entity/Q3110682', 'http://www.wikidata.org/entity/Q483394'}
Identified Predicates: {'http://www.wikidata.org/prop/direct/P136'}
Answer: art film (from Graph)

Who directed Apocalypse Now?
{'directed': {'type': 'VERB', 'matches': ['http://www.wikidata.org/entity/Q1162163', 'http://www.wikidata.org/entity/Q43229', 'http://www.wikidata.org/entity/Q81096', 'http://www.wikidata.org/prop/direct/P57', 'http://www.wikidata.org/entity/Q1251441']}, 'Apocalypse': {'type': None, 'matches': ['http://www.wikidata.org/entity/Q60964254']}}
Identified Items: {'http://www.wikidata.org/entity/Q1162163', 'http://www.wikidata.org/entity/Q60964254', 'http://www.wikidata.org/entity/Q43229', 'http://www.wikidata.org/en