In [1]:
from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import csv
import rdflib
import plotly.io as pio
pio.renderers.default = 'jupyterlab+svg'
import numpy as np
from sklearn.metrics import pairwise_distances
from speakeasypy import Speakeasy, Chatroom
from typing import List
import time
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch
from torch import nn
import re
from thefuzz import fuzz,process
import editdistance
import itertools

import jsonpickle
# NOTE: You might have to download a few things for nltk to work properly
import nltk
from nltk.corpus import wordnet as wn
from nltk import Tree

# NOTE: You might have to download the en_core_web_sm model for this to work
import spacy
nlp = spacy.load("en_core_web_sm")
from spacy import displacy

In [2]:
import os
import sys

# When the kernel is started inside a 'notebooks' directory, step up to its
# parent so the relative 'data/...' paths used below resolve.
# os.path.basename() honours the platform's path separator, whereas splitting
# the path on '/' never matches where the separator is a backslash.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('../')
os.getcwd()

'/home/claude/development/uzh__advanced_topics_in_ai'

In [3]:
# Create an empty rdflib graph and fill it from the graph file on disk.
# NOTE(review): the file has an .nt extension but is parsed with
# format='turtle' — confirm this is the intended parser.
g = rdflib.Graph()
g.parse('data/14_graph.nt', format='turtle')

<Graph identifier=Nf60bffe633ae49fd9f5d5ed050e3f115 (<class 'rdflib.graph.Graph'>)>

In [4]:
# load the embeddings (NumPy arrays stored as .npy files)
# presumably row i of each array belongs to the entity / relation whose id is i
# in the matching *_ids.del file — TODO confirm
entity_emb = np.load('data/ddis-graph-embeddings/entity_embeds.npy')
relation_emb = np.load('data/ddis-graph-embeddings/relation_embeds.npy')

In [5]:
# Build the URI <-> integer-id lookup tables. Each .del file is read as
# tab-separated rows of (id, URI); the id2* tables are the inverse mappings.
with open('data/ddis-graph-embeddings/entity_ids.del', 'r') as ifile:
    ent2id = dict(
        (str(rdflib.term.URIRef(uri)), int(index))
        for index, uri in csv.reader(ifile, delimiter='\t')
    )
id2ent = dict((index, uri) for uri, index in ent2id.items())

with open('data/ddis-graph-embeddings/relation_ids.del', 'r') as ifile:
    rel2id = dict(
        (str(rdflib.term.URIRef(uri)), int(index))
        for index, uri in csv.reader(ifile, delimiter='\t')
    )
id2rel = dict((index, uri) for uri, index in rel2id.items())

In [6]:
# Label lookups built from every rdfs:label triple in the graph.
# ent2lbl keeps one label per entity and lbl2ent one entity per label; when
# several entries share a key, the one iterated last overwrites the others.
ent2lbl = {str(ent): str(lbl) for ent, lbl in g.subject_objects(RDFS.label)}
lbl2ent = {lbl: ent for ent, lbl in ent2lbl.items()}

In [7]:
# prefixes used in the graph
WD = Namespace('http://www.wikidata.org/entity/')
WDT = Namespace('http://www.wikidata.org/prop/direct/')
SCHEMA = Namespace('http://schema.org/')
DDIS = Namespace('http://ddis.ch/atai/')
# NOTE: this rebinds RDFS, which the first cell imported from rdflib.namespace,
# to a plain Namespace object for the same base URI.
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")

In [8]:
def extract_nodes(g):
    """Map English labels to the URIs of the entities found in ``g``.

    Every URI that occurs as a subject or as an object of the graph is looked
    up with a SPARQL query asking for one English ``rdfs:label``.

    Args:
        g: Graph object providing ``subjects``, ``objects`` and ``query``.

    Returns:
        dict mapping the label text (``str``) to the entity URI as returned by
        the node's ``toPython()``. Entities without an English label are left
        out. When several entities share a label, the one visited last wins.
    """
    query = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT ?lbl WHERE {{
        <{}> rdfs:label ?lbl .
        FILTER(LANG(?lbl) = "en").
    }}
    LIMIT 1
    """

    # Only URIs can be placed between <...> in the query, so every other kind
    # of subject/object term is dropped up front.
    subject_uris = {s for s in g.subjects(unique=True) if isinstance(s, URIRef)}
    object_uris = {o for o in g.objects(unique=True) if isinstance(o, URIRef)}

    nodes = {}
    for node in subject_uris | object_uris:
        entity = node.toPython()

        # Reset for every node: carrying the label over from the previous
        # iteration would file this entity under another entity's label (or
        # raise NameError when the very first node has no English label).
        label = None
        for row in g.query(query.format(entity)):
            label = row.lbl
        if label is None:
            continue

        nodes[str(label)] = entity
    return nodes

def extract_predicates(g):
    """Map English labels to the URIs of the predicates used in ``g``.

    Each distinct predicate of the graph is looked up with a SPARQL query
    asking for one English ``rdfs:label``.

    Args:
        g: Graph object providing ``predicates`` and ``query``.

    Returns:
        dict mapping the label text (``str``) to the predicate URI as returned
        by the predicate's ``toPython()``. Predicates without an English label
        are left out. When several predicates share a label, the one visited
        last wins.
    """
    query = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT ?lbl WHERE {{
        <{}> rdfs:label ?lbl .
        FILTER(LANG(?lbl) = "en").
    }}
    LIMIT 1
    """
    predicates = {}

    for predicate in set(g.predicates(unique=True)):
        predicate_ = predicate.toPython()

        # Reset for every predicate: carrying the label over from the previous
        # iteration would file this predicate under another one's label (or
        # raise NameError when the very first predicate has no English label).
        label = None
        for row in g.query(query.format(predicate_)):
            label = row.lbl
        if label is None:
            continue

        predicates[str(label)] = predicate_

    return predicates

# make variables for the nodes and predicates path
nodes_path = 'data/processed/nodes.json'
predicates_path = 'data/processed/predicates.json'


def _load_or_build(cache_path, build):
    """Return the object cached at ``cache_path``, building and caching it if absent.

    Args:
        cache_path: Path of the jsonpickle-encoded cache file.
        build: Zero-argument callable that produces the object when no cache
            file exists yet.

    Returns:
        The decoded cache content, or the freshly built object.
    """
    if os.path.exists(cache_path):
        # NOTE: jsonpickle.decode should only be used on trusted files; these
        # caches are written by this notebook itself.
        with open(cache_path, 'r') as ifile:
            return jsonpickle.decode(ifile.read())

    # Create the target directory before the (slow) build so that a missing or
    # unwritable directory is reported up front, not after the work is done.
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    result = build()
    with open(cache_path, 'w') as ofile:
        ofile.write(jsonpickle.encode(result))
    return result


# check individually if the files exist and if so load them
nodes = _load_or_build(nodes_path, lambda: extract_nodes(g))
predicates = _load_or_build(predicates_path, lambda: extract_predicates(g))

In [178]:
# Sample natural-language questions used below to try out the question parser.
TEST_QUESTIONS = [
    "What is the genre of Good Neighbors?",
    'Who directed Apocalypse Now?',
    "Who is the director of Star Wars: Episode VI - Return of the Jedi?",
    "Who is the screenwriter of The Masked Gang: Cyprus?",
    'When was The Godfather released?',
    "Who is the producer of Inception?",
    "Who composed the soundtrack for Jurassic Park?",
    "When was Pulp Fiction released?",
    "Who played the lead role in The Matrix?",
    "Who directed Blade Runner 2049?",
    "What is the running time of The Shawshank Redemption?",
    "Who was the cinematographer for Mad Max: Fury Road?",
    "When did Titanic premiere?",
    "Who wrote the screenplay for The Social Network?",
    "What is the box office gross of Avatar?",
    "Who edited the movie Parasite?",
    "What is the budget of Halloween?",
    "Who starred as the main character in Forrest Gump?",
    "When was Interstellar first released?",
    "Who is the production designer of Dune (2021)?",
    "Who is the production designer of Dune?",
]

In [179]:
def print_tree(question):
    """Parse ``question`` with the spaCy pipeline and draw its dependency tree in the notebook."""
    displacy.render(nlp(question), jupyter=True, style='dep')

# print_tree(TEST_QUESTIONS[2])
print_tree("Who wrote the screenplay for The Social Network?")

In [180]:
def build_entity_phrase__rec(entity, preps_to_split, entities=None):
    """Collect ``entity`` and its descendants, stopping at split prepositions.

    The tree below ``entity`` is walked depth-first through ``.children``. A
    child whose ``dep_`` is ``'prep'`` and which is listed in
    ``preps_to_split`` is skipped together with everything below it.

    Args:
        entity: Node exposing ``children`` (and ``dep_`` on each child).
        preps_to_split: Preposition nodes whose subtrees must be left out.
        entities: Accumulator used by the recursion. Leave it unset to start a
            fresh collection; a list passed in is extended in place.

    Returns:
        The accumulator list holding the collected nodes in visiting order.
    """
    # A list written as the default value is created once and then shared by
    # every call, so nodes from earlier calls would pile up in it. Start each
    # top-level call with its own list instead.
    if entities is None:
        entities = []

    if entity not in entities:
        entities.append(entity)
    for child in entity.children:
        if child.dep_ == 'prep' and child in preps_to_split:
            continue
        entities = build_entity_phrase__rec(child, preps_to_split, entities)
    return entities

def build_entity_phrase(entity, preps_to_split):
    """Return the text of the phrase headed by ``entity``.

    The phrase is made of the nodes gathered by ``build_entity_phrase__rec``
    (i.e. without the subtrees of the prepositions in ``preps_to_split``),
    joined with single spaces. A leading lowercase ``'the'`` is dropped.
    """
    kept = build_entity_phrase__rec(entity, preps_to_split)

    # Iterate over the subtree, not over `kept`, so the words appear in the
    # order entity.subtree yields them and not in traversal order.
    words = [tok for tok in entity.subtree if tok in kept]

    if words[0].text == 'the':
        del words[0]

    return ' '.join(tok.text for tok in words)

def check_for_child_prep_pobj(child, entity_nodes, preps_to_split):
    """Follow ``prep -> pobj`` chains hanging below ``child``.

    Every child of ``child`` with ``dep_ == 'prep'`` is appended to
    ``preps_to_split``. Each of that preposition's children with
    ``dep_ == 'pobj'`` is appended to ``entity_nodes`` and then searched in
    the same way. Both lists are modified in place and also returned.

    Returns:
        The tuple ``(entity_nodes, preps_to_split)``.
    """
    for candidate in child.children:
        if candidate.dep_ != 'prep':
            continue
        preps_to_split.append(candidate)

        for target in candidate.children:
            if target.dep_ != 'pobj':
                continue
            entity_nodes.append(target)
            # Nested prepositional phrases can hang off the object itself.
            entity_nodes, preps_to_split = check_for_child_prep_pobj(
                target, entity_nodes, preps_to_split
            )
    return entity_nodes, preps_to_split

In [183]:
# Question fed to parse_question() at the end of this cell; uncomment exactly
# one of the alternatives to try a different sentence structure.
# question = "Who is the director of Star Wars: Episode VI - Return of the Jedi?"   #1  ***
# question = "Who directed Apocalypse Now?"                                         #2  ***
# question = "Who directed the movie Apocalypse Now?"                               #2b ***
# question = "When was The Godfather released?"                                     #3  ***
# question = "Who composed the soundtrack for Jurassic Park?"                       #4  ***
# question = "Who played the lead role in The Matrix?"                              #5  ***
# question = "What is the running time of The Lord of the Rings?"                   #6  ***
question = "Who starred as the main character in Forrest Gump?"                   #7  ***
# question = "When was Interstellar first released?"                                #8  ***
# question = "What is the budget of Halloween?"                                     #9  ***
# question = "Who wrote the screenplay for The Social Network?"                     #10 ***


def parse_question(question):
    """Extract the relation and entity phrases from a question.

    Only the first sentence of ``question`` is analysed, starting from the
    root of its dependency parse:

    * root tagged ``AUX``: every ``nsubj`` child of the root is taken as a
      phrase head;
    * root tagged ``VERB``: the ``dobj``, ``nsubjpass`` and non-pronoun
      ``nsubj`` children are taken as phrase heads, as are the ``pobj``
      children of the root's ``prep`` children.

    Below every phrase head, ``prep -> pobj`` chains contribute further phrase
    heads, and those prepositions mark where one phrase ends and the next one
    starts.

    Args:
        question: The question text.

    Returns:
        list of ``str``: the root's text first when the root is a ``VERB``,
        followed by one phrase per phrase head in the order they were found.
        Empty when the text contains no sentence or the root is neither
        ``AUX`` nor ``VERB`` with matching children.
    """
    doc = nlp(question)
    sentences = list(doc.sents)
    if not sentences:
        # Nothing to analyse (e.g. an empty question): report no phrases
        # instead of failing on the first-sentence lookup.
        return []

    root = sentences[0].root
    root_type = root.pos_

    entity_nodes = []
    preps_to_split = []

    if root_type == 'AUX':
        for child in root.children:
            if child.dep_ == 'nsubj':
                entity_nodes.append(child)
                entity_nodes, preps_to_split = check_for_child_prep_pobj(child, entity_nodes, preps_to_split)

    elif root_type == 'VERB':
        for child in root.children:
            dep = child.dep_

            if dep == 'prep':
                # Preposition attached directly to the verb: its objects are
                # phrase heads and the preposition itself is a split point.
                preps_to_split.append(child)
                for subchild in child.children:
                    if subchild.dep_ == 'pobj':
                        entity_nodes.append(subchild)
                        entity_nodes, preps_to_split = check_for_child_prep_pobj(subchild, entity_nodes, preps_to_split)

            # A pronoun subject is not kept as a phrase head.
            elif dep in ('dobj', 'nsubjpass') or (dep == 'nsubj' and child.pos_ != 'PRON'):
                entity_nodes.append(child)
                entity_nodes, preps_to_split = check_for_child_prep_pobj(child, entity_nodes, preps_to_split)

    entities = []
    if root_type == 'VERB':
        entities.append(root.text)
    for node in entity_nodes:
        entities.append(build_entity_phrase(node, preps_to_split))

    return entities

# Show the phrases extracted from the selected question.
print(parse_question(question))
# NOTE(review): the tree drawn here is for TEST_QUESTIONS[1], not for
# `question` — confirm which sentence was meant to be visualised.
print_tree(TEST_QUESTIONS[1])

['starred', 'main character', 'Forrest Gump']


In [182]:
# Run the parser over every sample question, printing the extracted phrases
# (tab-indented) under the question they belong to.
# NOTE: the loop rebinds the module-level name `question`.
for question in TEST_QUESTIONS:
    print(question)
    print('\t', parse_question(question))

What is the genre of Good Neighbors?
	 ['genre', 'Good Neighbors']
Who directed Apocalypse Now?
	 ['directed', 'Apocalypse']
Who is the director of Star Wars: Episode VI - Return of the Jedi?
	 ['director', 'Star Wars : Episode VI - Return of the Jedi']
Who is the screenwriter of The Masked Gang: Cyprus?
	 ['screenwriter', 'The Masked Gang : Cyprus']
When was The Godfather released?
	 ['released', 'The Godfather']
Who is the producer of Inception?
	 ['producer', 'Inception']
Who composed the soundtrack for Jurassic Park?
	 ['composed', 'soundtrack', 'Jurassic Park']
When was Pulp Fiction released?
	 ['released', 'Pulp Fiction']
Who played the lead role in The Matrix?
	 ['played', 'lead role', 'The Matrix']
Who directed Blade Runner 2049?
	 ['directed', 'Blade Runner 2049']
What is the running time of The Shawshank Redemption?
	 ['running time', 'The Shawshank Redemption']
Who was the cinematographer for Mad Max: Fury Road?
	 ['cinematographer', 'Mad Max : Fury Road']
When did Titanic p