## 1. Data Load


In [8]:
from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import csv
import rdflib
import plotly.io as pio
pio.renderers.default = 'jupyterlab+svg'
import numpy as np
from sklearn.metrics import pairwise_distances
from speakeasypy import Speakeasy, Chatroom
from typing import List
import time
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch
from torch import nn
import re
from thefuzz import fuzz,process
import editdistance
import itertools


import jsonpickle
# NOTE: You might have to download a few things for nltk to work properly
import nltk
from nltk.corpus import wordnet as wn
from nltk import Tree

# NOTE: You might have to download the en_core_web_sm model for this to work
import spacy
from spacy import displacy
from spacy_wordnet.wordnet_annotator import WordnetAnnotator 
import os

In [2]:
g = rdflib.Graph()
g.parse('data/14_graph.nt', format='turtle')

<Graph identifier=Nfdb5d78e2e35459ebd7175b8f8527723 (<class 'rdflib.graph.Graph'>)>

In [3]:
# load the embeddings
entity_emb = np.load('data/ddis-graph-embeddings/entity_embeds.npy')
relation_emb = np.load('data/ddis-graph-embeddings/relation_embeds.npy')

In [4]:
# load the dictionaries
with open('data/ddis-graph-embeddings/entity_ids.del', 'r') as ifile:
    ent2id = {str(rdflib.term.URIRef(ent)): int(idx) for idx, ent in csv.reader(ifile, delimiter='\t')}
    id2ent = {v: k for k, v in ent2id.items()}
with open('data/ddis-graph-embeddings/relation_ids.del', 'r') as ifile:
    rel2id = {str(rdflib.term.URIRef(rel)): int(idx) for idx, rel in csv.reader(ifile, delimiter='\t')}
    id2rel = {v: k for k, v in rel2id.items()}

In [5]:
ent2lbl = {str(ent): str(lbl) for ent, lbl in g.subject_objects(RDFS.label)}
lbl2ent = {lbl: ent for ent, lbl in ent2lbl.items()}

In [6]:
# prefixes used in the graph
WD = Namespace('http://www.wikidata.org/entity/')
WDT = Namespace('http://www.wikidata.org/prop/direct/')
SCHEMA = Namespace('http://schema.org/')
DDIS = Namespace('http://ddis.ch/atai/')
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")

## Evaluation 2

### NER model

In [7]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

ner = pipeline("ner", model=model, tokenizer=tokenizer)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [9]:
def extract_nodes(g):
    nodes = {}
    query ="""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 

    SELECT ?lbl WHERE {{
        <{}> rdfs:label ?lbl .
        FILTER(LANG(?lbl) = "en").
    }}
    LIMIT 1
    """

    graph_entities = set(g.subjects(unique=True)) | {s for s in g.objects(unique=True) if isinstance(s, URIRef)}
    for node in graph_entities:
        entity = node.toPython()
        if isinstance(node, URIRef):            
            qres = g.query(query.format(entity))
            for row in qres:
                answer = row.lbl
            
            nodes[str(answer)] = entity
    return nodes

def extract_predicates(g):
    query ="""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 

    SELECT ?lbl WHERE {{
        <{}> rdfs:label ?lbl .
        FILTER(LANG(?lbl) = "en").
    }}
    LIMIT 1
    """
    predicates = {}

    graph_predicates = set(g.predicates(unique=True))
    for predicate in graph_predicates:
        predicate_ = predicate.toPython()       
        qres = g.query(query.format(predicate_))
        for row in qres:
            answer = row.lbl
        
        predicates[str(answer)] = predicate_

    return predicates


# make variables for the nodes and predicates path
nodes_path = '/Users/melihserin/Desktop/ATAI/dataset/processed/nodes.json'
predicates_path = '/Users/melihserin/Desktop/ATAI/dataset/processed/predicates.json'

# check indiviudally if the files exist and if so load them
if os.path.exists(nodes_path):
    with open(nodes_path, 'r') as ifile:
        nodes = jsonpickle.decode(ifile.read())
else:
    nodes = extract_nodes(g)
    with open(nodes_path, 'w') as ofile:
        ofile.write(jsonpickle.encode(nodes))

if os.path.exists(predicates_path):
    with open(predicates_path, 'r') as ifile:
        predicates = jsonpickle.decode(ifile.read())
else:
    predicates = extract_predicates(g)
    with open(predicates_path, 'w') as ofile:
        ofile.write(jsonpickle.encode(predicates))

In [302]:
class Chatbot_ner(nn.Module):
    def __init__(self):
        super().__init__()
        self.ner = ner
        self.factual_question_patterns = [
            "who is the (.*) of ENTITY",
            "who was the (.*) of ENTITY",
            "who was the (.*) for ENTITY",
            "who was the (.*) in ENTITY",
            "what is the (.*) of ENTITY",
            "who (.*) ENTITY",
            "who wrote the (.*) of ENTITY",
            "who wrote the (.*) for ENTITY",
            "when was ENTITY (.*)",
            # "when did ENTITY (.*)",
            "where was ENTITY (.*)",
            "where is ENTITY (.*)"
        ]
        self.nodes = nodes
        self.predicates = predicates
        self.entity_emb = entity_emb
        self.relation_emb = relation_emb
        self.ent2id = ent2id
        self.rel2id = rel2id
        self.ent2lbl = ent2lbl
        self.lbl2ent = lbl2ent
        self.id2ent = id2ent
        self.WN_NOUN = 'n'
        self.WN_VERB = 'v'
        self.WN_ADJECTIVE = 'a'
        self.WN_ADJECTIVE_SATELLITE = 's'
        self.WN_ADVERB = 'r'
    
    def entity_extraction(self,ner_results,example):
        entity = ""
        entity_list = []
        reset=0
        for entity_num in range(len(ner_results)):
            if (ner_results[entity_num]["word"].find("#") ==-1) & (reset!=0):
                entity = entity + " " + ner_results[entity_num]["word"]
                reset +=1
            else:
                entity = entity + ner_results[entity_num]["word"].replace("#","")
                reset +=1
            
            if (entity_num < len(ner_results)-1):
                if (ner_results[entity_num+1]["start"] - ner_results[entity_num]["end"] > 3):
                    entity_list.append(entity)
                    reset=0
                    entity = ""
                    continue
            else:
                entity_list.append(entity)
                reset=0
                continue
        given_entity_flawed = entity_list
        for n,entity in enumerate(entity_list):
            entity = entity.replace("?","").strip()
            if len(entity.split(" "))>1:
                try:
                    first_word = entity.split(" ")[0]
                    last_word = entity.split(" ")[-1]
                    search_str = first_word + "(.+?)" + last_word
                    entity_list[n] = re.search(search_str,example).group(0)
                except:
                    ent=""
                    for w in entity.split(" "):
                        ent+= w + " "
                    entity_list[n] = ent
            else:
                continue
        return entity_list,given_entity_flawed
    
    def preprocessing_before_ner(self,question):
        try:
            question_new=re.sub(re.search("(.*?)of",question).group(0), re.search("(.*?)of",question).group(0).lower() ,question)
        except:
            words_question = question.split(" ")
            words_question[0] =words_question[0].lower()
            question_new = ""
            for word in words_question:
                question_new += word + " "
        return question_new
    
    def preprocessing_before_patterndetection(self,question):
        if "the movie" in question:
            question=question.replace("the movie ","")
        return question
    
    def preprocessing(self,question):
        if "the movie" in question:
            question=question.replace("the movie ","")
        if "the" in question:
            index_the = [idx for idx,word in enumerate(question.split(" ")) if word=="the"]
            words_between = ""
            for i in range(1,index_the[0]):
                words_between += question.split(" ")[i] + " "
            question=question.replace(words_between.strip(),"...")
        
        if "when" in question.lower():
            tmp_words = ["was","were","is","are","did","do","does","have","has"]
            if question.split(" ")[1] in tmp_words:
                question=question.replace(question.split(" ")[1],"...")
        return question.replace("?","").lower()

    # which pattern is used in the given question?
    def pattern_detection(self,ner_results,example):
        entities_extracted,given_entity_flawed = self.entity_extraction(ner_results,example)
        matched_entity,_= self.match_things(self.nodes, entities_extracted[0])
        pattern_and_entity = [[re.sub("ENTITY",matched_entity, pattern),matched_entity] for pattern in self.factual_question_patterns]
        example_updated = re.sub(given_entity_flawed[0].replace("?","").strip(),matched_entity, example)
        pattern_entity_included = [lists[0] for lists in pattern_and_entity]
        entity_from_pattern_and_entity = list(dict.fromkeys([lists[1] for lists in pattern_and_entity]))


        question_pattern = process.extract(self.preprocessing_before_patterndetection(example),pattern_entity_included,scorer=fuzz.ratio)[0][0]
        question_pattern_ = [re.sub(value,"ENTITY",question_pattern) for value in entity_from_pattern_and_entity if question_pattern.find(value)!=-1][0]

        index = [num for num,value in enumerate(self.factual_question_patterns) if value==question_pattern_][0]

        return question_pattern,index,example_updated

    def relation_extraction(self,ner_results,example):
        question_pattern, index,example_updated = self.pattern_detection(ner_results,example)
        relation = re.match(self.preprocessing(question_pattern), self.preprocessing(example_updated)).group(1)
        if len(relation.split(" "))==1 and (wn.synsets(relation)[0].pos() == self.WN_VERB):
            relations = [synonym for synonym,score in self.convert(relation, self.WN_VERB,self.WN_NOUN)]
            return self.match_relations(relations,relation)
        else:
            return relation # take care of directed, released, etc. cases
    
    def convert(self,input, from_pos, to_pos):    
        """ Transform words given from/to POS tags """
        words,temp_word_list=[],[]
        for index,word in enumerate(input.split(" ")):
            synsets = wn.synsets(word, pos=from_pos)

            # Word not found
            if not synsets:
                if len(words)==0:
                    words.append((word,1.0))
                else:
                    words =[(w+" "+word, p) for w,p in words]
            else:
                # Get all lemmas of the word (consider 'a'and 's' equivalent)
                lemmas = []
                for s in synsets:
                    for l in s.lemmas():
                        if s.name().split('.')[1] == from_pos or from_pos in (self.WN_ADJECTIVE, self.WN_ADJECTIVE_SATELLITE) and s.name().split('.')[1] in (self.WN_ADJECTIVE, self.WN_ADJECTIVE_SATELLITE):
                            lemmas += [l]

                # Get related forms
                derivationally_related_forms = [(l, l.derivationally_related_forms()) for l in lemmas]
                # filter only the desired pos (consider 'a' and 's' equivalent)
                related_noun_lemmas = []

                for drf in derivationally_related_forms:
                    if from_pos == "n":
                        related_noun_lemmas += [drf[0]]
                    else:
                        for l in drf[1]:
                            if l.synset().name().split('.')[1] == to_pos or to_pos in (self.WN_ADJECTIVE, self.WN_ADJECTIVE_SATELLITE) and l.synset().name().split('.')[1] in (self.WN_ADJECTIVE, self.WN_ADJECTIVE_SATELLITE):
                                related_noun_lemmas += [l]

                # Extract the words from the lemmas
                temp_word_list=[l.name() for l in related_noun_lemmas]
                temp_word_list = [(w, float(temp_word_list.count(w)) / len(temp_word_list)) for w in set(temp_word_list)]

                # Take all the combinations for synonyms of different words
                # Build the result in the form of a list containing tuples (word, probability)
                if len(words)==0:
                    words=temp_word_list
                else:
                    words =[(w_b+" "+w_t, p_b*p_t) for w_b,p_b in words for w_t,p_t in temp_word_list]
                    words.sort(key=lambda w:-w[1])

        # return all the possibilities sorted by probability
        return words    
    
    def match_things(self,dict, input):
        tmp = 9999
        match_key = ""
        match_value = ""
        for key, value in dict.items():
            if editdistance.eval(key.lower(), input) < tmp:
                tmp = editdistance.eval(key.lower(), input)
                match_key = key
                match_value = value
        
        return match_key,match_value

    def match_relations(self, inputs,relation):
        tmp = 9999
        match_key = ""
        for input in inputs:
            if editdistance.eval(relation.lower(), input) < tmp:
                tmp = editdistance.eval(relation.lower(), input)
                match_key = input
        return match_key
        
    def final_query(self,matched_entity,matched_entity_url,matched_predicate,matched_predicate_url):
        query_option1 ="""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 

        SELECT ?lbl WHERE {{
            <{}> <{}> ?answer.
            ?answer rdfs:label ?lbl .
            FILTER(LANG(?lbl) = "en").
        }}
        LIMIT 1
        """.format(matched_entity_url,matched_predicate_url)

        query_option2 ="""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 

        SELECT ?lbl WHERE {{
            ?answer <{}> <{}>.
            ?answer rdfs:label ?lbl .
            FILTER(LANG(?lbl) = "en").
        }}
        LIMIT 1
        """.format(matched_predicate_url,matched_entity_url)

        qres1 = g.query(query_option1)
        qres2 = g.query(query_option2)

        answer = ""
        try:
            for row in qres1:
                answer = row.lbl
        except answer == "":
            for row in qres2:
                answer = row.lbl 


        if answer == "":
            try:
                answer1, answer2, answer3 = self.final_embed(matched_entity_url,matched_predicate_url)    
                return f"According to the embeddings, the {matched_predicate} of {matched_entity} is {answer1}, {answer2}, {answer3}." 
            except:  
                return "Sorry, I could not find the answer. Can you please rephrase the question?"
        else:
            # answer1, answer2, answer3 = self.final_embed(matched_entity_url,matched_predicate_url)    
            return f"""According to the the graph, the {matched_predicate} of {matched_entity} is {answer}."""
    
    def final_embed(self,matched_entity_url,matched_predicate_url):
        head = self.entity_emb[self.ent2id[matched_entity_url]]
        pred = self.relation_emb[self.rel2id[matched_predicate_url]]
        # add vectors according to TransE scoring function.
        lhs = head + pred
        # compute distance to *any* entity
        dist = pairwise_distances(lhs.reshape(1, -1), entity_emb).reshape(-1)
        # find most plausible entities
        most_likely = dist.argsort()
        # compute ranks of entities
        ranks = dist.argsort().argsort()

        most_plausible_3_answers = [(str(self.id2ent[idx]), self.ent2lbl[self.id2ent[idx]])
            for rank, idx in enumerate(most_likely[:3])]
        
        answer1, answer2, answer3 = most_plausible_3_answers[0][1],most_plausible_3_answers[1][1],most_plausible_3_answers[2][1]
        return answer1, answer2, answer3

    def forward(self,input):
        ner_results = self.ner(self.preprocessing_before_ner(input))
        entities,_ = self.entity_extraction(ner_results,input)
        entity = entities[0]
        relation = self.relation_extraction(ner_results,input)
        
        matched_entity, matched_entity_url= self.match_things(self.nodes, entity)
        matched_predicate, matched_predicate_url= self.match_things(self.predicates,relation)

        output = self.final_query(matched_entity,matched_entity_url,matched_predicate,matched_predicate_url)
        return output
            

In [303]:
chatbot = Chatbot_ner()
chatbot("Who is the director of Good Neighbors?")

'According to the the graph, the director of Good Neighbors is Jacob Tierney.'

In [58]:
from decouple import config
DEFAULT_HOST_URL = config("UZH_SPEAKEASY_HOST")
listen_freq = 2
chatbot = Chatbot_ner()

class Agent:
    def __init__(self, username, password):
        self.username = username
        # Initialize the Speakeasy Python framework and login.
        self.speakeasy = Speakeasy(host=DEFAULT_HOST_URL, username=username, password=password)
        self.speakeasy.login()  # This framework will help you log out automatically when the program terminates.

    def listen(self):
        while True:
            # only check active chatrooms (i.e., remaining_time > 0) if active=True.
            rooms: List[Chatroom] = self.speakeasy.get_rooms(active=True)
            for room in rooms:
                if not room.initiated:
                    # send a welcome message if room is not initiated
                    room.post_messages(f'Hello! And Gruetzig, Merhaba, Bonjour! How can I help you today?')
                    room.initiated = True
                # Retrieve messages from this chat room.
                # If only_partner=True, it filters out messages sent by the current bot.
                # If only_new=True, it filters out messages that have already been marked as processed.
                for message in room.get_messages(only_partner=True, only_new=True):
                    print(
                        f"\t- Chatroom {room.room_id} "
                        f"- new message #{message.ordinal}: '{message.message}' "
                        f"- {self.get_time()}")

                    # Implement your agent here #
                    if (message.message.lower() == "hi") | (message.message.lower() == "hello"):
                        answer='Hello! And Gruetzig, Merhaba, Bonjour! How can I help you today?'
                    else:
                        try:
                            answer = chatbot(message.message)
                        except:
                            answer = "Sorry :( I could not understand you. Can you rephrase your question?"
                    # Send a message to the corresponding chat room using the post_messages method of the room object.
                    room.post_messages(f"{answer}")
                    # Mark the message as processed, so it will be filtered out when retrieving new messages.
                    room.mark_as_processed(message)

                # Retrieve reactions from this chat room.
                # If only_new=True, it filters out reactions that have already been marked as processed.
                for reaction in room.get_reactions(only_new=True):
                    print(
                        f"\t- Chatroom {room.room_id} "
                        f"- new reaction #{reaction.message_ordinal}: '{reaction.type}' "
                        f"- {self.get_time()}")

                    # Implement your agent here #

                    room.post_messages(f"Received your reaction: '{reaction.type}' ")
                    room.mark_as_processed(reaction)

            time.sleep(listen_freq)

    @staticmethod
    def get_time():
        return time.strftime("%H:%M:%S, %d-%m-%Y", time.localtime())


In [None]:
demo_bot = Agent(config("UZH_BOT_USERNAME"), config("UZH_BOT_PASSWORD"))
demo_bot.listen()

### nltk / spaCy model

In [None]:
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("spacy_wordnet", after='tagger')

In [91]:
TEST_QUESTIONS = [
    "What is the genre of Good Neighbors?",
    'Who directed Apocalypse Now?',
    "Who is the director of Star Wars Episode VI - Return of the Jedi?",
    "Who is the screenwriter of The Masked Gang: Cyprus?",
    'When was The Godfather released?',
    "Who is the producer of Inception?",
    "Who composed the soundtrack for Jurassic Park?",
    "When was Pulp Fiction released?",
    "Who played the lead role in The Matrix?",
    "Who directed Blade Runner 2049?",
    "What is the running time of The Shawshank Redemption?",
    "Who was the cinematographer for Mad Max: Fury Road?",
    "When did Titanic premiere?",
    "Who wrote the screenplay for The Social Network?",
    "What is the box office gross of Avatar?",
    "Who edited the movie Parasite?",
    "What is the budget of Halloween?",
    "Who starred as the main character in Forrest Gump?",
    "When was Interstellar first released?",
    "Who is the production designer of Dune (2021)?",
    "Who is the production designer of Dune?",
]

In [294]:
def print_tree(question):
    doc = nlp(question)
    displacy.render(doc, style='dep', jupyter=True)

In [92]:
def build_entity_phrase__rec(entity, preps_to_split, entities=list()):
    if entity not in entities:
        entities.append(entity)
    for child in entity.children:
        if child.dep_ == 'prep' and child in preps_to_split:
            continue
        entities = build_entity_phrase__rec(child, preps_to_split, entities)
    return entities

def build_entity_phrase(entity, preps_to_split):
    entities = build_entity_phrase__rec(entity, preps_to_split)

    tmp = []
    # print(list(entity.subtree))
    for token in entity.subtree:
        if token in entities:
            tmp.append(token)
    
    if tmp[0].text == 'the':
        tmp = tmp[1:]

    return ' '.join([token.text for token in tmp]).replace(' :', ':')

def check_for_child_prep_pobj(child, entity_nodes, preps_to_split):
    for subchild in child.children:
        if subchild.dep_ == 'prep':
            preps_to_split.append(subchild)

            for subsubchild in subchild.children:
                if subsubchild.dep_ == 'pobj':
                    entity_nodes.append(subsubchild)
                    entity_nodes, preps_to_split = check_for_child_prep_pobj(subsubchild, entity_nodes, preps_to_split)
    return entity_nodes, preps_to_split

In [93]:
def parse_question(question):
    doc = nlp(question)
    sent = list(doc.sents)[0]

    root_type = sent.root.pos_
    # print(f"Root Type: {root_type}")

    entity_nodes = []
    preps_to_split = []


    if root_type == 'AUX':
        for child in sent.root.children:
            if child.dep_ == 'nsubj':
                entity_nodes.append(child)
                entity_nodes, preps_to_split = check_for_child_prep_pobj(child, entity_nodes, preps_to_split)


    elif root_type == 'VERB':
        for child in sent.root.children:
            if child.dep_ == 'dobj':
                entity_nodes.append(child)
                entity_nodes, preps_to_split = check_for_child_prep_pobj(child, entity_nodes, preps_to_split)

            elif child.dep_ == 'prep':
                preps_to_split.append(child)

                for subchild in child.children:
                    if subchild.dep_ == 'pobj':
                        entity_nodes.append(subchild)
                        entity_nodes, preps_to_split = check_for_child_prep_pobj(subchild, entity_nodes, preps_to_split)

            elif child.dep_ == 'nsubjpass':
                entity_nodes.append(child)
                entity_nodes, preps_to_split = check_for_child_prep_pobj(child, entity_nodes, preps_to_split)
            
            elif child.dep_ == 'nsubj':
                if child.pos_ != 'PRON':
                    entity_nodes.append(child)
                    entity_nodes, preps_to_split = check_for_child_prep_pobj(child, entity_nodes, preps_to_split)


    # print(entity_nodes)
    entities = dict()
    if root_type == 'VERB':
        entities[sent.root.text] = { 'type': 'VERB', 'matches': [] }
    for node in entity_nodes:
        phrase = build_entity_phrase(node, preps_to_split)
        entities[phrase] = { 'type': None, 'matches': [] }
            
    
    return entities

In [94]:
WN_NOUN = 'n'
WN_VERB = 'v'
WN_ADJECTIVE = 'a'
WN_ADJECTIVE_SATELLITE = 's'
WN_ADVERB = 'r'


def convert(input, from_pos, to_pos):    
    """ Transform words given from/to POS tags """
    words,temp_word_list=[],[]
    for index,word in enumerate(input.split(" ")):
        synsets = wn.synsets(word, pos=from_pos)

        # Word not found
        if not synsets:
            if len(words)==0:
                words.append((word,1.0))
            else:
                words =[(w+" "+word, p) for w,p in words]
        else:
            # Get all lemmas of the word (consider 'a'and 's' equivalent)
            lemmas = []
            for s in synsets:
                for l in s.lemmas():
                    if s.name().split('.')[1] == from_pos or from_pos in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE) and s.name().split('.')[1] in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE):
                        lemmas += [l]

            # Get related forms
            derivationally_related_forms = [(l, l.derivationally_related_forms()) for l in lemmas]
            # filter only the desired pos (consider 'a' and 's' equivalent)
            related_noun_lemmas = []

            for drf in derivationally_related_forms:
                if from_pos == "n":
                    related_noun_lemmas += [drf[0]]
                else:
                    for l in drf[1]:
                        if l.synset().name().split('.')[1] == to_pos or to_pos in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE) and l.synset().name().split('.')[1] in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE):
                            related_noun_lemmas += [l]

            # Extract the words from the lemmas
            temp_word_list=[l.name() for l in related_noun_lemmas]
            temp_word_list = [(w, float(temp_word_list.count(w)) / len(temp_word_list)) for w in set(temp_word_list)]

            # Take all the combinations for synonyms of different words
            # Build the result in the form of a list containing tuples (word, probability)
            if len(words)==0:
                words=temp_word_list
            else:
                words =[(w_b+" "+w_t, p_b*p_t) for w_b,p_b in words for w_t,p_t in temp_word_list]
                words.sort(key=lambda w:-w[1])

    # return all the possibilities sorted by probability
    return words

# sorted(convert('played', WN_VERB, WN_NOUN), key=lambda x: -x[1])

In [95]:
def label_query(item_iri):
    return """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

    SELECT ?lbl WHERE {{
        <{}> rdfs:label ?lbl .
        FILTER(LANG(?lbl) = "en").
    }}
    LIMIT 1
    """.format(item_iri)

def who_query(item_iri, predicate_iri):
    return """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

    SELECT ?query WHERE {{
        <{}> <{}> ?person .
        ?person rdfs:label ?query .
    }}
    LIMIT 1
    """.format(item_iri, predicate_iri)

def when_query(item_iri, predicate_iri):
    return """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

    SELECT ?query WHERE {{
        <{}> <{}> ?query .
    }}
    LIMIT 1
    """.format(item_iri, predicate_iri)

def what_query(item_iri, predicate_iri):
    return """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

    SELECT ?query WHERE {{
        <{}> <{}> ?query .
    }}
    LIMIT 1
    """.format(item_iri, predicate_iri)

def what_query__with_label(item_iri, predicate_iri):
    return """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

    SELECT ?query WHERE {{
        <{}> <{}> ?item .
        ?item rdfs:label ?query .
        FILTER(LANG(?query) = "en").
    }}
    LIMIT 1
    """.format(item_iri, predicate_iri)

In [102]:
def answer_factual(question):
    if 'who' in question.lower().split(' '):
        question_type = 'who'
    elif 'when' in question.lower().split(' '):
        question_type = 'when'
    elif 'what' in question.lower().split(' '):
        question_type = 'what'
    else:
        question_type = 'unknown'

    # get all possible entities from parsing the question
    # ============================================================
    #
    parsed_dict = parse_question(question)
    # print(parsed_dict)

    # look up any possible match in the predicates/nodes
    # ============================================================
    #
    def lookup_item(label, nodes, predicates):
        matches = []
        if label in nodes.keys():
            matches.append(nodes[label])
        if label in predicates.keys():
            matches.append(predicates[label])
        return matches

    for entity in parsed_dict.keys():
        # in case of verbs, we want to check for synonyms, e.g.
        # "played" -> "actor"
        #
        if parsed_dict[entity]['type'] == 'VERB':
            # check synonyms based on noun form of the verb
            noun_forms = convert(entity, WN_VERB, WN_NOUN)

            if question_type == 'when':
                noun_forms.extend([(f"{noun[0]} date",0) for noun in noun_forms])
                # print(noun_forms)

            candidate_synonyms = list(filter(lambda x: x in nodes.keys(), [x[0] for x in noun_forms]))
            candidate_synonyms.append(entity)

            tmp = []
            for candidate in candidate_synonyms:
                if candidate == 'star':
                    tmp.extend(lookup_item('cast member', nodes, predicates))
                tmp.extend(lookup_item(candidate, nodes, predicates))
                
            parsed_dict[entity]['matches'] = tmp

        else:
            parsed_dict[entity]['matches'].extend(lookup_item(entity, nodes, predicates))

        parsed_dict[entity]['matches'] = list(set(parsed_dict[entity]['matches']))
    # print(parsed_dict)

    # build query based on question word
    # ============================================================
    #
    possible_predicates = set()
    possible_items = set()

    for phrase in parsed_dict.keys():
        for match in parsed_dict[phrase]['matches']:
            identifier = match.split('/')[-1]

            if identifier.startswith('P'):
                possible_predicates.add(f"http://www.wikidata.org/prop/direct/{identifier}")
            elif identifier.startswith('Q'):
                possible_items.add(f"http://www.wikidata.org/entity/{identifier}")
    # print(f"Identified Items: {possible_items}")
    # print(f"Identified Predicates: {possible_predicates}")
    
    # Build possible queries
    # ============================================================
    #
    if question_type == 'who':
        queries = []
        for item in possible_items:
            for predicate in possible_predicates:
                queries.append(who_query(item, predicate))

    elif question_type == 'when':
        queries = []
        for item in possible_items:
            for predicate in possible_predicates:
                queries.append(when_query(item, predicate))

    elif question_type == 'what':
        queries = []
        for item in possible_items:
            for predicate in possible_predicates:
                queries.append(what_query(item, predicate))
                queries.append(what_query__with_label(item, predicate))

    # else:
        # print('UNKNOWN QUESTION TYPE')


    # Execute queries
    # ============================================================
    #
    query_answered = False
    for query in queries:
        # print(query)
        res = g.query(query)
        
        if len(res) == 0:
            continue

        for row in res:
            result = row.query
            
            if question_type in ['when', 'what']:
                if not isinstance(result, Literal):
                    continue

            # print(type(result))
            if result is not None:
                query_answered = True
                return f"Answer: {result} (from Graph)"

    if not query_answered:
        return "Could not find answer in graph"

### Hybrid Model

In [304]:
def hybrid_model_chatbot(input):
    output = answer_factual(input)
    if output == "Could not find answer in graph":
        try:
            output = chatbot(input)
        except:
            output="Sorry, I could not find the answer. Can you please rephrase the question?"
    
    return output

In [305]:
dict_outputs = {}
for question in TEST_QUESTIONS:
    dict_outputs[question] = hybrid_model_chatbot(question)

In [306]:
dict_outputs

{'What is the genre of Good Neighbors?': 'Answer: art film (from Graph)',
 'Who directed Apocalypse Now?': 'According to the the graph, the director of Apocalypse Now is Francis Ford Coppola.',
 'Who is the director of Star Wars Episode VI - Return of the Jedi?': 'According to the embeddings, the director of Star Wars Episode VI: Return of the Jedi is George Lucas, Anthony Daniels, Ellis Rubin.',
 'Who is the screenwriter of The Masked Gang: Cyprus?': 'According to the embeddings, the screenwriter of The Masked Gang: Cyprus is Cengiz Küçükayvaz, Murat Aslan, Melih Ekener.',
 'When was The Godfather released?': 'Answer: 1972-03-15 (from Graph)',
 'Who is the producer of Inception?': 'According to the embeddings, the presenter of Inception is Satomi Ishihara, Michael Aspel, Heino Ferch.',
 'Who composed the soundtrack for Jurassic Park?': 'According to the embeddings, the country of Jurassic Park is Jurassic Park, Jurassic World: Dominion, Jurassic Park.',
 'When was Pulp Fiction release

### arge

In [472]:
# def match_things(dict, input):
#     tmp = 9999
#     match_key = ""
#     match_value = ""
#     for key, value in dict.items():
#         if editdistance.eval(key, input) < tmp:
#             tmp = editdistance.eval(key.lower(), input)
#             match_key = key
#             match_value = value
    
#     return match_key,match_value

In [483]:
# def match_things(dict, input):
#     input_list = input.split(" ")
#     tmp = np.inf
#     tmpp= [len(word) for word in input_list]
#     match_key = ""
#     match_value = ""
#     for key, value in dict.items():
#         if editdistance.eval(key, input) < tmp:
#             tmp = editdistance.eval(key, input)
#             key_list = key.split(" ")
#             len_input_list, len_key = len(input_list), len(key_list)
#             is_input_list_longer = (len_input_list>len_key)
#             index_input_list = list(range(len_input_list)) + [len_input_list-1]*(len_key-len_input_list)*(not is_input_list_longer)
#             index_key = list(range(len_key)) + [len_key-1]*(len_input_list-len_key)*(is_input_list_longer)
#             word_wise_comparison = [editdistance.eval(key_list[k], input_list[i]) for i,k in zip(index_input_list,index_key)]
#             if len_input_list == len(word_wise_comparison):
#                 bool_update = [(tmpp[i] > word_wise_comparison[i]) for i in range(len_input_list)]
#             else:
#                 bool_update = [(tmpp[i] > word_wise_comparison[k]) for i,k in zip(index_input_list,index_key)]
#             if sum(bool_update)>0:
#                 if len_input_list == len(word_wise_comparison):
#                     tmpp = word_wise_comparison
#                 else:
#                     tmpp = word_wise_comparison[:len_input_list]
#                     tmpp[len_input_list-1] += sum(word_wise_comparison[len_input_list:])
#                 match_key = key
#                 match_value = value   
#     return match_key,match_value

In [605]:
def match_things(dict, input):
    input_list = input.split(" ")
    tmp = np.inf
    match_key = ""
    match_value = ""
    for key, value in dict.items():
        key_list = key.split(" ")
        is_key_longer = (len(key_list)>len(input_list))
        score = 0
        for k in key_list:
            by_word_dists = []
            for input in input_list:
                by_word_dists.append(editdistance.eval(k, input))
            
            if 0 in by_word_dists:
                continue
            else:
                score += sum(by_word_dists)/(len(by_word_dists) + is_key_longer*len(key_list))

        if score < tmp:
            tmp = score
            match_key = key
            match_value = value   

   
    return match_key,match_value,tmp

In [606]:
matched = []
is_two_fncs = 1
for child in [children for children,score in convert("writer", WN_NOUN, WN_NOUN)[:10]]:
    # editdistance and fuzz similarity fnc
    matched.append((match_things(predicates, child)[0],match_things(predicates, child)[2]))
matched = list(dict.fromkeys(matched))
matched.sort(key=lambda w:w[1])
matched

[('author', 0), ('voice actor', 2.6666666666666665)]

In [494]:
nodes["Good Neighbors"]

'http://www.wikidata.org/entity/Q3110682'

In [600]:
chosen_option = None
distance = np.inf
# Good Neighbors' relations
predicates_of_entity = list( dict.fromkeys([k for s,p,o in g.triples((WD.Q3110682, None, None)) for k,v in predicates.items() if v==str(p)]) )


for option in matched:
    if option[0] in predicates_of_entity:
        chosen_option= option[0]
        break
    if isinstance(option,list):
        if option[0] in predicates_of_entity:
            chosen_option= option[0]
            break
        elif option[1] in predicates_of_entity:
            chosen_option= option[1]
            break
print(chosen_option)

None
