In [33]:
import rdflib
from rdflib import Graph
from transformers import pipeline
from fuzzywuzzy import process
from rdflib.term import URIRef, Literal
import torch
import numpy as np
import editdistance
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import AutoTokenizer, AutoModelForTokenClassification
import re
import random
from rdflib.namespace import Namespace, RDF, RDFS, XSD
import pandas as pd
from statsmodels.stats.inter_rater import fleiss_kappa
import json

## Import the Graph

In [22]:
# Create and load a sample knowledge graph
graph = rdflib.Graph()
graph.parse('14_graph.nt', format='turtle') 

KeyboardInterrupt: 

Exception ignored in: 'zmq.backend.cython.message.Frame.__dealloc__'
Traceback (most recent call last):
  File "zmq/backend/cython/checkrc.pxd", line 13, in zmq.backend.cython.checkrc._check_rc
KeyboardInterrupt: 


<Graph identifier=Nfc6ac4a843944b75b2df3a175faf0d56 (<class 'rdflib.graph.Graph'>)>

## Load Spacy

In [23]:
import spacy

# this command downloads the Spacy model
spacy.cli.download("en_core_web_md")

nlp = spacy.load("en_core_web_md")

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Frameworks/Python.framework/Versions/3.12/bin/python3.12 -m pip install --upgrade pip[0m


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## Load the Embeddings

In [24]:
import numpy as np
import csv

entity_matrix = np.load('/Users/gianmarcoalbano/Desktop/Advanced topics in AI/Chatbot-Project/ddis-graph-embeddings/entity_embeds.npy')
predicate_matrix = np.load('/Users/gianmarcoalbano/Desktop/Advanced topics in AI/Chatbot-Project/ddis-graph-embeddings/relation_embeds.npy')

with open('/Users/gianmarcoalbano/Desktop/Advanced topics in AI/Chatbot-Project/ddis-graph-embeddings/entity_ids.del') as ifile:
    ent2id = {ent: int(idx) for idx, ent in csv.reader(ifile, delimiter='\t')}
    id2ent = {v: k for k, v in ent2id.items()}
with open('/Users/gianmarcoalbano/Desktop/Advanced topics in AI/Chatbot-Project/ddis-graph-embeddings/relation_ids.del') as ifile:
    pred2id = {rel: int(idx) for idx, rel in csv.reader(ifile, delimiter='\t')}
    id2pred = {v: k for k, v in pred2id.items()}

## Build dictionaries for entities and predicates

In [25]:
namespaces = {
    'WD': rdflib.Namespace("http://www.wikidata.org/entity/"),
    'WDT': rdflib.Namespace("http://www.wikidata.org/prop/direct/"),
    'DDIS': rdflib.Namespace("http://ddis.ch/atai/"),
    'RDFS': rdflib.namespace.RDFS,
    'SCHEMA': rdflib.Namespace("http://schema.org/")
}

nodes = {
    node.toPython(): str(graph.value(node, namespaces['RDFS'].label) 
                         or str(node).split('/')[-1])
    for node in graph.all_nodes() if isinstance(node, rdflib.term.URIRef)
}
ent2uri = {ent: uri for uri, ent in nodes.items()}

predicates = {
    node.toPython(): str(graph.value(node, namespaces['RDFS'].label) 
                         or str(node).split('/')[-1])
    for node in graph.predicates() if isinstance(node, rdflib.term.URIRef)
}
pred2uri = {pred: uri for uri, pred in predicates.items()}

## Build people-occupation dictionary using SPARQL

In [26]:
query = f"""
SELECT DISTINCT ?person_label ?occupation_label WHERE {{
    ?person <{namespaces['WDT'].P106}> ?occupation .
    ?person <{namespaces['RDFS'].label}> ?person_label .
    ?occupation <{namespaces['RDFS'].label}> ?occupation_label .
}}
"""
result = graph.query(query)

# Create dictionary mapping people to occupations
people_occupation = {}
for row in result:
    person = str(row.person_label)
    occupation = str(row.occupation_label)

    if person not in people_occupation:
        people_occupation[person] = []
    if occupation not in people_occupation[person]:
        people_occupation[person].append(occupation)

## Build a dictionary mapping people to movies they contributed to

In [27]:
people_movies = {}

query = """
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?personLabel ?movieLabel WHERE {
    ?movie wdt:P161|wdt:P57|wdt:P58|wdt:P344|wdt:P3300|wdt:P1809 ?person .
    ?person rdfs:label ?personLabel .
    ?movie rdfs:label ?movieLabel .
    FILTER (lang(?personLabel) = 'en' && lang(?movieLabel) = 'en')
}
"""

try:
    qres = graph.query(query)

    for row in qres:
        person = str(row.personLabel)
        movie = str(row.movieLabel)
        if person not in people_movies:
            people_movies[person] = []
        people_movies[person].append(movie)

except Exception as e:
    print(f"Error building people-to-movies dictionary: {e}")

In [28]:
# Pass dictionaries to the chatbot class
dictionaries = (nodes, ent2uri, predicates, pred2uri, people_occupation, ent2id, id2ent, pred2id, id2pred, entity_matrix, predicate_matrix, people_movies)

# Load the image file

In [29]:
with open("images.json", "r") as f:
    images = json.load(f)

## Handle the crowdsourcing dataset

In [30]:
# Preprocess Crowdsourced Data
def preprocess_crowd_data(file_path):
    """
    Loads and preprocesses the crowdsourced data.
    """
    data = pd.read_csv(file_path, sep='\t')
    data['LifetimeApprovalRate'] = data['LifetimeApprovalRate'].replace('%', '', regex=True).astype(float)
    data['WorkTimeInSeconds'] = pd.to_numeric(data['WorkTimeInSeconds'], errors='coerce')
    return data


# Filter Workers
def filter_workers(data, approval_threshold=50, time_threshold=35):
    """
    Filters workers based on thresholds.
    Returns filtered DataFrame of valid workers.
    """
    valid_workers = data[
        (data['LifetimeApprovalRate'] >= approval_threshold) & 
        (data['WorkTimeInSeconds'] >= time_threshold)
    ]
    valid_workers = valid_workers.sort_values(by='WorkTimeInSeconds', ascending=False)
    valid_workers = valid_workers.groupby('HITId').head(3)
    return valid_workers

def calculate_fleiss_kappa_per_batch(valid_workers):
    """
    Calculates Fleiss' Kappa for each HITTypeId batch.
    Returns a dictionary mapping HITTypeId to Fleiss' Kappa value.
    """
    contingency_tables = valid_workers.groupby(['HITTypeId', 'HITId'])['AnswerLabel'].value_counts().unstack(fill_value=0)
    fleiss_kappa_per_batch = contingency_tables.groupby(level=0).apply(
        lambda x: fleiss_kappa(x.to_numpy()) if not x.empty else -1
    ).to_dict()

    return fleiss_kappa_per_batch


file_path = '/Users/gianmarcoalbano/Desktop/Advanced topics in AI/Chatbot-Project/crowd_data.tsv'
data = preprocess_crowd_data(file_path)
valid_workers = filter_workers(data)
fleiss_kappa_per_batch = calculate_fleiss_kappa_per_batch(valid_workers)

crowdsourcing_files = (data, valid_workers, fleiss_kappa_per_batch)

In [36]:
class Chatbot:
    def __init__(self, graph, dictionaries, images, crowsourcing_files):

        self.graph = graph
        
        # Initialize zero-shot classification pipeline
        self.classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

        # Load the NER model
        tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
        model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
        self.ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)

        # Define namespaces
        self.namespaces = {
            'WD': rdflib.Namespace("http://www.wikidata.org/entity/"),
            'WDT': rdflib.Namespace("http://www.wikidata.org/prop/direct/"),
            'DDIS': rdflib.Namespace("http://ddis.ch/atai/"),
            'RDFS': rdflib.namespace.RDFS,
            'SCHEMA': rdflib.Namespace("http://schema.org/")
        }

        self.nodes, self.ent2uri, self.predicates, self.pred2uri, self.people_occupation, self.ent2id, self.id2ent, self.pred2id, self.id2pred, self.entity_matrix, self.predicate_matrix, self.people_movies = dictionaries

        self.images = images

        self.expecting_response = False
        self.last_question_info = {"question": None, "suggested_label": None, "updated_label": None}

        self.crowd_data, self.valid_workers, self.fleiss_kappa_per_batch = crowsourcing_files

        # Initialize Pegasus model for paraphrasing
        model_name = 'tuner007/pegasus_paraphrase'
        self.tokenizer = PegasusTokenizer.from_pretrained(model_name)
        self.paraphrase_model = PegasusForConditionalGeneration.from_pretrained(model_name)

        # --- Factual Question Patterns ---
        self.question_patterns = [
        
            # Pattern 0: who and what
            (r"who is the (?P<relation>.+?) of (?P<entity>.+)", 'who', 1),
            (r"who (?P<relation>.+?) (?P<entity>.+)", 'who', 1),
        
            # Pattern 1: Find movies with (word) in their titles
            (r"(?:find|which) movies.*contain(?:s)?(?: the word)? (?P<word>\w+)", 'find_word_in_title', 0),
            (r"(?:find|which) movies with (?P<word>\w+) in (?:their )?titles?", 'find_word_in_title', 0),
            (r"(?:find|which) movies (?:whose )?(?:title|name) contains? (?P<word>\w+)", 'find_word_in_title', 0),
        
            # Pattern 2: Highest-rated movies (optional 'above' and a number)
            (r"(?:what are|list)(?: the)?(?: highest[-\s]rated)? movies(?: rated)?(?: above| greater than)?(?: (?P<number>\d+(\.\d+)?))?", 'movies_rating_above', 0),
            (r"movies (?:rated )?(?:above )?(?P<number>\d+(\.\d+)?)?", 'movies_rating_above', 0),
        
            # Pattern 3: Lowest-rated movies (optional 'below' and a number)
            (r"(?:what are|list)(?: the)?(?: lowest[-\s]rated)? movies(?: rated)?(?: below| less than)?(?: (?P<number>\d+(\.\d+)?))?", 'movies_rating_below', 0),
            (r"movies (?:rated )?(?:below )?(?P<number>\d+(\.\d+)?)?", 'movies_rating_below', 0),
        
            # Pattern 4: Entities in alphabetical order
            (r"which (?P<entity>.+) comes first alphabetically", 'entity_first_alphabetically', 1),
            (r"list (?P<entity>.+) in alphabetical order", 'entity_first_alphabetically', 1),
        
            # Pattern 5: Entities in reverse alphabetical order
            (r"which (?P<entity>.+) comes last alphabetically", 'entity_last_alphabetically', 1),
            (r"list (?P<entity>.+) in reverse alphabetical order", 'entity_last_alphabetically', 1),

            # Pattern 6: Release date of an entity
            (r"when was (?P<entity>.+?) released\??", 'release_date', 1),
            (r"what is the release date of (?P<entity>.+?)\??", 'release_date', 1),
            (r"when did (?P<entity>.+?) come out\??", 'release_date', 1),
            (r"when did (?P<entity>.+?) (?:premiere|debut)\??", 'release_date', 1),
            (r"(?:pubblication|release) date of (?P<entity>.+?)\??", 'release_date', 1),
        
        ]

    ##############################################################################
    ##############################################################################
    #######################                                #######################
    #######################              ANSWER            #######################
    #######################              ------            #######################
    #######################                                #######################
    ##############################################################################
    ##############################################################################

    def answer_question(self, question):

        try:
            main_labels = ["image request", "recommendation query", "movie query", "general query"]

            if self.expecting_response == False:
                question_type = self.identify_question(question, main_labels, uncertainty=True)
                if self.expecting_response:
                    return f"I’m not entirely sure, but are you asking about {self.last_question_info["suggested_label"][0]} or {self.last_question_info["suggested_label"][1]}?"

            else:
                possible_labels = [self.last_question_info["suggested_label"][0], self.last_question_info["suggested_label"][1], "negative statement"]
                question_type = self.identify_question(question, possible_labels)
                if question_type != "negative statement":
                    question = self.last_question_info["question"]
                    self.reset_last_question_info()
                else:
                    self.reset_last_question_info()
                    return f"I am still not sure about the question, could you rephrase it?"

        except Exception as e:
            print(f"Unexpected error: {e}")
            return "Sorry, something went wrong while processing your question."

        
        # Handle Image Requests
        if question_type == "image request":
            entity_name, _ = self.extract_known_entity(question, self.people_occupation)
            entities = self.NER(question, self.people_occupation)
            if len(entities) == 1 and entities[0]['entity_group'] == 'PER':
                entity_name = entities[0]['word']
            if not entity_name:
                return "I couldn’t find any known person in your question."
            imdb_id = self.get_imdb_id(entity_name)
            if not imdb_id:
                return f"Sorry, I couldn’t find an IMDb ID for {entity_name}."
            image_path = self.get_image_from_json(imdb_id)
            if not image_path:
                return f"Sorry, I couldn’t find any image for {entity_name}."
            return f"Found Image for {entity_name}: {image_path}"
                
        # Handle Movie Queries
        if question_type == "movie query":

            # Try CROWDSOURCING
            
            # Extract predicates and entities for crowdsourcing and Embeddings
            extracted_predicate, entities = self.extract_entities_and_predicates(question, self.ent2uri)

            if not extracted_predicate or not entities:
                return "Sorry, my head is all over the place, could you rephrase your question?"

            if len(entities) > 1:
                self.expecting_response = True
                return f"I’m not entirely sure, but are you asking about {self.last_question_info["suggested_label"][0]} or {self.last_question_info["suggested_label"][1]}? Sorry but i get confused if i need to handle multiple stuff at once hahaha"

            entity = entities[0]['word']
            
            # Find entity and predicate in crowdsourced data
            crowdsourcing_entity = self.find_entity_in_crowd_data(entity, 'entity')
            crowdsourcing_predicate = self.find_entity_in_crowd_data(extracted_predicate, 'predicate')
    
            if crowdsourcing_entity is not None and crowdsourcing_predicate is not None:
                answer = self.crowdsource_search_with_valid_workers(crowdsourcing_entity, crowdsourcing_predicate)
                if answer:
                    return answer

            # Try FACTUAL ANSWER
            params = self.match_question_pattern(question)
            if params:
                # Extract additional data only if required
                if params['requires_matching']:
                    extracted_predicate, entities = self.extract_entities_and_predicates(question, self.ent2uri)
                    if extracted_predicate:
                        params['relation'] = extracted_predicate
                    if entities:
                        matched_entity = entities[0]['word']
                        params['entity'] = matched_entity

                sparql_query = self.generate_sparql_query(params)
                if sparql_query:
                    answer = self.query_graph(sparql_query)
                    if answer:
                        response = self.paraphrase("I am absolutely sure that the answer is") + " " + ', '.join(answer)
                        return response

            # Refine Question Type if Factual Query Fails 
            refined_labels = ["release date or publication date query", "movie query"]
            refined_type = self.identify_question(question, refined_labels)

            if refined_type == "release date or publication date query":
                print("Identified Release Date question")
                params = {
                    "type": "release_date",
                    "entity": self.NER(question, self.ent2uri)[0]['word'],
                }
                sparql_query = self.generate_sparql_query(params)
                answer = self.query_graph(sparql_query)
                return f"The release Date of {matched_entity} is {', '.join(answer)}" if answer else "Sorry, I couldn’t find the release date."
                 
                
            # Fallback to Embeddings for Remaining Movie Queries
            answer = self.compute_embedding_answer(extracted_predicate, entity)
            
            if not answer:
                return "Sorry, but it's been a hard day and I really don't feel like answering this question :("
            
            # Format the answer list nicely
            if len(answer) == 1:
                answer_str = answer[0]
            elif len(answer) == 2:
                answer_str = ' and '.join(answer)
            else:
                answer_str = ', '.join(answer[:-1]) + f" or maybe {answer[-1]}"
            
            return f"I think that the {extracted_predicate} of {entity} might be {answer_str}. Not sure which one tbh"

    
        # Handle Recommendation Queries
        if question_type == "recommendation query":
            try:
                # Extract movies using NER
                entities = self.NER(question, self.ent2uri)
                if not entities:
                    return "I would need to know a bit more about your preferences to give you a good suggestion, for now I can tell you that my favourite movie in the world is Les Miserables!"
                liked_people = [ent['word'] for ent in entities if ent['entity_group'] == 'PER']
                liked_movies = [ent['word'] for ent in entities if ent['entity_group'] != 'PER']

                # Add movies associated with liked people
                if liked_people:
                    for person in liked_people:
                        movies = self.people_movies.get(person, [])[:3]
                        liked_movies.extend(movies)
        
                # Compute recommendations
                recommendations = self.compute_recommendation(liked_movies)
                if not recommendations:
                    return "Sorry, I couldn’t find any recommendations based on your preferences."
        
                return f"Recommended Movies: {', '.join(recommendations)}"

            except Exception as e:
                print(f"Error during recommendation processing: {e}")
                return "Sorry, something went wrong while processing your request."

        # Handle Unknown Questions
        return "I'm really not sure I can help you out with this one. Maybe I could give you some movie recommendations? I've been watching a ton of TV lately."

    def reset_last_question_info(self):
        """Resets the last question context."""
        self.expecting_response = False
        
        for key, value in self.last_question_info.items():
            self.last_question_info[key] = None


    ##############################################################################
    ##############################################################################
    #######################                                #######################
    #######################       INDENTIFY QUESTION       #######################
    #######################       ------------------       #######################
    #######################                                #######################
    ##############################################################################
    ##############################################################################
    
    def identify_question(self, question, candidate_labels, threshold=0.35, uncertainty=False):
        
        try:
            result = self.classifier(question, candidate_labels)
            top_label = result["labels"][0]
            top_score = result["scores"][0]
            second_top_label = result["labels"][1]

            if not uncertainty:
                return top_label

            self.last_question_info = {"question": question, "suggested_label": [top_label, second_top_label]}
    
            # High confidence
            if top_score >= threshold:
                return top_label

            # Low-confidence scenario: expect clarification
            self.expecting_response = True
    
            return top_label
        
        except Exception as e:
            print(f"Error during question identification: {e}")
            return "Sorry, something went wrong while processing your request."



    ##############################################################################
    ##############################################################################
    #######################                                #######################
    #######################         EDIT DISTANCE          #######################
    #######################         -------------          #######################
    #######################                                #######################
    ##############################################################################
    ##############################################################################

    def match_entity_editdistance(self, entity, dictionary, threshold=1):
        """
        Matches the given entity to the closest node in the dictionary based on edit distance.
        Returns None if the closest match exceeds the specified distance threshold.
        """
        best_match = None
        min_distance = threshold + 1

        for key in dictionary.keys():
            distance = editdistance.eval(key.lower(), entity.lower())
            if distance < min_distance:
                min_distance = distance
                best_match = key
                best_match_value = dictionary[key]
                best_match_distance = distance
        
        if min_distance <= threshold:
            return best_match, best_match_value, best_match_distance
        
        return None, None, None

    ##############################################################################
    ##############################################################################
    #######################                                #######################
    #######################      EXTRACT KNOWN ENTITY      #######################
    #######################      --------------------      #######################
    #######################                                #######################
    ##############################################################################
    ##############################################################################
   
    def extract_known_entity(self, sentence, dictionary, max_ngram_size=3, threshold=2, confidence=0.6, use_similarity=False):
    
        try:
            # Step 1: Parse the sentence and filter stopwords
            doc = nlp(sentence)
            meaningful_words = [token.text for token in doc if not token.is_stop and token.is_alpha]
    
            # Step 2: Generate prioritized n-grams
            ngrams = []
            for size in range(max_ngram_size, 0, -1):
                ngrams += [" ".join(meaningful_words[i:i+size]) 
                           for i in range(len(meaningful_words) - size + 1)]
    
            # Step 3: Try edit distance matching first
            for ngram in ngrams:
                match, _, _ = self.match_entity_editdistance(ngram, dictionary, threshold=threshold)
                if match:
                    return match, ngram
    
            # Step 4: Fallback to similarity matching if edit distance failed
            for ngram in ngrams:
                matches = self.find_match(ngram, dictionary, confidence=confidence)
                if matches:
                    return matches[0], ngram
        
        except Exception as e:
            print(f"Error extracting known entity: {e}")
        
        return None

    def find_match(self, phrase, dictionary, n=5, confidence=0.6):
    
        phrase_token = nlp(phrase)
        similarities = []
    
        try:
            # Calculate similarity between phrase and each value in the dictionary
            for key, value in dictionary.items():
                key_token = nlp(key)
                similarity = phrase_token.similarity(key_token)
                if similarity > confidence:
                    similarities.append((key, similarity))
    
            # Sort by similarity in descending order and return top matches
            top_matches = sorted(similarities, key=lambda x: x[1], reverse=True)[:n]
            return [match[0] for match in top_matches]
    
        except Exception as e:
            print(f"Error in finding match: {e}")
            return []   

    ##############################################################################
    ##############################################################################
    #######################                                #######################
    #######################            MULTIMEDIA          #######################
    #######################            ----------          #######################
    #######################                                #######################
    ##############################################################################
    ##############################################################################
    
    def get_imdb_id(self, entity_name):
        """
        Retrieves the IMDb ID of an entity using graph.value().
        Returns None if not found or in case of an error.
        """
        try:
            entity_uri = self.ent2uri.get(entity_name)
            if not entity_uri:
                return None  # No matching URI
            
            imdb_id = self.graph.value(
                subject=rdflib.URIRef(entity_uri), 
                predicate=self.namespaces['WDT'].P345
            )
            return str(imdb_id) if imdb_id else None
        except Exception as e:
            print(f"Error retrieving IMDb ID for {entity_name}: {e}")
            return None

    def get_image_from_json(self, imdb_id):

        try:
            # Step 1: Exact match search (only the IMDb ID in 'cast')
            exact_matches = [img['img'] for img in self.images if img['cast'] == [imdb_id]]
    
            if exact_matches:
                return f"image:{exact_matches[0].replace('.jpg', '')}"
    
            # Step 2: General search (IMDb ID in 'cast')
            general_matches = [img['img'] for img in self.images if imdb_id in img['cast']]
    
            if general_matches:
                return f"image:{general_matches[0].replace('.jpg', '')}"
    
        except Exception as e:
            print(f"Error retrieving image from JSON: {e}")
        
        return None

    ##############################################################################
    ##############################################################################
    #######################                                #######################
    #######################                NER             #######################
    #######################                ---             #######################
    #######################                                #######################
    ##############################################################################
    ##############################################################################

    def NER(self, question, dictionary, max_distance=5):
    
        # Preprocess the question
        cleaned_question = re.sub(r'[:!\\-]', '', question)
        cleaned_question = re.sub(r'\s+', ' ', cleaned_question).strip()
        question = cleaned_question
        
        extracted_entities = []
    
        # Step 1: Use NER to extract entities
        try:
            entities = self.ner_pipeline(question)
        except Exception as e:
            print(f"NER Pipeline Error: {e}")
            return extracted_entities
        
        # Step 2: Process Extracted Entities
        if entities:
            # Match entities to graph nodes
            for entity in entities[:]:
                match_result = self.match_entity_editdistance(entity['word'], dictionary, threshold=max_distance)
                
                # Handle no match case
                if match_result[0] is None:
                    entities.remove(entity)
                else:
                    # Extract and update matched entity
                    match_key, match_value, distance = match_result
                    entity['word'] = match_key
        
        return entities


    ##############################################################################
    ##############################################################################
    #######################                                #######################
    #######################           EMBEDDINGS           #######################
    #######################           ----------           #######################
    #######################                                #######################
    ##############################################################################
    ##############################################################################
    
    def extract_entities_and_predicates(self, question, dictionary):

        # Step 1: Extract Predicates
        extracted_predicate, matched_ngram = self.extract_known_entity(
            sentence=question, 
            dictionary=self.pred2uri, 
            use_similarity=True
        )
    
        if matched_ngram and extracted_predicate:
        
            # Step 2: Lowercase the Matched Predicate N-gram from the Question to improve NER
            question = re.sub(
                r'\b' + re.escape(matched_ngram) + r'\b', 
                matched_ngram.lower(), 
                question, 
                flags=re.IGNORECASE
            ).strip()
        
        # Step 3: Extract Entities Using NER
        entities = self.NER(question, dictionary)
        return extracted_predicate, entities

    def compute_embedding_answer(self, extracted_predicate, extracted_entity):
    
        # Step 4: Convert Entities and Predicates to Embeddings
        entity_embedding = self.extract_embedding(extracted_entity, 'entity')
        predicate_embedding = self.extract_embedding(extracted_predicate, 'predicate')
    
        # Ensure all embeddings were found
        if entity_embedding is None or predicate_embedding is None:
            return []
    
        # Step 5: Compute Answer Using Embeddings
        query_embedding = entity_embedding + predicate_embedding
        answer = self.find_similarities(query_embedding, n=3)
    
        return answer if answer else []


    def find_similarities(self, embedding, n=5):
        """
        Finds the top-n most similar entities based on embedding distance.
        """
        try:
            embedding = np.atleast_2d(embedding)
            distances = pairwise_distances(embedding, self.entity_matrix)
            similar_entities = [
                self.nodes[self.id2ent[idx]]
                for idx in distances.argsort().reshape(-1)[:n]
            ]
            return similar_entities
        
        except Exception as e:
            print(f"Error finding similar entities: {e}")
            return []

    
    def extract_embedding(self, label, embedding_type='entity'):
        """
        Extracts the embedding vector for a given label (entity or predicate).
        """
        try:
            if embedding_type == 'entity':
                uri_dict, id_dict, embed_matrix = self.ent2uri, self.ent2id, self.entity_matrix
            else:
                uri_dict, id_dict, embed_matrix = self.pred2uri, self.pred2id, self.predicate_matrix
    
            # Convert label to URI and lookup ID
            uri = uri_dict[label]
            row_id = id_dict[uri]
    
            # Extract embedding from matrix
            embedding = embed_matrix[row_id]
            return embedding
        
        except KeyError:
            print(f"Label '{label}' not found in {embedding_type} embeddings.")
            return None

    def extract_label(self, embedding, embedding_type='entity'):
        try:
            if embedding_type == 'entity':
                embed_matrix, id_dict, label_dict = self.entity_matrix, self.id2ent, self.nodes
            else:
                embed_matrix, id_dict, label_dict = self.predicate_matrix, self.id2pred, self.predicates
    
            # Find the closest embedding in the matrix
            idx = np.where((embed_matrix == embedding).all(axis=1))[0][0]
            uri = id_dict[idx]
            label = label_dict[uri]
            return label
        
        except IndexError:
            print("Embedding not found in matrix.")
            return None
        
    ##############################################################################
    ##############################################################################
    #######################                                #######################
    #######################        FACTUAL QUESTIONS       #######################
    #######################        -----------------       #######################
    #######################                                #######################
    ##############################################################################
    ##############################################################################

    def match_question_pattern(self, question):
        for pattern, qtype, requires_matching in self.question_patterns:
            match = re.search(pattern, question, re.IGNORECASE)
            if match:
                params = match.groupdict()
                params['type'] = qtype
                params['requires_matching'] = requires_matching
                return params
        return None

    def generate_sparql_query(self, params):
        qtype = params.get('type')

        if qtype == 'who':
            return f"""
            SELECT ?result WHERE {{
                ?entity rdfs:label "{params['entity']}"@en .
                ?entity <{self.pred2uri[params['relation']]}> ?item .
                ?item rdfs:label ?result .
                FILTER (lang(?result) = 'en')
            }}
            """

        if qtype == 'find_word_in_title':
            return f"""
            SELECT ?movieLabel WHERE {{
                ?movie rdfs:label ?movieLabel .
                FILTER(CONTAINS(LCASE(?movieLabel), LCASE("{params['word']}"))) .
                FILTER (lang(?movieLabel) = 'en')
                LIMIT 5
            }}
            """

        if qtype == 'movies_rating_above':
            return f"""
            SELECT ?movieLabel WHERE {{
                ?movie ddis:rating ?rating .
                FILTER(?rating > {params['number']}) .
                ?movie rdfs:label ?movieLabel .
                FILTER (lang(?movieLabel) = 'en')
            }} ORDER BY DESC(?rating) LIMIT 1
            """

        if qtype == 'movies_rating_below':
            return f"""
            SELECT ?movieLabel WHERE {{
                ?movie ddis:rating ?rating .
                FILTER(?rating < {params['number']}) .
                ?movie rdfs:label ?movieLabel .
                FILTER (lang(?movieLabel) = 'en')
            }} ORDER BY DESC(?rating)
            """

        if qtype == 'entity_first_alphabetically':
            return f"""
            SELECT ?entity_label WHERE {{
                ?entity wdt:P31 <{self.ent2uri[params['entity']]}> .
                ?entity rdfs:label ?entity_label .
                FILTER (lang(?entity_label) = 'en')
            }} ORDER BY ASC(?entity_label)
            """

        if qtype == 'entity_last_alphabetically':
            return f"""
            SELECT ?entity_label WHERE {{
                ?entity wdt:P31 <{self.ent2uri[params['entity']]}> .
                ?entity rdfs:label ?entity_label .
                FILTER (lang(?entity_label) = 'en')
            }} ORDER BY DESC(?entity_label)
            """
            
        elif qtype == 'release_date':
            return f"""
            PREFIX ddis: <http://ddis.ch/atai/>  PREFIX wd: <http://www.wikidata.org/entity/>  
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>  PREFIX schema: <http://schema.org/>
            SELECT ?releaseDate WHERE {{
                ?movie rdfs:label "{params['entity']}"@en .
                ?movie wdt:P577 ?releaseDate .
            }} ORDER BY ASC(?releaseDate) LIMIT 1
            """

        return None

    def query_graph(self, sparql_query):
        try:
            qres = self.graph.query(sparql_query)
            return [str(row[0]) for row in qres] if qres else []
        except Exception as e:
            print(f"[ERROR] SPARQL Query Execution Error: {e}")
            return []

    ##############################################################################
    ##############################################################################
    #######################                                #######################
    #######################        RECOMMENDATIONS         #######################
    #######################        ---------------         #######################
    #######################                                #######################
    ##############################################################################
    ##############################################################################


    def compute_recommendation(self, liked_movies, n=5):

        try:
            # Extract embeddings for liked movies
            liked_movies_embeddings = []
            for movie in liked_movies:
                # Extract embedding directly if the movie is known
                embedding = self.extract_embedding(movie, 'entity')
                if embedding is not None:
                    liked_movies_embeddings.append(embedding)
        
            # Return early if no embeddings found
            if not liked_movies_embeddings:
                return []
    
            # Average embedding calculation
            avg_embedding = np.mean(liked_movies_embeddings, axis=0)
    
            # Compute similarity with all movie embeddings
            similarities = cosine_similarity(avg_embedding.reshape(1, -1), self.entity_matrix)
    
            # Sort and filter recommendations
            sorted_indices = similarities.argsort()[0][::-1]
            excluded_movies = set(liked_movies)
    
            recommended_movies = []
            for i in sorted_indices:
                label = self.extract_label(self.entity_matrix[i], 'entity')
                if label and label not in excluded_movies:
                    recommended_movies.append(label)
                    if len(recommended_movies) >= n:
                        break
    
            return recommended_movies[:n]
    
        except Exception as e:
            return []

    ##############################################################################
    ##############################################################################
    #######################                                #######################
    #######################          CROWDSOURCING         #######################
    #######################          -------------         #######################
    #######################                                #######################
    ##############################################################################
    ##############################################################################

    def crowdsource_search_with_valid_workers(self, entity, relation):

        try:
            if not isinstance(entity, str) or not isinstance(relation, str):
                raise ValueError(f"Entity and relation must be strings, got {type(entity)} and {type(relation)}.")
            
            res = self.valid_workers[
                (self.valid_workers['Input1ID'].astype(str) == entity) & 
                (self.valid_workers['Input2ID'].astype(str) == relation)
            ]
    
            if res.empty:
                return None
    
            hit_type_id = res['HITTypeId'].iloc[0]
            kappa = self.fleiss_kappa_per_batch.get(hit_type_id, 0)
            kappa = round(kappa, 3)
    
            final_answer, correct_count, incorrect_count = self.majority_vote(res)
    
            answer_column = 'Input3ID' if final_answer == "CORRECT" else 'FixValue'
            answers = res[answer_column].dropna().unique()
    
            converted_answers = [
                self.get_entity_label(ans) if ans.startswith("wd:") else ans for ans in answers
            ]
            if len(converted_answers) == 1:
                answers_str = converted_answers[0]
                return (f"The answer is {answers_str}. "
                    f"[Crowd, inter-rater agreement: {kappa}, "
                    f"The answer distribution for this task was {correct_count} support votes, "
                    f"{incorrect_count} reject votes]")
            elif len(converted_answers) == 2:
                answers_str = ' and '.join(converted_answers)
            else:
                answers_str = ', '.join(converted_answers[:-1]) + f", and {converted_answers[-1]}"
            
            return (f"The answers are {answers_str}. "
                    f"[Crowd, inter-rater agreement: {kappa}, "
                    f"The answer distribution for this task was {correct_count} support votes, "
                    f"{incorrect_count} reject votes]")

    
        except KeyError as e:
            return None
    
        except Exception as e:
            return None


    def find_entity_in_crowd_data(self, extracted_entity_label, entity='entity'):

        try:
                
            dictionary = self.ent2uri if entity == 'entity' else self.pred2uri
            
            match = self.match_entity_editdistance(extracted_entity_label, dictionary)
            
            if match:
                matched_node, matched_id, _ = match
                if entity == 'entity':
                    stripped_id = "wd:" + matched_id.split('/')[-1]
                    result = self.crowd_data[self.crowd_data['Input1ID'] == stripped_id]
                    if not result.empty:
                        id = result['Input1ID'].iloc[0]
                else:
                    stripped_id = "wdt:" + matched_id.split('/')[-1]
                    result = self.crowd_data[self.crowd_data['Input2ID'] == stripped_id]
                    if not result.empty:
                        id = result['Input2ID'].iloc[0]

                return id
        
            # Entity not found
            return None
    
        except KeyError as e:
            return None
    
        except Exception as e:
            return None

    def majority_vote(self, res):
        
        try:
            correct_count = (res['AnswerLabel'] == 'CORRECT').sum()
            incorrect_count = (res['AnswerLabel'] == 'INCORRECT').sum()
    
            if correct_count == 0 and incorrect_count == 0:
                raise ValueError("No valid votes found in the result set.")
    
            final_answer = "CORRECT" if correct_count >= incorrect_count else "INCORRECT"
            return final_answer, correct_count, incorrect_count
    
        except Exception as e:
            return "INCORRECT", 0, 0


    
    def get_entity_label(self, entity_id):

        try:
            graph = Graph()
            url = f"https://www.wikidata.org/wiki/Special:EntityData/{entity_id[3:]}.ttl"
            graph.parse(url, format="turtle")
    
            entity_uri = URIRef(f"http://www.wikidata.org/entity/{entity_id[3:]}")
            for label in graph.objects(entity_uri, URIRef("http://schema.org/name")):
                return label.value
    
        except Exception as e:
            return "Label not found"

    ##############################################################################
    ##############################################################################
    #######################                                #######################
    #######################           PARAPHRASING         #######################
    #######################           ------------         #######################
    #######################                                #######################
    ##############################################################################
    ##############################################################################
    
    def paraphrase(self, context: str, num_return_sequences: int = 5, num_beams: int = 5):

        try:
            # Tokenize the input
            batch = self.tokenizer(
                [context], truncation=True, padding='longest',
                max_length=60, return_tensors="pt"
            )
    
            # Generate paraphrases
            translated = self.paraphrase_model.generate(
                **batch, max_length=60, num_beams=num_beams, 
                num_return_sequences=num_return_sequences, temperature=1.5
            )
    
            # Decode and remove final period if present
            tgt_text = self.tokenizer.batch_decode(translated, skip_special_tokens=True)
            if tgt_text:
                result = random.choice(tgt_text).strip()
                return result[:-1] if result.endswith('.') else result
    
            return "I couldn't find a paraphrase."
    
        except Exception as e:
            print(f"Paraphrasing Error: {e}")
            return "Something went wrong while paraphrasing. Please try again later."
            

# SpeakEasy Environment

In [None]:
from speakeasypy import Speakeasy, Chatroom
from typing import List
import time

DEFAULT_HOST_URL = 'https://speakeasy.ifi.uzh.ch'
listen_freq = 2


class Agent:
    def __init__(self, username, password):
        self.username = username
        # Initialize the Speakeasy Python framework and login.
        self.speakeasy = Speakeasy(host=DEFAULT_HOST_URL, username=username, password=password)
        self.speakeasy.login()  # This framework will help you log out automatically when the program terminates.

    def listen(self):
        graph = rdflib.Graph()
        graph.parse('/Users/gianmarcoalbano/Desktop/Advanced topics in AI/Speakeasy Project/Datasets/14_graph.nt', format='turtle')
        while True:
            # only check active chatrooms (i.e., remaining_time > 0) if active=True.
            rooms: List[Chatroom] = self.speakeasy.get_rooms(active=True)
            for room in rooms:
                if not room.initiated:
                    # send a welcome message if room is not initiated
                    room.post_messages(f'Hello! This is a welcome message from {room.my_alias}.')
                    room.initiated = True
                # Retrieve messages from this chat room.
                # If only_partner=True, it filters out messages sent by the current bot.
                # If only_new=True, it filters out messages that have already been marked as processed.
                for message in room.get_messages(only_partner=True, only_new=True):
                    print(
                        f"\t- Chatroom {room.room_id} "
                        f"- new message #{message.ordinal}: '{message.message}' "
                        f"- {self.get_time()}")

                    # Implement your agent here #
                    bot = Chatbot(graph, dictionaries, images, crowdsourcing_files)
                    result = bot.answer_question(message.message)

                    if message.message == "Show me a picture of Abu":
                        result = "Sorry but that sexy beast is not in the database"

                    # Send a message to the corresponding chat room using the post_messages method of the room object.
                    room.post_messages(f"'{result}' ")
                    # Mark the message as processed, so it will be filtered out when retrieving new messages.
                    room.mark_as_processed(message)

                # Retrieve reactions from this chat room.
                # If only_new=True, it filters out reactions that have already been marked as processed.
                for reaction in room.get_reactions(only_new=True):
                    print(
                        f"\t- Chatroom {room.room_id} "
                        f"- new reaction #{reaction.message_ordinal}: '{reaction.type}' "
                        f"- {self.get_time()}")

                    # Implement your agent here #

                    room.post_messages(f"Received your reaction: '{reaction.type}' ")
                    room.mark_as_processed(reaction)

            time.sleep(listen_freq)

    @staticmethod
    def get_time():
        return time.strftime("%H:%M:%S, %d-%m-%Y", time.localtime())


if __name__ == '__main__':
    demo_bot = Agent("swift-comet", "X2wqU6D3")
    demo_bot.listen()

Login successful. Session token: 9xnp3Q78utK_I48li4z1Bhpa3qgYeYgM
	- Chatroom fc1b5fa6-95db-460f-ab26-789cb4d03f36 - new message #0: 'who is the director of Inception' - 23:00:00, 09-12-2024


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


	- Chatroom fc1b5fa6-95db-460f-ab26-789cb4d03f36 - new message #3: 'can you recommend some movies since I like The Great Gatsby, Inception and The Godfather' - 23:00:17, 09-12-2024


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
