In [1]:
# imports
import getpass
import json

from langchain_ibm import WatsonxEmbeddings, WatsonxLLM
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.prompts import PromptTemplate
from langchain.tools import tool
from langchain.tools.render import render_text_description_and_args
from langchain.agents.output_parsers import JSONAgentOutputParser
from langchain.agents.format_scratchpad import format_log_to_str
from langchain.agents import AgentExecutor
from langchain.memory import ConversationBufferMemory
from langchain_core.runnables import RunnablePassthrough
from ibm_watsonx_ai.foundation_models.utils.enums import EmbeddingTypes
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
import os
import glob

def get_json_files(folder_path):
    """
    Find all JSON files in the specified folder.
    
    Args:
        folder_path (str): Path to the folder to search
        
    Returns:
        list: List of paths to JSON files
    """
    # Make sure the path exists
    if not os.path.exists(folder_path):
        raise ValueError(f"The folder '{folder_path}' does not exist")
        
    # Method 1: Using glob
    json_files = glob.glob(os.path.join(folder_path, "*.json"))
    return json_files

In [3]:
import pandas as pd
from typing import Dict, List

def get_planet_distances(planet: str, df: pd.DataFrame) -> Dict[str, float]:
    """
    Get distances from a specific planet to all other planets.
    
    Args:
        planet: Name of the source planet
        df: DataFrame containing distance data
        
    Returns:
        Dictionary with planet names as keys and distances as values
    """
    if planet not in df.index:
        raise ValueError(f"Planet {planet} not found in the distance data")
    
    # Get all distances for the planet
    distances = df.loc[planet].to_dict()
    
    # Remove self-distance (should be 0)
    if planet in distances:
        del distances[planet]
        
    return distances

def format_planet_distances(planet: str, distances: Dict[str, float]) -> str:
    """
    Format planet distances into a readable string.
    
    Args:
        planet: Name of the source planet
        distances: Dictionary of distances to other planets
        
    Returns:
        Formatted string with distance information
    """
    prompt = f"Durante il Ciclo cosmico 789, le distanze in anni luce dal pianeta {planet} sono:\n"
    
    # Sort planets by distance
    sorted_distances = sorted(distances.items(), key=lambda x: x[1])
    
    for dest_planet, distance in sorted_distances:
        prompt += f"- {dest_planet}: {distance} anni luce\n"
    
    return prompt

def get_formatted_planet_info(planet: str, filename: str) -> str:
    """
    Read CSV and return formatted distance information for a planet.
    
    Args:
        planet: Name of the planet
        filename: Path to the CSV file
        
    Returns:
        Formatted string with distance information
    """
    try:
        # Read the CSV file
        df = pd.read_csv(filename, sep=',', decimal=',', header=0, index_col=0)
        
        # Get distances
        distances = get_planet_distances(planet, df)
        
        # Format the output
        return format_planet_distances(planet, distances)
        
    except FileNotFoundError:
        return f"Error: File {filename} not found"
    except ValueError as e:
        return f"Error: {str(e)}"
    except Exception as e:
        return f"Error: An unexpected error occurred: {str(e)}"

# You can also create a function to compare distances between planets:
def compare_planets_distances(planet1: str, planet2: str, df: pd.DataFrame) -> str:
    """
    Compare distances from two planets to all other planets.
    """
    dist1 = get_planet_distances(planet1, df)
    dist2 = get_planet_distances(planet2, df)
    
    comparison = f"Confronto delle distanze tra {planet1} e {planet2}:\n\n"
    
    all_planets = sorted(set(dist1.keys()) | set(dist2.keys()))
    
    for planet in all_planets:
        if planet in dist1 and planet in dist2:
            comparison += f"A {planet}:\n"
            comparison += f"- Da {planet1}: {dist1[planet]} anni luce\n"
            comparison += f"- Da {planet2}: {dist2[planet]} anni luce\n"
            comparison += f"- Differenza: {abs(dist1[planet] - dist2[planet])} anni luce\n\n"
    
    return comparison

In [4]:
def read_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.loads(file.read())
    return data

In [5]:
def read_text_file(file_path):
    # Open and read the file
    with open(file_path, 'r') as file:
        # Read lines into a list, removing trailing whitespace
        lines = [line.strip() for line in file]
    return lines

# Example usage:
file_path = '../../data/Final data cleaned complete to be used/Liste_licenze_ingredienti_techiche/ingredienti.txt'
ingredienti = read_text_file(file_path)

file_path = '../../data/Final data cleaned complete to be used/Liste_licenze_ingredienti_techiche/tecniche.txt'
tecniche = read_text_file(file_path)

dish_path = "../../Hackapizza_Dataset/Misc/dish_mapping.json"
with open(dish_path) as f:
    dish_mapping = json.load(f)

piatti_list = list(dish_mapping.keys())

import pandas as pd
filename = '../../Hackapizza_Dataset/Misc/Distanze.csv'
distances = pd.read_csv(filename, sep=',', decimal=',', header=0, index_col=0)
pianeti = [p for p in distances.columns.to_list() if p != '/']

In [6]:
ENDPOINT_URL = "https://us-south.ml.cloud.ibm.com"
API_KEY = "DZKT0-JkhDNn9_o6a_N3AiYk06HEifU3Kh6xnWXo16v-"
PROJECT_ID = "44241d2e-6cff-49ce-b84c-1ed230f8eb36"

In [7]:
# model_id = "meta-llama/llama-3-1-8b-instruct"
# model_id = "meta-llama/llama-3-3-70b-instruct"
model_id = "mistralai/mistral-large"

In [8]:
llm = WatsonxLLM(
    model_id=model_id,
    url=ENDPOINT_URL,
    apikey=API_KEY,
    project_id=PROJECT_ID,
    params={
        GenParams.DECODING_METHOD: "greedy",
        GenParams.TEMPERATURE: 0,
        GenParams.MIN_NEW_TOKENS: 5,
        GenParams.MAX_NEW_TOKENS: 2048,
        GenParams.STOP_SEQUENCES: ["Human:", "\nDomanda", "Domanda"],
    },
)

In [10]:
doc_template = """Il ristorante {ristorante} dello Chef {chef} è situato nel pianeta {pianeta}. {distanze_pianeta}
Nel menu del ristorante {ristorante} è presente il piatto {piatto} avente le seguenti informazioni:
- Ingredienti:\n{ingredienti}
- Tecniche:\n{tecniche}"""
print(doc_template)

Il ristorante {ristorante} dello Chef {chef} è situato nel pianeta {pianeta}. {distanze_pianeta}
Nel menu del ristorante {ristorante} è presente il piatto {piatto} avente le seguenti informazioni:
- Ingredienti:
{ingredienti}
- Tecniche:
{tecniche}


In [11]:
ingredienti_inverted_template = """L'ingrediente {ingrediente} è presente nei seguenti piatti:
- Piatti:\n{piatti}"""
print(ingredienti_inverted_template)

L'ingrediente {ingrediente} è presente nei seguenti piatti:
- Piatti:
{piatti}


In [12]:
licenze_inverted_template = """La licenza {licenza} è presente nei seguenti ristoranti:
- Ristoranti:\n{ristoranti}"""
print(licenze_inverted_template)

La licenza {licenza} è presente nei seguenti ristoranti:
- Ristoranti:
{ristoranti}


In [13]:
tecniche_inverted_template = """La tecnica {tecnica} è presente nei seguenti piatti:
- Piatti:\n{piatti}"""
print(ingredienti_inverted_template)

L'ingrediente {ingrediente} è presente nei seguenti piatti:
- Piatti:
{piatti}


In [14]:
from typing import List
from langchain_core.documents import Document

def format_list_items(items):
        return "\n".join(f"  - {item}" for item in items)

folder = "../../data/Ingredients_And_Metadata"
json_files = get_json_files(folder)

documents = []
doc_id = 0


for idx, file in enumerate(json_files):
    data = read_json(file)

    pianeta = data['Nome_pianeta']
    ristorante = data['Nome_ristorante']
    chef = data['Nome_chef']
    licenze = data['Licenze']
    ordini = data['Ordini']
    distanze_pianeta = get_formatted_planet_info(pianeta, '../../Hackapizza_Dataset/Misc/Distanze.csv')

    for idx_piatto, (piatto, info) in enumerate(data['piatti'].items()):
        ingredienti_formatted = format_list_items(info["Ingredienti"])
        tecniche_formatted = format_list_items(info["Tecniche"])

        doc = PromptTemplate.from_template(doc_template).format(
            ristorante=ristorante,
            chef=chef,
            pianeta=pianeta,
            distanze_pianeta=distanze_pianeta,
            piatto=piatto,
            ingredienti=ingredienti_formatted,
            tecniche=tecniche_formatted
        )

        docu = Document(page_content=doc, metadata={
            "id": doc_id,
            "ristorante": ristorante,
            "chef": chef,
            "pianeta": pianeta,
            "piatto": piatto,
        })
        doc_id += 1

        documents.append(docu)

In [16]:
len(documents)

277

In [17]:
inverted_index_ingredienti_path = '../../data/Final data cleaned complete to be used/Inverted_Indices/inverted_index_ingredienti.json'
inverted_index_licenze_path = '../../data/Final data cleaned complete to be used/Inverted_Indices/inverted_index_licenze.json'
inverted_index_tecniche_path = '../../data/Final data cleaned complete to be used/Inverted_Indices/inverted_index_tecniche.json'

In [18]:
data_inverted_ingredienti = read_json(inverted_index_ingredienti_path)
data_inverted_licenze = read_json(inverted_index_licenze_path)
data_inverted_tecniche = read_json(inverted_index_tecniche_path)

In [19]:
for ingrediente, piatti in data_inverted_ingredienti.items():
    piatti_formatted = format_list_items(piatti)
    doc = PromptTemplate.from_template(ingredienti_inverted_template).format(
        ingrediente=ingrediente,
        piatti=piatti_formatted
    )
    docu = Document(page_content=doc, metadata={"id": doc_id})
    doc_id += 1
    documents.append(docu)

for licenza, ristoranti in data_inverted_licenze.items():
    ristoranti_formatted = format_list_items(ristoranti)
    doc = PromptTemplate.from_template(licenze_inverted_template).format(
        licenza=licenza,
        ristoranti=ristoranti_formatted
    )
    docu = Document(page_content=doc, metadata={"id": doc_id})
    doc_id += 1
    documents.append(docu)

for tecnica, piatti in data_inverted_tecniche.items():
    piatti_formatted = format_list_items(piatti)
    doc = PromptTemplate.from_template(tecniche_inverted_template).format(
        tecnica=tecnica,
        piatti=piatti_formatted
    )
    docu = Document(page_content=doc, metadata={"id": doc_id})
    doc_id += 1
    documents.append(docu)

In [20]:
import pickle
with open('documents_extra.pkl', 'rb') as file:  # 'wb' means write binary
    document_extra = pickle.load(file)

In [21]:
documents += document_extra

In [22]:
len(documents)

582

In [23]:
embedd_id = 'intfloat/multilingual-e5-large'
# embedd_id = 'sentence-transformers/all-minilm-l6-v2'

embeddings = WatsonxEmbeddings(
    model_id=embedd_id,
    url=ENDPOINT_URL,
    apikey=API_KEY,
    project_id=PROJECT_ID,
)

In [35]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=512, chunk_overlap=100
)
doc_splits = text_splitter.split_documents(documents)

In [36]:
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="agentic-rag-chroma",
    embedding=embeddings,
)

In [26]:
retriever = vectorstore.as_retriever(
    search_type="mmr", search_params={"lambda": 0.6, "mu": 0.2, "top_k": 8}, filter = True)

In [26]:
# retriever = vectorstore.as_retriever(search_params={"top_k": 10}, filter = True)

In [38]:
instruction = """Sei un assistente che risponde alle domande relative all'ambito culinario intergalattico. Il contesto fornito conterrà informazioni preziose su ristoranti, menu, distanze planetarie, regolamenti galattici, certificazioni alimentari e tecniche culinarie specifiche di tutto il multiverso."""

In [39]:
domande = pd.read_csv('../../Hackapizza_Dataset/domande.csv')['domanda'].to_list()

In [41]:
# First, let's format our retrieved documents into a context
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Create a template that includes instruction, context, and question
template = """{instruction}

### Linee guida per la risposta:
Rispondi alle domande scrivendo solamente i piatti che rispondono alla domanda, in base al contesto fornito.
### Contesto:
{context}

I piatti tra cui puoi scegliere sono i seguenti:
### Lista di piatti disponibili:
{piatti_list}

Per essere sicuri che tu abbia capito ti lascio un esempio di domanda e risposta:
### Esempio di domanda e risposta:
Domanda: Quali sono i piatti che iniziano con la lettera A?
Risposta:
- Alternate Realities Risotto
- Antipasto Celestiale
- Antipasto Stellare dell'Eterna Armonia
- Armonia Cosmica alla Tavola d'Oro
- Armonia Cosmica della Fenice

MI RACCOMANDO RISPONDI ATTENTAMENTE ALLA DOMANDA, PER ME é molto IMPORTANTE!
### Domanda: {question}
Risposta:"""

prompt = PromptTemplate(
    input_variables=["instruction", "context", "question"],
    template=template
)

responses = []
for question in domande:
    retrieved_docs = retriever.invoke(question)

    # Format with all variables
    final_prompt = prompt.format(
        instruction=instruction,
        context=format_docs(retrieved_docs),
        question=question,
        piatti_list=piatti_list
    )
    response = llm(final_prompt)
    responses.append(response)

In [42]:
responses

['\n- Galassia di Sapori: Il Viaggio Senza Tempo',
 '\n- Sinfonia Cosmica: Versione Pizza',
 "\n- Pizza Cosmica all'Essenza di Drago con Nebbia Arcobaleno e Funghi Orbitali",
 '\n- Sinfonia Cosmica di Sapore',
 '\n- Il Rapsodo Celestiale',
 "\n- Rinascita Cosmica\n- Pioggia Calante dell'Universo\n- Pizza Baby Daniele\n- Pizza Baby Simone e Alessandro\n- Plasma Celestiale al Risotto di Kraken nell'Aura del Sole\n- Quadrifonia Cosmica: Sinfonia di Sapori e Dimensioni\n- Risotto dei Multiversi\n- Sinfonia Celestiale dei Ricordi\n- Sinfonia Cosmica della Rinascita\n- Sinfonia Cosmica: Versione Pizza\n- Sinfonia Cosmica: la Vendetta Fantasma\n- Sinfonia Galattica ai Cristalli di Nebulite\n- Sinfonia Temporale Galattica\n- Sinfonia Temporale al Tocco di Crono\n- Sinfonia Temporale nello Spaghi del Sole\n- Sinfonia dell'Universo\n- Universo Cosmico nel Piatto\n- Universo Incantato: Sinfonia dei Gusti Cosmogonici\n- Valzer delle Stelle",
 '\n- Portale del Cosmo\n- Rivisitazione del Kraken sott

In [43]:
def check_in_dict(phrases, dictionary):
    # Split the input into individual phrases
    phrases = phrases.split('\n')
    phrases = [phrase.strip() for phrase in phrases]

    # Sort dictionary keys by length in descending order
    sorted_keys = sorted(dictionary, key=len, reverse=True)

    found_keys = []
    for phrase in phrases:
        for key in sorted_keys:
            # Check for exact match of the longest possible key
            if key in phrase and all(k not in phrase or k == key for k in found_keys):
                found_keys.append(key)
                break  # Stop checking once the longest match is found

    return found_keys

In [44]:
answers = {}

for i in range(len(responses)):
    dishes = check_in_dict(responses[i], dish_mapping)
    ids = list(map(str, list(set([dish_mapping[dish] for dish in dishes if dish in dish_mapping]))))
    if len(ids) == 0:
        answers[i + 1] = "10"
    else:
        answers[i + 1] = ",".join(ids)

In [45]:
answers

{1: '78',
 2: '225',
 3: '156',
 4: '215',
 5: '94',
 6: '225,227,165,232,201,176,208,242,179,243,272,150,247,184,153,273,155,278',
 7: '267,189',
 8: '130,6,13,15,209,51',
 9: '76,207',
 10: '184,266,115',
 11: '246,199',
 12: '240,185',
 13: '125',
 14: '1,101,77,78,80,81',
 15: '160,34,35,281,103,10,269,147,248,217,125,191',
 16: '160,34,35,281,10,147,248,217,125,191',
 17: '263,170,16,112,276,215,219,157,190',
 18: '128,37,170,45,91',
 19: '53',
 20: '72,71',
 21: '70,168,245,90,220',
 22: '130,69,199,74,75,79,82,127',
 23: '10',
 24: '118',
 25: '216,81,118,94',
 26: '34,205',
 27: '115',
 28: '43,284',
 29: '248,34,227,69,264,172,15,214,88,60',
 30: '77,110,112,180,190',
 31: '260,262,135,12,157,158,166,41,44,45,174,180,186,190,192,198,204,77,86,92,230,104,110,112,117,127',
 32: '128,124,206',
 33: '174,208,252,86,188',
 34: '272',
 35: '217,140,272,279,121,284',
 36: '160,72,276,118,215,158',
 37: '140',
 38: '73,174',
 39: '160,99,5,42,203,110,208,85,86,247',
 40: '197,41,235,1

In [46]:
import csv
csv_filename = "answers_mistral.csv"
with open(csv_filename, mode='w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['row_id', 'result'])
    for key, value in answers.items():
        csv_writer.writerow([int(key), f'{value}'])