<a href="https://colab.research.google.com/github/dbigman/project-dsml-interactive-travel-planner/blob/main/Functions_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import requests
from datetime import datetime, timedelta
import json
import chromadb
from sentence_transformers import SentenceTransformer


# Functions

In [None]:
API_KEY = "f5b3be307a91026889509d3e0cbee098"  # Replace with your API key
BASE_URL = "https://api.openweathermap.org/data/2.5/forecast"

def find_weather_forecast(date, location):
    """
    Retrieves the weather forecast for a given date and location using OpenWeather API.

    Parameters:
    - date (str): Target date in 'YYYY-MM-DD' format.
    - location (str): City name or "city, country" (e.g., "San Juan, PR").

    Returns:
    - dict: Weather forecast details (temperature, description, etc.).
    """
    try:
        # Get weather data
        params = {
            "q": location,
            "appid": API_KEY,
            "units": "metric",  # Use "imperial" for Fahrenheit
        }
        response = requests.get(BASE_URL, params=params)
        data = response.json()

        if response.status_code != 200:
            return {"error": data.get("message", "Failed to fetch weather data")}

        # Convert input date to datetime
        target_date = datetime.strptime(date, "%Y-%m-%d")

        # Find the closest forecast for the given date
        closest_forecast = None
        min_diff = timedelta.max

        for forecast in data["list"]:
            forecast_time = datetime.utcfromtimestamp(forecast["dt"])
            time_diff = abs(forecast_time - target_date)

            if time_diff < min_diff:
                min_diff = time_diff
                closest_forecast = forecast

        if closest_forecast:
            return {
                "date": closest_forecast["dt_txt"],
                "temperature": closest_forecast["main"]["temp"],
                "description": closest_forecast["weather"][0]["description"],
                "humidity": closest_forecast["main"]["humidity"],
                "wind_speed": closest_forecast["wind"]["speed"],
            }
        else:
            return {"error": "No forecast found for the specified date"}

    except Exception as e:
        return {"error": str(e)}

# Example usage:
print(find_weather_forecast("2025-02-15", "San Juan, PR"))

{'date': '2025-02-15 00:00:00', 'temperature': 23.52, 'description': 'clear sky', 'humidity': 79, 'wind_speed': 5.95}


## Landmark Embeddings

In [1]:
import json
import chromadb
from sentence_transformers import SentenceTransformer
import os

# Load embedding model
embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Define ChromaDB storage path (create if it doesn't exist)
CHROMA_DB_PATH = "./chromadb_store"  # Local directory
os.makedirs(CHROMA_DB_PATH, exist_ok=True)

# Initialize ChromaDB client with persistent storage
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

# Create or get collection for landmarks
landmark_collection = client.get_or_create_collection(name="landmarks")

# Load landmarks data from corrected JSON file (in the same directory as the notebook)
landmarks_json_path = "landmarks_corrected.json"

with open(landmarks_json_path, "r", encoding="utf-8") as file:
    landmarks = json.load(file)

# Store landmarks in ChromaDB
for landmark in landmarks:
    # Ensure required fields exist
    if "name" not in landmark or "description" not in landmark:
        print(f"Skipping entry due to missing fields: {landmark}")
        continue

    # Generate a unique ID from the landmark name
    landmark_id = landmark["name"].replace(" ", "_").lower()

    # Convert description from list to string (if needed)
    if isinstance(landmark["description"], list):
        landmark["description"] = " ".join(landmark["description"]).replace("\\n", " ").strip()

    # Generate embedding for the landmark description
    embedding = embedding_model.encode(landmark["description"]).tolist()

    # Ensure all metadata values are valid (convert None to "Unknown")
    metadata = {
        "name": landmark["name"] if landmark["name"] is not None else "Unknown",
        "description": landmark["description"] if landmark["description"] is not None else "Unknown",
        "category": landmark.get("category", "Unknown"),
        "municipality": landmark.get("municipality", "Unknown"),
        "coordinates": str(landmark["coordinates"]) if landmark["coordinates"] is not None else "Unknown",
        "source_file": landmark.get("source_file", "Unknown")
    }

    # Add to ChromaDB
    landmark_collection.add(
        ids=[landmark_id],
        embeddings=[embedding],
        metadatas=[metadata]
    )

print("Landmarks stored in ChromaDB successfully!")


  from .autonotebook import tqdm as notebook_tqdm
Add of existing embedding ID: club_n\xc3\xa1utico_de_ponce
Insert of existing embedding ID: club_n\xc3\xa1utico_de_ponce
Add of existing embedding ID: dos_bocas_lake
Insert of existing embedding ID: dos_bocas_lake
Add of existing embedding ID: complejo_recreativo_y_cultural_la_guancha
Insert of existing embedding ID: complejo_recreativo_y_cultural_la_guancha
Add of existing embedding ID: ponce_historic_zone
Insert of existing embedding ID: ponce_historic_zone


Landmarks stored in ChromaDB successfully!


## Municipalities Embeddings

In [2]:
import os
import json
import logging
import chromadb
from sentence_transformers import SentenceTransformer

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

logging.info("Loading embedding model...")
embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
logging.info("Embedding model loaded successfully.")

# Define ChromaDB storage path (local directory)
CHROMA_DB_PATH = "./chromadb_store"
os.makedirs(CHROMA_DB_PATH, exist_ok=True)
logging.info(f"ChromaDB storage directory ensured at {CHROMA_DB_PATH}.")

# Initialize ChromaDB client with persistent storage
try:
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    logging.info("ChromaDB PersistentClient initialized successfully.")
except Exception as e:
    logging.error(f"Error initializing ChromaDB PersistentClient: {e}")
    raise

# Create or get collection for municipalities
try:
    municipality_collection = client.get_or_create_collection(name="municipalities")
    logging.info("Municipalities collection retrieved/created successfully.")
except Exception as e:
    logging.error(f"Error getting or creating municipalities collection: {e}")
    raise

# Load municipalities data from JSON file
json_file_path = "./municipalities_corrected.json"  # Update path if needed
try:
    with open(json_file_path, "r", encoding="utf-8") as file:
        municipalities = json.load(file)
    logging.info(f"Loaded {len(municipalities)} municipalities from JSON file.")
except FileNotFoundError:
    logging.error(f"Municipalities JSON file not found at {json_file_path}.")
    raise
except json.JSONDecodeError as e:
    logging.error(f"Error decoding JSON file: {e}")
    raise

# Store municipalities in ChromaDB
for municipality in municipalities:
    # Ensure required fields exist
    if "name" not in municipality or "description" not in municipality:
        logging.warning(f"Skipping entry due to missing fields: {municipality}")
        continue

    # Generate a unique ID from the municipality name
    municipality_id = municipality["name"].replace(" ", "_").lower()

    # Convert description from list to string if needed
    if isinstance(municipality["description"], list):
        municipality["description"] = " ".join(municipality["description"]).replace("\\n", " ").strip()

    # Generate embedding for the municipality description
    try:
        embedding = embedding_model.encode(municipality["description"]).tolist()
    except Exception as e:
        logging.error(f"Error generating embedding for {municipality['name']}: {e}")
        continue

    # Prepare metadata and ensure all values are valid (convert None to "Unknown")
    metadata = {
        "name": municipality.get("name") or "Unknown",
        "description": municipality.get("description") or "Unknown",
        "category": municipality.get("category") or "Municipality",
        "coordinates": str(municipality.get("coordinates")) if municipality.get("coordinates") is not None else "Unknown",
        "source_file": municipality.get("source_file") or "Unknown"
    }

    # Add the record to ChromaDB
    try:
        municipality_collection.add(
            ids=[municipality_id],
            embeddings=[embedding],
            metadatas=[metadata]
        )
        logging.info(f"Municipality '{municipality['name']}' stored in ChromaDB successfully.")
    except Exception as e:
        logging.error(f"Failed to add municipality '{municipality['name']}' to ChromaDB: {e}")

logging.info("All municipalities stored in ChromaDB successfully!")


2025-02-12 01:07:24,108 - INFO - Loading embedding model...
2025-02-12 01:07:24,112 - INFO - Use pytorch device_name: cpu
2025-02-12 01:07:24,112 - INFO - Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2
2025-02-12 01:07:26,072 - INFO - Embedding model loaded successfully.
2025-02-12 01:07:26,073 - INFO - ChromaDB storage directory ensured at ./chromadb_store.
2025-02-12 01:07:26,079 - INFO - ChromaDB PersistentClient initialized successfully.
2025-02-12 01:07:26,089 - INFO - Municipalities collection retrieved/created successfully.
2025-02-12 01:07:26,091 - INFO - Loaded 78 municipalities from JSON file.
Batches: 100%|██████████| 1/1 [00:00<00:00, 33.30it/s]
2025-02-12 01:07:26,178 - INFO - Municipality 'Adjuntas, Puerto Rico' stored in ChromaDB successfully.
Batches: 100%|██████████| 1/1 [00:00<00:00, 52.58it/s]
2025-02-12 01:07:26,207 - INFO - Municipality 'Aguada, Puerto Rico' stored in ChromaDB successfully.
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.58it/s]
2025-0

## Appropriate location function

In [3]:
def rank_appropriate_locations(user_prompt, top_k=5):
    """
    Finds and ranks appropriate landmarks based on user input.

    Args:
        user_prompt (str): The user's interest (e.g., "I love the beach and history").
        top_k (int): Number of top results to return.

    Returns:
        List of top_k ranked locations.
    """
    # Ensure ChromaDB is initialized
    global landmark_collection  # Ensure we're using the same collection

    # Convert user prompt into an embedding
    user_embedding = embedding_model.encode(user_prompt).tolist()

    # Retrieve top relevant locations using similarity search
    search_results = landmark_collection.query(
        query_embeddings=[user_embedding],
        n_results=top_k
    )

    # Extract matched locations
    ranked_locations = [
        {
            "name": metadata["name"],
            "description": metadata["description"],
            "category": metadata["category"],
            "municipality": metadata["municipality"],
            "score": score  # Similarity score
        }
        for metadata, score in zip(search_results["metadatas"][0], search_results["distances"][0])
    ]

    return ranked_locations

## Apropriate location test

In [1]:
user_prompt = "I like long walks in nature"
top_recommendations = rank_appropriate_locations(user_prompt)

for idx, place in enumerate(top_recommendations, 1):
    print(f"{idx}. {place['name']} ({place['category']}) - {place['municipality']}")
    print(f"   Score: {place['score']:.4f}")
    print(f"   {place['description']}\n")

NameError: name 'rank_appropriate_locations' is not defined

## Find info on location

In [9]:
def find_info_on_location(user_prompt, location, top_k=3):
    """
    Retrieves relevant information about a given location based on user query.

    Args:
        user_prompt (str): The user's specific request (e.g., "Tell me about the history of Old San Juan").
        location (str): The location for which the user wants information.
        top_k (int): Number of top results to return.

    Returns:
        List of relevant document excerpts.
    """
    # Ensure ChromaDB is initialized
    global landmark_collection

    # Encode user query into an embedding
    query_embedding = embedding_model.encode(f"{user_prompt} about {location}").tolist()

    # Search for relevant information in ChromaDB
    search_results = landmark_collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )

    # Extract matched documents
    relevant_info = [
        {
            "name": metadata["name"],
            "description": metadata["description"],
            "category": metadata["category"],
            "municipality": metadata["municipality"],
            "score": score  # Similarity score
        }
        for metadata, score in zip(search_results["metadatas"][0], search_results["distances"][0])
        if location.lower() in metadata["name"].lower()  # Filter by location name
    ]

    return relevant_info

## Find info on location test

In [12]:
user_query = "What is the history of this place?"
location = "Ponce"

info = find_info_on_location(user_query, location)

for idx, entry in enumerate(info, 1):
    print(f"{idx}. {entry['name']} ({entry['category']}) - {entry['municipality']}")
    print(f"   Score: {entry['score']:.4f}")
    print(f"   {entry['description']}\n")


Batches: 100%|██████████| 1/1 [00:00<00:00, 83.26it/s]

1. Antiguo Hospital Militar Espa\xc3\xb1ol de Ponce (Landmark) - Unknown
   Score: 25.2957
   ["The Antiguo Hospital Militar Español de Ponce (English: Old Spanish Military Hospital in Ponce) is a historic building in Ponce, Puerto Rico, in the city's historic district. The building dates from 1896 or 1897. It was designed by the Spanish Royal Corps of Engineers. The architecture consists of 19th Neoclassical architecture style. The building is of architectural significance since it is the only one-story building of this style remaining in the city of Ponce and one of the best examples on the Island. Completed in 1897, the year before the Spanish–American War of 1898, this building was the last major construction undergone by the Spanish Government in the Americas. From 1905 to the mid-1970s, the structure served as the Asilo de Ciegos de Ponce (Ponce Blind Asylum). As of 2020, the building sits abandoned.", "The structure commonly known as the Asilo de Ciegos (Home of the Blind) was b




# Assembling the assistant

In [13]:
# Imports
import openai

In [36]:
# api_key_path = "/content/drive/MyDrive/IronHack_final_project/API_Key1.txt"

from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()

# Get OpenAI API key
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    logging.error("OpenAI API key is missing! Check your .env file.")
    raise ValueError("OpenAI API key not found.")



# # Read the key from the file
# with open(api_key_path, "r") as file:
#     openai.api_key = file.read().strip()  # Strip removes any extra spaces or newlines

# print("API Key Loaded Successfully!")  # Just to confirm it's working

In [40]:
import os
import time
import logging
import openai

# Configure logging if not already done
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# Retrieve your OpenAI API key from an environment variable (or set it directly)
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")

# Set the API key for the OpenAI package
openai.api_key = openai_api_key

def query_llm(prompt, max_retries=3):
    """
    Queries the OpenAI API  with the given prompt and returns the response.
    Retries the API call up to max_retries times if there is an error.
    """
    for attempt in range(1, max_retries + 1):
        try:
            logging.info("Calling OpenAI API (attempt %s)...", attempt)
            client = openai.OpenAI(
                api_key=openai_api_key,
                base_url="https://api.openai.com/v1"
            )
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=150
            )
            answer = response.choices[0].message.content.strip()
            logging.info("Received response from OpenAI API.")
            return answer
        except Exception as e:
            logging.error(f"Error on attempt {attempt}: {e}")
            time.sleep(2)  # Wait before retrying
    raise Exception("Failed to query OpenAI API after several attempts.")


answer = query_llm('Where is Ponce?')
print(answer)
# Example usage in your assistant_response function
def assistant_response(user_query):
    """
    Generates an assistant response by first retrieving relevant documents and constructing a prompt.
    Then it calls the LLM (via query_llm) to generate a response.
    """
    # Assume find_relevant_documents and generate_assistant_prompt are defined as in your previous code.
    relevant_docs = find_relevant_documents(user_query)
    location_info = None

    try:
        if (relevant_docs and 
            relevant_docs.get("documents") and 
            relevant_docs["documents"][0] is not None):
            
            metadata_item = relevant_docs["metadatas"][0]
            # If metadata_item is a list, extract its first element
            if isinstance(metadata_item, list):
                if len(metadata_item) > 0:
                    location_info = metadata_item[0].get("name", "No location found")
                else:
                    logging.warning("Received an empty metadata list.")
                    location_info = "No location found"
            elif isinstance(metadata_item, dict):
                location_info = metadata_item.get("name", "No location found")
            else:
                logging.warning("Unexpected metadata type: %s", type(metadata_item))
                location_info = "No location found"

            prompt = generate_assistant_prompt(user_query, location=location_info)
        else:
            prompt = (f"Sorry, I couldn't find any matching places for '{user_query}'. "
                      f"Could you provide more specific information?")
        
        logging.info("Final prompt to query LLM: %s", prompt)
    except Exception as e:
        logging.error("Error processing relevant documents: %s", e)
        prompt = f"Sorry, there was an error processing your request."

    try:
        response = query_llm(prompt)
        return response
    except Exception as e:
        logging.error("Error querying LLM: %s", e)
        return "Sorry, there was an error generating a response."

response = assistant_response('Where is Ponce?') 


2025-02-12 02:04:31,610 - INFO - Calling OpenAI API (attempt 1)...
2025-02-12 02:04:33,633 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-12 02:04:33,634 - INFO - Received response from OpenAI API.


Ponce is a city located on the southern coast of Puerto Rico. It is the island's second-largest city, known for its rich history, cultural heritage, and architectural landmarks. Ponce is often referred to as "La Perla del Sur" (The Pearl of the South) and is a significant center for Puerto Rican culture and commerce.


Batches: 100%|██████████| 1/1 [00:00<00:00, 99.92it/s]
2025-02-12 02:04:33,655 - INFO - ChromaDB returned: {'ids': [['antiguo_casino_de_ponce', 'hotel_ponce_intercontinental', 'letras_de_ponce']], 'embeddings': None, 'documents': [[None, None, None]], 'uris': None, 'data': None, 'metadatas': [[{'category': 'Landmark', 'coordinates': "{'latitude': None, 'longitude': None}", 'description': 'The Antiguo Casino de Ponce (English: Old Ponce Casino), or simply the Casino de Ponce, is a historic structure, built in 1922 and located in Barrio Cuarto, Ponce, Puerto Rico. Originally built as a social club for Ponce\'s elite, it is currently used as the premier reception center of "The Noble City of Puerto Rico." The building, designed by Agustin Camilo Gonzalez in the Second Empire and Neo-Rococo styles, has a French facade and tones. It was listed in the National Register of Historic Places on 28 October 1987. It is located at the corner of Marina and Luna streets. The building has been called 

In [38]:
import logging

# Ensure logging is configured (adjust the configuration as needed)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

def generate_assistant_prompt(user_query, location=None):
    """
    Generate a prompt for the LLM based on the user's query and location metadata.
    """
    if location:
        prompt = (f"User asked about '{location}'. The location is part of a large list of "
                  f"landmarks and municipalities in Puerto Rico. Answer the user's query "
                  f"related to this location: '{user_query}'.")
    else:
        prompt = (f"User has the following query: '{user_query}'. Provide suggestions about "
                  f"relevant locations or landmarks in Puerto Rico based on the user’s interests.")
    
    logging.debug(f"Generated prompt: {prompt}")
    return prompt

def find_relevant_documents(user_query):
    """
    Query ChromaDB for documents relevant to the user's query.
    """
    try:
        query_embedding = embedding_model.encode(user_query).tolist()
        results = landmark_collection.query(
            query_embeddings=[query_embedding],
            n_results=3  # Number of results to retrieve
        )
        logging.info("ChromaDB returned: %s", results)
    except Exception as e:
        logging.error("Error querying ChromaDB: %s", e)
        results = {}
    return results

def assistant_response(user_query):
    """
    Generate the assistant's response by first retrieving relevant documents,
    then constructing a prompt for the LLM based on metadata from those documents.
    """
    relevant_docs = find_relevant_documents(user_query)
    location_info = None

    try:
        # Check if relevant documents exist and the first document is not None
        if (relevant_docs and 
            relevant_docs.get("documents") and 
            relevant_docs["documents"][0] is not None):
            
            metadata_item = relevant_docs["metadatas"][0]
            # If metadata_item is a list, extract its first element
            if isinstance(metadata_item, list):
                if len(metadata_item) > 0:
                    location_info = metadata_item[0].get("name", "No location found")
                else:
                    logging.warning("Received an empty metadata list.")
                    location_info = "No location found"
            elif isinstance(metadata_item, dict):
                location_info = metadata_item.get("name", "No location found")
            else:
                logging.warning("Unexpected metadata type: %s", type(metadata_item))
                location_info = "No location found"

            prompt = generate_assistant_prompt(user_query, location=location_info)
        else:
            prompt = (f"Sorry, I couldn't find any matching places for '{user_query}'. "
                      f"Could you provide more specific information?")
        
        logging.info("Final prompt to query LLM: %s", prompt)
    except Exception as e:
        logging.error("Error processing relevant documents: %s", e)
        prompt = f"Sorry, there was an error processing your request."

    # try:
    #     response = query_llm(prompt)
    #     return response
    # except Exception as e:
    #     logging.error("Error querying LLM: %s", e)
    #     ic(prompt)
    #     return "Sorry, there was an error generating a response."
    
    
    # Example test run:
if __name__ == "__main__":
    user_input = "I like the forest and eating pork. Where can I go?"
    try:
        answer = assistant_response(user_input)
        print("Assistant:", answer)
    except Exception as ex:
        print("Assistant encountered an error:", ex)


Batches: 100%|██████████| 1/1 [00:00<00:00, 83.26it/s]
2025-02-12 02:00:17,218 - INFO - ChromaDB returned: {'ids': [['r\\xc3\\xado_abajo_state_forest', 'cayo_luis_pe\\xc3\\xb1a', 'laguna_cartagena_national_wildlife_refuge']], 'embeddings': None, 'documents': [[None, None, None]], 'uris': None, 'data': None, 'metadatas': [[{'category': 'Landmark', 'coordinates': "{'latitude': None, 'longitude': None}", 'description': "['Rio Abajo State Forest is a forest preserve in Puerto Rico owned by the Department of Natural Resources and one of the 20 state forests on the island. It was designated a National Natural Landmark in 1980 and constitutes an area of 3,590 acres. It consists mostly of subtropical wet and moist karst forest. Mogotes and sinkholes fill the landscape. The forest preserve is located in the municipalities of Arecibo and Utuado. In addition to its ecological value, the forest also contains a number of archaeological sites. ', 'Much of the development within the forest area was m

Assistant: None


## Testing the LLM


In [39]:
# Simulate a conversation with the assistant
user_input = "I like the forest and eating pork. Where can I go?"
assistant_output = assistant_response(user_input)

print(f"Assistant: {assistant_output}")

Batches: 100%|██████████| 1/1 [00:00<00:00, 111.02it/s]
2025-02-12 02:00:27,930 - INFO - ChromaDB returned: {'ids': [['r\\xc3\\xado_abajo_state_forest', 'cayo_luis_pe\\xc3\\xb1a', 'laguna_cartagena_national_wildlife_refuge']], 'embeddings': None, 'documents': [[None, None, None]], 'uris': None, 'data': None, 'metadatas': [[{'category': 'Landmark', 'coordinates': "{'latitude': None, 'longitude': None}", 'description': "['Rio Abajo State Forest is a forest preserve in Puerto Rico owned by the Department of Natural Resources and one of the 20 state forests on the island. It was designated a National Natural Landmark in 1980 and constitutes an area of 3,590 acres. It consists mostly of subtropical wet and moist karst forest. Mogotes and sinkholes fill the landscape. The forest preserve is located in the municipalities of Arecibo and Utuado. In addition to its ecological value, the forest also contains a number of archaeological sites. ', 'Much of the development within the forest area was 

Assistant: None
