<a href="https://colab.research.google.com/github/dbigman/project-dsml-interactive-travel-planner/blob/main/Final_project_IH.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The Hithchiker's Guide to Puerto Rico
In this project, we are going to use all the Data Science and Machine Learning skills we have acquired during the course of the last few weeks to build an interactive travel planner for the beautiful island of Puerto Rico. By the end of this project, we will present a working application that cooperates with a visitor to help them build a travel itinerary suitable to their personal preferences.

In [4]:
# Imports
import os
import requests

import json
from bs4 import BeautifulSoup
import re
import html


  from .autonotebook import tqdm as notebook_tqdm


## From HTML to .JSON|

### Functions

In [5]:
# Function to clean unwanted characters and fix words
def clean_text(text):
    # Remove unwanted characters (e.g., escape sequences like "\xc9")
    text = re.sub(r'\\[xX][0-9A-Fa-f]{2}', '', text)  # Remove escaped Unicode characters
    text = re.sub(r'[\r\n\t]', ' ', text)  # Remove newlines and tabs
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with one
    text = text.strip()  # Remove leading and trailing spaces
    return text

# Function to add missing characters like 'ñ'
def add_missing_characters(text):
    # Mapping of words that might need 'ñ' or other fixes
    replacements = {
        "Aguada": "Aguada",
        "Aasco": "Añasco",
        "Catao": "Cataño",
        "Nio": "Niño",
        "Peuelas": "Peñuelas"
        # Add other common words as needed
    }

    # Replace words based on the mapping
    for wrong_word, correct_word in replacements.items():
        text = re.sub(rf'\b{wrong_word}\b', correct_word, text, flags=re.IGNORECASE)
    return text

# Function to extract coordinates from the HTML content
def extract_coordinates(html_content):
    # Regex pattern to match lat and lon inside "wgCoordinates"
    coordinates_pattern = r'"wgCoordinates":\s*\{\s*"lat":\s*(-?\d+\.\d+),\s*"lon":\s*(-?\d+\.\d+)\s*\}'
    match = re.search(coordinates_pattern, html_content)

    if match:
        # Clean and extract latitude and longitude as float values
        lat = float(match.group(1).strip().replace("\\n", ""))  # Remove any unwanted newline characters
        lon = float(match.group(2).strip().replace("\\n", ""))  # Remove any unwanted newline characters
        return lat, lon

    return None, None

### Municipalities

In [6]:
# Path to the municipalities folder in Google Drive
# municipalities_folder = "/content/drive/MyDrive/IronHack_final_project/municipalities"
municipalities_folder = 'data\municipalities'

# List to store structured data
municipalities_data = []

# Loop through each .txt file in the folder
for filename in os.listdir(municipalities_folder):
    if filename.endswith(".txt"):  # Ensure we only process .txt files
        file_path = os.path.join(municipalities_folder, filename)

        # Read the HTML content
        with open(file_path, "r", encoding="utf-8") as file:
            html_content = file.read()

        # Parse HTML using BeautifulSoup
        soup = BeautifulSoup(html_content, "html.parser")

        # Extract title (or use filename if no title found), and clean it
        title = clean_text(soup.title.string) if soup.title else clean_text(filename.replace(".txt", ""))
        # Remove " - Wikipedia" from the title
        title = title.replace(" - Wikipedia", "")
        title = add_missing_characters(title)  # Add missing characters to the title

        # Extract first 3 paragraphs for description
        paragraphs = [clean_text(p.get_text(strip=True)) for p in soup.find_all("p")][:3]
        paragraphs = [add_missing_characters(p) for p in paragraphs]  # Add missing characters to description

        # Extract coordinates (latitude and longitude) from HTML content
        latitude, longitude = extract_coordinates(html_content)

        # Structure the data
        municipality = {
            "name": title,
            "category": "Municipality",
            "description": paragraphs,
            "coordinates": {
                "latitude": latitude,
                "longitude": longitude
            },
            "source_file": filename
        }

        # Append to the list
        municipalities_data.append(municipality)

# Save structured data as JSON

output_json = "data/municipalities.json"
with open(output_json, "w", encoding="utf-8") as json_file:
    json.dump(municipalities_data, json_file, indent=4, ensure_ascii=False)

print(f"Municipality data with coordinates saved as {output_json}")

Municipality data with coordinates saved as data/municipalities.json


In [7]:
import pandas as pd

muni_data_extended = []
for entry in municipalities_data:
    muni_data_extended.append({
        "Municipality": entry["name"],
        "Category": entry["category"],
        "Description": " ".join(entry["description"]).strip(),
        "Latitude": entry["coordinates"]["latitude"],
        "Longitude": entry["coordinates"]["longitude"],
        "Source File": entry["source_file"]
    })

municipalities_df = pd.DataFrame(muni_data_extended)




### Landmarks

In [15]:
# Path to the landmarks folder in Google Drive
landmarks_folder = 'data\landmarks'

# List to store structured data
landmarks_data = []

# Loop through each .txt file in the folder
for filename in os.listdir(landmarks_folder):
    if filename.endswith(".txt"):  # Ensure we only process .txt files
        file_path = os.path.join(landmarks_folder, filename)

        # Read the HTML content
        with open(file_path, "r", encoding="utf-8") as file:
            html_content = file.read()

        # Parse HTML using BeautifulSoup
        soup = BeautifulSoup(html_content, "html.parser")

        # Extract title (or use filename if no title found), and clean it
        title = clean_text(soup.title.string) if soup.title else clean_text(filename.replace(".txt", ""))
        # Remove " - Wikipedia" from the title
        title = title.replace(" - Wikipedia", "")
        title = add_missing_characters(title)  # Add missing characters to the title

        # Extract first 3 paragraphs for description
        paragraphs = [clean_text(p.get_text(strip=True)) for p in soup.find_all("p")][:3]
        paragraphs = [add_missing_characters(p) for p in paragraphs]  # Add missing characters to description

        # Extract coordinates (latitude and longitude) from HTML content
        latitude, longitude = extract_coordinates(html_content)

        # Structure the data
        landmark = {
            "name": title,
            "category": "Landmark",
            "description": paragraphs,
            "coordinates": {
                "latitude": latitude,
                "longitude": longitude
            },
            "source_file": filename
        }

        # Append to the list
        landmarks_data.append(landmark)

# Save structured data as JSON
output_json = "data/landmarks.json"
with open(output_json, "w", encoding="utf-8") as json_file:
    json.dump(landmarks_data, json_file, indent=4, ensure_ascii=False)

print(f"Landmark data with coordinates saved as {output_json}")

Landmark data with coordinates saved as data/landmarks.json


In [None]:
import openai
import os
import time
import logging
import pandas as pd
from dotenv import load_dotenv
import sys
from icecream import ic

# Create a custom logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Clear existing handlers (useful in a notebook if logging was already configured)
if logger.hasHandlers():
    logger.handlers.clear()

# Create handlers: one for file and one for console output
file_handler = logging.FileHandler("landmarks_correction.log")
file_handler.setLevel(logging.INFO)

console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)

# Create a formatter and add it to the handlers
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
file_handler.setFormatter(formatter)
console_handler.setFormatter(formatter)

# Add handlers to the logger
logger.addHandler(file_handler)
logger.addHandler(console_handler)

# Load environment variables from .env file
load_dotenv()

# Get OpenAI API key
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    logging.error("OpenAI API key is missing! Check your .env file.")
    raise ValueError("OpenAI API key not found.")

def correct_text(text, retries=3):
    """
    Corrects typos using OpenAI API with simple retry logic.
    """
    prompt = (
        f"Correct any typos in the following text while keeping the meaning intact. "
        f"Do not include ANYTHING in addition to the corrected text:\n{text}"
    )
    
    for attempt in range(1, retries + 1):
        try:
            logging.info("Calling OpenAI API (attempt %s)...", attempt)
            client = openai.OpenAI(
                api_key=openai_api_key,
                base_url="https://api.openai.com/v1"
            )
            response = client.chat.completions.create(
                model="gpt-4o-mini-2024-07-18",
                messages=[
                    {"role": "system", "content": "You are a proofreading assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=1,
                max_tokens=500,
                top_p=1
            )
            
            result = response.choices[0].message.content.strip()
            logging.info("OpenAI API call successful on attempt %s.", attempt)
            logging.debug("Result: %s", result)
            return result
        
        except Exception as e:
            logging.error("Error on attempt %s: %s", attempt, e)
            time.sleep(2 ** attempt)  # Exponential backoff
    
    logging.error("All attempts failed for text starting with: %s", text[:50])
    ic(text)
    return text  

def process_description(text):
    """
    Wrapper function to process each description. Inserts a short delay
    to help avoid rate limiting.
    """
    corrected = correct_text(text)
    ic(corrected)
    time.sleep(1)  # Short delay to avoid rate limiting
    return corrected

# Load the landmarks dataset
# landmarks_df = pd.read_csv("landmarks.csv", encoding="utf-8")

# Load the JSON file
landmarks_json_path = "data/landmarks.json"

with open(landmarks_json_path, "r", encoding="utf-8") as file:
    landmarks_data = json.load(file)

# Convert JSON data into a Pandas DataFrame
landmarks_df = pd.DataFrame(landmarks_data)




# Apply the correction function to the 'description' column and store results in a new column
landmarks_df['corrected_description'] = landmarks_df['description'].apply(process_description)

# Optionally, save the updated DataFrame to a new CSV file
landmarks_df.to_csv("landmarks_corrected.csv", index=False, encoding="utf-8")

print("Correction process completed! The corrected data is saved in 'landmarks_corrected.csv'.")


In [18]:
# Save the corrected DataFrame as a JSON file
landmarks_df.to_json("landmarks_corrected.json", orient="records", force_ascii=False, indent=4)


## News

In [None]:
# Navigate to the correct folder
data_dir = "/content/drive/MyDrive/IronHack_final_project/elmundo_chunked_es_page1_40years"
files = os.listdir(data_dir)

print(f"Total files: {len(files)}")
print("Sample files:", files[:5])  # Preview first 5 files
file_sizes = {file: os.path.getsize(os.path.join(data_dir, file)) for file in files}
print(f"Average file size: {sum(file_sizes.values()) / len(file_sizes):.2f} bytes")
print("Smallest files:", sorted(file_sizes.items(), key=lambda x: x[1])[:5])
print("Largest files:", sorted(file_sizes.items(), key=lambda x: x[1], reverse=True)[:5])

Total files: 1668
Sample files: ['19501216_1.txt', '19430227_1.txt', '19520517_1.txt', '19450331_1.txt', '19340224_1.txt']
Average file size: 19467.90 bytes
Smallest files: [('19410104_1.txt', 9), ('19380101_1.txt', 9), ('19280901_1.txt', 6673), ('19351221_1.txt', 7570), ('19360125_1.txt', 7782)]
Largest files: [('19470802_1.txt', 37101), ('19471004_1.txt', 36966), ('19461221_1.txt', 36070), ('19470524_1.txt', 35724), ('19471206_1.txt', 35615)]


## Exploratory Data Analysis

In [None]:
def read_text_file(file_path, num_lines=15):
    with open(file_path, "r", encoding="utf-8") as f:
        return "\n".join([next(f) for _ in range(num_lines)])

sample_file = os.path.join(data_dir, files[0])
print(f"Contents of {files[0]}:\n", read_text_file(sample_file))

In [None]:
import numpy as np

file_lengths = []

for file in files:
    with open(os.path.join(data_dir, file), "r", encoding="utf-8") as f:
        text = f.read()
        file_lengths.append(len(text.split()))  # Count words

print(f"Average words per file: {np.mean(file_lengths):.2f}")
print(f"Min words: {np.min(file_lengths)}, Max words: {np.max(file_lengths)}")

In [None]:
import matplotlib as plt

plt.hist(file_lengths, bins=50, edgecolor="black")
plt.xlabel("Word Count per File")
plt.ylabel("Number of Files")
plt.title("Distribution of Document Lengths")
plt.show()