<a href="https://colab.research.google.com/github/dbigman/project-dsml-interactive-travel-planner/blob/main/Final_project_IH.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The Hithchiker's Guide to Puerto Rico
In this project, we are going to use all the Data Science and Machine Learning skills we have acquired during the course of the last few weeks to build an interactive travel planner for the beautiful island of Puerto Rico. By the end of this project, we will present a working application that cooperates with a visitor to help them build a travel itinerary suitable to their personal preferences.

In [3]:
# Imports
import os
import requests

import json
from bs4 import BeautifulSoup
import re
import html


## From HTML to .JSON|

### Functions

In [4]:
# Function to clean unwanted characters and fix words
def clean_text(text):
    # Remove unwanted characters (e.g., escape sequences like "\xc9")
    text = re.sub(r'\\[xX][0-9A-Fa-f]{2}', '', text)  # Remove escaped Unicode characters
    text = re.sub(r'[\r\n\t]', ' ', text)  # Remove newlines and tabs
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with one
    text = text.strip()  # Remove leading and trailing spaces
    return text

# Function to add missing characters like 'ñ'
def add_missing_characters(text):
    # Mapping of words that might need 'ñ' or other fixes
    replacements = {
        "Aguada": "Aguada",
        "Aasco": "Añasco",
        "Catao": "Cataño",
        "Nio": "Niño",
        "Peuelas": "Peñuelas"
        # Add other common words as needed
    }

    # Replace words based on the mapping
    for wrong_word, correct_word in replacements.items():
        text = re.sub(rf'\b{wrong_word}\b', correct_word, text, flags=re.IGNORECASE)
    return text

# Function to extract coordinates from the HTML content
def extract_coordinates(html_content):
    # Regex pattern to match lat and lon inside "wgCoordinates"
    coordinates_pattern = r'"wgCoordinates":\s*\{\s*"lat":\s*(-?\d+\.\d+),\s*"lon":\s*(-?\d+\.\d+)\s*\}'
    match = re.search(coordinates_pattern, html_content)

    if match:
        # Clean and extract latitude and longitude as float values
        lat = float(match.group(1).strip().replace("\\n", ""))  # Remove any unwanted newline characters
        lon = float(match.group(2).strip().replace("\\n", ""))  # Remove any unwanted newline characters
        return lat, lon

    return None, None

### Municipalities

In [None]:
# Path to the municipalities folder in Google Drive
# municipalities_folder = "/content/drive/MyDrive/IronHack_final_project/municipalities"
municipalities_folder = 'data\municipalities'

# List to store structured data
municipalities_data = []

# Loop through each .txt file in the folder
for filename in os.listdir(municipalities_folder):
    if filename.endswith(".txt"):  # Ensure we only process .txt files
        file_path = os.path.join(municipalities_folder, filename)

        # Read the HTML content
        with open(file_path, "r", encoding="utf-8") as file:
            html_content = file.read()

        # Parse HTML using BeautifulSoup
        soup = BeautifulSoup(html_content, "html.parser")

        # Extract title (or use filename if no title found), and clean it
        title = clean_text(soup.title.string) if soup.title else clean_text(filename.replace(".txt", ""))
        # Remove " - Wikipedia" from the title
        title = title.replace(" - Wikipedia", "")
        title = add_missing_characters(title)  # Add missing characters to the title

        # Extract first 3 paragraphs for description
        paragraphs = [clean_text(p.get_text(strip=True)) for p in soup.find_all("p")][:3]
        paragraphs = [add_missing_characters(p) for p in paragraphs]  # Add missing characters to description

        # Extract coordinates (latitude and longitude) from HTML content
        latitude, longitude = extract_coordinates(html_content)

        # Structure the data
        municipality = {
            "name": title,
            "category": "Municipality",
            "description": paragraphs,
            "coordinates": {
                "latitude": latitude,
                "longitude": longitude
            },
            "source_file": filename
        }

        # Append to the list
        municipalities_data.append(municipality)

# Save structured data as JSON
output_json = os.path.join(municipalities_folder, "municipalities.json")
               
with open(output_json, "w", encoding="utf-8") as json_file:
    json.dump(municipalities_data, json_file, indent=4, ensure_ascii=False)

print(f"Municipality data with coordinates saved as {output_json}")

Municipality data with coordinates saved as data\municipalities\municipalities.json


In [None]:
# correcting typos in descriptions

import openai
import os
import time
import logging
import json
from dotenv import load_dotenv

# Configure logging
logging.basicConfig(
    filename="municipality_correction.log",
    level=logging.DEBUG,  # Change as needed
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# Load the JSON file
with open("data/municipalities/municipalities.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Load environment variables from .env file
load_dotenv()

# Get DeepSeek API key
deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
if not deepseek_api_key:
    logging.error("DeepSeek API key is missing! Check your .env file.")
    raise ValueError("DeepSeek API key not found.")

def correct_text_ds(text, retries=3):
    """
    Corrects typos using DeepSeek API with simple retry logic.
    """
    prompt = f"Correct any typos in the following text while keeping the meaning intact. Do not include ANYTHING in addition to the corrected text:\n{text}"
    
    for attempt in range(1, retries + 1):
        try:
            logging.info("Calling DeepSeek API (attempt %s)...", attempt)
            client = openai.OpenAI(
                api_key=deepseek_api_key,
                base_url="https://api.deepseek.com/v1"  # DeepSeek endpoint
            )
            response = client.chat.completions.create(
                model="deepseek-chat",  
                messages=[
                    {"role": "system", "content": "You are a proofreading assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7,
                max_tokens=2000,
                top_p=0.8
            )
            
            result = response.choices[0].message.content.strip()
            logging.info("DeepSeek API call successful on attempt %s.", attempt)
            logging.debug("Result: %s", result)
            return result
        
        except Exception as e:
            logging.error("Error on attempt %s: %s", attempt, e)
            time.sleep(2 ** attempt)  # Exponential backoff
    
    logging.error("All attempts failed for text starting with: %s", text[:50])
    return text  # Or return "" if you prefer an empty result

# Process each municipality
total_municipalities = len(data)
for i, municipality in enumerate(data):
    name = municipality.get('name', 'Unknown')
    print(f"Processing {i+1}/{total_municipalities}: {name}")
    original_text = "".join(municipality.get("description", []))
    corrected_description = correct_text_ds(original_text)
    municipality["description"] = [corrected_description]
    
    # Save progress after processing each municipality
    with open("municipalities_corrected.json", "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4, ensure_ascii=False)
    
    # Short delay to avoid rate limiting
    time.sleep(1)

print("Correction process completed! The corrected data is saved in 'municipalities_corrected.json'.")


### Landmarks

In [10]:
# Path to the landmarks folder in Google Drive
landmarks_folder = 'data\landmarks'

# List to store structured data
landmarks_data = []

# Loop through each .txt file in the folder
for filename in os.listdir(landmarks_folder):
    if filename.endswith(".txt"):  # Ensure we only process .txt files
        file_path = os.path.join(landmarks_folder, filename)

        # Read the HTML content
        with open(file_path, "r", encoding="utf-8") as file:
            html_content = file.read()

        # Parse HTML using BeautifulSoup
        soup = BeautifulSoup(html_content, "html.parser")

        # Extract title (or use filename if no title found)
        title = soup.title.string if soup.title else filename.replace(".txt", "")
        title = clean_text(title)  # Clean the title text

        # Extract first 3 paragraphs for description
        paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")][:3]
        paragraphs = [clean_text(paragraph) for paragraph in paragraphs]  # Clean description paragraphs

        # Extract coordinates (if available)
        lat, lon = extract_coordinates(html_content)
        coordinates = {"latitude": lat, "longitude": lon} if lat and lon else None

        # Find the municipality (look for it in the text)
        municipality = None
        if "municipality" in html_content.lower():
            municipality_match = re.search(r"Municipality of\s+([A-Za-z\s]+)", html_content)
            if municipality_match:
                municipality = clean_text(municipality_match.group(1))  # Clean municipality text

        # Structure the data
        landmark = {
            "name": title,
            "category": "Landmark",
            "description": paragraphs,
            "coordinates": coordinates,
            "municipality": municipality,
            "source_file": filename
        }

        # Append to the list
        landmarks_data.append(landmark)


# Save structured data as JSON
output_json = os.path.join(landmarks_folder, "landmarks.json")
with open(output_json, "w", encoding="utf-8") as json_file:
    json.dump(landmarks_data, json_file, indent=4, ensure_ascii=False)

print(f"Landmark data saved as {output_json}")

Landmark data saved as data\landmarks\landmarks.json


In [12]:
import openai
import os
import time
import logging
import pandas as pd
from dotenv import load_dotenv
from tabulate import tabulate
from json import JSONDecodeError

# Configure logging
logging.basicConfig(
    filename="landmarks_correction.log",
    level=logging.INFO,  # Change to logging.INFO, logging.DEBUG, logging.ERROR as needed
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# Load the JSON file
with open("data\landmarks\landmarks.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Load environment variables from .env file
load_dotenv()

# Get DeepSeek API key
deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
if not deepseek_api_key:
    logging.error("DeepSeek API key is missing! Check your .env file.")
    raise ValueError("DeepSeek API key not found.")

def correct_text_ds(text):
    """
    Corrects typos using deepseek api
    """
    logging.info("Calling DeepSeek API...")
    try:
        client = openai.OpenAI(
            api_key=deepseek_api_key,
            base_url="https://api.deepseek.com/v1"  # DeepSeek endpoint
        )
        prompt = f"Correct any typos in the following text while keeping the meaning intact. ONLY RETURN CORRECTED TEXT. Do not include ANYTHING in addition to the corrected text:\n{text}"
        response = client.chat.completions.create(
            model="deepseek-chat",  
            # model="deepseek-reasoner",
            messages=[
                {"role": "system", "content": "You are a proofreading assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=2000,
            top_p=0.8
        )
        
        result = response.choices[0].message.content
        logging.info("DeepSeek API call successful.")
        logging.info(result)
        return result

    except Exception as e:
        logging.error(f"Unexpected error calling DeepSeek API: {e}")
        return ""


# Process each landmark
total_landmarks = len(data)
for i, landmark in enumerate(data):
    print(f"Processing {i+1}/{total_landmarks}: {landmark['name']}")
    corrected_description = correct_text_ds("".join(landmark["description"]))
    landmark["description"] = [corrected_description]

# Save the corrected data back to a new JSON file
with open("landmarks_corrected.json", "w", encoding="utf-8") as file:
    json.dump(data, file, indent=4, ensure_ascii=False)

print("Correction process completed! The corrected data is saved in 'landmarks_corrected.json'.")


Processing 1/574: Academia del Perpetuo Socorro - Wikipedia
Processing 2/574: Academia Interamericana Metro - Wikipedia
Processing 3/574: Academia Maria Reina - Wikipedia
Processing 4/574: Academia San Jorge - Wikipedia
Processing 5/574: Adjuntas barrio-pueblo - Wikipedia
Processing 6/574: Aguada barrio-pueblo - Wikipedia
Processing 7/574: Naval Radio Transmitter Facility Aguada - Wikipedia
Processing 8/574: Aguadilla barrio-pueblo - Wikipedia
Processing 9/574: Aguadilla Ice Skating Arena - Wikipedia
Processing 10/574: Aguas Buenas barrio-pueblo - Wikipedia
Processing 11/574: Aguas Buenas Cave System - Wikipedia
Processing 12/574: Aguirre State Forest - Wikipedia
Processing 13/574: Aibonito barrio-pueblo - Wikipedia
Processing 14/574: Aibonito Festival of Flowers - Wikipedia
Processing 15/574: Albergue Olmpico - Wikipedia
Processing 16/574: Albizu University - Wikipedia
Processing 17/574: Ana G. Mndez University - Wikipedia
Processing 18/574: Antiguo Casino de Ponce - Wikipedia
Process

## News

In [None]:
# Navigate to the correct folder
data_dir = "/content/drive/MyDrive/IronHack_final_project/elmundo_chunked_es_page1_40years"
files = os.listdir(data_dir)

print(f"Total files: {len(files)}")
print("Sample files:", files[:5])  # Preview first 5 files
file_sizes = {file: os.path.getsize(os.path.join(data_dir, file)) for file in files}
print(f"Average file size: {sum(file_sizes.values()) / len(file_sizes):.2f} bytes")
print("Smallest files:", sorted(file_sizes.items(), key=lambda x: x[1])[:5])
print("Largest files:", sorted(file_sizes.items(), key=lambda x: x[1], reverse=True)[:5])

Total files: 1668
Sample files: ['19501216_1.txt', '19430227_1.txt', '19520517_1.txt', '19450331_1.txt', '19340224_1.txt']
Average file size: 19467.90 bytes
Smallest files: [('19410104_1.txt', 9), ('19380101_1.txt', 9), ('19280901_1.txt', 6673), ('19351221_1.txt', 7570), ('19360125_1.txt', 7782)]
Largest files: [('19470802_1.txt', 37101), ('19471004_1.txt', 36966), ('19461221_1.txt', 36070), ('19470524_1.txt', 35724), ('19471206_1.txt', 35615)]


## Exploratory Data Analysis