In [None]:
pip install -r requirements.txt


In [1]:
# General libraries
import pandas as pd
import numpy as np

In [None]:
# Importing home-made libraries
from app.scraping import scrape_accommodation_data
from app.scraping import scrape_min_price
from app.scraping import start_webdriver
from app.scraping import close_webdriver

from app.geos import get_coordinates

from app.bundling import select_accommodation_bundles

from app.input import read_excel_to_df
from app.input import check_code
from app.input import read_code

In [3]:
# Libraries for geo locations
from geopy.geocoders import Nominatim # Importing the geopy library and Nominatim class
from geopy.exc import GeocoderTimedOut
from geopy.distance import Distance
from geopy.distance import geodesic

In [4]:
# Libraries for web scraping
from time import sleep
from selenium import webdriver
from bs4 import BeautifulSoup
import os
from itertools import product
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import StaleElementReferenceException
from datetime import timedelta
from io import BytesIO
import requests
from datetime import datetime

## Importing the "full dataset" (shortened for convenience)

In [None]:
# Download the internal match data (Excel workbook pinned to a specific commit)
url = "https://raw.githubusercontent.com/dvandasova/JEM207_project/b5414c0b6fd52309880b8478913089768dbe320e/02_Datasets/internal-data.xlsx"

# A timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors instead of handing an error page to the Excel parser
response.raise_for_status()

# Parse the downloaded bytes in memory; no temporary file is written
df = pd.read_excel(BytesIO(response.content))
df

## Omitting all matches within 14 days to prevent cases with no flights or accommodation

In [None]:
# Keep only the matches dated at least 14 days after the moment this cell runs,
# then renumber the rows from 0.
cutoff = datetime.now() + timedelta(days=14)
is_far_enough = df['date'] >= cutoff
df = df.loc[is_far_enough].reset_index(drop=True)
df

## Using the imported scraping functions, first scrape flight data

In [None]:
# Collect one flight price per match, in DataFrame row order
min_prices = []
date_format = '%Y-%m-%d'

for _, match in df.iterrows():
    outbound = match['date']
    inbound = outbound + timedelta(days=2)  # return two days after departure

    # scrape_min_price receives the 'flight.Code' value and both dates as YYYY-MM-DD strings
    cheapest = scrape_min_price(
        match['flight.Code'],
        outbound.strftime(date_format),
        inbound.strftime(date_format),
    )

    min_prices.append(cheapest)
    print(cheapest)  # progress output, one line per match

# Attach the scraped prices as a new column, aligned with the rows iterated above
df['Min Price'] = min_prices

# Show the updated DataFrame
print(df)


## And now scrape the accommodations and create the bundles with each scraping iteration


In [None]:
# Per-match picks for each bundle tier, appended in DataFrame row order
accommodation_standard = []
accommodation_superior = []
accommodation_luxurious = []

# Sentinel that the coordinate columns are compared against for un-geocoded places
NOT_FOUND = "Location not found"
# Offset (in degrees) added to the venue coordinates when a place could not be geocoded
FALLBACK_OFFSET_DEG = 0.1

# The geocoder does not depend on the row, so build it once instead of per iteration
geolocator = Nominatim(user_agent="Geopy Library")

# NOTE(review): the driver is not passed to scrape_accommodation_data below;
# confirm whether app.scraping relies on this browser session.
driver = webdriver.Chrome()
driver.maximize_window()

try:
    # One scraping + bundling pass per match
    for _, row in df.iterrows():
        location = row['accom.Code']
        departure_date = row['date'].strftime('%Y-%m-%d')
        return_date = (row['date'] + timedelta(days=2)).strftime('%Y-%m-%d')  # two-night stay
        city = row['city']
        current_venue = row["venue"]

        # Scrape accommodation data and append the city name to each location string
        accommodation_data = scrape_accommodation_data(location, departure_date, return_date)
        accommodation_data["Location"] = accommodation_data["Location"] + ", " + city

        # Drop listings whose location is flagged as "Nearby - "
        accommodation_data = accommodation_data[
            ~accommodation_data['Location'].str.contains("Nearby - ", na=False)
        ]

        # Coordinates for every remaining accommodation
        places = get_coordinates(accommodation_data['Location'])
        places_df = pd.DataFrame(places, columns=["Place", "Latitude", "Longitude"])

        # Geocode the match venue; geocode() yields a falsy result when nothing is found
        current_venue_location = geolocator.geocode(current_venue)
        venue_found = bool(current_venue_location)

        if venue_found:
            base_latitude = current_venue_location.latitude
            base_longitude = current_venue_location.longitude

            # Places that could not be geocoded fall back to the venue position plus a
            # fixed offset. This is only done when the venue itself has numeric
            # coordinates; adding the offset to the sentinel string would raise TypeError.
            missing = places_df["Latitude"] == NOT_FOUND
            places_df.loc[missing, "Latitude"] = base_latitude + FALLBACK_OFFSET_DEG
            places_df.loc[missing, "Longitude"] = base_longitude + FALLBACK_OFFSET_DEG
        else:
            base_latitude = base_longitude = NOT_FOUND

        # Realign the filtered frame with places_df (both now use a 0..n-1 index)
        accommodation_data = accommodation_data.reset_index(drop=True)
        accommodation_data["Latitude"] = places_df["Latitude"]
        accommodation_data["Longitude"] = places_df["Longitude"]

        # Distance in kilometres from each accommodation to the venue;
        # infinity when either end has no usable coordinates
        distances = []
        for latitude, longitude in zip(places_df["Latitude"], places_df["Longitude"]):
            if venue_found and latitude != NOT_FOUND:
                dist = geodesic((base_latitude, base_longitude), (latitude, longitude)).kilometers
            else:
                dist = float('inf')
            distances.append(dist)

        accommodation_data["Distance"] = distances

        # Print the accommodation data (for debugging purposes)
        print(accommodation_data)

        # Select the Standard, Superior, and Luxurious bundles
        Standard_bundle, Superior_bundle, Luxurious_bundle = select_accommodation_bundles(accommodation_data)

        # Record name, price and rating of each tier's pick for this match
        for picks, bundle in (
            (accommodation_standard, Standard_bundle),
            (accommodation_superior, Superior_bundle),
            (accommodation_luxurious, Luxurious_bundle),
        ):
            picks.append({
                'Name': bundle['Name'],
                'Price': bundle['Price'],
                'Rating': bundle['Rating']
            })
finally:
    # Always close the browser, even if scraping or geocoding raised
    driver.quit()

# Add the accommodation bundles to the original DataFrame.
# Field-major order keeps the column layout: all names, then all prices, then all ratings.
tier_picks = {
    "Standard": accommodation_standard,
    "Superior": accommodation_superior,
    "Luxurious": accommodation_luxurious,
}
for column_suffix, key in (("Accommodation", "Name"), ("Price", "Price"), ("Rating", "Rating")):
    for tier, picks in tier_picks.items():
        df[f"{tier} {column_suffix}"] = [pick[key] for pick in picks]


## Make sure the prices are numeric for easier further processing

In [None]:
# Flight prices: non-numeric entries (e.g. 'n/a') are coerced to NaN and then counted as 0
df['Min Price'] = pd.to_numeric(df['Min Price'], errors='coerce').fillna(0)

for tier in ('Standard', 'Superior', 'Luxurious'):
    price_col = f'{tier} Price'

    # Strip the dollar sign and thousands separators before parsing.
    # The raw string keeps '\$' a regex escape rather than an invalid Python string escape.
    cleaned = df[price_col].replace({r'\$': '', ',': ''}, regex=True)
    df[price_col] = pd.to_numeric(cleaned, errors='coerce')

    # Bundle total = accommodation price (missing counts as 0) + flight price
    df[f'{tier} Price Total'] = df[price_col].fillna(0) + df['Min Price']

## Print the final scraped dataset

In [None]:
# Display the final dataset as the cell's output
df

## Download the dataset, or further use the results (visualize etc.)


In [10]:
# Write the final dataset to an Excel workbook in the working directory,
# without the index column, using the xlsxwriter engine.
df.to_excel(
    'your_file.xlsx',
    index=False,
    engine='xlsxwriter',
)

In [14]:
# Destination of the exported workbook: replace the placeholder before running.
# NOTE(review): no engine is passed here, so pandas presumably picks the writer from
# the file extension; the replacement path likely needs to end in ".xlsx" (confirm).
file_path = r"your_download_path"  # Change this to your desired path
df.to_excel(excel_writer=file_path, index=False)