In [1]:
pip install -r requirements.txt


In [None]:
# General libraries
import pandas as pd
import numpy as np

In [2]:

# Libraries for geo locations
from geopy.geocoders import Nominatim # Importing the geopy library and Nominatim class
from geopy.exc import GeocoderTimedOut
from geopy.distance import Distance
from geopy.distance import geodesic

In [3]:
# Libraries for web scraping
from time import sleep
from selenium import webdriver
from bs4 import BeautifulSoup
import os
from itertools import product
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import StaleElementReferenceException
from datetime import timedelta
from io import BytesIO
import requests
from io import BytesIO
from datetime import datetime

## Defining input functions

In [4]:
# User Input: Function to check code and return error message if correct input code not found
def check_code(primary_code, dataframe):
    # Extract first column of the dataframe
    codes = dataframe.iloc[:, 0:1]
    # Check if the primary code exists in the internal list of codes
    if primary_code in codes.values:
        pass
    else:
        print(f"Error: Code {primary_code} is not a valid input. Please check the code and try again.")
    # Check if the primary code corresponds to a date that is in the future. Date is in second column of the data frame
    if dataframe.loc[dataframe.iloc[:, 0] == primary_code].iloc[0, 1] < pd.Timestamp.now():
        print(f"Error: Code {primary_code} does not correspond to a date in the future. Please check the code and try again.")

In [5]:
# User Input: Function to read the input file and extract data
def read_code(file_path, internal_list):
    with open(file_path, 'r') as file:
        file.readline()  # Skip the first line as it's not needed for the code
        primary_code = file.readline().strip()  # Read the second line for primary code

        # Check if the primary code exists in the internal list of codes
        check_code(primary_code, internal_list)
    
    return primary_code

In [7]:
# Internal Input: Function to read the internal file and extract data
def read_excel_to_df(file_path):
    try:
        # Read the Excel file into a DataFrame
        df = pd.read_excel(file_path, engine='openpyxl')
        
        # Check if the DataFrame has the correct number of columns and they are in the expected order
        expected_columns = ['code', 'date', 'event', 'city', 'venue', 'accom.Code', 'flight.Code']
        if len(df.columns) != len(expected_columns) or not all(df.columns == expected_columns):
            # If columns do not match, raise an error
            raise ValueError("Error: Unexpected input form. Please insert a file containing " +
                             "\"code\", \"date\", \"event\", \"city\", \"venue\", \"accommodation code\" and \"flight code\" in this order.")

        # Convert the 'date' column to datetime dtype
        df['date'] = pd.to_datetime(df['date'])

        # Return the DataFrame
        return df
    
    except ValueError as ve:
        print(ve)
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [13]:
# Importing internal match data
url = "https://raw.githubusercontent.com/dvandasova/JEM207_project/de33c218e1946a9e46aa50b2188d3ee5434d1ad9/02_Datasets/internal-data.xlsx"
response = requests.get(url)
df = pd.read_excel(BytesIO(response.content))
df

Unnamed: 0,code,date,event,city,venue,accom.Code,flight.Code
0,bundes000,2024-09-02,Borussia Dortmund vs. St. Pauli,Dortmund,"Signal Iduna Park, Dortmund",A-p35722,DTM
1,bundes001,2024-09-14,RB Leipzig vs. Union Berlin,Leipzig,"Red Bull Arena, Leipzig",A-p34515,LEJ
2,bundes002,2024-09-14,Borussia Dortmund vs. Heidenheim 1846,Dortmund,"Signal Iduna Park, Dortmund",A-p35722,DTM
3,bundes003,2024-09-14,1899 Sinsheim vs. Bayer Leverkusen,Sinsheim,"Rhein-Neckar Arena, Sinsheim",A-p33317,MHG
4,bundes004,2024-09-14,Freiburg vs. Bochum,Freiburg,"Europa-Park Stadion, Freiburg",A-p35464,HAM
...,...,...,...,...,...,...,...
86,bundes086,2024-11-30,Freiburg vs. Bor. Mönchengladbach,Freiburg,"Europa-Park Stadion, Freiburg",A-p35464,HAM
87,bundes087,2024-11-30,Augsburg vs. Bochum,Augsburg,"WWK Arena, Augsburg",A-p36311,MUC
88,bundes088,2024-11-30,Mainz 05 vs. 1899 Sinsheim,Mainz,"Mewa Arena, Mainz",A-p34389,FRA
89,bundes089,2024-11-30,Union Berlin vs. Bayer Leverkusen,Berlin,"Stadion An der Alten Försterei, Berlin",A-p36116,BER


## Defining scraping and bundling functions and scraping the data

In [10]:
# Initialize the WebDriver (e.g., Chrome)
driver = webdriver.Chrome()

df = df[df['date'] >= (datetime.now() + timedelta(days=14))]
df = df.reset_index(drop=True)

# Define a function to scrape the minimum price for a given city and dates
def scrape_min_price(city, departure_date, return_date):
    url = f"https://www.kayak.ie/flights/PRG-{city}/{departure_date}/{return_date}?sort=bestflight_a"
    driver.get(url)
    sleep(10)  # Adjust the sleep time as necessary
    
    # Close any popups that may appear
    try:
        popwindow = driver.find_element("xpath", '//*[@id="portal-container"]/div/div[2]/div/div/div[1]/div/span[2]/button/div/div')
        popwindow.click()
    except Exception:
        pass  # Ignore if no popup appears

    # Find all flight rows
    flight_rows = driver.find_elements("xpath", '//div[@class="nrc6-inner"]')

    # Initialize list to hold flight prices
    flight_prices = []

    # Scrape flight prices from each row
    for row in flight_rows:
        elementHTML = row.get_attribute('outerHTML')
        elementSoup = BeautifulSoup(elementHTML, 'html.parser')

        price = elementSoup.find("div", {"class": "f8F1-price-text"})

        if price:
            # Clean and append price (removing currency symbols)
            price_text = price.text.replace('€', '').replace(',', '').strip()
            try:
                flight_prices.append(float(price_text))  # Convert to float
            except ValueError:
                pass  # Skip if conversion fails

    # Return the minimum price if available, otherwise return None
    return min(flight_prices) if flight_prices else None

# Loop through each row in the DataFrame
min_prices = []  # To store the minimum prices for each city and date combination
for index, row in df.iterrows():
    city = row['flight.Code']  # Use the column name 'flight.Code' for the city
    departure_date = row['date'].strftime('%Y-%m-%d')  # Assuming 'date' is in a proper date format

    # Calculate the return date as departure date + 2 days
    return_date = (row['date'] + timedelta(days=2)).strftime('%Y-%m-%d')
    
    # Scrape the minimum price for the city and date combination
    min_price = scrape_min_price(city, departure_date, return_date)
    
    # Append the minimum price to the list
    min_prices.append(min_price)

    print(min_price)

# Add the minimum price as a new column in the original DataFrame
df['Min Price'] = min_prices

# Close the WebDriver
driver.quit()

# Display the updated DataFrame
print(df)

None


KeyboardInterrupt: 

In [11]:
# Setup your Selenium driver (e.g., Chrome)
driver = webdriver.Chrome()
driver.maximize_window()

def scrape_accommodation_data(location, departure_date, return_date, max_retries=3):
    # URL to load
    url = f"https://www.kayak.com/hotels/{location}/{departure_date}/{return_date}/2adults;map?sort=rank_a"
    
    # Initialize retries
    attempts = 0
    booking_data = []

    while attempts < max_retries:
        driver.get(url)
        sleep(10)

        # Scroll down to load more hotels
        actions = ActionChains(driver)
        for _ in range(5):
            actions.send_keys(Keys.PAGE_DOWN).perform()
            sleep(2)

        # Find hotel rows
        hotel_rows = driver.find_elements("xpath", '//div[contains(@class, "resultInner")]')

        # If no hotel rows are found, retry by reloading the page
        if not hotel_rows:
            print(f"No hotels found, retrying... attempt {attempts + 1} of {max_retries}")
            attempts += 1
            continue  # Reload the page and try again

        # Process the hotel rows if found
        for row in hotel_rows:
            elementHTML = row.get_attribute('outerHTML')
            elementSoup = BeautifulSoup(elementHTML, 'html.parser')

            # Extract hotel details
            location_elem = elementSoup.find("div", {"class": "upS4 upS4-big-name"})
            location_text = location_elem.text if location_elem else "Location not found"

            name_elem = elementSoup.find("div", {"class": "FLpo-hotel-name"})
            name_text = name_elem.text if name_elem else "Name not found"

            rating_elem = elementSoup.find("div", {"class": "wdjx wdjx-positive wdjx-mod-rating-condensed"})
            rating_text = rating_elem.text[:3] if rating_elem else "0"
            rating_text = rating_text.replace(",", ".")

            price_elem = elementSoup.find("div", {"class": "c1XBO"})
            price_text = price_elem.text if price_elem else "0"

            # Print out the prices found during scraping for debugging
            print(f"Price for {name_text}: {price_text}")

            distance_elem = elementSoup.find("span", {"class": "hotel-distance"})
            distance_text = distance_elem.text if distance_elem else "0"

            if price_text == "0" or rating_text == "0" or name_text == "Name not found":
                continue  # Skip this row if any of the data is missing
            else:
                # Add the hotel data to the list
                booking_data.append({
                    'Location': location_text,
                    "Name": name_text,
                    "Rating": rating_text,
                    "Price": price_text
                })

        # If we have successfully scraped data, break out of the retry loop
        if booking_data:
            break
        else:
            print(f"Retrying scrape... attempt {attempts + 1} of {max_retries}")
            attempts += 1

    # If after retries, no data is found, return an empty DataFrame
    if not booking_data:
        print("No data found after retries.")
        return pd.DataFrame()

    # Convert the scraped data to a DataFrame
    booking_data_df = pd.DataFrame(booking_data)

    # Data cleaning and conversion to numeric types
    booking_data_df['Rating'] = pd.to_numeric(booking_data_df['Rating'], errors='coerce')
    booking_data_df['Price_numeric'] = booking_data_df['Price'].replace({'\$': '', ',': ''}, regex=True)
    booking_data_df['Price_numeric'] = pd.to_numeric(booking_data_df['Price_numeric'], errors='coerce')

    return booking_data_df


  booking_data_df['Price_numeric'] = booking_data_df['Price'].replace({'\$': '', ',': ''}, regex=True)


In [12]:
from time import sleep
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

# Function to start a new WebDriver session
def start_webdriver():
    driver = webdriver.Chrome()  # Add options if needed
    driver.maximize_window()
    return driver

# Function to close WebDriver session
def close_webdriver(driver):
    driver.quit()

def scrape_accommodation_data(location, departure_date, return_date, max_retries=5):
    # URL to load
    url = f"https://www.kayak.com/hotels/{location}/{departure_date}/{return_date}/2adults;map?sort=rank_a"
    
    # Initialize retries
    attempts = 0
    booking_data = []

    # Start WebDriver
    driver = start_webdriver()

    while attempts < max_retries:
        driver.get(url)
        sleep(10)

        # Scroll down to load more hotels
        actions = ActionChains(driver)
        for _ in range(5):
            actions.send_keys(Keys.PAGE_DOWN).perform()
            sleep(2)

        # Find hotel rows
        hotel_rows = driver.find_elements("xpath", '//div[contains(@class, "resultInner")]')

        # If no hotel rows are found, retry by reloading the page and restarting WebDriver
        if not hotel_rows:
            print(f"No hotels found, restarting WebDriver... attempt {attempts + 1} of {max_retries}")
            close_webdriver(driver)  # Close the current WebDriver session
            driver = start_webdriver()  # Start a fresh WebDriver session
            attempts += 1
            continue  # Retry with a fresh session

        # Process the hotel rows if found
        for row in hotel_rows:
            elementHTML = row.get_attribute('outerHTML')
            elementSoup = BeautifulSoup(elementHTML, 'html.parser')

            # Extract hotel details
            location_elem = elementSoup.find("div", {"class": "upS4 upS4-big-name"})
            location_text = location_elem.text if location_elem else "Location not found"

            name_elem = elementSoup.find("div", {"class": "FLpo-hotel-name"})
            name_text = name_elem.text if name_elem else "Name not found"

            rating_elem = elementSoup.find("div", {"class": "wdjx wdjx-positive wdjx-mod-rating-condensed"})
            rating_text = rating_elem.text[:3] if rating_elem else "0"
            rating_text = rating_text.replace(",", ".")

            price_elem = elementSoup.find("div", {"class": "c1XBO"})
            price_text = price_elem.text if price_elem else "0"

            # Print out the prices found during scraping for debugging
            print(f"Price for {name_text}: {price_text}")

            distance_elem = elementSoup.find("span", {"class": "hotel-distance"})
            distance_text = distance_elem.text if distance_elem else "0"

            if price_text == "0" or rating_text == "0" or name_text == "Name not found":
                continue  # Skip this row if any of the data is missing
            else:
                # Add the hotel data to the list
                booking_data.append({
                    'Location': location_text,
                    "Name": name_text,
                    "Rating": rating_text,
                    "Price": price_text
                })

        # If we have successfully scraped data, break out of the retry loop
        if booking_data:
            break
        else:
            print(f"Retrying scrape... attempt {attempts + 1} of {max_retries}")
            attempts += 1

    # Close WebDriver after all attempts or after successful scrape
    close_webdriver(driver)

    # If after retries, no data is found, return an empty DataFrame
    if not booking_data:
        print("No data found after retries.")
        return pd.DataFrame()

    # Convert the scraped data to a DataFrame
    booking_data_df = pd.DataFrame(booking_data)

    # Data cleaning and conversion to numeric types
    booking_data_df['Rating'] = pd.to_numeric(booking_data_df['Rating'], errors='coerce')
    booking_data_df['Price_numeric'] = booking_data_df['Price'].replace({'\$': '', ',': ''}, regex=True)
    booking_data_df['Price_numeric'] = pd.to_numeric(booking_data_df['Price_numeric'], errors='coerce')

    return booking_data_df

  booking_data_df['Price_numeric'] = booking_data_df['Price'].replace({'\$': '', ',': ''}, regex=True)


In [13]:
def select_accommodation_bundles(booking_data_df):
    # Price classification
    low_percentile = booking_data_df['Price_numeric'].quantile(0.33)
    mid_percentile = booking_data_df['Price_numeric'].quantile(0.66)

    # Distance classification
    low_dist_quantile = booking_data_df['Distance'].quantile(0.33)
    mid_dist_quantile = booking_data_df['Distance'].quantile(0.66)

    # Define a function to calculate rating points
    def rating_points(rating):
        if rating < 7:
            return 1
        elif rating <= 8.5:
            return 2
        else:
            return 3

    # Define a function to calculate distance points
    def distance_points(dist):
        if dist <= low_dist_quantile:
            return 3
        elif dist <= mid_dist_quantile:
            return 2
        else:
            return 1

    # Assign points based on rating
    booking_data_df['Rating_Points'] = booking_data_df['Rating'].apply(rating_points)

    # Assign points based on distance
    booking_data_df['Distance_Points'] = booking_data_df['Distance'].apply(distance_points)

    # Calculate total points by summing Rating_Points and Distance_Points
    booking_data_df['Total_Points'] = booking_data_df['Rating_Points'] + booking_data_df['Distance_Points']

    # Sort the dataframe by Total_Points in descending order
    booking_data_df = booking_data_df.sort_values(by='Total_Points', ascending=False)

    # Safely select the three accommodations based on price
    luxurious_bundle = booking_data_df.loc[booking_data_df['Price_numeric'] > mid_percentile]
    superior_bundle = booking_data_df.loc[(booking_data_df['Price_numeric'] > low_percentile) & (booking_data_df['Price_numeric'] <= mid_percentile)]
    standard_bundle = booking_data_df.loc[booking_data_df['Price_numeric'] <= low_percentile]

    # Check if there's data for each category and return the best accommodation based on Total_Points
    if not luxurious_bundle.empty:
        luxurious_bundle = luxurious_bundle.iloc[0]
    else:
        luxurious_bundle = {'Name': 'Not Found', 'Price': 0, 'Rating': 0, 'Total_Points': 0}

    if not superior_bundle.empty:
        superior_bundle = superior_bundle.iloc[0]
    else:
        superior_bundle = {'Name': 'Not Found', 'Price': 0, 'Rating': 0, 'Total_Points': 0}

    if not standard_bundle.empty:
        standard_bundle = standard_bundle.iloc[0]
    else:
        standard_bundle = {'Name': 'Not Found', 'Price': 0, 'Rating': 0, 'Total_Points': 0}

    return standard_bundle, superior_bundle, luxurious_bundle

In [14]:
# Function to obtain the coordinates of a list of places
def get_coordinates(places):
    geolocator = Nominatim(user_agent="Geopy Library")
    coordinates = []  # Use a list to store coordinate data
    
    for place in places:
        try:
            location = geolocator.geocode(place)
            if location:
                # Append the place and its coordinates as a list
                coordinates.append([place, location.latitude, location.longitude])
            else:
                # Append the place and a message indicating location was not found
                coordinates.append([place, "Location not found", "Location not found"])
        except Exception as e:
            # Append the place and the error occurred
            coordinates.append([place, f"Error occurred: {str(e)}", f"Error occurred: {str(e)}"])
    
    return coordinates

In [28]:
# Define lists to hold the accommodation data for each row

accommodation_standard = []
accommodation_superior = []
accommodation_luxurious = []

driver = webdriver.Chrome()
driver.maximize_window()

# Loop through each row in the DataFrame to get location and dates
for index, row in df.iterrows():

    location = row['accom.Code']  # Assuming 'accom.Code' contains the location identifier
    departure_date = row['date'].strftime('%Y-%m-%d')  # Assuming 'date' is a date column
    return_date = (row['date'] + timedelta(days=2)).strftime('%Y-%m-%d') 

    city = row['city']  # Assuming 'city' contains the city name
    
    # Scrape accommodation data
    accommodation_data = scrape_accommodation_data(location, departure_date, return_date)
    accommodation_data["Location"] = accommodation_data["Location"] + ", " + city
    
    accommodation_data = accommodation_data[~accommodation_data['Location'].str.contains("Nearby - ", na=False)]
    current_venue = row["venue"] 

    places = get_coordinates(accommodation_data['Location'])

    places_df = pd.DataFrame(places, columns=["Place", "Latitude", "Longitude"])

    # Create a mask for valid coordinates (i.e., filter out "Location not found")
    valid_mask = places_df["Latitude"] != "Location not found"

    accommodation_data = accommodation_data.reset_index(drop=True)

    # Ensure valid_mask has the same index as the original accommodation_data
    valid_mask = valid_mask & accommodation_data.index.isin(places_df.index)

    # Apply the mask to filter both places_df and accommodation_data
    places_filtered = places_df[valid_mask].reset_index(drop=True)
    accommodation_data = accommodation_data[valid_mask].reset_index(drop=True)

    geolocator = Nominatim(user_agent="Geopy Library")
    current_venue = geolocator.geocode(current_venue)
    base_coords = (current_venue.latitude, current_venue.longitude) if current_venue else "Location not found"

    base_latitude = base_coords[0]
    base_longitude = base_coords[1] 


    distances = []
    for i in range(len(places_filtered)):
        # Access latitude and longitude using pandas iloc to reference rows
        latitude = places_filtered.iloc[i]["Latitude"]
        longitude = places_filtered.iloc[i]["Longitude"]
        
        # Calculate distance between base coordinates and other places
        dist = geodesic((base_latitude, base_longitude), (latitude, longitude)).kilometers
        distances.append(dist)

    # Add distances to accommodation_data
    accommodation_data["Distance"] = distances

    # Select the Standard, Superior, and Luxurious bundles
    Standard_bundle, Superior_bundle, Luxurious_bundle = select_accommodation_bundles(accommodation_data)

    # Append the accommodation names and prices to the corresponding lists
    accommodation_standard.append({
        'Name': Standard_bundle['Name'],
        'Price': Standard_bundle['Price'],
        'Rating': Standard_bundle['Rating']
    })
    accommodation_superior.append({
        'Name': Superior_bundle['Name'],
        'Price': Superior_bundle['Price'],
        'Rating': Superior_bundle['Rating']
    })
    accommodation_luxurious.append({
        'Name': Luxurious_bundle['Name'],
        'Price': Luxurious_bundle['Price'],
        'Rating': Luxurious_bundle['Rating']
    })

# Add the accommodation bundles to the original DataFrame
df['Standard Accommodation'] = [x['Name'] for x in accommodation_standard]
df['Superior Accommodation'] = [x['Name'] for x in accommodation_superior]
df['Luxurious Accommodation'] = [x['Name'] for x in accommodation_luxurious]

df['Standard Price'] = [x['Price'] for x in accommodation_standard]
df['Superior Price'] = [x['Price'] for x in accommodation_superior]
df['Luxurious Price'] = [x['Price'] for x in accommodation_luxurious]

df['Standard Rating'] = [x['Rating'] for x in accommodation_standard]
df['Superior Rating'] = [x['Rating'] for x in accommodation_superior]
df['Luxurious Rating'] = [x['Rating'] for x in accommodation_luxurious]

# Save the updated DataFrame with accommodation data

# Close the WebDriver
driver.quit()


Price for HOtello Lehel: $866
Price for Boutique Hotel Atrium München: $1,295
Price for Holiday Inn Express Munich - Olympiapark: $724
Price for Hotel Lifestyle: $714
Price for Haus im Tal: $1,213
Price for Achat Hotel München Süd: $535
Price for Relexa Hotel München: $1,256
Price for Four Points by Sheraton Munich Arabellapark: $778
Price for Sofitel Munich Bayerpost: $1,855
Price for Moxy Munich Ostbahnhof: $805
Price for Aloft Munich: $1,201
Price for NH München Messe: $648
Price for The Rilano Hotel München: $613
Price for Advastay By King's: $981
Price for Miano Hotel & Bar: $525
Price for Munich Marriott Hotel: $1,087
Price for Munich Marriott Hotel City West: $1,040
Price for Vienna House Easy München: $628
Price for Ruby Lilly Hotel Munich: $962
Price for New Orly: $853
Price for Hotel Perlach Allee: $563
Price for Innspire Hotel: $753
Price for Moma1890 Boutique Hotel: $1,023
Price for Parkhotel Rothof: $925
Price for Best Western Hotel Arabellapark München: $1,211
Price for N

KeyError: 'Location'

In [19]:
# Convert 'Min Price' to numeric, treating 'n/a' and other non-numeric entries as 0
df['Min Price'] = pd.to_numeric(df['Min Price'], errors='coerce').fillna(0)

df['Standard Price'] = df['Standard Price'].replace({'\$': '', ',': ''}, regex=True)
df['Standard Price'] = pd.to_numeric(df['Standard Price'], errors='coerce')

df['Superior Price'] = df['Superior Price'].replace({'\$': '', ',': ''}, regex=True)
df['Superior Price'] = pd.to_numeric(df['Superior Price'], errors='coerce')

df['Luxurious Price'] = df['Luxurious Price'].replace({'\$': '', ',': ''}, regex=True)
df['Luxurious Price'] = pd.to_numeric(df['Luxurious Price'], errors='coerce')

# Now perform the summation
df["Standard Price Total"] = pd.to_numeric(df["Standard Price"], errors='coerce').fillna(0) + df["Min Price"]
df["Superior Price Total"] = pd.to_numeric(df["Superior Price"], errors='coerce').fillna(0) + df["Min Price"]
df["Luxurious Price Total"] = pd.to_numeric(df["Luxurious Price"], errors='coerce').fillna(0) + df["Min Price"]

df["Luxurious Price Total"]

  df['Standard Price'] = df['Standard Price'].replace({'\$': '', ',': ''}, regex=True)
  df['Superior Price'] = df['Superior Price'].replace({'\$': '', ',': ''}, regex=True)
  df['Luxurious Price'] = df['Luxurious Price'].replace({'\$': '', ',': ''}, regex=True)


0    570.0
1    692.0
Name: Luxurious Price Total, dtype: float64

## Print the final scraped dataset

In [21]:
print(df)

Index(['code', 'date', 'event', 'city', 'venue', 'accom.Code', 'flight.Code',
       'Min Price'],
      dtype='object')
