In [None]:
pip install -r requirements.txt

In [5]:
# General libraries
import pandas as pd
import numpy as np

In [None]:
# Importing home-made libraries
from app.scraping import scrape_accommodation_data
from app.scraping import scrape_min_price
from app.scraping import start_webdriver
from app.scraping import close_webdriver

from app.geos import get_coordinates

from app.bundling import select_accommodation_bundles

from app.input import read_excel_to_df
from app.input import check_code
from app.input import read_code

In [4]:
# Libraries for geo locations
from geopy.geocoders import Nominatim # Importing the geopy library and Nominatim class
from geopy.exc import GeocoderTimedOut
from geopy.distance import Distance
from geopy.distance import geodesic

In [5]:
# Libraries for web scraping
from time import sleep
from selenium import webdriver
from bs4 import BeautifulSoup
import os
from itertools import product
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import StaleElementReferenceException
from datetime import timedelta
from io import BytesIO
import requests
from datetime import datetime

In [8]:
# Libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Importing internal match data
# The URL contains a commit hash, so the notebook always reads the same snapshot of the workbook.
url = "https://raw.githubusercontent.com/dvandasova/JEM207_project/b5414c0b6fd52309880b8478913089768dbe320e/02_Datasets/internal-data.xlsx"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here with a clear HTTP error instead of handing an error page to the Excel parser.
response.raise_for_status()
data_frame = pd.read_excel(BytesIO(response.content))
data_frame

In [None]:
# Path to the text file used to select the match.
# NOTE: the value below is a placeholder - replace it with a real path before running this cell.
file_path = 'yourpath_to_the_input_txt_file'  # path to input file
# read_code (from app.input) receives the path and the match table loaded earlier;
# presumably it checks the code in the file against data_frame - confirm in app.input.
primary_code = read_code(file_path, data_frame)  # Use app.input.read_code
print(primary_code)

In [None]:
# From data_frame extract the row(s) whose first column equals the primary code.
# .copy() makes df an independent frame: later cells add columns to it
# (e.g. df['Min Price'] = ...), and doing that on a bare .loc slice of data_frame
# triggers pandas' SettingWithCopyWarning.
df = data_frame.loc[data_frame.iloc[:, 0] == primary_code].copy()
df

## Individual match scraping
### Flight and Accommodation
Individual scraping fetches the latest prices for the single selected match. Unlike bulk scraping, which takes a long time, it finishes much faster.

Please replace the placeholder file path (`file_path`) with your own path to the text file used for selecting the match.

In [None]:
# Collect the cheapest flight price found for every selected match
min_prices = []
for _, match in df.iterrows():
    outbound = match['date']  # must support strftime and timedelta arithmetic
    inbound = outbound + timedelta(days=2)  # the return leg is two days after departure

    # scrape_min_price receives the 'flight.Code' value and both dates as YYYY-MM-DD strings
    cheapest = scrape_min_price(
        match['flight.Code'],
        outbound.strftime('%Y-%m-%d'),
        inbound.strftime('%Y-%m-%d'),
    )

    min_prices.append(cheapest)
    print(cheapest)

# Store the scraped prices next to the match they belong to
df['Min Price'] = min_prices

# Show the enriched DataFrame
df


In [None]:
# Scrape accommodation offers for every selected match, measure how far each offer is
# from the match venue, and keep one Standard / Superior / Luxurious pick per match.

# Sentinel string that marks a missing coordinate in the Latitude/Longitude columns
NOT_FOUND = "Location not found"
# Degrees added to the venue coordinates when an accommodation itself could not be geocoded
FALLBACK_OFFSET_DEG = 0.1

# Define lists to hold the accommodation data for each row
accommodation_standard = []
accommodation_superior = []
accommodation_luxurious = []

# One geocoder instance is enough for all rows
geolocator = Nominatim(user_agent="Geopy Library")

# NOTE(review): this driver is never passed to scrape_accommodation_data in this cell;
# confirm whether app.scraping needs a browser opened here at all.
driver = webdriver.Chrome()
driver.maximize_window()

try:
    # Loop through each row in the DataFrame to get location and dates
    for index, row in df.iterrows():

        location = row['accom.Code']  # Assuming 'accom.Code' contains the location identifier
        departure_date = row['date'].strftime('%Y-%m-%d')  # Assuming 'date' is a date column
        return_date = (row['date'] + timedelta(days=2)).strftime('%Y-%m-%d')

        city = row['city']  # Assuming 'city' contains the city name
        current_venue = row["venue"]

        # Scrape accommodation data and append the city to each location string
        accommodation_data = scrape_accommodation_data(location, departure_date, return_date)
        accommodation_data["Location"] = accommodation_data["Location"] + ", " + city

        # Filter out locations with "Nearby - "
        accommodation_data = accommodation_data[~accommodation_data['Location'].str.contains("Nearby - ", na=False)]

        # Retrieve the coordinates for each accommodation
        places = get_coordinates(accommodation_data['Location'])
        places_df = pd.DataFrame(places, columns=["Place", "Latitude", "Longitude"])

        # Geocode the current venue; geocode() returns None when nothing is found
        current_venue_location = geolocator.geocode(current_venue)
        if current_venue_location:
            base_latitude = current_venue_location.latitude
            base_longitude = current_venue_location.longitude
        else:
            base_latitude = NOT_FOUND
            base_longitude = NOT_FOUND

        # Accommodations without coordinates fall back to the venue position plus a small
        # offset. This is only possible when the venue itself was geocoded: adding the
        # offset to the NOT_FOUND string would raise a TypeError, so in that case the
        # sentinel is left in place and the distance below becomes infinite.
        if base_latitude != NOT_FOUND:
            # `place` (not `row`) so the outer loop variable is not overwritten
            for i, place in places_df.iterrows():
                if place["Latitude"] == NOT_FOUND:
                    places_df.at[i, "Latitude"] = base_latitude + FALLBACK_OFFSET_DEG
                    places_df.at[i, "Longitude"] = base_longitude + FALLBACK_OFFSET_DEG

        # Re-index so the rows line up positionally with places_df before copying coordinates over
        accommodation_data = accommodation_data.reset_index(drop=True)
        accommodation_data["Latitude"] = places_df["Latitude"]
        accommodation_data["Longitude"] = places_df["Longitude"]

        # Calculate distances (km) between each accommodation and the venue
        distances = []
        for latitude, longitude in zip(places_df["Latitude"], places_df["Longitude"]):
            # Only calculate distance if both the venue and the accommodation have coordinates
            if base_latitude != NOT_FOUND and latitude != NOT_FOUND:
                dist = geodesic((base_latitude, base_longitude), (latitude, longitude)).kilometers
            else:
                dist = float('inf')  # Assign an infinite distance if a location is invalid
            distances.append(dist)

        # Add distances to accommodation_data
        accommodation_data["Distance"] = distances

        # Print the accommodation data (for debugging purposes)
        print(accommodation_data)

        # Select the Standard, Superior, and Luxurious bundles
        Standard_bundle, Superior_bundle, Luxurious_bundle = select_accommodation_bundles(accommodation_data)

        # Append the accommodation names, prices and ratings to the corresponding lists
        accommodation_standard.append({
            'Name': Standard_bundle['Name'],
            'Price': Standard_bundle['Price'],
            'Rating': Standard_bundle['Rating']
        })
        accommodation_superior.append({
            'Name': Superior_bundle['Name'],
            'Price': Superior_bundle['Price'],
            'Rating': Superior_bundle['Rating']
        })
        accommodation_luxurious.append({
            'Name': Luxurious_bundle['Name'],
            'Price': Luxurious_bundle['Price'],
            'Rating': Luxurious_bundle['Rating']
        })
finally:
    # Close the WebDriver even when scraping or geocoding fails part-way through
    driver.quit()

# Add the accommodation bundles to the original DataFrame
df['Standard Accommodation'] = [x['Name'] for x in accommodation_standard]
df['Superior Accommodation'] = [x['Name'] for x in accommodation_superior]
df['Luxurious Accommodation'] = [x['Name'] for x in accommodation_luxurious]

df['Standard Price'] = [x['Price'] for x in accommodation_standard]
df['Superior Price'] = [x['Price'] for x in accommodation_superior]
df['Luxurious Price'] = [x['Price'] for x in accommodation_luxurious]

df['Standard Rating'] = [x['Rating'] for x in accommodation_standard]
df['Superior Rating'] = [x['Rating'] for x in accommodation_superior]
df['Luxurious Rating'] = [x['Rating'] for x in accommodation_luxurious]

df

In [None]:
# Display the selected match again, now with the flight and accommodation columns added to df
df

## SECOND PART: Visuals
### Flight and Accommodation Pricing Trends 
This graph shows how far in advance it is best to book a stay to see a match. It plots the average price for each match day in our database,
for each of the three accommodation tiers: standard, superior, and luxurious.

In [19]:
from app.visuals import plot_standard
from app.visuals import plot_superior
from app.visuals import plot_luxurious
from app.visuals import plot_accommodation_data


In [None]:
# Importing the scraped data
# The URL contains a commit hash, so the notebook always reads the same snapshot of the workbook.
url = "https://raw.githubusercontent.com/dvandasova/JEM207_project/956a3ddf3bd8e67bc692aff8e4affb00827bf5c4/full-scrape.xlsx"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here with a clear HTTP error instead of handing an error page to the Excel parser.
response.raise_for_status()
# NOTE: from here on df holds the full bulk scrape, no longer the single selected match.
df = pd.read_excel(BytesIO(response.content))
df

In [None]:
# Refer to the full scrape under a more descriptive name (same object as df, not a copy)
accommodation_data = df

# Keep only the match date and the three total-price columns
accommodation_data_1 = accommodation_data.loc[
    :,
    ['date', 'Standard Price Total', 'Superior Price Total', 'Luxurious Price Total'],
]
accommodation_data_1

In [22]:
# Average every price column over all rows that share the same match date
accommodation_data_1 = (
    accommodation_data_1
    .groupby(by='date')
    .mean()
)

In [None]:
# Treat each date as a series and each accommodation type as a category.
# Intended result: a bar chart with three bars per date (one per accommodation type) whose
# height is the mean price. The drawing itself is done by plot_accommodation_data in app.visuals.
plot_accommodation_data(accommodation_data_1)

In [24]:
# Keep the fixture name ('event') together with the three total-price columns
accommodation_data_2 = accommodation_data.loc[
    :,
    ['event', 'Standard Price Total', 'Superior Price Total', 'Luxurious Price Total'],
]

In [25]:
# Split every 'event' string on ' vs. ' into its 'Home' and 'Away' team and pair the teams
# with the three prices taken from that SAME row.
# Looking the prices up again by event name and taking the first match would give every
# repeated fixture the prices of its first occurrence, so the mean below would not be a
# real average (and the lookup would cost a full scan of the frame per row).
price_columns = ['Standard Price Total', 'Superior Price Total', 'Luxurious Price Total']
event_list = []
for event, *prices in accommodation_data_2[['event', *price_columns]].itertuples(index=False, name=None):
    # [home, away] followed by the three prices of this row
    event_list.append(event.split(' vs. ') + prices)

# Create a DataFrame from the list
event_df = pd.DataFrame(event_list, columns=['Home', 'Away', *price_columns])
# Mean price per (Home, Away) pair; unstack moves 'Away' into the columns so each price
# type becomes a Home x Away matrix
event_df = event_df.groupby(['Home', 'Away']).mean().unstack()

In [None]:
# The three helpers below come from app.visuals and each receive the Home x Away price matrix.
# Intended result: a heatmap with the Home team on the y-axis, the Away team on the x-axis,
# and the mean Standard accommodation price as the cell value.
plot_standard(event_df)

# Same layout, with the mean Superior accommodation price as the cell value.
# (One call per tier; the drawing itself lives in app.visuals.)
plot_superior(event_df)

# Same layout, with the mean Luxurious accommodation price as the cell value.
# (One call per tier; the drawing itself lives in app.visuals.)
plot_luxurious(event_df)