## Import libraries:

In [1]:
from datetime import datetime, timedelta
from pathlib import Path
import os
import pandas as pd
import time
import requests
import json
import random
from selenium import webdriver
from bs4 import BeautifulSoup as bs
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

## Define functions:

In [2]:
def IsFilenameAlreadyExisting(filename):
    data_files_existing = os.listdir(directory_actual) + os.listdir(directory_plan)
    if filename in data_files_existing:
        return True
    return False
        
def pauseRandomlyLong():
    delay = round(random.randint(31, 53) / float(17),2)
    time.sleep(delay)

def pauseRandomlyShort():
    delay = round(random.randint(17, 31) / float(17),2)
    time.sleep(delay)
    
def GetIcelandairURL(dest, date_out_str):
    url= f"""https://www.icelandair.com/api/instantSearch/v1/bestPrice/byDay/return/multipleReturnsPerDeparture?
    departure=KEF
    &arrival={dest}
    &locale=is-IS
    &period=0
    &tripDuration=1
    &tripDurationFlexibility=21
    &fromDate={date_out_str}
    &fallbackToRouteCurrency=true
    """
    return url.replace("\n", "").replace(" ", "")

def SampleDataFromPlay(dataFromRequest, samplingList):
    data_home = dataFromRequest.json()['data']['lowestPrices']['homebound']
    data_out = dataFromRequest.json()['data']['lowestPrices']['outbound']

    for out in data_out:
        for home in data_home:
            if home['date'] <= out['date']: continue

            cost = int(home['price']) + int(out['price'])

            samplingList.append(
                {'Airline': 'Play',
                 'C_Date': today_str,
                 'DateOut': out['date'],
                 'DateBack': home['date'],
                 'Destination': dest,
                 'Price': cost})
            
    return samplingList

def SampleDataFromIcelandair(samplingList):
    dates_back = sorted(response.json()['inbound'].keys())

    for date_back_str in dates_back:
        samplingList.append(
        {'Airline': 'Icelandair',
         'C_Date': today_str,
         'DateOut': date_out_str,
         'DateBack': date_back_str,
         'Destination': dest,
         'Price': response.json()['inbound'][date_back_str]['totalFareAmount']})
    
    return samplingList

def ChooseDestinations(dest_from, dest_to):
    btn_dest_dep.click()
    time.sleep(2)
    input_dest_dep = browser.find_element_by_id("airportsAutocomplete")
    input_dest_dep.send_keys(dest_from + Keys.DOWN + Keys.ENTER)
    time.sleep(2)
    input_dest_dep = browser.find_element_by_id("airportsAutocomplete")
    input_dest_dep.send_keys(dest_to + Keys.DOWN + Keys.ENTER)
    time.sleep(2)
    btn_date_dep.click()
    time.sleep(2)

def GetYearMonth():
    month_ids = {
        'janúar': '01',
        'febrúar': '02',
        'mars': '03',
        'apríl': '04',
        'maí': '05',
        'júní': '06',
        'júlí': '07',
        'ágúst': '08',
        'september': '09',
        'október': '10',
        'nóvember': '11',
        'desember': '12'
    }
#     html = soup.find("div",{"class":"rdp-month", 'class':'rdp-caption_first'})
    year_month_arr = soup.find('div', {'class': 'rdp-caption'}).text.split(" ")[::-1]
    year_month_arr[1] = month_ids[year_month_arr[1]]
    return year_month_arr


def GetSoup():
    current_month_xpath = "//*[@id=\"__next\"]/div[2]/div[2]/header/div[2]/div[1]/div/div[2]/div/div[1]/form/div/div[4]/div/div/div/div/div[2]/div/div[1]/div[2]/div/div/div[1]"
    current_month = browser.find_element(By.XPATH, current_month_xpath)
    html = current_month.get_attribute('innerHTML')
    soup = bs(html, 'html.parser')
    return soup

def GetCalendarCells(soup):
    calendar_cells = soup.find_all('div', {'class': 'css-e4o1c'})
    return calendar_cells

def GetDate(calendar_cell):
    date_arr = GetYearMonth()
    day_raw = calendar_cell.find('span').text
    day = day_raw if len(day_raw) > 1 else '0' + day_raw
    date_arr.append(day)
    return '-'.join(date_arr)

def GetTicketPrice(calendar_cell):
    ticket_price = int(calendar_cell.find('div', {'class': 'css-1mowg62'}).text.replace(',', ''))
    return ticket_price

def IsTripTooLong(date_out, date_home, max_trip_length = 21):
    date_out_array = [int(x) for x in date_out.split('-')]
    date_home_array = [int(x) for x in date_home.split('-')]
    
    day_count_out = sum([a * b for a,b in zip(date_out_array,  [365, 30, 1])])
    day_count_home = sum([a * b for a,b in zip(date_home_array,  [365, 30, 1])])
    
    trip_length = day_count_home - day_count_out
    
    return trip_length > max_trip_length

def GoToNextMonth():
    btn_next = browser.find_element(By.XPATH, "//*[@id=\"__next\"]/div[2]/div[2]/header/div[2]/div[1]/div/div[2]/div/div[1]/form/div/div[4]/div/div/div/div/div[2]/div/div[1]/div[2]/div/div/div[2]/div/div[2]/button")
    btn_next.click()
    time.sleep(2)
    
def GetTicketPrices():
    prices = {}
    
    for i in range(0, number_of_months):
        soup = GetSoup()
        calendar_cells = GetCalendarCells(soup)
        print('-'.join(GetYearMonth()))
        time.sleep(1)
        
        for cell in calendar_cells:
            prices[GetDate(cell)] = GetTicketPrice(cell)
            
        time.sleep(2)
        GoToNextMonth()
            
    return prices

def GetTripPrices(prices):
    for out_date in data_out:
        for home_date in data_home:
            if home_date > out_date:
                if IsTripTooLong(out_date, home_date): break
                trip_cost = data_out[out_date] + data_home[home_date]
                prices.append(
                    {'Airline': 'Play',
                     'C_Date': today_str,
                     'DateOut': out_date,
                     'DateBack': home_date,
                     'Destination': dest,
                     'Price': trip_cost
                    })
    return prices

## Initialize directories:

In [3]:
dir_play = os.path.join('PlaneTicketData', 'Play')
dir_iceair = os.path.join('PlaneTicketData', 'Icelandair')

Path('PlaneTicketData').mkdir(parents=True, exist_ok=True)
Path(dir_play).mkdir(parents=True, exist_ok=True)
Path(dir_iceair).mkdir(parents=True, exist_ok=True)

## Steal prices from Play:

In [5]:
today = datetime.now()
today_str = today.strftime('%Y-%m-%d')

number_of_months = 4
dests = ['ALC', 'AMS', 'BER',
         'CDG', 'CPH', 'STN', 'TFS']

dest_counter = 1

trip_prices = []

target_filename = f"PlaneTicketPrice_Play_{today_str}.csv"
target_filename_path = os.path.join('PlaneTicketData', 'Play') 

if target_filename not in os.listdir(target_filename_path):
    
#     startTime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}: Data sampling started.")
    
    browser = webdriver.Chrome(os.path.join(os.getcwd(), "chromedriver.exe"))
    browser.get(f"https://www.flyplay.com/is/")
    browser.implicitly_wait(10)

    btn_dest_dep = browser.find_element_by_id("originAirportButton")
    btn_date_dep = browser.find_element_by_id("departureDateButton")

    for dest in dests:
        data_out = {}
        data_home = {}
    
        print(f"{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}: Sampling data for {dest}.")
        ChooseDestinations('kef', dest)

        for i in range(0, number_of_months):
            soup = GetSoup()
            calendar_cells = GetCalendarCells(soup)

            for cell in calendar_cells:
                data_out[GetDate(cell)] = GetTicketPrice(cell)

            time.sleep(2)

            GoToNextMonth()

        time.sleep(2)

        ChooseDestinations(dest, 'kef')

        for i in range(0, number_of_months):
            soup = GetSoup()
            calendar_cells = GetCalendarCells(soup)

            for cell in calendar_cells:
                data_home[GetDate(cell)] = GetTicketPrice(cell)

            time.sleep(2)

            GoToNextMonth()
        
        print(f"{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}: Data sampling for {dest} completed ({dest_counter} / {len(dests)})")
        time.sleep(2)

        trip_prices = GetTripPrices(trip_prices)
    
#     endTime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}: Data sampling finished.")
    
    if trip_prices:
        df = pd.DataFrame(trip_prices)
        df.to_csv(os.path.join(target_filename_path, target_filename), index=False)
        print(f'Data has been saved to a file: {target_filename}')
    else:
        print("No data was added to the database.")

    browser.close()

else:
    print(f'File already exists: {target_filename}')


Data sampling started at 2021-10-27 02:32:24.
Data sampling finished at 2021-10-27 02:39:44.
Data has been saved to a file: PlaneTicketPrice_Play_2021-10-27.csv


## Steal prices from Icelandair:

In [15]:
today = datetime.now() + timedelta(days=-1)
today_str = today.strftime('%Y-%m-%d')

number_of_days = 31
random.seed(today)

dest_counter = 1

dests = ['AMS', 'BER', 'BOS', 'ORD', 'DEN',
         'DUB', 'FRA', 'CPH', 'LHR', 'MAN', 
         'MUC', 'NYC', 'MCO', 'OSL', 'CDG', 
         'SEA', 'ARN', 'TFS', 'YTO', 'IAD',]

target_filename = f"PlaneTicketPrice_Icelandair_{today_str}.csv"
target_filename_path = os.path.join('PlaneTicketData', 'Icelandair') 

if target_filename not in os.listdir(target_filename_path):

    startTime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"Data sampling started at {startTime}.")
    
    trip_prices = []

    for dest in dests:
        
        for i in range(1, number_of_days):
            date_out = today + timedelta(days=i)
            date_out_str = date_out.strftime('%Y-%m-%d')

            url = GetIcelandairURL(dest, date_out_str)
            
            response = requests.get(url)
            pauseRandomlyShort()
            
            if (response.status_code == 200):
                
                print(f"{date_out_str}: Found flights to {dest} ({i}/{number_of_days - 1})")
                
                trip_prices = SampleDataFromIcelandair(trip_prices)
            else:
                print(f"No flights available to {dest} ({i}/{number_of_days - 1})")
        
        dest_counter += 1
        print(f"Data sampling for {dest} completed ({dest_counter}/{len(dests)})")
        pauseRandomlyLong()
    
    endTime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"Data sampling finished at {endTime}.")
    if trip_prices:
        df = pd.DataFrame(trip_prices)
        df.to_csv(os.path.join(target_filename_path, target_filename), index=False)
        print(f'Data has been saved to a file: {target_filename}')
    else:
        print("No data was sampled in this run.")

else:
    print('File already exists.')

Data sampling started at 2021-10-27 02:42:47.
2021-10-27: Found flights to AMS (1/30)
2021-10-28: Found flights to AMS (2/30)
2021-10-29: Found flights to AMS (3/30)
2021-10-30: Found flights to AMS (4/30)
2021-10-31: Found flights to AMS (5/30)
2021-11-01: Found flights to AMS (6/30)
2021-11-02: Found flights to AMS (7/30)
2021-11-03: Found flights to AMS (8/30)
2021-11-04: Found flights to AMS (9/30)
2021-11-05: Found flights to AMS (10/30)
2021-11-06: Found flights to AMS (11/30)
2021-11-07: Found flights to AMS (12/30)
2021-11-08: Found flights to AMS (13/30)
2021-11-09: Found flights to AMS (14/30)
2021-11-10: Found flights to AMS (15/30)
2021-11-11: Found flights to AMS (16/30)
2021-11-12: Found flights to AMS (17/30)
2021-11-13: Found flights to AMS (18/30)
2021-11-14: Found flights to AMS (19/30)
2021-11-15: Found flights to AMS (20/30)
2021-11-16: Found flights to AMS (21/30)
2021-11-17: Found flights to AMS (22/30)
2021-11-18: Found flights to AMS (23/30)
2021-11-19: Found fl

2021-11-13: Found flights to FRA (18/30)
2021-11-14: Found flights to FRA (19/30)
2021-11-15: Found flights to FRA (20/30)
2021-11-16: Found flights to FRA (21/30)
2021-11-17: Found flights to FRA (22/30)
2021-11-18: Found flights to FRA (23/30)
2021-11-19: Found flights to FRA (24/30)
2021-11-20: Found flights to FRA (25/30)
2021-11-21: Found flights to FRA (26/30)
2021-11-22: Found flights to FRA (27/30)
2021-11-23: Found flights to FRA (28/30)
2021-11-24: Found flights to FRA (29/30)
2021-11-25: Found flights to FRA (30/30)
Data sampling for FRA completed (8/20)
2021-10-27: Found flights to CPH (1/30)
2021-10-28: Found flights to CPH (2/30)
2021-10-29: Found flights to CPH (3/30)
2021-10-30: Found flights to CPH (4/30)
2021-10-31: Found flights to CPH (5/30)
2021-11-01: Found flights to CPH (6/30)
2021-11-02: Found flights to CPH (7/30)
2021-11-03: Found flights to CPH (8/30)
2021-11-04: Found flights to CPH (9/30)
2021-11-05: Found flights to CPH (10/30)
2021-11-06: Found flights t

2021-11-01: Found flights to OSL (6/30)
2021-11-02: Found flights to OSL (7/30)
2021-11-03: Found flights to OSL (8/30)
2021-11-04: Found flights to OSL (9/30)
2021-11-05: Found flights to OSL (10/30)
2021-11-06: Found flights to OSL (11/30)
2021-11-07: Found flights to OSL (12/30)
2021-11-08: Found flights to OSL (13/30)
2021-11-09: Found flights to OSL (14/30)
2021-11-10: Found flights to OSL (15/30)
2021-11-11: Found flights to OSL (16/30)
2021-11-12: Found flights to OSL (17/30)
2021-11-13: Found flights to OSL (18/30)
2021-11-14: Found flights to OSL (19/30)
2021-11-15: Found flights to OSL (20/30)
2021-11-16: Found flights to OSL (21/30)
2021-11-17: Found flights to OSL (22/30)
2021-11-18: Found flights to OSL (23/30)
2021-11-19: Found flights to OSL (24/30)
2021-11-20: Found flights to OSL (25/30)
2021-11-21: Found flights to OSL (26/30)
2021-11-22: Found flights to OSL (27/30)
2021-11-23: Found flights to OSL (28/30)
2021-11-24: Found flights to OSL (29/30)
2021-11-25: Found fl

2021-11-21: Found flights to IAD (26/30)
No flights available to IAD (27/30)
No flights available to IAD (28/30)
2021-11-24: Found flights to IAD (29/30)
No flights available to IAD (30/30)
Data sampling for IAD completed (21/20)
Data sampling finished at 2021-10-27 03:03:42.
Data has been saved to a file: PlaneTicketPrice_Icelandair_2021-10-26.csv
