This notebook uses Selenium chrome driver to scrape daily flight schedule data


### 1. Selenium Set Up

In [1]:
# selenium
from selenium import webdriver 
from selenium.webdriver.common.by import By 
from selenium.webdriver.common.keys import Keys

# beutiful soup
import requests
from bs4 import BeautifulSoup

import pandas as pd
import json, time

  from pandas.core.computation.check import NUMEXPR_INSTALLED


First we will set up the selenium interactive scraper. Change the `driver_path` in the code chunk below and run the cell to see if the web driver load.

In [2]:
# initiate driver
driver_path = '/Users/xxxx/chromedriver.exe'
driver = webdriver.Chrome(driver_path)
time.sleep(5)
driver.quit()

  driver = webdriver.Chrome(driver_path)


In [3]:
def initate_driver(url,interactive = False):
    # set chrome driver option
    if not interactive:
        options = webdriver.ChromeOptions()
        options.add_argument("--blink-settings=imagesEnabled=false")

    # initiate driver
    global driver_path
    global driver
    driver = webdriver.Chrome(driver_path)

    driver.get(url)

In [68]:
pd.Timestamp.today() - pd.Timedelta(days = 1)

Timestamp('2023-12-22 13:55:17.983835')

In [70]:
timestamp = pd.Timestamp.today() - pd.Timedelta(days = 1)
day = timestamp.day
month = timestamp.month_name()[0:3].upper()

## 2. London Heathrow Airport

We will first write some helper function.

In [55]:
earlier_flight_button = '//*[@id="flight-list-app"]/div/div[2]/div[2]//button[1]'

def scrape_heathrow_page():
    """
    parse the departure time, flight code and the city to three list
    """
    if 'times' not in globals():
        global times
        times = []

    if 'codes' not in globals():
        global codes
        codes = []
    
    if 'citys' not in globals():
        global citys
        citys = []
    
    if 'urls' not in globals():
        global urls
        urls = []

    # loop over all list flight schedule item
    for result in driver.find_elements(By.XPATH,'//*[@class="airline-listing-table"]/a[contains(@class,"airline-listing-line-item")]'):
        ftime = result.find_element(By.XPATH,"./div").text
        code = result.find_element(By.XPATH,"./div[2]/div[1]/div[1]").text
        city = result.find_element(By.XPATH,"./div[2]/div[1]/div[2]").text
        url  = result.get_attribute("href")
        times.append(ftime)
        codes.append(code)
        citys.append(city)
        urls.append(url)
        print(f"Flight {code} departing for {city} at {ftime}: {url}")

def scrape_flight_page(dep):
    """scape the individual flight page"""
    # identify which block to scrape
    div_id = 0 if dep else 1
    # point to the flight detail card
    res  = driver.find_elements(By.XPATH, "//div[contains(@class,'show-flight-details')]")
    card = res[div_id]
    # departure from LHR
    try:
        time_act = card.find_element(By.XPATH, ".//div[contains(@aria-label,'actual time')]").text
        return time_act
    except:
        print("An error occured when parsing the actual time.")
        return None

def go_to_top():            
    while True:
        try:
            driver.find_element(By.XPATH,earlier_flight_button).send_keys(Keys.RETURN)
            time.sleep(0.5) 
        except:
            print("Loaded to the top of the list")
            break
    

### 2.1 Departures

First, we will load the page and get to the top of the daily flight schedule table.

In [48]:
initate_driver("https://www.heathrow.com/departures")
time.sleep(5) 

# confirm the page is loadded properly
input("Enter when the page is loaded")

# get to top of the day
go_to_top()


Loaded to the top of the list


We will now start scraping the data

In [91]:
times = []
codes = []
citys = []
urls = []

# scrape the first page
scrape_heathrow_page()

# loop through all schedule of the date
later_flight_button =   '//*[@id="flight-list-app"]/div/div[2]/div[2]/div/div[3]/button'
while True:
	try: 
		# load later flights
		driver.find_element(By.XPATH,later_flight_button).send_keys(Keys.RETURN)
		# add the data the the list
		scrape_heathrow_page()

	except:
		print("Reached the end of the list")
		break

Flight TP1363 departing for Lisbon at 06:00: https://www.heathrow.com/departures/terminal-2/flight-details/TP1363/19-12-2023
Flight OS458 departing for Vienna at 06:00: https://www.heathrow.com/departures/terminal-2/flight-details/OS458/19-12-2023
Flight LX345 departing for Zurich at 06:00: https://www.heathrow.com/departures/terminal-2/flight-details/LX345/19-12-2023
Flight BA472 departing for Barcelona at 06:05: https://www.heathrow.com/departures/terminal-5/flight-details/BA472/19-12-2023
Flight BA1414 departing for Belfast at 06:15: https://www.heathrow.com/departures/terminal-5/flight-details/BA1414/19-12-2023
Flight AF1381 departing for Paris at 06:20: https://www.heathrow.com/departures/terminal-4/flight-details/AF1381/19-12-2023
Flight BA456 departing for Madrid at 06:20: https://www.heathrow.com/departures/terminal-5/flight-details/BA456/19-12-2023
Flight BA1432 departing for Edinburgh at 06:25: https://www.heathrow.com/departures/terminal-5/flight-details/BA1432/19-12-2023
Fl

In [92]:
departure = pd.DataFrame({"time_sch":times,'code':codes,
                            'dest':citys,'url':urls})
departure = departure.set_index("code")

print(departure.shape)
departure.head()

(637, 3)


Unnamed: 0_level_0,time_sch,dest,url
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TP1363,06:00,Lisbon,https://www.heathrow.com/departures/terminal-2...
OS458,06:00,Vienna,https://www.heathrow.com/departures/terminal-2...
LX345,06:00,Zurich,https://www.heathrow.com/departures/terminal-2...
BA472,06:05,Barcelona,https://www.heathrow.com/departures/terminal-5...
BA1414,06:15,Belfast,https://www.heathrow.com/departures/terminal-5...


#### 2.1.2 Scrape individual page

In [362]:
options = webdriver.ChromeOptions()
options.add_argument("--blink-settings=imagesEnabled=false")
driver = webdriver.Chrome(driver_path,options=options)
driver.get("https://www.heathrow.com/departures")

  driver = webdriver.Chrome(driver_path,options=options)


In [360]:
departure[departure['time_act'].isnull()]

Unnamed: 0_level_0,time_sch,dest,url,time_act
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LH2477,16:40,Munich,https://www.heathrow.com/departures/terminal-2...,
BA488,16:45,Barcelona,https://www.heathrow.com/departures/terminal-5...,
BA1488,16:45,Glasgow,https://www.heathrow.com/departures/terminal-5...,
BA960,16:45,Munich,https://www.heathrow.com/departures/terminal-5...,
BA203,16:50,Boston,https://www.heathrow.com/departures/terminal-5...,
...,...,...,...,...
BA059,22:25,Cape Town,https://www.heathrow.com/departures/terminal-5...,
VS449,22:25,Johannesburg,https://www.heathrow.com/departures/terminal-3...,
BA083,22:30,Abuja,https://www.heathrow.com/departures/terminal-5...,
MS780,22:30,Cairo,https://www.heathrow.com/departures/terminal-2...,


In [368]:
# iterate through rows
counter = 1
# set up headless driver
for key, val in departure[departure['time_act'].isnull()].iterrows():
    driver.get(val['url'])
    try:
        time_act = scrape_flight_page(dep  =True)
        departure.loc[key,'time_act'] = time_act
        print(f'{counter}: flight {key} scheduled at {val["time_sch"]} departed at {time_act}')
    except:
        print(f"Error occured when calling scrape_flight_page for flight {key}")
    counter +=1 

    

1: flight BA706 scheduled at 18:40 departed at 19:05
2: flight IB3167 scheduled at 18:45 departed at 19:46
An error occured when parsing the actual time.
3: flight KL1024 scheduled at 18:50 departed at None
4: flight AA139 scheduled at 18:50 departed at 19:18
5: flight IB3173 scheduled at 19:05 departed at 19:32
6: flight BA482 scheduled at 19:10 departed at 19:24
7: flight BA1352 scheduled at 19:10 departed at 19:30
8: flight BA1478 scheduled at 19:15 departed at 19:31
9: flight LH2481 scheduled at 19:15 departed at 19:59
10: flight BA770 scheduled at 19:25 departed at 19:48
11: flight BA376 scheduled at 19:25 departed at 20:00
Error occured when calling scrape_flight_page for flight OS456
13: flight BA956 scheduled at 19:25 departed at 20:47
14: flight SK812 scheduled at 19:30 departed at 20:02
Error occured when calling scrape_flight_page for flight BA870
16: flight EI177 scheduled at 19:45 departed at 20:16
17: flight BA352 scheduled at 19:45 departed at 20:25
18: flight BA858 sche

In [369]:
departure.isnull().sum()

time_sch    0
dest        0
url         0
time_act    7
dtype: int64

### 2.2 Arrivals

In [139]:
driver.get("https://www.heathrow.com/arrivals")
time.sleep(5) 

# confirm the page is loadded properly
input("Enter when the page is loaded")

# get to top of the day
earlier_flight_button = '//*[@id="flight-list-app"]/div/div[2]/div[2]//button[1]'
while True:
    try:
        driver.find_element(By.XPATH,earlier_flight_button).send_keys(Keys.RETURN)
        time.sleep(1) 
    except:
        print("Loaded to the top of the list")
        break

Loaded to the top of the list


In [140]:
times = []
codes = []
citys = []
urls = []

# scrape the first page
scrape_heathrow_page()

# loop through all schedule of the date
later_flight_button = '//*[@id="flight-list-app"]/div/div[2]/div[2]/div/div[3]/button'
while True:
	try: 
		# load later flights
		driver.find_element(By.XPATH,later_flight_button).send_keys(Keys.RETURN)
		# add the data the the list
		scrape_heathrow_page()
	except:
		print("Reached the end of the list")
		break

Flight BA058 departing for Cape Town at 04:45: https://www.heathrow.com/arrivals/terminal-5/flight-details/BA058/19-12-2023
Flight BA074 departing for Lagos at 04:50: https://www.heathrow.com/arrivals/terminal-5/flight-details/BA074/19-12-2023
Flight VS450 departing for Johannesburg at 04:55: https://www.heathrow.com/arrivals/terminal-3/flight-details/VS450/19-12-2023
Flight BA056 departing for Johannesburg at 05:00: https://www.heathrow.com/arrivals/terminal-5/flight-details/BA056/19-12-2023
Flight QF009 departing for Melbourne at 05:05: https://www.heathrow.com/arrivals/terminal-3/flight-details/QF009/19-12-2023
Flight BA262 departing for Riyadh at 05:15: https://www.heathrow.com/arrivals/terminal-5/flight-details/BA262/19-12-2023
Flight BA016 departing for Sydney at 05:25: https://www.heathrow.com/arrivals/terminal-5/flight-details/BA016/19-12-2023
Flight SV119 departing for Riyadh at 05:25: https://www.heathrow.com/arrivals/terminal-4/flight-details/SV119/19-12-2023
Flight BA078 de

In [146]:
arrivals = pd.DataFrame({"time_sch":times,'code':codes,'orig':citys, 'url':urls})
arrivals = arrivals.set_index('code')
arrivals.head()

Unnamed: 0_level_0,time_sch,orig,url
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BA058,04:45,Cape Town,https://www.heathrow.com/arrivals/terminal-5/f...
BA074,04:50,Lagos,https://www.heathrow.com/arrivals/terminal-5/f...
VS450,04:55,Johannesburg,https://www.heathrow.com/arrivals/terminal-3/f...
BA056,05:00,Johannesburg,https://www.heathrow.com/arrivals/terminal-5/f...
QF009,05:05,Melbourne,https://www.heathrow.com/arrivals/terminal-3/f...


#### 2.2.2 Scrape indivual page

In [382]:
driver.quit()
initate_driver("https://www.heathrow.com")

  driver = webdriver.Chrome(driver_path)


In [None]:
# iterate through rows
counter = 1
# set up headless driver
for key, val in arrivals[arrivals['time_act'].isnull()].iterrows():
    driver.get(val['url'])
    try:
        time_act = scrape_flight_page(dep = False)
        print(f'{counter}: flight {key} scheduled at {val["time_sch"]} departed at {time_act}')
    except:
        print(f"Error occured when calling scrape_flight_page for flight {key}")
    arrivals.loc[key,'time_act'] = time_act
    counter +=1 

In [386]:
arrivals.isnull().sum()

time_sch    0
orig        0
url         0
time_act    9
dtype: int64

### 2.3 Parse the Data

In [388]:
departure['orig'] = ["London" for _ in range(len(departure))]
arrivals['dest'] = ['London' for _ in range(len(arrivals))]
df = pd.concat([departure, arrivals])

In [389]:
df.to_csv("./19DEC2023_LHR.csv", index = False)

In [42]:
df.head()

Unnamed: 0,time,code,dest,orig
0,06:00,TP1363,Lisbon,London
1,06:00,OS458,Vienna,London
2,06:00,LX345,Zurich,London
3,06:05,BA472,Barcelona,London
4,06:15,BA1414,Belfast,London


## 3. Hong Kong International Airport

In [4]:
# get to top of list
def HKG_go_to_top():
    while True:
        try:
            driver.find_element(By.XPATH, "//a[@class='loadEarlierBox']").send_keys(Keys.RETURN)
        except:
            print("Reached the top of the list")
def HKG_load_more():
    driver.find_element(By.XPATH, "//a[contains(@class,'loadMore')]").send_keys(Keys.RETURN)

### 3.1.1 Passenger Departures

In [5]:
initate_driver("https://www.hongkongairport.com/en/flights/departures/passenger.page", True)
driver.get("https://www.hongkongairport.com/en/flights/departures/passenger.page")
input("Select the desired date.")

  driver = webdriver.Chrome(driver_path)


''

In [6]:
HKG_load_more()

In [7]:
flights = driver.find_elements(By.XPATH,"//div[@class='resultDataContainerBox']//tbody/tr[@class='data']")
flights_even = driver.find_elements(By.XPATH,"//div[@class='resultDataContainerBox']//tbody/tr[@class='data even']")
flights.extend(flights_even)

In [8]:
times = []
codes = []
citys = []
times_act = []

for flight in flights:
    try:
        code = flight.find_element(By.XPATH,".//span[@class='flightNo']").text
        city = flight.find_element(By.XPATH,".//td[@class='destData']").text
        ftime = flight.find_element(By.XPATH,".//td[@class='timeData']").text
        time_act = flight.find_element(By.XPATH,".//td[@class='statusData']").text
        try:
            times.append(ftime)
            codes.append(code)
            citys.append(city)
            times_act.append(time_act)
            print(f"Flight {code} departing for {city} schedule at {ftime}: departed at {time_act}")
        except:
            print("Error when loading data to list")
    except:
        print("An error occured when parsing the flight data")


Flight CX 261 departing for Paris schedule at 00:05: departed at Dep 00:14
Flight CX 880 departing for Los Angeles schedule at 00:15: departed at Dep 00:23
Flight CX 105 departing for Melbourne schedule at 00:25: departed at Dep 00:37
Flight CX 749 departing for Johannesburg schedule at 00:30: departed at Dep 00:52
Flight EK 381 departing for Dubai schedule at 00:35: departed at Dep 00:35
Flight CX 872 departing for San Francisco schedule at 00:55: departed at Dep 01:03
Flight CX 888 departing for Vancouver schedule at 01:00: departed at Dep 01:19
Flight HX 763 departing for Bangkok schedule at 01:20: departed at Dep 01:32
Flight 7C 2108 departing for Seoul/ICN schedule at 01:30: departed at Dep 02:09
Flight CX 566 departing for Osaka/Kansai schedule at 01:50: departed at Dep 02:08
Flight HX 769 departing for Bangkok schedule at 02:05: departed at Dep 02:06
Flight HX 606 departing for Tokyo/NRT schedule at 02:20: departed at Dep 02:14
Flight HX 602 departing for Osaka/Kansai schedule a

In [9]:
hkg_dep_pas = pd.DataFrame({"time_sch":times, 'code':codes,
                                'dest':citys,'time_act':times_act})
hkg_dep_pas = hkg_dep_pas.set_index('code')
hkg_dep_pas.head()

Unnamed: 0_level_0,time_sch,dest,time_act
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CX 261,00:05,Paris,Dep 00:14
CX 880,00:15,Los Angeles,Dep 00:23
CX 105,00:25,Melbourne,Dep 00:37
CX 749,00:30,Johannesburg,Dep 00:52
EK 381,00:35,Dubai,Dep 00:35


### 3.1.2 Cargo departures

In [10]:
driver.get("https://www.hongkongairport.com/en/flights/departures/cargo.page")
input("Select the date of interest on the interactive window")

''

In [11]:
HKG_load_more()

In [12]:
flights = driver.find_elements(By.XPATH,"//div[@class='resultDataContainerBox']//tbody/tr[@class='data']")
flights_even = driver.find_elements(By.XPATH,"//div[@class='resultDataContainerBox']//tbody/tr[@class='data even']")
flights.extend(flights_even)

In [13]:
times = []
codes = []
citys = []
times_act = []

for flight in flights:
    try:
        code = flight.find_element(By.XPATH,".//span[@class='flightNo']").text
        city = flight.find_element(By.XPATH,".//td[@class='destData']").text
        ftime = flight.find_element(By.XPATH,".//td[@class='timeData']").text
        time_act = flight.find_element(By.XPATH,".//td[@class='statusData']").text
        try:
            times.append(ftime)
            codes.append(code)
            citys.append(city)
            times_act.append(time_act)
            print(f"Flight {code} departing for {city} schedule at {ftime}: {time_act}")
        except:
            print("Error when loading data to list")
    except:
        print("An error occured when parsing the flight data")


Flight CX 2067 departing for Dubai/DWC
Amsterdam schedule at 00:15: Dep 00:57
Flight EK 9837 departing for Dubai/DWC schedule at 01:10: Dep 07:25
Flight SV 987 departing for Riyadh schedule at 01:25: Dep 01:33
Flight NH 8512 departing for Tokyo/NRT schedule at 02:40: Dep 03:04
Flight RH 962 departing for Osaka/Kansai schedule at 02:55: Dep 09:52
Flight LD 327 departing for Singapore schedule at 03:20: Dep 03:27
Flight KE 314 departing for Seoul/ICN schedule at 03:20: Dep 03:28
Flight LD 720 departing for Chengdu schedule at 03:25: Dep 03:19
Flight LD 204 departing for Osaka/Kansai schedule at 03:30: Dep 04:06
Flight LD 457 departing for Manila
Cebu schedule at 03:40: Dep 04:09
Flight CX 2070 departing for Anchorage
Miami schedule at 03:40: Dep 05:01
Flight LD 216 departing for Nagoya schedule at 03:45: Dep 03:57
Flight LD 561 departing for Ho Chi Minh schedule at 03:45: Dep 04:19
Flight LD 782 departing for Shanghai/PVG schedule at 03:50: Dep 03:59
Flight 3S 543 departing for Singapore

In [14]:
hkg_dep_car = pd.DataFrame({"time_sch":times, 'code':codes,
                                'dest':citys,'time_act':times_act})
hkg_dep_car = hkg_dep_car.set_index('code')
hkg_dep_car.head()

Unnamed: 0_level_0,time_sch,dest,time_act
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CX 2067,00:15,Dubai/DWC\nAmsterdam,Dep 00:57
EK 9837,01:10,Dubai/DWC,Dep 07:25
SV 987,01:25,Riyadh,Dep 01:33
NH 8512,02:40,Tokyo/NRT,Dep 03:04
RH 962,02:55,Osaka/Kansai,Dep 09:52


### 3.2.1 Passenger Arrivals

In [15]:
driver.get("https://www.hongkongairport.com/en/flights/arrivals/passenger.page")
input("Select the date of interest on the interactive window")

''

In [16]:
HKG_load_more()

In [17]:
flights = driver.find_elements(By.XPATH,"//div[@class='resultDataContainerBox']//tbody/tr[@class='data']")
flights_even = driver.find_elements(By.XPATH,"//div[@class='resultDataContainerBox']//tbody/tr[@class='data even']")
flights.extend(flights_even)

In [18]:
times = []
codes = []
citys = []
times_act = []

for flight in flights:
    try:
        code = flight.find_element(By.XPATH,".//span[@class='flightNo']").text
        city = flight.find_element(By.XPATH,".//td[@class='originData']").text
        ftime = flight.find_element(By.XPATH,".//td[@class='timeData']").text
        time_act = flight.find_element(By.XPATH,".//td[@class='statusData']").text
        try:
            times.append(ftime)
            codes.append(code)
            citys.append(city)
            times_act.append(time_act)
            print(f"Flight {code} departing from {city} schedule at {ftime}: arrived at {time_act}")
        except:
            print("Error when loading data to list")
    except:
        print("An error occured when parsing the flight data")


Flight MU 725 departing from Shanghai/PVG schedule at 00:15: arrived at At gate 00:04
Flight CX 636 departing from Singapore schedule at 00:20: arrived at At gate 00:12
Flight UO 651 departing from Tokyo/NRT schedule at 00:30: arrived at At gate 01:01
Flight MM 067 departing from Osaka/Kansai schedule at 00:45: arrived at At gate 01:48
Flight UO 863 departing from Osaka/Kansai schedule at 01:15: arrived at At gate 01:52
Flight TW 117 departing from Seoul/ICN schedule at 01:30: arrived at At gate 03:03
Flight HX 637 departing from Osaka/Kansai schedule at 02:35: arrived at At gate 02:32
Flight UO 765 departing from Phuket schedule at 04:20: arrived at At gate 03:58
Flight CX 662 departing from Dhaka schedule at 04:45: arrived at At gate 04:21
Flight HB 232 departing from Manila schedule at 05:10: arrived at At gate 05:18
Flight CX 829 departing from Toronto schedule at 05:25: arrived at At gate 04:51
Flight UK 101 departing from Delhi schedule at 05:55: arrived at At gate 06:06
Flight U

In [19]:
hkg_arrival_pas = pd.DataFrame({"time_sch":times, 'code':codes,
                                'orig':citys,'time_act':times_act})
hkg_arrival_pas = hkg_arrival_pas.set_index('code')
hkg_arrival_pas.head()

Unnamed: 0_level_0,time_sch,orig,time_act
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MU 725,00:15,Shanghai/PVG,At gate 00:04
CX 636,00:20,Singapore,At gate 00:12
UO 651,00:30,Tokyo/NRT,At gate 01:01
MM 067,00:45,Osaka/Kansai,At gate 01:48
UO 863,01:15,Osaka/Kansai,At gate 01:52


### 3.2.2 Arrival Cargo

In [20]:
driver.get("https://www.hongkongairport.com/en/flights/arrivals/cargo.page")
input("Select the date of interest on the interactive window")

''

In [21]:
HKG_load_more()

In [22]:
flights = driver.find_elements(By.XPATH,"//div[@class='resultDataContainerBox']//tbody/tr[@class='data']")
flights_even = driver.find_elements(By.XPATH,"//div[@class='resultDataContainerBox']//tbody/tr[@class='data even']")
flights.extend(flights_even)

In [23]:
times = []
codes = []
citys = []
times_act = []


for flight in flights:
    try:
        code = flight.find_element(By.XPATH,".//span[@class='flightNo']").text
        city = flight.find_element(By.XPATH,".//td[@class='destData']").text
        ftime = flight.find_element(By.XPATH,".//td[@class='timeData']").text
        time_act = flight.find_element(By.XPATH,".//td[@class='statusData']").text
        try:
            times.append(ftime)
            codes.append(code)
            citys.append(city)
            times_act.append(time_act)
            print(f"Flight {code} departing from {city} schedule at {ftime}: arrived at {time_act}")
        except:
            print("Error when loading data to list")
    except:
        print("An error occured when parsing the flight data")


Flight EK 9784 departing from Dubai/DWC schedule at 00:00: arrived at At gate 00:00
Flight KZ 203 departing from Tokyo/NRT schedule at 00:25: arrived at At gate 01:07
Flight LD 129 departing from Seoul/ICN schedule at 00:45: arrived at At gate 01:16
Flight CX 097 departing from Felipe Angeles
Guadalajara
Anchorage schedule at 00:55: arrived at At gate 03:54
Flight CI 5835 departing from Taipei schedule at 01:00: arrived at At gate 04:46
Flight NH 8511 departing from Tokyo/NRT schedule at 01:10: arrived at At gate 01:41
Flight CX 035 departing from Osaka/Kansai
Seoul/ICN schedule at 01:15: arrived at At gate 03:18
Flight 8K 525 departing from Hanoi schedule at 01:20: arrived at At gate 01:30
Flight LD 326 departing from Singapore schedule at 01:25: arrived at At gate 01:36
Flight LD 842 departing from Bangkok schedule at 01:25: arrived at At gate 01:07
Flight LD 209 departing from Tokyo/NRT schedule at 01:30: arrived at At gate 02:05
Flight 3S 520 departing from Leipzig
Bahrain schedule

In [24]:
hkg_arrival_car = pd.DataFrame({"time_sch":times, 'code':codes,
                                'orig':citys,'time_act':times_act})
hkg_arrival_car = hkg_arrival_car.set_index('code')
hkg_arrival_car.head()

Unnamed: 0_level_0,time_sch,orig,time_act
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EK 9784,00:00,Dubai/DWC,At gate 00:00
KZ 203,00:25,Tokyo/NRT,At gate 01:07
LD 129,00:45,Seoul/ICN,At gate 01:16
CX 097,00:55,Felipe Angeles\nGuadalajara\nAnchorage,At gate 03:54
CI 5835,01:00,Taipei,At gate 04:46


In [25]:
hkg_arrival_car.tail()

Unnamed: 0_level_0,time_sch,orig,time_act
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RH 9376,22:15,Chennai,At gate 04:56 (23/12/2023)
FX 9799,23:00,Anchorage,At gate 00:52 (23/12/2023)
ES 723,23:20,Bahrain,At gate 22:52
CX 2048,23:30,Hanoi,At gate 23:30
CX 2074,23:45,Singapore\nPenang,At gate 00:39 (23/12/2023)


In [26]:
driver.quit()

### 3.3 Combine the dataframes

In [27]:
# add column for passenger 
hkg_dep_pas['commercial'] = [1 for _ in range(len(hkg_dep_pas))]
hkg_arrival_pas['commercial'] = [1 for _ in range(len(hkg_arrival_pas))]
hkg_dep_car['commercial'] = [0 for _ in range(len(hkg_dep_car))]
hkg_arrival_car['commercial'] = [0 for _ in range(len(hkg_arrival_car))]

In [28]:
# add original and destination
hkg_dep_pas['orig'] =["HKG" for _ in range(len(hkg_dep_pas))]
hkg_dep_car['orig'] = ["HKG" for _ in range(len(hkg_dep_car))]

hkg_arrival_pas['dest'] = ["HKG" for _ in range(len(hkg_arrival_pas))]
hkg_arrival_car['dest'] = ["HKG" for _ in range(len(hkg_arrival_car))]

In [29]:
dfs = [hkg_dep_pas,hkg_dep_car,hkg_arrival_pas,hkg_arrival_car]
hkg = pd.concat(dfs).sort_values("time_sch")

In [39]:
pd.Timestamp.today().month_name()[0:3].upper()

'DEC'

In [41]:
hkg.to_csv(f"./{day}{month}2023_HKG.csv", index = True)

## 4. Schipool Airport
https://www.schiphol.nl/en/departures/
they do not have a date filter

In [42]:
def AMS_go_to_top():
    earlier_flight_button = "//div[contains(@class,'flights--cards')]//*[@id='flights-load-previous']"
    while True:
        try:
            driver.find_element(By.XPATH,earlier_flight_button).send_keys(Keys.RETURN)
        except:
            print("Loaded to the top of the list")
            break

def AMS_expand_page():
    load_flight_button = "//div[contains(@class,'flights--cards')]//*[@id='flights-load-next']"
    while True:
        try:
            driver.find_element(By.XPATH,load_flight_button).send_keys(Keys.RETURN)
            time.sleep(0.1)
        except:
            print("Reached bottom of the list")
            break

### 4.1 Departure

In [44]:
initate_driver("https://www.schiphol.nl/en/departures/?datetime=2023-12-19&query=")
input("Select the date page on the interactive window")

  driver = webdriver.Chrome(driver_path)


''

In [47]:
AMS_expand_page()
flights = driver.find_elements(By.XPATH, "//div[contains(@class,'flights--cards')]//li[contains(@class,'card-flight')]")
len(flights)

Reached bottom of the list


735

In [46]:
while len(flights) < 600:
    AMS_expand_page()
    flights = driver.find_elements(By.XPATH, "//div[contains(@class,'flights--cards')]//li[contains(@class,'card-flight')]")
    print(f"{len(flights)} flights loaded")

Reached bottom of the list
500 flights loaded
Reached bottom of the list
600 flights loaded


In [48]:
codes = []
citys = []
times = []
stats = []
times_act = []
for flight in flights[:700]: 
    code = flight.find_element(By.XPATH,".//span[contains(@class,'card-flight__number')]").text
    city = flight.find_element(By.XPATH,".//*[contains(@class,'card-flight__airport')]").text
    ftime = flight.find_element(By.XPATH,".//time").text
    try:
        stat = flight.find_element(By.XPATH,".//span[contains(@class,'flight-status--normal')]").text
    except:
        stat = "CANCELLED"
    try:
        time_act = flight.find_element(By.XPATH,".//ins[@class='time-delayed']").text
    except:
        time_act = ftime
    codes.append(code)
    citys.append(city)
    times.append(ftime)
    stats.append(stat)
    times_act.append(time_act)
    print(f'{stat} Flight {code} scheduled at {ftime} departed to {city} at {time_act}')


DEPARTED Flight HV 5641 Transavia scheduled at 06:00 departed to La Palma (SPC) at 06:00
DEPARTED Flight HV 6457 Transavia scheduled at 06:00 departed to Las Palmas de Gran Canaria (LPA) at 06:10
DEPARTED Flight HV 5071 Transavia scheduled at 06:05 departed to Tromso (TOS) at 06:16
DEPARTED Flight HV 6731 Transavia scheduled at 06:20 departed to Sevilla (SVQ) at 06:20
DEPARTED Flight HV 5355 Transavia scheduled at 06:30 departed to Faro (FAO) at 06:47
DEPARTED Flight HV 6901 Transavia scheduled at 06:30 departed to Dubai International (DXB) at 06:30
DEPARTED Flight HV 6591 Transavia scheduled at 06:40 departed to Salzburg (SZG) at 06:56
DEPARTED Flight HV 5581 Transavia scheduled at 06:45 departed to Nice (NCE) at 07:01
DEPARTED Flight KL 1141 KLM scheduled at 06:45 departed to Oslo (OSL) at 06:45
CANCELLED Flight KL 1165 KLM scheduled at 06:45 departed to Helsinki (HEL) at 06:45
DEPARTED Flight HV 6115 Transavia scheduled at 06:50 departed to Malaga (AGP) at 07:20
DEPARTED Flight HV 6

In [49]:
AMS_dep = pd.DataFrame({"time_sch":times,"time_act": times_act,'code':codes,
                            'dest':citys,'stat':stats})


In [57]:
AMS_dep.tail(66)

Unnamed: 0,time_sch,time_act,code,dest,stat
634,23:05,23:34,TK 1956 Turkish Airlines,Istanbul Airport (IST),DEPARTED
635,05:00,05:20,HV 407 Transavia,Kittila (KTT),DEPARTED
636,05:10,05:32,HV 6119 Transavia,Malaga (AGP),DEPARTED
637,05:25,05:42,HV 6947 Transavia,Alicante (ALC),DEPARTED
638,05:40,05:54,HV 705 Transavia,Ivalo (IVL),DEPARTED
...,...,...,...,...,...
695,08:00,08:00,OR 2201 TUI fly,Salen (SCR),DEPARTED
696,08:05,08:05,KL 1071 KLM,Manchester (MAN),DEPARTED
697,08:05,08:05,KL 1153 KLM,Gothenburg (GOT),DEPARTED
698,08:05,08:05,KL 1277 KLM,Edinburgh (EDI),DEPARTED


In [58]:
AMS_dep = AMS_dep.iloc[:615,]

#### 4.2 Arrivals

In [59]:
driver.get("https://www.schiphol.nl/en/arrivals/?datetime=2023-12-19&query=")

In [60]:
flights = driver.find_elements(By.XPATH, "//div[contains(@class,'flights--cards')]//li[contains(@class,'card-flight')]")
len(flights)

50

In [61]:
while len(flights) < 610:
    AMS_expand_page()
    flights = driver.find_elements(By.XPATH, "//div[contains(@class,'flights--cards')]//li[contains(@class,'card-flight')]")
    print(f"{len(flights)} flights loaded")

Reached bottom of the list
100 flights loaded
Reached bottom of the list
400 flights loaded
Reached bottom of the list
450 flights loaded
Reached bottom of the list
646 flights loaded


In [62]:
codes = []
citys = []
times = []
stats = []
times_act = []
for flight in flights: # shift 500 to the left
    code = flight.find_element(By.XPATH,".//span[contains(@class,'card-flight__number')]").text
    city = flight.find_element(By.XPATH,".//*[contains(@class,'card-flight__airport')]").text
    ftime = flight.find_element(By.XPATH,".//time").text
    try:
        stat = flight.find_element(By.XPATH,".//span[contains(@class,'flight-status--normal')]").text
    except:
        stat = "CANCELLED"
    try:
        time_act = flight.find_element(By.XPATH,".//ins[@class='time-delayed']").text
    except:
        time_act = ftime
    codes.append(code)
    citys.append(city)
    times.append(ftime)
    stats.append(stat)
    times_act.append(time_act)
    print(f'Flight {code} scheduled at {ftime} departed from {city} arrived at {time_act}')


Flight HV 5956 Transavia scheduled at 00:00 departed from Lisbon (LIS) arrived at 00:35
Flight OR 3721 TUI fly scheduled at 00:05 departed from Sao Vicente Island (VXE) arrived at 00:33
Flight HV 5752 Transavia scheduled at 00:15 departed from Marrakech (RAK) arrived at 00:57
Flight HV 6110 Transavia scheduled at 00:15 departed from Malaga (AGP) arrived at 00:45
Flight HV 6672 Transavia scheduled at 00:35 departed from Tenerife (TFS) arrived at 01:20
Flight HV 6706 Transavia scheduled at 00:40 departed from Fuerteventura (FUE) arrived at 01:05
Flight HV 5672 Transavia scheduled at 00:45 departed from Ibiza (IBZ) arrived at 01:17
Flight HV 6506 Transavia scheduled at 00:55 departed from Amman (AMM) arrived at 01:23
Flight HV 5228 Transavia scheduled at 01:25 departed from Hurghada (HRG) arrived at 01:25
Flight KL 588 KLM scheduled at 05:20 departed from Lagos (LOS) arrived at 05:20
Flight KL 590 KLM scheduled at 05:45 departed from Accra (ACC) arrived at 05:45
Flight KL 810 KLM schedule

In [63]:
AMS_arr = pd.DataFrame({"time_sch":times,"time_act": times_act,'code':codes,
                            'orig':citys,'stat':stats})

In [47]:
AMS_arr.tail(10)

Unnamed: 0,time_sch,time_act,code,orig,stat
603,23:55,01:00,HV 5218 Transavia,Catania (CTA),BAGGAGE ON BELT
604,00:00,00:35,HV 5956 Transavia,Lisbon (LIS),BAGGAGE ON BELT
605,00:05,00:33,OR 3721 TUI fly,Sao Vicente Island (VXE),BAGGAGE ON BELT
606,00:15,00:57,HV 5752 Transavia,Marrakech (RAK),BAGGAGE ON BELT
607,00:15,00:45,HV 6110 Transavia,Malaga (AGP),BAGGAGE ON BELT
608,00:35,01:20,HV 6672 Transavia,Tenerife (TFS),LANDED
609,00:40,01:05,HV 6706 Transavia,Fuerteventura (FUE),BAGGAGE ON BELT
610,00:45,01:17,HV 5672 Transavia,Ibiza (IBZ),LANDED
611,00:55,01:23,HV 6506 Transavia,Amman (AMM),LANDED
612,01:25,01:25,HV 5228 Transavia,Hurghada (HRG),LANDED


In [48]:
AMS_arr = AMS_arr.iloc[:604,]

In [67]:
driver.quit()

#### 4.3 Parse the df

In [64]:
AMS_dep['orig'] = ["Amsterdam" for _ in range(len(AMS_dep))]
AMS_arr['dest'] = ["Amsterdam" for _ in range(len(AMS_arr))]

In [65]:
dfs = [AMS_dep,AMS_arr]
AMS = pd.concat(dfs)


In [66]:
AMS.to_csv(f"./{day}{month}2023_AMS.csv", index = False)

## 5. Frankfurt Airport
https://www.frankfurt-airport.com/en/flights-and-transfer/departures.html

## 6. Flight Radar 24

In [None]:
# departure
driver.get("https://www.flightradar24.com/data/airports/lhr/departures")

result = find_element(By.XPATH,'//section[@id="cnt-data-content"]//')

### code Bank

In [45]:
# departure flight data page
driver.get("https://www.heathrow.com/arrivals/terminal-5/flight-details/BA280/18-12-2023")

In [46]:
res = driver.find_elements(By.XPATH, "//div[contains(@class,'show-flight-details')]")
# //section[contains(@class,'flight-card-content')//div]


In [47]:
res[1].find_element(By.XPATH, "//div[contains(@aria-label,'actual time')]").text

'18:34'