In [10]:
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Timeout, in seconds, passed to WebDriverWait when waiting for page elements.
WAIT_TIME = 100
# Seconds to sleep after each scroll-to-bottom so newly loaded content can appear.
SCROLL_PAUSE_TIME = 3

In [11]:
# Search parameters: one dict per search, expanded from
# (origin code, destination code, departure date, return date) tuples.
# Despite the *_COUNTRY key names, the codes are sent as the `dcity` / `acity`
# URL parameters by return_html_from_search.
search_parameters = [
    {
        "SOURCE_COUNTRY": origin,
        "DEST_COUNTRY": destination,
        "FROM_DATE": depart_on,
        "END_DATE": return_on,
        "NUM_ADULTS": '1',
        "TRIP_TYPE": 'rt',
    }
    for origin, destination, depart_on, return_on in (
        ('sin', 'mil', '2025-06-25', '2025-07-04'),
        ('mil', 'ams', '2025-08-03', '2025-08-24'),
        ('ams', 'san', '2025-09-13', '2025-09-29'),
        ('ams', 'sin', '2025-09-13', '2025-09-29'),
    )
]

In [12]:
import shutil
# Locate the chromedriver executable on PATH; shutil.which returns None when it is not found.
chromedriver_path = shutil.which("chromedriver")
# NOTE(review): when chromedriver_path is None, Service receives None — confirm the
# installed Selenium version can locate a driver on its own in that case.
service = Service(chromedriver_path)
# Shared Chrome options, reused by every browser session started in this notebook.
options = webdriver.ChromeOptions()
# Uncomment to run Chrome without a visible window:
# options.add_argument("--headless")

In [13]:
def return_html_from_search(search_info_dict: dict, render_pause: float = 30) -> "BeautifulSoup":
    """Run one Trip.com flight search in Chrome and return the rendered page.

    Args:
        search_info_dict: Search settings with the keys SOURCE_COUNTRY,
            DEST_COUNTRY, FROM_DATE, END_DATE, TRIP_TYPE and NUM_ADULTS. The
            values are inserted verbatim into the search URL.
        render_pause: Seconds to sleep after the "All flight results loaded"
            toast appears, giving the page time to finish rendering.

    Returns:
        A BeautifulSoup tree parsed from the page source once the waits finish.

    Raises:
        KeyError: If a required key is missing (raised before a browser starts).
        TimeoutException: If the "All flight results loaded" toast does not
            appear within WAIT_TIME seconds.
    """
    # Build the URL first so a malformed search dict fails before Chrome is launched.
    SOURCE_COUNTRY = search_info_dict['SOURCE_COUNTRY']
    DEST_COUNTRY = search_info_dict['DEST_COUNTRY']
    FROM_DATE = search_info_dict['FROM_DATE']
    END_DATE = search_info_dict['END_DATE']
    TRIP_TYPE = search_info_dict['TRIP_TYPE']
    NUM_ADULTS = search_info_dict['NUM_ADULTS']

    url = f'https://sg.trip.com/flights/showfarefirst?dcity={SOURCE_COUNTRY}&acity={DEST_COUNTRY}&ddate={FROM_DATE}&rdate={END_DATE}&triptype={TRIP_TYPE}&quantity={NUM_ADULTS}'
    print(url)

    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url)

        # Switch to the most recently opened window
        driver.switch_to.window(driver.window_handles[-1])

        print(f'Scraping {driver.title}...')

        # The website loads data as you scroll down. Simulate that to get all
        # results: keep scrolling until the page height stops growing.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Wait for new content to load
            time.sleep(SCROLL_PAUSE_TIME)

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                print("Reached the bottom. No more data to load.")
                break
            last_height = new_height

        # The data-toast-title element only appears once all flight results
        # have been loaded on the website, so block until it is present.
        wait = WebDriverWait(driver, WAIT_TIME)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '[data-toast-title="All flight results loaded"]')))

        # Sleep while data is still rendering on the page
        time.sleep(render_pause)

        # .list-placeholder-v2.list-placeholder-v2__loading is a placeholder
        # shown until a row of flight results has loaded. Wait for the
        # placeholders to disappear; on timeout, keep whatever rows did load.
        try:
            wait.until(
                EC.invisibility_of_element_located(
                    (By.CSS_SELECTOR, ".list-placeholder-v2.list-placeholder-v2__loading")
                )
            )
        except TimeoutException:
            print('not all results loaded')

        # Parse HTML with BeautifulSoup
        return BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always close the browser, even when a wait or page load raises,
        # so failed searches do not leave Chrome processes behind.
        driver.quit()

In [14]:
def _parse_flight_div(flight_div):
    """Extract one flight's fields from its result element as a dict.

    Raises AttributeError, IndexError or TypeError when an expected element or
    attribute is missing (for example, a row that has not finished rendering).
    """
    flight_info_output = {}

    flight_info_list_div = flight_div.find("div", class_="flight-info-list")
    airline_name_div = flight_info_list_div.find("div", class_="flight-info is-v2")
    flight_info_output['airline_name'] = airline_name_div.get('aria-label').strip()

    date_and_time_of_flight_div = flight_info_list_div.find("div", class_="flight-info-col col-2")

    time_spans = date_and_time_of_flight_div.find_all("span", class_="time_cbcc")
    # Drop the first 14 characters of data-testid; presumably a fixed label
    # in front of the timestamp — TODO confirm against the live markup.
    flight_info_output['departure_time'] = time_spans[0].get("data-testid")[14:]
    flight_info_output['arrival_time'] = time_spans[1].get("data-testid")[14:]

    terminal_spans = date_and_time_of_flight_div.find_all("span", class_="flight-info-stop__code_e162")
    flight_info_output['departure_terminal'] = terminal_spans[0].get_text(strip=True)
    flight_info_output['arrival_terminal'] = terminal_spans[1].get_text(strip=True)

    mode_of_transport_span = date_and_time_of_flight_div.find("span", class_="flight-info-stop__text_3ee2")
    flight_info_output['Way_Of_Travel'] = mode_of_transport_span.get('aria-label')

    travel_time_div = date_and_time_of_flight_div.find("div", class_="flight-info-duration_576d")
    flight_info_output['Time in Hours'] = travel_time_div.get_text(strip=True)

    trip_type_div = flight_div.find("div", class_="item-con-price__tag")
    flight_info_output['Trip_Type'] = trip_type_div.get_text(strip=True)

    price_span = flight_div.find("span", class_="ThemeColor8 f-20 o-price-flight_b825 no-cursor_1b76")
    flight_info_output['Price'] = '$' + price_span.get('data-price')

    return flight_info_output


def return_dataframe_of_scraped_results(soup, pause_seconds: float = 20):
    """Convert a parsed flight-results page into a de-duplicated DataFrame.

    Args:
        soup: Parsed page exposing BeautifulSoup-style ``find`` / ``find_all``.
        pause_seconds: Seconds to sleep before returning. Defaults to 20, the
            delay this function has always applied (presumably to space out
            successive searches — TODO confirm). Pass 0 to skip it.

    Returns:
        A DataFrame with one row per distinct flight and the columns
        airline_name, departure_time, arrival_time, departure_terminal,
        arrival_terminal, Way_Of_Travel, Time in Hours, Trip_Type and Price.
        The frame is empty (no columns) when no flight could be parsed,
        including when the "m-result-list" container is absent.
    """
    scraped_results = []
    skipped_rows = 0

    m_result_div = soup.find("div", class_="m-result-list")
    # A missing container means the result list never rendered: nothing to parse.
    if m_result_div is None:
        flight_divs = []
    else:
        flight_divs = m_result_div.find_all("div", attrs={"data-flight-id": True})

    for flight_div in flight_divs:
        try:
            scraped_results.append(_parse_flight_div(flight_div))
        except (AttributeError, IndexError, TypeError):
            # If the html parsing fails, the row had not been loaded yet; skip it
            skipped_rows += 1

    if skipped_rows:
        print(f'Skipped {skipped_rows} flight rows that could not be parsed')

    output_df = (
        pd.DataFrame(scraped_results)
        .drop_duplicates()
        .reset_index(drop=True)
    )
    if pause_seconds > 0:
        time.sleep(pause_seconds)
    return output_df

In [15]:
# Scrape every configured search and collect the per-search frames, then
# combine them with a single concat. This avoids re-copying the accumulated
# frame on every iteration and avoids concatenating onto an empty seed frame.
frames_per_search = []
for search_info_dict in search_parameters:
    soup = return_html_from_search(search_info_dict)
    data_frame_from_search = return_dataframe_of_scraped_results(soup)
    frames_per_search.append(data_frame_from_search)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
merged_dataframe = (
    pd.concat(frames_per_search, ignore_index=True)
    if frames_per_search
    else pd.DataFrame()
)

https://sg.trip.com/flights/showfarefirst?dcity=sin&acity=mil&ddate=2025-06-25&rdate=2025-07-04&triptype=rt&quantity=1
Scraping ...
Reached the bottom. No more data to load.
not all results loaded
https://sg.trip.com/flights/showfarefirst?dcity=mil&acity=ams&ddate=2025-08-03&rdate=2025-08-24&triptype=rt&quantity=1
Scraping ...
Reached the bottom. No more data to load.
not all results loaded
https://sg.trip.com/flights/showfarefirst?dcity=ams&acity=san&ddate=2025-09-13&rdate=2025-09-29&triptype=rt&quantity=1
Scraping ...
Reached the bottom. No more data to load.
https://sg.trip.com/flights/showfarefirst?dcity=ams&acity=sin&ddate=2025-09-13&rdate=2025-09-29&triptype=rt&quantity=1
Scraping ...
Reached the bottom. No more data to load.
not all results loaded


In [16]:
# Plain-text dump of the combined results from all searches.
print(merged_dataframe)

                                         airline_name     departure_time   
0                                  Singapore Airlines  25-06-25 23:30:00  \
1   China Eastern Airlines ,China Eastern Airlines...  25-06-25 00:55:00   
2   China Eastern Airlines ,China Eastern Airlines...  25-06-25 14:20:00   
3   Air China ,Air China      Transfer in Shanghai...  25-06-25 17:30:00   
4   Air China ,Air China      Transfer in Beijing ...  25-06-25 09:00:00   
..                                                ...                ...   
77  Air India ,Air India      Transfer in New Delh...  25-09-13 20:35:00   
78  Air India ,Air India      Transfer in New Delh...  25-09-13 20:35:00   
79  Etihad Airways ,Etihad Airways      Transfer i...  25-09-13 22:00:00   
80  Lufthansa ,Lufthansa      Transfer in Frankfur...  25-09-13 19:25:00   
81  Lufthansa ,Lufthansa      Transfer in Frankfur...  25-09-13 18:30:00   

         arrival_time departure_terminal arrival_terminal   
0   25-06-26 06:30:00     

In [17]:
# Rich notebook display of the combined results frame.
merged_dataframe

Unnamed: 0,airline_name,departure_time,arrival_time,departure_terminal,arrival_terminal,Way_Of_Travel,Time in Hours,Trip_Type,Price
0,Singapore Airlines,25-06-25 23:30:00,25-06-26 06:30:00,SIN T3,MXP T1,Direct,13h,Return,$1595
1,"China Eastern Airlines ,China Eastern Airlines...",25-06-25 00:55:00,25-06-25 19:35:00,SIN T3,MXP T1,7h in Shanghai,24h 40m,Return,$917
2,"China Eastern Airlines ,China Eastern Airlines...",25-06-25 14:20:00,25-06-26 20:00:00,SIN T3,MXP T1,17h 50m in Xi'an,35h 40m,Return,$917
3,"Air China ,Air China Transfer in Shanghai...",25-06-25 17:30:00,25-06-26 08:05:00,SIN T1,MXP T1,2h 15m in Shanghai,20h 35m,Return,$921
4,"Air China ,Air China Transfer in Beijing ...",25-06-25 09:00:00,25-06-26 06:30:00,SIN T1,MXP T1,10h 15m in Beijing,27h 30m,Return,$951
...,...,...,...,...,...,...,...,...,...
77,"Air India ,Air India Transfer in New Delh...",25-09-13 20:35:00,25-09-14 21:45:00,AMS,SIN T2,4h 40m in New Delhi,19h 10m,Return,$899
78,"Air India ,Air India Transfer in New Delh...",25-09-13 20:35:00,25-09-15 07:30:00,AMS,SIN T2,15h in New Delhi,28h 55m,Return,$899
79,"Etihad Airways ,Etihad Airways Transfer i...",25-09-13 22:00:00,25-09-15 09:40:00,AMS,SIN T2,15h 25m in Abu Dhabi,29h 40m,Return,$986
80,"Lufthansa ,Lufthansa Transfer in Frankfur...",25-09-13 19:25:00,25-09-14 16:25:00,AMS,SIN T2,1h 20m in Frankfurt,15h,Return,$1036
