In [1]:
from selenium import webdriver
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
import time
from datetime import datetime, timedelta
from itertools import combinations
from loguru import logger
from bs4 import BeautifulSoup
import pandas as pd

# Timeout budget for page waits (raised from an earlier value).
# NOTE(review): presumably seconds for WebDriverWait - not referenced in the visible cells.
WAIT_TIMEOUT = 20


def _notebook_sink(message):
    """Echo an already-formatted loguru message into the cell output."""
    print(message, end="")


# Logging: drop loguru's default handler, then log INFO and above both to a
# rotating file and to the notebook output.
logger.remove()
logger.add("scraping.log", level="INFO", rotation="500 MB")
logger.add(_notebook_sink, level="INFO", colorize=True)

# Browser: a single incognito Chrome session driven through undetected_chromedriver.
chrome_options = Options()
chrome_options.add_argument("--incognito")
driver = uc.Chrome(options=chrome_options)

# Search space: every unordered airport pair (queried in both directions),
# every cabin class, and every calendar day from start_date to end_date inclusive.
airports = ['HAN', 'DAD', 'PQC', 'CXR', 'DLI', 'SGN']
classes = ['ECONOMY', 'PREMIUM_ECONOMY', 'BUSINESS']
start_date = datetime(2024, 8, 8)
end_date = datetime(2024, 12, 31)
base_url = "https://www.traveloka.com/en-vn/flight/fullsearch?ap={}.{}&dt={}.NA&ps=1.0.0&sc={}"

# Pre-format each travel date once (DD-MM-YYYY) instead of once per route/class.
day_count = (end_date - start_date).days + 1
date_strings = [
    (start_date + timedelta(days=offset)).strftime("%d-%m-%Y")
    for offset in range(day_count)
]

# Ordering matters to later cells that index into `urls`: for each pair, for
# each class, for each date -> outbound URL first, then the reverse direction.
urls = []
for dep, arr in combinations(airports, 2):
    for travel_class in classes:
        for date_str in date_strings:
            for origin, destination in ((dep, arr), (arr, dep)):
                urls.append(base_url.format(origin, destination, date_str, travel_class))

In [2]:
def get_flights(driver):
    """Return the flight result cards found on the current search-results page.

    Two absolute XPaths under the ``FLIGHT_SEARCH_RESULT_CONTENT`` element are
    tried in order. For each one, the matched elements are kept only if their
    ``data-testid`` attribute starts with ``flight-inventory-card-container-``.
    The first XPath that yields at least one such element wins.

    Args:
        driver: A Selenium WebDriver (anything exposing ``find_elements``).

    Returns:
        A list of WebElements for the flight cards, or ``[]`` when neither
        XPath yields a matching card.
    """
    card_prefix = 'flight-inventory-card-container-'
    flight_xpaths = [
        '//*[@id="FLIGHT_SEARCH_RESULT_CONTENT"]/div[5]/div[3]/div/div/div[@data-testid]',
        '//*[@id="FLIGHT_SEARCH_RESULT_CONTENT"]/div[6]/div[3]/div/div/div/div[@data-testid]'
    ]

    for xpath in flight_xpaths:
        valid_flights = [
            flight
            for flight in driver.find_elements(By.XPATH, xpath)
            # get_attribute returns None when the attribute is not set; treat
            # that as "not a flight card" instead of failing on None.startswith.
            if (flight.get_attribute('data-testid') or '').startswith(card_prefix)
        ]
        if valid_flights:
            return valid_flights
    return []

In [3]:
# Open one sample search page to develop the scraping steps against.
# With the generation order used for `urls`, index 10 is the HAN -> DAD,
# ECONOMY search for 13-08-2024.
driver.get(urls[10])

In [4]:
# The result cards may not be in the DOM yet when driver.get() returns
# (assumption: the page renders them client-side), so poll get_flights until it
# returns a non-empty list or WAIT_TIMEOUT seconds elapse. Elements replaced
# mid-poll raise StaleElementReferenceException; ignore those and retry.
try:
    flights = WebDriverWait(
        driver,
        WAIT_TIMEOUT,
        ignored_exceptions=(StaleElementReferenceException,),
    ).until(get_flights)
except TimeoutException:
    # Keep the "empty list when nothing is found" result of a direct call.
    logger.warning(f"No flight cards found within {WAIT_TIMEOUT} seconds")
    flights = []

In [5]:
# Expand the first card returned by get_flights and capture the HTML of its
# details panel as a BeautifulSoup tree in `details_html`.
# NOTE(review): the cells for flights[1], flights[2] and flights[3] repeat this
# sequence; consider a shared helper once the XPaths are settled.
flight = flights[0]
buttons_xpath = './/div/div/div/div/div[2]/div[1]/div'
buttons = flight.find_elements(By.XPATH, buttons_xpath)
if not buttons:
    # find_elements returns an empty list instead of raising, so report a
    # descriptive error rather than a bare IndexError from buttons[0].
    raise NoSuchElementException(
        f"No clickable element matched {buttons_xpath!r} inside the flight card"
    )
# The click is dispatched through JavaScript rather than WebElement.click().
driver.execute_script("arguments[0].click();", buttons[0])

# The panel is looked up at two relative locations; the second is used when the
# first is missing or holds fewer than 50 characters of text.
details_area = None
details_html = None
for details_xpath in ('.//div/div/div/div[3]/div/div', './/div/div[1]/div[4]/div/div'):
    # find_elements (not find_element) so a missing first location falls
    # through to the second one instead of raising.
    matches = flight.find_elements(By.XPATH, details_xpath)
    if not matches:
        continue
    details_area = matches[0]
    details_html = BeautifulSoup(details_area.get_attribute('innerHTML'), 'html.parser')
    if len(details_html.text) >= 50:
        break

if details_html is None:
    raise NoSuchElementException(
        "No details panel found at either known location inside the flight card"
    )

In [6]:
# Display the parsed details-panel HTML captured for flights[0].
details_html

<div class="css-1dbjc4n r-1ielgck r-1ivxbpt r-1i1ao36"><div class="css-1dbjc4n r-1ielgck r-otx420 r-1i1ao36"><div class="css-1dbjc4n r-e8mqni r-1habvwh r-18u37iz r-1h0z5md r-11c0sde"><div class="css-1dbjc4n r-e8mqni r-1habvwh r-13awgt0 r-18u37iz r-1h0z5md r-88pszg"><div class="css-1dbjc4n r-13awgt0 r-13qz1uu"><div class="css-1dbjc4n r-1tuna9m r-1oszu61 r-18u37iz r-1h0z5md"><div class="css-1dbjc4n r-e8mqni r-1d09ksm r-1h0z5md r-ttb5dx"><div class="css-901oao r-t1w4ow r-1b43r93 r-majxgm r-rjixqe r-5oul0u r-fdjqy7" dir="auto" style="color: rgb(3, 18, 26);">21:20</div><div class="css-901oao r-t1w4ow r-1enofrn r-majxgm r-1cwl3u0 r-fdjqy7" dir="auto" style="color: rgb(104, 113, 118);">13 Aug</div></div><div class="css-1dbjc4n r-uia4a0 r-1awozwy r-1h0z5md r-13hce6t r-1mf7evn r-1jg9483"><div class="css-1dbjc4n r-14lw9ot r-1naam9t r-sdzlij r-rs99b7 r-tbmifm r-1s2bzr4 r-16eto9q"></div><div class="css-1dbjc4n r-p9fnmn r-13awgt0 r-92ng3h"></div></div><div class="css-1dbjc4n r-e8mqni r-1habvwh r-13

In [7]:
# Expand the second card returned by get_flights and capture the HTML of its
# details panel as a BeautifulSoup tree in `details_html`.
# NOTE(review): this repeats the flights[0] cell with a different index;
# consider a shared helper once the XPaths are settled.
flight = flights[1]
buttons_xpath = './/div/div/div/div/div[2]/div[1]/div'
buttons = flight.find_elements(By.XPATH, buttons_xpath)
if not buttons:
    # find_elements returns an empty list instead of raising, so report a
    # descriptive error rather than a bare IndexError from buttons[0].
    raise NoSuchElementException(
        f"No clickable element matched {buttons_xpath!r} inside the flight card"
    )
# The click is dispatched through JavaScript rather than WebElement.click().
driver.execute_script("arguments[0].click();", buttons[0])

# The panel is looked up at two relative locations; the second is used when the
# first is missing or holds fewer than 50 characters of text.
details_area = None
details_html = None
for details_xpath in ('.//div/div/div/div[3]/div/div', './/div/div[1]/div[4]/div/div'):
    # find_elements (not find_element) so a missing first location falls
    # through to the second one instead of raising.
    matches = flight.find_elements(By.XPATH, details_xpath)
    if not matches:
        continue
    details_area = matches[0]
    details_html = BeautifulSoup(details_area.get_attribute('innerHTML'), 'html.parser')
    if len(details_html.text) >= 50:
        break

if details_html is None:
    raise NoSuchElementException(
        "No details panel found at either known location inside the flight card"
    )

In [8]:
# Display the parsed details-panel HTML captured for flights[1].
details_html

<div class="css-1dbjc4n r-1ielgck r-1ivxbpt r-1i1ao36"><div class="css-1dbjc4n r-1ielgck r-otx420 r-1i1ao36"><div class="css-1dbjc4n r-e8mqni r-1habvwh r-18u37iz r-1h0z5md r-11c0sde"><div class="css-1dbjc4n r-e8mqni r-1habvwh r-13awgt0 r-18u37iz r-1h0z5md r-88pszg"><div class="css-1dbjc4n r-13awgt0 r-13qz1uu"><div class="css-1dbjc4n r-1tuna9m r-1oszu61 r-18u37iz r-1h0z5md"><div class="css-1dbjc4n r-e8mqni r-1d09ksm r-1h0z5md r-ttb5dx"><div class="css-901oao r-t1w4ow r-1b43r93 r-majxgm r-rjixqe r-5oul0u r-fdjqy7" dir="auto" style="color: rgb(3, 18, 26);">20:55</div><div class="css-901oao r-t1w4ow r-1enofrn r-majxgm r-1cwl3u0 r-fdjqy7" dir="auto" style="color: rgb(104, 113, 118);">13 Aug</div></div><div class="css-1dbjc4n r-uia4a0 r-1awozwy r-1h0z5md r-13hce6t r-1mf7evn r-1jg9483"><div class="css-1dbjc4n r-14lw9ot r-1naam9t r-sdzlij r-rs99b7 r-tbmifm r-1s2bzr4 r-16eto9q"></div><div class="css-1dbjc4n r-p9fnmn r-13awgt0 r-92ng3h"></div></div><div class="css-1dbjc4n r-e8mqni r-1habvwh r-13

In [9]:
# Expand the third card returned by get_flights and capture the HTML of its
# details panel as a BeautifulSoup tree in `details_html`.
# NOTE(review): this repeats the flights[0] cell with a different index;
# consider a shared helper once the XPaths are settled.
flight = flights[2]
buttons_xpath = './/div/div/div/div/div[2]/div[1]/div'
buttons = flight.find_elements(By.XPATH, buttons_xpath)
if not buttons:
    # find_elements returns an empty list instead of raising, so report a
    # descriptive error rather than a bare IndexError from buttons[0].
    raise NoSuchElementException(
        f"No clickable element matched {buttons_xpath!r} inside the flight card"
    )
# The click is dispatched through JavaScript rather than WebElement.click().
driver.execute_script("arguments[0].click();", buttons[0])

# The panel is looked up at two relative locations; the second is used when the
# first is missing or holds fewer than 50 characters of text.
details_area = None
details_html = None
for details_xpath in ('.//div/div/div/div[3]/div/div', './/div/div[1]/div[4]/div/div'):
    # find_elements (not find_element) so a missing first location falls
    # through to the second one instead of raising.
    matches = flight.find_elements(By.XPATH, details_xpath)
    if not matches:
        continue
    details_area = matches[0]
    details_html = BeautifulSoup(details_area.get_attribute('innerHTML'), 'html.parser')
    if len(details_html.text) >= 50:
        break

if details_html is None:
    raise NoSuchElementException(
        "No details panel found at either known location inside the flight card"
    )

In [10]:
# Display the parsed details-panel HTML captured for flights[2].
details_html

<div class="css-1dbjc4n r-1ielgck r-1ivxbpt r-1i1ao36"><div class="css-1dbjc4n r-1ielgck r-otx420 r-1i1ao36"><div class="css-1dbjc4n r-e8mqni r-1habvwh r-18u37iz r-1h0z5md r-11c0sde"><div class="css-1dbjc4n r-e8mqni r-1habvwh r-13awgt0 r-18u37iz r-1h0z5md r-88pszg"><div class="css-1dbjc4n r-13awgt0 r-13qz1uu"><div class="css-1dbjc4n r-1tuna9m r-1oszu61 r-18u37iz r-1h0z5md"><div class="css-1dbjc4n r-e8mqni r-1d09ksm r-1h0z5md r-ttb5dx"><div class="css-901oao r-t1w4ow r-1b43r93 r-majxgm r-rjixqe r-5oul0u r-fdjqy7" dir="auto" style="color: rgb(3, 18, 26);">22:55</div><div class="css-901oao r-t1w4ow r-1enofrn r-majxgm r-1cwl3u0 r-fdjqy7" dir="auto" style="color: rgb(104, 113, 118);">13 Aug</div></div><div class="css-1dbjc4n r-uia4a0 r-1awozwy r-1h0z5md r-13hce6t r-1mf7evn r-1jg9483"><div class="css-1dbjc4n r-14lw9ot r-1naam9t r-sdzlij r-rs99b7 r-tbmifm r-1s2bzr4 r-16eto9q"></div><div class="css-1dbjc4n r-p9fnmn r-13awgt0 r-92ng3h"></div></div><div class="css-1dbjc4n r-e8mqni r-1habvwh r-13

In [11]:
# Expand the fourth card returned by get_flights and capture the HTML of its
# details panel as a BeautifulSoup tree in `details_html`.
# NOTE(review): this repeats the flights[0] cell with a different index;
# consider a shared helper once the XPaths are settled.
flight = flights[3]
buttons_xpath = './/div/div/div/div/div[2]/div[1]/div'
buttons = flight.find_elements(By.XPATH, buttons_xpath)
if not buttons:
    # find_elements returns an empty list instead of raising, so report a
    # descriptive error rather than a bare IndexError from buttons[0].
    raise NoSuchElementException(
        f"No clickable element matched {buttons_xpath!r} inside the flight card"
    )
# The click is dispatched through JavaScript rather than WebElement.click().
driver.execute_script("arguments[0].click();", buttons[0])

# The panel is looked up at two relative locations; the second is used when the
# first is missing or holds fewer than 50 characters of text.
details_area = None
details_html = None
for details_xpath in ('.//div/div/div/div[3]/div/div', './/div/div[1]/div[4]/div/div'):
    # find_elements (not find_element) so a missing first location falls
    # through to the second one instead of raising.
    matches = flight.find_elements(By.XPATH, details_xpath)
    if not matches:
        continue
    details_area = matches[0]
    details_html = BeautifulSoup(details_area.get_attribute('innerHTML'), 'html.parser')
    if len(details_html.text) >= 50:
        break

if details_html is None:
    raise NoSuchElementException(
        "No details panel found at either known location inside the flight card"
    )

In [12]:
# Display the parsed details-panel HTML captured for flights[3].
details_html

<div class="css-1dbjc4n r-1ielgck r-1ivxbpt r-1i1ao36"><div class="css-1dbjc4n r-1ielgck r-otx420 r-1i1ao36"><div class="css-1dbjc4n r-e8mqni r-1habvwh r-18u37iz r-1h0z5md r-11c0sde"><div class="css-1dbjc4n r-e8mqni r-1habvwh r-13awgt0 r-18u37iz r-1h0z5md r-88pszg"><div class="css-1dbjc4n r-13awgt0 r-13qz1uu"><div class="css-1dbjc4n r-1tuna9m r-1oszu61 r-18u37iz r-1h0z5md"><div class="css-1dbjc4n r-e8mqni r-1d09ksm r-1h0z5md r-ttb5dx"><div class="css-901oao r-t1w4ow r-1b43r93 r-majxgm r-rjixqe r-5oul0u r-fdjqy7" dir="auto" style="color: rgb(3, 18, 26);">18:30</div><div class="css-901oao r-t1w4ow r-1enofrn r-majxgm r-1cwl3u0 r-fdjqy7" dir="auto" style="color: rgb(104, 113, 118);">13 Aug</div></div><div class="css-1dbjc4n r-uia4a0 r-1awozwy r-1h0z5md r-13hce6t r-1mf7evn r-1jg9483"><div class="css-1dbjc4n r-14lw9ot r-1naam9t r-sdzlij r-rs99b7 r-tbmifm r-1s2bzr4 r-16eto9q"></div><div class="css-1dbjc4n r-p9fnmn r-13awgt0 r-92ng3h"></div></div><div class="css-1dbjc4n r-e8mqni r-1habvwh r-13