In [None]:
import os
import datetime
from time import sleep

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

In [None]:
def get_url(places):
    """
    Build the Traveloka flight-search URLs for every place on every travel date.

    The travel dates run from tomorrow through 20 days from today, inclusive
    (20 dates in total), each formatted as DD-MM-YYYY.

    Args:
        places: Iterable of values inserted into the ``ap`` query parameter
            (the notebook passes strings such as "SGN.HAN").

    Returns:
        list[str]: One URL per (place, date) pair, grouped by place and
        ordered by date within each place. Empty if ``places`` is empty.
    """
    # Use plain dates on both ends of the range. If one end still carried the
    # current time of day, timedelta.days would truncate and silently drop the
    # last day of the window.
    today = datetime.date.today()
    first_day = today + datetime.timedelta(days=1)
    last_day = today + datetime.timedelta(days=20)

    dates = [
        (first_day + datetime.timedelta(days=offset)).strftime("%d-%m-%Y")
        for offset in range((last_day - first_day).days + 1)
    ]

    return [
        "https://www.traveloka.com/vi-vn/flight/fullsearch"
        f"?ap={place}&dt={day}.NA&ps=1.0.0&sc=ECONOMY"
        for place in places
        for day in dates
    ]

In [None]:
def crawl_planetrip(url_list):
    """
    Crawl flight results for each search URL using one Edge WebDriver session.

    For every URL the page is loaded and scrolled to the bottom, then each
    result's detail panel is clicked open, read, and clicked again. The rows
    scraped from a URL are written to ``planetrip_<query>.csv`` in the working
    directory, where ``<query>`` is the URL with its first 53 characters
    removed.

    Args:
        url_list: Iterable of search URLs (the notebook passes the output of
            ``get_url``).

    Returns:
        dict[str, pandas.DataFrame]: Maps each URL's trimmed query string to
        the DataFrame scraped from that URL.

    Raises:
        selenium.common.exceptions.TimeoutException: Propagated from
            ``WebDriverWait.until`` when an expected element does not appear
            within 50 seconds. The browser is still closed in that case.
    """
    columns = [
        "brand", "price", "start_time", "start_day", "end_day", "end_time",
        "trip_time", "take_place", "destination", "hand_luggage",
        "checked_baggage", "crawl_date"
    ]
    df_by_url = {}
    driver = webdriver.Edge()

    # try/finally so the browser is closed even if a wait times out mid-crawl.
    try:
        for url in url_list:
            driver.get(url)
            sleep(5)
            wait = WebDriverWait(driver, 50)

            # Scroll down in small steps so the whole result list gets loaded.
            initial_height = driver.execute_script("return document.body.scrollHeight")
            # Step at least 1px: a zero step would never reach the page bottom.
            scroll_step = max(1, initial_height // 200)
            for i in range(1000000):
                position = scroll_step * (i + 1)
                driver.execute_script(f"window.scrollTo(0, {position});")
                new_height = driver.execute_script("return document.body.scrollHeight")
                if position >= new_height:
                    break
            driver.execute_script("window.scrollTo(0, 0)")

            elements = driver.find_elements(
                By.XPATH,
                "//div[@class='css-1dbjc4n r-9nbb9w r-otx420 "
                "r-1i1ao36 r-1x4r79x']"
            )

            price_elements = wait.until(EC.visibility_of_all_elements_located((
                By.XPATH,
                "//div[@class='css-1dbjc4n r-obd0qt r-eqz5dr "
                "r-9aw3ui r-knv0ih r-ggk5by']//h3"
            )))

            brand_elements = wait.until(EC.visibility_of_all_elements_located((
                By.XPATH,
                "//div[@class='css-1dbjc4n r-1habvwh r-18u37iz r-1ssbvtb']//div"
            )))

            detail_elements = wait.until(EC.visibility_of_all_elements_located((
                By.XPATH,
                "//div[@class='css-1dbjc4n r-13awgt0 r-18u37iz "
                "r-f4gmv6 r-1777fci']"
            )))

            # One dict per flight, keyed by column name so a value can never
            # land in the wrong column; the DataFrame is built once per URL.
            records = []
            old_elements = []
            for i, detail in enumerate(detail_elements):
                if elements[i] in old_elements:
                    continue
                # Open this result's detail panel before reading its fields.
                ActionChains(driver).move_to_element(detail).click().perform()
                old_elements.append(elements[i])

                record = {
                    "brand": brand_elements[i].text,
                    "price": price_elements[i].text,
                }

                record["start_time"] = _visible_text(
                    wait,
                    "//div[contains(@class,'r-e8mqni') and contains(@class,'ttb5dx')]"
                    "//div[contains(@class,'r-5oul0u')]"
                )
                # NOTE(review): this XPath is identical to the end_day one
                # below, so both columns receive the same text. By analogy
                # with start_time it may need 'r-5oul0u' — confirm on the page.
                record["end_time"] = _visible_text(
                    wait,
                    "//div[contains(@class,'r-q3we1') and contains(@class,'ttb5dx')]"
                    "//div[contains(@class,'r-fdjqy7')]"
                )
                record["start_day"] = _visible_text(
                    wait,
                    "//div[contains(@class,'r-e8mqni') and contains(@class,'ttb5dx')]"
                    "//div[contains(@class,'r-fdjqy7')]"
                )
                record["end_day"] = _visible_text(
                    wait,
                    "//div[contains(@class,'r-q3we1') and contains(@class,'ttb5dx')]"
                    "//div[contains(@class,'r-fdjqy7')]"
                )
                record["trip_time"] = _visible_text(
                    wait,
                    "//div[contains(@class,'r-13awgt0') and contains(@class,'r-fdjqy7')]"
                )
                record["destination"] = _visible_text(
                    wait,
                    "//div[contains(@class,'r-e8mqni') and contains(@class,'r-q3we1')]"
                )
                record["take_place"] = _visible_text(
                    wait,
                    "//div[contains(@class,'r-e8mqni') and not(contains(@class,'r-q3we1'))]"
                )

                baggage_elements = wait.until(EC.presence_of_all_elements_located((
                    By.XPATH,
                    "//div[contains(@class,'r-19u6a5r')]"
                )))
                temp_hand, temp_checked = "", ""
                for baggage in baggage_elements:
                    text = baggage.text.lower()
                    # "xách tay" marks carry-on text; any other "hành lý"
                    # (baggage) text is treated as checked baggage.
                    if "xách tay" in text:
                        temp_hand = baggage.text
                    elif "hành lý" in text:
                        temp_checked = baggage.text
                record["hand_luggage"] = temp_hand
                record["checked_baggage"] = temp_checked

                # Same 4-digit-year format for every row.
                record["crawl_date"] = datetime.datetime.today().strftime("%d-%m-%Y")
                records.append(record)

                # Click the result again (presumably collapsing its panel —
                # verify) and give the page time to settle.
                detail.click()
                sleep(5)

            df = pd.DataFrame(records, columns=columns)

            # Drop the fixed 53-character URL prefix, keeping only the query part.
            new_url = url[53:]
            df.to_csv(f"planetrip_{new_url}.csv", index=False, encoding="utf-8-sig")
            df_by_url[new_url] = df
    finally:
        driver.quit()

    return df_by_url


def _visible_text(wait, xpath):
    """Return the text of the first element matching ``xpath`` once it is visible."""
    return wait.until(EC.visibility_of_element_located((By.XPATH, xpath))).text

In [None]:
# Values for the search URL's "ap" parameter, one per route to crawl
# (presumably "<origin>.<destination>" airport codes — verify).
des_list = [
    "SGN.PQC", "SGN.HAN", "SGN.DAD",
    "SGN.HPH", "SGN.DLI", "SGN.CXR"
]

# Crawl one route at a time so progress is printed per destination, and merge
# each route's frames into df_by_url so earlier routes are not overwritten.
df_by_url = {}
for des in des_list:
    print(f"Destination: {des}")
    url_list = get_url([des])
    df_by_url.update(crawl_planetrip(url_list))