# In [16]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import time
from datetime import timedelta, datetime
import csv
import re

# Set up the Selenium Chrome driver.
options = Options()
options.add_argument("--headless")  # Remove this line to show the browser window.
service = Service("/opt/homebrew/bin/chromedriver")  # Path to the ChromeDriver binary (machine-specific).
driver = webdriver.Chrome(service=service, options=options)

# Open the Google Travel flight booking page for one specific itinerary.
url = "https://www.google.com/travel/flights/booking?tfs=CBwQAhpnEgoyMDI0LTEyLTIyIh8KA1RQRRIKMjAyNC0xMi0yMhoDQktLKgJURzIDNjM3Ih8KA0JLSxIKMjAyNC0xMi0yMhoDU1lEKgJURzIDNDcxKAFqDAgCEggvbS8wZnRreHIHCAESA1NZREABSAFwAYIBCwj___________8BmAEC&tfu=CnRDalJJTmxKTldsRk1iRFJCYmpSQlNFeHZSbmRDUnkwdExTMHRMUzB0ZEdKaWJIY3lNMEZCUVVGQlIyTlVkM05yUVdWa1QwRkJFZ3RVUnpZek4zeFVSelEzTVJvTENQcXRBUkFBR2dOVVYwUTRISERTbmdRPRICCAUiAA&authuser=0"
driver.get(url)

# Configure a 10-second implicit wait. NOTE: this call does not pause here;
# it makes every later element lookup poll for up to 10 seconds before it
# gives up, which is what lets the page finish rendering.
driver.implicitly_wait(10)

def _first_attribute(xpath, attribute):
    """Return `attribute` of the first element matching `xpath`, or None.

    None is returned both when no element matches and when the matched
    element does not carry the attribute, so callers need a single
    missing-value check instead of catching several exception types.
    """
    try:
        return driver.find_element(By.XPATH, xpath).get_attribute(attribute)
    except NoSuchElementException:
        return None


# Departure date: the first 9 characters of the date label's markup.
departure_date_element = _first_attribute("//span[contains(@class, 'mv1WYe')]", "innerHTML")
if departure_date_element is None:
    departure_date = "未找到出發日期"
else:
    departure_date = departure_date_element[:9].strip()

# Departure time: the part of the aria-label after the last full-width colon.
departure_time_element = _first_attribute("//div[@class='wtdjmc YMlIz ogfYpf tPgKwe']", "aria-label")
if departure_time_element is None:
    departure_time = "未找到出發時間"
else:
    departure_time = departure_time_element.split("：")[-1].strip()

# Arrival time: same label format as the departure time.
arrival_time_element = _first_attribute("//div[@class='XWcVob YMlIz ogfYpf tPgKwe']", "aria-label")
if arrival_time_element is None:
    arrival_time = "未找到抵達時間"
else:
    arrival_time = arrival_time_element.split("：")[-1].strip()

# Departure and arrival airport codes: the first two matches, with the
# surrounding parentheses removed. Both fall back together when fewer than
# two codes are present.
airport_elements = driver.find_elements(By.XPATH, "//span[contains(@class, 'qeoz6e HKHSfd')]/following-sibling::span[@dir='ltr']")
if len(airport_elements) >= 2:
    departure_airport, arrival_airport = (
        (element.get_attribute("innerHTML") or "").strip("()")
        for element in airport_elements[:2]
    )
else:
    departure_airport, arrival_airport = "未找到出發機場", "未找到抵達機場"

# Airline name (visible text, so the attribute helper does not apply).
try:
    airline = driver.find_element(By.XPATH, "//div[contains(@class, 'sSHqwe')]/span[1]").text
except NoSuchElementException:
    airline = "未找到航空公司"

# Total travel time: prefer the "H 小時 M 分鐘" part; otherwise keep the raw
# text so no information is lost.
travel_time_element = _first_attribute("//div[@class='gvkrdb AdWm1c tPgKwe ogfYpf']", "innerHTML")
if travel_time_element is None:
    flight_duration = "未找到行程時間"
else:
    match = re.search(r'(\d+ 小時 \d+ 分鐘)', travel_time_element)
    flight_duration = match.group(1) if match else travel_time_element.strip()

# Number of stops: the text before " flight." in the aria-label.
# NOTE(review): this split assumes an English aria-label (e.g. "1 stop flight.")
# while the other fields parse Chinese text; confirm against the served page.
layover_element = _first_attribute("//div[@class='EfT7Ae AdWm1c tPgKwe']//span[@class='ogfYpf']", "aria-label")
if layover_element is None:
    layover = "Non-stop"
else:
    layover = layover_element.split(" flight.")[0]

# Layover duration, only looked up when the itinerary is not non-stop.
if layover != "Non-stop":
    time_pattern = r'(\d+\s*小時\s*\d+\s*分鐘|\d+\s*分鐘)'
    layover_info_element = _first_attribute('//div[@class = "tvtJdb eoY5cb y52p7d"]', "innerHTML")
    match = re.search(time_pattern, layover_info_element) if layover_info_element is not None else None
    layover_time = match.group(1) if match else "未找到停留時間"
else:
    layover_time = "Non-stop"

# Overnight flag: "Yes" when an element whose text contains "Overnight" exists.
overnight = "Yes" if driver.find_elements(By.XPATH, '//div[@class="qj0iCb" and contains(text(), "Overnight")]') else "No"

# Aircraft type: markup of the last "Xsgmwe" span in the details row.
aircraft = _first_attribute('//div[@class="MX5RWe sSHqwe y52p7d"]/span[@class = "Xsgmwe"][last()]', "innerHTML")
if aircraft is None:
    aircraft = "未找到機型"

# Flight number: second "Xsgmwe" span, with HTML non-breaking spaces
# converted to plain spaces and the ends trimmed.
flight_number_element = _first_attribute('//div[@class="MX5RWe sSHqwe y52p7d"]/span[contains(@class, "Xsgmwe")][2]', "innerHTML")
if flight_number_element is None:
    flight_number = "未找到航班代碼"
else:
    flight_number = flight_number_element.replace('&nbsp;', ' ').strip()

# Cabin class.
cabin_class = _first_attribute('//span[contains(@class, "Xsgmwe")]/div', "innerHTML")
if cabin_class is None:
    cabin_class = "未找到艙等"
    
# Reference date used to turn "N days ago" offsets into calendar dates.
today = datetime.today()


def replace_days_ago_with_date(price_history_text):
    """Convert "N 天前 - $price" entries into "MM/DD - $price" entries.

    Args:
        price_history_text: Text containing zero or more entries shaped like
            "60 天前 - $1,234". May be None or empty, e.g. when the source
            element has no aria-label.

    Returns:
        The converted entries joined with ", ". Each date is the module-level
        `today` minus N days, formatted as month/day. Returns "" when the
        input is None, empty, or contains no matching entry.
    """
    # A missing attribute arrives as None; re.findall would raise on it.
    if not price_history_text:
        return ""

    # Matches entries such as "60 天前 - $1,234".
    pattern = r"(\d+)\s*天前\s*-\s*\$([\d,]+)"
    return ", ".join(
        f"{(today - timedelta(days=int(days_ago))).strftime('%m/%d')} - ${price}"
        for days_ago, price in re.findall(pattern, price_history_text)
    )
 
# Collect the price history from the price-graph data points.
# find_elements() returns an empty list instead of raising when nothing
# matches, so the "not found" fallback is chosen by checking for emptiness
# rather than by catching NoSuchElementException.
elements = driver.find_elements(By.XPATH, "//*[name()='g' and @class='ke9kZe-LkdAo-RbRzK-JNdkSc pKrx3d']")

# A point without an aria-label yields None, which cannot be parsed; skip it.
price_history = [
    label
    for label in (element.get_attribute("aria-label") for element in elements)
    if label is not None
]
price_history_with_dates = [replace_days_ago_with_date(ph) for ph in price_history]

if not price_history_with_dates:
    # Keep this a list: the CSV step joins it with ", ", and joining a bare
    # string would interleave separators between its characters.
    price_history_with_dates = ["未找到價格歷史"]

# Append this flight's row to the CSV file, then close the browser.
# The try/finally guarantees the browser process is shut down even when
# opening or writing the file fails.
try:
    with open('/Users/yuchingchen/Documents/專題/data/data/google_flights_data_syd.csv', 'a', newline='', encoding='utf-8-sig') as csv_file:
        csv_writer = csv.writer(csv_file)

        # The file is opened in append mode, so the stream starts at the end
        # of the file: position 0 means the file is new or empty. Write the
        # header only then, so repeated runs do not insert duplicate headers.
        if csv_file.tell() == 0:
            csv_writer.writerow([
                "出發日期", "出發時間", "出發機場代號",
                "抵達時間", "抵達機場代號", "航空公司",
                "停靠站數量", "停留時間", "飛行時間",
                "是否過夜", "機型", "航班代碼", "艙等", "價格歷史"
            ])

        # The price history is normally a list of strings; if it is a single
        # fallback string, write it as-is instead of joining its characters.
        if isinstance(price_history_with_dates, str):
            price_history_text = price_history_with_dates
        else:
            price_history_text = ', '.join(price_history_with_dates)

        # Write the data row.
        csv_writer.writerow([
            departure_date, departure_time, departure_airport,
            arrival_time, arrival_airport, airline,
            layover, layover_time, flight_duration,
            overnight, aircraft, flight_number, cabin_class,
            price_history_text
        ])
finally:
    # Close the browser.
    driver.quit()