In [10]:
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
from dash import State
from dash.exceptions import PreventUpdate
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
import time
import re
import plotly.express as px

In [14]:


# 創建一個 Pandas DataFrame
data = {'name': [], 'location': [], 'price': [], 'rating': [], 'distance': [], 'comment': []}

# 創建 Dash 應用
app = dash.Dash(__name__)
# 創建布局
app.layout = html.Div([
    # 輸入地點
    dcc.Input(id='location-input', type='text', value='台北', placeholder='輸入地點'),

    # 選擇日期範圍
    dcc.DatePickerRange(
        id='date-picker-range',
        start_date='2023-11-24',
        end_date='2023-11-25',
        display_format='YYYY-MM-DD'
    ),

    # 新增一個按鈕
    html.Button('執行程式', id='run-button'),

    # 顯示散點圖
    dcc.Graph(id='scatter-plot')
])

# 定義回調函數
@app.callback(
    Output('scatter-plot', 'figure'),
    [Input('run-button', 'n_clicks')],
    [State('location-input', 'value'),
     State('date-picker-range', 'start_date'),
     State('date-picker-range', 'end_date')]
)
def update_scatter_plot(n_clicks, selected_location, start_date, end_date):
    if n_clicks is None:
        raise PreventUpdate
    url = "https://www.booking.com/index.zh-tw.html"
    options = webdriver.ChromeOptions()
    service = ChromeService(executable_path=r"C:\Users\chuan\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe")
    driver = webdriver.Chrome(service=service, options=options)
    # 前往網頁
    driver.get(url)




    try:
        # 使用WebDriverWait等待元素可見
        button = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.XPATH, '//button[@aria-label="關閉登入的資訊。"]'))
        )
        
        # 點擊按鈕
        button.click()
    except Exception as e:
        print(f"沒有白癡視窗跳出來")

    # 找到搜尋位置的輸入框
    loc_query = driver.find_element(By.CLASS_NAME, "eb46370fe1")

    # 清空輸入框
    loc_query.clear()


    # 將地點輸入
    travel_loc = selected_location  # 使用選擇的地點
    loc_query.send_keys(travel_loc)

    time.sleep(2)

    element = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.XPATH, '//div[@class="a1139161bf"]'))
    )

    # 模擬按一下元素
    element.click()

    date_element_23 = WebDriverWait(driver, 60).until(
        EC.element_to_be_clickable((By.XPATH, f'//span[@data-date="{start_date}"]'))
    )
    date_element_23.click()

    date_element_24 = WebDriverWait(driver, 60).until(
        EC.element_to_be_clickable((By.XPATH, f'//span[@data-date="{end_date}"]'))
    )

    date_element_24.click()


    # 找到搜尋按鈕，模擬點擊
    submit_button = driver.find_element(By.XPATH, '//button[@type="submit" and @class="a83ed08757 c21c56c305 a4c1805887 f671049264 d2529514af c082d89982 cceeb8986b"]')
    submit_button.click()
    for i in range(4):
        # 使用WebDriverWait等待元素可見，爬取飯店名
        title_elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, '//div[@data-testid="title"]'))
        )

        # 使用WebDriverWait等待元素可見，爬取距離
        distance_elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, '//span[@data-testid="distance"]'))
        )
        # 使用WebDriverWait等待元素可見，爬取評分和評論標籤
        review_elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, '//div[@data-testid="review-score"]'))
        )

        # 使用WebDriverWait等待元素可見，爬取價格
        price_elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, '//span[@data-testid="price-and-discounted-price"]'))
        )
        # 使用WebDriverWait等待元素可見，爬取地點
        location_elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, '//a[@class="a83ed08757 f88a5204c2 a1ae279108 b98133fb50"]'))
        )


        for i in range(len(title_elements)):
            title_text = title_elements[i].text
            distance_text = distance_elements[i].text
            location_text = location_elements[i].find_element(By.CLASS_NAME, 'aee5343fdb').text
            price_text = price_elements[i].text
            # score_text = score_elements[i].find_element(By.CLASS_NAME, 'a3b8729ab1').text
            # review_label_text = score_elements[i].find_element(By.CLASS_NAME, 'a3b8729ab1').get_attribute("aria-label")
            try:
                score_text = review_elements[i].find_element(By.CLASS_NAME, 'a3b8729ab1').text
                comment_text = review_elements[i].find_element(By.CLASS_NAME, 'cb2cbb3ccb').text
            except:
                score_text = "0"
                comment_text = "無"
            

            # 將資料加入 DataFrame
            data['name'].append(title_text)
            data['location'].append(location_text)
            data['price'].append(price_text)
            data['rating'].append(score_text)
            data['distance'].append(distance_text)
            data['comment'].append(comment_text)

        try:
        # 使用 WebDriverWait 等待按鈕可見
            button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//button[@aria-label="下一頁"]'))
            )
            
            # 點擊按鈕
            button.click()

            print("換頁")
            time.sleep(2)

        except Exception as e:
            print(f"發生例外: {e}")
        # print(f"飯店名稱: {title_text}, 地點: {location_text}, 距離: {distance_text}, 分數: {score_text}, 評論標籤: {comment_text}, 價格: {price_text}")

    # 創建 DataFrame
    df = pd.DataFrame(data)

    # 存入 CSV 檔案
    df.to_csv('booking_data.csv', index=False)

    print("資料已存入 booking_data.csv")


    for i in range(0,len(df["price"])):
        df["price"][i] = re.sub(r'[^\d.]', '', df["price"][i])

    df["price"] = df["price"].astype('float')

    #rating to float + handle missing value by remove row
    df.dropna(subset=['rating'], inplace=True)
    df["rating"] = df["rating"].astype('float')

    #distance to int
    for index, row in df.iterrows():
        mult = 1
        if("公里" in row["distance"]):
                mult = 1000
        df.at[index, 'distance'] = float(re.sub(r'[^\d.]', '', row["distance"])) * mult
    df["distance"].astype("int")

    for index, row in df.iterrows():
        if ',' in row['comment']:
            df.at[index, 'comment'] = row['comment'].replace(',', '')



    #define outlier
    Q1 = df[['price', 'distance']].quantile(0.25)
    Q3 = df[['price', 'distance']].quantile(0.75)
    IQR = Q3 - Q1

    outlier_mask = ((df[['price', 'distance']] < (Q1 - 1.5 * IQR)) | (df[['price', 'distance']] > (Q3 + 1.5 * IQR))).any(axis=1)

    # Remove outliers
    df_no_outliers = df[~outlier_mask]

    # Create a scatter plot without outliers
    fig = px.scatter(
        df_no_outliers,
        x='price',
        y='distance',
        color='rating',
        title=f'Hotel Scatter Plot {selected_location} (No Outliers)',
        hover_name='name',
        hover_data={'name': False, 'rating': True, 'price': True, 'distance': True}
    )

    # Customize axis labels
    fig.update_xaxes(title_text='Price ($)')
    fig.update_yaxes(title_text='Distance from Center (m)')

    fig.show()

    return fig

# 運行應用
if __name__ == '__main__':
    app.run_server(debug=True, port=8051)


換頁
換頁
換頁
換頁
資料已存入 booking_data.csv







Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version. Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`


Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version. Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`

