In [28]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

# ----- 可調整變數 -----
base_url = "https://tabelog.com/tw/tokyo/rstLst/tempura/"  # 不含最後的 /{page}/
n = 10                                                # 要爬的頁數
headers = {                                           # 模擬瀏覽器
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}
pause_per_item = 0.3    # 每筆資料間延遲秒數 
output_file    = "tabelog_tokyo_tempura.csv"                            
# ----------------------

def fetch_with_backoff(url, headers, max_retries=5, backoff_factor=2, initial_wait=1):
    wait = initial_wait
    for attempt in range(1, max_retries + 1):
        resp = requests.get(url, headers=headers)
        if resp.status_code == 429:
            print(f"[429] 第 {attempt} 次，等待 {wait}s 後重試…")
            time.sleep(wait)
            wait *= backoff_factor
        else:
            resp.raise_for_status()
            return resp
    raise RuntimeError("重試多次後仍失敗，請稍後再試")

results = []
for page in range(1, n + 1):
    url = f"{base_url}/{page}/"
    print(f"▶ 抓取第 {page} 頁：{url}")
    resp = fetch_with_backoff(url, headers)
    soup = BeautifulSoup(resp.text, "html.parser")

    for item in soup.select(".list-rst__wrap"):
        name_tag = item.select_one(".list-rst__rst-name-target")
        name = name_tag.text.strip() if name_tag else None
        link = name_tag["href"] if name_tag and name_tag.has_attr("href") else None

        rating_tag = item.select_one(".c-rating__val")
        rating = rating_tag.text.strip() if rating_tag else None

        raw_access = item.select_one(".list-rst__area-genre")
        access_text = raw_access.text.strip() if raw_access else ""
        station, categories = (access_text.split(" / ", 1) + [None])[:2]

        lunch = dinner = None
        for icon in item.select("i.c-rating-v3__time"):
            val = icon.find_next_sibling("span", class_="c-rating-v3__val")
            if not val:
                continue
            txt = val.text.strip()
            cls = icon.get("class", [])
            if "c-rating-v3__time--lunch" in cls:
                lunch = txt
            elif "c-rating-v3__time--dinner" in cls:
                dinner = txt

        results.append({
            "店名": name,
            "平均星級評分": rating,
            "車站": station,
            "餐廳類別": categories,
            "連結": link,
            "午餐價位": lunch,
            "晚餐價位": dinner
        })

        time.sleep(pause_per_item)

# 組成 DataFrame
df = pd.DataFrame(results)


▶ 抓取第 1 頁：https://tabelog.com/tw/tokyo/rstLst/tempura//1/
▶ 抓取第 2 頁：https://tabelog.com/tw/tokyo/rstLst/tempura//2/
▶ 抓取第 3 頁：https://tabelog.com/tw/tokyo/rstLst/tempura//3/
[429] 第 1 次，等待 1s 後重試…
▶ 抓取第 4 頁：https://tabelog.com/tw/tokyo/rstLst/tempura//4/
▶ 抓取第 5 頁：https://tabelog.com/tw/tokyo/rstLst/tempura//5/
[429] 第 1 次，等待 1s 後重試…
▶ 抓取第 6 頁：https://tabelog.com/tw/tokyo/rstLst/tempura//6/
▶ 抓取第 7 頁：https://tabelog.com/tw/tokyo/rstLst/tempura//7/
[429] 第 1 次，等待 1s 後重試…
▶ 抓取第 8 頁：https://tabelog.com/tw/tokyo/rstLst/tempura//8/
▶ 抓取第 9 頁：https://tabelog.com/tw/tokyo/rstLst/tempura//9/
▶ 抓取第 10 頁：https://tabelog.com/tw/tokyo/rstLst/tempura//10/


In [29]:
# 儲存到 CSV 檔案
df.to_csv(output_file, index=False ,encoding="utf-8-sig")

In [30]:
from pathlib import Path
# 確保評分欄位是 float 型別
df["平均星級評分"] = df["平均星級評分"].astype(float)

# 建立新的過濾後 DataFrame
df_high = df[df["平均星級評分"] > 3.5]
# 自動推導高分檔名
p = Path(output_file)
high_file = p.with_name(p.stem + "_high_score" + p.suffix)

In [31]:
df_high.to_csv(high_file, index=False, encoding="utf-8-sig")