In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re



In [4]:
import requests, pandas as pd, re, os, time, random
from bs4 import BeautifulSoup

def _parse_ranking_item(item, region_name: str):
    """Convert one ``li.ranking`` element into a row dict.

    Args:
        item: A BeautifulSoup tag selected by ``li.ranking``.
        region_name: Value stored in the row's "地區" column.

    Returns:
        A dict with the row's columns, or ``None`` when any of the required
        tags (rank, heading, brand info, score, name span, detail link) is
        missing so the caller can skip the entry.
    """
    rank_tag = item.select_one("p.rank")
    h2_tag = item.select_one("h2")
    brand_info_tag = item.select_one("p.brand_info")
    score_tag = item.select_one("p.brand_point span.point")
    if any(tag is None for tag in (rank_tag, h2_tag, brand_info_tag, score_tag)):
        return None

    name_tag = h2_tag.select_one("span")
    link_tag = h2_tag.select_one("a")
    if name_tag is None or link_tag is None:
        return None
    href = link_tag.get("href")
    if href is None:
        return None

    heading_text = h2_tag.text
    # The reading is whatever follows the LAST full-width opening parenthesis
    # in the heading, with full-width closing parentheses removed.
    if "（" in heading_text:
        kana = heading_text.split("（")[-1].replace("）", "").strip()
    else:
        kana = ""

    # "region | brewery": split on the first "|" only, so a brewery name that
    # itself contains "|" no longer causes the whole row to be dropped.
    region, _, brewery = brand_info_tag.text.strip().partition("|")

    price_tag = item.select_one("p.brand_price")
    price_text = price_tag.text if price_tag is not None else ""
    match = re.search(r"¥[\s]*([\d,]+)", price_text)

    return {
        "地區": region_name,
        "排名": rank_tag.text.strip().replace("位", ""),
        "名稱": name_tag.text.strip(),
        "假名": kana,
        "得分": score_tag.text.strip().replace("点", ""),
        "酒造地區": region.strip(),
        "酒造名稱": brewery.strip(),
        "價格": match.group(1).replace(",", "") if match else "",
        "詳細頁面連結": "https://www.saketime.jp" + href,
    }


def crawl_saketime_region(region_name: str, url: str) -> "pd.DataFrame | None":
    """Download one ranking page and return its entries as a DataFrame.

    Args:
        region_name: Label written to the "地區" column of every row.
        url: Address of the ranking page to fetch.

    Returns:
        ``None`` when the HTTP request fails (connection error, timeout or a
        non-success status); otherwise a DataFrame with one row per parseable
        ``li.ranking`` element. The DataFrame is empty when no element could
        be parsed.
    """
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        res = requests.get(url, headers=headers, timeout=15)
        res.raise_for_status()
    except requests.RequestException as exc:
        # Catch only request errors: a bare ``except`` would also swallow
        # KeyboardInterrupt and make the crawl impossible to stop cleanly.
        print(f"⚠️ 請求失敗：{region_name}（{exc}）")
        return None  # None signals a failed fetch to the caller

    soup = BeautifulSoup(res.text, "html.parser")
    parsed = (_parse_ranking_item(item, region_name) for item in soup.select("li.ranking"))
    data = [row for row in parsed if row is not None]

    return pd.DataFrame(data)


# Prefecture slugs to crawl, one ranking page each.
# NOTE(review): this list was described as the complete 47 prefectures, but
# only 29 slugs are present — confirm whether the remaining ones are crawled
# elsewhere.
areas = [
    "niigata",
    "yamanashi",
    "nagano",
    "ishikawa",
    "toyama",
    "fukui",
    "osaka",
    "hyogo",
    "kyoto",
    "shiga",
    "nara",
    "wakayama",
    "okayama",
    "hiroshima",
    "tottori",
    "shimane",
    "yamaguchi",
    "kagawa",
    "tokushima",
    "ehime",
    "kochi",
    "fukuoka",
    "saga",
    "nagasaki",
    "kumamoto",
    "oita",
    "miyazaki",
    "kagoshima",
    "okinawa",
]

base_url = "https://www.saketime.jp/ranking/{}/"
failed = []   # slugs whose crawl returned None or an empty frame
all_dfs = []  # per-region frames, either freshly crawled or reloaded from disk

for area in areas:
    csv_name = f"saketime_{area}.csv"

    if not os.path.exists(csv_name):
        print(f"📍 抓取：{area}")
        df = crawl_saketime_region(area, base_url.format(area))

        if df is None or df.empty:
            failed.append(area)
            print(f"❌ 失敗：{area}")
        else:
            df.to_csv(csv_name, index=False, encoding="utf-8-sig")
            all_dfs.append(df)
            print(f"✅ 儲存：{csv_name}（{len(df)} 筆）")

        # Random pause between live requests; cached regions skip it.
        time.sleep(random.uniform(1.5, 3.5))
    else:
        # A per-region CSV already exists: reuse it instead of re-fetching.
        print(f"✅ 已存在，略過：{csv_name}")
        df = pd.read_csv(csv_name)
        all_dfs.append(df)

# Merge every region that was crawled or reloaded into a single CSV.
if len(all_dfs) > 0:
    final_df = pd.concat(all_dfs, ignore_index=True)
    final_df.to_csv("saketime_all_regions.csv", index=False, encoding="utf-8-sig")
    print(f"\n📊 合併完成！總共 {len(final_df)} 筆，儲存為 saketime_all_regions.csv")

# Report the regions that produced no data so they can be retried later.
if len(failed) > 0:
    print(f"\n⚠️ 以下地區抓取失敗（可稍後重試）：{failed}")


📍 抓取：niigata
✅ 儲存：saketime_niigata.csv（22 筆）
📍 抓取：yamanashi
✅ 儲存：saketime_yamanashi.csv（20 筆）
📍 抓取：nagano
✅ 儲存：saketime_nagano.csv（21 筆）
📍 抓取：ishikawa
✅ 儲存：saketime_ishikawa.csv（20 筆）
📍 抓取：toyama
✅ 儲存：saketime_toyama.csv（21 筆）
📍 抓取：fukui
✅ 儲存：saketime_fukui.csv（20 筆）
📍 抓取：osaka
✅ 儲存：saketime_osaka.csv（20 筆）
📍 抓取：hyogo
✅ 儲存：saketime_hyogo.csv（20 筆）
📍 抓取：kyoto
✅ 儲存：saketime_kyoto.csv（20 筆）
📍 抓取：shiga
✅ 儲存：saketime_shiga.csv（20 筆）
📍 抓取：nara
✅ 儲存：saketime_nara.csv（21 筆）
📍 抓取：wakayama
✅ 儲存：saketime_wakayama.csv（20 筆）
📍 抓取：okayama
✅ 儲存：saketime_okayama.csv（20 筆）
📍 抓取：hiroshima
✅ 儲存：saketime_hiroshima.csv（20 筆）
📍 抓取：tottori
✅ 儲存：saketime_tottori.csv（20 筆）
📍 抓取：shimane
✅ 儲存：saketime_shimane.csv（20 筆）
📍 抓取：yamaguchi
✅ 儲存：saketime_yamaguchi.csv（20 筆）
📍 抓取：kagawa
✅ 儲存：saketime_kagawa.csv（20 筆）
📍 抓取：tokushima
✅ 儲存：saketime_tokushima.csv（23 筆）
📍 抓取：ehime
✅ 儲存：saketime_ehime.csv（23 筆）
📍 抓取：kochi
✅ 儲存：saketime_kochi.csv（20 筆）
📍 抓取：fukuoka
✅ 儲存：saketime_fukuoka.csv（20 筆）
📍 抓取：saga
✅ 儲存：saketime_saga.c

In [5]:
# Load the merged ranking CSV from disk. The previous `df_all = []`
# placeholder was overwritten immediately and has been dropped.
df_all = pd.read_csv("saketime_all_regions.csv")

In [6]:
# Preview the first rows of the frame loaded from saketime_all_regions.csv.
df_all.head()

Unnamed: 0,地區,排名,名稱,假名,得分,酒造地區,酒造名稱,價格,詳細頁面連結
0,niigata,1,あべ,あべ,4.35,新潟,阿部酒造,2145.0,https://www.saketime.jp/brands/4250/
1,niigata,2,荷札酒,にふだざけ,4.31,新潟,加茂錦酒造,1980.0,https://www.saketime.jp/brands/2450/
2,niigata,3,ゆきのまゆ,旧：醸す森,4.2,新潟,苗場酒造,1000.0,https://www.saketime.jp/brands/5909/
3,niigata,4,高千代,たかちよ,4.18,新潟,高千代酒造,1420.0,https://www.saketime.jp/brands/793/
4,niigata,5,雅楽代,うたしろ,4.16,新潟,天領盃酒造,1650.0,https://www.saketime.jp/brands/5496/
