In [39]:
import os
import re
import csv
import time
import random
import requests
import pandas as pd
from bs4 import BeautifulSoup
from typing import List, Dict, Optional



In [53]:
# Crawler settings shared by the cells below.
DEFAULT_CONFIG = {
    # Site root; scraped hrefs are site-relative and get this prefix.
    "BASE_URL": "https://www.ptt.cc",
    "MAN_BOARD": "EuropeTravel",
    "ROOT_INDEX_URL": "https://www.ptt.cc/man/EuropeTravel/index.html",
    # Sent with every request made through safe_request().
    "HEADERS": {"User-Agent": "Mozilla/5.0"},
    # Text encoding forced onto every response by safe_request().
    "ENCODING": "utf-8",
    # (min, max) seconds for random_sleep() between article fetches.
    "SLEEP_TIME_RANGE": (0.1, 0.3),
    "PAGE_SLEEP_TIME_RANGE": (0.3, 0.6),
    # Root folder for cached CSV files; resolved against the working directory
    # at the moment this cell runs.
    "DATA_DIR": os.path.join(os.getcwd(), "EuropeTravel"),
}

# The scraping helpers and get_data_path() read these settings through the name
# `CONFIG`, while safe_request()/random_sleep() use `DEFAULT_CONFIG`. Bind both
# names to the same dict so a fresh kernel (Restart & Run All) does not fail
# with NameError and the two names can never drift apart.
CONFIG = DEFAULT_CONFIG

In [54]:
def get_category_pages() -> List[Dict[str, str]]:
    """Collect the category index pages linked from the EuropeTravel archive root.

    Fetches the archive's root index through ``safe_request`` and keeps every
    ``div.title a`` link whose href starts with ``/man/EuropeTravel/`` and ends
    with ``index.html``.

    Returns:
        One dict per category with keys ``分類名稱`` (link text with the
        leading ``◆``/spaces stripped) and ``分類URL`` (absolute URL).
        An empty list when the request yields a falsy result.
    """
    response = safe_request("https://www.ptt.cc/man/EuropeTravel/index.html")
    if not response:
        return []

    page = BeautifulSoup(response.text, "html.parser")
    categories = []
    for anchor in page.select("div.title a"):
        target = anchor.get("href", "")
        is_category = target.startswith("/man/EuropeTravel/") and target.endswith("index.html")
        if not is_category:
            continue
        label = anchor.text.strip("◆ ").strip()
        categories.append({"分類名稱": label, "分類URL": CONFIG["BASE_URL"] + target})
    return categories


In [55]:
def safe_write_csv(data, filename, mode='a', fieldnames=None):
    """Write one row (dict) or many rows (list of dicts) to a CSV file.

    The header row is written only when the file position is 0 after opening,
    i.e. for a new/empty file or when ``mode`` truncates it.

    Note: a later cell in this notebook defines ``safe_write_csv`` again;
    whichever cell ran last is the one callers get. This version quotes every
    field, like the later one, so both produce the same CSV layout for
    ordinary text.

    Args:
        data: A dict (single row) or a list of dicts (many rows).
        filename: Path of the CSV file to write.
        mode: File mode passed to ``open`` (``'a'`` appends, ``'w'`` overwrites).
        fieldnames: Column names. When omitted they are taken from the keys of
            the first row, instead of failing inside ``writeheader()``.

    Returns:
        True on success. False (after printing the error) on any failure,
        including the case where no column names are given and there is no
        row to infer them from; in that case the file is not opened at all,
        so an existing file is never truncated by a call that cannot succeed.
    """
    try:
        rows = data if isinstance(data, list) else [data]
        if fieldnames is None:
            if not rows:
                raise ValueError("fieldnames is required when there are no rows to infer it from")
            fieldnames = rows[0].keys()
        # Materialise once: callers pass views such as ``records[0].keys()``.
        columns = list(fieldnames)

        with open(filename, mode=mode, encoding="utf-8-sig", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=columns, quoting=csv.QUOTE_ALL)
            if f.tell() == 0:
                writer.writeheader()
            writer.writerows(rows)
        return True
    except Exception as err:
        # Best-effort by design: report and let the crawl continue.
        print(f"❌ 無法寫入 {filename}：{err}")
        return False

In [56]:
def get_country_pages(category_url: str) -> List[Dict[str, str]]:
    """List the sub-pages (e.g. one per country) linked from a category page.

    Every ``<a>`` on the page is considered; a link is kept when its href
    starts with ``/man/EuropeTravel/``, ends with ``.html`` and does not
    contain ``/M.`` (the marker the article scrapers in this notebook use).

    NOTE(review): the filter is not limited to the entry list, so the page's
    navigation links (back to the archive root) are returned as well, and the
    link text is kept verbatim including any ``◆`` prefix. Later cells select
    entries by position, so this is left as is.

    Args:
        category_url: Absolute URL of the category index page.

    Returns:
        One dict per kept link with keys ``國家名稱`` and ``國家URL``;
        an empty list when the request yields a falsy result.
    """
    response = safe_request(category_url)
    if not response:
        return []

    def is_subpage(href: str) -> bool:
        return (
            href.startswith("/man/EuropeTravel/")
            and href.endswith(".html")
            and "/M." not in href
        )

    page = BeautifulSoup(response.text, "html.parser")
    return [
        {
            "國家名稱": anchor.text.strip(),
            "國家URL": CONFIG["BASE_URL"] + anchor.get("href", ""),
        }
        for anchor in page.find_all("a")
        if is_subpage(anchor.get("href", ""))
    ]


In [57]:
def get_articles_in_country_page(country_url: str) -> List[Dict[str, str]]:
    """Collect the title and URL of every article linked from one country page.

    A link counts as an article when its href starts with
    ``/man/EuropeTravel/`` and contains ``/M.``.

    Args:
        country_url: Absolute URL of the country index page.

    Returns:
        One dict per article with keys ``文章標題`` (stripped link text) and
        ``文章URL`` (absolute URL); an empty list when the request yields a
        falsy result.
    """
    response = safe_request(country_url)
    if not response:
        return []

    page = BeautifulSoup(response.text, "html.parser")
    found = []
    for anchor in page.find_all("a"):
        link = anchor.get("href", "")
        if not link.startswith("/man/EuropeTravel/") or "/M." not in link:
            continue
        found.append({
            "文章標題": anchor.text.strip(),
            "文章URL": CONFIG["BASE_URL"] + link,
        })
    return found


In [58]:
def get_article_content(article_url: str) -> Dict[str, str]:
    """Fetch one article page and return its body text and the links inside it.

    Args:
        article_url: Absolute URL of the article page.

    Returns:
        A dict with keys ``內文`` (text of ``div#main-content``, pieces joined
        by newlines and stripped) and ``連結集`` (the hrefs found in that
        element, joined with ``|``). An empty dict when the request yields a
        falsy result or the page has no ``div#main-content``.
    """
    response = safe_request(article_url)
    if not response:
        return {}

    body = BeautifulSoup(response.text, "html.parser").find("div", id="main-content")
    if not body:
        return {}

    # Drop the direct-child <div>/<span> elements before reading the text
    # (the original comment describes these as signature / tag markup).
    # Links nested inside the removed elements are therefore not collected.
    for child in body.find_all(["div", "span"], recursive=False):
        child.extract()

    link_list = "|".join(anchor["href"] for anchor in body.find_all("a", href=True))
    return {
        "內文": body.get_text(separator="\n").strip(),
        "連結集": link_list,
    }


In [59]:
def crawl_or_load_categories() -> List[Dict[str, str]]:
    """Return the category list, using ``categories.csv`` as a cache.

    When the cache file exists it is read back with pandas; otherwise the
    categories are scraped with ``get_category_pages`` and, if any were
    found, written to the cache file.

    Returns:
        A list of row dicts (possibly empty when scraping found nothing).
    """
    cache_file = get_data_path("categories.csv")
    if not os.path.exists(cache_file):
        fetched = get_category_pages()
        if fetched:
            safe_write_csv(fetched, cache_file, mode="w", fieldnames=["分類名稱", "分類URL"])
        return fetched

    print("📄 已存在分類檔，讀取中...")
    return pd.read_csv(cache_file).to_dict(orient="records")

def crawl_or_load_countries(分類名稱: str, 分類URL: str) -> List[Dict[str, str]]:
    """Return the sub-page list of one category, cached per category as CSV.

    The cache file is ``<category>_countries.csv``, where characters that are
    not allowed in file names are replaced by ``_``.

    Args:
        分類名稱: Category name; used for the cache file name and the log line.
        分類URL: Absolute URL of the category index page.

    Returns:
        A list of row dicts, read from the cache when it exists, otherwise
        scraped with ``get_country_pages`` (and cached when non-empty).
    """
    file_stem = re.sub(r"[\\/:*?\"<>|]", "_", 分類名稱)
    cache_file = get_data_path(f"{file_stem}_countries.csv")
    if not os.path.exists(cache_file):
        fetched = get_country_pages(分類URL)
        if fetched:
            safe_write_csv(fetched, cache_file, mode="w", fieldnames=["國家名稱", "國家URL"])
        return fetched

    print(f"📄 已存在 {分類名稱} 國家檔案，讀取中...")
    return pd.read_csv(cache_file).to_dict(orient="records")


In [60]:
def crawl_or_load_articles(分類名稱: str, 國家名稱: str, 國家URL: str) -> List[Dict[str, str]]:
    """Return the article list of one country page, cached as CSV.

    The cache file is ``<category>/<country>_articles.csv`` under the data
    directory; characters not allowed in file names are replaced by ``_`` in
    both name parts.

    Args:
        分類名稱: Category name; becomes the cache sub-folder.
        國家名稱: Country name; becomes the cache file stem and appears in the log line.
        國家URL: Absolute URL of the country index page.

    Returns:
        A list of row dicts, read from the cache when it exists, otherwise
        scraped with ``get_articles_in_country_page`` (and cached when non-empty).
    """
    illegal_chars = r"[\\/:*?\"<>|]"
    folder_name = re.sub(illegal_chars, "_", 分類名稱)
    file_stem = re.sub(illegal_chars, "_", 國家名稱)
    cache_file = get_data_path(os.path.join(folder_name, f"{file_stem}_articles.csv"))

    if not os.path.exists(cache_file):
        fetched = get_articles_in_country_page(國家URL)
        if fetched:
            safe_write_csv(fetched, cache_file, mode="w", fieldnames=["文章標題", "文章URL"])
        return fetched

    print(f"📄 已存在 {國家名稱} 文章檔案，讀取中...")
    return pd.read_csv(cache_file).to_dict(orient="records")


In [61]:
def crawl_articles_and_save(分類名稱: str, 國家名稱: str, article_list: List[Dict[str, str]]):
    """Download the body of every listed article and save them as one CSV.

    The output file is ``<category>/<country>.csv`` under the data directory
    (characters not allowed in file names are replaced by ``_``) and is
    overwritten on every call. Articles whose content comes back empty are
    skipped; the pause between requests happens only after a successful fetch.

    Args:
        分類名稱: Category name; becomes the output sub-folder.
        國家名稱: Country name; becomes the output file stem and appears in the log line.
        article_list: Dicts with keys ``文章標題`` and ``文章URL``.

    Returns:
        None. Nothing is written when no article could be fetched.
    """
    illegal_chars = r"[\\/:*?\"<>|]"
    folder_name = re.sub(illegal_chars, "_", 分類名稱)
    file_stem = re.sub(illegal_chars, "_", 國家名稱)
    out_file = get_data_path(os.path.join(folder_name, f"{file_stem}.csv"))

    records = []
    for entry in article_list:
        article_url = entry["文章URL"]
        fetched = get_article_content(article_url)
        if fetched:
            records.append({
                "文章標題": entry["文章標題"],
                "文章URL": article_url,
                "內文": fetched.get("內文", ""),
                "內部連結": fetched.get("連結集", ""),
            })
            random_sleep()

    if not records:
        return
    safe_write_csv(records, out_file, mode="w", fieldnames=records[0].keys())
    print(f"✅ {國家名稱} 文章內文共儲存：{len(records)} 筆")


In [62]:
# Utility Functions
def safe_request(url, headers=None, timeout=10):
    """GET a URL and return the response, or None if the request raised.

    Args:
        url: Address to fetch.
        headers: Request headers; defaults to ``DEFAULT_CONFIG['HEADERS']``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the whole crawl; a timed-out request
            is reported and returns None like any other failure.

    Returns:
        The ``requests.Response`` with its text encoding forced to
        ``DEFAULT_CONFIG['ENCODING']``, or None when an exception occurred.
        The HTTP status is not checked here; callers in this notebook test the
        result with ``if not res`` (per the Requests docs a Response is falsy
        for 4xx/5xx status codes).
    """
    try:
        headers = headers or DEFAULT_CONFIG['HEADERS']
        res = requests.get(url, headers=headers, timeout=timeout)
        res.encoding = DEFAULT_CONFIG['ENCODING']
        return res
    except Exception as e:
        # Best-effort by design: report and let the caller skip this page.
        print(f"⚠️ Request failed for {url}: {e}")
        return None

def random_sleep(range_tuple=None):
    """Pause for a random time drawn uniformly from ``range_tuple``.

    Args:
        range_tuple: ``(low, high)`` bounds in seconds. A falsy value (None or
            an empty tuple) selects ``DEFAULT_CONFIG['SLEEP_TIME_RANGE']``.

    The drawn delay is rounded to two decimals before sleeping.
    """
    bounds = range_tuple if range_tuple else DEFAULT_CONFIG['SLEEP_TIME_RANGE']
    delay = round(random.uniform(*bounds), 2)
    time.sleep(delay)

def get_data_path(filename: str, config: Optional[dict] = None) -> str:
    """Resolve ``filename`` inside the data directory and create its parent folder.

    Args:
        filename: File name, optionally with sub-folders. An absolute path is
            returned unchanged (its parent folder is still created).
        config: Mapping with a ``"DATA_DIR"`` entry. When omitted, the
            notebook-level ``CONFIG`` is used, falling back to
            ``DEFAULT_CONFIG`` if ``CONFIG`` has not been defined. The lookup
            happens at call time: a ``config=CONFIG`` default would be
            evaluated when this cell runs and fail with NameError on a kernel
            where ``CONFIG`` is not bound yet.

    Returns:
        The resolved path. Only the parent directory is created, never the
        file itself.
    """
    if os.path.isabs(filename):
        path = filename
    else:
        if config is None:
            try:
                config = CONFIG
            except NameError:
                config = DEFAULT_CONFIG
        path = os.path.join(config["DATA_DIR"], filename)

    # A bare file name has no directory part; os.makedirs("") would raise.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    return path

def safe_write_csv(data, filename, mode='a', fieldnames=None):
    """Write one row (dict) or many rows (list of dicts) to a fully quoted CSV.

    The header row is written only when the file position is 0 after opening,
    i.e. for a new/empty file or when ``mode`` truncates it.

    Every field is quoted and embedded quotes are doubled, which already
    covers commas, quotes and newlines inside article text. No ``escapechar``
    is set: it is not needed with ``doublequote=True``, and when one is
    configured the csv writer on recent Python versions also escapes the
    escape character itself, so a backslash in the text would come back
    doubled from readers that do not set the same ``escapechar`` (this
    notebook reads the files with plain ``pd.read_csv(path)``).

    Args:
        data: A dict (single row) or a list of dicts (many rows).
        filename: Path of the CSV file to write.
        mode: File mode passed to ``open`` (``'a'`` appends, ``'w'`` overwrites).
        fieldnames: Column names. When omitted they are taken from the keys of
            the first row, instead of failing inside ``writeheader()``.

    Returns:
        True on success. False (after printing the error) on any failure,
        including the case where no column names are given and there is no
        row to infer them from; in that case the file is not opened at all,
        so an existing file is never truncated by a call that cannot succeed.
    """
    try:
        rows = data if isinstance(data, list) else [data]
        if fieldnames is None:
            if not rows:
                raise ValueError("fieldnames is required when there are no rows to infer it from")
            fieldnames = rows[0].keys()
        # Materialise once: callers pass views such as ``records[0].keys()``.
        columns = list(fieldnames)

        with open(filename, mode=mode, encoding='utf-8-sig', newline='') as f:
            writer = csv.DictWriter(
                f,
                fieldnames=columns,
                quoting=csv.QUOTE_ALL,  # quote every field
                doublequote=True,  # a quote inside a field is written as ""
            )
            if f.tell() == 0:
                writer.writeheader()
            writer.writerows(rows)
        return True
    except Exception as e:
        # Best-effort by design: report and let the crawl continue.
        print(f"❌ Error writing to {filename}: {e}")
        return False

def clean_content(text):
    """Collapse blank lines in ``text`` and trim surrounding whitespace.

    Any run made of a newline, optional whitespace and another newline is
    replaced by a single newline. Values that are not strings are returned
    unchanged.
    """
    if isinstance(text, str):
        return re.sub(r'\n\s*\n', '\n', text).strip()
    return text

In [63]:
# Step 1: load the top-level category list of the EuropeTravel archive
# (scraped on the first run, read back from categories.csv afterwards).
categories = crawl_or_load_categories()
print(f"✅ 共擷取分類：{len(categories)} 個")
pd.DataFrame(categories)


✅ 共擷取分類：15 個


Unnamed: 0,分類名稱,分類URL
0,【 網友精彩回憶錄 】,https://www.ptt.cc/man/EuropeTravel/DFA8/index...
1,【 歐洲各國旅遊資訊 】,https://www.ptt.cc/man/EuropeTravel/D507/index...
2,【 旅館大搜集 】,https://www.ptt.cc/man/EuropeTravel/D2DF/index...
3,【 留學新資訊 】,https://www.ptt.cc/man/EuropeTravel/D3AF/index...
4,【 出國事宜 】,https://www.ptt.cc/man/EuropeTravel/D4FA/index...
5,【 常見問題 】,https://www.ptt.cc/man/EuropeTravel/D561/index...
6,【 行程規劃與交通 】,https://www.ptt.cc/man/EuropeTravel/D14F/index...
7,【 抱怨與讚嘆 】,https://www.ptt.cc/man/EuropeTravel/D970/index...
8,【 可能想知道的消息 】,https://www.ptt.cc/man/EuropeTravel/DB18/index...
9,【 戀戀歐風 】,https://www.ptt.cc/man/EuropeTravel/D527/index...


In [64]:
# Step 2: pick one category by its position in `categories` and list its sub-pages.
# Index 1 is the category selected here; the names bound below are reused by Steps 3-4.
chosen_category = categories[1]  # change the index to pick another category
category_name = chosen_category["分類名稱"]
category_url = chosen_category["分類URL"]

# NOTE(review): the returned list also contains the page's navigation links
# (the first rows of the table below), so real entries do not start at index 0.
countries = crawl_or_load_countries(category_name, category_url)
print(f"✅ 分類【{category_name}】下共擷取國家：{len(countries)} 個")
pd.DataFrame(countries)


✅ 分類【【 歐洲各國旅遊資訊 】】下共擷取國家：35 個


Unnamed: 0,國家名稱,國家URL
0,精華區beta EuropeTravel,https://www.ptt.cc/man/EuropeTravel/index.html
1,返回上層,https://www.ptt.cc/man/EuropeTravel/index.html
2,精華區,https://www.ptt.cc/man/EuropeTravel/index.html
3,◆ ======== 西歐 =======,https://www.ptt.cc/man/EuropeTravel/D507/D6E3/...
4,◆ 荷蘭,https://www.ptt.cc/man/EuropeTravel/D507/D5D6/...
5,◆ 比利時,https://www.ptt.cc/man/EuropeTravel/D507/D629/...
6,◆ 英國,https://www.ptt.cc/man/EuropeTravel/D507/D5AD/...
7,◆ 奧地利,https://www.ptt.cc/man/EuropeTravel/D507/D605/...
8,◆ 瑞士,https://www.ptt.cc/man/EuropeTravel/D507/D5C1/...
9,◆ 法國,https://www.ptt.cc/man/EuropeTravel/D507/D5B6/...


In [71]:
# Step 3: pick one entry from `countries` by position (index 9 here) and list its articles.
chosen_country = countries[9]
country_name = chosen_country["國家名稱"]
country_url = chosen_country["國家URL"]

# Uses `category_name` from Step 2 to locate the per-category cache folder.
articles = crawl_or_load_articles(category_name, country_name, country_url)
print(f"✅ 國家【{country_name}】共擷取文章：{len(articles)} 篇")
pd.DataFrame(articles)


✅ 國家【◆ 法國】共擷取文章：67 篇


Unnamed: 0,文章標題,文章URL
0,◇ [重要]不要笨笨被人家畫畫 ....,https://www.ptt.cc/man/EuropeTravel/D507/D5B6/...
1,◇ 請問法國的香波(chambord)堡...,https://www.ptt.cc/man/EuropeTravel/D507/D5B6/...
2,◇ 法屬阿爾俾斯山,https://www.ptt.cc/man/EuropeTravel/D507/D5B6/...
3,◇ 冰淇淋,https://www.ptt.cc/man/EuropeTravel/D507/D5B6/...
4,◇ 史特拉斯堡,https://www.ptt.cc/man/EuropeTravel/D507/D5B6/...
...,...,...
62,◇ [心得] 巴黎 折扣季觀察心得–UNIQLO 和 ZARA,https://www.ptt.cc/man/EuropeTravel/D507/D5B6/...
63,◇ [心得] 巴黎 花神咖啡館,https://www.ptt.cc/man/EuropeTravel/D507/D5B6/...
64,◇ [心得] 巴黎 便宜舒適小酒館HOTEL VOLTAIRE,https://www.ptt.cc/man/EuropeTravel/D507/D5B6/...
65,◇ [心得] [法國]到巴黎的感動*艾菲爾鐵塔,https://www.ptt.cc/man/EuropeTravel/D507/D5B6/...


In [72]:
# Step 4: download the body of every article of the chosen country and save
# them to one CSV (relies on `category_name`, `country_name` and `articles`
# from Steps 2-3).
crawl_articles_and_save(category_name, country_name, articles)

✅ ◆ 法國 文章內文共儲存：67 筆


In [73]:
# Zero-based positions in `countries` whose articles should be crawled.
target_indices = [6, 7, 10, 21, 27]

for idx in target_indices:
    if idx < len(countries):
        country = countries[idx]
        country_name = country["國家名稱"]
        country_url = country["國家URL"]

        print(f"\n🌍 處理第 {idx} 個國家 ➜ {country_name}")

        # Same two steps as Steps 3-4, repeated for each selected entry.
        articles = crawl_or_load_articles(category_name, country_name, country_url)
        print(f"🔗 共取得文章：{len(articles)}")

        crawl_articles_and_save(category_name, country_name, articles)
    else:
        # Position past the end of the list: report it and move on.
        print(f"⚠️ 索引 {idx} 超出可用範圍（最多 {len(countries)-1}）")



🌍 處理第 6 個國家 ➜ ◆ 英國
🔗 共取得文章：94
✅ ◆ 英國 文章內文共儲存：94 筆

🌍 處理第 7 個國家 ➜ ◆ 奧地利
🔗 共取得文章：22
✅ ◆ 奧地利 文章內文共儲存：22 筆

🌍 處理第 10 個國家 ➜ ◆ 德國
🔗 共取得文章：87
✅ ◆ 德國 文章內文共儲存：87 筆

🌍 處理第 21 個國家 ➜ ◆ 義大利
🔗 共取得文章：55
✅ ◆ 義大利 文章內文共儲存：55 筆

🌍 處理第 27 個國家 ➜ ◆ 匈牙利
🔗 共取得文章：4
✅ ◆ 匈牙利 文章內文共儲存：4 筆


In [75]:
import pandas as pd
import os
import re

# Folder holding the per-country article CSVs, and the merged output file.
# Both are relative to the current working directory.
folder = "EuropeTravel/【 歐洲各國旅遊資訊 】"
output_path = "EuropeTravel/整合文章_含國家標註.csv"

# Collect every .csv in the folder except the *_articles.csv link lists.
# The listing is sorted because os.listdir() returns names in arbitrary order,
# which would make the row order of the merged file differ between runs.
# A missing folder yields an empty list so the "nothing to merge" branch below
# is reached instead of an unhandled FileNotFoundError.
if os.path.isdir(folder):
    all_files = sorted(
        f for f in os.listdir(folder)
        if f.endswith(".csv") and not f.endswith("_articles.csv")
    )
else:
    all_files = []

combined_data = []

# Tag each file's rows with a "國家" column, then queue the frame for merging.
for file in all_files:
    path = os.path.join(folder, file)
    try:
        df = pd.read_csv(path)
        # The country label is the file name without its extension.
        country = re.sub(r"\.csv$", "", file)
        df["國家"] = country
        combined_data.append(df)
    except Exception as e:
        # Best-effort: report the unreadable file and keep merging the rest.
        print(f"❌ 無法處理檔案：{file}，錯誤：{e}")

# Concatenate all frames and write the merged CSV.
if combined_data:
    combined_df = pd.concat(combined_data, ignore_index=True)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    combined_df.to_csv(output_path, index=False, encoding="utf-8-sig")
    print(f"✅ 合併完成，總共 {len(combined_df)} 筆資料。儲存至：{output_path}")
else:
    print("⚠️ 找不到可合併的檔案。")


✅ 合併完成，總共 370 筆資料。儲存至：EuropeTravel/整合文章_含國家標註.csv


✅ 分類【【    出國事宜      】】下共擷取文章：6 個


Unnamed: 0,國家名稱,國家URL
0,精華區beta EuropeTravel,https://www.ptt.cc/man/EuropeTravel/index.html
1,返回上層,https://www.ptt.cc/man/EuropeTravel/index.html
2,精華區,https://www.ptt.cc/man/EuropeTravel/index.html
3,◆ 要準備的東西,https://www.ptt.cc/man/EuropeTravel/D4FA/D163/...
4,◆ 護照辦理,https://www.ptt.cc/man/EuropeTravel/D4FA/DE23/...
5,◆ 去荷比法三國 買什麼東西 最划算,https://www.ptt.cc/man/EuropeTravel/D4FA/D4DE/...


In [77]:
def crawl_flat_category(分類名稱: str, 分類URL: str):
    """Crawl a category whose index page links to articles directly.

    Every link on the page whose href starts with ``/man/`` and contains
    ``/M.`` is treated as an article. Each article body is downloaded and all
    of them are saved to ``<category>.csv`` in the data directory (characters
    not allowed in file names are replaced by ``_``); the file is overwritten
    on every call. A pause follows every article request, successful or not.

    Args:
        分類名稱: Category name; used for the output file name, the ``分類``
            column and the log lines.
        分類URL: Absolute URL of the category index page.

    Returns:
        None. Progress and the outcome are printed.
    """
    import re

    # File-system-safe output name
    safe_cat = re.sub(r"[\\/:*?\"<>|]", "_", 分類名稱)
    output_path = get_data_path(f"{safe_cat}.csv")

    print(f"🔍 開始爬分類：{分類名稱}")

    response = safe_request(分類URL)
    if not response:
        print("❌ 請求失敗")
        return

    page = BeautifulSoup(response.text, "html.parser")
    articles = []
    for anchor in page.find_all("a"):
        link = anchor.get("href", "")
        if not link.startswith("/man/") or "/M." not in link:
            continue
        articles.append({"標題": anchor.text.strip(), "URL": CONFIG["BASE_URL"] + link})

    print(f"🔗 共取得 {len(articles)} 篇文章")

    # Fetch the body of each article
    records = []
    for entry in articles:
        fetched = get_article_content(entry["URL"])
        if fetched:
            records.append({
                "分類": 分類名稱,
                "標題": entry["標題"],
                "URL": entry["URL"],
                "內文": fetched.get("內文", ""),
                "內部連結": fetched.get("連結集", ""),
            })
        random_sleep()

    if not records:
        print("⚠️ 沒有資料可以儲存")
        return

    safe_write_csv(records, output_path, mode="w", fieldnames=records[0].keys())
    print(f"✅ 已儲存 {len(records)} 筆文章 ➜ {output_path}")


In [81]:
# Crawl the category at index 6 of `categories` as a flat list of articles.
# NOTE: this rebinds `chosen_category` only; `category_name` / `category_url`
# still hold the values assigned in Step 2.
chosen_category = categories[6]
crawl_flat_category(chosen_category["分類名稱"], chosen_category["分類URL"])

🔍 開始爬分類：【  行程規劃與交通  】
🔗 共取得 39 篇文章
✅ 已儲存 39 筆文章 ➜ C:\Users\benin\RAG_learning-project\EuropeTravel\【  行程規劃與交通  】.csv


{'分類名稱': '【    出國事宜      】',
 '分類URL': 'https://www.ptt.cc/man/EuropeTravel/D4FA/index.html'}