# In [1]:
import json
import os
import time
from collections import deque
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

def get_page_content(url, retries=5, delay=4):
    """Download ``url`` and return its body as text, or ``None`` on failure.

    The page is requested with a 10-second timeout. Any
    ``requests.RequestException`` (including the HTTP error raised by
    ``raise_for_status``) is printed and the request is attempted again,
    up to ``retries`` attempts in total, sleeping ``delay`` seconds between
    consecutive attempts.

    Args:
        url: Address of the page to fetch.
        retries: Maximum number of attempts.
        delay: Seconds to wait before the next attempt.

    Returns:
        The response text, or ``None`` when every attempt failed (or when
        ``retries`` allows no attempt at all).
    """
    # Count down so the loop variable is the number of attempts still left.
    for attempts_left in reversed(range(retries)):
        try:
            reply = requests.get(url, timeout=10)
            # Turn 4xx/5xx responses into exceptions so they are retried too.
            reply.raise_for_status()
            # Decode with the encoding detected from the body itself.
            reply.encoding = reply.apparent_encoding
            return reply.text
        except requests.RequestException as err:
            print(f"Error fetching {url}: {err}")
            if attempts_left:
                print(f"Retrying {url} in {delay} seconds...")
                time.sleep(delay)
    return None

def extract_text(html_content):
    """Return the visible text of an HTML document as a single string.

    ``<script>`` and ``<style>`` elements are removed from the parsed tree
    before the text is collected. Text fragments are stripped of surrounding
    whitespace and joined with single spaces.

    Args:
        html_content: HTML markup to parse.

    Returns:
        The extracted text.
    """
    document = BeautifulSoup(html_content, 'html.parser')
    # Drop script and style elements so their contents are not extracted.
    for unwanted in document.find_all(['script', 'style']):
        unwanted.decompose()
    return document.get_text(separator=' ', strip=True)

def get_parent_url(url):
    """Return the URL one path segment above ``url``, or ``"Root"``.

    The path component of ``url`` is cut at its last ``/``. When nothing is
    left in front of that slash (for example ``https://host``,
    ``https://host/`` or ``https://host/page``) the literal string ``"Root"``
    is returned. Otherwise the shortened path is resolved against ``url``,
    which discards the original query string.

    Args:
        url: The page URL.

    Returns:
        The parent URL, or ``"Root"`` for top-level pages.
    """
    parent_path, *_ = urlparse(url).path.rsplit('/', 1)
    if not parent_path:
        return "Root"
    return urljoin(url, parent_path)

def find_all_links(soup, base_url):
    """Collect the targets of every ``<a href=...>`` element as absolute URLs.

    Args:
        soup: Parsed document exposing ``find_all``.
        base_url: URL that relative ``href`` values are resolved against.

    Returns:
        A list of URLs in document order. Duplicates are kept.
    """
    return [
        urljoin(base_url, anchor['href'])
        for anchor in soup.find_all('a', href=True)
    ]

def crawl_page(url, depth=0, max_depth=2, result=None,
               output_path='Result/ESunCardOnlineFavorite.20240927.json'):
    """Crawl ``url`` and the pages it links to, up to ``max_depth``.

    Pages are visited breadth-first, so every page is reached at the
    smallest depth at which it is linked. Each URL is fetched at most once
    per call: links that differ only by their ``#fragment`` are treated as
    the same page, and links whose scheme is not ``http``/``https`` (for
    example ``mailto:`` or ``javascript:``) are skipped instead of being
    sent through the retrying downloader.

    After every successfully fetched page the whole ``result`` list is
    rewritten to ``output_path`` so that an interrupted crawl still leaves
    a usable file behind.

    Args:
        url: Page at which crawling starts.
        depth: Depth assigned to ``url``; linked pages get ``depth + 1``.
        max_depth: Pages deeper than this are not fetched.
        result: List that page records are appended to. A new list is
            created when omitted.
        output_path: JSON file that receives the accumulated records. Its
            parent directory is created when missing.

    Returns:
        ``result``, containing one dict per fetched page with the keys
        ``url``, ``parent_url``, ``label``, ``text_content`` and ``links``.
    """
    if result is None:
        result = []

    if depth > max_depth:
        return result

    # Make sure the target directory exists before the first write.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # URLs already queued or fetched, stored without their fragment.
    seen = {urldefrag(url).url}
    pending = deque([(url, depth)])

    while pending:
        page_url, page_depth = pending.popleft()

        html_content = get_page_content(page_url)
        if not html_content:
            continue

        text_content = extract_text(html_content)
        # NOTE(review): "text_content" and "links" are stored empty, exactly
        # as before; confirm whether the extracted values should be saved.
        page_data = {
            "url": page_url,
            "parent_url": get_parent_url(page_url),
            "label": text_content[0:35] + "...",
            "text_content": "",
            "links": []
        }
        result.append(page_data)

        # Write the results immediately so partial progress survives an
        # interruption.
        with open(output_path, 'w', encoding='utf-8') as file:
            json.dump(result, file, ensure_ascii=False, indent=4)

        # Links of pages at the depth limit would be too deep to fetch.
        if page_depth >= max_depth:
            continue

        soup = BeautifulSoup(html_content, 'html.parser')
        for link in find_all_links(soup, page_url):
            link = urldefrag(link).url
            if link in seen:
                continue
            if urlparse(link).scheme not in ('http', 'https'):
                continue
            seen.add(link)
            pending.append((link, page_depth + 1))

    return result

def main(url, max_depth=2):
    """Crawl ``url`` down to ``max_depth`` and report where the data went.

    Args:
        url: Page at which crawling starts.
        max_depth: Maximum link depth passed on to ``crawl_page``.
    """
    result = crawl_page(url, max_depth=max_depth)
    if result:
        # Name the file crawl_page writes by default; the previous message
        # pointed at "page_content.json", which is never created.
        print("Content saved to Result/ESunCardOnlineFavorite.20240927.json")

# Example usage
if __name__ == "__main__":
    # Alternative start URL, currently disabled:
    # start_url = "https://www.esunbank.com/zh-tw/personal/credit-card/discount/shops/all?category=onlineshop"
    start_url = "https://www.esunbank.com/zh-tw/about/faq/content?q=credit_card/001"
    main(url=start_url, max_depth=2)


# KeyboardInterrupt: