In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json

# Allow-list of URL fragments: a page is crawled only if its address contains
# at least one of these strings (crawl_website checks with `part in url`).
# NOTE(review): the second entry starts with the first, so under that
# substring check it is already covered by the first entry.
# Earlier allow-list for the ubot.com.tw sites, kept for reference:
# allowed_url_parts  = ['https://web.ubot.com.tw/UB/2023stocksandETF/','https://www.ubot.com.tw/','https://card.ubot.com.tw/', 'https://cardweb.ubot.com.tw/', 'https://mybank.ubot.com.tw/']
allowed_url_parts  = ['https://www.esunbank.com/zh-tw/personal/credit-card/discount/shops','https://www.esunbank.com/zh-tw/personal/credit-card/discount/shops/all?category=onlineshop'] #,'https://www.esunbank.com/']

def get_page_content(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            server would hang the whole crawl.

    Returns:
        The decoded response body, or ``None`` when the request fails, times
        out, or the server answers with an error status (the error is
        printed, not raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into an HTTPError so they are handled below.
        response.raise_for_status()
        # Decode using the encoding detected from the body rather than the
        # one guessed from the headers.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException as e:
        print(f"Error during request to {url}: {e}")
        return None

def parse_page_content(content, url, parent_info=None):
    """Build a metadata record for one downloaded HTML page.

    Args:
        content: HTML text of the page; a falsy value means nothing to parse.
        url: Address the content was downloaded from.
        parent_info: Record of the page that linked here, or ``None`` for the
            starting page. Only its ``'url'`` entry is read.

    Returns:
        A dict with the keys ``url``, ``title``, ``description``,
        ``parent_url`` and ``keyword``, or ``None`` when *content* is falsy.
        Missing pieces are replaced by ``'No ...'`` placeholder strings.
    """
    # Guard clause: nothing to parse.
    if not content:
        return None

    document = BeautifulSoup(content, 'html.parser')

    def meta_content(meta_name, fallback):
        # Value of <meta name=meta_name content=...>, or the fallback when
        # the tag is absent.
        tag = document.find('meta', attrs={'name': meta_name})
        if tag:
            return tag.get('content')
        return fallback

    if document.title:
        page_title = document.title.text
    else:
        page_title = 'No Title'

    # Only the parent's URL is kept, not the whole parent record.
    if parent_info:
        linked_from = parent_info['url']
    else:
        linked_from = 'No Parent URL'

    return {
        'url': url,
        'title': page_title,
        'description': meta_content('description', 'No Description'),
        'parent_url': linked_from,
        'keyword': meta_content('keywords', 'No Keyword'),
    }

def _is_allowed_url(url):
    """Return True when *url* contains any entry of ``allowed_url_parts``."""
    return any(part in url for part in allowed_url_parts)


def crawl_website(url, visited_urls=None, parent_info=None):
    """Crawl pages reachable from *url* and record each page's metadata.

    Starting at *url*, every allowed page is downloaded, summarised with
    ``parse_page_content``, appended to the output file via
    ``record_to_json``, and its ``<a href>`` links are followed depth-first
    in document order.

    Args:
        url: Page to start from. Ignored unless it contains one of the
            ``allowed_url_parts``.
        visited_urls: Set of URLs already handled; it is updated in place.
            Defaults to a fresh set per call (a ``set()`` default would be
            shared between unrelated calls).
        parent_info: Record of the page that linked to *url*, or ``None``
            for the starting page.

    Returns:
        None. Results are written by ``record_to_json``.
    """
    if visited_urls is None:
        visited_urls = set()
    if not _is_allowed_url(url):
        return

    # Explicit stack instead of recursion: one Python frame per page would
    # hit the interpreter's recursion limit on a large site. Entries are
    # (url, record of the page that linked to it).
    pending = [(url, parent_info)]
    while pending:
        current_url, current_parent = pending.pop()
        if current_url in visited_urls:
            continue
        # Mark before fetching so a URL that fails to download is not
        # requested again each time another page links to it.
        visited_urls.add(current_url)

        content = get_page_content(current_url)
        if not content:
            continue

        page_info = parse_page_content(content, current_url, current_parent)
        record_to_json(page_info)

        soup = BeautifulSoup(content, 'html.parser')
        children = []
        for link in soup.find_all('a', href=True):
            # Drop the #fragment: it addresses a position inside the same
            # document, so keeping it would record one page several times.
            next_url = urljoin(current_url, link['href']).partition('#')[0]
            if (next_url.startswith('http')
                    and next_url not in visited_urls
                    and _is_allowed_url(next_url)):
                # The current page becomes the parent of the linked page.
                children.append((next_url, page_info))
        # Reverse so the first link on the page is popped (visited) first,
        # matching the order of a recursive depth-first crawl.
        pending.extend(reversed(children))

def record_to_json(data):
    """Append *data* as one JSON line to the crawl output file.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.

    Args:
        data: JSON-serialisable object to store (one record per line).
    """
    serialized = json.dumps(data, ensure_ascii=False)
    with open('玉山.2024信用卡線上優惠.json', 'a', encoding='utf-8') as output:
        output.write(serialized + '\n')


# Entry point: start the crawl from the URL below. Change the argument to
# crawl a different start page; it must contain one of allowed_url_parts,
# otherwise crawl_website does nothing.

# Earlier start pages, kept for reference:
# crawl_website('https://web.ubot.com.tw/UB/2023stocksandETF/')
# crawl_website('https://www.esunbank.com/zh-tw/personal/credit-card/discount/shops')
crawl_website('https://www.esunbank.com/zh-tw/personal/credit-card/discount/shops/all?category=onlineshop')



: 