In [7]:
import json
import time

import requests
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import _extractWebPage as ewp

def get_page_content(url, retries=5, delay=4):
    """Download ``url`` with ``requests`` and return the decoded body text.

    Up to ``retries`` GET requests are made, each with a 10-second timeout
    and a browser-like User-Agent header.  When an attempt fails and it was
    not the last one, the function waits ``delay`` seconds before retrying.

    Args:
        url: Address to fetch.
        retries: Maximum number of attempts; 0 (or less) means no attempt.
        delay: Seconds to sleep between attempts.

    Returns:
        The response text, or ``None`` when no attempt succeeded.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    # Count down the attempts that remain *after* the current one.
    for attempts_left in range(retries - 1, -1, -1):
        try:
            resp = requests.get(url, timeout=10, headers=request_headers)
            # Treat HTTP error statuses as failures.
            resp.raise_for_status()
            # Let requests guess the encoding from the content.
            resp.encoding = resp.apparent_encoding
            return resp.text
        except requests.RequestException as err:
            print(f"Error fetching {url}: {err}")
            if attempts_left:
                print(f"Retrying {url} in {delay} seconds...")
                time.sleep(delay)

    return None

# If the site has strong anti-scraping measures, Selenium can be used to
# simulate real browser behaviour.
def get_page_selenium(url, retries=5, delay=4):
    """Load ``url`` in Chrome via Selenium and return the page source.

    Mirrors ``get_page_content``: up to ``retries`` attempts are made, with a
    ``delay``-second pause after each failed attempt except the last.  Every
    attempt starts its own Chrome session, and that session is always shut
    down again, whether or not the page load succeeded.

    Args:
        url: Address to load.
        retries: Maximum number of attempts; 0 (or less) means no attempt.
        delay: Seconds to sleep between attempts.

    Returns:
        The page source of the loaded page, or ``None`` when every attempt
        raised ``WebDriverException``.
    """
    for attempt in range(retries):
        driver = None
        try:
            driver = webdriver.Chrome()  # start the browser driver
            driver.get(url)
            # The return value is evaluated before ``finally`` quits the
            # browser, so the source is captured while the page is open.
            return driver.page_source
        except WebDriverException as e:
            print(f"Error fetching {url}: {e}")
        finally:
            # Close the browser even when loading failed, so failed attempts
            # do not leave Chrome processes behind.
            if driver is not None:
                driver.quit()

        # Only reached after a failed attempt; the browser is already closed.
        if attempt < retries - 1:
            print(f"Retrying {url} in {delay} seconds...")
            time.sleep(delay)

    return None


def crawl_page(i_req, parent_url, url, depth=0, max_depth=2, result=None, visited_urls=None):
    """Crawl ``url`` and, recursively, the links found on it (depth-first).

    For a page that is within ``max_depth`` and not yet in ``visited_urls``
    the function, in order:

    1. adds ``url`` to ``visited_urls`` and rewrites ``visited_url_file``;
    2. fetches the page with ``get_page_selenium``;
    3. runs ``ewp.extractor`` on the HTML and appends a record to ``result``;
    4. rewrites ``outfile`` with the whole ``result`` list as JSON;
    5. calls itself for every link returned by ``ewp.find_all_links``.

    Args:
        i_req: Number stored in this page's record; child links are numbered
            ``i_req + 1``, ``i_req + 2``, ... in iteration order.
        parent_url: URL recorded as the page that linked to ``url``.
        url: Page to crawl.
        depth: Depth of ``url``; children are crawled with ``depth + 1``.
        max_depth: Pages whose depth exceeds this value are not fetched.
        result: List collecting page records; a new list is used when None.
        visited_urls: Set of URLs already handled; a new set is used when
            None.  Mutated in place.

    Returns:
        The list of page records.
    """
    result = [] if result is None else result
    visited_urls = set() if visited_urls is None else visited_urls

    # Guard clauses: too deep, or already handled.
    if depth > max_depth:
        return result
    if url in visited_urls:
        print(f"Skipping already visited URL: {url}")
        return result

    # Mark the URL as visited and persist the full set before fetching.
    visited_urls.add(url)
    with open(visited_url_file, 'w', encoding='utf-8') as visited_fh:
        visited_fh.write('\n'.join(visited_urls))

    page_html = get_page_selenium(url)
    if not page_html:
        return result

    extracted = ewp.extractor(
        page_html,
        unwanted_elements=unwanted_elements,
        wanted_elements=wanted_elements,
        unwanted_tags=unwanted_tags,
    )
    soup2, extract_text, title, meta_description, markdown_text = extracted

    result.append({
        "i_req": i_req,
        "url": url,
        "parent_url": parent_url,
        'title': title,
        'meta_description': meta_description,
        'md': markdown_text,
    })

    # Dump everything collected so far after every page.
    with open(outfile, 'w', encoding='utf-8') as out_fh:
        json.dump(result, out_fh, ensure_ascii=False, indent=4)

    child_depth = depth + 1
    base_url = 'https://www.esunbank.com/'
    # Links are looked up in soup2; per the original author's note this used
    # to be the full page soup.  The visited set is passed as the exclusion
    # collection.
    all_links = ewp.find_all_links(soup2, base_url, visited_urls)
    print(i_req, child_depth, url)
    if len(all_links) > 0:
        print('all-links', all_links)

    # NOTE(review): the counter is local to this call, so numbers assigned in
    # one branch are not seen by sibling branches -- confirm whether duplicate
    # i_req values in the output are acceptable.
    child_req = i_req
    for link in all_links:
        child_req += 1
        result = crawl_page(child_req, url, link, child_depth, max_depth, result, visited_urls)

    return result

# NOTE: the existing visited_url.txt contents are read in main() below.



# Tag names passed to ewp.extractor as ``unwanted_tags``.
unwanted_tags = [
    'script', 'link', 'style', 'meta', 'noscript', 'head', 'header', 'footer',
    'select', 'label', 'legend', 'form', 'img', 'button', 'table', 'figcaption',
]

# Classes to remove, keyed by tag name.
unwanted_elements = {
    'div': [
        'hide-component', 'l-breadCrumb', 'bread', 'cookie-concent', 'gotop',
        'menuBtn', 'darkLoad', 'market-widget', 'scroll-box', 'hotnews',
        'important-info', 'nav-main__sub-menu', 'content__header',
        'livenews__switch', 'article-function', 'fbmsg-box', 'box__promo',
        'list-box relatednews', 'relatednews', 'ad-recommend', 'ad-box',
        'theme-switch',
    ],
    'section': ['l-rating'],
    'p': ['hint', 'box-title'],
    'a': ['goMain', 'header__main-logo-square', 'nav-main__collapse-btn'],
    'ul': ['bread-crumb'],
    'li': ['nav-main__menu-item'],
}

# Classes to keep, keyed by tag name.
wanted_elements = {
    'section': ['l-linkCard'],
}

# File holding the visited URLs, one per line.
visited_url_file = 'Result/玉山銀行/visited_url.txt'
# JSON file the crawl results are written to as soon as they are collected.
outfile = 'Result/玉山銀行/ESun-2.20241023.json'

def main(url, max_depth=4):
    """Crawl from ``url``, skipping URLs recorded by a previous run.

    URLs listed in ``visited_url_file`` (one per line) are loaded into the
    visited set before crawling starts, so ``crawl_page`` skips them.  A
    missing file simply means nothing has been visited yet.

    NOTE(review): ``crawl_page`` is started with an empty result list and
    opens ``outfile`` in write mode, so a resumed run appears to replace the
    previous JSON output rather than extend it -- confirm this is intended.

    Args:
        url: Start URL of the crawl.
        max_depth: Maximum link depth forwarded to ``crawl_page``.
    """
    i_req, depth = 0, 0
    visited_urls = set()

    try:
        with open(visited_url_file, 'r', encoding='utf-8') as f:
            # Skip blank entries: an empty file or a trailing newline would
            # otherwise add '' to the visited set and inflate the count.
            existing_urls = [line for line in f.read().split('\n') if line]
    except FileNotFoundError:
        pass  # no file yet: skip the loading step
    else:
        print(f"目前已搜錄{ len(existing_urls) }個URL網址")
        visited_urls.update(existing_urls)

    result = crawl_page(i_req, 'root', url, depth, max_depth, visited_urls=visited_urls)
    if result:
        print("爬蟲任務結束~")


if __name__ == "__main__":
    # Script entry point: crawl from the start URL below using main()'s
    # default max_depth.
    # Alternative start URL kept for reference:
    # url = "https://www.esunbank.com/zh-tw/personal/credit-card/tools"
    url = "https://event.esunbank.com.tw/mkt/OpenAccount/marketing/index.html?ven=cc_apply_event_traveler-card"
    main(url)


{'https://www.esunbank.com/zh-tw/about/announcement/privacy/privacy-statement', 'https://www.esunbank.com/en/personal/credit-card/friendly', 'https://www.esunbank.com/zh-tw/personal/credit-card', 'https://www.esunbank.com/hotel_list.html', 'https://www.esunbank.com/zh-tw/personal/credit-card/tools/do', 'https://www.esunbank.com.tw/event/credit/notice/index.html', 'https://www.esunbank.com.tw/s/IntegratedApply/Login?ven=cc_apply_event_traveler-card', 'https://www.esunbank.com/zh-tw/about/customer-service', 'https://www.esunbank.com/zh-tw/about/faq/faqlist?tag=credit-card', 'https://esun.co/kMEph', 'https://esun.co/zvo9G', 'https://www.esunbank.com/zh-tw/personal', 'https://www.esunbank.com/zh-tw/personal/credit-card/friendly', 'https://www.esunbank.com/?eservice=EnglishService&qaCategory=general', 'https://www.esunbank.com.tw/bank/about/announcement/announcement?i=eYxNfOekAkKLtZwoUwDqaw&p=9WLFRe74B0ikiWlxA46w8Q&d=HA4k102TJkG2QuRj6K5Wtg', 'https://www.esunbank.com/zh-tw/personal/credit-c