In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import json
import time
import html2text
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_page_content(url, retries=5, delay=4):
    for attempt in range(retries):
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
            }
            response = requests.get(url, timeout=10, headers= headers)
            response.raise_for_status()  # 檢查請求是否成功
            response.encoding = response.apparent_encoding  # 自動檢測編碼
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            if attempt < retries - 1:
                print(f"Retrying {url} in {delay} seconds...")
                time.sleep(delay)
    return None

#如果網站有強大的反爬蟲機制，你可以使用 Selenium 來模擬真實的瀏覽器行為。
def get_page_selenium(url, retries=5, delay=4):
    # 設置瀏覽器驅動
    driver = webdriver.Chrome()
    driver.get(url)
    page_source = driver.page_source # 獲取頁面內容
    driver.quit() #關閉瀏覽器
    return page_source

def is_under_specific_url(base_url, href, specific_url):
    specific_path = urlparse(specific_url).path
    href_path = urlparse(href).path
    return href_path.startswith(specific_path)

def find_all_links(soup, base_url, specific_url, except_urls):
    links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '#' in href:
            continue  # 排除包含#符號的鏈接
        
        full_url = urljoin(base_url, href)
        if full_url in except_urls:
            continue  # 排除特定的網址

        links.append(full_url)
    return links

#使用一個字典來存儲標籤和相應的不想要的 class 列表
def remove_elements_by_class(soup, tag, unwanted_classes):
    for class_name in unwanted_classes:
        for element in soup.find_all(tag, class_=class_name):
            element.decompose()

def keep_elements_by_class(soup, tag, wanted_classes):
    # 找到並保留想要的元素
    wanted_elements = []
    for class_name in wanted_classes:
        for element in soup.find_all(tag, class_=class_name):
            wanted_elements.append(element.extract())

    # 移除其他不想要的元素
    for element in soup.find_all(tag):
        element.decompose()

    # 將保留的元素重新插入到原始位置
    for element in wanted_elements:
        soup.append(element)

def extractor(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # 移除腳本和樣式標籤
    unwanted_tags = ['script','link','style','meta','noscript','head','header','footer','select','label','legend','form','img','button','table','figcaption']
    for script_or_style in soup(unwanted_tags):
        script_or_style.decompose()

    # 定義要移除的 class 列表
    unwanted_elements = {
        'div': ['hide-component', 'l-breadCrumb'
                ,'bread','cookie-concent', 'gotop', 'menuBtn', 'darkLoad', 'market-widget'
                ,'scroll-box', 'hotnews', 'important-info', 'nav-main__sub-menu', 'content__header'
                ,'livenews__switch', 'article-function','fbmsg-box','box__promo'
                ,'list-box relatednews', 'relatednews', 'ad-recommend', 'ad-box','theme-switch'],
        'section': ['l-rating'],
        'p': ['hint','box-title'],
        'a': ['goMain', 'header__main-logo-square', 'nav-main__collapse-btn'],
        'ul': ['bread-crumb'],
        'li': ['nav-main__menu-item']
    }
    # 移除不想要的元素
    for tag, classes in unwanted_elements.items():
        remove_elements_by_class(soup, tag, classes)

    # 定義要保留的 class 列表
    wanted_classes = {
        'section': ['l-linkCard'],
    }

    # 保留想要的元素並移除其他不想要的元素
    for tag, classes in wanted_classes.items():
        keep_elements_by_class(soup, tag, classes)

    # 提取文字內容
    text = '' #soup.get_text(separator=' ', strip=True)

    # 將 HTML 轉換為 Markdown
    h = html2text.HTML2Text()
    h.ignore_links = True  # 保留鏈接
    h.ignore_images = True
    h.ignore_mailto_links = True
    markdown_text = h.handle(str(soup))

    return soup, text, markdown_text


def crawl_page(i_req, parent_url, url, depth=0, max_depth=2, result=None):
    if result is None:
        result = []

    if depth > max_depth:
        return result

    html_content = get_page_selenium(url)
    if not html_content:
        return result

    soup = BeautifulSoup(html_content, 'html.parser')
    
    # 提取 <title> 標籤的內容
    title_tag = soup.find('title')
    title_text = title_tag.get_text(strip=True) if title_tag else ''


    # 找到 <meta> 標籤中 name 屬性為 "description" 的標籤，並提取內容出來
    meta_description = soup.find('meta', attrs={'name': 'description'})
    meta_description_text = ''
    meta_description_text = meta_description['content'] if meta_description and 'content' in meta_description.attrs else ''

    soup2, extract_text, markdown_text = extractor(html_content)
    # print(soup2)
    page_data = {
        "i_req": i_req,
        "url": url,
        "parent_url": parent_url,
        'title': title_text,
        'meta_description': meta_description_text,
        'md': markdown_text,
    }
    result.append(page_data)

    # 立即將結果寫入JSON文件
    outfile = 'Result/玉山銀行/ESun.20241023.json'
    with open(outfile, 'w', encoding='utf-8') as file:
        json.dump(result, file, ensure_ascii=False, indent=4)
    
    
    depth += 1
    base_url = 'https://www.esunbank.com/'
    specific_url = 'https://www.esunbank.com/zh-tw/personal/credit-card/'
    except_url = [
        'https://www.esunbank.com/?eservice=EnglishService&qaCategory=general',
        'https://robot.esunbank.com.tw/?eservice=Widget&qaCategory=general', #小玉
        'https://www.esunbank.com.tw/bank/about/announcement/announcement?i=eYxNfOekAkKLtZwoUwDqaw&p=9WLFRe74B0ikiWlxA46w8Q&d=HA4k102TJkG2QuRj6K5Wtg',
        'https://card.esunbank.com.tw/EsunCreditweb/common/noCookie.jsp'
        ]
    all_links = find_all_links(soup2, base_url, specific_url, except_url)  # 這邊原來是soup全部的內容  
    print(i_req, depth, url)
    if len(all_links) > 0: 
        print('all-links', all_links)

    for link in all_links:
        i_req += 1
        result = crawl_page(i_req, url, link, depth, max_depth, result)

    return result


def main(url, max_depth=4):
    i_req, depth= 0, 0
    result = crawl_page(i_req, 'root', url, depth, max_depth)
    if result:
        print(f"Content saved to page_content.json 結束~")

if __name__ == "__main__":
    url = "https://www.esunbank.com/zh-tw/personal/credit-card/tools"
    main(url)


0 1 https://www.esunbank.com/zh-tw/personal/credit-card/tools
all-links ['https://esun.co/4HKTR', 'https://esun.co/zvo9G', 'https://esun.co/rXST7', 'https://esun.co/dcdWJ', 'https://esun.co/hGAcI', 'https://esun.co/Pgogv', 'https://esun.co/XxMXy', 'https://esun.co/67ay4', 'https://esun.co/ejeVY', 'https://esun.co/kMEph', 'https://esun.co/QRwgU', 'https://esun.co/2DO9T', 'https://esun.co/ToGEk', 'https://esun.co/a38Cj', 'https://esun.co/JeoLB', 'https://esun.co/Qnd49', 'https://esun.co/82LhF', 'https://esun.co/CM4Fc', 'https://esun.co/SeIFL', 'https://esun.co/NZBlT', 'https://esun.co/q9bwe', 'https://esun.co/f7ztD', 'https://esun.co/FkZIy', 'https://esun.co/luiVe']
1 2 https://esun.co/4HKTR
2 2 https://esun.co/zvo9G
all-links ['https://www.esunbank.com/?eservice=EnglishService&qaCategory=general']
3 3 https://www.esunbank.com/?eservice=EnglishService&qaCategory=general
3 2 https://esun.co/rXST7
all-links ['https://www.esunbank.com/zh-tw/personal', 'https://www.esunbank.com/zh-tw/persona

KeyboardInterrupt: 