In [29]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import json
import time
import html2text

def get_page_content(url, retries=5, delay=4):
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # 檢查請求是否成功
            response.encoding = response.apparent_encoding  # 自動檢測編碼
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            if attempt < retries - 1:
                print(f"Retrying {url} in {delay} seconds...")
                time.sleep(delay)
    return None

def get_parent_url(url):
    parsed_url = urlparse(url)
    path = parsed_url.path.rsplit('/', 1)[0]
    parent_url = urljoin(url, path)
    return parent_url

def find_all_links(soup, base_url):
    links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        full_url = urljoin(base_url, href)
        links.append(full_url)
    return links

def extractor(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # 移除腳本和樣式標籤
    for script_or_style in soup(['script','link','style','meta','noscript','head','header','footer']):
        script_or_style.decompose()

    #定義要移除的 class 列表
    unwanted_classes = ['hide-component', 'loading', 'unwanted-class3']
    for class_name in unwanted_classes:
        for div in soup.find_all('div', class_=class_name):
            div.decompose()

    # 定義要移除的 class 列表
    unwanted_classes = ['l-rating']
    for class_name in unwanted_classes:
        for div in soup.find_all('section', class_=class_name):
            div.decompose()

    # 提取文字內容
    text = soup.get_text(separator=' ', strip=True)

    # 將 HTML 轉換為 Markdown
    h = html2text.HTML2Text()
    h.ignore_links = True  # 保留鏈接
    h.ignore_images = True
    h.ignore_mailto_links = True
    h.wrap_list_items = True
    h.wrap_links = True
    h.wrap_tables = True
    markdown_text = h.handle(str(soup))

    return text, markdown_text


def crawl_page(i_req, url, depth=0, max_depth=2, result=None):
    if result is None:
        result = []

    if depth > max_depth:
        return result

    html_content = get_page_content(url)
    if not html_content:
        return result

    soup = BeautifulSoup(html_content, 'html.parser')
    
    title = soup.find('title')['content']
    # 找到 <meta> 標籤中 name 屬性為 "description" 的標籤，並提取內容出來
    meta_description = soup.find('meta', attrs={'name': 'description'})
    meta_description_text = ''
    if meta_description and 'content' in meta_description.attrs:
        meta_description_text = meta_description['content']

    extract_text, markdown_text = extractor(html_content)
    # print(markdown_text)
    
    parent_url = get_parent_url(url)
    all_links = find_all_links(soup, url)  # 這邊原來是soup全部的內容
    # print(all_links)

    i_req += 1

    print(i_req, meta_description_text, url)
    page_data = {
        "i_req": i_req,
        "url": url,
        "parent_url": parent_url,
        'title': title,
        'meta_description': meta_description_text,
        'md': markdown_text,
        "keyword": "",
    }

    result.append(page_data)

    # 立即將結果寫入JSON文件
    outfile = 'Result/ESunCardOnlineFavorite.20240927-a.json'
    with open(outfile, 'w', encoding='utf-8') as file:
        json.dump(result, file, ensure_ascii=False, indent=4)

    for link in all_links:
        i_req += 1
        result = crawl_page(i_req, link, depth + 1, max_depth, result)

    return result


def main(url, max_depth=4):
    i_req = 1
    result = crawl_page(i_req=i_req, url= url, max_depth=max_depth)
    if result:
        print(f"Content saved to page_content.json")

if __name__ == "__main__":
    # url = "https://www.esunbank.com/zh-tw/personal/credit-card/discount/shops"  # 替換為你要抓取的網頁URL
    # url = "https://www.esunbank.com/zh-tw/about/faq/content?q=credit_card/001"
    # url = "https://www.esunbank.com/zh-tw/personal/credit-card/discount/shops/all?category=onlineshop"  #優惠總覽 - 線上購物
    url = "https://www.esunbank.com/zh-tw/about/faq/content?q=credit_card/001"  #如何查詢信用卡辦卡進度?      
    main(url)


2 None https://www.esunbank.com/zh-tw/about/faq/content?q=credit_card/001
3 None https://www.esunbank.com/zh-tw/about/faq/content?q=credit_card/001#accesskey-U
4 None https://www.esunbank.com/zh-tw/about/faq/content?q=credit_card/001#accesskey-U
5 None https://www.esunbank.com/zh-tw/about/faq/content?q=credit_card/001#accesskey-U
6 None https://www.esunbank.com/zh-tw/about/faq/content?q=credit_card/001#accesskey-U
6 <meta content="玉山銀行以最高服務品質，提供個人與企業優質金融服務。包含台外幣帳戶、信用卡、投資理財、基金、貸款、網路銀行等服務。" name="description"/> https://www.esunbank.com/zh-tw/personal
6 <meta content="玉山專業團隊，是企業成功的靠山。玉山銀行秉持專業、服務、責任的態度，結合完整的法人金融服務團隊，提供企業全方位的服務，讓玉山成為您最佳的財務夥伴。" name="description"/> https://www.esunbank.com/zh-tw/business/corporate
6 <meta content="玉山私人銀行與您攜手向前。" name="description"/> https://www.esunbank.com/zh-tw/private-bank
6 None https://www.esunbank.com/zh-tw/esg
6 None https://www.esunbank.com/zh-tw/about
6 None https://robot.esunbank.com.tw/?eservice=ehome&qaCategory=general
6 None https://www.esunbank

KeyboardInterrupt: 

In [None]:
#篩除多餘重複的網頁內容
import json

# 從實體的 JSON 檔案讀取資料
input_file = 'Result/ESunCardOnlineFavorite.20240927-a.json'  # 請將 'input.json' 替換為你的 JSON 檔案名稱
output_file = 'Result/ESunCardOnlineFavorite.20240927-b.json'  # 輸出的 JSON 檔案名稱

# 讀取 JSON 檔案
with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# 使用一個集合來追蹤已經見過的 URL
seen_urls = set()
unique_data = []

for item in data:
    if item['url'] not in seen_urls:
        seen_urls.add(item['url'])
        unique_data.append(item)

# 將結果轉換回 JSON 格式
unique_json_data = json.dumps(unique_data, ensure_ascii=False, indent=4)

# 將結果寫入 test.json 檔案
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(unique_json_data)

print(f"唯一的資料已寫入 {output_file} 檔案。")
