In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import json
import time
import html2text
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_page_content(url, retries=5, delay=4):
    """Download ``url`` with ``requests`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        retries: Maximum number of attempts before giving up.
        delay: Seconds to sleep after a failed attempt before the next one.

    Returns:
        The response text, or ``None`` when no attempt succeeded
        (including when ``retries`` is zero).
    """
    # A desktop-browser User-Agent is sent with every attempt.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    for attempt_no in range(1, retries + 1):
        try:
            resp = requests.get(url, timeout=10, headers=request_headers)
            resp.raise_for_status()  # a failed request raises and is handled below
            resp.encoding = resp.apparent_encoding  # use the auto-detected encoding
            return resp.text
        except requests.RequestException as err:
            print(f"Error fetching {url}: {err}")
            attempts_remain = attempt_no < retries
            if attempts_remain:
                print(f"Retrying {url} in {delay} seconds...")
                time.sleep(delay)
    return None

# If a site has strong anti-scraping measures, Selenium can be used to drive a
# real browser instead of issuing plain HTTP requests.
def get_page_selenium(url, retries=5, delay=4):
    """Open ``url`` in a Selenium-driven Chrome browser and return the page source.

    Args:
        url: Address of the page to load.
        retries: Currently unused; accepted so the signature mirrors
            ``get_page_content``.
        delay: Currently unused; accepted for the same reason.

    Returns:
        The HTML source of the page as reported by the browser.

    Raises:
        Any exception raised by Selenium while starting the browser or
        loading the page propagates to the caller; the browser is still
        closed in that case.
    """
    # Start the browser driver.
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Example (disabled): wait for a "more" button to become visible and click it.
        # wait = WebDriverWait(driver, 5)
        # div_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "button#moreBtn")))
        # div_element.click()

        # Grab the page content.
        return driver.page_source
    finally:
        # Always close the browser, even when loading the page failed, so that
        # no browser process is left behind.
        driver.quit()

def is_same_domain(url1, url2):
    """Return True when both URLs have the same network location (``netloc``)."""
    first_netloc = urlparse(url1).netloc
    second_netloc = urlparse(url2).netloc
    return first_netloc == second_netloc

def is_next_level(base_url, href):
    """Return True when ``href``'s path lies strictly below ``base_url``'s path.

    The comparison is made on whole path segments, so ``/newsletter`` is not
    treated as a child of ``/news``. Trailing slashes are ignored, so
    ``/news/`` is the same level as ``/news`` rather than a child of it.

    Args:
        base_url: URL whose path is the parent level.
        href: Candidate URL (absolute, or at least carrying a path).

    Returns:
        True if ``href`` is a descendant of ``base_url``'s path, else False.
    """
    base_path = urlparse(base_url).path.rstrip('/')
    href_path = urlparse(href).path.rstrip('/')
    # Requiring the '/' separator after the base both enforces the segment
    # boundary and excludes the base path itself.
    return href_path.startswith(base_path + '/')

def is_under_specific_url(base_url, href, specific_url):
    """Return True when ``href``'s path is ``specific_url``'s path or lies below it.

    The comparison is made on whole path segments, so ``/newsletter`` is not
    considered to be under ``/news``.

    Args:
        base_url: Unused; kept so existing callers do not need to change.
        href: Candidate URL.
        specific_url: URL whose path delimits the allowed section.

    Returns:
        True if ``href`` equals the section path or is a descendant of it.
    """
    specific_path = urlparse(specific_url).path.rstrip('/')
    href_path = urlparse(href).path
    return href_path == specific_path or href_path.startswith(specific_path + '/')

def find_all_links(soup, base_url, specific_url):
    """Collect the links in ``soup`` that pass all three crawl filters.

    Every ``<a>`` element with an ``href`` is resolved against ``base_url``
    and kept only if it is on the same domain as ``base_url``, is a next-level
    path of ``base_url``, and is under ``specific_url``.

    Args:
        soup: Parsed document to search for anchors.
        base_url: URL used both to resolve relative hrefs and as the
            reference for the domain / level checks.
        specific_url: URL whose path the links must fall under.

    Returns:
        List of absolute URLs in document order (duplicates are not removed).
    """
    resolved_urls = (
        urljoin(base_url, anchor['href'])
        for anchor in soup.find_all('a', href=True)
    )
    return [
        candidate
        for candidate in resolved_urls
        if is_same_domain(base_url, candidate)
        and is_next_level(base_url, candidate)
        and is_under_specific_url(base_url, candidate, specific_url)
    ]

# Callers keep a dict mapping each tag name to its list of unwanted classes
# and invoke this helper once per tag.
def remove_elements_by_class(soup, tag, unwanted_classes):
    """Remove every ``tag`` element that matches one of ``unwanted_classes``.

    Args:
        soup: Parsed document, modified in place.
        tag: Tag name to search for.
        unwanted_classes: Iterable of class values; each one is looked up
            separately with ``find_all(tag, class_=...)``.
    """
    for css_class in unwanted_classes:
        matches = soup.find_all(tag, class_=css_class)
        for node in matches:
            node.decompose()

def keep_elements_by_class(soup, tag, wanted_classes):
    """Keep only the ``tag`` elements matching ``wanted_classes``; drop the rest.

    Matching elements are detached first, every remaining ``tag`` element is
    removed, and the detached elements are then appended to the end of
    ``soup`` (they are not restored to their original position).

    Args:
        soup: Parsed document, modified in place.
        tag: Tag name to filter.
        wanted_classes: Iterable of class values to keep.
    """
    # Detach the elements to keep so the purge below cannot touch them.
    kept = [
        node.extract()
        for css_class in wanted_classes
        for node in soup.find_all(tag, class_=css_class)
    ]

    # Purge every element of this tag that is still in the document.
    for leftover in soup.find_all(tag):
        leftover.decompose()

    # Put the kept elements back, at the end of the document.
    for node in kept:
        soup.append(node)

def extractor(html_content):
    """Strip boilerplate from an HTML page and render what is left as Markdown.

    Args:
        html_content: HTML source of the page.

    Returns:
        A tuple ``(soup, text, markdown_text)``:
        ``soup`` is the cleaned parse tree, ``text`` is always an empty string
        (plain-text extraction is disabled), and ``markdown_text`` is the
        html2text rendering of the cleaned tree with links, images and mailto
        links ignored.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Drop script/style and other non-content tags wholesale.
    unwanted_tags = ['script','link','style','meta','noscript','head','header','footer','select','label','legend','form','img','button','table','figcaption']
    for node in soup.find_all(unwanted_tags):
        node.decompose()

    # Drop <div> elements selected by id.
    unwanted_ids = ['copied']
    for element_id in unwanted_ids:
        for div in soup.find_all('div', id=element_id):
            div.decompose()

    # Per-tag lists of classes whose elements are removed.
    unwanted_classes = {
        'div': ['bread', 'cookie-concent', 'gotop', 'menuBtn', 'darkLoad', 'market-widget',
                'scroll-box', 'hotnews', 'important-info', 'nav-main__sub-menu', 'content__header',
                'livenews__switch', 'article-function','fbmsg-box','box__promo',
                'list-box relatednews', 'relatednews', 'ad-recommend', 'ad-box','theme-switch'],
        'section': ['l-rating'],
        'p': ['hint','box-title'],
        'a': ['goMain', 'header__main-logo-square', 'nav-main__collapse-btn'],
        'ul': ['bread-crumb'],
        'li': ['nav-main__menu-item']
    }
    for tag_name, class_list in unwanted_classes.items():
        remove_elements_by_class(soup, tag_name, class_list)

    # A whitelist pass could be added here with keep_elements_by_class,
    # e.g. {'div': ['newslist livenews']}; it is currently disabled.

    # Plain-text extraction is disabled; an empty string is returned instead.
    text = ''

    # Convert the remaining HTML to Markdown.
    converter = html2text.HTML2Text()
    for option in ('ignore_links', 'ignore_images', 'ignore_mailto_links'):
        setattr(converter, option, True)  # links, images and mailto links are ignored
    markdown_text = converter.handle(str(soup))

    return soup, text, markdown_text


def crawl_page(i_req, parent_url, url, depth=0, max_depth=2, result=None,
               base_url='https://www.ctee.com.tw/news',
               specific_url='https://www.ctee.com.tw/news',
               outfile='Result/玉山銀行/ESun.20241023.json'):
    """Crawl ``url`` and, recursively, the links found on it.

    Each visited page is appended to ``result`` as a dict and the whole list
    is rewritten to ``outfile`` after every page, so partial progress survives
    an interrupted run.

    Args:
        i_req: Request counter supplied by the caller. It is only used to
            number the first recorded page (as ``i_req + 1``); later pages
            continue from the last record in ``result`` so that numbers stay
            unique across sibling branches.
        parent_url: URL of the page that linked to ``url`` (``'root'`` for the
            start page).
        url: Page to crawl.
        depth: Current recursion depth.
        max_depth: Pages deeper than this are not fetched.
        result: Accumulator list shared across the recursion; a new list is
            created when ``None``.
        base_url: URL that relative hrefs are resolved against and that the
            domain / level link filters are compared with. When crawling a
            different site, pass a value matching that site.
        specific_url: Only links whose path falls under this URL are followed.
        outfile: Path of the JSON file the accumulated records are written to.

    Returns:
        The ``result`` list.
    """
    import os  # local import so this function does not depend on other cells

    if result is None:
        result = []

    if depth > max_depth:
        return result

    # Do not fetch a page that this crawl has already recorded; without this
    # check pages that link to each other are crawled over and over.
    if any(page.get("url") == url for page in result):
        return result

    html_content = get_page_selenium(url)
    if not html_content:
        return result

    soup = BeautifulSoup(html_content, 'html.parser')

    # Text of the <title> tag, if any.
    title_tag = soup.find('title')
    title_text = title_tag.get_text(strip=True) if title_tag else ''

    # Content of <meta name="description">, if present.
    meta_description = soup.find('meta', attrs={'name': 'description'})
    if meta_description and 'content' in meta_description.attrs:
        meta_description_text = meta_description['content']
    else:
        meta_description_text = ''

    cleaned_soup, _, markdown_text = extractor(html_content)

    # Links are taken from the cleaned document, not from the full page.
    all_links = find_all_links(cleaned_soup, base_url, specific_url)
    print('all-links',all_links)

    # Number pages in the order they are recorded. The counter is derived from
    # the shared result list because an int argument cannot carry increments
    # back out of the recursive calls.
    last_number = result[-1].get("i_req", i_req) if result else i_req
    i_req = last_number + 1

    print(i_req, url)
    page_data = {
        "i_req": i_req,
        "url": url,
        "parent_url": parent_url,
        'title': title_text,
        'meta_description': meta_description_text,
        'md': markdown_text,
    }

    result.append(page_data)

    # Write the results to the JSON file immediately.
    out_dir = os.path.dirname(outfile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(outfile, 'w', encoding='utf-8') as file:
        json.dump(result, file, ensure_ascii=False, indent=4)

    for link in all_links:
        result = crawl_page(i_req, url, link, depth + 1, max_depth, result,
                            base_url=base_url, specific_url=specific_url,
                            outfile=outfile)

    return result


def main(url, max_depth=4):
    """Crawl starting from ``url`` and report where the results were written.

    Args:
        url: Start page of the crawl.
        max_depth: Maximum link depth passed on to ``crawl_page``.
    """
    result = crawl_page(i_req=1, parent_url='root', url=url, max_depth=max_depth)
    if result:
        # Report the file crawl_page writes by default (the previous message
        # named a file that is never created).
        print(f"Content saved to Result/玉山銀行/ESun.20241023.json ({len(result)} pages)")

# Entry point: choose the start URL (alternatives are kept commented out) and run the crawl.
if __name__ == "__main__":
    # url = "https://www.esunbank.com/zh-tw/personal/credit-card/discount/shops"  # replace with the URL of the page to crawl
    # url = "https://www.esunbank.com/zh-tw/about/faq/content?q=credit_card/001"
    # url = "https://www.esunbank.com/zh-tw/personal/credit-card/discount/shops/all?category=onlineshop"  # offers overview - online shopping
    # url = "https://www.esunbank.com/zh-tw/about/faq/content?q=credit_card/001"  # FAQ: how to check credit card application progress?
    # url = "https://www.fsc.gov.tw/ch/home.jsp?id=2&parentpath=0"
    # url = "https://www.ctee.com.tw/livenews/stock"
    url = "https://www.esunbank.com/zh-tw/personal/credit-card/tools"
    # url = "https://www.ctee.com.tw/stock/star"
    main(url)


In [None]:
import json

# Files involved: the crawl output to read, and where the unique and the
# duplicate records are written.
input_file = 'Result/玉山銀行/ESun-2.20241023.a.json'  # replace with the name of your JSON file
output_file = 'Result/玉山銀行/ESun-2.20241023.b.json'  # output JSON file (unique records)
duplicate_file = 'Result/玉山銀行/ESun-2.20241023.duplicates.json'  # JSON file for the duplicate records

# Load the crawl records.
with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Split the records by URL: the first occurrence of each URL is kept,
# every later occurrence is set aside as a duplicate.
seen_urls = set()
unique_data = []
duplicate_data = []

for item in data:
    if item['url'] in seen_urls:
        duplicate_data.append(item)
        continue
    seen_urls.add(item['url'])
    unique_data.append(item)

# Serialise the unique records and write them to output_file.
unique_json_data = json.dumps(unique_data, ensure_ascii=False, indent=4)
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(unique_json_data)

# Serialise the duplicate records and write them to duplicate_file.
duplicate_json_data = json.dumps(duplicate_data, ensure_ascii=False, indent=4)
with open(duplicate_file, 'w', encoding='utf-8') as f:
    f.write(duplicate_json_data)

print(f"唯一的資料已寫入 {output_file} 檔案。")
print(f"重複的資料已寫入 {duplicate_file} 檔案。")



#### 整理出已經爬過的網站，能夠避免重複爬取

In [18]:
import json

# Text file holding the already-crawled URLs, one per line. It is read,
# de-duplicated, sorted and written back in place.
# input_file =  'Result/玉山銀行/visited_url.txt'  # replace with the name of your JSON file
output_file = 'Result/玉山銀行/visited_url.txt'  # output text file

# # Optional: load crawl records from a JSON file
# with open(input_file, 'r', encoding='utf-8') as f:
#     data = json.load(f)

# Set used to track the URLs seen so far.
visited_urls = set()

# Read the current contents of visited_url.txt, if the file exists.
try:
    with open(output_file, 'r', encoding='utf-8') as f:
        # Strip each line and skip blank ones so that a trailing newline or an
        # empty line does not end up as an empty "URL" in the rewritten file.
        existing_urls = [line.strip() for line in f if line.strip()]
        visited_urls.update(existing_urls)
except FileNotFoundError:
    pass  # no file yet: start from an empty set

# # Optional: collect url and parent_url from the JSON records (the set removes duplicates)
# for item in data:
#     visited_urls.add(item['url'])
#     visited_urls.add(item['parent_url'])

# Sort the URLs.
sorted_urls = sorted(visited_urls)

# Write the sorted URLs back, one per line.
with open(output_file, 'w', encoding='utf-8') as f:
    f.write('\n'.join(sorted_urls))

print(f"已將排序後的 URL 寫入 {output_file} 檔案。")

已將排序後的 URL 寫入 Result/玉山銀行/visited_url.txt 檔案。
