In [None]:
import threading
import requests
import queue
import csv
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm.notebook import tqdm

# ===== Configuration =====
URL_CSV = "1w_article_count.csv"             # input CSV; needs "article_url" and "count" columns
OUTPUT_CSV = "text_1w_morethan30reads.csv"   # output CSV of (article_url, text) rows
ERROR_LOG = "error_urls.csv"                 # output CSV of (url, stage, error) rows
MAX_FETCH_THREADS = 5
MAX_PARSE_THREADS = 5
HEADERS = {"User-Agent": "Mozilla/5.0"}

# ===== Load the URL list and create the pipeline queues =====
# Only rows whose "count" value is at least 30 are kept.
df = pd.read_csv(URL_CSV)
df = df[df["count"] >= 30]

fetch_queue = queue.Queue()    # URLs waiting to be downloaded
parse_queue = queue.Queue()    # (url, html) pairs waiting to be parsed
writer_queue = queue.Queue()   # (url, text) pairs waiting to be written
error_list = []                # (url, stage, message) tuples collected by the workers

for url in df["article_url"]:
    fetch_queue.put(url)

# ===== Progress bars, one per pipeline stage =====
# The lock serialises progress-bar updates coming from the worker threads.
progress_lock = threading.Lock()
fetch_pbar, parse_pbar, write_pbar = (
    tqdm(total=fetch_queue.qsize(), desc=label, position=row)
    for row, label in enumerate(("Fetched", "Parsed", "Written"))
)

# ===== Fetch stage =====
def fetcher():
    """Download queued URLs and hand each page's HTML to the parse stage.

    Worker loop for a fetch thread.  URLs are taken from ``fetch_queue``
    until the queue has stayed empty for 3 seconds, at which point the
    worker returns.  Each successful response is put on ``parse_queue`` as
    ``(url, html)``.  Any failure (network error, timeout, or an HTTP
    4xx/5xx status) is printed and appended to ``error_list`` as
    ``(url, 'fetch', message)``.  The "Fetched" progress bar advances for
    every URL taken from the queue, whether or not the download succeeded.
    """
    while True:
        try:
            url = fetch_queue.get(timeout=3)
        except queue.Empty:
            break
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            # Treat HTTP error statuses as fetch failures; otherwise the
            # error page would be parsed as if it were the article and end
            # up as a (usually empty) row in the output.
            response.raise_for_status()
            parse_queue.put((url, response.text))
        except Exception as e:
            # Broad on purpose: one bad URL must not kill the worker thread.
            print(f"[抓取失敗] {url}: {e}")
            error_list.append((url, 'fetch', str(e)))
        finally:
            with progress_lock:
                fetch_pbar.update(1)
            fetch_queue.task_done()

# ===== Parse stage =====
def parser():
    """Extract article text from fetched HTML and pass it to the write stage.

    Worker loop for a parse thread.  ``(url, html)`` pairs are taken from
    ``parse_queue``; the worker returns once the queue has stayed empty for
    30 seconds.  The text of the element with ``id="article_page"`` is put
    on ``writer_queue`` as ``(url, text)``; when no such element exists the
    text is an empty string.  Parse failures are printed and appended to
    ``error_list`` as ``(url, 'parse', message)``.
    """
    while True:
        try:
            item = parse_queue.get(timeout=30)
        except queue.Empty:
            print("Parse queue is empty.")
            return

        url, html = item
        try:
            node = BeautifulSoup(html, 'html.parser').find(id='article_page')
            if node:
                text = node.get_text(separator='\n', strip=True)
            else:
                text = ''
            writer_queue.put((url, text))
        except Exception as exc:
            # Broad on purpose: one bad page must not kill the worker thread.
            print(f"[解析失敗] {url}: {exc}")
            error_list.append((url, 'parse', str(exc)))
        finally:
            # Count the item and mark it done whether or not parsing worked.
            with progress_lock:
                parse_pbar.update(1)
            parse_queue.task_done()

# ===== Write stage =====
def writer():
    """Write parsed ``(url, text)`` pairs to ``OUTPUT_CSV``.

    Opens ``OUTPUT_CSV`` for writing (replacing any existing file), emits
    the ``article_url,text`` header, then writes one row per item taken
    from ``writer_queue``.  The worker returns once the queue has stayed
    empty for 30 seconds.  A row that cannot be written is printed and
    appended to ``error_list`` as ``(url, 'write', message)`` instead of
    terminating the thread.
    """
    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(['article_url', 'text'])
        while True:
            # Keep the dequeue separate from the write so that only a real
            # timeout ends the loop.
            try:
                url, text = writer_queue.get(timeout=30)
            except queue.Empty:
                print("Writer queue is empty.")
                break
            try:
                csv_writer.writerow([url, text])
            except Exception as e:
                # Same handling as the fetch/parse stages: log and carry on.
                print(f"[寫入失敗] {url}: {e}")
                error_list.append((url, 'write', str(e)))
            finally:
                # task_done() must run for every dequeued item; otherwise a
                # later writer_queue.join() would block forever.
                with progress_lock:
                    write_pbar.update(1)
                writer_queue.task_done()

# ===== Error log =====
def write_errors():
    """Dump ``error_list`` to ``ERROR_LOG`` as CSV.

    Does nothing (and leaves any existing file untouched) when no errors
    were collected.  Otherwise the file is overwritten with a
    ``url,stage,error`` header followed by one row per collected error.
    """
    if not error_list:
        return
    with open(ERROR_LOG, 'w', newline='', encoding='utf-8') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerows([['url', 'stage', 'error'], *error_list])

# ===== Pipeline orchestration =====
if __name__ == '__main__':
    start_time = time.time()

    # Start every stage up front so fetching, parsing and writing overlap.
    fetch_threads = [threading.Thread(target=fetcher) for _ in range(MAX_FETCH_THREADS)]
    parse_threads = [threading.Thread(target=parser) for _ in range(MAX_PARSE_THREADS)]
    writer_thread = threading.Thread(target=writer)

    for t in fetch_threads + parse_threads:
        t.start()
    writer_thread.start()

    # Wait for the stages in pipeline order.  Each worker stops by itself
    # once its input queue has stayed empty for that stage's timeout.
    for t in fetch_threads + parse_threads:
        t.join()
    writer_thread.join()

    # All workers have exited, so Queue.join() could only return at once or
    # block forever (items left behind by a stage that timed out while an
    # upstream stage was still producing have no consumer any more).
    # Instead of joining the queues, drain whatever is left and record it
    # so the affected URLs show up in the error log.
    for stage, pending in (('parse', parse_queue), ('write', writer_queue)):
        while True:
            try:
                url, _ = pending.get_nowait()
            except queue.Empty:
                break
            message = f"{stage} worker exited before this item was processed"
            print(f"[unprocessed] {url}: {message}")
            error_list.append((url, stage, message))

    # Persist the collected errors.
    write_errors()

    fetch_pbar.close()
    parse_pbar.close()
    write_pbar.close()

    end_time = time.time()
    print(f"✅ 全部完成，用時 {end_time - start_time:.2f} 秒")