In [1]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import pandas as pd
import os
from datetime import datetime
import signal
import sys
import urllib.parse

class CteeScraper:
    def __init__(self, max_pages=10, stop_url=None):
        # 設定 Chrome 選項
        self.chrome_options = uc.ChromeOptions()
        self.chrome_options.add_argument('--disable-gpu')
        self.chrome_options.add_argument('--no-sandbox')
        self.chrome_options.add_argument('--disable-dev-shm-usage')

        # 初始化瀏覽器
        self.driver = uc.Chrome(
            options=self.chrome_options
        )
        
        # 設定最大頁數
        self.max_pages = max_pages
        
        # 要中斷的指定網址
        self.stop_url = stop_url
        
        # 是否停止爬取的標誌
        self.stop_scraping = False

    def is_same_domain(self, url1, url2):
        """
        比較兩個 URL 是否為相同網域
        """
        if not url1 or not url2:
            return False
        
        # 解析 URL
        parsed_url1 = urllib.parse.urlparse(url1)
        parsed_url2 = urllib.parse.urlparse(url2)
        
        # 比較網域和路徑
        return (parsed_url1.netloc == parsed_url2.netloc and 
                parsed_url1.path == parsed_url2.path)

    def get_news(self, url='https://www.ctee.com.tw/livenews/stock'):
        try:
            # 當前日期時間
            current_time = datetime.now()
            timestamp = current_time.strftime("%Y%m%d-%H")
            
            # 準備CSV檔案名稱
            csv_filename = f'ctee_news_{timestamp}.csv'
            
            # 初始化資料列表
            news_data = []

            # 訪問網頁
            self.driver.get(url)
            time.sleep(2)  # 等待頁面載入

            for page in range(self.max_pages):
                if self.stop_scraping:
                    break

                # 等待新聞列表載入
                news_elements = WebDriverWait(self.driver, 4).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.newslist__card'))
                )

                # 解析新聞資料
                current_page_news = []
                for news in news_elements:
                    try:
                        title = news.find_element(By.CSS_SELECTOR, 'h3.news-title').text
                        time_date = news.find_element(By.CSS_SELECTOR, '.news-date').text
                        time_text = news.find_element(By.CSS_SELECTOR, '.news-time').text
                        link = news.find_element(By.CSS_SELECTOR, 'h3.news-title a').get_attribute('href')

                        current_page_news.append({
                            'date': time_date,
                            'time': time_text,
                            'title': title,
                            'link': link
                        })
                         # 如果指定了停止的網址，並且連結匹配，則中斷
                        if self.stop_url and self.is_same_domain(link, self.stop_url):
                            print(f"找到匹配的網址: {link}")
                            self.stop_scraping = True
                            break
                    except Exception as e:
                        print(f"解析單則新聞時發生錯誤: {str(e)}")
                        continue

                # 將當前頁面的新聞附加到總資料列表
                news_data.extend(current_page_news)

                # 立即寫入CSV（追加模式）
                df = pd.DataFrame(current_page_news)
                # 如果是第一頁，創建新檔案；否則追加
                mode = 'w' #if page == 0 else 'a'
                header = page == 0
                df.to_csv(csv_filename, mode=mode, header=header, index=False, encoding='utf-8-sig')
                
                print(f"第 {page + 1} 頁資料已寫入 {csv_filename}")
                # 檢查是否需要中斷
                if self.stop_scraping:
                    break
                # 檢查是否還有「載入更多」按鈕
                try:
                    # 滾動到頁面底部
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(2)  # 等待頁面滾動

                    # 點擊 "載入更多" 按鈕
                    load_more_button = WebDriverWait(self.driver, 10).until(
                        EC.element_to_be_clickable((By.ID, 'moreBtn'))
                    )
                    
                    # 使用 JavaScript 點擊
                    self.driver.execute_script("arguments[0].click();", load_more_button)
                    time.sleep(3)  # 等待新資料載入

                except Exception as e:
                    print("無法載入更多內容，可能已到達最後一頁")
                    break

            print(f"爬取完成，共 {len(news_data)} 筆新聞已寫入 {csv_filename}")
            return news_data

        except Exception as e:
            print(f"爬取過程發生錯誤: {str(e)}")
            return None

        finally:
            self.driver.quit()

# 執行爬蟲
if __name__ == "__main__":
    # 範例：如果想要在找到特定網址時中斷
    stop_url = "https://www.ctee.com.tw/news/20241204700896-430201"
    scraper = CteeScraper(max_pages=10, stop_url=stop_url)
    news_data = scraper.get_news()

第 1 頁資料已寫入 ctee_news_20241205-09.csv
第 2 頁資料已寫入 ctee_news_20241205-09.csv
找到匹配的網址: https://www.ctee.com.tw/news/20241204700896-430201
第 3 頁資料已寫入 ctee_news_20241205-09.csv
爬取完成，共 151 筆新聞已寫入 ctee_news_20241205-09.csv


: 