# In [14]:  (Jupyter notebook cell marker)
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from twisted.internet import reactor
import json
from crochet import setup, wait_for
from concurrent.futures import ThreadPoolExecutor

# Initialise crochet
setup()

class EnhancedSpider(scrapy.Spider):
    """Spider that records the text, title and pagination links of each page.

    Crawling starts at ``start_urls``; every response produces one item and
    the crawl continues through the ``.next`` pagination link for as long as
    one is found.
    """

    name = 'enhanced_spider'
    # Replace with the URL of the site you want to crawl.
    start_urls = ['https://www.esunbank.com/zh-tw/personal/credit-card/discount/shops']

    def parse(self, response):
        """Yield one item for ``response``, then a request for the next page.

        The item is a dict with the keys ``url``, ``title``,
        ``text_content``, ``pagination`` and ``page_data``. ``title`` and the
        pagination entries are ``None`` when the selector matches nothing.
        """
        # Text nodes inside <script> and <style> are source code, not
        # readable content, so they are left out of the page text.
        text_nodes = response.xpath(
            '//body//text()[not(ancestor::script) and not(ancestor::style)]'
        ).getall()
        # split() + join collapses the indentation and blank lines that come
        # from the HTML layout into single spaces.
        all_text = ' '.join(' '.join(text_nodes).split())

        # Title of the current page.
        title = response.css('title::text').get()

        # Pagination links of the current page.
        pagination = {
            'first': response.css('.first::attr(href)').get(),
            'prev': response.css('.prev::attr(href)').get(),
            'next': response.css('.next::attr(href)').get(),
            'last': response.css('.last::attr(href)').get()
        }

        # Page-specific data.
        page_data = self.extract_page_data(response)

        # One item holding everything collected for this page.
        yield {
            'url': response.url,
            'title': title,
            'text_content': all_text,
            'pagination': pagination,
            'page_data': page_data
        }

        # Keep crawling while the page advertises a next page.
        next_page = pagination['next']
        if next_page:
            yield response.follow(next_page, self.parse)

    def extract_page_data(self, response):
        """Return the page-specific data of ``response`` as a dict.

        Currently this is only the direct text of every ``<p>`` element,
        under the key ``paragraphs``. Adapt it to the structure of the
        target site and add further fields as needed.
        """
        paragraphs = response.css('p::text').getall()
        return {
            'paragraphs': paragraphs
        }

# Configure logging
configure_logging()

# Module-level list that stores the scraped items
results = []

from scrapy.signalmanager import dispatcher


def collect_results(item, response, spider):
    """Signal handler that stores one scraped item in ``results``.

    ``response`` and ``spider`` are part of the handler signature but are
    not used here.
    """
    results.append(item)


# Call collect_results whenever the item_scraped signal is sent.
dispatcher.connect(collect_results, scrapy.signals.item_scraped)

@wait_for(600)  # block the caller for at most 600 seconds
def run_spider():
    """Run ``EnhancedSpider`` to completion on the crochet-managed reactor.

    crochet owns the Twisted reactor once ``setup()`` has been called, and
    ``wait_for`` executes this function on it. The reactor must therefore be
    neither started nor stopped here: starting it again is an error, and
    stopping it would leave every later call in the same process waiting
    until its timeout. Returning the crawl Deferred is enough, because
    ``wait_for`` blocks the caller until that Deferred fires.

    Returns:
        The Deferred from ``CrawlerRunner.crawl``. Callers of the decorated
        function receive its result once the crawl has finished.

    Raises:
        crochet.TimeoutError: if the crawl takes longer than 600 seconds.
    """
    runner = CrawlerRunner(get_project_settings())
    return runner.crawl(EnhancedSpider)

# Run the spider through a single-worker ThreadPoolExecutor
def execute_spider():
    """Call ``run_spider`` in a worker thread and return its result.

    Blocks until ``run_spider`` has returned; an exception raised by it is
    re-raised here.
    """
    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(run_spider).result()

# Run the crawl; execution continues once execute_spider() returns
execute_spider()

# Write the collected items to a JSON file, keeping non-ASCII text readable
with open('enhanced_output.json', mode='w', encoding='utf-8') as out_file:
    json.dump(results, out_file, indent=4, ensure_ascii=False)

print("爬蟲完成，結果已保存到 enhanced_output.json 文件中")

# 顯示前幾個結果的部分內容
# print("\n前3個結果的摘要:")
# for item in results[:3]:
#     print(f"URL: {item['url']}")
#     print(f"Title: {item['title']}")
#     print(f"Pagination: {item['pagination']}")
#     print(f"Text content (first 100 characters): {item['text_content'][:100]}...")
#     print(f"Page data: {item['page_data']}")
#     print("---")


# Output recorded from the notebook run: TimeoutError: