# In [8]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from urllib.parse import urljoin
import os
import httpx
import ssl
import re

# SSL configuration
# Shared TLS context used for the image downloads. It starts from the default
# (certificate-verifying) context and additionally sets OpenSSL's
# "legacy server connect" option.
# NOTE(review): this option relaxes a TLS renegotiation safety check; confirm
# the target server still needs it.
# ssl.OP_LEGACY_SERVER_CONNECT only exists on Python 3.12+, so fall back to the
# raw OpenSSL flag value (0x4) to keep the module importable on older versions.
ssl_context = ssl.create_default_context()
ssl_context.options |= getattr(ssl, "OP_LEGACY_SERVER_CONNECT", 0x4)

def clean_filename(filename):
    """Return *filename* with characters not allowed in file names replaced.

    Every occurrence of ``< > : " / \\ | ? *`` becomes an underscore; all
    other characters are kept unchanged.
    """
    forbidden = '<>:"/\\|?*'
    return "".join("_" if ch in forbidden else ch for ch in filename)

class BookScraper:
    """Scrape book listings from books.toscrape.com and render them as HTML.

    Each scraping method starts its own headless Chrome session via
    ``create_driver`` and quits it when the method finishes.
    """

    # Maps the word found in a "star-rating <Word>" class attribute to a number.
    _RATING_MAP = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}

    def __init__(self):
        """Set the site's base URL and the Chrome options used by every driver."""
        self.base_url = "https://books.toscrape.com/"
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--headless')  # run without a visible window
        self.options.add_argument('--no-sandbox')
        self.options.add_argument('--disable-dev-shm-usage')

    def create_driver(self):
        """Return a new Chrome WebDriver configured with ``self.options``."""
        return webdriver.Chrome(options=self.options)

    # Collect category information
    def scrape_categories(self):
        """Return the categories listed in the site's side navigation.

        Returns:
            A list of ``{"category": <name>, "link": <absolute URL>}`` dicts.
        """
        driver = self.create_driver()
        try:
            driver.get(self.base_url)
            links = driver.find_elements(By.CSS_SELECTOR, "ul.nav.nav-list li ul li a")
            return [
                {
                    "category": link.text.strip(),
                    "link": urljoin(self.base_url, link.get_attribute("href")),
                }
                for link in links
            ]
        finally:
            driver.quit()

    @classmethod
    def _parse_rating(cls, class_attr):
        """Return the 1-5 rating named in a star-rating class attribute, or 0.

        Every whitespace-separated token is checked, so the result does not
        depend on where the rating word sits in the attribute.
        """
        for token in (class_attr or "").split():
            if token in cls._RATING_MAP:
                return cls._RATING_MAP[token]
        return 0

    @staticmethod
    def _download_image(client, img_url, image_path):
        """Fetch ``img_url`` with ``client``; write it to ``image_path`` on HTTP 200."""
        response = client.get(img_url)
        if response.status_code == 200:
            with open(image_path, "wb") as f:
                f.write(response.content)

    def _scrape_book(self, book, client, image_path):
        """Read one product element, save its thumbnail, and return its record."""
        title = book.find_element(By.CSS_SELECTOR, "h3 a").get_attribute("title")
        price = book.find_element(By.CSS_SELECTOR, ".price_color").text
        rating_class = book.find_element(By.CSS_SELECTOR, "p.star-rating").get_attribute("class")
        img_src = book.find_element(By.CSS_SELECTOR, "img.thumbnail").get_attribute("src")

        self._download_image(client, urljoin(self.base_url, img_src), image_path)

        return {
            "title": title,
            "price": price,
            "rating": self._parse_rating(rating_class),
            "image_path": image_path,
        }

    # Collect book information and images for each category
    def scrape_books(self, categories):
        """Scrape every book (and its thumbnail) for the given categories.

        Thumbnails are written to ``images/<category>/<category>.<n>.jpg``,
        where ``n`` counts books within the category starting at 1.

        Args:
            categories: Dicts with ``"category"`` and ``"link"`` keys, as
                returned by ``scrape_categories``.

        Returns:
            A list of ``{"category": <sanitized name>, "books": [...]}`` dicts.
            Each book dict has ``title``, ``price``, ``rating`` and
            ``image_path`` keys.
        """
        driver = self.create_driver()
        all_books = []

        try:
            # One HTTP client for all downloads so connections are reused.
            with httpx.Client(verify=ssl_context) as client:
                for cat in categories:
                    category_name = clean_filename(cat['category'])
                    category_books = []

                    # Create the image folder for this category
                    image_dir = os.path.join("images", category_name)
                    os.makedirs(image_dir, exist_ok=True)

                    driver.get(cat['link'])

                    while True:
                        for book in driver.find_elements(By.CSS_SELECTOR, "article.product_pod"):
                            # 1-based position of the book within its category
                            book_index = len(category_books) + 1
                            image_path = os.path.join(
                                image_dir, f"{category_name}.{book_index}.jpg"
                            )
                            category_books.append(
                                self._scrape_book(book, client, image_path)
                            )

                        # Move to the next page. find_elements returns an empty
                        # list on the last page, so no exception is needed to
                        # detect the end and real errors are not swallowed.
                        next_links = driver.find_elements(By.CSS_SELECTOR, "li.next a")
                        if not next_links:
                            break
                        next_link = next_links[0].get_attribute("href")
                        if not next_link:
                            break
                        driver.get(next_link)

                    # Store the books collected for this category
                    all_books.append({
                        "category": category_name,
                        "books": category_books,
                    })
        finally:
            driver.quit()
        return all_books

    # Generate HTML
    def generate_html(self, books_data, output_file="books.html"):
        """Write an HTML page listing the scraped books grouped by category.

        All scraped values are HTML-escaped before being inserted, so titles
        containing quotes, ``&`` or ``<`` cannot break the markup.

        Args:
            books_data: The structure returned by ``scrape_books``.
            output_file: Path of the HTML file to write (UTF-8).
        """
        from html import escape  # local import keeps the class self-contained

        def esc(value):
            # str() first so non-string values (e.g. None) render as before
            return escape(str(value))

        parts = ["""
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>Books by Category</title>
            <style>
                body { font-family: Arial, sans-serif; margin: 20px; }
                .category { margin-bottom: 40px; }
                .book { display: flex; align-items: center; margin-bottom: 15px; }
                .book img { width: 100px; height: auto; margin-right: 15px; }
                .book-info { max-width: 500px; }
                .book-info h3 { margin: 0; font-size: 1.2em; }
                .book-info p { margin: 5px 0; }
            </style>
        </head>
        <body>
            <h1>Books by Category</h1>
        """]

        for category_data in books_data:
            category = esc(category_data['category'])
            parts.append(f"<div class='category'><h2>{category}</h2>")
            for book in category_data['books']:
                title = esc(book['title'])
                price = esc(book['price'])
                image_path = esc(book['image_path'])
                stars = '⭐' * book['rating']
                parts.append(f"""
                <div class='book'>
                    <img src="{image_path}" alt="{title}">
                    <div class='book-info'>
                        <h3>{title}</h3>
                        <p>Price: {price}</p>
                        <p>Rating: {stars}</p>
                    </div>
                </div>
                """)
            parts.append("</div>")

        parts.append("""
        </body>
        </html>
        """)

        with open(output_file, "w", encoding="utf-8") as f:
            f.write("".join(parts))
        print(f"HTML 파일 저장 완료: {output_file}")

# Run the full pipeline: scrape categories, scrape books, write the HTML page.
if __name__ == "__main__":
    scraper = BookScraper()
    # The category list drives the per-category book scrape below.
    categories = scraper.scrape_categories()
    books_data = scraper.scrape_books(categories)
    # No output path is passed, so the default "books.html" is used.
    scraper.generate_html(books_data)


# Output: HTML 파일 저장 완료: books.html
