[컴퓨터 공학 - 예스24](https://www.yes24.com/24/Category/Display/001001003031?FetchSize=100&PageNumber=71)

In [None]:
import requests
from bs4 import BeautifulSoup
import sqlite3
import pandas as pd
import time
import random
from requests.exceptions import RequestException
import re

def retry_request(url, max_retries=5, base_delay=1):
    """Fetch ``url`` with GET, retrying failed attempts with exponential backoff.

    Args:
        url: Address to request.
        max_retries: Total number of attempts made before giving up.
        base_delay: Base wait in seconds; it is doubled after every failed
            attempt and up to one second of random jitter is added.

    Returns:
        The response of the first attempt that succeeds. HTTP error statuses
        count as failures because ``raise_for_status`` is called. ``None``
        falls out implicitly when ``max_retries`` is zero or negative.

    Raises:
        RequestException: Re-raised from the final attempt when all fail.
    """
    attempt = 0
    while attempt < max_retries:
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except RequestException as e:
            out_of_attempts = attempt == max_retries - 1
            if out_of_attempts:
                raise e
            # Exponential backoff with jitter so retries do not hit the
            # server at a fixed rhythm.
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            print(f"Request failed. Retrying in {delay:.2f} seconds...")
            time.sleep(delay)
        else:
            return response
        attempt += 1

def get_last_page(url):
    """Return the number of the last listing page advertised on ``url``.

    The page is fetched through ``retry_request`` so that this call gets the
    same timeout, HTTP status check and backoff as the per-page scraping
    requests, instead of a bare ``requests.get`` that can hang indefinitely
    or silently parse an error page.

    Args:
        url: URL of any page of the listing (normally the first one).

    Returns:
        The page number taken from the pager's "end" link when present,
        otherwise the largest numeric page link found, otherwise 1.

    Raises:
        RequestException: If the page cannot be fetched after all retries.
    """
    response = retry_request(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Preferred source: the pager's "end" button, whose href carries the
    # final PageNumber value.
    end_button = soup.select_one('.yesUI_pagenS a.bgYUI.end')
    if end_button:
        match = re.search(r'PageNumber=(\d+)', end_button.get('href', ''))
        if match:
            return int(match.group(1))

    # Fallback: no usable "end" button, so take the largest numbered page
    # link. Labels are stripped first so surrounding whitespace does not make
    # a real page number fail the digit check.
    labels = (a.get_text(strip=True) for a in soup.select('.yesUI_pagenS a.num'))
    page_numbers = [int(label) for label in labels if label.isdecimal()]
    return max(page_numbers, default=1)


def _text_or_na(node, selector):
    """Return the stripped text of the first ``selector`` match under ``node``, or 'N/A'."""
    found = node.select_one(selector)
    return found.text.strip() if found else 'N/A'


def scrape_page(url):
    """Scrape one listing page and return one tuple per book.

    Args:
        url: URL of the listing page to fetch.

    Returns:
        A list of 10-tuples ``(title, author, publisher, pub_date, price,
        discount, rating, review_count, detail_url, description)``. Every
        element is a string; a field whose element is missing from the page
        is reported as ``'N/A'``.

    Raises:
        RequestException: If the page cannot be fetched after all retries.
    """
    response = retry_request(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    data = []
    for book in soup.select('.goods_info'):
        # The title anchor supplies both the title text and the detail link,
        # so look it up once.
        title_link = book.select_one('.goods_name a')
        title = title_link.text.strip() if title_link else 'N/A'
        # An anchor without an href must not abort the whole page.
        href = title_link.get('href') if title_link else None
        detail_url = 'https://www.yes24.com' + href if href else 'N/A'

        data.append((
            title,
            _text_or_na(book, '.goods_auth a'),
            _text_or_na(book, '.goods_pub'),
            _text_or_na(book, '.goods_date'),
            _text_or_na(book, '.goods_price .yes_b'),
            _text_or_na(book, '.goods_benefit'),
            _text_or_na(book, '.gd_rating .yes_b'),
            _text_or_na(book, '.gd_reviewCount .txC_blue'),
            detail_url,
            _text_or_na(book, '.goods_read'),
        ))

    return data

In [None]:
def main(cat_id):
    """Scrape every listing page of one category into ``yes24_books.db``.

    Determines the page count, creates the ``books`` table if needed, scrapes
    each page (committing page by page so earlier pages survive a later
    failure), then prints the total row count and the first rows of the table.

    Args:
        cat_id: Category identifier interpolated into the listing URL.

    Raises:
        RequestException: If the first page cannot be fetched to determine
            the page count. Errors on individual pages are printed and
            skipped instead.
    """
    # Only cat_id is interpolated here; the doubled braces leave a literal
    # "{}" placeholder that str.format fills with the page number below.
    # (A bare "{}" inside an f-string is a syntax error.)
    base_url = f"https://www.yes24.com/24/Category/Display/{cat_id}?FetchSize=300&PageNumber={{}}"

    # Get total number of pages
    total_pages = get_last_page(base_url.format(1))
    print(f"Total pages: {total_pages}")

    conn = sqlite3.connect('yes24_books.db')
    try:
        # Create the table on first use
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS books
                      (title TEXT, author TEXT, publisher TEXT, pub_date TEXT, price TEXT, discount TEXT, 
                      rating TEXT, review_count TEXT, detail_url TEXT, description TEXT)''')

        # Scrape all pages and insert data into the database
        for page in range(1, total_pages + 1):
            url = base_url.format(page)
            try:
                books_data = scrape_page(url)
                cursor.executemany('INSERT INTO books VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', books_data)
                conn.commit()
                print(f"Page {page} scraped and saved.")
            except Exception as e:
                # Best effort: report the page and move on. Roll back so a
                # half-inserted page is not committed with the next one.
                conn.rollback()
                print(f"Error occurred while scraping page {page}: {str(e)}")
            time.sleep(random.uniform(1, 3))  # Random wait time between 1 and 3 seconds

        # Read the data into a pandas DataFrame
        df = pd.read_sql_query("SELECT * FROM books", conn)
    finally:
        # Close the database connection even if something above raised
        conn.close()

    # Print the number of records and first few rows
    print(f"Total number of records: {len(df)}")
    print("\nFirst few rows of the data:")
    print(df.head())

# Category display names mapped to the category IDs that main() places in
# the listing URL.
category_dict = {
    "컴퓨터공학": "001001003031",
    "프로그래밍 언어": "001001003022",
    "인공지능": "001001003032",
}

# Scrape the categories one after another, in insertion order.
for cat_id in category_dict.values():
    main(cat_id)

In [None]:
# Load every row of the books table into a DataFrame for analysis.
conn = sqlite3.connect('yes24_books.db')
try:
    df = pd.read_sql_query("SELECT * FROM books", conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

In [None]:
# Display the loaded DataFrame as the cell output.
df

In [None]:
# Show the (rows, columns) dimensions of the DataFrame.
df.shape

In [None]:
# Remove thousands separators, then convert to numbers. Values that still do
# not parse (such as the 'N/A' placeholder) become NaN via errors="coerce".
review_text = df["review_count"].str.replace(",", "")
df["review_count"] = pd.to_numeric(review_text, errors="coerce")

In [None]:
# Show the 10 rows with the largest review_count values.
df.nlargest(10, "review_count")

In [None]:
# Display the DataFrame again as the cell output.
df