In [None]:
import requests
from bs4 import BeautifulSoup
import sqlite3
import time
import re
import sqlite3
import pandas as pd

In [None]:
# Invisible bidirectional control characters (LEFT-TO-RIGHT MARK / RIGHT-TO-LEFT MARK).
# str.strip() does not treat them as whitespace, so they are removed explicitly.
_BIDI_MARKS = ("\u200e", "\u200f")

# Matches a bullet value that is a page count, such as "320 pages".
_PAGE_COUNT_RE = re.compile(r"^[\d,]+\s+pages\b", re.IGNORECASE)


def _split_detail_bullet(text):
    """Split one "Label : value" bullet into a cleaned ``(label, value)`` pair.

    Only the FIRST colon separates label from value, so a value that itself
    contains a colon is kept whole. Returns an empty value when there is no colon.
    """
    for mark in _BIDI_MARKS:
        text = text.replace(mark, "")
    label, separator, value = text.partition(":")
    if not separator:
        return text.strip(), ""
    return label.strip(), value.strip()


def _parse_detail_bullets(details):
    """Collect the product-detail fields from the detail-bullets container.

    ``details`` is the BeautifulSoup element holding the ``<li>`` bullets.
    Returns a dict of strings; fields that are not found stay empty.
    """
    fields = {
        "publisher": "",
        "publication_date": "",
        "language": "",
        "pages": "",
        "isbn10": "",
        "isbn13": "",
        "dimensions": "",
    }
    for li in details.find_all("li"):
        label, value = _split_detail_bullet(li.get_text(strip=True))
        if not value:
            # Nothing to store; this also avoids indexing into an empty token list below.
            continue
        # Keywords are matched against the label only, so a value that happens
        # to contain e.g. "Language" cannot be misfiled.
        if "Publisher" in label:
            fields["publisher"] = value
        elif "Publication date" in label:
            fields["publication_date"] = value
        elif "Language" in label:
            fields["language"] = value
        elif "ISBN-10" in label:
            fields["isbn10"] = value
        elif "ISBN-13" in label:
            fields["isbn13"] = value
        elif "Dimensions" in label:
            fields["dimensions"] = value
        elif "Tankobon Hardcover" in label or _PAGE_COUNT_RE.match(value):
            # The value looks like "320 pages"; keep only the leading number.
            # Accepting any "<n> pages" value also covers formats other than
            # "Tankobon Hardcover".
            fields["pages"] = value.split()[0]
    return fields


def get_book_details(url, timeout=10):
    """Fetch a product detail page and extract its book metadata.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Absolute URL of the product detail page.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A 10-tuple of strings:
        ``(description, publisher, publication_date, language, pages,
        isbn10, isbn13, dimensions, bestseller_rank, customer_reviews)``.
        Fields that cannot be found are empty strings, except the description,
        which falls back to the placeholder "설명 없음" (Korean for "no description").

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    # Book description
    description = soup.find("div", {"id": "bookDescription_feature_div"})
    description_text = description.get_text(strip=True) if description else "설명 없음"

    # Product Details bullets
    details = soup.find("div", {"id": "detailBullets_feature_div"})
    fields = _parse_detail_bullets(details) if details else {}

    # Bestseller rank: first "#<number> in <category>" occurrence; only the
    # number (without the leading '#') is kept.
    bestseller_rank = ""
    bestseller_div = soup.find("div", {"id": "detailBulletsWrapper_feature_div"})
    if bestseller_div:
        match = re.search(r'#([\d,]+) in [\w\s]+', bestseller_div.get_text())
        if match:
            bestseller_rank = match.group(1)

    # Customer reviews: text of the first matching span on the page.
    customer_reviews = ""
    reviews = soup.find("span", {"class": "a-size-base a-color-base"})
    if reviews:
        customer_reviews = reviews.get_text(strip=True)

    return (
        description_text,
        fields.get("publisher", ""),
        fields.get("publication_date", ""),
        fields.get("language", ""),
        fields.get("pages", ""),
        fields.get("isbn10", ""),
        fields.get("isbn13", ""),
        fields.get("dimensions", ""),
        bestseller_rank,
        customer_reviews,
    )

# --- Configuration -----------------------------------------------------------
BASE_URL = "https://www.amazon.co.jp"
REQUEST_TIMEOUT_SECONDS = 10   # give up on a stalled request instead of hanging
REQUEST_DELAY_SECONDS = 2      # pause between detail requests to go easy on the server

# Search-results page to scrape.
url = "https://www.amazon.co.jp/s?k=django&crid=31VCMPC10DQQ2&sprefix=%2Caps%2C148&ref=nb_sb_ss_recent_1_0_recent"

# Browser-like User-Agent sent with every request (get_book_details reads this too).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

CREATE_BOOKS_TABLE_SQL = '''
    CREATE TABLE IF NOT EXISTS books (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT,
        price TEXT,
        detail_url TEXT,
        description TEXT,
        publisher TEXT,
        publication_date TEXT,
        language TEXT,
        pages TEXT,
        isbn10 TEXT,
        isbn13 TEXT,
        dimensions TEXT,
        bestseller_rank TEXT,
        customer_reviews TEXT
    )
'''

INSERT_BOOK_SQL = '''
    INSERT INTO books (title, price, detail_url, description, publisher,
    publication_date, language, pages, isbn10, isbn13, dimensions,
    bestseller_rank, customer_reviews)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
'''

# --- SQLite database setup ---------------------------------------------------
conn = sqlite3.connect('books.db')
try:
    c = conn.cursor()

    # Create the table if it does not exist yet.
    c.execute(CREATE_BOOKS_TABLE_SQL)
    conn.commit()

    # --- Fetch and parse the search-results page -----------------------------
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    soup = BeautifulSoup(response.content, "html.parser")

    products = soup.find_all("div", {"data-component-type": "s-search-result"})
    if not products:
        # Make an empty result visible instead of silently reporting success.
        print(f"No search results found (HTTP status {response.status_code}).")

    for product in products:
        title_element = product.find("span", {"class": "a-size-base-plus a-color-base a-text-normal"})
        price_element = product.find("span", {"class": "a-price-whole"})
        link_element = product.find("a", {"class": "a-link-normal s-no-outline"})

        # Skip results that lack a title, a price, or a link.
        if not (title_element and price_element and link_element):
            continue

        href = link_element.get("href")
        if not href:
            # An anchor without href would make the URL concatenation fail.
            continue

        title = title_element.text.strip()
        price = price_element.text.strip()
        # Relative links get the site prefix; absolute links are used as-is.
        detail_url = href if href.startswith("http") else BASE_URL + href

        print(f"Processing: {title}")

        # Fetch the extra fields from the detail page. A network failure on one
        # book is reported and skipped so the remaining results are still saved.
        try:
            details = get_book_details(detail_url)
        except requests.RequestException as exc:
            print(f"Skipping {title}: {exc}")
            time.sleep(REQUEST_DELAY_SECONDS)
            continue

        # Store the row; committing per row keeps earlier rows if a later one fails.
        c.execute(INSERT_BOOK_SQL, (title, price, detail_url) + details)
        conn.commit()

        # Delay between requests so the server is not overloaded.
        time.sleep(REQUEST_DELAY_SECONDS)
finally:
    # Close the database connection even if scraping raised.
    conn.close()

print("Data collection completed and saved to the database.")

In [None]:
# Open the SQLite database.
conn = sqlite3.connect('books.db')
try:
    # Run the SQL query and convert the result to a DataFrame.
    df = pd.read_sql_query("SELECT * FROM books", conn)
finally:
    # Close the connection even if the query raises.
    conn.close()

In [None]:
# Inspect the DataFrame: print its (rows, columns) shape, then preview the
# first rows instead of rendering the whole table as cell output.
print(df.shape)
df.head()

In [None]:
# Description text of the second row (positional index 1).
df["description"].iloc[1]