## Prompt

* 페이지별 도서 목록을 가져오고 상세페이지 URL 도 별도의 컬럼으로 sqlite 로 저장하도록 코드를 작성할 것 해당 항목당 페이지를 순회하며 수집하도록 코드를 작성할 것 저장한 데이터가 몇 개인지 확인하는 코드도 함께 작성해 주세요.

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import random
import sqlite3

def create_database():
    conn = sqlite3.connect('amazon_books.db')
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS books
                 (id INTEGER PRIMARY KEY AUTOINCREMENT,
                  language TEXT,
                  title TEXT,
                  author TEXT,
                  url TEXT,
                  description TEXT)''')
    conn.commit()
    return conn

def get_book_details(url, headers):
    try:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        description = soup.select_one('#bookDescription_feature_div')
        if description:
            description = description.text.strip()
        else:
            description = "No description available"
        
        return description
    except Exception as e:
        print(f"Error fetching book details: {e}")
        return "Description fetch failed"

def get_recommended_books(language, conn, pages=3):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
    }
    
    c = conn.cursor()
    
    for page in range(1, pages + 1):
        url = f"https://www.amazon.com/s?k={language}+books&i=stripbooks-intl-ship&page={page}"
        
        try:
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            items = soup.select('div.s-result-item[data-component-type="s-search-result"]')
            
            if not items:
                print(f"No items found on page {page} for {language}. Stopping.")
                break
            
            for item in items:
                title_element = item.select_one('h2 > a > span')
                if title_element:
                    title = title_element.text.strip()
                    book_url = "https://www.amazon.com" + item.select_one('h2 > a')['href']
                else:
                    continue
                
                author_element = item.select_one('div.a-row.a-size-base.a-color-secondary .a-row')
                author = author_element.text.strip() if author_element else "Unknown"
                
                description = get_book_details(book_url, headers)
                
                c.execute("INSERT INTO books (language, title, author, url, description) VALUES (?, ?, ?, ?, ?)",
                          (language, title, author, book_url, description))
                
                conn.commit()
                
                print(f"Added: {title} by {author}")
                
                time.sleep(random.uniform(2, 4))  # 랜덤 딜레이 추가
            
            print(f"Completed page {page} for {language}")
            time.sleep(random.uniform(3, 5))  # 페이지 간 추가 딜레이
        
        except Exception as e:
            print(f"Error on page {page} for {language}: {e}")
            break

def count_books(conn):
    c = conn.cursor()
    c.execute("SELECT COUNT(*) FROM books")
    return c.fetchone()[0]

def main():
    languages = ["chinese", "spanish", "french", "japanese"]
    conn = create_database()

    for lang in languages:
        print(f"\nCollecting Recommended {lang.capitalize()} Books:")
        get_recommended_books(lang, conn)
        time.sleep(random.uniform(3, 5))  # 언어 간 추가 딜레이

    total_books = count_books(conn)
    print(f"\nTotal books collected: {total_books}")

    conn.close()
    print("Data collection completed and stored in the database.")

if __name__ == "__main__":
    main()


Collecting Recommended Chinese Books:
Added: Chinese Stories for Language Learners: A Treasury of Proverbs and Folktales in Bilingual Chinese and English (Online Audio Recordings Included) by Part of: Stories for Language Learners (12 books)   | by Vivian Ling , Peng Wang , et al. | Mar 23, 2021
Added: You Are My Glory (Chinese Edition) by Chinese Edition | by Gu Man | May 1, 2019


KeyboardInterrupt: 

In [20]:
import pandas as pd

def connect_to_database():
    return sqlite3.connect('amazon_books.db')

def count_books_by_language(conn):
    cursor = conn.cursor()
    cursor.execute("""
        SELECT language, COUNT(*) as book_count
        FROM books
        GROUP BY language
        ORDER BY language
    """)
    return cursor.fetchall()


def fetch_books(conn):
    cursor = conn.cursor()
    cursor.execute("""
        SELECT *
        FROM books
    """)
    return cursor.fetchall()


def fetch_books_as_dataframe(conn):
    query = "SELECT * FROM books"
    df = pd.read_sql_query(query, conn)
    return df


def print_language_summary(language_counts):
    print("Book Count by Language:")
    print("-----------------------")
    for language, count in language_counts:
        print(f"{language.capitalize()}: {count}")
    print("-----------------------")
    total = sum(count for _, count in language_counts)
    print(f"Total Books: {total}")

In [21]:
conn = connect_to_database()
try:
    df = fetch_books_as_dataframe(conn)
    print("df.shape : ", df.shape)
    language_counts = count_books_by_language(conn)
    print_language_summary(language_counts)
except sqlite3.Error as e:
    print(f"An error occurred: {e}")
finally:
    conn.close()

df.shape :  (130, 6)
Book Count by Language:
-----------------------
Chinese: 50
French: 32
Spanish: 48
-----------------------
Total Books: 130


In [22]:
df

Unnamed: 0,id,language,title,author,url,description
0,1,chinese,Chinese Stories for Language Learners: A Treas...,Part of: Stories for Language Learners (12 boo...,https://www.amazon.com/Chinese-Stories-Languag...,The highly anticipated next book in Tuttle's S...
1,2,chinese,You Are My Glory (Chinese Edition),"Chinese Edition | by Gu Man | May 1, 2019",https://www.amazon.com/You-Are-My-Glory-Chines...,十年过去，乔晶晶意外的星光闪耀，高中拒绝过她的男神却似乎已经泯然众人…… 时光匆匆，你依旧在...
2,3,chinese,Learning Mandarin Chinese Characters Volume 1:...,Book 1 of 2: Learning Mandarin Chinese Charact...,https://www.amazon.com/Learning-Mandarin-Chine...,Reinforce your written Chinese with this pract...
3,4,chinese,Bilingual Chinese English books short stories ...,Simplified Chinese Edition | by Xiang Li | Dec...,https://www.amazon.com/Bilingual-Chinese-Engli...,这套绘本包括最经典的20本安徒生和格林童话故事，图片清晰生动，非常有利于吸引宝宝的注意力。这...
4,5,chinese,My First English-Chinese Learning Library: Bil...,Chinese Edition | by Wonder House Books | Oct...,https://www.amazon.com/First-English-Chinese-L...,A collection of 10 well-researched Bilingual B...
...,...,...,...,...,...,...
125,126,french,Arsène Lupin : L'édition complète (French Edit...,"French Edition | by Maurice Leblanc | Feb 22,...",https://www.amazon.com/Ars%C3%A8ne-Lupin-L%C3%...,No description available
126,127,french,French Long Stories: 5 French Long Stories for...,Book 3 of 3: French Stories for Beginners and ...,https://www.amazon.com/French-Long-Stories-Int...,No description available
127,128,french,French Country Cottage Christmas,"by Courtney Allison | Aug 29, 2023",https://www.amazon.com/Country-Cottage-Christm...,No description available
128,129,chinese,Chinese Stories for Language Learners: A Treas...,Part of: Stories for Language Learners (12 boo...,https://www.amazon.com/Chinese-Stories-Languag...,The highly anticipated next book in Tuttle's S...
