In [12]:
import requests
from bs4 import BeautifulSoup
import time

# Listing URL prefix for each Douban Top 250 category; the caller appends
# the numeric start offset of the requested page.
BASE_URLS = dict(
    book="https://book.douban.com/top250?start=",
    movie="https://movie.douban.com/top250?start=",
    music="https://music.douban.com/top250?start=",
)

# HTTP headers sent with every request: a desktop-browser User-Agent string.
_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
HEADERS = {"User-Agent": _USER_AGENT}

def fetch_info(category, timeout=10):
    """Scrape the Douban Top 250 list for ``category``.

    Ten pages of 25 entries each are requested, with a one-second pause
    after every page to avoid hammering the server.

    Args:
        category: Key of ``BASE_URLS`` ("book", "movie" or "music").
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``title``, ``rating``, ``author``,
        ``publish_info``, ``price`` and ``quote``; all values are strings.
        A field whose element is absent from an entry is an empty string.

    Raises:
        KeyError: If ``category`` is not a key of ``BASE_URLS``.
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    results = []
    base_url = BASE_URLS[category]
    for start in range(0, 250, 25):  # 25 entries per page
        response = requests.get(
            base_url + str(start), headers=HEADERS, timeout=timeout
        )
        # Surface blocked/error responses instead of silently parsing an
        # error page and returning an empty result.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        for item in soup.select(".item"):
            # NOTE(review): the sample output only shows the "book" list;
            # other categories may lack some of these elements, so every
            # lookup tolerates a missing node — confirm selectors per category.
            title_el = item.select_one("a[title]")
            rating_el = item.select_one(".rating_nums")
            details_el = item.select_one(".pl")
            quote_el = item.select_one(".inq")

            # The details line is " / "-separated: the first field is treated
            # as the author and the last as the price.
            details_text = details_el.text.strip() if details_el else ""
            details = details_text.split(" / ")

            results.append({
                "title": title_el["title"] if title_el else "",
                "rating": rating_el.text.strip() if rating_el else "",
                "author": details[0],
                # Everything between the author and the trailing price field.
                "publish_info": " / ".join(details[1:-1]),
                "price": details[-1] if len(details) > 1 else "",
                "quote": quote_el.text if quote_el else "",
            })

        time.sleep(1)  # pause between pages to avoid overly frequent requests
    return results

if __name__ == "__main__":
    # Normalize the answer so that input such as " Book " still matches a
    # BASE_URLS key, and fail with a readable message instead of a bare
    # KeyError raised from inside fetch_info.
    category = input("Enter category (book, movie, music): ").strip().lower()
    if category not in BASE_URLS:
        raise ValueError(
            f"Unknown category {category!r}; expected one of: "
            + ", ".join(BASE_URLS)
        )
    # `category` and `data` stay module-level: the export cell reads both.
    data = fetch_info(category)
    for item in data:
        print(item)


Enter category (book, movie, music): book
{'title': '红楼梦', 'rating': '9.6', 'author': '[清] 曹雪芹 著', 'publish_info': '人民文学出版社 / 1996-12', 'price': '59.70元', 'quote': '都云作者痴，谁解其中味？'}
{'title': '活着', 'rating': '9.4', 'author': '余华', 'publish_info': '作家出版社 / 2012-8-1', 'price': '20.00元', 'quote': '生的苦难与伟大'}
{'title': '1984', 'rating': '9.4', 'author': '[英] 乔治·奥威尔', 'publish_info': '刘绍铭 / 北京十月文艺出版社 / 2010-4-1', 'price': '28.00', 'quote': '栗树荫下，我出卖你，你出卖我'}
{'title': '哈利·波特', 'rating': '9.7', 'author': 'J.K.罗琳 (J.K.Rowling)', 'publish_info': '苏农 / 人民文学出版社 / 2008-12-1', 'price': '498.00元', 'quote': '从9¾站台开始的旅程'}
{'title': '三体全集', 'rating': '9.5', 'author': '刘慈欣', 'publish_info': '重庆出版社 / 2012-1', 'price': '168.00元', 'quote': '地球往事三部曲'}
{'title': '百年孤独', 'rating': '9.3', 'author': '[哥伦比亚] 加西亚·马尔克斯', 'publish_info': '范晔 / 南海出版公司 / 2011-6', 'price': '39.50元', 'quote': '魔幻现实主义文学代表作'}
{'title': '飘', 'rating': '9.3', 'author': '[美国] 玛格丽特·米切尔', 'publish_info': '李美华 / 译林出版社 / 2000-9', 'price': '40.00元'

In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

    
# Tabulate the scraped records and export them to "<category>_top250.xlsx"
# without the index column. Relies on `data` and `category` from the
# scraping cell.
douban_excel_path = f"{category}_top250.xlsx"
df = pd.DataFrame(data)
df.to_excel(douban_excel_path, index=False)
print(f"Data saved to {category}_top250.xlsx")


Data saved to book_top250.xlsx


In [24]:
import requests
from bs4 import BeautifulSoup
import time

# Goodreads list URL prefix; the 1-based page number is appended per request.
URL = "https://www.goodreads.com/list/show/19.Best_for_Book_Clubs?page="
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"
}
MAX_PAGES = 999       # safety cap, same upper bound as the former range(1, 1000)
REQUEST_TIMEOUT = 10  # seconds to wait for each response


def _parse_book_row(book_row):
    """Extract one book's fields from a ``<tr itemtype=".../Book">`` row.

    Returns a dict with the string keys ``title``, ``author``, ``rating``,
    ``num_ratings``, ``score`` and ``num_voters``.
    """
    # First token of the minirating text is taken as the rating; the first
    # token after the em dash is taken as the number of ratings.
    minirating = book_row.find("span", {"class": "minirating"}).text
    # The first link in this span holds "<label>: <score>"; the second link
    # starts with the voter count.
    vote_span = book_row.find("span", {"class": "smallText uitext"})
    return {
        "title": book_row.find("span", {"itemprop": "name"}).text,
        "author": book_row.find("a", {"class": "authorName"}).span.text,
        "rating": minirating.split()[0],
        "num_ratings": minirating.split("—")[1].split()[0],
        "score": vote_span.a.text.split(":")[1].strip(),
        "num_voters": vote_span.find_all("a")[1].text.split()[0],
    }


books_data = []
previous_page_books = None
for page in range(1, MAX_PAGES + 1):
    # Build the page URL from the unchanged prefix; the request must happen
    # inside the loop so that every iteration parses a different page.
    response = requests.get(
        URL + str(page), headers=HEADERS, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    page_books = [
        _parse_book_row(book_row)
        for book_row in soup.find_all("tr", {"itemtype": "http://schema.org/Book"})
    ]
    # Stop once the list is exhausted.
    # NOTE(review): it is not confirmed whether the site returns an empty
    # page or repeats the last one past the end, so both cases are handled.
    if not page_books or page_books == previous_page_books:
        break

    books_data.extend(page_books)
    previous_page_books = page_books
    time.sleep(1)  # pause between pages to avoid overly frequent requests

# for book in books_data:
#     print(book)


In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

    
# Tabulate the Goodreads records gathered in `books_data` and export them
# to an Excel workbook without the index column.
books_excel_path = 'books_data.xlsx'
df = pd.DataFrame(books_data)
df.to_excel(books_excel_path, index=False)