In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import re
import os

In [None]:


# === CONFIGURATION ===
# Path to the ChromeDriver binary, relative to the notebook's working directory.
CHROMEDRIVER_PATH = "./chromedriver.exe"

# Fail fast with the resolved absolute path if the driver binary is missing.
if not os.path.exists(CHROMEDRIVER_PATH):
    raise FileNotFoundError(
        f"Không tìm thấy chromedriver.exe tại: {os.path.abspath(CHROMEDRIVER_PATH)}"
    )

# Chrome command-line switches, applied in the order listed.
options = webdriver.ChromeOptions()
for _chrome_flag in (
    # "--headless",  # left disabled by the original author
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-features=VizDisplayCompositor",  # reduces crashes (per original note)
):
    options.add_argument(_chrome_flag)

service = Service(CHROMEDRIVER_PATH)

# One shared browser session and one shared explicit wait (20 s timeout),
# both used by the cells/functions below.
browser = webdriver.Chrome(options=options, service=service)
wait = WebDriverWait(browser, 20)

def close_login_popup():
    """Dismiss the login popup on the current page, if one is showing.

    Sleeps briefly, then tries a fixed list of CSS selectors for a close
    button. The first match that is both displayed and enabled is clicked
    through JavaScript (``arguments[0].click()``).

    Uses the module-level ``browser`` WebDriver instance.

    Returns:
        bool: True if a close button was clicked, False if none was found
        or every attempt failed.
    """
    # Give a popup (if any) a moment to appear.
    time.sleep(0.5)

    selectors = (
        "button[aria-label='Close']",
        "div.Modal__close button",
        "button.Modal__close",
        "span[aria-label='Close']",
    )
    for sel in selectors:
        try:
            btn = browser.find_element(By.CSS_SELECTOR, sel)
            if btn.is_displayed() and btn.is_enabled():
                browser.execute_script("arguments[0].click();", btn)
                time.sleep(0.5)
                print(" Đã đóng popup.")
                return True
        except Exception:
            # Best effort: a missing/stale element just means "try the next
            # selector". Catch Exception (not a bare except) so that
            # KeyboardInterrupt / SystemExit still stop the crawl.
            continue
    return False

# === CRAWL ===
# Walks the Goodreads list page by page; for every book row it reads the
# list-level fields, opens the book's detail page for the remaining fields,
# appends one record to `books_data`, then reloads the list page.
base_url = "https://www.goodreads.com/list/show/19253.Books_you_wish_more_people_knew_about_Part_II"
books_data = []
max_pages = 7
current_page = 1

# Locators used below. The absolute XPaths mirror the page layout the original
# author observed and are brittle.
# NOTE(review): re-check them if fields start coming back empty.
_BOOK_ROW_CSS = "tr[itemtype='http://schema.org/Book']"
_GENRES_CSS = "div.BookPageMetadataSection__genres a"
_REVIEWS_XPATH = "/html/body/div[1]/div[2]/main/div[1]/div[2]/div[2]/div[2]/div[2]/a/div[2]/div/span[2]"
_DETAILS_XPATH = "/html/body/div[1]/div[2]/main/div[1]/div[2]/div[2]/div[2]/div[6]/div/span[1]/span/div"
_SOCIAL_XPATH = "/html/body/div[1]/div[2]/main/div[1]/div[2]/div[2]/div[2]/div[7]/div"


def _digits_to_int(text):
    """Return the integer formed by all digits in `text`, or None if there are none."""
    digits = re.sub(r"[^\d]", "", text or "")
    return int(digits) if digits else None


def _parse_minirating(text):
    """Split a list-page mini-rating string into (avg_rating, num_ratings).

    Either value is None when it cannot be parsed; a malformed string never
    raises, so one odd row no longer aborts the whole book.
    """
    avg_rating = num_ratings = None
    if "avg rating" in text:
        parts = [p.strip() for p in text.split("—")]
        avg_match = re.search(r"\d+(?:\.\d+)?", parts[0])
        if avg_match:
            avg_rating = float(avg_match.group(0))
        if len(parts) > 1:
            num_ratings = _digits_to_int(parts[1])
    return avg_rating, num_ratings


def _parse_details(text):
    """Extract (publish_year, pages) from the detail-page publication text."""
    pages_match = re.search(r"(\d+)\s+pages", text)
    year_match = re.search(
        r"first published.*?(\d{4})|published.*?(\d{4})", text, re.IGNORECASE
    )
    pages = int(pages_match.group(1)) if pages_match else None
    publish_year = None
    if year_match:
        # Only one of the two alternated groups participates in a match; the
        # other is None, so reading group(1) alone fails for "published ...".
        publish_year = int(year_match.group(1) or year_match.group(2))
    return publish_year, pages


def _parse_signal_count(text, phrase):
    """Return the count that precedes `phrase` in `text`, or None.

    The number nearest before the phrase is used, so both signals can be read
    from a single block of text. A trailing k/m suffix is scaled up —
    presumably large counts are abbreviated (e.g. "12.3k"); verify on the site.
    """
    match = re.search(
        r"(\d[\d,.]*)\s*([km](?![a-z]))?[^\d]*?" + re.escape(phrase),
        text.lower(),
    )
    if not match:
        return None
    number = match.group(1).replace(",", "").rstrip(".")
    try:
        value = float(number)
    except ValueError:
        return None
    scale = {"k": 1000, "m": 1000000}.get(match.group(2), 1)
    return int(round(value * scale))


try:
    while current_page <= max_pages:
        list_url = base_url if current_page == 1 else f"{base_url}?page={current_page}"
        print(f" Đang crawl trang {current_page}...")

        browser.get(list_url)
        time.sleep(1)
        close_login_popup()

        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, _BOOK_ROW_CSS)))
        book_rows = browser.find_elements(By.CSS_SELECTOR, _BOOK_ROW_CSS)
        print(f"Tìm thấy {len(book_rows)} cuốn sách.")

        for idx in range(len(book_rows)):
            try:
                # Re-query the rows on every iteration: the list page is
                # reloaded after each book, which invalidates old references.
                books = browser.find_elements(By.CSS_SELECTOR, _BOOK_ROW_CSS)
                if idx >= len(books):
                    break

                # Fields available on the list page.
                book = books[idx]
                title = book.find_element(By.CSS_SELECTOR, "a.bookTitle span").text.strip()
                author = book.find_element(By.CSS_SELECTOR, "a.authorName span").text.strip()
                minirating = book.find_element(By.CSS_SELECTOR, "span.minirating").text
                avg_rating, num_ratings = _parse_minirating(minirating)

                book_link = book.find_element(By.CSS_SELECTOR, "a.bookTitle").get_attribute("href")

                # --- Open the detail page ---
                browser.get(book_link)
                time.sleep(1.5)  # let the page start loading

                # Close the popup if present, without interrupting the main flow.
                close_login_popup()

                # Number of reviews (detail page).
                num_reviews = None
                try:
                    review_elem = browser.find_element(By.XPATH, _REVIEWS_XPATH)
                    num_reviews = _digits_to_int(review_elem.text.strip())
                except Exception as e:
                    print(f" Không lấy được num_reviews từ XPath: {str(e)[:100]}")
                print(f"num_reviews: {num_reviews}")

                # Genres, de-duplicated while keeping page order.
                genres = []
                try:
                    genre_elems = browser.find_elements(By.CSS_SELECTOR, _GENRES_CSS)
                    genres = [g.text.strip() for g in genre_elems if g.text.strip()]
                    genres = list(dict.fromkeys(genres))
                    print(f"geners: {', '.join(genres)}")
                except Exception as e:
                    print(f" Lỗi genres: {e}")

                # Publication year and page count.
                publish_year = pages = None
                try:
                    details = browser.find_element(By.XPATH, _DETAILS_XPATH).text
                    publish_year, pages = _parse_details(details)
                    print(f"publish_year: {publish_year}, pages: {pages}")
                except Exception as e:
                    # Exception (not bare except) so Ctrl+C is not swallowed.
                    print(f" Lỗi details: {str(e)[:100]}")

                # "Currently reading" / "want to read" counters. Both phrases
                # are searched in every element's text, because they may live
                # in the same element.
                currently_reading = want_to_read = None
                try:
                    stats = browser.find_elements(By.XPATH, _SOCIAL_XPATH)
                    for stat in stats:
                        txt = stat.text
                        if currently_reading is None:
                            currently_reading = _parse_signal_count(txt, "currently reading")
                        if want_to_read is None:
                            want_to_read = _parse_signal_count(txt, "want to read")
                    print(f"currently_reading: {currently_reading}, want_to_read: {want_to_read}")
                except Exception as e:
                    print(f" Lỗi stats: {str(e)[:100]}")

                books_data.append({
                    "Title": title,
                    "Author": author,
                    "Average rating": avg_rating,
                    "Number of ratings": num_ratings,
                    "Number of reviews": num_reviews,
                    "Genres": ", ".join(genres) if genres else None,
                    "Publish year": publish_year,
                    "Number of pages": pages,
                    "Currently reading": currently_reading,
                    "Want to read": want_to_read
                })

                print(f"[{len(books_data)}]  {title[:50]}...")

                # Do not use browser.back(); reload the list page with get().
                browser.get(list_url)
                time.sleep(1.5)
                close_login_popup()
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, _BOOK_ROW_CSS)))

                time.sleep(2)

            except Exception as e:
                print(f" Lỗi sách {idx+1}: {str(e)[:150]}")
                # Recovery: go back to the list page so the next row can be read.
                try:
                    browser.get(list_url)
                    time.sleep(1)
                    close_login_popup()
                    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, _BOOK_ROW_CSS)))
                except Exception as recover_err:
                    print(f" Không quay lại được trang danh sách: {str(recover_err)[:100]}")
                time.sleep(2)

        current_page += 1

finally:
    # Always release the browser, even when the crawl is interrupted.
    browser.quit()

# Save the results (only reached when the crawl finishes without raising).
if books_data:
    df = pd.DataFrame(books_data)
    df.to_csv("test.csv", index=False, encoding='utf-8-sig')
    print(f" Đã lưu {len(books_data)} sách vào test.csv")
    print(f" Đã thu thập dữ liệu {len(books_data)} sách.")
else:
    print(" Không có dữ liệu.")


📖 Đang crawl trang 1...
Tìm thấy 100 cuốn sách.
ℹ️ Đã đóng popup.
num_reviews: 928
geners: Romance, Contemporary Romance, Contemporary, Humor, Chick Lit, Adult, New Adult
publish_year: 2013, pages: 242
currently_reading: 604, want_to_read: None
[1] ✅ The Law of Attraction (Lawyers in Love, #1)...
num_reviews: 222
geners: Fantasy, Middle Grade, Young Adult, Fiction, Magic, Fae, Childrens
publish_year: 2015, pages: 265
currently_reading: 485, want_to_read: None
[2] ✅ Isle of Winds (The Changeling, #1)...
num_reviews: 105
geners: Young Adult, Mystery, Paranormal, Thriller, Fiction, Supernatural, Horror
publish_year: 2013, pages: 268
currently_reading: 24, want_to_read: None
[3] ✅ Breaking Glass...
num_reviews: 121
geners: Fantasy, Young Adult, Magic, Fiction
publish_year: 2016, pages: None
currently_reading: 100, want_to_read: None
[4] ✅ Drowned Tomb (The Changeling, #2)...
num_reviews: 115
geners: Fantasy, Young Adult, Fiction, Magic
publish_year: 2017, pages: None
currently_reading: 73

KeyboardInterrupt: 

In [3]:
# Report how many book records are currently held in `books_data`.
print(f"{len(books_data)} books data collected.")

820 books data collected.


In [4]:
# Build the results table from the scraped records and preview the first rows.
df = pd.DataFrame(data=books_data)
df.head(5)

Unnamed: 0,Title,Author,Average rating,Number of ratings,Number of reviews,Genres,Publish year,Number of pages,Currently reading,Want to read
0,"The Law of Attraction (Lawyers in Love, #1)",N.M. Silber,3.76,11553,928.0,"Romance, Contemporary Romance, Contemporary, H...",2013.0,242.0,604.0,
1,"Isle of Winds (The Changeling, #1)",James Fahy,4.27,1689,222.0,"Fantasy, Middle Grade, Young Adult, Fiction, M...",2015.0,265.0,485.0,
2,Breaking Glass,Lisa Amowitz,3.83,504,105.0,"Young Adult, Mystery, Paranormal, Thriller, Fi...",2013.0,268.0,24.0,
3,"Drowned Tomb (The Changeling, #2)",James Fahy,4.46,800,121.0,"Fantasy, Young Adult, Magic, Fiction",2016.0,,100.0,
4,"Chains of Gaia (The Changeling, #3)",James Fahy,4.59,695,115.0,"Fantasy, Young Adult, Fiction, Magic",2017.0,,73.0,


In [5]:
# Write the table to test.csv without the index; the utf-8-sig encoding adds a BOM.
df.to_csv(
    "test.csv",
    encoding="utf-8-sig",
    index=False,
)

In [6]:
# Write the same table to results.csv without the index, using utf-8-sig (BOM) encoding.
df.to_csv(
    "results.csv",
    encoding="utf-8-sig",
    index=False,
)