<a href="https://colab.research.google.com/github/gnmarten/utils/blob/main/Lovelybooks_(23112025).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Paste the LovelyBooks book page URL to scrape
book_url = 'https://www.lovelybooks.de/autor/Mareike-Fallwickl/Und-alle-so-still-11024743518-w/'

In [None]:
# Expanded to include all LovelyBooks text reviews (23.11.2025)
# Update package list and install necessary dependencies
!apt-get update
!apt-get install -y wget unzip libvulkan1

# Download and install Google Chrome
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb
!apt-get install -f -y

# Install xvfb for virtual framebuffer support
!apt-get install -y xvfb

# Install required packages
!pip install selenium chromedriver-autoinstaller beautifulsoup4 pandas

# Automatically install the correct version of ChromeDriver
import chromedriver_autoinstaller
chromedriver_autoinstaller.install()

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd

# Configure Selenium with obfuscation for headless mode
def configure_driver():
    """Start a headless Chrome WebDriver with basic automation masking applied.

    The browser is launched with a desktop Chrome user agent and the
    ``AutomationControlled`` Blink feature disabled. After launch, the user
    agent is also overridden at the DevTools-protocol level and a script that
    hides ``navigator.webdriver`` is registered for every new document.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller owns it and must
        call ``quit()`` when done.

    Raises:
        Whatever Selenium raises if Chrome cannot be started or configured; in
        the latter case the browser is shut down before the error propagates.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/120.0.0.0 Safari/537.36'
    )
    hide_webdriver_js = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"

    options = Options()
    for flag in (
        '--headless',
        '--disable-blink-features=AutomationControlled',
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--remote-debugging-port=9222',
        f'user-agent={user_agent}',
    ):
        options.add_argument(flag)

    driver = webdriver.Chrome(options=options)

    try:
        # Execute CDP commands to hide automation
        driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": user_agent})
        # A script run via execute_script only affects the document that is
        # loaded right now; registering it through CDP makes Chrome re-run it
        # before any page script on every subsequent navigation.
        driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {"source": hide_webdriver_js})
        driver.execute_script(hide_webdriver_js)
    except Exception:
        # Do not leave an orphaned Chrome process behind if setup fails.
        driver.quit()
        raise

    return driver

# Scrape reviews and metadata from a LovelyBooks.de book page
def scrape_reviews(book_url, max_pages=50):
    """Scrape book metadata and text reviews from a LovelyBooks.de book page.

    The page is opened in a headless browser and the "Weitere Beiträge laden"
    (load more) button is clicked repeatedly. After every click the complete
    page source is re-parsed and reviews that were not seen before are added.
    Scraping stops when no review entries are found, a pass adds no new
    reviews, the button can no longer be found, or ``max_pages`` passes have
    been processed.

    Args:
        book_url: URL of the book page to scrape.
        max_pages: Safety limit on the number of load/parse passes.

    Returns:
        A ``(metadata, reviews)`` tuple. ``metadata`` is a dict with the keys
        ``title``, ``subtitle`` and ``author`` (empty if the page could not be
        opened). ``reviews`` is a list of dicts with the keys ``author``,
        ``datetime``, ``star_rating``, ``short_review``, ``full_review`` and
        ``page``.

    Errors raised while loading or scraping are printed with a traceback and
    not re-raised; whatever was collected up to that point is returned. The
    browser is always shut down.
    """
    from selenium.common.exceptions import WebDriverException

    driver = configure_driver()
    reviews = []
    metadata = {}
    # Reviews are de-duplicated on (author, datetime, star_rating); two reviews
    # sharing all three values are treated as the same review.
    seen_ids = set()

    try:
        # Inside the try block so the browser is quit even if navigation fails.
        driver.get(book_url)

        # Wait for page to load
        time.sleep(3)

        metadata = _extract_book_metadata(driver)

        page_count = 0

        # Loop through all review pages
        while True:
            page_count += 1
            print(f"\n--- Processing page {page_count} ---")

            # Wait for reviews to load
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '.StreamEntry, .style__EntryContent-sc-1aphc73-11, article[class*="ReviewAndReviewShortQuoteStreamItem"]'))
                )
            except WebDriverException as e:
                print(f"No reviews found on page: {e}")
                break

            # Get page source and parse with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            review_entries = _select_review_entries(soup)

            print(f"Found {len(review_entries)} review entries on this page")

            if not review_entries:
                print("No reviews found, stopping...")
                break

            # Track reviews added from this page
            reviews_added_this_page = 0

            for idx, entry in enumerate(review_entries, 1):
                # Best effort per entry: one malformed entry must not abort the run.
                try:
                    review = _parse_review_entry(entry)
                except Exception as e:
                    print(f"  ✗ Error extracting review {idx}: {e}")
                    continue

                if review is None:
                    continue

                review_id = (review['author'], review['datetime'], review['star_rating'])
                if review_id in seen_ids:
                    continue
                seen_ids.add(review_id)

                review['page'] = page_count
                reviews.append(review)
                reviews_added_this_page += 1
                print(f"  ✓ Review {len(reviews)}: {review['author']} - {review['star_rating']} stars")

            print(f"Added {reviews_added_this_page} new reviews from this page (Total: {len(reviews)})")

            # Check if we're getting new reviews
            if reviews_added_this_page == 0:
                print("No new reviews found on this page, stopping...")
                break

            # Safety limit, checked before clicking so no load is triggered
            # whose result would never be parsed.
            if page_count >= max_pages:
                print(f"Reached page limit of {max_pages}")
                break

            if not _click_load_more(driver):
                print("No more pages to load - button not found")
                break

    except Exception as e:
        print(f"Error scraping reviews: {e}")
        import traceback
        traceback.print_exc()
    finally:
        driver.quit()

    return metadata, reviews


def _extract_book_metadata(driver):
    """Read title, subtitle and author of the book from the loaded page."""
    from selenium.common.exceptions import WebDriverException

    metadata = {'title': 'Unknown', 'subtitle': '', 'author': 'Unknown'}

    try:
        metadata['title'] = driver.find_element(By.CSS_SELECTOR, '.BookTitle__Title-lxryhc-1').text
        print(f"Found title: {metadata['title']}")
    except WebDriverException as e:
        print(f"Could not find title: {e}")

    try:
        metadata['subtitle'] = driver.find_element(By.CSS_SELECTOR, '.BookTitle__Subtitle-lxryhc-2').text
    except WebDriverException:
        pass  # The subtitle is optional; keep the empty default.

    try:
        author_links = driver.find_elements(By.CSS_SELECTOR, '.BookInfos__AuthorLinkWrap-sc-13pbtxl-2 a')
        if author_links:
            # Get last author link (actual author name)
            metadata['author'] = author_links[-1].text
    except WebDriverException:
        pass  # Keep the 'Unknown' default.

    return metadata


def _select_review_entries(soup):
    """Return the review container elements, using the first selector group that matches."""
    candidate_selectors = (
        'article[class*="ReviewAndReviewShortQuoteStreamItem"], article.style__ReviewAndReviewShortQuoteStreamItem-sc-1aphc73-0',
        '.StreamEntry, [class*="StreamEntry"]',
        '[class*="style__EntryContent"]',
    )
    for selector in candidate_selectors:
        entries = soup.select(selector)
        if entries:
            return entries
    return []


def _parse_review_entry(entry):
    """Convert one review element into a dict, or return None if it carries no content."""
    author_elem = entry.select_one('[class*="UsernameLink"], .UserLink__TextOnly-sc-5ioweh-1, a[class*="UsernameLink"]')
    author = author_elem.get_text(strip=True) if author_elem else None

    # Prefer the datetime attribute of <time>; fall back to its visible text.
    time_elem = entry.select_one('time')
    posted_at = None
    if time_elem:
        posted_at = time_elem.get('datetime') or time_elem.get_text(strip=True)

    # The rating is the number of filled star icons inside the entry.
    star_rating = len(entry.select('.CommonIcon.-star-full, [class*="CommonIcon"].-star-full'))

    # Short review (Kurzmeinung)
    quote_elem = entry.select_one('.Quote__Paragraph-ukhk62-0, [class*="Quote__Paragraph"]')
    short_review = quote_elem.get_text(strip=True) if quote_elem else None

    # Full review
    review_elem = entry.select_one('.HtmlBox, [class*="HtmlBox"]')
    full_review = review_elem.get_text(strip=True) if review_elem else None

    if not (author or short_review or full_review):
        return None

    return {
        'author': author,
        'datetime': posted_at,
        'star_rating': star_rating,
        'short_review': short_review,
        'full_review': full_review,
    }


def _click_load_more(driver, timeout=5):
    """Click the 'Weitere Beiträge laden' button; return True if it was clicked."""
    from selenium.common.exceptions import WebDriverException

    # Ordered from most to least specific. 'button[class*="moreButton"]' also
    # covers every class-combination variant of the button, and the jQuery-only
    # ':contains()' pseudo-class is not valid CSS, so neither is listed here;
    # matching by text is done with XPath below.
    css_selectors = (
        'button.moreButton',
        'button[class*="moreButton"]',
        '.separator button',
    )

    for selector in css_selectors:
        try:
            # Wait for button to be present
            load_more_button = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, selector))
            )
            print(f"Found button with selector: {selector}")

            # Scroll to button
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", load_more_button)
            time.sleep(1)

            # Wait for button to be clickable
            load_more_button = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
            )

            # A JavaScript click is used instead of a native click.
            driver.execute_script("arguments[0].click();", load_more_button)
        except WebDriverException:
            # Try next selector
            continue

        print("Clicked 'Weitere Beiträge laden' button using JavaScript")
        time.sleep(3)  # Wait for new content to load
        return True

    # If no button found with any selector, try finding by text.
    # contains(., ...) also matches when the label sits in a nested element.
    try:
        load_more_button = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Weitere Beiträge laden')]"))
        )
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", load_more_button)
        time.sleep(1)
        driver.execute_script("arguments[0].click();", load_more_button)
    except WebDriverException as e:
        print(f"Could not find load more button: {e}")
        return False

    print("Clicked 'Weitere Beiträge laden' button using XPath")
    time.sleep(3)
    return True

# Save reviews to a CSV file
def save_reviews_to_csv(metadata, reviews, output_file='lovelybooks_reviews.csv'):
    """Write the scraped reviews to a CSV file and offer it for download in Colab.

    Args:
        metadata: Dict with the book's ``title``, ``subtitle`` and ``author``;
            missing keys are written as empty strings.
        reviews: List of review dicts. Keys that a review lacks end up as
            empty cells in the CSV.
        output_file: Path of the CSV file to write (UTF-8, no index column).

    Returns:
        The written ``pandas.DataFrame``, or ``None`` when ``reviews`` is empty
        (in which case no file is written).
    """
    if not reviews:
        print("No reviews to save!")
        return None

    df = pd.DataFrame(reviews)
    df['book_title'] = metadata.get('title', '')
    df['book_subtitle'] = metadata.get('subtitle', '')
    df['book_author'] = metadata.get('author', '')

    # Reorder columns. reindex (rather than df[cols]) creates any column that
    # is absent from the review dicts instead of raising KeyError.
    cols = ['book_title', 'book_subtitle', 'book_author', 'author', 'datetime',
            'star_rating', 'short_review', 'full_review', 'page']
    df = df.reindex(columns=cols)

    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"\n✓ Saved {len(reviews)} reviews to {output_file}")

    # Auto-download in Colab. The import is separated from the download so a
    # failing download is not misreported as "not in Colab".
    try:
        from google.colab import files
    except ImportError:
        print("  (Not in Colab environment - file saved locally)")
    else:
        try:
            files.download(output_file)
            print(f"  ⬇ Downloaded {output_file}")
        except Exception as e:
            # Best effort: the CSV is already on disk, so only report the failure.
            print(f"  (Download failed: {e} - file saved locally)")

    return df

# Main script
if __name__ == '__main__':
    # book_url is expected to be defined before this point (see the first cell), e.g.
    # 'https://www.lovelybooks.de/autor/Mareike-Fallwickl/Und-alle-so-still-11024743518-w/'
    banner = "=" * 60

    # Header
    print(banner, "LOVELYBOOKS REVIEW SCRAPER", banner, sep="\n")
    print(f"\nScraping: {book_url}\n")

    metadata, reviews = scrape_reviews(book_url)

    # Summary of what was scraped
    print("\n" + banner)
    print("RESULTS", banner, sep="\n")
    print(f"Book: {metadata.get('title', 'Unknown')}")
    print(f"Author: {metadata.get('author', 'Unknown')}")
    print(f"Total reviews scraped: {len(reviews)}")

    # Export and preview only when something was collected
    if not reviews:
        print("\nNo reviews found!")
    else:
        df = save_reviews_to_csv(metadata, reviews)
        print("\nFirst few reviews:")
        preview_columns = ['author', 'star_rating', 'short_review']
        print(df[preview_columns].head())

    # Footer
    print("\n" + banner)
    print("DONE!", banner, sep="\n")