<a href="https://colab.research.google.com/github/gnmarten/utils/blob/main/Perlentaucher_(23112025).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Change the book URL below before running the next cell.
# Results are downloaded as a CSV file (allow downloads in your browser).
test_url = 'https://www.perlentaucher.de/buch/mareike-fallwickl/und-alle-so-still.html'

In [None]:
#perlentaucher 2311

# Environment setup for a Colab runtime. Lines starting with "!" are IPython
# shell escapes, so this cell only runs inside a notebook.

# Refresh the package index, then install download/unpack tools and libvulkan1
# (presumably a runtime dependency of the Chrome package - TODO confirm).
!apt-get update
!apt-get install -y wget unzip libvulkan1

# Download the current stable Google Chrome .deb and install it.
# `apt-get install -f` then pulls in any dependencies dpkg left unresolved.
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb
!apt-get install -f -y

# Install xvfb for virtual framebuffer support.
# NOTE(review): nothing in this cell starts Xvfb; the driver below is created
# with --headless, so this package may be unnecessary - confirm before removing.
!apt-get install -y xvfb

# Install the Python packages used by the scraper
!pip install selenium chromedriver-autoinstaller beautifulsoup4 pandas

# Install a ChromeDriver build for the Chrome version that was just installed
import chromedriver_autoinstaller
chromedriver_autoinstaller.install()

import os
import re
import time
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def create_driver():
    """Create a Chrome driver configured for the Colab headless environment.

    Returns:
        A started ``webdriver.Chrome`` instance. The caller owns it and must
        call ``quit()`` when done.

    Raises:
        Any exception raised while starting Chrome or while applying the
        CDP overrides. If the overrides fail, the already-started browser is
        shut down before the exception propagates.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    )

    options = Options()

    # Colab/headless specific options
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_argument('--window-size=1920,1080')
    # Deliberately no fixed --remote-debugging-port: more than one driver can
    # be alive at the same time, and two Chrome instances cannot share a
    # single debugging port. ChromeDriver picks a free port on its own.

    # Anti-detection measures
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    # Add user agent to appear as a real browser
    options.add_argument(f'user-agent={user_agent}')

    # Additional headers
    options.add_argument('--accept-language=en-US,en;q=0.9')

    driver = webdriver.Chrome(options=options)

    try:
        # Override the user agent at the network layer as well
        driver.execute_cdp_cmd('Network.setUserAgentOverride', {
            "userAgent": user_agent
        })

        # Hide navigator.webdriver. The script is registered to run on every
        # new document; a one-off execute_script() would only patch the
        # initial blank page and be lost on the first navigation.
        driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        })
    except Exception:
        # Do not leak a running browser if the setup commands fail
        driver.quit()
        raise

    return driver

def scrape_book_urls(max_pages=425):
    """Scrape book URLs from the paginated listing and save them to a file.

    Starts at the 'deutsche-romane' listing page, collects the link targets
    of all book teasers, and follows the "next" button until no further page
    is available, an error occurs, or ``max_pages`` pages have been read.
    The collected URLs are written to ``BooksUrls.txt`` (one per line).

    Args:
        max_pages: Maximum number of listing pages to read.

    Returns:
        List of book URLs in the order they were found.
    """
    link_selector = '.book.teaser-block h3 a'
    driver = create_driver()
    book_urls = []

    try:
        driver.get('https://www.perlentaucher.de/buchKSL/deutsche-romane.html')
        print("Starting to scrape book URLs...")

        for i in range(max_pages):
            try:
                # Wait for the book links to load
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, link_selector))
                )

                # Get all book URLs on current page; anchors without an href
                # yield None and would break the '\n'.join() below
                links = driver.find_elements(By.CSS_SELECTOR, link_selector)
                hrefs = (link.get_attribute('href') for link in links)
                current_urls = [href for href in hrefs if href]
                book_urls.extend(current_urls)

                print(f"Page {i+1}: Found {len(current_urls)} books. Total: {len(book_urls)}")
            except Exception as e:
                print(f"Error on page {i+1}: {e}")
                break

            # The page limit is reached: no need to load one more page
            if i + 1 >= max_pages:
                break

            # Try to click next page button. Exception (not a bare except)
            # so Ctrl-C / SystemExit are not mistaken for "no more pages".
            try:
                next_button = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '.related-next'))
                )
                next_button.click()
                time.sleep(2)  # Wait for page to load
            except Exception:
                print(f"No more pages found after page {i+1}")
                break

    finally:
        driver.quit()

    # Save URLs to file
    with open('BooksUrls.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(book_urls))

    print(f"\nTotal books scraped: {len(book_urls)}")
    return book_urls

def scrape_review_content(review_link):
    """Scrape the review text from a review page.

    Loads ``review_link`` in a fresh driver, parses the page and joins the
    text of its ``<span>`` elements with single spaces. If the page contains
    no spans at all, the text of the first matching review container is used
    instead.

    Args:
        review_link: Absolute URL of the review page.

    Returns:
        The extracted text, or an empty string if nothing was found or the
        page could not be fetched (the error is printed, not raised).
    """
    driver = create_driver()
    review_content = ""

    try:
        driver.get(review_link)
        time.sleep(1)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # NOTE(review): the trailing bare `span` makes this list match every
        # <span> on the page, so the more specific selectors before it do not
        # narrow the result and the container fallback below is only reached
        # on pages without any span.
        review_spans = soup.select('span.review-text, .review-content span, .text span, span')

        if review_spans:
            texts = []
            for span in review_spans:
                # A span nested inside another span is already covered by the
                # outer span's get_text(); adding it again duplicates text.
                if span.find_parent('span') is not None:
                    continue
                text = span.get_text(strip=True)
                if text:
                    texts.append(text)
            review_content = ' '.join(texts)
        else:
            # No spans: fall back to the text of a common review container
            review_container = soup.select_one('.review-text, .review-content, .text, .content')
            if review_container:
                review_content = review_container.get_text(strip=True)

    except Exception as e:
        print(f"  ⚠ Could not fetch review content: {e}")
    finally:
        driver.quit()

    return review_content

def scrape_book_details(url):
    """Scrape details and reviews from a single book page.

    Args:
        url: Absolute URL of the book page.

    Returns:
        Dict with the keys ``url``, ``title``, ``author``, ``description``,
        ``publisher``, ``year`` and ``reviews``. Text fields are ``None`` when
        the matching element is missing. ``publisher`` holds the full text of
        the meta element; ``year`` is the first four-digit year (1500-2099)
        found in that text, as a string. ``reviews`` is a list of
        ``{'source': ..., 'content': ...}`` dicts. Errors while scraping are
        printed and the data collected so far is returned.
    """
    book_data = {
        'url': url,
        'title': None,
        'author': None,
        'description': None,
        'publisher': None,
        'year': None,
        'reviews': []
    }

    driver = create_driver()
    try:
        # Only the HTML snapshot is needed, so the browser is closed before
        # the review pages are fetched (each of those starts its own driver).
        try:
            driver.get(url)
            time.sleep(2)
            page_html = driver.page_source
        finally:
            driver.quit()

        soup = BeautifulSoup(page_html, 'html.parser')

        # Simple text fields: first element matching the selector, if any
        field_selectors = (
            ('title', 'h1'),
            ('author', '.author, .book-author'),
            ('description', '.book-description, .description, .text'),
            ('publisher', '.book-meta, .meta'),
        )
        for field, selector in field_selectors:
            elem = soup.select_one(selector)
            if elem:
                book_data[field] = elem.get_text(strip=True)

        # Extract the year from the meta text. Digit lookarounds are used
        # instead of \b because get_text(strip=True) glues neighbouring text
        # nodes together (e.g. "2024ISBN"), and they also keep the pattern
        # from matching inside longer digit runs such as an ISBN.
        if book_data['publisher']:
            year_match = re.search(r'(?<!\d)(?:1[5-9]|20)\d{2}(?!\d)', book_data['publisher'])
            if year_match:
                book_data['year'] = year_match.group(0)

        # Extract reviews - look for h3.newspaper elements
        review_headers = soup.select('h3.newspaper')
        print(f"  Found {len(review_headers)} review headers")

        for idx, header in enumerate(review_headers, 1):
            review = {
                'source': header.get_text(strip=True),
                'content': ''
            }

            # Get the review link; headers without a usable link keep an
            # empty content instead of triggering a fetch that must fail
            link = header.select_one('a.newspaper')
            href = link.get('href', '') if link else ''
            if href:
                # Resolves every relative form against the book URL and
                # leaves absolute links untouched
                review_link = urljoin(url, href)

                print(f"  Fetching review {idx}/{len(review_headers)} content from: {review_link}")
                # Scrape the actual review content
                review['content'] = scrape_review_content(review_link)

            book_data['reviews'].append(review)

        print(f"✓ Scraped: {book_data['title']} - {len(book_data['reviews'])} reviews with content")

    except Exception as e:
        print(f"✗ Error scraping {url}: {e}")

    return book_data

def scrape_multiple_books(urls, delay=2):
    """Scrape every book URL in order and collect the results.

    Args:
        urls: Sequence of book page URLs.
        delay: Seconds to pause after each successfully scraped book.

    Returns:
        List of the dicts returned by ``scrape_book_details``, one per URL
        that did not raise. Failures are printed and skipped.
    """
    collected = []
    position = 0

    for book_url in urls:
        position += 1
        print(f"\nScraping book {position}/{len(urls)}: {book_url}")
        try:
            collected.append(scrape_book_details(book_url))
            # Pause between requests to go easy on the server
            time.sleep(delay)
        except Exception as err:
            print(f"Failed to scrape book {position}: {err}")

    return collected

def save_to_csv(books_data, filename='books_with_reviews.csv'):
    """Save scraped data to CSV and auto-download it when running in Colab.

    Each review becomes its own row, repeating the book columns. A book
    without reviews still gets one row with empty review columns.

    Args:
        books_data: Iterable of book dicts with the keys ``url``, ``title``,
            ``author``, ``description``, ``publisher``, ``year`` and
            ``reviews`` (a list of dicts with ``source`` and ``content``).
        filename: Path of the CSV file to write.

    Returns:
        The ``pandas.DataFrame`` that was written. The columns are always
        present, even when ``books_data`` is empty.
    """
    columns = [
        'URL', 'Title', 'Author', 'Description',
        'Publisher', 'Year', 'Review Source', 'Review Content',
    ]
    no_review = [{'source': '', 'content': ''}]

    rows = []
    for book in books_data:
        book_columns = {
            'URL': book['url'],
            'Title': book['title'],
            'Author': book['author'],
            'Description': book['description'],
            'Publisher': book['publisher'],
            'Year': book['year'],
        }
        # One row per review; a single placeholder row if there are none
        for review in book['reviews'] or no_review:
            rows.append({
                **book_columns,
                'Review Source': review['source'],
                'Review Content': review['content'],
            })

    # Explicit columns keep the header row even for an empty result
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"\n✓ Data saved to {filename}")
    print(f"  Total rows: {len(df)}")

    # Auto-download in Colab. Only a missing google.colab module means
    # "not in Colab"; a failing download is reported as such.
    try:
        from google.colab import files
    except ImportError:
        print("  (Not in Colab environment - file saved locally)")
    else:
        try:
            files.download(filename)
            print(f"  ⬇ Downloaded {filename}")
        except Exception as e:
            print(f"  ⚠ Download failed ({e}) - file saved locally")

    return df

# ============================================================================
# EXAMPLE USAGE
# ============================================================================

# Driver script: scrapes one book, prints a summary and saves it as CSV.
# Requires `test_url` to be defined already; it is assigned in the first
# notebook cell, so run that cell before this one.
print("="*60)
print("PERLENTAUCHER BOOK SCRAPER")
print("="*60)

# Step 1: Test with a single book
print("\n[TEST] Scraping single book...")
# Fallback definition, disabled because test_url comes from the first cell:
#test_url = 'https://www.perlentaucher.de/buch/mareike-fallwickl/und-alle-so-still.html'
book_data = scrape_book_details(test_url)

# Print a short summary of what was scraped
print("="*60)
print("RESULTS:")
print("="*60)
print(f"Title: {book_data['title']}")
print(f"Author: {book_data['author']}")
print(f"Reviews found: {len(book_data['reviews'])}")
if book_data['reviews']:
    print("\nReviews:")
    for i, review in enumerate(book_data['reviews'], 1):
        print(f"\n  {i}. Source: {review['source']}")
        # Reviews longer than 200 characters are shown as a truncated preview
        print(f"     Content preview: {review['content'][:200]}..." if len(review['content']) > 200 else f"     Content: {review['content']}")
else:
    print("  No reviews found")

# Auto-save single test result to CSV (save_to_csv also tries a Colab download)
print("\n" + "="*60)
print("Saving test result to CSV...")
df = save_to_csv([book_data], filename='single_book_test.csv')
print(df)

# Step 2: Uncomment to scrape all book URLs (may take a long time!)
# print("\n[STEP 2] Scraping all book URLs...")
# book_urls = scrape_book_urls(max_pages=5)  # Adjust max_pages as needed

# Step 3: Uncomment to load URLs from file
# (BooksUrls.txt is the file written by scrape_book_urls)
# print("\n[STEP 3] Loading URLs from file...")
# with open('BooksUrls.txt', 'r', encoding='utf-8') as f:
#     book_urls = [line.strip() for line in f.readlines() if line.strip()]
# print(f"Loaded {len(book_urls)} URLs")

# Step 4: Uncomment to scrape multiple books
# (needs book_urls from step 2 or step 3)
# print("\n[STEP 4] Scraping multiple books...")
# books_to_scrape = book_urls[:10]  # Test with first 10 books
# all_data = scrape_multiple_books(books_to_scrape, delay=2)
# df = save_to_csv(all_data)
# print("\nFirst few rows:")
# print(df.head())

print("\n" + "="*60)
print("DONE! Uncomment sections above to scrape more books.")
print("="*60)