In [25]:
%pip install beautifulsoup4
%pip install playwright
%pip install playwright-stealth requests
!playwright install

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [26]:
import requests
import gzip
import io
import re
from bs4 import BeautifulSoup

# Sitemap index URL for TripAdvisor attractions (en_US); get_urls_from_sitemap()
# reads every <loc> entry in it as a sub-sitemap URL.
SITEMAP_INDEX = "https://www.tripadvisor.com/sitemap/att/en_US/sitemap_en_US_attractions_index.xml"

# Geo IDs to keep: a sitemap URL is retained when it contains "-<geo id>-".
TARGET_GEO_IDS = ["g187323"] # Berlin

def get_urls_from_sitemap(max_sub_sitemaps=3, timeout=30):
    """Collect attraction URLs for the target geo IDs from the sitemap index.

    Downloads SITEMAP_INDEX, then scans the last ``max_sub_sitemaps`` entries
    it lists and keeps every URL that contains ``"Activities-"`` and
    ``"-<geo>-"`` for at least one ID in TARGET_GEO_IDS.

    Args:
        max_sub_sitemaps: How many sub-sitemaps (taken from the end of the
            index) to scan. ``None`` scans all of them; a value <= 0 scans none.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        list[str]: Matching URLs in the order encountered. Empty when the
        index request returns a non-success HTTP status.
    """
    print("Fetching Sitemap Index...")
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'}
    r = requests.get(SITEMAP_INDEX, headers=headers, timeout=timeout)

    # An error/block page would otherwise be parsed as XML and silently
    # yield "0 sub-sitemaps", hiding the real cause.
    if not r.ok:
        print(f"Sitemap index request failed with HTTP {r.status_code}; nothing to check.")
        return []

    soup = BeautifulSoup(r.content, 'xml')
    sub_sitemaps = [loc.text.strip() for loc in soup.find_all('loc')]

    found_urls = []

    print(f"Found {len(sub_sitemaps)} sub-sitemaps. Checking the latest ones...")

    # Note: a plain [-0:] slice would select the whole list, so 0 is handled explicitly.
    if max_sub_sitemaps is None:
        selected = sub_sitemaps
    elif max_sub_sitemaps > 0:
        selected = sub_sitemaps[-max_sub_sitemaps:]
    else:
        selected = []

    for sub_map_url in selected:
        print(f"Checking: {sub_map_url}")
        try:
            r_sub = requests.get(sub_map_url, headers=headers, timeout=timeout)
            r_sub.raise_for_status()

            # Only gunzip when the payload really is gzip (magic bytes 1f 8b);
            # the body may already be plain XML if the transport decoded it.
            payload = r_sub.content
            if payload[:2] == b'\x1f\x8b':
                payload = gzip.decompress(payload)

            soup_sub = BeautifulSoup(payload, 'xml')

            for loc in soup_sub.find_all('loc'):
                url = loc.text.strip()
                # any() ensures a URL is recorded once even if several geo IDs match.
                if "Activities-" in url and any(f"-{geo}-" in url for geo in TARGET_GEO_IDS):
                    found_urls.append(url)

        except Exception as e:
            # Best-effort: one bad sub-sitemap must not abort the whole scan.
            print(f"Error processing map: {e}")

    return found_urls

# Build the list of matching attraction URLs (this performs network requests).
target_urls = get_urls_from_sitemap()
print(f"Found {len(target_urls)} attractions.")

Fetching Sitemap Index...


Found 0 sub-sitemaps. Checking the latest ones...
Found 0 attractions.


In [27]:
import asyncio
import random
import os
from playwright.async_api import async_playwright
from playwright_stealth import Stealth  # Import the class, not the function

# Output folders: screenshots for blocked/error pages and raw HTML dumps.
# exist_ok=True makes this idempotent and avoids the check-then-create race.
os.makedirs("debug_screens", exist_ok=True)
os.makedirs("html_dumps", exist_ok=True)

# Cities to scrape: display name (used in URLs and filenames) and TripAdvisor geo ID.
cities = [
    {"name": "Berlin", "geo": "g187323"},
]

async def scrape_with_stealth(cities, max_offset=90, page_size=30):
    """Save TripAdvisor attraction listing pages for each city as HTML files.

    For every city, visits the listing at offsets 0, page_size, ... up to
    max_offset (inclusive) in a visible Chromium window with stealth patches
    applied, scrolls to trigger lazy loading, and writes the page HTML to
    ``html_dumps/<name>_oa<offset>.html``. Screenshots of blocked or failed
    pages go to ``debug_screens/``.

    Args:
        cities: Iterable of dicts with keys ``"name"`` and ``"geo"``.
        max_offset: Largest listing offset to request (inclusive).
        page_size: Step between consecutive offsets.

    Returns:
        None. Results are written to disk; per-page errors are printed and
        do not stop the run.
    """
    # Create the output folders here so the function does not depend on
    # another cell having been run first.
    os.makedirs("debug_screens", exist_ok=True)
    os.makedirs("html_dumps", exist_ok=True)

    async with Stealth().use_async(async_playwright()) as p:

        # 1. Launch Browser
        # Kept headed (headless=False): the original author found TripAdvisor
        # very sensitive to headless browsers.
        browser = await p.chromium.launch(
            headless=False,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--start-maximized"
            ]
        )

        try:
            # 2. Configure Context (user agent, viewport, locale)
            context = await browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                locale="en-US"
            )

            # Pages created under the Stealth wrapper get the stealth patches.
            page = await context.new_page()

            for city in cities:
                print(f"--- Processing {city['name']} ---")

                for offset in range(0, max_offset + 1, page_size):

                    if offset == 0:
                        url = f"https://www.tripadvisor.com/Attractions-{city['geo']}-Activities-{city['name']}.html"
                    else:
                        url = f"https://www.tripadvisor.com/Attractions-{city['geo']}-Activities-oa{offset}-{city['name']}.html"

                    print(f"Navigating to offset {offset}...")

                    try:
                        # Random "human" pause before navigating
                        await asyncio.sleep(random.uniform(2, 5))

                        response = await page.goto(url, timeout=60000, wait_until="domcontentloaded")

                        # 3. Check whether the request was blocked
                        page_content = await page.content()

                        if "Access Denied" in page_content or (response and response.status == 403):
                            print("!!! BLOCKED BY TRIPADVISOR !!!")
                            # Per-page name so later blocks don't overwrite earlier evidence.
                            await page.screenshot(path=f"debug_screens/blocked_{city['name']}_oa{offset}.png")
                            # Pause to allow solving it manually in the browser window
                            print("Please check the browser window. Waiting 60 seconds...")
                            await asyncio.sleep(60)

                            # Don't save the block page as if it were listing data.
                            if "Access Denied" in await page.content():
                                print(" -> Still blocked; skipping this page.")
                                continue

                        # Check for an anti-bot challenge page
                        if "Challenge" in await page.title():
                            print("!!! CAPTCHA DETECTED !!! - Please solve it in the browser window.")
                            await asyncio.sleep(30)

                        # 4. Human-like scrolling to trigger lazy-loaded items
                        try:
                            await page.wait_for_selector('footer', state='attached', timeout=10000)
                        except Exception:
                            # Footer not found: scroll anyway. `Exception` (not a
                            # bare except) so task cancellation / Ctrl-C propagate.
                            pass

                        for _ in range(3):
                            await page.mouse.wheel(0, 600)
                            await asyncio.sleep(random.uniform(0.5, 1.5))

                        # 5. Save data. Fetch the HTML before opening the file so a
                        # failed fetch doesn't leave an empty dump behind.
                        filename = f"html_dumps/{city['name']}_oa{offset}.html"
                        html = await page.content()
                        with open(filename, "w", encoding="utf-8") as f:
                            f.write(html)

                        print(f" -> Saved {filename}")

                    except Exception as e:
                        print(f"Error on {url}: {e}")
                        # The screenshot itself can fail (e.g. page crashed); that
                        # must not abort the remaining pages.
                        try:
                            await page.screenshot(path=f"debug_screens/error_{city['name']}_oa{offset}.png")
                        except Exception as shot_err:
                            print(f"Could not capture error screenshot: {shot_err}")
        finally:
            # Always release the browser, including on cancellation or interrupt.
            await browser.close()

# Run the scraper. Top-level `await` is used directly because this is a notebook cell.
await scrape_with_stealth(cities)

--- Processing Berlin ---
Navigating to offset 0...
!!! BLOCKED BY TRIPADVISOR !!!
Please check the browser window. Waiting 60 seconds...


CancelledError: 

In [None]:
# Sanity check: open a bot-detection test page with navigator.webdriver masked.
# Relies on `asyncio` and `async_playwright` imported in the scraper cell.
async with async_playwright() as p:
    browser = await p.chromium.launch(headless=False, args=["--disable-blink-features=AutomationControlled"])
    context = await browser.new_context()
    # Init script registered on the context: overrides navigator.webdriver to return undefined.
    await context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    page = await context.new_page()
    await page.goto("https://bot.sannysoft.com/")
    # Keep the window open for a few seconds for manual inspection.
    await asyncio.sleep(5)
    # Check the page visually to see if 'WebDriver' is green (false)
    await browser.close()