In [37]:
import asyncio
import random
import os
from camoufox.async_api import AsyncCamoufox
from camoufox import DefaultAddons
from playwright_captcha import ClickSolver, FrameworkType, CaptchaType
import datetime
import re
import json

In [None]:
async def handle_cookies(page):
    """Dismiss the cookie-consent banner on ``page`` if one appears.

    Looks for a button whose accessible name contains "Accept"
    (case-insensitive), gives it up to 5 seconds to become visible, clicks it
    and then pauses 2 seconds. This is best-effort: every failure is reported
    on stdout and swallowed so the caller can carry on without consent.

    Args:
        page: Playwright-style page object exposing ``get_by_role``.

    Returns:
        None.
    """
    print('Checking for cookies on the page...')

    try:
        accept_button = page.get_by_role("button", name=re.compile("Accept", re.IGNORECASE))
        # Locator.is_visible() returns immediately and ignores its ``timeout``
        # argument, so wait_for() is what actually gives a late-rendering
        # banner time to show up.
        await accept_button.wait_for(state="visible", timeout=5000)
    except Exception:
        # Timeout (no banner) or any lookup problem: nothing to accept.
        print("No cookies found")
        return

    try:
        print("Accepting cookies")
        await accept_button.click()
        await asyncio.sleep(2)
    except Exception as e:
        # The banner was there but could not be dismissed; say so instead of
        # claiming it was absent.
        print(f"Could not accept cookies: {e}")

def save_to_json(data, filename="tripadvisor_data.json"):
    """Append ``data`` to the JSON array stored in ``filename``.

    The file is read, ``data`` is appended to its contents, and the whole
    array is written back as UTF-8 with 4-space indentation and non-ASCII
    characters kept as-is.

    Args:
        data: Any JSON-serialisable value to append.
        filename: Path of the JSON file; created if it does not exist.

    Notes:
        * A missing, empty or unparsable file is treated as an empty array.
        * If the file holds valid JSON that is not an array, that value
          becomes the first element of the new array rather than being lost.
        * The new contents go to ``<filename>.tmp`` first and are then moved
          over the target, so an interrupted write cannot truncate the data
          already collected.
    """
    existing_data = []
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as f:
            try:
                existing_data = json.load(f)
            except (json.JSONDecodeError, ValueError):
                existing_data = []

    # Valid JSON is not necessarily a list (e.g. a single object); keep it
    # rather than failing on ``.append``.
    if not isinstance(existing_data, list):
        existing_data = [existing_data]

    existing_data.append(data)

    tmp_filename = f"{filename}.tmp"
    with open(tmp_filename, 'w', encoding='utf-8') as f:
        json.dump(existing_data, f, indent=4, ensure_ascii=False)
    # Swap the finished file into place in one step.
    os.replace(tmp_filename, filename)

async def scrape_attraction_details(new_tab, city, category_field, specific_type):
    """Extract one attraction record from its already-opened detail page.

    Args:
        new_tab: Playwright-style page showing the attraction detail view.
        city: City name copied into the record unchanged.
        category_field: Top-level category, stored as ``attraction_type``.
        specific_type: Sub-category, stored as ``subcategory_specific_type``.

    Returns:
        A dict with the keys ``name``, ``city``, ``image_link``,
        ``attraction_type``, ``subcategory_specific_type``,
        ``operating_hours``, ``BudgetTier`` and ``WeatherSuitability``, or
        ``None`` if any step of the extraction raised (the error is printed).
    """
    try:
        # Name of the attraction; waiting for the heading also serves as the
        # signal that the detail page has rendered.
        name_locator = new_tab.locator('h1.biGQs._P.CIuBz')
        await name_locator.wait_for(state="visible", timeout=15000)
        name = await name_locator.inner_text()

        # Image link: first URL of the first srcset candidate, else ``src``.
        img_locator = new_tab.locator('picture.NhWcC._R.mdkdE.afQPz.eXZKw img').first
        srcset = await img_locator.get_attribute("srcset")
        # split() without an argument ignores leading/trailing whitespace and
        # newlines, which split(' ') would turn into an empty "URL".
        first_candidate = srcset.split(',')[0].split() if srcset else []
        if first_candidate:
            image_link = first_candidate[0]
        else:
            image_link = await img_locator.get_attribute("src")

        # TODO: operating hours are not extracted yet, so this is always empty.
        # An earlier attempt clicked the hours button and read day/time rows
        # from 'div[data-automation="attractionsPoiHoursForDay"]'; it was
        # disabled and has been removed from this function.
        operating_hours = {}

        print(f'Name: {name}, City: {city}, Image Link: {image_link}, Attraction Type: {category_field}, subcategory_specific_type: {specific_type}, operating hours: {operating_hours} ')
        return {
            "name": name,
            "city": city,
            "image_link": image_link,
            "attraction_type": category_field,
            "subcategory_specific_type": specific_type,
            "operating_hours": operating_hours,
            "BudgetTier": "TBD", # Placeholder for LLM pipeline
            "WeatherSuitability": "TBD" # Placeholder for LLM pipeline
        }
    except Exception as e:
        # Best-effort boundary: one broken page must not stop the crawl.
        print(f"Error extracting details: {e}")
        return None



In [None]:
async def run_scraper():
    """Crawl the Berlin attraction listings and save one record per attraction.

    Opens the listing page in a persistent Camoufox session, accepts the
    cookie banner, writes the session state to ``tripadvisor_auth.json`` and
    then, for each category, walks up to ``max_pages`` result pages. Each
    listed attraction is opened in a popup tab, scraped with
    ``scrape_attraction_details`` and appended to the JSON output via
    ``save_to_json``.

    Failures on a single attraction are printed and skipped. A missing
    pagination button ends the current category early.
    """
    categories = ["Sights & Landmarks", "Museums", "Nightlife", "Nature & Parks"]
    max_pages = 10
    city = "Berlin"

    async with AsyncCamoufox(
        headless=False,
        humanize=True, 
        os="windows",
        persistent_context=True, 
        user_data_dir="./tripadvisor_session_2026",
        exclude_addons=[DefaultAddons.UBO],
        block_webgl=False,
        main_world_eval=True,
        disable_coop=True
    ) as browser:

        page = await browser.new_page()
        base_url = "https://www.tripadvisor.com/Attractions-g187323-Activities-oa0-Berlin.html"

        await page.goto(base_url, wait_until="domcontentloaded", timeout=60000)
        await handle_cookies(page)
        await page.context.storage_state(path="tripadvisor_auth.json")
        await asyncio.sleep(3)

        print("Session state updated")

        for category_name in categories:

            print(f'Processing: {category_name}')

            category_btn = page.get_by_text(category_name, exact=True).first
            await category_btn.click()

            for current_page_num in range(1, max_pages + 1):

                # Links to the individual attractions on the current page.
                selector = 'div.XfVdV.o.AIbhI'
                count = await page.locator(selector).count()
                print(f"Found {count} attractions in {category_name}")

                for i in range(count):

                    # NOTE(review): the card container and the link are looked
                    # up with different selectors but the same index; this
                    # assumes both lists line up one-to-one. Confirm on the page.
                    container = page.locator('div.hZuqH.y').nth(i)
                    # Take specific type: first subdiv in alPVI eNNhq PgLKC tnGGX yzLvM
                    type_locator = container.locator('div.alPVI.eNNhq.PgLKC.tnGGX.yzLvM div').first
                    specific_type = await type_locator.inner_text() if await type_locator.count() > 0 else "N/A"

                    current_link = page.locator(selector).nth(i)

                    try:
                        await current_link.scroll_into_view_if_needed()
                        # text_content() may return None; normalise once so the
                        # log lines below cannot fail on ``.strip()``.
                        attraction_name = (await current_link.text_content() or "").strip()
                        print(f'Opening attraction {i+1}: {attraction_name}')

                        async with page.expect_popup() as popup_info:
                            await current_link.click()

                        new_tab = await popup_info.value
                        try:
                            data = await scrape_attraction_details(new_tab, city, category_name, specific_type)

                            if data:
                                save_to_json(data)
                                print(f"✅ Saved data for {attraction_name}")
                        finally:
                            # Always close the popup, even if scraping or saving
                            # raised, so tabs do not pile up over a long crawl.
                            await new_tab.close()

                        # Random pause between attractions.
                        await asyncio.sleep(random.uniform(1, 2))

                    except Exception as e:
                        print(f"Could not click item {i+1}: {e}")
                        continue

                if current_page_num < max_pages:
                    next_page_num = current_page_num + 1
                    # Pagination buttons are addressed by their page number.
                    next_btn_selector = f'div.Yzhnw.P [aria-label="{next_page_num}"]'
                    next_btn = page.locator(next_btn_selector)

                    if await next_btn.count() > 0:
                        await next_btn.scroll_into_view_if_needed()
                        await next_btn.click()
                    else:
                        print(f"Page {next_page_num} not found. Ending category early.")
                        break


# Top-level ``await`` is only valid where the host already runs an event loop
# (e.g. an IPython/Jupyter cell); a plain script would need
# ``asyncio.run(run_scraper())`` instead.
await run_scraper()

⚠️ proxies.txt not found. Running without proxy.
Checking for cookies on the page...
Session state updated
Processing: Sights & Landmarks


TargetClosedError: Locator.click: Target page, context or browser has been closed