In [40]:
import sys
from pathlib import Path
import os
import re

# Add workspace packages to Python path for notebook execution.
# The project root is the parent directory when the kernel's working directory
# is named "notebooks"; otherwise the working directory itself is used if it
# contains packages/cityvibe-core, falling back to its parent.
notebook_dir = Path(os.getcwd())
if notebook_dir.name == "notebooks":
    project_root = notebook_dir.parent
else:
    project_root = notebook_dir if (notebook_dir / "packages" / "cityvibe-core").exists() else notebook_dir.parent

core_path = project_root / "packages" / "cityvibe-core" / "src"
common_path = project_root / "packages" / "cityvibe-common" / "src"

# Inserted at index 0 so these directories are searched before the rest of sys.path.
if core_path.exists():
    sys.path.insert(0, str(core_path))
if common_path.exists():
    sys.path.insert(0, str(common_path))

# Install missing dependencies if needed
# NOTE(review): this flag is set to True after a successful install but is not
# read anywhere in the visible cells.
dependencies_installed = False

try:
    # Only these two packages are probed; a failure of either triggers the
    # installation of the full dependency list below.
    import sqlalchemy
    import playwright
    print("‚úÖ All dependencies are available")
except ImportError as e:
    # Last whitespace-separated word of the error message with quotes removed
    # (the module name for a "No module named 'x'" message).
    missing = str(e).split()[-1].replace("'", "")
    print(f"‚ö†Ô∏è {missing} not found. Installing dependencies...")
    import subprocess
    try:
        # Install into the interpreter running this kernel (sys.executable).
        # pip output is discarded, so a failure is only visible through the
        # exception raised by check_call.
        subprocess.check_call([
            sys.executable, "-m", "pip", "install", "--quiet",
            "sqlalchemy>=2.0.0", "sqlmodel>=0.0.14", "asyncpg>=0.29.0", 
            "pydantic>=2.0.0", "alembic>=1.13.0", "psycopg2-binary>=2.9.0",
            "playwright>=1.40.0"
        ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        print("‚úÖ Dependencies installed successfully!")
        print("‚ö†Ô∏è Installing Playwright browsers (this may take a moment)...")
        # Download the Chromium build that Playwright drives.
        subprocess.check_call([
            sys.executable, "-m", "playwright", "install", "chromium"
        ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        print("‚úÖ Playwright browsers installed!")
        # Force reload after installation so the import system notices the
        # newly installed packages.
        import importlib
        importlib.invalidate_caches()
        dependencies_installed = True
    except Exception as e:
        # Print manual installation instructions, then re-raise so the cell fails visibly.
        print(f"‚ùå Failed to install dependencies: {e}")
        print("\nüí° Please install manually with:")
        print(f"   {sys.executable} -m pip install sqlalchemy>=2.0.0 sqlmodel>=0.0.14 asyncpg>=0.29.0 pydantic>=2.0.0 alembic>=1.13.0 psycopg2-binary>=2.9.0 playwright>=1.40.0")
        print(f"   {sys.executable} -m playwright install chromium")
        raise

import asyncio
import json
import logging
import time
from datetime import datetime, timezone
from decimal import Decimal, InvalidOperation
from typing import Any, Optional, AsyncGenerator, Dict, List
from uuid import UUID

import httpx
import requests
from bs4 import BeautifulSoup

# Import playwright async API (required for notebooks/Jupyter)
# NOTE(review): the except branch labels Playwright as optional, but later cells
# call async_playwright() unconditionally, so a failed import here would only
# surface later as a NameError.
try:
    from playwright.async_api import async_playwright
    print("‚úÖ Playwright async API imported successfully")
except ImportError:
    print("‚ö†Ô∏è Playwright not available (optional)")

# Import venue models after dependencies are installed
# (presumably resolved through the workspace "src" directories added to
# sys.path earlier in this cell; an ImportError here is not caught)
from cityvibe_core.models.venue import Venue, VenueCreate, VenuePublic
print("‚úÖ Successfully imported cityvibe_core models")

‚úÖ All dependencies are available
‚úÖ Playwright async API imported successfully
‚úÖ Successfully imported cityvibe_core models


In [5]:
# Configure root logging at INFO level with a "timestamp - level - message" line format.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

In [None]:
def extract_lat_lon(soup: BeautifulSoup) -> tuple[Optional[Decimal], Optional[Decimal]]:
    """Extract latitude and longitude from the Google Maps references in the page.

    The ``src`` of the sidebar map image (``.locatie-small img``) is tried first;
    if it is missing or empty, the ``style`` attribute of the large map link
    (``.locatie-large a``) is used instead. The coordinates are read from a
    ``center=<lat>,<lon>`` fragment in that text.

    Args:
        soup: Parsed venue page.

    Returns:
        ``(latitude, longitude)`` as ``Decimal`` values, or ``(None, None)`` when
        no map reference exists or it holds no parsable coordinate pair.
    """
    sources = (
        (".locatie-small img", "src"),   # 1. sidebar map
        (".locatie-large a", "style"),   # 2. fallback: large map
    )

    url_to_check = None
    for selector, attribute in sources:
        element = soup.select_one(selector)
        value = element.get(attribute) if element else None
        if value:
            url_to_check = str(value)
            break

    if not url_to_check:
        return None, None

    # Allow a leading minus sign (southern/western hemisphere) and a
    # URL-encoded comma (%2C) between the two numbers.
    match = re.search(r'center=(-?[\d.]+)(?:,|%2[Cc])(-?[\d.]+)', url_to_check)
    if match is None:
        return None, None

    try:
        return Decimal(match.group(1)), Decimal(match.group(2))
    except (InvalidOperation, ValueError):
        # The character class also matches malformed numbers such as "1.2.3";
        # treat those as "no coordinates" instead of failing the whole parse.
        return None, None

def extract_opening_hours(soup: BeautifulSoup) -> Dict[str, str]:
    """Parse the opening-hours table of a venue page.

    Args:
        soup: Parsed venue page.

    Returns:
        A mapping from the day label to ``"Gesloten"`` (closed),
        ``"<start> - <end>"`` (with ``"?"`` for a missing side) or ``"Unknown"``
        when the row has no usable data. Rows without a day label are skipped;
        the mapping is empty when the page has no opening-hours table.
    """

    def describe(data_div) -> str:
        # Turn the data cell of one row into its textual representation.
        if not data_div:
            return "Unknown"
        if data_div.select_one(".openingstijden-gesloten"):
            return "Gesloten"
        time_div = data_div.select_one(".openingstijden-restaurant")
        if not time_div:
            return "Unknown"
        start = time_div.select_one(".start")
        end = time_div.select_one(".einde")
        s_txt = start.get_text(strip=True) if start else "?"
        e_txt = end.get_text(strip=True) if end else "?"
        return f"{s_txt} - {e_txt}"

    schedule = {}
    table = soup.select_one(".openingstijden-tabel")
    if table:
        for row in table.select(".openingstijden-tabel-tr"):
            label_div = row.select_one(".openingstijden-label div")
            if label_div:
                day = label_div.get_text(strip=True)
                schedule[day] = describe(row.select_one(".openingstijden-data"))
    return schedule

def extract_venue_features(soup: BeautifulSoup) -> tuple[str, Dict[str, str]]:
    """Read the venue type and the feature list from the ``.kenmerken`` section.

    Args:
        soup: Parsed venue page.

    Returns:
        ``(venue_type, features)``. ``features`` maps each ``<dt>`` text to its
        ``<dd>`` text; ``venue_type`` is the value of the entry whose key
        contains "Soort zaak", or ``"Restaurant"`` when there is none.
    """
    venue_type = "Restaurant"
    features = {}

    container = soup.select_one(".kenmerken .content")
    if not container:
        return venue_type, features

    for entry in container.find_all("dl"):
        term = entry.find("dt")
        definition = entry.find("dd")
        if not (term and definition):
            continue  # incomplete pair, nothing to record
        label = term.get_text(strip=True)
        value = definition.get_text(strip=True)
        features[label] = value
        if "Soort zaak" in label:
            venue_type = value

    return venue_type, features

def extract_address_info(soup: BeautifulSoup) -> Dict[str, str | None]:
    """Haalt straat, postcode, stad en naam op."""
    info = {"name": None, "street": None, "zip_code": None, "city": "Amsterdam"}
    
    address_div = soup.select_one(".address")
    if address_div:
        h1 = address_div.find("h1")
        if h1:
            info["name"] = h1.get_text(strip=True)
        
        street_span = address_div.select_one(".street")
        if street_span:
            info["street"] = street_span.get_text(strip=True)
            
        zip_span = address_div.select_one(".postcode")
        if zip_span:
            info["zip_code"] = zip_span.get_text(strip=True)
        
        city_span = address_div.select_one(".city")
        if city_span:
            info["city"] = city_span.get_text(strip=True)
            
    return info

def parse_venue_html(html_content: str, url: str) -> Optional[VenueCreate]:
    """Main parser: combines the extract_* helpers into one venue record.

    Args:
        html_content: Raw HTML of a venue detail page.
        url: URL the HTML was fetched from; stored as ``original_url`` and used
            as ``website_url`` when the page links to no external website.

    Returns:
        A ``VenueCreate`` object, or ``None`` (after logging a warning) when no
        venue name could be found.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # 1. Basic info; a page without a name is treated as unparsable
    addr_info = extract_address_info(soup)
    if not addr_info["name"]:
        logging.warning(f"Parse error: Geen naam gevonden voor {url}")
        return None

    # 2. Geo, features and opening hours (delegated to the helper functions)
    lat, lon = extract_lat_lon(soup)
    venue_type, features = extract_venue_features(soup)
    opening_hours = extract_opening_hours(soup)

    # 3. External link and image (simple enough to keep inline).
    # First anchor whose text mentions "website" and does not point back to debuik.
    external_website = None
    for a in soup.find_all("a", href=True):
        txt = a.get_text().lower()
        if "website" in txt and "debuik" not in a['href']:
            external_website = a['href']
            break

    final_url = external_website if external_website else url

    image_url = None
    img_tag = soup.select_one("img.imgfade-transition")
    if img_tag and img_tag.get("src"):
        image_url = img_tag["src"]

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # timestamp and drop the tzinfo so the stored string keeps the exact
    # naive ISO format that utcnow().isoformat() produced.
    scraped_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    # 4. Construct object
    config = {
        "source": "debuik.nl",
        "original_url": url,
        "scraped_at": scraped_at,
        "street": addr_info["street"],
        "zip_code": addr_info["zip_code"],
        "image_url": image_url,
        "features": features,
        "opening_hours": opening_hours
    }

    return VenueCreate(
        name=addr_info["name"],
        website_url=final_url,
        city=addr_info["city"],
        state="Noord-Holland",
        country="NL",
        latitude=lat,
        longitude=lon,
        venue_type=venue_type,
        scraper_config=config,
        active=True
    )

In [41]:
# Search page on debuik.nl that fetch_listing_urls() opens to collect venue links.
SEARCH_URL = "https://www.debuik.nl/amsterdam/zoek/restaurant"

async def fetch_listing_urls(limit: int = 10) -> List[str]:
    """Open the search page with Playwright and collect venue detail URLs.

    The page is scrolled twice (with a one-second pause each) only when more
    than 10 URLs are requested. Any exception raised while loading or reading
    the page is logged, and whatever was collected so far (possibly nothing)
    is returned.

    Args:
        limit: Maximum number of URLs to return.

    Returns:
        Up to ``limit`` unique URLs containing ``/amsterdam/restaurant/``, in the
        order in which they appear on the page.
    """
    logging.info(f"Start Playwright (limit={limit})...")
    unique_urls = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        try:
            await page.goto(SEARCH_URL)
            await page.wait_for_selector('a[href*="/amsterdam/restaurant/"]', timeout=15000)

            # Scroll when more results are needed than the initial load provides
            if limit > 10:
                for _ in range(2):
                    await page.mouse.wheel(0, 3000)
                    await asyncio.sleep(1)

            hrefs = await page.evaluate("""() => {
                return Array.from(document.querySelectorAll('a[href*="/amsterdam/restaurant/"]'))
                    .map(a => a.href)
            }""")

            # Filter and de-duplicate. dict.fromkeys keeps first-seen order; a
            # set would shuffle the URLs, so the [:limit] slice below would pick
            # an arbitrary subset that changes from run to run.
            unique_urls = list(dict.fromkeys(
                href for href in hrefs if "/amsterdam/restaurant/" in href
            ))

        except Exception as e:
            logging.error(f"Playwright error: {e}")
        finally:
            await browser.close()

    return unique_urls[:limit]

async def fetch_and_parse_venue(client: httpx.AsyncClient, url: str) -> Optional[VenueCreate]:
    """Download one venue page with HTTPX and hand the HTML to the parser.

    Args:
        client: HTTP client used for the request.
        url: Venue detail page to fetch.

    Returns:
        The result of ``parse_venue_html`` for a 200 response; ``None`` for any
        other status code (logged as a warning) or when an exception occurs
        (logged as an error).
    """
    try:
        response = await client.get(url, timeout=15)
        if response.status_code != 200:
            logging.warning(f"Status {response.status_code} voor {url}")
            return None
        # Delegate to the pure parsing function
        return parse_venue_html(response.text, url)
    except Exception as e:
        logging.error(f"Error scraping {url}: {e}")
        return None

async def run_scraper(limit: int = 5) -> List[VenueCreate]:
    """Entry point that ties URL discovery and detail scraping together.

    Args:
        limit: Maximum number of venue URLs to process.

    Returns:
        The successfully parsed venues, in processing order.
    """
    # Step 1: discover the detail-page URLs
    urls = await fetch_listing_urls(limit)
    logging.info(f"{len(urls)} URLs gevonden. Details ophalen...")

    venues = []

    # Step 2: fetch and parse every detail page sequentially
    async with httpx.AsyncClient(
        headers={"User-Agent": "Mozilla/5.0 (compatible; Scraper/1.0)"}
    ) as client:
        for i, url in enumerate(urls, 1):
            logging.info(f"Processing ({i}/{len(urls)}): {url}")

            if (venue := await fetch_and_parse_venue(client, url)):
                venues.append(venue)

            # Rate limiting: pause half a second between requests
            await asyncio.sleep(0.5)

    return venues

In [47]:
def extract_address_info(soup: BeautifulSoup) -> Dict[str, str | None]:
    info = {"name": None, "street": None, "zip_code": None, "city": "Amsterdam"}
    
    # 1. Probeer de .address div (Server Side Rendered)
    address_div = soup.select_one("div.address")
    if address_div:
        h1 = address_div.find("h1")
        if h1: info["name"] = h1.get_text(strip=True)
        if address_div.select_one(".street"): info["street"] = address_div.select_one(".street").get_text(strip=True)
        if address_div.select_one(".postcode"): info["zip_code"] = address_div.select_one(".postcode").get_text(strip=True)
        if address_div.select_one(".city"): info["city"] = address_div.select_one(".city").get_text(strip=True)
        
    # 2. Fallback: Zoek naar H1 als de div structuur anders is (voor de zekerheid)
    if not info["name"]:
        h1 = soup.find("h1")
        if h1: info["name"] = h1.get_text(strip=True)

    return info

def extract_description(soup: BeautifulSoup) -> Optional[str]:
    """Return the text of the ``.introductie`` element, or ``None`` if absent.

    The text is collected with a single space as separator and stripped.
    """
    intro_block = soup.select_one(".introductie")
    return intro_block.get_text(separator=" ", strip=True) if intro_block else None

def extract_image(soup: BeautifulSoup) -> Optional[str]:
    """Return the URL of the venue image, or ``None`` when the page has none.

    The slideshow image is preferred; the first thumbnail is the fallback.
    An image element without a non-empty ``src`` is ignored.
    """
    selectors_by_priority = (
        ".restaurant-slideshow .restaurant-slide img.imgfade-transition",  # slideshow image
        ".thumbnails img",                                                 # thumbnail fallback
    )
    for selector in selectors_by_priority:
        element = soup.select_one(selector)
        if element and element.get("src"):
            return element["src"]
    return None

def extract_lat_lon(soup: BeautifulSoup) -> tuple[Optional[Decimal], Optional[Decimal]]:
    """Extract latitude and longitude from the Google Maps references in the page.

    The ``src`` of the sidebar map image (``.locatie-small img``) is tried first;
    if it is missing or empty, the ``style`` attribute of the large map link
    (``.locatie-large a``) is used. The coordinates are read from a
    ``center=<lat>,<lon>`` fragment in that text.

    Args:
        soup: Parsed venue page.

    Returns:
        ``(latitude, longitude)`` as ``Decimal`` values, or ``(None, None)`` when
        no map reference exists or it holds no parsable coordinate pair.
    """
    url_to_check = None
    map_img = soup.select_one(".locatie-small img")
    if map_img:
        url_to_check = map_img.get('src')

    if not url_to_check:
        map_div = soup.select_one(".locatie-large a")
        if map_div:
            url_to_check = map_div.get('style')

    if not url_to_check:
        return None, None

    # Allow a leading minus sign (southern/western hemisphere) and a
    # URL-encoded comma (%2C) between the two numbers.
    match = re.search(r'center=(-?[\d.]+)(?:,|%2[Cc])(-?[\d.]+)', str(url_to_check))
    if match is None:
        return None, None

    try:
        return Decimal(match.group(1)), Decimal(match.group(2))
    except (InvalidOperation, ValueError):
        # Only number-conversion failures (e.g. "1.2.3") are treated as "no
        # coordinates"; a bare except would also hide unrelated errors.
        return None, None

def extract_opening_hours(soup: BeautifulSoup) -> Dict[str, str]:
    """Parse the opening-hours table of a venue page.

    Args:
        soup: Parsed venue page.

    Returns:
        A mapping from the day label to ``"Gesloten"`` (closed),
        ``"<start> - <end>"`` or ``"Unknown"``. A missing start or end element is
        rendered as ``"?"``. Rows without a day label or without a data cell are
        skipped; the mapping is empty when the page has no opening-hours table.
    """
    hours = {}
    table = soup.select_one(".openingstijden-tabel")
    if not table:
        return hours

    for row in table.select(".openingstijden-tabel-tr"):
        day_div = row.select_one(".openingstijden-label div")
        if not day_div:
            continue
        day = day_div.get_text(strip=True)

        data_div = row.select_one(".openingstijden-data")
        if not data_div:
            continue

        if data_div.select_one(".openingstijden-gesloten"):
            hours[day] = "Gesloten"
            continue

        time_div = data_div.select_one(".openingstijden-restaurant")
        if not time_div:
            hours[day] = "Unknown"
            continue

        # select_one returns None for a missing element; calling get_text on it
        # would abort the whole parse with an AttributeError, so fall back to "?".
        start = time_div.select_one(".start")
        end = time_div.select_one(".einde")
        s = start.get_text(strip=True) if start else "?"
        e = end.get_text(strip=True) if end else "?"
        hours[day] = f"{s} - {e}"

    return hours

def extract_venue_features(soup: BeautifulSoup) -> tuple[str, Dict[str, str]]:
    """Read the venue type and the feature list from the ``.kenmerken`` section.

    Args:
        soup: Parsed venue page.

    Returns:
        ``(venue_type, features)``. ``features`` maps each ``<dt>`` text to its
        ``<dd>`` text; ``venue_type`` is the value of the entry whose key
        contains "Soort zaak", or ``"Restaurant"`` when there is none.
    """
    features = {}
    venue_type = "Restaurant"

    section = soup.select_one(".kenmerken .content")
    entries = section.find_all("dl") if section else []

    for entry in entries:
        label_tag = entry.find("dt")
        value_tag = entry.find("dd")
        if label_tag and value_tag:
            label = label_tag.get_text(strip=True)
            value = value_tag.get_text(strip=True)
            features[label] = value
            # The "Soort zaak" (kind of business) entry doubles as the venue type
            if "Soort zaak" in label:
                venue_type = value

    return venue_type, features

def parse_venue_html(html_content: str, url: str) -> Optional[VenueCreate]:
    """Parse a venue detail page into a ``VenueCreate`` object.

    Args:
        html_content: Raw HTML of a venue detail page.
        url: URL the HTML was fetched from; stored as ``original_url`` and used
            as ``website_url`` when the contact panel links to no external site.

    Returns:
        A ``VenueCreate`` object, or ``None`` when no venue name could be found
        (the page title is printed in that case to help debugging).
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # 1. Basic info; a page without a name is treated as unparsable
    addr_info = extract_address_info(soup)
    if not addr_info["name"]:
        # DEBUG: print the page title when parsing fails
        print(f"‚ö†Ô∏è  Parse Error. Pagina Titel: {soup.title.string if soup.title else 'Geen titel'}")
        return None

    # 2. Extractions
    lat, lon = extract_lat_lon(soup)
    venue_type, features = extract_venue_features(soup)
    opening_hours = extract_opening_hours(soup)
    description = extract_description(soup)
    image_url = extract_image(soup)

    # 3. Website: first link in the contact panel whose text mentions "website"
    #    and that does not point back to debuik.nl; falls back to the page URL.
    external_website = None
    sidebar = soup.select_one(".restaurant-contactvlak")
    if sidebar:
        for a in sidebar.find_all("a", href=True):
            if "website" in a.get_text().lower() and "debuik.nl" not in a['href']:
                external_website = a['href']
                break

    final_url = external_website if external_website else url

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # timestamp and drop the tzinfo so the stored string keeps the exact
    # naive ISO format that utcnow().isoformat() produced.
    scraped_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    config = {
        "source": "debuik.nl",
        "original_url": url,
        "scraped_at": scraped_at,
        "street": addr_info["street"],
        "zip_code": addr_info["zip_code"],
        "description": description,
        "image_url": image_url,
        "features": features,
        "opening_hours": opening_hours
    }

    return VenueCreate(
        name=addr_info["name"],
        website_url=final_url,
        city=addr_info["city"],
        state="Noord-Holland",
        country="NL",
        latitude=lat,
        longitude=lon,
        venue_type=venue_type,
        scraper_config=config,
        active=True
    )

# ==============================================================================
# 3. TEST MET PLAYWRIGHT (Browser simulatie)
# ==============================================================================

async def test_single_url_with_browser(target_url: str):
    """Load one venue page in a headless Chromium browser, parse it and print the result.

    The rendered HTML is passed to ``parse_venue_html``; on success a summary
    (name, type, address, coordinates, image, description excerpt and opening
    hours) is printed. Errors raised while loading or parsing are printed
    rather than raised, and the browser is always closed.

    Args:
        target_url: Venue detail page to load.
    """
    print(f"üöÄ Start Playwright browser voor: {target_url}")

    async with async_playwright() as p:
        # Launch the browser (headless=True shows no window; set False to watch)
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = await context.new_page()

        try:
            # Navigate to the page
            await page.goto(target_url, timeout=30000)

            # Give the div.address block a moment to appear; parsing is still
            # attempted if it does not.
            try:
                await page.wait_for_selector("div.address", timeout=5000)
            except Exception:
                # Not a bare except: task cancellation and KeyboardInterrupt
                # must propagate instead of being reported as a timeout.
                print("‚ö†Ô∏è  Timeout: Kon .address div niet vinden op de pagina.")

            # Fetch the complete rendered HTML
            html_content = await page.content()

            print("‚úÖ HTML opgehaald. Start parser...")
            venue = parse_venue_html(html_content, target_url)

            if venue:
                print("\nüéâ SUCCESVOL GESPRAAPT:")
                print("="*60)
                print(f"Naam:        {venue.name}")
                print(f"Type:        {venue.venue_type}")
                print(f"Adres:       {venue.scraper_config['street']}, {venue.scraper_config['zip_code']}")
                print(f"Geo:         {venue.latitude}, {venue.longitude}")
                print(f"Img:         {venue.scraper_config['image_url']}")
                if venue.scraper_config.get('description'):
                    print(f"Desc:        {venue.scraper_config['description'][:100]}...")
                print("-" * 60)
                print("Openingstijden:")
                print(json.dumps(venue.scraper_config['opening_hours'], indent=2))
                print("="*60)
            else:
                print("‚ùå Parsen mislukt.")

        except Exception as e:
            print(f"‚ùå Browser error: {e}")
        finally:
            await browser.close()

# Run the test against a single venue page.
# Uses top-level await, so this cell must be executed in the notebook kernel.
url = "https://www.debuik.nl/amsterdam/restaurant/petitbysam"
await test_single_url_with_browser(url)

üöÄ Start Playwright browser voor: https://www.debuik.nl/amsterdam/restaurant/petitbysam
‚úÖ HTML opgehaald. Start parser...

üéâ SUCCESVOL GESPRAAPT:
Naam:        PetitbySam
Type:        Delicatessenzaak
Adres:       Vijzelstraat 93, 1017 HA
Geo:         52.3633362, 4.8924013
Img:         https://www.debuik.nl/fp/zhdlWu6QpalbAUPWOhfU/convert?&w=2660&h=1290&fit=crop
Desc:        PetitbySam van is een fijne delicatessenzaak aan de Vijzelstraat in Amsterdam. Eigenaresse Smaita Ra...
------------------------------------------------------------
Openingstijden:
{
  "Maandag": "Gesloten",
  "Dinsdag": "08:00 - 18:00",
  "Woensdag": "08:00 - 18:00",
  "Donderdag": "08:00 - 18:00",
  "Vrijdag": "08:00 - 18:00",
  "Zaterdag": "10:00 - 18:00",
  "Zondag": "10:00 - 18:00"
}


  "scraped_at": datetime.utcnow().isoformat(),
