In [5]:
import requests
import json
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import time
import random

In [6]:
# ==============================================================================
# 1. NORMALIZATION FUNCTIONS (Exact structure provided by you)
# ==============================================================================

def normalize_location_as_event(loc, page_props, url):
    """
    Take a Location block (like AMAZE) and return ONLY the event/visit-related
    fields in a clean, flat structure.
    """
    seo = page_props.get("seo", {})
    localizations = page_props.get("localizations", [])

    images = loc.get("images") or []
    main_image = images[0]["src"] if images else None

    address = loc.get("address") or {}
    coords = loc.get("coordinates") or {}

    return {
        "kind": "Location",
        "id": loc.get("id"),
        "ffID": loc.get("ffID"),
        "slug": loc.get("slug"),
        "name": loc.get("name"),
        "category": loc.get("category", []),

        # SEO / routing
        "directory": page_props.get("directory"),
        "pages": page_props.get("pages", []),
        "seo_title": seo.get("title"),
        "seo_description": seo.get("description"),
        "seo_slug": seo.get("slug"),
        "seo_og_image": seo.get("ogImage"),
        "localizations": localizations,

        # Content
        "intro": loc.get("intro"),
        "description_html": loc.get("description"),

        # Media
        "main_image": main_image,
        "images": images,

        # Visit info
        "alwaysOpen": loc.get("alwaysOpen"),
        "closedDates": loc.get("closedDates"),
        "businessHours": loc.get("businessHours", {}),
        "openUntil": loc.get("openUntil"),
        "soldOut": loc.get("soldOut"),

        # Contact / location
        "address": {
            "street": address.get("street"),
            "houseNumber": address.get("houseNumber"),
            "zipcode": address.get("zipcode"),
            "city": address.get("city"),
        },
        "coordinates": {
            "lat": coords.get("latitude"),
            "lng": coords.get("longitude"),
        },
        "phoneNumber": loc.get("phoneNumber"),
        "email": loc.get("email"),
        "urls": loc.get("urls", []),

        # Commercial info
        "promotions": loc.get("promotions", []),
        
        # Where it came from
        "source_url": url,
    }


def normalize_event(event, page_props, url):
    """
    Normalize real Event pages to match the schema.
    """
    seo = page_props.get("seo", {})
    localizations = page_props.get("localizations", [])

    images = event.get("images") or []
    main_image = images[0]["src"] if images else None

    return {
        "kind": "Event",
        "id": event.get("id"),
        "slug": event.get("slug"),
        "name": event.get("title") or event.get("name"),

        "directory": page_props.get("directory"),
        "pages": page_props.get("pages", []),
        "seo_title": seo.get("title"),
        "seo_description": seo.get("description"),
        "seo_slug": seo.get("slug"),
        "seo_og_image": seo.get("ogImage"),
        "localizations": localizations,

        "intro": event.get("intro"),
        "description_html": event.get("description"),

        "main_image": main_image,
        "images": images,

        "dates": event.get("date") or event.get("dates"),
        "highlights": event.get("highlights", []),

        "source_url": url,
    }


In [7]:
# ==============================================================================
# 2. PAGE SCRAPER (Logic provided by you)
# ==============================================================================

def scrape_event_page(url):
    try:
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko)"
            )
        }
        # Slight timeout increase for stability
        response = requests.get(url, headers=headers, timeout=15)

        if response.status_code != 200:
            # print(f"Skipping {url} (Status: {response.status_code})")
            return None

        soup = BeautifulSoup(response.content, "html.parser")
        script_tag = soup.find("script", id="__NEXT_DATA__")

        if not script_tag:
            print(f"Error: No __NEXT_DATA__ found in {url}")
            return None

        json_obj = json.loads(script_tag.string)
        page_props = json_obj.get("props", {}).get("pageProps", {}) or {}

        page_type = page_props.get("pageType", "Unknown")

        # ---------- EVENT PAGES ----------
        if (
            page_type == "Event"
            or "event" in page_props
            or "Event" in page_props
        ):
            event = (
                page_props.get("event")
                or page_props.get("Event")
                or page_props.get("data")
            )
            if not event:
                # print(f"No event object found in {url}")
                return None

            return normalize_event(event, page_props, url)

        # ---------- LOCATION PAGES (like AMAZE) ----------
        if (
            page_type == "Location"
            or "Location" in page_props
            or "location" in page_props
        ):
            loc = (
                page_props.get("Location")
                or page_props.get("location")
            )

            # Fallback: some pages inline the location-like data on the root
            if not loc and "name" in page_props:
                loc = page_props

            if not loc:
                # print(f"No Location object found in {url}")
                return None

            return normalize_location_as_event(loc, page_props, url)

        # everything else is not relevant as event data
        return None

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None


In [8]:
# ==============================================================================
# 3. SITEMAP DISCOVERY
# ==============================================================================

def get_sitemap_urls():
    sitemap_url = "https://www.iamsterdam.com/sitemap.xml"
    print(f"Fetching sitemap from: {sitemap_url}")
    
    try:
        response = requests.get(sitemap_url, timeout=30)
        response.raise_for_status()
        
        root = ET.fromstring(response.content)
        # Handle the standard sitemap namespace
        namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
        
        all_urls = []
        
        for url_tag in root.findall('ns:url', namespace):
            loc = url_tag.find('ns:loc', namespace).text
            
            # Filter logic: 
            # 1. Must be in /uit/agenda/ (Dutch) or /whats-on/calendar/ (English)
            # 2. Must be deep enough (exclude listing pages)
            if "/uit/agenda/" in loc or "/whats-on/calendar/" in loc:
                # Usually detail pages have 6+ segments: domain / lang / section / sub / category / item
                if len(loc.split("/")) > 6:
                    all_urls.append(loc)
                    
        return list(set(all_urls))
        
    except Exception as e:
        print(f"Critical Error parsing sitemap: {e}")
        return []

In [9]:
urls = get_sitemap_urls()
print(f"Found {len(urls)} potential event/location URLs in sitemap.")

Fetching sitemap from: https://www.iamsterdam.com/sitemap.xml
Found 5531 potential event/location URLs in sitemap.


In [12]:
# ==============================================================================
# SCRAPE 50 RANDOM URLS
# ==============================================================================

# Get all URLs from sitemap (or use existing 'urls' variable if already loaded)
if 'urls' not in locals() or len(urls) == 0:
    urls = get_sitemap_urls()
    print(f"Fetched {len(urls)} URLs from sitemap")

# Select 50 random URLs
random.seed(42)  # For reproducibility
random_urls = random.sample(urls, min(50, len(urls)))
print(f"\nSelected {len(random_urls)} random URLs to scrape")
print(f"First 5 URLs:")
for i, url in enumerate(random_urls[:5], 1):
    print(f"  {i}. {url}")

# Scrape the URLs
print(f"\n{'='*80}")
print("Starting scraping process...")
print(f"{'='*80}\n")

results = []
start_time = time.time()

# Use ThreadPoolExecutor for parallel scraping (10 workers for 50 URLs)
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {executor.submit(scrape_event_page, url): url for url in random_urls}
    
    processed_count = 0
    for future in futures:
        url = futures[future]
        try:
            data = future.result()
            if data:
                results.append(data)
                print(f"✅ [{processed_count + 1}/{len(random_urls)}] Successfully scraped: {url[:80]}...")
            else:
                print(f"⚠️  [{processed_count + 1}/{len(random_urls)}] No data extracted from: {url[:80]}...")
            
            processed_count += 1
            
        except Exception as exc:
            print(f"❌ [{processed_count + 1}/{len(random_urls)}] Error scraping {url[:80]}...: {exc}")
            processed_count += 1

duration = time.time() - start_time

print(f"\n{'='*80}")
print(f"Scraping completed in {duration:.2f} seconds")
print(f"Successfully extracted data for {len(results)} out of {len(random_urls)} URLs")
print(f"Success rate: {len(results)/len(random_urls)*100:.1f}%")
print(f"{'='*80}\n")

# Display summary
if results:
    print("Summary of scraped items:")
    event_count = sum(1 for r in results if r.get('kind') == 'Event')
    location_count = sum(1 for r in results if r.get('kind') == 'Location')
    print(f"  - Events: {event_count}")
    print(f"  - Locations: {location_count}")
    
    # Show a sample result
    print(f"\nSample result (first item):")
    print(json.dumps(results[0], indent=2, ensure_ascii=False)[:500] + "...")

# Store results in variable for further use
scraped_results = results



Selected 50 random URLs to scrape
First 5 URLs:
  1. https://www.iamsterdam.com/uit/agenda/overnachten/accommodaties/de-malle-moolen
  2. https://www.iamsterdam.com/uit/agenda/winkelen/winkels/beadies
  3. https://www.iamsterdam.com/en/whats-on/calendar/attractions-and-sights/tours/that-dam-guide-amsterdam-private-tours
  4. https://www.iamsterdam.com/en/whats-on/calendar/attractions-and-sights/tours/smart-phone-city-challenge
  5. https://www.iamsterdam.com/en/whats-on/calendar/eating-and-drinking/cafes-and-bars/palladium

Starting scraping process...

✅ [1/50] Successfully scraped: https://www.iamsterdam.com/uit/agenda/overnachten/accommodaties/de-malle-moolen...
✅ [2/50] Successfully scraped: https://www.iamsterdam.com/uit/agenda/winkelen/winkels/beadies...
✅ [3/50] Successfully scraped: https://www.iamsterdam.com/en/whats-on/calendar/attractions-and-sights/tours/tha...
✅ [4/50] Successfully scraped: https://www.iamsterdam.com/en/whats-on/calendar/attractions-and-sights/tours/sma..

In [13]:
scraped_results

[{'kind': 'Location',
  'id': '39e0338a-ad92-4718-85ac-4cc34c4f003e',
  'ffID': '5b068a92adbe1d0001d24654',
  'slug': 'uit/agenda/overnachten/accommodaties/de-malle-moolen',
  'name': 'De Malle Moolen',
  'category': [{'title': 'Accommodaties',
    'slug': 'uit/agenda/overnachten/accommodaties'}],
  'directory': 'uit',
  'pages': ['agenda', 'overnachten', 'accommodaties', 'de-malle-moolen'],
  'seo_title': 'De Malle Moolen',
  'seo_description': 'Hotel de Mallemolen is een eensterrenhotel gevestigd in een monumentaal pand. Het hotel heeft 13 kamers, met gedeelde badkamer faciliteiten (douche/toilet).',
  'seo_slug': 'uit/agenda/overnachten/accommodaties/de-malle-moolen',
  'seo_og_image': 'https://app.thefeedfactory.nl/api/assets/63b2fe61cdc49f5de115f2a9/malle_moolen.jpg',
  'localizations': [{'locale': 'en-GB',
    'slug': 'whats-on/calendar/accomodations/all-accomodations/de-malle-moolen'},
   {'locale': 'nl-NL',
    'slug': 'uit/agenda/overnachten/accommodaties/de-malle-moolen'}],
 

In [14]:
# ==============================================================================
# PRETTY PRINT JSON RESULTS
# ==============================================================================

# Check if we have results to display
if 'scraped_results' in locals() and scraped_results:
    print(f"Pretty printing {len(scraped_results)} results:\n")
    print("=" * 80)
    print(json.dumps(scraped_results, indent=2, ensure_ascii=False))
    print("=" * 80)
elif 'results' in locals() and results:
    print(f"Pretty printing {len(results)} results:\n")
    print("=" * 80)
    print(json.dumps(results, indent=2, ensure_ascii=False))
    print("=" * 80)
else:
    print("No results found. Please run the scraping cell first.")
    print("Available variables:", [k for k in locals().keys() if not k.startswith('_')])


Pretty printing 50 results:

[
  {
    "kind": "Location",
    "id": "39e0338a-ad92-4718-85ac-4cc34c4f003e",
    "ffID": "5b068a92adbe1d0001d24654",
    "slug": "uit/agenda/overnachten/accommodaties/de-malle-moolen",
    "name": "De Malle Moolen",
    "category": [
      {
        "title": "Accommodaties",
        "slug": "uit/agenda/overnachten/accommodaties"
      }
    ],
    "directory": "uit",
    "pages": [
      "agenda",
      "overnachten",
      "accommodaties",
      "de-malle-moolen"
    ],
    "seo_title": "De Malle Moolen",
    "seo_description": "Hotel de Mallemolen is een eensterrenhotel gevestigd in een monumentaal pand. Het hotel heeft 13 kamers, met gedeelde badkamer faciliteiten (douche/toilet).",
    "seo_slug": "uit/agenda/overnachten/accommodaties/de-malle-moolen",
    "seo_og_image": "https://app.thefeedfactory.nl/api/assets/63b2fe61cdc49f5de115f2a9/malle_moolen.jpg",
    "localizations": [
      {
        "locale": "en-GB",
        "slug": "whats-on/calendar/ac

In [None]:
# A. Get URLs from Sitemap
urls = get_sitemap_urls()
print(f"Found {len(urls)} potential event/location URLs in sitemap.")

# B. Optional: Slice for testing (remove [:50] to scrape all 3000+)
urls_to_process = urls #[:50] 

print(f"Starting extraction for {len(urls_to_process)} URLs using parallel threads...")

results = []
start_time = time.time()

# C. Run Parallel Scraper
# Using 20 threads implies 20 concurrent connections.
with ThreadPoolExecutor(max_workers=20) as executor:
    futures = {executor.submit(scrape_event_page, url): url for url in urls_to_process}
    
    processed_count = 0
    for future in futures:
        try:
            data = future.result()
            if data:
                results.append(data)
            
            processed_count += 1
            if processed_count % 50 == 0:
                print(f"Processed {processed_count}/{len(urls_to_process)}...")
                
        except Exception as exc:
            print(f"Thread exception: {exc}")

duration = time.time() - start_time
print(f"Scraping completed in {duration:.2f} seconds.")
print(f"Successfully extracted data for {len(results)} items.")

# D. Save to JSON
output_filename = "iamsterdam_data_full.json"
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)
    
print(f"Data saved to {output_filename}")

## Categories

In [11]:
# ==============================================================================
# EXTRACT ALL CATEGORIES AND SUBCATEGORIES FROM URLS
# ==============================================================================

def extract_categories_from_urls(urls):
    """
    Extract all unique categories and subcategories from a list of URLs.
    
    Returns a dictionary with structure:
    {
        'nl': {
            'category': ['subcategory1', 'subcategory2', ...],
            ...
        },
        'en': {
            'category': ['subcategory1', 'subcategory2', ...],
            ...
        }
    }
    """
    categories = {
        'nl': {},  # Dutch: /uit/agenda/
        'en': {}   # English: /en/whats-on/calendar/
    }
    
    for url in urls:
        # Parse Dutch URLs: /uit/agenda/category/subcategory/item
        if '/uit/agenda/' in url:
            parts = url.split('/uit/agenda/')
            if len(parts) > 1:
                path_parts = [p for p in parts[1].split('/') if p]  # Remove empty strings
                if len(path_parts) >= 2:
                    category = path_parts[0]
                    subcategory = path_parts[1]
                    
                    if category not in categories['nl']:
                        categories['nl'][category] = set()
                    categories['nl'][category].add(subcategory)
        
        # Parse English URLs: /en/whats-on/calendar/category/subcategory/item
        elif '/whats-on/calendar/' in url:
            parts = url.split('/whats-on/calendar/')
            if len(parts) > 1:
                path_parts = [p for p in parts[1].split('/') if p]  # Remove empty strings
                if len(path_parts) >= 2:
                    category = path_parts[0]
                    subcategory = path_parts[1]
                    
                    if category not in categories['en']:
                        categories['en'][category] = set()
                    categories['en'][category].add(subcategory)
    
    # Convert sets to sorted lists for better readability
    for lang in categories:
        for category in categories[lang]:
            categories[lang][category] = sorted(list(categories[lang][category]))
    
    return categories


def print_category_structure(categories):
    """Pretty print the category structure"""
    print("=" * 80)
    print("CATEGORY STRUCTURE FOR IAMSTERDAM.COM")
    print("=" * 80)
    
    for lang in ['nl', 'en']:
        lang_name = 'Dutch (NL)' if lang == 'nl' else 'English (EN)'
        print(f"\n{lang_name} - /{'uit/agenda/' if lang == 'nl' else 'en/whats-on/calendar/'}")
        print("-" * 80)
        
        for category in sorted(categories[lang].keys()):
            subcategories = categories[lang][category]
            print(f"\n  {category}")
            for subcat in subcategories:
                print(f"    └─ {subcat}")
    
    print("\n" + "=" * 80)


# Get URLs and extract categories
urls = get_sitemap_urls()
categories = extract_categories_from_urls(urls)
print_category_structure(categories)

# Also return as a dictionary for programmatic use
categories

Fetching sitemap from: https://www.iamsterdam.com/sitemap.xml
CATEGORY STRUCTURE FOR IAMSTERDAM.COM

Dutch (NL) - /uit/agenda/
--------------------------------------------------------------------------------

  attracties-en-bezienswaardigheden
    └─ attracties
    └─ bezienswaardigheden
    └─ parken-en-natuurgebieden
    └─ rondvaarten
    └─ tours

  concerten-en-muziek
    └─ concerten

  eten-en-drinken
    └─ cafes-en-bars
    └─ restaurants

  festivals
    └─ events

  musea-en-galeries
    └─ galeries
    └─ musea

  nachtleven
    └─ clubbing

  overnachten
    └─ accommodaties

  tentoonstellingen
    └─ alle-tentoonstellingen

  vervoer
    └─ parkeren
    └─ verhuur

  voorstellingen
    └─ theater-en-podiumkunsten

  winkelen
    └─ markten
    └─ winkels

English (EN) - /en/whats-on/calendar/
--------------------------------------------------------------------------------

  accomodations
    └─ all-accomodations

  attractions-and-sights
    └─ attractions
    └─ canal

{'nl': {'voorstellingen': ['theater-en-podiumkunsten'],
  'attracties-en-bezienswaardigheden': ['attracties',
   'bezienswaardigheden',
   'parken-en-natuurgebieden',
   'rondvaarten',
   'tours'],
  'vervoer': ['parkeren', 'verhuur'],
  'musea-en-galeries': ['galeries', 'musea'],
  'eten-en-drinken': ['cafes-en-bars', 'restaurants'],
  'festivals': ['events'],
  'winkelen': ['markten', 'winkels'],
  'concerten-en-muziek': ['concerten'],
  'overnachten': ['accommodaties'],
  'tentoonstellingen': ['alle-tentoonstellingen'],
  'nachtleven': ['clubbing']},
 'en': {'eating-and-drinking': ['cafes-and-bars', 'restaurants'],
  'accomodations': ['all-accomodations'],
  'transportation': ['parking', 'rental'],
  'museums-and-galleries': ['galleries', 'museums'],
  'attractions-and-sights': ['attractions',
   'canal-cruises',
   'nature-and-active',
   'sights',
   'tours'],
  'concerts-and-music': ['concerts'],
  'festivals': ['events'],
  'shopping': ['markets', 'shops'],
  'theatre-and-stage'