In [20]:
import json
import os
import re
import time
from datetime import datetime
from time import sleep

from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException, TimeoutException, StaleElementReferenceException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Build the Chrome configuration: headless, desktop-sized window, desktop
# user agent, and the switches that hide the usual automation fingerprints.
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--window-size=1920,1080")  # Desktop resolution
chrome_options.add_argument("--start-maximized")  # Maximize window
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")  # Desktop user agent
chrome_options.add_argument('--disable-blink-features=AutomationControlled')  # Hide automation
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])  # Hide automation
chrome_options.add_experimental_option('useAutomationExtension', False)  # Hide automation

# Start Chrome; webdriver_manager downloads a matching chromedriver binary.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# Mask navigator.webdriver to avoid detection.
# The script is registered through the DevTools protocol so that Chrome injects
# it into every *new* document. A plain execute_script() call only patches the
# document that is loaded at call time, so the override would be gone as soon
# as driver.get() navigates to the schedule page.
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
)

url = "https://dekoepel.virtuagym.com//classes/week/?event_type=2&embedded=1"
try:
    driver.get(url)
except Exception:
    # Do not leave an orphaned headless Chrome process behind when the initial
    # page load fails; re-raise so the failure stays visible in the notebook.
    driver.quit()
    raise
print("WebDriver initialized successfully")

WebDriver initialized successfully


In [21]:
# Scrape class details from the weekly schedule, one modal per class, paging
# forward week by week until `max_classes` entries are collected or no further
# events / "volgende" link can be found. Whatever was collected is written to a
# timestamped JSON file in `output_dir`, and the browser is always shut down,
# even when the run is aborted by an unexpected error.
#
# Requires `driver` from the setup cell and the imports at the top of the
# notebook (json, os, re, time, datetime, selenium helpers, Keys).

# --- Configuration -----------------------------------------------------------
output_dir = "scraped_data"  # folder that receives the JSON export
max_classes = 100            # stop once this many classes were scraped
max_page_recoveries = 3      # page-level refresh retries before giving up

EVENT_XPATH = "//div[contains(@onclick, 'openScheduleModal')]"
MODAL_CSS = ".modal-content"
CLOSE_BUTTON_CSS = "button.close[data-dismiss='modal'][data-cy='modalDismissBtn']"
CLOSE_BUTTON_XPATH = "//button[@class='close' and @data-dismiss='modal' and @data-cy='modalDismissBtn']"
NEXT_WEEK_XPATH = "//a[contains(text(), 'volgende')]"

# Hides the modal through jQuery when the page provides it, then removes any
# leftover backdrop and the body class that blocks interaction with the page.
FORCE_CLOSE_JS = """
if (window.jQuery && window.jQuery.fn && window.jQuery.fn.modal) {
    window.jQuery('.modal').modal('hide');
}
document.querySelectorAll('.modal-backdrop').forEach(function (el) { el.remove(); });
document.body.classList.remove('modal-open');
"""

# Line classifiers for the modal body text (Dutch weekday names).
DATE_RE = re.compile(r"^(maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)\s+\d{2}\s+[a-z]+\s*$", re.IGNORECASE)
TIME_RE = re.compile(r"^\d{2}:\d{2}\s*-\s*\d{2}:\d{2}\s*$")
CAPACITY_RE = re.compile(r"^\d+\s*/\s*\d+\s*$")
NAME_RE = re.compile(r"^[A-Za-z\s]+$")
# Purely alphabetic lines containing one of these words are not instructor names.
NON_NAME_KEYWORDS = ["pilates", "reformer", "core", "lichaam"]


def parse_class_details(details_text):
    """Turn the raw text of a class modal into a structured dict.

    Each non-empty line is classified by pattern: weekday + day + month is the
    date, ``HH:MM - HH:MM`` the time, ``n / m`` the capacity, and a purely
    ASCII-alphabetic line without a class keyword the instructor (the last such
    line wins). The first remaining line that mentions "pilates" becomes the
    description. Lines containing "Welkom bij" or "Tot snel!" are skipped.

    Args:
        details_text: Text content of the modal body, lines separated by ``\\n``.

    Returns:
        dict with the keys ``description``, ``date``, ``time``, ``capacity``
        and ``instructor``; fields that were not found stay empty strings.
    """
    details = {
        "description": "",
        "date": "",
        "time": "",
        "capacity": "",
        "instructor": ""
    }
    for raw_line in details_text.split('\n'):
        line = raw_line.strip()
        if not line or "Welkom bij" in line or "Tot snel!" in line:
            continue  # Skip blank or promotional text
        lowered = line.lower()
        if DATE_RE.match(line):
            details["date"] = line
        elif TIME_RE.match(line):
            details["time"] = line
        elif CAPACITY_RE.match(line):
            details["capacity"] = line
        elif NAME_RE.match(line) and not any(keyword in lowered for keyword in NON_NAME_KEYWORDS):
            details["instructor"] = line
        elif not details["description"] and "pilates" in lowered:
            details["description"] = line
    return details


def wait_modal_closed(browser, timeout):
    """Block until the modal content is invisible; raises TimeoutException otherwise."""
    return WebDriverWait(browser, timeout).until(
        EC.invisibility_of_element_located((By.CSS_SELECTOR, MODAL_CSS))
    )


def force_close_modal(browser):
    """Last-resort dismissal: hide the modal and strip its backdrop via JavaScript."""
    browser.execute_script(FORCE_CLOSE_JS)


def close_modal(browser, max_close_attempts=3):
    """Try to dismiss the open class modal, escalating through fallbacks.

    Every attempt first clicks the close button (located by CSS) and checks
    that the modal disappeared. When that fails, one fallback is applied before
    the next attempt: attempt 0 clicks the button located by XPath, attempt 1
    sends Escape, later attempts force-close through JavaScript.

    Returns:
        True when the modal was confirmed closed by the primary method,
        False when only fallbacks ran (the caller should re-verify).
    """
    for attempt in range(max_close_attempts):
        try:
            close_button = WebDriverWait(browser, 5).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, CLOSE_BUTTON_CSS))
            )
            # JS click avoids interception by overlays on top of the button.
            browser.execute_script("arguments[0].click();", close_button)
            if wait_modal_closed(browser, 3):
                return True
        except WebDriverException:
            try:
                if attempt == 0:
                    fallback_button = WebDriverWait(browser, 3).until(
                        EC.element_to_be_clickable((By.XPATH, CLOSE_BUTTON_XPATH))
                    )
                    browser.execute_script("arguments[0].click();", fallback_button)
                elif attempt == 1:
                    webdriver.ActionChains(browser).send_keys(Keys.ESCAPE).perform()
                else:
                    force_close_modal(browser)
            except WebDriverException as fallback_error:
                # Best effort only: the next attempt (or the caller's final
                # check) decides whether the modal is really gone.
                print("Modal close fallback failed:", fallback_error)
    return False


# --- Scrape --------------------------------------------------------------------
os.makedirs(output_dir, exist_ok=True)

class_details = []   # one dict per scraped class, in scrape order
scraped_count = 0
page_recoveries = 0

try:
    while scraped_count < max_classes:
        try:
            # Get all clickable event elements of the currently shown week
            elements = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.XPATH, EVENT_XPATH))
            )

            for element in elements:
                if scraped_count >= max_classes:
                    break  # Quota reached

                try:
                    # Scroll into view and click the event itself. Waiting on
                    # the WebElement (instead of re-locating it by its id
                    # attribute) also works for events without a usable id.
                    driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", element)
                    WebDriverWait(driver, 5).until(EC.element_to_be_clickable(element)).click()

                    # Extract the modal details
                    modal_content = WebDriverWait(driver, 10).until(
                        EC.visibility_of_element_located((By.CSS_SELECTOR, MODAL_CSS))
                    )
                    details_text = modal_content.find_element(By.CSS_SELECTOR, ".modal-body").text

                    filtered_details = parse_class_details(details_text)
                    class_details.append(filtered_details)
                    scraped_count += 1
                    print(f"Scraped class {scraped_count}: {filtered_details}")

                    close_modal(driver)

                    # Ensure modal is fully closed before continuing
                    wait_modal_closed(driver, 5)
                    time.sleep(1)

                except ElementClickInterceptedException:
                    # Something (usually a stale modal/backdrop) covered the
                    # event; clear it and move on to the next event.
                    print("Modal interaction issue, attempting recovery...")
                    force_close_modal(driver)
                    try:
                        wait_modal_closed(driver, 5)
                    except TimeoutException as e:
                        # Keep this local: letting it escape would be reported
                        # as "no more pages" and end the whole scrape.
                        print("Error interacting with modal:", e)
                    time.sleep(1)
                except (TimeoutException, StaleElementReferenceException) as e:
                    print("Error interacting with modal:", e)
                    continue

            if scraped_count >= max_classes:
                break  # Quota reached, no need to page further

            # Locate the 'volgende' link that leads to the next week
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, NEXT_WEEK_XPATH))
            )

            # Make sure no modal backdrop is left before moving on
            if driver.find_elements(By.CSS_SELECTOR, ".modal-backdrop"):
                force_close_modal(driver)
                time.sleep(0.5)

            driver.execute_script("arguments[0].click();", next_button)
            time.sleep(2)

        except TimeoutException:
            print("No more pages to scrape. Exiting...")
            break
        except ElementClickInterceptedException:
            page_recoveries += 1
            if page_recoveries > max_page_recoveries:
                # Bounded so a permanently blocked page cannot loop forever.
                print("Click intercepted too many times, giving up.")
                break
            print("Click intercepted, attempting recovery...")
            force_close_modal(driver)
            time.sleep(1)
            # NOTE: after the refresh the visible week is scraped again from
            # its first event, so already collected classes may be repeated.
            driver.refresh()
            time.sleep(3)
finally:
    try:
        # Save the results (also partial ones) to a JSON file
        output_file = os.path.join(output_dir, f"koepel_schedule_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(class_details, f, ensure_ascii=False, indent=4)
        print(f"Done! Saved {scraped_count} classes to {output_file}")
    finally:
        # Always release the browser process, even when saving failed.
        driver.quit()

Scraped class 1: {'description': 'Samen zullen we met behulp van een reformer in kleine groepen werken aan balans, mobiliteit en spierversteviging, met een grote nadruk op core stabiliteit. Ideaal voor pijnbestrijding, nek- en rugklachten en in het algemeen voor het bewegen van het lichaam. Bij pilates ligt de focus op controle, ademhaling en precisie. Een perfecte workout voor lichaam en geest.', 'date': 'maandag 05 mei', 'time': '07:00 - 07:45', 'capacity': '1 / 4', 'instructor': 'Gilltumn Vanhauwaert'}
Scraped class 2: {'description': 'Samen zullen we met behulp van een reformer in kleine groepen werken aan balans, mobiliteit en spierversteviging, met een grote nadruk op core stabiliteit. Ideaal voor pijnbestrijding, nek- en rugklachten en in het algemeen voor het bewegen van het lichaam. Bij pilates ligt de focus op controle, ademhaling en precisie. Een perfecte workout voor lichaam en geest.', 'date': 'maandag 05 mei', 'time': '08:00 - 08:45', 'capacity': '2 / 4', 'instructor': 'G