In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from collections import deque

# Initialize the WebDriver: start a Chrome session and keep it in the
# module-level `driver`. scrape_site() below reads it as a global, and the
# cell closes it again with driver.quit() once the crawl is done.
driver = webdriver.Chrome()

def scrape_site(start_url, max_depth=10):
    """Breadth-first crawl from ``start_url`` that collects visible page text.

    Every page is loaded through the module-level ``driver``. After a fixed
    two-second pause, the stripped, non-empty ``.text`` of every element
    matched by the XPath ``//*`` is collected. On pages shallower than
    ``max_depth`` every ``<a>`` whose ``href`` is an absolute http(s) URL is
    queued one level deeper.

    Args:
        start_url: URL of the first page to load (depth 0).
        max_depth: Deepest link level that is still visited. Links found on
            pages at this depth are not followed.

    Returns:
        list[str]: Text snippets of all successfully scraped pages, in visit
        order. Because ``//*`` matches ancestors as well as descendants, the
        same text can show up in several entries.

    Pages that raise a ``WebDriverException`` (navigation failure, stale
    element, ...) are reported with ``print`` and skipped, so one bad page
    does not throw away everything collected so far.
    """
    # Local import: base class of the errors raised by Selenium driver calls.
    from selenium.common.exceptions import WebDriverException

    # URLs that are queued or already visited. The queue is FIFO, so the
    # first time a URL is queued is also its shallowest depth; later
    # duplicates can be dropped without changing what gets visited.
    queued_urls = {start_url}
    to_visit = deque([(start_url, 0)])  # Queue of (url, depth)
    all_text_content = []

    while to_visit:
        url, depth = to_visit.popleft()
        if depth > max_depth:
            continue

        print(f"Scraping {url}, Depth: {depth}")

        try:
            driver.get(url)
            time.sleep(2)  # Wait for JavaScript to load

            # Read each element's text once; every access is a driver call.
            text_content = []
            for element in driver.find_elements(By.XPATH, '//*'):
                text = element.text.strip()
                if text:
                    text_content.append(text)

            hrefs = []
            if depth < max_depth:
                for link in driver.find_elements(By.TAG_NAME, 'a'):
                    href = link.get_attribute('href')
                    # Require a real http(s) scheme; a substring test would
                    # also accept e.g. "mailto:" links containing "http".
                    if href and href.startswith(('http://', 'https://')):
                        hrefs.append(href)
        except WebDriverException as exc:
            print(f"Skipping {url}: {exc}")
            continue

        all_text_content.extend(text_content)
        print(f"Text found on {url}: {text_content[:5]}")  # First 5 entries only, for brevity

        for href in hrefs:
            if href not in queued_urls:
                queued_urls.add(href)
                to_visit.append((href, depth + 1))

    return all_text_content

# Start scraping from the main page
try:
    all_text_content = scrape_site('https://www.ams.at/arbeitsuchende/')
finally:
    # Close the WebDriver even if the crawl raised, so the browser
    # process is not left running.
    driver.quit()

# Do something with the scraped data
print(all_text_content[:500])  # First 500 text entries (list items, not characters)


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from collections import deque

# Initialize the WebDriver: start a Chrome session and keep it in the
# module-level `driver`. accept_cookies() and scrape_site() below read it as
# a global, and the cell closes it again with driver.quit() after the crawl.
driver = webdriver.Chrome()

def accept_cookies(first_page):
    """Try to dismiss the cookie banner when ``first_page`` is truthy.

    Uses the module-level ``driver`` to find a button whose text contains
    'Einverstanden', clicks it and pauses for one second. Any exception is
    caught and reported with ``print``; nothing is re-raised. Does nothing
    when ``first_page`` is falsy.
    """
    if not first_page:
        return

    # Locate the consent button by its visible label.
    consent_xpath = "//button[contains(text(), 'Einverstanden')]"
    try:
        driver.find_element(By.XPATH, consent_xpath).click()
        time.sleep(1)  # Give the banner time to disappear
    except Exception as error:
        print("Attempt to find and click cookie banner failed:", str(error))
            

def scrape_site(start_url, max_depth=10):
    """Breadth-first crawl from ``start_url`` that collects visible page text.

    Every page is loaded through the module-level ``driver``. After a fixed
    two-second pause, ``accept_cookies`` is tried once, on the first page
    that loads. Then the stripped, non-empty ``.text`` of every element
    matched by the XPath ``//*`` is collected. On pages shallower than
    ``max_depth`` every ``<a>`` whose ``href`` is an absolute http(s) URL is
    queued one level deeper.

    Args:
        start_url: URL of the first page to load (depth 0).
        max_depth: Deepest link level that is still visited. Links found on
            pages at this depth are not followed.

    Returns:
        list[str]: Text snippets of all successfully scraped pages, in visit
        order. Because ``//*`` matches ancestors as well as descendants, the
        same text can show up in several entries.

    Pages that raise a ``WebDriverException`` (navigation failure, stale
    element, ...) are reported with ``print`` and skipped, so one bad page
    does not throw away everything collected so far.
    """
    # Local import: base class of the errors raised by Selenium driver calls.
    from selenium.common.exceptions import WebDriverException

    # URLs that are queued or already visited. The queue is FIFO, so the
    # first time a URL is queued is also its shallowest depth; later
    # duplicates can be dropped without changing what gets visited.
    queued_urls = {start_url}
    to_visit = deque([(start_url, 0)])  # Queue of (url, depth)
    all_text_content = []
    first_page = True  # True until one page has loaded and the banner was tried

    while to_visit:
        url, depth = to_visit.popleft()
        if depth > max_depth:
            continue

        print(f"Scraping {url}, Depth: {depth}")

        try:
            driver.get(url)
            time.sleep(2)  # Wait for JavaScript to load

            # Try the banner only after the wait: if it is injected by a
            # script it may not exist right after driver.get() returns.
            accept_cookies(first_page)
            first_page = False

            # Read each element's text once; every access is a driver call.
            text_content = []
            for element in driver.find_elements(By.XPATH, '//*'):
                text = element.text.strip()
                if text:
                    text_content.append(text)

            hrefs = []
            if depth < max_depth:
                for link in driver.find_elements(By.TAG_NAME, 'a'):
                    href = link.get_attribute('href')
                    # Require a real http(s) scheme; a substring test would
                    # also accept e.g. "mailto:" links containing "http".
                    if href and href.startswith(('http://', 'https://')):
                        hrefs.append(href)
        except WebDriverException as exc:
            print(f"Skipping {url}: {exc}")
            continue

        all_text_content.extend(text_content)
        print(f"Text found on {url}: {text_content[:5]}")  # First 5 entries only, for brevity

        for href in hrefs:
            if href not in queued_urls:
                queued_urls.add(href)
                to_visit.append((href, depth + 1))

    return all_text_content

# Start scraping from the main page
try:
    all_text_content = scrape_site('https://www.ams.at/')
finally:
    # Close the WebDriver even if the crawl raised, so the browser
    # process is not left running.
    driver.quit()

# Do something with the scraped data
print(all_text_content[:500])  # First 500 text entries (list items, not characters)


In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from collections import deque

# Initialize the WebDriver: start a Chrome session and keep it in the
# module-level `driver`. accept_cookies() and scrape_site() below read it as
# a global, and the cell closes it again with driver.quit() after the crawl.
driver = webdriver.Chrome()

def accept_cookies(first_page):
    """Try to dismiss the cookie banner when ``first_page`` is truthy.

    Uses the module-level ``driver`` to find a button whose text contains
    'Einverstanden', clicks it and pauses for one second. Any exception is
    caught and reported with ``print``; nothing is re-raised. Does nothing
    when ``first_page`` is falsy.
    """
    if not first_page:
        return

    # Locate the consent button by its visible label.
    consent_xpath = "//button[contains(text(), 'Einverstanden')]"
    try:
        driver.find_element(By.XPATH, consent_xpath).click()
        time.sleep(1)  # Give the banner time to disappear
    except Exception as error:
        print("Attempt to find and click cookie banner failed:", str(error))

def load_existing_data(filename):
    """Return the distinct lines of ``filename`` as a set.

    The file is read as UTF-8 and split with ``str.splitlines()``. A missing
    file is not an error: an empty set is returned instead.
    """
    try:
        handle = open(filename, 'r', encoding='utf-8')
    except FileNotFoundError:
        # Nothing has been stored yet.
        return set()

    with handle:
        contents = handle.read()
    return set(contents.splitlines())

def save_to_file(data, existing_data, filename='scraped_content.txt'):
    """Append the lines of ``data`` that are not stored yet to ``filename``.

    Entries of ``data`` may span several lines. They are split into single
    lines before de-duplication because the file is line-oriented and
    ``load_existing_data`` rebuilds the known set with ``splitlines()``.
    Comparing whole multi-line entries against that set would never match,
    so the same text would be appended again on every run.

    Args:
        data: Iterable of text snippets to store.
        existing_data: Set of lines already present in the file. It is
            updated in place with every line written by this call.
        filename: Target file, opened in append mode as UTF-8. It is created
            if missing, even when there is nothing new to write.

    Lines are written in first-seen order so the output is deterministic.
    Whitespace-only lines are skipped.
    """
    new_lines = []
    for entry in data:
        for line in entry.splitlines():
            if not line.strip() or line in existing_data:
                continue
            # Record immediately so repeats within this batch are dropped too.
            existing_data.add(line)
            new_lines.append(line)

    with open(filename, 'a', encoding='utf-8') as file:
        file.writelines(line + '\n' for line in new_lines)

def scrape_site(start_url, max_depth=10):
    """Breadth-first crawl from ``start_url`` that collects and stores page text.

    Every page is loaded through the module-level ``driver``. After a fixed
    two-second pause, ``accept_cookies`` is tried once, on the first page
    that loads. Then the stripped, non-empty ``.text`` of every element
    matched by the XPath ``//*`` is collected and handed to ``save_to_file``,
    which appends unseen content to 'scraped_content.txt'. On pages shallower
    than ``max_depth`` every ``<a>`` whose ``href`` is an absolute http(s)
    URL is queued one level deeper.

    Args:
        start_url: URL of the first page to load (depth 0).
        max_depth: Deepest link level that is still visited. Links found on
            pages at this depth are not followed.

    Returns:
        list[str]: Text snippets of all successfully scraped pages, in visit
        order. Because ``//*`` matches ancestors as well as descendants, the
        same text can show up in several entries.

    Pages that raise a ``WebDriverException`` (navigation failure, stale
    element, ...) are reported with ``print`` and skipped, so one bad page
    does not throw away everything collected so far.
    """
    # Local import: base class of the errors raised by Selenium driver calls.
    from selenium.common.exceptions import WebDriverException

    # URLs that are queued or already visited. The queue is FIFO, so the
    # first time a URL is queued is also its shallowest depth; later
    # duplicates can be dropped without changing what gets visited.
    queued_urls = {start_url}
    to_visit = deque([(start_url, 0)])  # Queue of (url, depth)
    all_text_content = []
    first_page = True  # True until one page has loaded and the banner was tried
    existing_data = load_existing_data('scraped_content.txt')

    while to_visit:
        url, depth = to_visit.popleft()
        if depth > max_depth:
            continue

        print(f"Scraping {url}, Depth: {depth}")

        try:
            driver.get(url)
            time.sleep(2)  # Wait for JavaScript to load

            # Try the banner only after the wait: if it is injected by a
            # script it may not exist right after driver.get() returns.
            accept_cookies(first_page)
            first_page = False

            # Read each element's text once; every access is a driver call.
            text_content = []
            for element in driver.find_elements(By.XPATH, '//*'):
                text = element.text.strip()
                if text:
                    text_content.append(text)

            hrefs = []
            if depth < max_depth:
                for link in driver.find_elements(By.TAG_NAME, 'a'):
                    href = link.get_attribute('href')
                    # Require a real http(s) scheme; a substring test would
                    # also accept e.g. "mailto:" links containing "http".
                    if href and href.startswith(('http://', 'https://')):
                        hrefs.append(href)
        except WebDriverException as exc:
            print(f"Skipping {url}: {exc}")
            continue

        all_text_content.extend(text_content)

        # Save the scraped content to file
        save_to_file(text_content, existing_data)

        print(f"Text found on {url}: {text_content[:5]}")  # First 5 entries only, for brevity

        for href in hrefs:
            if href not in queued_urls:
                queued_urls.add(href)
                to_visit.append((href, depth + 1))

    return all_text_content

# Start scraping from the main page
try:
    all_text_content = scrape_site('https://www.ams.at/')
finally:
    # Close the WebDriver even if the crawl raised, so the browser
    # process is not left running.
    driver.quit()

# Do something with the scraped data
print(all_text_content[:500])  # First 500 text entries (list items, not characters)


Scraping https://www.ams.at/, Depth: 0
Text found on https://www.ams.at/: ['Arbeitsuchende Österreich\nA\nA\nA\nSuche starten\nArbeitslos - Was tun?\nRichtig bewerben\nBerufe, Aus- und Weiterbildung\nKarenz und Wiedereinstieg\nArbeiten in Österreich und der EU\nHäufig gestellte Fragen (FAQ)\nAlle Jobs\nDie Stellensuche des Arbeitsmarktservice\nJob suchen\nin\n         Umkreis\n         \n5 km\n         10 km\n         20 km\n         50 km\n         75 km\n         100 km\n         Suchen\neJob-Room\nPersonal suchen\nHäufige Anliegen\nInformatsiya dlya ukrayinsʹkykh bizhentsiv\neJob-Room für Arbeitsuchende\nServiceLine (Anfragen per Telefon klären)\neAMS-Konto für Arbeitsuchende\neService Zone\nArbeitslos melden\nKrankheit, Umzug oder Ende der Arbeitslosigkeit melden\nArbeitslosengeld\nInformationen über Berufe\nLehrstellenbörse\neAMS-Konto für Arbeitsuchende - Login\nSuchen Sie die Adresse oder die Telefonnummer Ihrer AMS Geschäftsstelle?\nGeschäftsstellen finden\nSuchen Sie ein besti