In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from collections import deque

# Start one Chrome browser session. scrape_site() below drives it through the
# module-level name `driver`, and the session is closed with driver.quit()
# once the crawl has been run.
driver = webdriver.Chrome()

def scrape_site(start_url, max_depth=10):
    """Crawl breadth-first from ``start_url`` and collect the visible text.

    Pages are loaded with the module-level ``driver``. Every page is visited
    at most once; links are followed until ``max_depth`` is reached.

    Args:
        start_url: URL of the first page to load (depth 0).
        max_depth: Largest link distance from ``start_url`` that is still
            visited. Links found on a page at this depth are not queued.

    Returns:
        list[str]: The stripped, non-empty ``.text`` of every element matched
        by ``//*`` on every visited page, in visiting order.
    """
    visited_urls = set()
    to_visit = deque([(start_url, 0)])  # Queue of (url, depth)
    all_text_content = []

    while to_visit:
        url, depth = to_visit.popleft()
        if url in visited_urls or depth > max_depth:
            continue

        visited_urls.add(url)
        print(f"Scraping {url}, Depth: {depth}")

        driver.get(url)
        time.sleep(2)  # Wait for JavaScript to load

        # Read each element's text exactly once: every `.text` access is a
        # separate request to the browser, so reading it twice doubles the
        # cost and widens the window for the element to go stale in between.
        text_content = []
        for element in driver.find_elements(By.XPATH, '//*'):
            text = element.text.strip()
            if text:
                text_content.append(text)
        all_text_content.extend(text_content)

        print(f"Text found on {url}: {text_content[:5]}")  # Preview: first 5 text snippets

        if depth < max_depth:
            for link in driver.find_elements(By.TAG_NAME, 'a'):
                href = link.get_attribute('href')
                # Follow only real web links. A substring test for 'http'
                # would also accept e.g. "mailto:...?body=http://..." links.
                if not href or not href.startswith(('http://', 'https://')):
                    continue
                # Pages already scraped would be discarded on pop anyway;
                # skipping them here keeps the queue small.
                if href not in visited_urls:
                    to_visit.append((href, depth + 1))

    return all_text_content

# Start scraping from the main page. The browser is closed in `finally` so a
# failure halfway through the crawl does not leave a Chrome process running.
try:
    all_text_content = scrape_site('https://www.ams.at/arbeitsuchende/')
finally:
    # Close the WebDriver
    driver.quit()

# Do something with the scraped data
print(all_text_content[:500])  # Preview the first 500 text snippets (list items, not characters)


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from collections import deque

# Start one Chrome browser session. accept_cookies() and scrape_site() below
# both drive it through the module-level name `driver`, and the session is
# closed with driver.quit() once the crawl has been run.
driver = webdriver.Chrome()

def accept_cookies(first_page):
    """Click the cookie-consent button, but only when ``first_page`` is true.

    The button is looked up in the module-level ``driver`` by its visible
    text. Any error while finding or clicking it is printed and otherwise
    ignored, so the caller can carry on.

    Args:
        first_page: When falsy the function returns without touching the
            browser.
    """
    # Later pages: nothing to dismiss.
    if not first_page:
        return

    consent_xpath = "//button[contains(text(), 'Einverstanden')]"
    try:
        driver.find_element(By.XPATH, consent_xpath).click()
        time.sleep(1)  # Give the banner a moment to disappear
    except Exception as error:
        print("Attempt to find and click cookie banner failed:", str(error))
            

def scrape_site(start_url, max_depth=10):
    """Crawl breadth-first from ``start_url`` and collect the visible text.

    Pages are loaded with the module-level ``driver``. The cookie banner is
    dismissed via ``accept_cookies`` on the first page only. Every page is
    visited at most once; links are followed until ``max_depth`` is reached.

    Args:
        start_url: URL of the first page to load (depth 0).
        max_depth: Largest link distance from ``start_url`` that is still
            visited. Links found on a page at this depth are not queued.

    Returns:
        list[str]: The stripped, non-empty ``.text`` of every element matched
        by ``//*`` on every visited page, in visiting order.
    """
    visited_urls = set()
    to_visit = deque([(start_url, 0)])  # Queue of (url, depth)
    all_text_content = []
    first_page = True  # Flag to indicate the first page

    while to_visit:
        url, depth = to_visit.popleft()
        if url in visited_urls or depth > max_depth:
            continue

        visited_urls.add(url)
        print(f"Scraping {url}, Depth: {depth}")

        driver.get(url)
        # Let the page render before looking for the cookie banner; looking
        # it up straight after get() can run before the banner exists.
        time.sleep(2)  # Wait for JavaScript to load
        accept_cookies(first_page)  # Only acts on the first page
        first_page = False

        # Read each element's text exactly once: every `.text` access is a
        # separate request to the browser, so reading it twice doubles the
        # cost and widens the window for the element to go stale in between.
        text_content = []
        for element in driver.find_elements(By.XPATH, '//*'):
            text = element.text.strip()
            if text:
                text_content.append(text)
        all_text_content.extend(text_content)

        print(f"Text found on {url}: {text_content[:5]}")  # Preview: first 5 text snippets

        if depth < max_depth:
            for link in driver.find_elements(By.TAG_NAME, 'a'):
                href = link.get_attribute('href')
                # Follow only real web links. A substring test for 'http'
                # would also accept e.g. "mailto:...?body=http://..." links.
                if not href or not href.startswith(('http://', 'https://')):
                    continue
                # Pages already scraped would be discarded on pop anyway;
                # skipping them here keeps the queue small.
                if href not in visited_urls:
                    to_visit.append((href, depth + 1))

    return all_text_content

# Start scraping from the main page. The browser is closed in `finally` so a
# failure halfway through the crawl does not leave a Chrome process running.
try:
    all_text_content = scrape_site('https://www.ams.at/')
finally:
    # Close the WebDriver
    driver.quit()

# Do something with the scraped data
print(all_text_content[:500])  # Preview the first 500 text snippets (list items, not characters)


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
from collections import deque

# Start one Chrome browser session. accept_cookies() and scrape_site() below
# both drive it through the module-level name `driver`, and the session is
# closed with driver.quit() once the crawl has been run.
driver = webdriver.Chrome()

def accept_cookies(first_page, timeout=5):
    """Dismiss the cookie-consent banner on the first page of a crawl.

    The consent button is located by its visible text in the module-level
    ``driver``. Instead of looking it up once immediately, the function waits
    up to ``timeout`` seconds for the button to become clickable, because the
    banner may only appear some time after the page has been requested.

    This is best-effort: any failure (no banner, timeout, click problem) is
    printed and otherwise ignored so the crawl can continue.

    Args:
        first_page: When falsy the function returns without touching the
            browser.
        timeout: Maximum number of seconds to wait for the button.
    """
    if not first_page:
        return

    try:
        # Explicit wait: polls until the button exists and can be clicked.
        accept_button = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Einverstanden')]"))
        )
        accept_button.click()
        time.sleep(1)  # Give the banner a moment to disappear
    except Exception as e:
        # Deliberately broad: a missing banner must never abort the crawl.
        print("Attempt to find and click cookie banner failed:", str(e))

def load_existing_data(filename):
    """Return the distinct lines of a UTF-8 text file as a set.

    Args:
        filename: Path of the file to read.

    Returns:
        set[str]: Every line of the file without its line ending, or an
        empty set when the file does not exist.
    """
    try:
        source = open(filename, 'r', encoding='utf-8')
    except FileNotFoundError:
        # Nothing has been saved yet.
        return set()

    with source:
        contents = source.read()
    return set(contents.splitlines())

def save_to_file(data, existing_data, filename='scraped_content.txt'):
    """Append the lines of ``data`` that are not yet known to ``filename``.

    Entries in ``data`` may span several lines. They are split into single
    lines before de-duplication because ``load_existing_data`` reads the file
    back line by line; comparing whole multi-line entries against that set
    would never match, so the same text would be appended again on every run.
    Lines are written in the order they are first seen, which keeps the file
    content reproducible.

    Args:
        data: Iterable of text snippets to store.
        existing_data: Set of lines already stored. It is updated in place
            with every line this call writes.
        filename: File to append to (UTF-8). It is created if missing.
    """
    new_lines = []
    for entry in data:
        for raw_line in entry.splitlines():
            line = raw_line.strip()
            # Skip blank lines and anything already stored or already queued.
            if line and line not in existing_data:
                existing_data.add(line)
                new_lines.append(line)

    # Write the unique lines to the file
    with open(filename, 'a', encoding='utf-8') as file:
        file.writelines(line + '\n' for line in new_lines)

def scrape_site(start_url, max_depth=10):
    """Crawl breadth-first from ``start_url``, collecting and saving the text.

    Pages are loaded with the module-level ``driver``. The cookie banner is
    dismissed via ``accept_cookies`` on the first page only. The text found
    on each page is passed to ``save_to_file`` together with the set loaded
    from 'scraped_content.txt' at the start of the crawl. Every page is
    visited at most once; links are followed until ``max_depth`` is reached.

    Args:
        start_url: URL of the first page to load (depth 0).
        max_depth: Largest link distance from ``start_url`` that is still
            visited. Links found on a page at this depth are not queued.

    Returns:
        list[str]: The stripped, non-empty ``.text`` of every element matched
        by ``//*`` on every visited page, in visiting order.
    """
    visited_urls = set()
    to_visit = deque([(start_url, 0)])  # Queue of (url, depth)
    all_text_content = []
    first_page = True  # Flag to indicate the first page
    existing_data = load_existing_data('scraped_content.txt')

    while to_visit:
        url, depth = to_visit.popleft()
        if url in visited_urls or depth > max_depth:
            continue

        visited_urls.add(url)
        print(f"Scraping {url}, Depth: {depth}")

        driver.get(url)
        # Let the page render before looking for the cookie banner; looking
        # it up straight after get() can run before the banner exists.
        time.sleep(2)
        accept_cookies(first_page)
        first_page = False

        try:
            # NOTE(review): '//*' also matches the document's root element, so
            # this wait probably returns at once; a content-specific locator
            # would make it meaningful.
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*')))

            # Read each element's text exactly once: every `.text` access is
            # a separate request to the browser, so reading it twice doubles
            # the cost and widens the window for the element to go stale.
            text_content = []
            for element in driver.find_elements(By.XPATH, '//*'):
                text = element.text.strip()
                if text:
                    text_content.append(text)
            all_text_content.extend(text_content)

            save_to_file(text_content, existing_data)
            print(f"Text found on {url}: {text_content[:5]}")  # Preview: first 5 text snippets

            if depth < max_depth:
                for link in driver.find_elements(By.TAG_NAME, 'a'):
                    href = link.get_attribute('href')
                    # Follow only real web links. A substring test for 'http'
                    # would also accept e.g. "mailto:...?body=http://..." links.
                    if not href or not href.startswith(('http://', 'https://')):
                        continue
                    # Pages already scraped would be discarded on pop anyway;
                    # skipping them here keeps the queue small.
                    if href not in visited_urls:
                        to_visit.append((href, depth + 1))

        except TimeoutException:
            print(f"Timed out waiting for page elements to load at {url}")
        except NoSuchElementException:
            print(f"Element not found on {url}")

    return all_text_content

# Start scraping from the main page. The browser is closed in `finally` so a
# failure halfway through the crawl does not leave a Chrome process running.
try:
    all_text_content = scrape_site('https://www.ams.at/')
finally:
    # Close the WebDriver
    driver.quit()

# Do something with the scraped data
print(all_text_content[:500])  # Preview the first 500 text snippets (list items, not characters)


In [6]:
def remove_duplicates(input_filename, output_filename, encoding='utf-8'):
    """Copy a text file, keeping only the first occurrence of each line.

    Lines are compared after stripping surrounding whitespace and are written
    in the order in which they first appear in the input.

    Args:
        input_filename: Path of the file to read.
        output_filename: Path of the file to write; it is overwritten.
        encoding: Text encoding used for both files. Defaults to UTF-8 so
            non-ASCII text is read the same way on every platform instead of
            depending on the system's default encoding.
    """
    with open(input_filename, 'r', encoding=encoding) as file:
        # A dict keeps insertion order, so this de-duplicates without
        # shuffling the lines the way iterating over a set would.
        unique_lines = dict.fromkeys(line.strip() for line in file)

    with open(output_filename, 'w', encoding=encoding) as file:
        file.writelines(line + '\n' for line in unique_lines)

# De-duplicate the scraper's output file ('scraped_content.txt'). The result
# goes to a separate file, so the input file is not modified.
remove_duplicates('scraped_content.txt', 'scraped_content_uniquelines.txt')


'\n\nGlossar » Informationen zu zentralen Themen | AMS\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nZum Inhalt\n\n\nZur AMS-Navigation\n\n\n\n\n\n\nSind Sie damit einverstanden, dass wir anonymisiert Ihr Surfverhalten zur  Verbesserung aufzeichnen?\nAMS.at verwendet Cookies um Ihnen das bestmögliche Surferlebnis zu\xa0 ermöglichen. Um auch weiterhin unseren Webauftritt besser zu gestalten, analysieren wir anonymisiert das Surfverhalten unserer Nutzer und\xa0 Nutzerinnen. Sie können dies jederzeit in den Privatsphäre-Einstellungen anpassen.\n\nMehr darüber erfahren...\n\n\nEinverstanden\nNicht Einverstanden\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nZu welchen Inhalten möchten Sie?\n\n\n\nInhalte für:\n\n\nArbeitsuchende\n\n\n\nUnternehmen\n\n\n\nArbeitsmarktdaten und Medien\n\n\n\nOrganisation\n\n\n\n\n\nIn Bundesland:\n\n\n\nBurgenland\n\n\nKärnten\n\n\nNiederösterreich\n\n\nOberösterreich\n\n\nSalzburg\n\n\nSteiermark\n\n\nTirol\n\n\nVorarlberg\n\n\nWien\n\n\n