In [None]:
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def get_driver():
    """Create a Selenium WebDriver."""
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    return webdriver.Chrome(options=options)

def accept_cookies(driver):
    """Clicks the 'Einverstanden' button on the cookie banner if present."""
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//button[contains(text(), 'Einverstanden')]"))
        )
        cookie_button = driver.find_element(By.XPATH, "//button[contains(text(), 'Einverstanden')]")
        cookie_button.click()
        print("Cookie banner accepted.")
    except Exception as e:
        print(f"Cookie banner not found or error occurred: {e}")

def get_links(driver, url, visited):
    """Fetches all unique links on a given webpage using Selenium"""
    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        links = set()
        for link in soup.find_all('a', href=True):
            full_link = urljoin(url, link['href'])
            if urlparse(full_link).netloc == urlparse(url).netloc and full_link not in visited:
                links.add(full_link)
        return links
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return set()

def save_content(driver, url, folder):
    """Saves the content of a URL to a file in the specified folder, skips PDF and JPEG"""
    try:
        driver.get(url)
        content_type = driver.execute_script("return document.contentType")
        if 'text/html' in content_type:
            filename = os.path.join(folder, 'index.html')
            with open(filename, 'w', encoding='utf-8') as file:
                file.write(driver.page_source)
            print(f"Content saved from {url}")
        else:
            print(f"Skipped non-text content (PDF or JPEG) from {url}")
    except Exception as e:
        print(f"Error saving content from {url}: {e}")

def crawl(driver, url, base_dir, visited):
    if url in visited:
        return
    visited.add(url)

    links = get_links(driver, url, visited)
    for link in links:
        parsed_link = urlparse(link)
        folder_path = os.path.join(base_dir, parsed_link.netloc + parsed_link.path.strip('/'))
        os.makedirs(folder_path, exist_ok=True)
        save_content(driver, link, folder_path)
        crawl(driver, link, base_dir, visited)

def main(url, base_dir):
    driver = get_driver()
    visited = set()
    # Accept cookies on the first page
    accept_cookies(driver)
    crawl(driver, url, base_dir, visited)
    driver.quit()

if __name__ == "__main__":
    base_url = 'https://www.ams.at/arbeitsuchende/'  # Change to your target URL
    base_directory = 'ams_content'  # Change to your preferred directory
    main(base_url, base_directory)


In [37]:
from langchain.document_loaders import BSHTMLLoader, DirectoryLoader

loader = DirectoryLoader('./ams_content', loader_cls=BSHTMLLoader)

docs = loader.load()
len(docs)

350

In [38]:
docs[10].page_content

import tiktoken

tokenizer_name = tiktoken.encoding_for_model('gpt-4')
tokenizer_name.name

'cl100k_base'

In [39]:
tokenizer = tiktoken.get_encoding(tokenizer_name.name)

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

In [40]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["/n/n", "/n", " ", ""]
)

import re

def preprocess_text(text):
    # Replace multiple newline characters with a single newline
    text = re.sub(r'/n{1,}', ' ', text)
    text = re.sub(r'/t{1,}', ' ', text)
    text = re.sub(r'/xa0{1,}', ' ', text)
    #remove space that is more thgan one
    text = re.sub(r' {1,}', ' ', text)
    return text

In [41]:
from typing_extensions import Concatenate
from uuid import uuid4
from tqdm.auto import tqdm

chunks = []

for idx, page in enumerate(tqdm(docs)):
    content = preprocess_text(page.page_content)
    if len(content) > 100:
        url = page.metadata['source'].replace('rtdocs/', 'https://')
        texts = text_splitter.split_text(content)
        chunks.extend([{
            'id': str(uuid4()),
            'text': texts[i],
            'chunk': i,
            'url': url
        } for i in range(len(texts))])

  3%|▎         | 10/350 [00:00<00:03, 94.69it/s]

100%|██████████| 350/350 [00:04<00:00, 79.02it/s] 


In [42]:
print(f"Created {len(chunks)} chunks")
chunks[1]

Created 3806 chunks


{'id': 'e3ef59be-7153-4575-a55b-6cd39757b1a4',
 'text': 'per Telefon klären) eAMS-Konto für Arbeitsuchende eService Zone Arbeitslos melden Krankheit, Umzug oder Ende der Arbeitslosigkeit melden Arbeitslosengeld Informationen über Berufe Lehrstellenbörse eAMS-Konto für Arbeitsuchende - Login Suchen Sie die Adresse oder die Telefonnummer Ihrer AMS Geschäftsstelle? Geschäftsstellen finden Suchen Sie ein bestimmtes Formular? Dokument finden Neuigkeiten vom Arbeitsmarkt ARBEITSMARKT BURGENLAND, DEZEMBER 2023 Ab 1.1.2024: AMS Stegersbach/Jennersdorf – Zusammenführung zu einer regionalen Geschäftsstelle in Stegersbach ARBEITSMARKT BURGENLAND, AUGUST 2023 Leichter Anstieg der Arbeitslosigkeit, mehr Lehrstellensuchende als offene Lehrstellen ARBEITSMARKT BURGENLAND, JUNI 2023 Weitere Neuigkeiten Neuigkeiten vom Arbeitsmarkt Der Kärntner Arbeitsmarkt im Dezember 2023 Der Kärntner Arbeitsmarkt im November 2023 Der Kärntner Arbeitsmarkt im Oktober 2023 Der Kärntner Arbeitsmarkt im September 2023 A

In [45]:
#create on text file with all chunks
with open('ams_chunks.txt', 'w', encoding='utf-8') as file:
    for chunk in chunks:
        file.write(chunk['text'] + '\n')

In [46]:
def remove_duplicates(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Counting occurrences of each line
    line_count = {}
    for line in lines:
        if line in line_count:
            line_count[line] += 1
        else:
            line_count[line] = 1

    # Writing back only unique lines (those that occur once)
    with open(file_path, 'w') as file:
        for line, count in line_count.items():
            if count == 1:
                file.write(line)

# Example usage
file_path = 'ams_chunks.txt'
remove_duplicates(file_path)


ModuleNotFoundError: No module named 'langchain.document'