In [None]:
import os

import requests
from bs4 import BeautifulSoup, Comment, NavigableString

In [17]:

def sanitize_filename(filename):
    """Return *filename* with every unsafe character swapped for an underscore.

    The unsafe set is: ``< > : " / \\ | ? *``, carriage return, newline and
    the space character. All other characters are passed through untouched.
    """
    unsafe = '<>:"/\\|?*\r\n '
    # One translation table handles the whole set in a single pass.
    to_underscore = str.maketrans(dict.fromkeys(unsafe, '_'))
    return filename.translate(to_underscore)

def save_content_to_file(directory, title, content):
    """Write *content* to a UTF-8 ``.txt`` file named after *title*.

    The file name is *title* passed through ``sanitize_filename`` with every
    ``.`` removed, followed by ``.txt``. An existing file with the same name
    is overwritten.

    Args:
        directory: Target directory; created (with parents) if missing.
        title: Human-readable title used to derive the file name.
        content: Text to write.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory in between.
    os.makedirs(directory, exist_ok=True)

    # sanitize_filename already turns spaces and colons into '_', so only
    # the dots still need to be stripped here.
    stem = sanitize_filename(title).replace('.', '')
    if not stem:
        # An empty or dots-only title would otherwise yield a nameless,
        # hidden '.txt' file.
        stem = 'untitled'

    filename = os.path.join(directory, stem + '.txt')
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)

def _collect_section_text(heading, stop_tags):
    """Return the heading's text followed by the text up to the next stop tag.

    Walks the nodes after *heading* in document order and keeps only text
    nodes, so each piece of text is taken exactly once regardless of how
    deeply it is nested. Collection ends at the first tag whose name is in
    *stop_tags*. Fragments are stripped and joined with newlines.
    """
    lines = [heading.get_text()]
    for node in heading.next_elements:
        # Text nodes carry no tag name, so only a real tag can end the section.
        if getattr(node, 'name', None) in stop_tags:
            break
        if not isinstance(node, NavigableString) or isinstance(node, Comment):
            continue
        # next_elements starts inside the heading itself; its own text is
        # already the first line.
        if any(parent is heading for parent in node.parents):
            continue
        # Script and stylesheet bodies are not document text.
        if node.parent.name in ('script', 'style'):
            continue
        text = node.strip()
        if text:
            lines.append(text)
    return '\n'.join(lines)


def split_and_save_by_tags(url):
    """Download *url* and save one text file per h1/h2/h3 heading.

    Each ``h1`` is saved under ``./parts``, each ``h2`` under ``./chapters``
    and each ``h3`` under ``./sections``. A file holds the heading text
    followed by the page text up to the next heading of the same or a higher
    level. The heading text is also used as the file title.

    Args:
        url: Address of the HTML page to split.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    # Without a timeout a stalled server would block the call forever.
    response = requests.get(url, timeout=30)
    # Fail loudly rather than splitting an HTTP error page into files.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # (heading tag, output directory, tags that terminate the section)
    levels = (
        ('h1', './parts', frozenset({'h1'})),
        ('h2', './chapters', frozenset({'h1', 'h2'})),
        ('h3', './sections', frozenset({'h1', 'h2', 'h3'})),
    )
    for tag_name, directory, stop_tags in levels:
        for heading in soup.find_all(tag_name):
            title = heading.get_text()
            content = _collect_section_text(heading, stop_tags)
            save_content_to_file(directory, title, content)

In [18]:
# Page to split; per its path, the Planning and Building Act on regjeringen.no.
url = 'https://www.regjeringen.no/en/dokumenter/planning-building-act/id570450/'

# Fetch the page and write the per-heading text files
# (./parts, ./chapters, ./sections).
split_and_save_by_tags(url=url)
