# In [1]: (notebook cell input)
import os
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

def extract_domain_name(url: str) -> str:
    """
    Extract the leading domain label from a given URL.

    The host is read from the URL with any ``user:password@`` prefix and
    ``:port`` suffix removed. A leading ``www.`` is dropped when something
    more specific follows it, and the first dot-separated label of what is
    left is returned, e.g. ``"https://www.example.com:8080/a"`` -> ``"example"``.

    Args:
        url: URL to inspect. It must carry a scheme (or start with ``//``);
            otherwise ``urlparse`` does not recognise a host.

    Returns:
        The first label of the host, lower-cased, or ``""`` when the URL
        has no host.
    """
    # ``hostname`` (unlike ``netloc``) excludes credentials and the port,
    # and is ``None`` when the URL has no network location at all.
    host = urlparse(url).hostname or ""

    # "www" identifies no particular site, so skip it -- but only when at
    # least two labels remain, so a host such as "www.com" is left intact.
    if host.startswith("www.") and host.count(".") >= 2:
        host = host[len("www."):]

    return host.split(".")[0]

def _is_internal_link(href: str, domain: str) -> bool:
    """Return True if *href* is an http(s) URL whose host belongs to *domain*."""
    if not domain:
        return False
    parts = urlparse(href)
    # Skip mailto:, javascript:, tel: and similar non-fetchable schemes.
    if parts.scheme not in ("http", "https"):
        return False
    host = parts.hostname or ""
    domain = domain.lower()
    # *domain* may be a bare label ("example") or a dotted name
    # ("example.com"); compare against the host only, so an external URL
    # that merely mentions the domain in its path or query is rejected.
    return (
        host == domain
        or host.endswith("." + domain)
        or domain in host.split(".")
    )


def scrape_website(url: str, domain: str, max_pages=None, timeout: float = 10.0) -> list:
    """
    Crawl a site starting at *url* and collect the text of its paragraphs.

    Pages are visited depth-first in document order, each URL at most once,
    so pages that link to each other cannot cause an endless crawl. Only
    http(s) links whose host belongs to *domain* are followed. A page that
    cannot be fetched, answers with an HTTP error status, or declares a
    non-HTML content type is skipped instead of aborting the crawl.

    Args:
        url: Page to start from.
        domain: Domain label (e.g. ``"example"``) or dotted name
            (e.g. ``"example.com"``) that followed links must belong to.
        max_pages: Optional upper bound on the number of pages visited;
            ``None`` (the default) means no limit.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        A list with the stripped, non-empty text of every ``<p>`` element
        found, in the order the pages were visited.
    """
    import logging  # local import keeps this block self-contained

    log = logging.getLogger(__name__)
    sentences = []
    visited = set()
    # Explicit LIFO stack instead of recursion: same depth-first order,
    # but the depth of the site cannot exhaust the interpreter's call stack.
    pending = [url]

    while pending:
        if max_pages is not None and len(visited) >= max_pages:
            break

        page_url = pending.pop()
        if page_url in visited:
            continue
        visited.add(page_url)

        try:
            response = requests.get(page_url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            log.warning("Skipping %s: %s", page_url, exc)
            continue

        # Linked PDFs, images, etc. carry no paragraphs worth parsing.
        content_type = response.headers.get("Content-Type", "")
        if content_type and "html" not in content_type.lower():
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract text content from paragraphs
        for paragraph in soup.find_all('p'):
            text = paragraph.text.strip()
            if text:
                sentences.append(text)

        # Collect links to follow
        links = []
        for a_tag in soup.find_all('a'):
            href = a_tag.attrs.get('href')
            if not href:
                continue
            try:
                # Make the link absolute and drop the fragment, so that
                # "page#a" and "page#b" count as the same page.
                absolute = urlparse(urljoin(page_url, href))._replace(fragment="").geturl()
                internal = _is_internal_link(absolute, domain)
            except ValueError:
                # Malformed URL (e.g. an unbalanced IPv6 bracket).
                continue
            if internal and absolute not in visited:
                links.append(absolute)

        # Push in reverse so the first link on the page is popped first.
        pending.extend(reversed(links))

    return sentences

def save_sentences_to_file(sentences: list, filename: str):
    """
    Write each sentence on its own line to ``scraped_websites/<filename>``.

    The ``scraped_websites`` directory is created in the current working
    directory when it does not exist yet. An existing file with the same
    name is overwritten.

    Args:
        sentences: Items to write; each is formatted with ``str.format``
            semantics and followed by a newline.
        filename: Name of the file inside ``scraped_websites``.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('scraped_websites', exist_ok=True)

    # Scraped text may contain any Unicode character, which the platform's
    # default encoding cannot always represent; write UTF-8 explicitly.
    target = os.path.join('scraped_websites', filename)
    with open(target, 'w', encoding='utf-8') as file:
        file.writelines(f"{sentence}\n" for sentence in sentences)

# Example run: crawl one site and save its paragraph text to disk.
# The guard keeps the crawl from starting when this file is merely imported;
# run as a script or notebook cell, __name__ is "__main__" and the
# statements execute (and define the same module-level names) as before.
if __name__ == "__main__":
    url = "https://websitenearme.online"
    domain_name = extract_domain_name(url)
    sentences = scrape_website(url, domain_name)
    save_sentences_to_file(sentences, f"scraped_{domain_name}.txt")


# Output: KeyboardInterrupt (the run was stopped manually before it finished)