In [9]:
import requests
import time
import random
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

In [2]:
def crawl_web(url):
    valid_links = set()
    valid_links.append(url)

    for page in valid_links:
        response = requests.get(page)
        if response.status_code() == 200:
            soup = BeautifulSoup(response, 'html.parser')
            link_tags = soup.find_all('a')
    
            for link in link_tags:
                if link['href'].startswith('/'):
                    complete_url = url + link['href']
                    valid_links.append(complete_url) 
        else:
            print(f'An error occured {response.status_code()}')
            
    return valid_links

In [None]:
def crawl_web(url):
    # Extract the domain of the starting URL
    base_domain = urlparse(url).netloc

    valid_links = set()  # Set to store unique links
    links_to_crawl = set([url])  # Initialize with the starting URL
    crawled_links = set()  # Set to store already crawled links

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
    }
    
    while links_to_crawl:
        page = links_to_crawl.pop()  # Get a link to crawl

        # Skip if the page has already been crawled
        if page in crawled_links:
            continue

        try:
            response = requests.get(page, timeout=10, headers=headers)
            if response.status_code == 200:
                print(f"Visiting: {page}")
                crawled_links.add(page)  # Mark the page as crawled
                soup = BeautifulSoup(response.text, 'html.parser')
                link_tags = soup.find_all('a', href=True)

                for link in link_tags:
                    # Resolve relative URLs
                    complete_url = urljoin(page, link['href'])
                    # Ensure the link is from the same domain
                    if urlparse(complete_url).netloc == base_domain:
                        # Add new links to the sets
                        if complete_url not in valid_links and complete_url not in crawled_links:
                            valid_links.add(complete_url)
                            links_to_crawl.add(complete_url)
            else:
                print(f"Failed to fetch {page}: {response.status_code}")
        except requests.RequestException as e:
            print(f"Error fetching {page}: {e}")

        # Add a random delay between requests
        sleep_time = random.uniform(1, 3)  # Random delay between 1 and 3 seconds
        print(f"Sleeping for {sleep_time:.2f} seconds...")
        time.sleep(sleep_time)
    
    return valid_links

# Usage example
start_url = "https://www.boldbusiness.com/"
all_links = crawl_web(start_url)

# Print all collected links
print("\nCrawled Links:")
for link in all_links:
    print(link)


Visiting: https://www.boldbusiness.com/
Sleeping for 1.41 seconds...
Visiting: https://www.boldbusiness.com/resources/webinars-videos/
Sleeping for 1.84 seconds...
Visiting: https://www.boldbusiness.com/webinars/workforce-strategy-webinar-slides/
Sleeping for 2.52 seconds...
Visiting: https://www.boldbusiness.com/#elementor-action%3Aaction%3Dpopup%3Aopen%26settings%3DeyJpZCI6Ijg2NzM5IiwidG9nZ2xlIjpmYWxzZX0%3D
Sleeping for 1.50 seconds...
Visiting: https://www.boldbusiness.com/careers/
Sleeping for 1.13 seconds...
Visiting: https://www.boldbusiness.com/privacy-policy/
Sleeping for 2.34 seconds...
Visiting: https://www.boldbusiness.com/#get_started
Sleeping for 1.14 seconds...
Visiting: https://www.boldbusiness.com/videos/store-manage-and-disseminate-data-centers/
Sleeping for 1.45 seconds...
Visiting: https://www.boldbusiness.com/wp-content/uploads/2020/04/img2.jpg
Sleeping for 2.94 seconds...
Visiting: https://www.boldbusiness.com/fun/
Sleeping for 2.68 seconds...
Visiting: https://www