In [1]:
pip install requests beautifulsoup4



In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
# I want to scrape all of the pages on one website.
# I need a crawler (not just a parser) that can "jump" from web page to web page on a website.

In [6]:
import time
from collections import deque
from urllib.parse import urldefrag, urljoin, urlparse
from urllib.robotparser import RobotFileParser

def can_fetch(robots_parser, url, user_agent='*'):
  """Report whether the loaded robots.txt rules permit fetching ``url``.

  Args:
    robots_parser: An object exposing ``can_fetch(useragent, url)``, such as a
      ``urllib.robotparser.RobotFileParser`` whose rules are already loaded.
    url: The URL to check.
    user_agent: Crawler name to look up in the rules. Defaults to ``'*'``,
      which is what this helper always used before the parameter existed.

  Returns:
    Whatever ``robots_parser.can_fetch(user_agent, url)`` returns.
  """
  return robots_parser.can_fetch(user_agent, url)
# Looking for websites on Google that have the phrases "digital twin networks" and "personalized simulations of the human body."
# Tried using interactive agents to find this info on the web, but didn't have success.
# Thinking about crawling https://www.google.com/search?q or someone's blog for answers to science questions.
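# We should probably use a Depth-First Search crawling strategy (follow a single link from the homepage and explore all subsequent links).
# Note: crawl_website below takes URLs from the front of its queue and appends new links to the back, so it actually crawls breadth-first.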

In [None]:
def crawl_website(start_url, max_pages=None, delay=1, timeout=5):
  """Crawl one website breadth-first, following its same-domain links.

  Starting from ``start_url``, each page is fetched, its ``<a href>`` links are
  extracted, and links on the same host are queued for a later visit. Progress
  and fetch errors are printed; nothing is returned or saved.

  Args:
    start_url: Absolute URL where the crawl begins. Its host (netloc) defines
      which links count as belonging to the same website.
    max_pages: Optional cap on the number of pages requested. ``None`` (the
      default) crawls until no unseen same-domain links remain.
    delay: Seconds to pause between consecutive requests.
    timeout: Seconds to wait for each HTTP response.

  Raises:
    Whatever ``RobotFileParser.read()`` raises when robots.txt cannot be
    downloaded; that call is not guarded here.
  """
  # A fragment never changes which document is served, so drop it everywhere.
  start_url = urldefrag(start_url).url
  base_domain = urlparse(start_url).netloc

  # robots.txt lives at the root of the host. The leading slash makes urljoin
  # discard any path in start_url instead of resolving next to it.
  robots_parser = RobotFileParser()
  robots_parser.set_url(urljoin(start_url, '/robots.txt'))
  robots_parser.read()

  # Identify the crawler through the standard User-Agent header; a custom
  # header name would leave the library's default User-Agent in place.
  headers = {
      'User-Agent': 'WebCrawler/1.0 (Simple web crawler that identifies and collects foundational code for a prototype of a multimodal neural network. mailto: chris@nexusbios.co.)'
  }

  # FIFO queue of pages still to fetch (first in, first out => breadth-first).
  urls_to_visit = deque([start_url])
  # Every URL ever queued, including ones later rejected by robots.txt, so
  # that no URL is queued (or reported) more than once.
  seen_urls = {start_url}
  pages_requested = 0

  while urls_to_visit and (max_pages is None or pages_requested < max_pages):
    url = urls_to_visit.popleft()
    if not can_fetch(robots_parser, url):
      continue

    # Pause before every request except the first, so that failed requests
    # are rate-limited just like successful ones.
    if pages_requested:
      time.sleep(delay)
    pages_requested += 1
    print(f"Visiting: {url}")

    try:
      response = requests.get(url, headers=headers, timeout=timeout)
      # Turn 4xx/5xx status codes into exceptions handled below.
      response.raise_for_status()
    except requests.exceptions.RequestException as e:
      print(f"Error fetching {url}: {e}")
      continue

    # Only HTML can contain <a> links; skip images, PDFs, feeds and the like.
    # A missing Content-Type header is given the benefit of the doubt.
    content_type = response.headers.get('Content-Type', '')
    if content_type and 'html' not in content_type.lower():
      continue

    soup = BeautifulSoup(response.content, 'html.parser')

    for link in soup.find_all('a', href=True):
      try:
        # Resolve against the final URL so relative links stay correct when
        # the request was redirected.
        full_url = urldefrag(urljoin(response.url, link['href'])).url
        parsed_link_url = urlparse(full_url)
      except ValueError:
        # Malformed href (for example an unbalanced IPv6 bracket); ignore it.
        continue
      if (parsed_link_url.scheme in ('http', 'https')
          and parsed_link_url.netloc == base_domain
          and full_url not in seen_urls):
        seen_urls.add(full_url)
        urls_to_visit.append(full_url)


# Run the crawler against the blog's homepage.
# NOTE(review): this call passes no page limit, so it keeps going until the
# queue of same-domain links is exhausted, which can take a long time.
start_url = "https://blog.codinghorror.com/"
crawl_website(start_url)
# TODO (not implemented in crawl_website yet): at each link, search the page for a given line of code; if it is found, download the source file that contains it, then move on to the next link.
# Open question: how do I search every GitHub repository for the code I want and download the matching program files?
# Ideal workflow: enter a code snippet in a search box and get back the program files that contain it.