## Installs

In [2]:
!pip install requests bs4



## Setup

In [3]:
import logging
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

In [4]:
logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO)

In [5]:
class Crawler:

    def __init__(self, urls=[]):
        self.visited_urls = []
        self.urls_to_visit = urls

    def download_url(self, url):
        return requests.get(url).text

    def get_linked_urls(self, url, html):
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if path and path.startswith('/'):
                path = urljoin(url, path)
            yield path

    def add_url_to_visit(self, url):
        if url not in self.visited_urls and url not in self.urls_to_visit:
            self.urls_to_visit.append(url)

    def crawl(self, url):
        html = self.download_url(url)
        for url in self.get_linked_urls(url, html):
            self.add_url_to_visit(url)

    def run(self):
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            logging.info(f'Crawling: {url}')
            try:
                self.crawl(url)
            except Exception:
                logging.exception(f'Failed to crawl: {url}')
            finally:
                self.visited_urls.append(url)



In [8]:
if __name__ == '__main__':
    Crawler(urls=['https://www.cadencebank.com/']).run()

2023-05-24 14:59:35,521 INFO:Crawling: https://www.cadencebank.com/
2023-05-24 14:59:36,130 INFO:Crawling: #main
2023-05-24 14:59:36,131 ERROR:Failed to crawl: #main
Traceback (most recent call last):
  File "/var/folders/q8/1gf829m141157nbz7kc4_19h00tb0q/T/ipykernel_25569/525982781.py", line 32, in run
    self.crawl(url)
  File "/var/folders/q8/1gf829m141157nbz7kc4_19h00tb0q/T/ipykernel_25569/525982781.py", line 23, in crawl
    html = self.download_url(url)
  File "/var/folders/q8/1gf829m141157nbz7kc4_19h00tb0q/T/ipykernel_25569/525982781.py", line 8, in download_url
    return requests.get(url).text
  File "/Users/mikegoodman/Documents/developer/venv/lib/python3.9/site-packages/requests/api.py", line 73, in get
    return request("get", url, params=params, **kwargs)
  File "/Users/mikegoodman/Documents/developer/venv/lib/python3.9/site-packages/requests/api.py", line 59, in request
    return session.request(method=method, url=url, **kwargs)
  File "/Users/mikegoodman/Documents/dev

KeyboardInterrupt: 