A good reference for the available BeautifulSoup options:
https://www.pluralsight.com/guides/web-scraping-with-beautiful-soup

In [None]:
import re

import requests
from bs4 import BeautifulSoup

class Crawler:
    '''Crawl one website through a single HTTP session that survives login.'''

    def __init__(self, base_url, login_url=None, login_form_data=None):
        '''Open the session and, when *login_url* is given, log in.

        Args:
            base_url: First URL queued for crawling.
            login_url: Optional URL that *login_form_data* is POSTed to.
            login_form_data: Form payload sent with the login POST.
        '''
        self.base_url = base_url
        self.session = requests.Session()
        if login_url is not None:
            self.session.post(login_url, login_form_data)

        # Every URL discovered so far, and the subset already fetched by crawl().
        self.unique_urls = {base_url}
        self.visited_urls = set()

    def crawl(self, url, tag_names, url_attr, url_pattern, soup_observer, debug=False):
        '''Fetch every queued URL once and pass each parsed page to *soup_observer*.

        The crawl starts from *url* plus whatever is already queued (initially
        ``base_url``). Each page is downloaded a single time; the links found on
        it that match *url_pattern* are queued, so pages reachable only through
        other pages are visited as well.

        Args:
            url: Start page; it is queued and visited like any other page.
            tag_names: Tag name(s) to search for links, e.g. ``'a'``.
            url_attr: Attribute holding the link, e.g. ``'href'``.
            url_pattern: Regular expression a link must match to be queued.
            soup_observer: Callable invoked with the soup of every visited page.
            debug: When true, print progress (and the parsed HTML).
        '''
        self.unique_urls.add(url)

        pending = self.unique_urls - self.visited_urls
        while pending:
            page_url = pending.pop()
            # Mark before fetching so a page that links to itself is not re-queued.
            self.visited_urls.add(page_url)

            soup = self.parseUrl(page_url, debug)
            # Collect links from the page just fetched rather than from the
            # start URL again, reusing the soup to avoid a second download.
            self.loadUrlsFromBS(soup, tag_names, url_attr, url_pattern, debug)
            soup_observer(soup)

            # Recomputed each round: both the link scan and the observer may
            # have queued new URLs.
            pending = self.unique_urls - self.visited_urls

    def loadUrls(self, url, tag_names, url_attr, url_pattern, debug=False):
        '''Download *url*, queue its matching links and return the parsed soup.'''
        soup = self.parseUrl(url)
        return self.loadUrlsFromBS(soup, tag_names, url_attr, url_pattern, debug)

    def loadUrlsFromBS(self, soup, tag_names, url_attr, url_pattern, debug=False):
        '''Queue every *url_attr* value in *soup* that matches *url_pattern*.

        Returns:
            The *soup* that was passed in, so calls can be chained.
        '''
        pattern = re.compile(url_pattern)
        for link in soup.find_all(tag_names, attrs={url_attr: pattern}):
            found = link.get(url_attr)
            if found is None:
                # Tag without the attribute: nothing to queue.
                continue

            if debug:
                print('Adding:', found)
            self.unique_urls.add(found)
        return soup

    def parseUrl(self, url, debug=False):
        '''Return a BeautifulSoup built from the content downloaded at *url*.'''
        if debug:
            print('Downloading from:', url)

        html = self.session.get(url).content
        return self.parseContent(html, debug)

    def parseContent(self, html, debug=False):
        '''Return a BeautifulSoup built from an HTML content.'''
        soup = BeautifulSoup(html, "html.parser")
        if debug:
            print(soup.prettify())
        return soup

In [None]:
import requests
from urllib.parse import urlparse

from pathlib import Path
import os
import errno

class Downloader:
    '''Store pages and the files they link to under one local root folder.'''

    # Name used when neither the response headers nor the URL give a file name.
    DEFAULT_FILENAME = 'download'

    # Class-level default so subclasses that skip __init__ still download
    # through the plain ``requests`` module.
    session = None

    def __init__(self, root_path, debug=False, session=None):
        '''Create the root folder.

        Args:
            root_path: Folder under which everything is written.
            debug: Unused; kept for backward compatibility.
            session: Optional object with a ``requests``-compatible ``get``
                (e.g. a logged-in ``requests.Session``) used for downloads.
                When omitted, ``requests.get`` is used.
        '''
        self.session = session
        self.root_folder = self.createFolder(root_path)

    def createFolder(self, *paths):
        '''Create the folder made of *paths* (with parents) and return its path.

        The returned path always ends with a path separator.
        '''
        folder_path = os.path.join(*paths, '')
        Path(folder_path).mkdir(parents=True, exist_ok=True)
        return folder_path

    def createSubFolder(self, *paths):
        '''Create a folder below the root folder and return its path.'''
        # mkdir(parents=True, exist_ok=True) is already idempotent, so no
        # separate existence check or EEXIST guard is needed.
        return self.createFolder(self.root_folder, *paths)

    def downloadAll(self, target_sub_folder, soup, tag_names, url_attr, url_pattern, debug=False):
        '''Save the page as ``index.html`` and download its matching links.

        Returns:
            The prettified HTML that was written.
        '''
        content = self.writePage(target_sub_folder, soup, debug)
        self.downloadAllTags(target_sub_folder, soup, tag_names, url_attr, url_pattern, debug)
        return content

    def writePage(self, target_sub_folder, soup, debug=False):
        '''Write the prettified *soup* to ``index.html`` and return that text.'''
        content = soup.prettify()
        self.writeFile(target_sub_folder, "index.html", content, debug)
        return content

    def downloadAllTags(self, target_sub_folder, soup, tag_names, url_attr, url_pattern, debug=False):
        '''Download every *url_attr* link in *soup* matching *url_pattern*.'''
        tags = soup.find_all(tag_names, attrs={url_attr: re.compile(url_pattern)})
        self.downloadTags(target_sub_folder, tags, url_attr, debug)

    def downloadTags(self, target_sub_folder, tags, url_attr, debug=False):
        '''Download the *url_attr* target of each tag into *target_sub_folder*.'''
        for link in tags:
            url = link.get(url_attr)
            if url:
                # Tags lacking the attribute (or with an empty one) are skipped.
                self.downloadFile(target_sub_folder, url, debug)

    def downloadFile(self, target_sub_folder, url, debug=False):
        '''Download *url* into *target_sub_folder*.

        Returns:
            The path of the written file, or ``None`` when the server did not
            answer with status 200.
        '''
        http = self.session if self.session is not None else requests
        # The context manager releases the streamed connection on every path,
        # including the non-200 one.
        with http.get(url, allow_redirects=True, stream=True) as response:
            if response.status_code != 200:
                # TODO : HANDLE EXCEPTION LOGS
                if debug:
                    print('Skipping:', url, 'status', response.status_code)
                return None

            filename = self.getFilenameFromHeader(response.headers, url)
            return self.writeStream(target_sub_folder, filename, response, debug)

    def writeStream(self, target_sub_folder, filename, response, debug=False):
        '''Write a response stream to disk in 1024-byte chunks; return the path.'''
        target_folder = self.createSubFolder(target_sub_folder)
        filepath = os.path.join(target_folder, filename)

        with open(filepath, 'wb') as f:
            if debug:
                print("Writing file to :", f.name)

            for chunk in response.iter_content(1024):
                if chunk:
                    f.write(chunk)
        return filepath

    def writeFile(self, target_sub_folder, filename, content, debug=False):
        '''Write the text *content* as UTF-8 and return the file path.'''
        target_folder = self.createSubFolder(target_sub_folder)
        filepath = os.path.join(target_folder, filename)

        # Explicit encoding: the platform default may be unable to encode
        # characters that appear in the page.
        with open(filepath, 'w', encoding='utf-8') as f:
            if debug:
                print("Writing file to :", f.name)
            f.write(content)

        return filepath

    @staticmethod
    def _sanitizeFilename(raw):
        '''Reduce *raw* to a bare file name, or ``''`` when nothing usable remains.'''
        # Normalise Windows separators so directory parts are dropped everywhere.
        name = os.path.basename(raw.strip().replace('\\', '/'))
        return '' if name in ('', '.', '..') else name

    def getFilenameFromHeader(self, headers, url):
        '''Pick a local file name for a download.

        The ``filename`` parameter of the ``content-disposition`` header is
        preferred, then the last path segment of *url*, then
        ``DEFAULT_FILENAME``. Surrounding quotes, trailing header parameters
        and directory components are removed, so the server cannot steer the
        write outside the target folder.
        '''
        disposition = headers.get('content-disposition')
        if disposition:
            match = re.search(r'filename\s*=\s*"?([^";]+)', disposition, re.IGNORECASE)
            if match:
                from_header = self._sanitizeFilename(match.group(1))
                if from_header:
                    return from_header

        from_url = self._sanitizeFilename(urlparse(url).path)
        return from_url or self.DEFAULT_FILENAME
