In [None]:
import requests
from bs4 import BeautifulSoup

class Crawler:
    '''Crawler for a single website that sends every request through one session.

    A ``requests.Session`` is created at construction time. When ``login_url``
    is supplied, ``login_form_data`` is POSTed to it once through that session,
    and the same session is then reused by ``parseUrl``.
    '''

    def __init__(self, base_url, login_url=None, login_form_data=None):
        self.base_url = base_url
        self.session = requests.Session()

        # Nothing more to do for sites that need no login.
        if login_url is None:
            return
        self.session.post(login_url, login_form_data)

    def parseUrl(self, url, debug=False):
        '''Fetch ``url`` with the shared session and return it as a BeautifulSoup.

        With ``debug`` set, the URL is printed before the request and the
        parsed document is pretty-printed by ``parseContent``.
        '''
        if debug:
            print('Downloading from:', url)

        response = self.session.get(url)
        return self.parseContent(response.content, debug)

    def parseContent(self, html, debug=False):
        '''Parse raw HTML with the "html.parser" backend and return the soup.

        With ``debug`` set, the prettified document is printed first.
        '''
        document = BeautifulSoup(html, "html.parser")
        if debug:
            print(document.prettify())
        return document

In [None]:
import os
from urllib.parse import urlparse

def getFilenameFromHeader(headers, url):
    """
    Get filename from content-disposition, else parse from url basename.

    Args:
        headers: mapping of response headers; only the 'content-disposition'
            key is looked up (via ``.get``).
        url: the URL that was requested, used as a fallback source.

    Returns:
        The file name as a string with no directory components. This is the
        ``filename=`` parameter of the header when present and non-empty,
        otherwise the basename of the URL path (which may be an empty string
        when the path ends with '/').
    """
    # Imported locally so the function works even if the surrounding
    # file never imports `re` at module level.
    import re

    cd = headers.get('content-disposition')
    if cd:
        # Accept either a quoted value ("my file.txt") or a bare token that
        # ends at the next ';' so trailing parameters are not swallowed.
        match = re.search(r'filename=\s*(?:"([^"]*)"|([^;]+))', cd, flags=re.IGNORECASE)
        if match:
            quoted, bare = match.group(1), match.group(2)
            raw = quoted if quoted is not None else bare
            # The header is server-controlled: drop any directory part
            # (including Windows-style separators) so the name cannot point
            # outside the caller's target folder.
            fname = os.path.basename(raw.strip().replace('\\', '/'))
            if fname:
                return fname

    urlObj = urlparse(url)
    return os.path.basename(urlObj.path)

In [None]:
from pathlib import Path
import requests

class Downloader:
    '''Taking care of local storage.

    ``root_folder`` holds the root directory as a string ending with "/",
    so sub-folder names can be appended to it directly.
    '''

    def __init__(self, root_path, debug=False):
        self.root_folder = self.createFolder(root_path, debug)

    def createFolder(self, folder_path, debug=False):
        '''Create ``folder_path`` (and missing parents) and return it as a string.

        Args:
            folder_path: directory to create, as a string or ``pathlib.Path``.
            debug: when true, print the path before creating it.

        Returns:
            The folder path as a string with exactly one trailing "/" appended
            when the input did not already end with one.
        '''
        if debug:
            print("Create folder path:", folder_path)

        Path(folder_path).mkdir(parents=True, exist_ok=True)

        # str() lets Path arguments through; the slash check avoids "dir//".
        folder = str(folder_path)
        return folder if folder.endswith("/") else folder + "/"

    def downloadFile(self, url, sub_folder_target, debug=False):
        '''Download ``url`` into ``root_folder + sub_folder_target``.

        The file name comes from ``getFilenameFromHeader``. The body is
        streamed to disk in 1024-byte chunks.

        Args:
            url: address of the file to fetch.
            sub_folder_target: sub-folder (relative to the root folder) that
                receives the file; it is created if missing.
            debug: when true, print the destination path, or the HTTP status
                when the download is skipped.

        Returns:
            The path of the written file, or ``None`` when the server did not
            answer with status 200.
        '''
        # The context manager releases the streamed connection even when the
        # body is never read (non-200) or writing the file fails.
        with requests.get(url, allow_redirects=True, stream=True) as response:
            if response.status_code != 200:
                if debug:
                    print("Download skipped, HTTP status:", response.status_code, url)
                return None

            filename = getFilenameFromHeader(response.headers, url)

            folder = self.createFolder(self.root_folder + sub_folder_target)
            filepath = folder + filename

            if debug:
                print("Write file to :", filepath)

            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)

        return filepath

        # TODO : HANDLE EXCEPTION LOGS