In [1]:
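# Downloads a website and converts it into a set of interlinked Markdown (.md) files:
# each page is fetched, converted with markdownify, has its links rewritten, and is
# written under a local output directory that mirrors the site's URL layout.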

In [4]:
import os
import bs4
import requests
import re

def get_soup(url, timeout: float = 10.0) -> "bs4.BeautifulSoup":
    """Fetch ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would stall the
            whole notebook on a single unresponsive host.

    Returns:
        The parsed document, built with the stdlib ``html.parser`` backend.

    Raises:
        requests.RequestException: on connection problems or when the
            timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    return bs4.BeautifulSoup(response.text, 'html.parser')

def get_title(soup: "bs4.BeautifulSoup", default: str = 'untitled') -> str:
    """Turn a page's ``<title>`` into a filename-friendly identifier.

    Punctuation is removed, surrounding whitespace is dropped and every
    remaining run of whitespace becomes a single underscore, e.g.
    ``"The Rectilinear 2D Mesh"`` -> ``"The_Rectilinear_2D_Mesh"``.

    Args:
        soup: Parsed document; only its ``title`` attribute is used.
        default: Value returned when the page has no ``<title>`` or the
            title is empty once cleaned.

    Returns:
        The sanitised title, or ``default`` if nothing usable remains.
    """
    title_tag = soup.title
    # get_text() is used instead of .string because .string is None when the
    # <title> element contains nested markup, and a missing <title> makes
    # soup.title itself None.
    raw_title = title_tag.get_text() if title_tag is not None else ''
    # Strip before joining so padded titles do not gain stray '_' at the ends.
    cleaned = re.sub(r'[^\w\s]', '', raw_title).strip()
    return re.sub(r'\s+', '_', cleaned) or default

import urllib.parse
def post_process_md(md: str, page_url: str, website_root: str) -> str:
    """Rewrite the link targets in converted Markdown.

    Every ``](target)`` occurrence (links and images) is handled in one pass:

    1. ``target`` is resolved against ``page_url`` with ``urljoin``, so
       relative, ``../``, host-rooted (``/x``) and protocol-relative links all
       become absolute, while other schemes such as ``mailto:`` are left as
       they are.
    2. If the resolved URL lies under ``website_root`` and points at a
       ``.html`` page, it is replaced by the matching local ``.md`` path,
       keeping any ``#fragment``.

    Args:
        md: Markdown text produced from the page at ``page_url``.
        page_url: Absolute URL the Markdown was generated from.
        website_root: URL prefix of the mirrored site; links below it are
            localised. It is compared as a literal prefix, not as a regex.

    Returns:
        The Markdown with rewritten link targets.

    NOTE(review): localised links are expressed relative to ``website_root``
    (``pasp/Dispersion.md``), not relative to the linking page. That matches
    the previous behaviour; confirm it is what the Markdown viewer expects.
    """
    html_suffix = '.html'

    def _rewrite(match: "re.Match") -> str:
        # group(1) is the URL itself; group(2) is whatever follows it inside
        # the parentheses (typically an optional ' "title"').
        target, trailer = match.group(1), match.group(2)
        resolved = urllib.parse.urljoin(page_url, target)
        if resolved.startswith(website_root):
            bare_url, fragment = urllib.parse.urldefrag(resolved)
            if bare_url.endswith(html_suffix):
                resolved = bare_url[len(website_root):-len(html_suffix)] + '.md'
                if fragment:
                    resolved += '#' + fragment
        return '](' + resolved + trailer + ')'

    # A callable replacement keeps backslashes or group references in URLs
    # from being interpreted as regex replacement syntax.
    return re.sub(r'\]\(([^)\s]+)([^)]*)\)', _rewrite, md)

In [5]:
# Crawl configuration. The three URLs share one prefix, so they are composed
# from the site root instead of being spelled out three times.

# Prefix of the site being mirrored; local paths are taken relative to it.
website_root = 'https://ccrma.stanford.edu/~jos/'
# Only links containing this prefix are followed by the crawler.
search_range = website_root + 'pasp/'
# Page the crawl starts from.
website_first_page = search_range + 'Rectilinear_2D_Mesh.html'

# Local directory that receives the generated .md files.
md_root = 'output/jos'


In [6]:
from pathlib import Path
import queue
from markdownify import markdownify


# Breadth-first crawl starting at website_first_page. Each page is converted to
# Markdown, written under md_root at a path mirroring its URL below
# website_root, and its in-range links are queued for later visits.
visited = set()
max_pages = 10         # upper bound on pages processed in one run
request_timeout = 10   # seconds; without it one stalled host blocks the kernel

# URLs are compared by prefix, so the start page must live under the root;
# otherwise no sensible output path can be derived for it.
if not website_first_page.startswith(website_root):
    raise ValueError(f'{website_first_page!r} is not under {website_root!r}')

q = queue.Queue()
# Fragments are dropped everywhere so "page.html" and "page.html#sec" are
# treated as the same page and fetched only once.
q.put(urllib.parse.urldefrag(website_first_page).url)

while not q.empty():
    if len(visited) >= max_pages:
        break
    url = q.get()
    if url in visited:
        continue

    visited.add(url)
    try:
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best-effort crawl: report the failure and carry on rather than
        # saving an error page as Markdown or aborting the whole run.
        print('skipping', url, exc)
        continue
    html = response.text

    print('visiting', url, len(html), 'bytes')
    md = markdownify(html,keep_inline_images_in=['td'])
    md = post_process_md(md, url, website_root)

    soup = bs4.BeautifulSoup(html, 'html.parser')

    # Derive the output path with plain string slicing: URLs are not
    # filesystem paths, and lstrip('/') keeps the joined path inside md_root
    # even if website_root was given without its trailing slash.
    relative_url = url[len(website_root):].lstrip('/') or 'index'
    md_path = (Path(md_root) / relative_url).with_suffix('.md')
    md_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding: the platform default may be unable to represent
    # characters that appear in the page.
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write(md)

    for a in soup.find_all('a'):
        href = a.get('href')
        if href is None:
            continue
        # urljoin leaves absolute URLs alone and resolves everything else, so
        # no "starts with http" guess is needed.
        href = urllib.parse.urldefrag(urllib.parse.urljoin(url, href)).url
        # Prefix tests, not substring tests: a foreign URL that merely
        # contains the root somewhere (e.g. in a query string) is out of scope.
        in_scope = href.startswith(search_range) and href.startswith(website_root)
        if in_scope and href not in visited:
            q.put(href)

visiting https://ccrma.stanford.edu/~jos/pasp/Rectilinear_2D_Mesh.html 9367 bytes
visiting https://ccrma.stanford.edu/~jos/pasp/Dispersion.html 9227 bytes
visiting https://ccrma.stanford.edu/~jos/pasp/Digital_Waveguide_Mesh.html 8605 bytes
visiting https://ccrma.stanford.edu/~jos/pasp/pasp.html 85382 bytes
visiting https://ccrma.stanford.edu/~jos/pasp/Index_this_Document.html 219631 bytes


In [44]:
# Ad-hoc inspection of the `soup` object left over from the crawl cell
# (presumably the number of top-level nodes bs4 parsed — TODO confirm).
# NOTE(review): this cell's execution count (44) is out of sequence with the
# cells above, and it relies on leftover kernel state; consider removing it.
len(soup)

6

In [46]:
# NOTE(review): `title` is not assigned in any cell visible in this notebook,
# so on a fresh kernel this cell raises NameError unless an out-of-view cell
# defines it. Presumably it came from get_title(...) on the first page —
# confirm, then either restore that assignment or delete this cell.
title

'The_Rectilinear_2D_Mesh'