The idea behind this notebook is to download the material from <a href="http://www.dabeaz.com">David Beazley</a>'s tutorials and build a Jupyter notebook from each of them; the generated notebook points at the tutorial files extracted into a temporary directory.

In [None]:
import ast
import pathlib
import tempfile
import urllib.parse
import urllib.request
import zipfile
from glob import glob
from io import BytesIO
from itertools import filterfalse
from lib2to3.main import main as py2to3
from os.path import (join as pjoin, dirname)

import nbformat as nbf
from jupyter_core import paths
from lxml import (html, etree)

In [None]:
def _parse_conf_value(raw):
    """
    Turn the right-hand side of a configuration assignment into a string.

    String literals ('...', "...", r'...') are evaluated, so the surrounding
    quotes and any trailing "# comment" are dropped. Anything else is
    returned as text with surrounding single quotes removed.
    """
    raw = raw.strip()
    try:
        value = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        # Not a plain literal (e.g. a function call): return the text itself
        return raw.strip("'")
    # Keep the return type a string: non-string literals stay as written
    return value if isinstance(value, str) else raw.strip("'")


def get_notebook_conf(opt):
    """
    Return the value assigned to the "opt" key in the Jupyter configuration.

    Every "*.py" file of ``paths.jupyter_config_dir()`` is scanned. A line is
    taken into account when it is neither blank nor commented out and when
    the left-hand side of its assignment is ``opt`` itself or ends with
    ``.opt`` (e.g. ``c.NotebookApp.notebook_dir`` for ``opt="notebook_dir"``).

    Parameters:
        opt (str): name of the option to look up.

    Returns:
        str: the configured value, without its surrounding quotes.

    Raises:
        AssertionError: if the option is not set, or is set more than once.
    """
    res = []
    for conf_file in glob(pjoin(paths.jupyter_config_dir(), "*.py")):
        with open(conf_file, 'r') as buf:
            for line in buf:
                stripped = line.strip()
                # Skip blank lines and commented-out settings
                if not stripped or stripped.startswith("#"):
                    continue

                # Split on the FIRST "=" so that values containing "=" survive
                key, sep, raw_value = stripped.partition("=")
                key = key.strip()

                # Match on the assigned name only, never on the value
                if not sep or not (key == opt or key.endswith("." + opt)):
                    continue

                res.append(_parse_conf_value(raw_value))

    # Explicit raise (rather than `assert`) so the check survives `python -O`
    if len(res) != 1:
        raise AssertionError(
            f"{opt!r} not found or found more than once in the Jupyter "
            f"configuration ({len(res)} match(es))"
        )
    return res[0]

In [77]:
def get_online_content(href, notebook_dir):
    """
    Build a notebook from one tutorial page and write it under notebook_dir.

    Steps:
      1. download and parse the HTML page found at ``href``;
      2. download the first ".zip" archive linked from that page and extract
         it into the system temporary directory;
      3. for every <h4> section of the page, add a markdown title cell, then
         for every item of the element following it add a markdown cell
         (the serialised last element of the item) and one code cell per
         linked ".py" file, after running 2to3 on that file in place;
      4. write the notebook to
         notebook_dir/10_Github/Iterables, iterators, generators, coroutines/
         as "David_Beazley_<tutorial directory>.ipynb".

    Parameters:
        href (str): URL of the tutorial index page
            (e.g. "http://www.dabeaz.com/generators-uk/index.html").
        notebook_dir (str): root directory under which the notebook is saved.

    Raises:
        IndexError: if the page links no ".zip" archive or the archive is empty.
    """
    nb = nbf.v4.new_notebook()
    nb['cells'] = []

    tmp_dir = tempfile.gettempdir()

    # Tutorial name = last directory of the URL. PurePosixPath keeps the
    # split independent of the operating system running the notebook.
    tutorial_name = pathlib.PurePosixPath(href).parts[-2]

    # Create a parser and feed it with the urllib response
    parser = etree.HTMLParser()
    with urllib.request.urlopen(href) as response:
        page = html.parse(response, parser)

    zip_links = page.xpath("//a[contains(@href,'.zip')]")
    if not zip_links:
        raise IndexError(f"no .zip archive linked from {href}")

    # urljoin (unlike os.path.join) resolves relative, root-relative and
    # absolute links against the page URL.
    archive_url = urllib.parse.urljoin(href, zip_links[0].attrib['href'])
    with urllib.request.urlopen(archive_url) as response:
        archive_bytes = response.read()

    with zipfile.ZipFile(BytesIO(archive_bytes)) as archive:
        archive.extractall(tmp_dir)
        members = archive.namelist()

    if not members:
        raise IndexError(f"archive {archive_url} is empty")

    # Folder the sources were extracted into: the first path component of the
    # first member (member names always use "/"), or the temporary directory
    # itself when the archive has no top-level folder. The trailing "" keeps
    # a trailing separator on the resulting path.
    first_member = members[0]
    top_level = first_member.split("/", 1)[0] if "/" in first_member else ""
    archive_root = pjoin(tmp_dir, top_level, "")

    # repr() yields a valid string literal even for paths with backslashes
    nb['cells'].append(nbf.v4.new_code_cell(
        f"# Move do tmp file\nfrom os import chdir\nchdir({archive_root!r})"))

    for h4 in page.xpath('//h4'):
        section = h4.getnext()  # Get element sibling

        # itertext() also covers headings whose text sits in a child element
        title = "".join(h4.itertext()).split(':')[-1].strip()
        nb['cells'].append(nbf.v4.new_markdown_cell(f"""# {title}"""))

        if section is None:
            # Heading without a following element: nothing to list
            continue

        for li in section.iterchildren():
            # Only items that contain at least one child element are used
            if len(li) == 0:
                continue

            if li.tag != "li":
                print(f"Got: {li.tag}")

            # Write comment of the source code
            comment = etree.tostring(list(li.iter())[-1]).decode()
            nb['cells'].append(nbf.v4.new_markdown_cell(f"""{comment}."""))

            for anchor in li.findall("a"):
                # Anchors without href (e.g. named anchors) are ignored
                link = anchor.get('href', '')
                if not link.endswith('.py'):
                    continue

                # Get path to python source code
                file_path = pjoin(archive_root, link)

                # Convert python 2 source code to python 3, in place.
                # NOTE(review): lib2to3 is deprecated in recent Python
                # releases; this call needs an interpreter that ships it.
                py2to3("lib2to3.fixes", ['-w', '--no-diffs', file_path])

                # Read python source code and write to new code cell
                with open(file_path, 'r') as source_file:
                    source = source_file.read().strip()
                nb['cells'].append(nbf.v4.new_code_cell(source))

    fname = f"David_Beazley_{tutorial_name}.ipynb"

    output = pjoin(notebook_dir, "10_Github", "Iterables, iterators, generators, coroutines", fname)

    # Make sure the destination folder exists before writing
    pathlib.Path(output).parent.mkdir(parents=True, exist_ok=True)

    # Notebooks are JSON documents: write them as UTF-8 on every platform
    with open(output, 'w', encoding='utf-8') as f:
        nbf.write(nb, f)

# Get the <code>generators-uk</code> tutorial

In [78]:
# Directory configured as "notebook_dir" in the Jupyter configuration files
notebook_dir = get_notebook_conf(opt='notebook_dir')

# Index page of the tutorial to convert
href = "http://www.dabeaz.com/generators-uk/index.html"

# Download the tutorial and write the corresponding notebook
get_online_content(href=href, notebook_dir=notebook_dir)

# Fetching all tutorials

In [1]:
# URL of the page that is downloaded and searched for "materials" links
tutorials = "http://www.dabeaz.com/tutorials.html"

In [82]:
# HTML parser used to read the tutorials index
parser = etree.HTMLParser()

# Download the index and parse it while the HTTP response is still open
with urllib.request.urlopen(tutorials) as f:
    tutorialspage = html.parse(f, parser=parser)

In [92]:
# Build one notebook per "materials" link found on the tutorials index
for tuto in tutorialspage.xpath("//a[text()='materials']"):
    # urljoin (unlike os.path.join) also handles root-relative ("/x/...")
    # and absolute ("http://...") links
    url = urllib.parse.urljoin("http://www.dabeaz.com/", tuto.attrib["href"])
    print(f"Fetching {url}")
    get_online_content(url, notebook_dir)

Fetching http://www.dabeaz.com/modulepackage/index.html
Fetching http://www.dabeaz.com/finalgenerator/index.html
Fetching http://www.dabeaz.com/py3meta/index.html
Fetching http://www.dabeaz.com/pydata/index.html
Fetching http://www.dabeaz.com/python3io/index.html
Fetching http://www.dabeaz.com/usenix2009/pythonprog/index.html
Fetching http://www.dabeaz.com/usenix2009/concurrent/index.html
Fetching http://www.dabeaz.com/usenix2009/generators/index.html
Fetching http://www.dabeaz.com/coroutines/index.html
Fetching http://www.dabeaz.com/generators-uk/index.html
Fetching http://www.dabeaz.com/SwigMaster/index.html
