# Scrape the docs
## Convert any page from [The Python Standard Library](https://docs.python.org/3/library/index.html) to a working Jupyter notebook file.

In [1]:
# Copyright James Draper 2017 MIT License.

import re
from urllib import request
from urllib import parse
from bs4 import BeautifulSoup
import nbformat as nbf

# Choose which version of the Python docs and which module page to convert.
py_version = '3.6'
module = 'functools'

url = 'https://docs.python.org/{}/library/{}.html'.format(py_version, module)
# Read the page inside a ``with`` block so the HTTP response is closed as
# soon as its bytes have been read, instead of being left open.
with request.urlopen(url) as req:
    info = req.read()
soup = BeautifulSoup(info, 'html.parser')

# Isolate the portion of the page that we want: the first
# <div class="section"> inside <body>, rendered back to a string so it can be
# cut into pieces with ordinary string operations later on.
soup = soup.body
sections = soup.find_all('div', class_='section')
main_section = sections[0]
str_soup = main_section.decode()

# Get all of the Python code divs: every element in the body whose class
# matches the pattern 'python'.
py = soup.find_all(class_=re.compile('python'))

def clean_up(dirty_code):
    """Clean up sphinx formatted code.

    Parameters
    ----------
    dirty_code : object with a ``text`` attribute
        The code element whose ``text`` is either plain source code or an
        interactive-session transcript using ``>>>`` / ``...`` prompts.

    Returns
    -------
    str
        If ``text`` contains ``>>>``, only the lines that start with a
        ``>>>`` or ``...`` prompt are kept (output lines are dropped), the
        prompt and the single space after it are removed, and the lines are
        joined with newlines. Otherwise ``text`` is returned unchanged.
    """
    text = dirty_code.text
    if '>>>' not in text:
        return text

    # Raw string: '\.' in a normal string literal is an invalid escape
    # sequence. ``.+`` requires at least one character after the prompt, so a
    # bare ``...`` line (end of an indented block) is skipped.
    prompt_lines = re.findall(r'^(?:>{3}|\.{3})(.+)', text, flags=re.M)

    # Drop only the separator space that follows the prompt; if it is
    # missing, keep the line intact rather than eating its first character.
    return '\n'.join(
        line[1:] if line.startswith(' ') else line for line in prompt_lines
    )
    
# Split up the text on every code div.
# After the loop, chunks[i] holds the markup that comes before py[i], and
# remainder holds whatever text follows the last code div that was found.
chunks = []
remainder = str_soup

for code_div in py:
    # partition() cuts at the FIRST occurrence only, so a snippet that
    # appears more than once on the page does not truncate the remaining
    # text. It also copes with "not found" without raising.
    before, found, after = remainder.partition(str(code_div))
    chunks.append(before)
    if found:
        # Advance past the code div we just consumed. When the div's markup
        # is not present in the remaining text, the remainder is left as is.
        remainder = after

# Create the notebook object.
nb = nbf.v4.new_notebook()

# First cell: import everything from the chosen module so the examples that
# follow can use its names directly.
code_cell = nbf.v4.new_code_cell('from {} import *'.format(module))
nb['cells'].append(code_cell)

# Alternate between the HTML that precedes each code div (as a markdown cell)
# and the cleaned-up code itself (as a code cell).
for html_chunk, code_div in zip(chunks, py):
    # Formatting a chunk of HTML as a markdown cell.
    markdown_cell = nbf.v4.new_markdown_cell(html_chunk)
    nb['cells'].append(markdown_cell)
    # Formatting code as code cell.
    code_cell = nbf.v4.new_code_cell(clean_up(code_div))
    nb['cells'].append(code_cell)

# The text that follows the last code div would otherwise be lost; add it as
# a closing markdown cell when there is anything other than whitespace left.
if remainder.strip():
    nb['cells'].append(nbf.v4.new_markdown_cell(remainder))

# Write the output to a notebook file in the same directory.
fname = '{}_python{}_docs.ipynb'.format(module, py_version.replace('.',''))
nbf.write(nb, fname)