fastdoc/asciidoc.py

# AUTOGENERATED! DO NOT EDIT! File to edit: 00_asciidoc.ipynb (unless otherwise specified).

# Names exported by `from <this module> import *`; underscore-prefixed helpers are deliberately left out.
__all__ = ['markdown_cell', 'code_cell', 'remove_hidden_cells', 'isolate_adoc_blocks', 'replace_old_jekylls',
           'hide_input', 'hide_output', 'extract_html', 'split_max_len', 'deal_error', 'remove_interrupted_pbars',
           'get_cell_meta', 'caption_tables', 'TEXT_MAX_WIDTH', 'wrap_text_outputs', 'CODE_MAX_LEN', 'check_code_len',
           'deal_quotes', 'add_title_level', 'deal_with_lists', 'replace_jekylls', 'interpret_sidebar',
           'IMAGE_CONV_MULT', 'process_images', 'wrap_references', 'extract_attachments', 'sidebar_headers',
           'code_cell_tfms', 'md_cell_tfms', 'add_new_line', 'treat_notebook', 'rep_spec_tok', 'ipython2python',
           'remove_cells', 'clear_cells', 'format_latex', 'format_outputs', 'fix_quotes', 'fix_references',
           'format_tables', 'remove_lines', 'post_process_tfms', 'post_process', 'c', 'exporter', 'add_metadata',
           'output_num', 'IMAGE_OUT_MULT', 'get_output_width', 'convert_nb', 'copy_images', 'fastdoc_convert_all']

# Cell
from .imports import *
from fastcore.script import *
from warnings import warn

# Cell
def markdown_cell(md):
    "Build a markdown notebook cell whose source is `md` and whose metadata is empty."
    fields = {'cell_type': 'markdown', 'source': md, 'metadata': {}}
    return nbformat.notebooknode.NotebookNode(fields)

# Cell
def code_cell(code, metadata=None, outputs=None):
    "Build a code notebook cell from `code`; `metadata` and `outputs` default to empty containers."
    if metadata is None: metadata = {}
    if outputs is None: outputs = []
    fields = {'cell_type': 'code',
              'execution_count': None,
              'source': code,
              'metadata': metadata,
              'outputs': outputs}
    return nbformat.notebooknode.NotebookNode(fields)

# Cell
# Matches a line that consists only of a `#hide` or `#clean` comment
_re_hidden = re.compile(r'^\s*#\s*(hide|clean)\s*$', re.MULTILINE)

# Cell
def remove_hidden_cells(cells):
    "Return the cells whose source has no line that is just a `#hide` or `#clean` comment."
    kept = []
    for cell in cells:
        if _re_hidden.search(cell['source']): continue
        kept.append(cell)
    return kept

# Cell
def isolate_adoc_blocks(cells):
    """Split markdown cells around fenced ```asciidoc blocks.

    The text before, between and after the fences becomes markdown cells; the content of each
    fenced block becomes a code cell whose source is prefixed with `##clear##`. Cells without
    such a fence are passed through untouched.
    """
    out = []
    for cell in cells:
        is_md = cell['cell_type'] == 'markdown'
        if not (is_md and re.search(r'```\s*asciidoc', cell['source']) is not None):
            out.append(cell)
            continue
        lines = cell['source'].split('\n')
        # `start` is the index of the first line of the chunk currently being accumulated
        in_adoc, start = False, 0
        for idx, line in enumerate(lines):
            if not in_adoc and re.search(r'^```\s*asciidoc\s*$', line) is not None:
                out.append(markdown_cell('\n'.join(lines[start:idx])))
                in_adoc, start = True, idx + 1
            elif in_adoc and re.search(r'^```\s*$', line) is not None:
                out.append(code_cell('##clear##' + '\n'.join(lines[start:idx])))
                in_adoc, start = False, idx + 1
        assert not in_adoc, f"Triple-quote asciidoc block not ended in {cell['source']}"
        out.append(markdown_cell('\n'.join(lines[start:])))
    return out

# Cell
#TODO: remove when all notebooks have been ported to v2
def replace_old_jekylls(cell):
    """Turn a legacy `jekyll_warn/note/important('...')` cell into a `##clear##` admonition block.

    Cells whose source does not start with `jekyll` are returned unchanged. For the others the
    metadata and outputs are emptied and the source is replaced by the asciidoc admonition.
    """
    src = cell['source']
    if not src.startswith('jekyll'): return cell
    kind_pat = re.compile(r"""jekyll_(.*)\(['"].*""")
    msg_pat = re.compile(r"""jekyll_.*\(['"]+([\s\S]*[^'"])['"]+\)$""")
    kind = kind_pat.match(src).group(1)
    message = msg_pat.match(src).group(1)
    labels = {'warn':'WARNING', 'note':'NOTE', 'important':'TIP'}
    cell['metadata'] = {}
    cell['source'] = f'##clear##[{labels[kind]}]\n====\n{message}\n===='
    cell['outputs'] = []
    return cell

# Cell
# Matches a line that consists only of a `#hide_input` comment
_re_hide_input = re.compile(r'^\s*#\s*hide_input\s*$', re.MULTILINE)

# Cell
def hide_input(cell):
    "Replace the source by `##remove##` when the cell metadata or a `#hide_input` comment asks for it."
    flagged = cell['metadata'].get('hide_input', False)
    if not flagged: flagged = _re_hide_input.search(cell["source"]) is not None
    if flagged: cell['source'] = '##remove##'
    return cell

# Cell
# Matches a line that consists only of a `#hide_output` comment
_re_hide_output = re.compile(r'^\s*#\s*hide_output\s*$', re.MULTILINE)

# Cell
def hide_output(cell):
    "Drop the outputs (and the `#hide_output` comment) of cells that are collapsed or flagged."
    wanted = cell['metadata'].get('collapsed', False)
    if not wanted: wanted = _re_hide_output.search(cell["source"]) is not None
    if not wanted: return cell
    cell['outputs'] = []
    cell['source'] = re.sub(r'#\s*hide_output\s*\n', '', cell['source'])
    return cell

# Cell
def extract_html(cell):
    "Move every `text/html` output payload into `text/plain`, overwriting any plain text there."
    for out in cell['outputs']:
        if 'data' not in out: continue
        data = out['data']
        if 'text/html' not in data: continue
        data['text/plain'] = data.pop('text/html')
    return cell

# Cell
def split_max_len(text, l):
    """Break `text` on spaces into lines of at most `l` characters, joined by newlines.

    Each word is written as a space followed by the word, so every line starts with a space
    (kept on purpose: it is the output format callers already receive for text that fits on
    one line). A word that does not fit on the current line starts the next line instead of
    being dropped; a single word longer than `l` gets a line of its own and is not split.
    """
    line, lines = "", []
    for word in text.split(' '):
        if len(line) + len(word) + 1 <= l:
            line += f' {word}'
            continue
        # Flush the current line (if any) and carry the overflowing word over to the next one
        if line: lines.append(line)
        line = f' {word}'
    if line: lines.append(line)
    return "\n".join(lines)

# Cell
def deal_error(cell):
    """Replace each error output by an `execute_result` output holding `ename: evalue`.

    The message is passed through `split_max_len` with a limit of 81 characters; outputs of
    any other type are left as they are.
    """
    outputs = cell['outputs']
    for idx, out in enumerate(outputs):
        if out['output_type'] != 'error': continue
        summary = f"{out['ename']}: {out['evalue']}"
        replacement = {
            'data': {'text/plain': split_max_len(summary, 81) },
            'execution_count': None,
            'metadata': {},
            'output_type': 'execute_result'}
        outputs[idx] = nbformat.notebooknode.NotebookNode(replacement)
    return cell

# Cell
def remove_interrupted_pbars(cell):
    "Drop outputs whose `text/plain` payload contains `progress-bar-interrupted`."
    def _is_interrupted(out):
        if 'data' not in out: return False
        if 'text/plain' not in out['data']: return False
        return 'progress-bar-interrupted' in out['data']['text/plain']
    cell['outputs'] = [out for out in cell['outputs'] if not _is_interrupted(out)]
    return cell

# Cell
def get_cell_meta(cell):
    """Read `#id`, `#caption`, `#alt` and `#width` comments into the cell metadata.

    For each attribute found, the rest of the comment line is stored under that key in
    `cell["metadata"]` and the comment is removed from the source.
    """
    for attr in ("id", "caption", "alt", "width"):
        found = re.search(r'^\s*#\s*' + attr + r'\s(.*)$', cell["source"], re.MULTILINE)
        if found is None: continue
        cell["metadata"][attr] = found.group(1)
        cell["source"] = re.sub(r'#\s*' + attr + r'\s.*?($|\n)', '', cell["source"])
    return cell

# Cell
def caption_tables(cell):
    """Add the cell's `id` and `caption` metadata to an HTML table in its first output.

    Only the first output is looked at, and only when its `text/plain` payload starts with a
    `<table ...>` tag that has attributes. The id becomes an `id="..."` attribute and the
    caption a `<caption>` element right after the opening tag. Any other cell is returned
    unchanged.
    """
    if 'outputs' not in cell or len(cell['outputs']) == 0: return cell
    output = cell['outputs'][0]
    if 'data' not in output or 'text/plain' not in output['data']: return cell
    text = output['data']['text/plain']
    if re.search(r'^<\s*table\s+([^>]*>)', text) is None: return cell
    table_id = cell['metadata'].get('id', None)
    caption = cell['metadata'].get('caption', None)
    text_id = '' if table_id is None else f'id="{table_id}" '
    text_caption = '' if caption is None else f'\n  <caption>{caption}</caption>'
    # A function replacement inserts the id/caption literally; as a template string, any
    # backslash in them (e.g. LaTeX in a caption) would be read as a regex escape.
    def _rep(m): return '<table ' + text_id + m.group(1) + text_caption
    output['data']['text/plain'] = re.sub(r'^<\s*table\s+([^>]*>)', _rep, text)
    cell['outputs'][0] = output
    return cell

# Cell
# Maximum width, in characters, of a wrapped line of text output
TEXT_MAX_WIDTH = 80

# Cell
def _wrap_output(output):
    """Wrap the text of one output to `TEXT_MAX_WIDTH`, marking continuation lines with ` > `.

    Stream-like outputs (with a `text` key) are wrapped in place; otherwise the `text/plain`
    payload is wrapped unless it starts with a `<table` tag. Other outputs are returned as is.
    """
    def _wrap(text):
        wrapped = []
        for line in text.split('\n'):
            pieces = textwrap.wrap(line, width=TEXT_MAX_WIDTH, subsequent_indent = ' > ')
            wrapped.append('\n'.join(pieces))
        return '\n'.join(wrapped)

    if 'text' in output:
        output['text'] = _wrap(output['text'])
        return output
    if 'data' in output and 'text/plain' in output['data']:
        plain = output['data']['text/plain']
        if re.search(r'^<\s*table\s*([^>]*>)', plain) is None:
            output['data']['text/plain'] = _wrap(plain)
    return output

# Cell
def wrap_text_outputs(cell):
    "Wrap the text of every output of `cell`; cells without outputs are returned unchanged."
    if 'outputs' in cell and len(cell['outputs']) > 0:
        cell['outputs'] = [_wrap_output(out) for out in cell['outputs']]
    return cell

# Cell
# Longest line of code accepted without a warning
CODE_MAX_LEN = 80

# Cell
def check_code_len(cell):
    "Warn once per source line longer than `CODE_MAX_LEN`; the cell itself is returned untouched."
    for line in cell['source'].split('\n'):
        if len(line) <= CODE_MAX_LEN: continue
        warn(f"Found code too long in a cell:\n{cell['source']}")
    return cell

# Cell
def deal_quotes(cell):
    "Strip double quotes around backticked text and replace single quotes by the `xxsinglequote` token."
    unquoted = re.sub(r'"`([^`]*)`"', r'`\1`', cell['source'])
    cell['source'] = unquoted.replace("'", 'xxsinglequote')
    return cell

# Cell
def add_title_level(cell):
    "Push a markdown heading one level down by prepending a `#` when the source starts with one."
    src = cell['source']
    if src.startswith('#'): cell['source'] = f'#{src}'
    return cell

# Cell
def deal_with_lists(cell):
    """Mark list lines with the `xxnewl`/`xxnewls` tokens.

    Lines starting with optional digits and a dot lose their number and get `xxnewl` appended;
    `- term:: definition` lines become `term::xxnewlsdefinitionxxnewl`.
    """
    def _mark(line):
        line = re.sub(r'(^\s*)\d*\.(.*)$', r'\1.\2xxnewl', line)
        return re.sub(r'(^\s*)-\s(.*::)\s(.*)$', r'\2xxnewls\3xxnewl', line)
    cell['source'] = '\n'.join(_mark(line) for line in cell['source'].split('\n'))
    return cell

# Cell
_re_block_notes = re.compile(r"""
# Catches any pattern > Title: content with title in group 1 and content in group 2
^\s*>\s*     # > followed by any number of whitespace
([^:]*)      # Catching group for any character but :
:\s*         # : then any number of whitespace
([^\n]*)     # Catching group for anything but a new line character
(?:\n|$)     # Non-catching group for either a new line or the end of the text
""", re.VERBOSE | re.MULTILINE)

# Matches a block-quote line without any colon, which `_re_block_notes` cannot convert.
# Raw string: `\s` is not a valid escape in a normal string literal.
_re_forgot_column = re.compile(r"^\s*>[^:]*$", re.MULTILINE)

# Cell
# Matches markdown links `[text](url)`: text in group 1, url in group 2 (raw string for `\[`, `\(`)
_re_urls = re.compile(r"\[(.*?)\]\((.*?)\)")

# Cell
def replace_jekylls(cell):
    """Convert `> Kind: text` block quotes of a markdown cell into fenced asciidoc blocks.

    Known kinds map to an admonition label; an unknown kind is upper-cased and used as the
    label; an empty kind gives a plain quote block. Author kinds (`a`, `j`, `s`) become a TIP
    titled "<Name> says", `jargon`/`question` become a titled NOTE (for jargon the text up to
    the first `': '` is the term). Markdown links in the text are rewritten as `url[text]`.
    A warning is emitted when a block-quote line without a colon is found.
    """
    block_names = {'warning':'WARNING', 'note':'NOTE', 'important':'TIP', 'tip': 'TIP', 'stop': 'WARNING',
                   'jargon':'JARGON', 'question':'QUESTION', 'a': 'ALEXIS', 'j': 'JEREMY', 's': 'SYLVAIN'}
    def _rep(m):
        typ,text = m.groups()
        text = re.sub(_re_urls, r"\2[\1]", text)
        name = block_names.get(typ.lower(), typ.upper())
        if name in ['ALEXIS', 'JEREMY', 'SYLVAIN', 'JARGON', 'QUESTION']:
            title = name[0]+name[1:].lower()
            surro = 'NOTE'
            if name=='JARGON':
                # The term being defined is everything before the first ': '
                splits = text.split(': ')
                title = f'{title}: {splits[0]}'
                text = re.sub(_re_urls, r"\2[\1]", ': '.join(splits[1:]))
            if name in ['ALEXIS', 'JEREMY', 'SYLVAIN']:
                title = f"{title} says"
                surro = 'TIP'
            return f'```asciidoc\n.{title}\n[{surro}]\n====\n{text}\n====\n```\n'
        elif len(name) != 0: return f"```asciidoc\n[{name}]\n====\n{text}\n====\n```\n"
        else:              return f"```asciidoc\n____\n{text}\n____\n```\n"
    if _re_forgot_column.search(cell["source"]): warn("Found a non-processed block quote, please fix")
    cell["source"] = _re_block_notes.sub(_rep, cell["source"])
    return cell

# Cell
# Matches a `#sidebar <title>` comment line, title in group 1
_re_sidebar = re.compile(r'^\s*#\s*sidebar\s(.*)$', re.MULTILINE)

# Cell
def interpret_sidebar(cell):
    "Turn a cell whose first line is `#sidebar <title>` into a fenced asciidoc sidebar block."
    first_line, *rest = cell["source"].split("\n")
    found = _re_sidebar.search(first_line)
    if found is None: return cell
    title = found.group(1)
    body = "\n".join(rest)
    cell["source"] = f"```asciidoc\n.{title}\n****\n{body}\n****\n```\n"
    return cell

# Cell
# Matches an `<img ...>` tag at the start of a line, whole tag in group 1
_re_md_image = re.compile(r"^(<img\ [^>]*>)", re.MULTILINE)

# Cell
# Factor applied to the width/height of images declared in markdown cells
IMAGE_CONV_MULT = 0.6

# Cell
def process_images(cell):
    """Replace `<img ...>` tags at line starts by fenced asciidoc `image::` macros.

    The alt text is always emitted; width (and height, only when a width is present) are
    scaled by `IMAGE_CONV_MULT`. An `id` attribute becomes an anchor and a `caption` attribute
    a block title.
    NOTE(review): `HTMLParseAttrs` is defined outside this file; it is used here as a callable
    returning a dict-like of the tag's attributes — confirm against its definition.
    """
    parse_attrs = HTMLParseAttrs()
    def _to_adoc(match):
        tag = parse_attrs(match.group(1))
        parts = ['"' + tag.get('alt', '') + '"']
        if 'width' in tag:
            parts.append(str(int(IMAGE_CONV_MULT * int(tag['width']))))
            if 'height' in tag: parts.append(str((int(IMAGE_CONV_MULT * int(tag['height'])))))
        suffix = f"[{', '.join(parts)}]"
        anchor = f"[[{tag['id']}]]\n" if 'id' in tag else ""
        title = f".{tag['caption']}\n" if 'caption' in tag else ""
        return f"```asciidoc\n{anchor}{title}image::{tag['src']}{suffix}\n```"
    cell["source"] = _re_md_image.sub(_to_adoc, cell["source"])
    return cell

# Cell
# Matches an asciidoc cross reference `<<name>>`, name in group 1
_re_reference = re.compile(r'<<([^>]*)>>')

# Cell
def wrap_references(cell):
    "Protect `<<name>>` cross references by rewriting them as `xxref` + name + `xxeref`."
    protected = _re_reference.sub(r'xxref\1xxeref', cell["source"])
    cell["source"] = protected
    return cell

# Cell
def extract_attachments(cell, dest):
    """Write the first attachment of `cell` to a new file in `dest` and point the source at it.

    Cells without an `attachments` entry are returned unchanged. Otherwise the first mime
    type/payload pair of the first attachment is base64-decoded into the first free
    `att_NNNNN.<ext>` file in `dest`, the `attachments` entry is deleted and the
    `attachment:image.png` reference in the source is replaced by the file path.
    """
    if 'attachments' not in cell: return cell
    mime,img = first(first(cell['attachments'].values()).items())
    ext = mime.split('/')[1]
    # Pick the first file name that is not taken yet
    for i in range(99999):
        p = dest/(f'att_{i:05d}.{ext}')
        if not p.exists(): break
    p.write_bytes(b64decode(img))
    del(cell['attachments'])
    # Function replacement: the path is inserted literally, so backslashes in it (Windows
    # paths) are not interpreted as regex template escapes.
    cell['source'] = re.sub('attachment:image.png', lambda m: str(p), cell['source'])
    return cell

# Cell
# `## Sidebar: <title>` heading (title in group 1) and the matching `## End sidebar` heading
_re_sidebar_title = re.compile(r'#+\s+Sidebar:\s+(.*)$', re.IGNORECASE)
_re_end_sidebar = re.compile(r'#+\s+End sidebar', re.IGNORECASE)

# Cell
def sidebar_headers(cell):
    "Replace `Sidebar: <title>` and `End sidebar` headings by fenced asciidoc sidebar delimiters."
    opened = _re_sidebar_title.sub(r'```asciidoc\n.\1\n****\n```', cell['source'])
    cell['source'] = _re_end_sidebar.sub(r'```asciidoc\n****\n```', opened)
    return cell

# Cell
# Cell transforms that `treat_notebook` composes for code cells
code_cell_tfms = [
    get_cell_meta, replace_old_jekylls, hide_input, hide_output, extract_html,
    deal_error, remove_interrupted_pbars, wrap_text_outputs, caption_tables,
    check_code_len,
]
# Cell transforms that `treat_notebook` composes (after attachment extraction) for markdown cells
md_cell_tfms = [
    deal_quotes, wrap_references, interpret_sidebar, sidebar_headers,
    add_title_level, deal_with_lists, process_images, replace_jekylls,
]

# Cell
def add_new_line(cell):
    "Prefix the cell source with a newline."
    cell['source'] = f"\n{cell['source']}"
    return cell

# Cell
def treat_notebook(nb, dest):
    """Prepare the cells of `nb` for export and return `nb`.

    Hidden cells are removed, every remaining cell goes through the transform matching its
    cell type (markdown cells first get their attachments extracted into `dest`), and fenced
    asciidoc blocks are finally isolated into their own cells.
    """
    nb['cells'] = remove_hidden_cells(nb['cells'])
    code_tfm = compose(*code_cell_tfms)
    md_tfm = compose(partial(extract_attachments, dest=dest), *md_cell_tfms)
    by_type = {'code': code_tfm, 'markdown': md_tfm, 'raw': add_new_line}
    treated = []
    for cell in nb['cells']:
        treated.append(by_type[cell['cell_type']](cell))
    nb['cells'] = treated
    nb['cells'] = isolate_adoc_blocks(nb['cells'])
    return nb

# Cell
def rep_spec_tok(adoc, metadata=None):
    """Replace the placeholder tokens in `adoc` by the text they stand for.

    `xxsinglequote` becomes `'`, `xxnewls` a newline followed by two spaces, and `xxnewl`
    followed by one whitespace character a single newline. `metadata` is accepted for
    signature compatibility with the other post-processing transforms and is not used.
    """
    adoc = re.sub('xxsinglequote', "'", adoc)
    # `xxnewls` must go first: it starts with the same characters as `xxnewl`
    adoc = re.sub('xxnewls', '\n  ', adoc)
    # Raw string: `\s` is a regex class, not a valid Python string escape
    return re.sub(r'xxnewl\s', '\n', adoc)

# Cell
def ipython2python(adoc, metadata=None):
    "Relabel `[source, ipython3]` listings as `[source, python]`; `metadata` is not used."
    return adoc.replace('[source, ipython3]', '[source, python]')

# Cell
def remove_cells(adoc, metadata=None):
    "Delete python listings that are empty or contain only `##remove##`; `metadata` is not used."
    empty_listing = r'\n\[source, python\]\n----(\n)*----\n'
    removed_listing = r'\n\[source, python\]\n----\n##remove##\n----\n'
    without_empty = re.sub(empty_listing, '', adoc)
    return re.sub(removed_listing, '', without_empty)

# Cell
# Matches a python listing whose content starts with `##clear##`; the rest of the content is group 1
_re_clear = re.compile(r'\[source, python\]\n----\n##clear##(.*?)----\n', re.DOTALL)
def clear_cells(adoc, metadata=None):
    "Unwrap `##clear##` listings, keeping only their content; `metadata` is not used."
    return _re_clear.sub(r'\1', adoc)

# Cell
def format_latex(adoc, metadata=None):
    """Rewrite `latexmath:` macros; `metadata` is not used.

    Inline `latexmath:[$...$]` becomes `latexmath:[\\(...\\)]`, and display
    `latexmath:[\\[...\\]]` becomes a `[latexmath]` passthrough block with an equation environment.
    """
    inline_pat, inline_rep = r"latexmath:\[\$([^\$]*)\$\]", r"latexmath:[\\(\1\\)]"
    block_pat = r"latexmath:\[\\\[(.*)\\\]\]"
    block_rep = r"\n[latexmath]\n++++\n\\begin{equation}\n\1\n\\end{equation}\n++++\n"
    with_inline = re.sub(inline_pat, inline_rep, adoc)
    return re.sub(block_pat, block_rep, with_inline)

# Cell
# Matches a listing that only holds a markdown image `![svg|png|jpg](name)`, name in group 1
_re_image_output = re.compile(r'----\n!\[(?:svg|png|jpg)\]\((.+)\)\n----')

# Cell
def format_outputs(adoc, metadata=None):
    """Replace image-output listings by asciidoc `image::` macros.

    `metadata` may map `'folder'` to the image folder (default `.`) and each image name to a
    dict with optional `alt`, `width`, `height`, `id` and `caption` entries. Height is only
    emitted together with a width.
    """
    meta = {} if metadata is None else metadata
    folder = meta.get('folder', '.')
    def _image_macro(match):
        name = match.group(1)
        info = meta[name] if name in meta else {}
        parts = ['"' + info.get('alt', '') + '"']
        if 'width' in info:
            parts.append(str(info['width']))
            if 'height' in info: parts.append(str(info['height']))
        suffix = f"[{', '.join(parts)}]"
        anchor = f"[[{info['id']}]]\n" if 'id' in info else ""
        title = f".{info['caption']}\n" if 'caption' in info else ""
        return f"{anchor}{title}image::{str(folder)}/{name}{suffix}"
    return _re_image_output.sub(_image_macro, adoc)

# Cell
def fix_quotes(adoc, metadata=None):
    "Turn ``text'' quoting into plain double quotes; `metadata` is not used."
    pattern, replacement = r"``([^'`]*)''", r'"\1"'
    return re.sub(pattern, replacement, adoc)

# Cell
def fix_references(adoc, metadata=None):
    """Restore `<<name>>` cross references from their `xxref` + name + `xxeref` form.

    The match is non-greedy so that several references on the same line are each restored
    on their own. `metadata` is not used.
    """
    return re.sub(r"xxref(.*?)xxeref", r"<<\1>>", adoc)

# Cell
def format_tables(adoc, metadata=None):
    """Move HTML tables out of `----` listings into `++++` passthrough blocks.

    The text is split on `----`; each chunk loses the `<div>` wrapper before a table and its
    `</div>`, `<p>` and `</p>` tags, and is padded to start and end with a newline. A chunk
    starting with a table has both surrounding delimiters switched to `++++`; a chunk with a
    table further in gets the listing closed and a passthrough opened just before the table.
    `metadata` is not used.
    """
    chunks = adoc.split('----')
    # delims[k] is written before chunks[k]; the extra last slot absorbs the `pos + 1` writes
    delims = [''] + ['----'] * (len(chunks) - 1) + ['']
    for pos, chunk in enumerate(chunks):
        chunk = re.sub(r'<div>[\s\S]*<table', '<table', chunk)
        for tag in ('</div>', '<p>', '</p>'):
            chunk = chunk.replace(tag, '')
        if chunk:
            if not chunk.startswith('\n'): chunk = '\n' + chunk
            if not chunk.endswith('\n'): chunk = chunk + '\n'
        if chunk.startswith('\n<table'):
            delims[pos] = delims[pos + 1] = '++++'
        elif '<table' in chunk:
            start = chunk.index('<table')
            chunk = chunk[:start] + '\n----\n\n++++\n' + chunk[start:]
            delims[pos + 1] = '++++'
        chunks[pos] = chunk
    joined = ''.join(delim + chunk for delim, chunk in zip(delims, chunks))
    return joined.replace('\n\n--------', '')

# Cell
def remove_lines(text, metadata=None):
    "Collapse runs of four or more newlines before text down to three; `metadata` is not used."
    too_many_blank = r'\n\n\n\n+([^\n])'
    return re.sub(too_many_blank, r'\n\n\n\1', text)

# Cell
# Text transforms that `post_process` composes over the exported asciidoc
post_process_tfms = [
    fix_quotes, rep_spec_tok, ipython2python, remove_cells, clear_cells,
    format_latex, format_outputs, fix_references, format_tables, remove_lines,
]

# Cell
def post_process(adoc, metadata=None):
    """Clean up exported asciidoc text and return it stripped of surrounding whitespace.

    A leading newline is added when missing, `xxnewl` tokens followed by a whitespace
    character are turned into newlines, and the `post_process_tfms` are then applied through
    `compose`, each receiving `metadata` as a keyword argument.
    """
    if not adoc.startswith('\n'): adoc = '\n' + adoc
    # Raw string: `\s` is a regex class, not a valid Python string escape
    adoc = re.sub(r'xxnewl\s', '\n', adoc)
    adoc = compose(*post_process_tfms)(adoc, metadata=metadata)
    return adoc.strip()

# Cell
# Module-level exporter used by `convert_nb`; input and output prompts are left out of the export.
# NOTE(review): `ExportConfig` and `ASCIIDocExporter` are not defined in this file — presumably
# they come from `.imports`; confirm there.
c = ExportConfig()
exporter = ASCIIDocExporter(c)
exporter.exclude_input_prompt=True
exporter.exclude_output_prompt=True

# Cell
def add_metadata(nb):
    "Stripping removes metadata used in the conversion."
    meta = nb['metadata']
    if 'language_info' in meta: return nb
    # Default Python language description, added only when the notebook has none
    language_info = {
        'codemirror_mode': {'name': 'ipython', 'version': 3},
        'file_extension': '.py',
        'mimetype': 'text/x-python',
        'name': 'python',
        'nbconvert_exporter': 'python',
        'pygments_lexer': 'ipython3',
        'version': '3.7.1'}
    meta['language_info'] = language_info
    return nb

# Cell
def output_num(n):
    "Return the integer between `output_` and the next `_` at the start of `n`, or `None` if `n` has no such prefix."
    found = re.search(r'^output_(\d*)_', n)
    return None if found is None else int(found.group(1))

# Cell
import PIL

# Cell
# Factor `convert_nb` applies to the width of every exported output image
IMAGE_OUT_MULT = 0.8

# Cell
import xml.etree.ElementTree as ET

# Cell
def get_output_width(name, raw, folder):
    """Return the width of the output image `name`, or `None` when it cannot be read.

    For `.svg` names the width is taken from the `width` attribute of the root element of
    `raw`, truncated at the first `.` and with `pt` removed, and returned as a string. For
    other names the file `folder/name` is opened with PIL and its pixel width returned as an int.
    """
    if name.endswith('.svg'): return ET.fromstring(raw).attrib['width'].split('.')[0].replace('pt', '')
    try:
        # Context manager so the underlying file handle is closed once the size is read
        with PIL.Image.open(Path(folder)/name) as img: return img.size[0]
    # Best effort: an unreadable image just has no width. `Exception` rather than a bare
    # `except` so that KeyboardInterrupt/SystemExit still propagate.
    except Exception: return None

# Cell
def convert_nb(fname, dest_path='.', folder=None):
    """Convert a notebook `fname` to an asciidoc file in `dest_path`.

    Extracted outputs (images) are written to `folder`, which defaults to
    `<dest_path>/<notebook stem>_files` and is emptied first if it already exists. `folder`
    must be inside `dest_path`, since the asciidoc refers to it by its relative path.
    The width recorded for each output is the cell's `width` metadata when present, otherwise
    the measured width, scaled by `IMAGE_OUT_MULT`.
    """
    print(f"Converting {fname}")
    fname,dest_path = Path(fname),Path(dest_path)
    dest_name = fname.with_suffix('.asciidoc').name
    # Accept `folder` as a string as well as a Path
    folder = dest_path/f'{fname.stem}_files' if folder is None else Path(folder)
    #folder for images. Clear if exists
    if folder.exists(): shutil.rmtree(folder)
    os.makedirs(folder, exist_ok=True)

    nb = add_metadata(treat_notebook(read_nb(fname), folder))
    export = exporter.from_notebook_node(nb)
    metadata = {'folder': folder.relative_to(dest_path)}
    # Outputs named `output_<cell index>_...` share the metadata dict of the cell they come from
    metadata.update({n: nb["cells"][output_num(n)]['metadata'] for n in export[1]['outputs'].keys() if output_num(n) is not None})
    for n,o in export[1]['outputs'].items():
        with open(folder/n, 'wb') as f: f.write(o)
        # Outputs whose name carries no cell index have no entry yet: start from empty metadata
        meta = metadata.setdefault(n, {})
        w = meta['width'] if 'width' in meta else get_output_width(n, o, folder)
        if w is not None: meta['width'] = str(int(IMAGE_OUT_MULT * int(w)))
    with open(dest_path/dest_name,'w', encoding="utf8") as f:
        f.write(post_process(export[0], metadata))

# Cell
def _copy_images(path, dest_path):
    "Recursively copy the files and sub-folders of `path` into `dest_path`, creating it if needed."
    os.makedirs(dest_path, exist_ok=True)
    for entry in path.iterdir():
        target = dest_path/entry.name
        if entry.is_file(): shutil.copy(entry, target)
        if entry.is_dir(): _copy_images(entry, target)

# Cell
def copy_images(path, dest_path):
    "Replace `dest_path/images` by a fresh copy of `path/images`."
    target = dest_path/"images"
    if target.exists(): shutil.rmtree(target)
    _copy_images(path/"images", target)

# Cell
def _convert1(fname, dest_path='.'):
    "Convert one notebook, printing the error instead of raising so that a batch keeps going."
    try:
        convert_nb(fname, dest_path=dest_path)
    except Exception as err:
        print(f"Error in notebook {fname}")
        print(err)

# Cell
# Command-line entry point: converts every notebook of `path` whose name does not start with
# `_`, then copies the `.adoc`/`.asciidoc` files and the `images` folder to `dest_path`.
@call_parse
def fastdoc_convert_all(
    path:str='book',  # Path to notebooks
    dest_path:str='../convert_book'  # Path to generated asciidoc files
):
    path = Path(path)
    dest_path = Path(dest_path)
    dest_path.mkdir(parents=True,exist_ok=True)
    # Make sure the source has an `images` folder, since `copy_images` iterates over it
    (path/'images').mkdir(parents=True,exist_ok=True)
    notebooks = []
    for entry in path.iterdir():
        if entry.suffix == '.ipynb' and not entry.name.startswith('_'): notebooks.append(entry)
    parallel(_convert1, notebooks, dest_path=dest_path)
    for entry in path.iterdir():
        if entry.suffix in ['.adoc', '.asciidoc']: shutil.copy(entry, dest_path/entry.name)
    copy_images(path, dest_path)