In [4]:
import notedown
import os
import pathlib

# Template for one schedule entry. The placeholders TITLE and URL are
# substituted for every book page when contents.txt is generated; the other
# fields are left at their defaults to be filled in by hand.
entry = '''    {
        'title':'TITLE',
        'day':0,
        'book':'URL',
        'slides':('part-0.pdf',0),
        'slides_video':'',
        'notebook':True,
        'notebook_video':'',
        'tag':'',
    },
'''

# Root of the local checkout of the book sources. The historical default is
# kept, but it can be overridden through the D2L_ZH_REPO environment variable
# so the notebook is not tied to one machine.
book_repo = pathlib.Path(os.environ.get('D2L_ZH_REPO', '/Users/mli/repos/d2l-zh'))

def get_toc(root):
    """Flatten the TOC tree rooted at ``root`` into a list of file names.

    ``root`` comes first, followed by each of its sub-pages (in the order
    ``_get_subpages`` reports them), every sub-page being expanded
    recursively in the same way.
    """
    ordered = [root]
    for child in _get_subpages(root):
        ordered += get_toc(child)
    return ordered

def _get_subpages(input_fn):
    """Return the sub-page files listed in the ``toc`` cells of ``input_fn``.

    The markdown file is parsed with notedown. Every code cell whose
    attribute classes include ``toc`` is scanned line by line; lines that
    start with ``:`` are ignored, and every other line is taken as a page
    name relative to the directory of ``input_fn``. Only pages whose
    ``.md`` file exists on disk are returned.
    """
    reader = notedown.MarkdownReader()
    with open(input_fn, 'r', encoding='UTF-8') as f:
        nb = reader.read(f)

    base_dir = os.path.dirname(input_fn)
    found = []
    for cell in nb.cells:
        if cell.cell_type != 'code':
            continue
        if 'attributes' not in cell.metadata:
            continue
        if 'toc' not in cell.metadata.attributes['classes']:
            continue
        for raw_line in cell.source.split('\n'):
            name = raw_line.strip()
            if name.startswith(':'):
                continue
            candidate = os.path.join(base_dir, name + '.md')
            if os.path.exists(candidate):
                found.append(candidate)
    return found

def _get_title(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
        for l in lines:
            if l.startswith('#'): return l[1:].strip()

# Emit one template entry per book page, in TOC order, into contents.txt.
# The file is written as UTF-8 because the titles are non-ASCII.
with open('contents.txt', 'w', encoding='UTF-8') as f:
    notebooks = get_toc(str(book_repo/'index.md'))
    for nb in notebooks:
        # Page location relative to the book root, as a forward-slash path
        # with the .html suffix, so it can be appended to the book URL.
        p = pathlib.Path(nb).relative_to(book_repo).with_suffix('.html').as_posix()
        # NOTE(review): `p` already ends in '.html' at this point, so this
        # test only matches if some path component itself contains
        # 'index.md'; index pages are therefore NOT skipped. Left as-is —
        # confirm whether index pages were meant to be filtered out.
        if 'index.md' in p: continue
        title = _get_title(nb)
        # Pages without a heading produce no entry.
        if not title: continue
        f.write(entry.replace('TITLE', title).replace('URL', p))


In [5]:
# Schedule entries for the first part of the course.
#   'day'      - class-day index; the generator cell turns it into a date and
#                skips entries whose index is negative.
#   'book'     - page path appended to the book URL (optional).
#   'slides'   - (pdf file, number of pages); pages are taken consecutively
#                from the pdf, and a count of 0 means no slide link.
#   'notebook' - truthy to emit a notebook link derived from 'book'.
part_0 = [
    {
        'title':'课程安排',
        'day':0,
        'slides':('part-01.pdf',7),
        'slides_video':'',
        'notebook':'',
        'notebook_video':'',
        'tag':'',
    },
    {
        'title':'深度学习介绍',
        'day':0,
        'book':'chapter_introduction/index.html',
        'slides':('part-01.pdf',14),
        'slides_video':'',
        'notebook':'',
        'notebook_video':'',
        'tag':'',
    },
    {
        'title':'安装',
        'day':0,
        'book':'chapter_installation/index.html',
        'slides':('part-01.pdf',3),
        'slides_video':'',
        'notebook':'',
        'notebook_video':'',
        'tag':'',
    },
    {
        'title':'数据操作',
        'day':0,
        'book':'chapter_preliminaries/ndarray.html',
        'slides':('part-01.pdf',5),
        'slides_video':'',
        'notebook':True,
        'notebook_video':'',
        'tag':'',
    },
    {
        'title':'数据预处理',
        'day':0,
        'book':'chapter_preliminaries/pandas.html',
        'slides':('part-01.pdf',0),
        'slides_video':'',
        'notebook':True,
        'notebook_video':'',
        'tag':'',
    },

    {
        'title':'线性代数',
        'day':1,
        'book':'chapter_preliminaries/linear-algebra.html',
        'slides':('part-01.pdf',14),
        'slides_video':'',
        'notebook':True,
        'notebook_video':'',
        'tag':'',
    },
    {
        'title':'微分和矩阵计算',
        'day':1,
        'book':'chapter_preliminaries/calculus.html',
        'slides':('part-01.pdf',10),
        'slides_video':'',
        'notebook':True,
        'notebook_video':'',
        'tag':'',
    },
    {
        'title':'自动求导',
        'day':1,
        'book':'chapter_preliminaries/autograd.html',
        'slides':('part-01.pdf',18),
        'slides_video':'',
        'notebook':True,
        'notebook_video':'',
        'tag':'',
    },
    {
        'title':'概率',
        'day':1,
        'book':'chapter_preliminaries/probability.html',
        'slides':('part-01.pdf',0),
        'slides_video':'',
        'notebook':True,
        'notebook_video':'',
        'tag':'',
    },
    {
        'title':'查阅文档',
        'day':1,
        'book':'chapter_preliminaries/lookup-api.html',
        'slides':('part-0.pdf',0),
        'slides_video':'',
        'notebook':True,
        'notebook_video':'',
        'tag':'',
    },
    {
        'title':'线性神经网络',
        'day':2,
        'book':'chapter_linear-networks/linear-regression.html',
        'slides':('part-01.pdf',11),
        'slides_video':'',
        'notebook':True,
        'notebook_video':'',
        'tag':'',
    },
    {
        'title':'线性回归的从零开始实现',
        'day':2,
        'book':'chapter_linear-networks/linear-regression-scratch.html',
        'slides':('part-01.pdf',0),
        'slides_video':'',
        'notebook':'',
        'notebook_video':'',
        'tag':'',
    },
    {
        'title':'线性回归的简洁实现',
        'day':2,
        # The '.html' suffix was missing here, unlike every other entry,
        # which produced a broken book link.
        'book':'chapter_linear-networks/linear-regression-concise.html',
        'slides':('part-01.pdf',0),
        'slides_video':'',
        'notebook':'',
        'notebook_video':'',
        'tag':'',
    },
]

# Later parts have no entries yet; the generator cell skips empty parts.
part_1, part_2 = [], []

# Each part is paired with the title written into its module file.
contents = [
    (part_0, '深度学习基础'),
    (part_1, '卷积神经网络'),
    (part_2, '循环神经网络'),
]


In [23]:
import datetime

# Locations used by the schedule generator below.
pdf_dir = '/Users/mli/Google Drive/d2l-zh-v2-slides/'
slides_dir = 'assets/pdfs/'
book_url = 'http://preview.d2l.ai/d2l-zh/v2/'
notebook_url = 'https://nbviewer.jupyter.org/format/slides/github/d2l-ai/d2l-pytorch-slides/blob/main/'

# NOTE(review): this cell needs `starting_day` (a date; not defined in any
# visible cell) and `extract_pdf` (defined in a later cell), so it does not
# survive Restart & Run All in the current cell order.

# Per source pdf: the page counts already consumed, with a leading 0 so that
# sum() gives the next start page and len() gives the next output index.
slide_pages = {}
for i, (part, title) in enumerate(contents):
    if not part: continue
    # Written as UTF-8 explicitly: titles and dates contain non-ASCII text.
    with open(f'_modules/part_{i}.md', 'w', encoding='UTF-8') as f:
        f.write(f'---\ntitle: {title}\n---\n')
        cur_day = None
        for entry in part:
            d = entry['day']
            if d < 0: continue
            # Day indices 2k and 2k+1 map to starting_day + 7k and + 7k + 1
            # days, i.e. two consecutive calendar days per week.
            day = starting_day + datetime.timedelta(days=((d//2)*7+d%2))
            if cur_day != day:
                # Start a new date heading whenever the date changes.
                f.write(f'\n{day.month}月{day.day}日\n\n')
                cur_day = day
            if entry.get('book'):
                f.write(f': [{entry["title"]}]({book_url+entry["book"]})\n')
            else:
                f.write(f': {entry["title"]}\n')

            f.write('  :')
            pdf, page = entry['slides']
            # A page count of 0 means the entry has no slides of its own.
            if page:
                if pdf not in slide_pages:
                    slide_pages[pdf] = [0,]
                save_pdf = f'{slides_dir}part-{i}_{len(slide_pages[pdf])}.pdf'
                cur_page = sum(slide_pages[pdf])
                extract_pdf(pdf_dir+pdf, cur_page, cur_page+page, save_pdf)
                slide_pages[pdf].append(page)
                f.write(f' [<span class="iconfont icon-KeynoteOutline"></span>]({save_pdf})')
            if entry['notebook']:
                # The notebook link mirrors the book page path with an
                # .ipynb suffix.
                url = notebook_url + entry["book"].replace('.html', '.ipynb')
                f.write(f' [<span class="iconfont icon-jupyter"></span>]({url})')

            f.write('\n')

# Portable replacement for the `!touch index.md` shell escape: creates the
# file if missing and refreshes its modification time otherwise.
pathlib.Path('index.md').touch()

Written to assets/pdfs/part-0_1.pdf
Written to assets/pdfs/part-0_2.pdf
Written to assets/pdfs/part-0_3.pdf
Written to assets/pdfs/part-0_4.pdf
Written to assets/pdfs/part-0_5.pdf
Written to assets/pdfs/part-0_6.pdf
Written to assets/pdfs/part-0_7.pdf
Written to assets/pdfs/part-0_8.pdf


In [15]:
from PyPDF2 import PdfFileReader, PdfFileWriter

def extract_pdf(source, start_page, end_page, target):
    """Copy pages ``[start_page, end_page)`` of ``source`` into ``target``.

    Nothing is done when ``target`` already exists and its modification
    time is newer than that of ``source``. Note that this freshness check
    looks at modification times only: asking for a different page range
    does not by itself cause an existing target to be regenerated.

    Args:
        source: path of the PDF to read.
        start_page: zero-based index of the first page to copy.
        end_page: zero-based index one past the last page to copy.
        target: path of the PDF to write; missing parent directories are
            created.

    Raises:
        AssertionError: if the page range is empty, starts below zero, or
            extends past the last page of ``source``.
    """
    source = pathlib.Path(source)
    target = pathlib.Path(target)
    if target.exists() and target.stat().st_mtime > source.stat().st_mtime:
        return
    pdf = PdfFileReader(str(source))
    assert 0 <= start_page < end_page, (
        f'invalid page range [{start_page}, {end_page})')
    num_pages = pdf.getNumPages()
    assert end_page <= num_pages, (
        f'{source} has {num_pages} pages, but page {end_page} was requested')
    pdf_writer = PdfFileWriter()
    for page in range(start_page, end_page):
        pdf_writer.addPage(pdf.getPage(page))
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open('wb') as out:
        pdf_writer.write(out)
    # Report only after the file has actually been written.
    print(f'Written to {target}')
