Generate `contents.json` from the `d2l-zh` repo, then manually copy its entries into `_modules/*.json` and edit them there.

In [None]:
!pip install PyPDF2

In [1]:
import notedown
import os
import pathlib

# JSON template for a single lecture entry. The placeholders TITLE and URL are
# substituted per book page when `contents.json` is generated; the remaining
# fields are defaults meant to be edited by hand afterwards. The page generator
# later in this notebook reads `day_break`, `book`, `slides` (a [pdf, page count]
# pair) and `slides_video` from each entry.
entry = '''    {
        "title":"TITLE",
        "day_break":false,
        "book":"URL",
        "slides":["part-0.pdf",0],
        "slides_video":"",
        "notebook_video":"",
        "qa_video":""
    }'''


# Local checkout of the book sources. Set the D2L_BOOK_REPO environment variable
# to point at your own checkout; the default keeps the original author's path.
book_repo = pathlib.Path(os.environ.get('D2L_BOOK_REPO', '/Users/mli/repos/d2l-en'))

def get_toc(root):
    """Flatten the table of contents rooted at `root`.

    Returns a list that starts with `root` itself and continues with every
    page reachable through nested TOC blocks, visited depth first in the
    order the TOC blocks list them.
    """
    ordered = [root]
    for child in _get_subpages(root):
        ordered += get_toc(child)
    return ordered

def _get_subpages(input_fn):
    """Return the existing `.md` files named by the `toc` blocks of `input_fn`.

    The file is parsed with notedown; every code cell whose attribute classes
    include `toc` is scanned line by line. Lines starting with `:` are
    skipped, every other line is treated as a path (without the `.md`
    suffix) relative to the directory of `input_fn`. Only paths that exist
    on disk are returned, in the order they appear.
    """
    reader = notedown.MarkdownReader()
    with open(input_fn, 'r', encoding='UTF-8') as f:
        nb = reader.read(f)
    base_dir = os.path.dirname(input_fn)
    found = []
    for cell in nb.cells:
        is_toc_cell = (cell.cell_type == 'code'
                       and 'attributes' in cell.metadata
                       and 'toc' in cell.metadata.attributes['classes'])
        if not is_toc_cell:
            continue
        for toc_line in map(str.strip, cell.source.split('\n')):
            if toc_line.startswith(':'):
                continue
            candidate = os.path.join(base_dir, toc_line + '.md')
            if os.path.exists(candidate):
                found.append(candidate)
    return found

def _get_title(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
        for l in lines:
            if l.startswith('#'): return l[1:].strip()

import json  # cell-local import: needed only to JSON-encode the substituted values

# Fill the `entry` template once per book page, in TOC order, and write the
# result to contents.json as a JSON array.
entries = []
notebooks = get_toc(str(book_repo/'index.md'))
for nb in notebooks:
    # as_posix() keeps forward slashes in the URL regardless of the OS.
    p = pathlib.Path(nb).relative_to(book_repo).with_suffix('.html').as_posix()
    # NOTE(review): `p` always ends in '.html', so this check never matches and
    # index pages are kept. Left as-is because it is unclear whether skipping
    # them is actually wanted -- confirm the intent before changing it.
    if 'index.md' in p: continue
    title = _get_title(nb)
    if not title: continue
    # Replace the *quoted* placeholders with JSON-encoded strings: quotes or
    # backslashes in a title can no longer corrupt the file. URL is replaced
    # first so a title equal to or containing "URL" is never rewritten.
    entries.append(entry
                   .replace('"URL"', json.dumps(p, ensure_ascii=False))
                   .replace('"TITLE"', json.dumps(title, ensure_ascii=False)))

# Titles may be non-ASCII, so do not rely on the platform default encoding.
with open('contents.json', 'w', encoding='UTF-8') as f:
    f.write('[\n' + ',\n'.join(entries) + '\n]\n')


In [2]:
from PyPDF2 import PdfFileReader, PdfFileWriter
import subprocess

def extract_pdf(source, start_page, end_page, target):
    """Copy pages ``[start_page, end_page)`` of `source` into `target`.

    The selected pages are first written to a scratch PDF, which is then
    re-written by Ghostscript (`gs`) into `target` to reduce the file size.

    Args:
        source: path of the input PDF.
        start_page: zero-based index of the first page to copy.
        end_page: zero-based index one past the last page to copy.
        target: path of the PDF to create; its parent directory is created
            when missing.

    Returns:
        None. Nothing is done when `target` already exists and is newer than
        `source`. Failures of `gs` are reported on stdout instead of raising.

    Raises:
        AssertionError: if the page range is empty or exceeds the document.
    """
    import tempfile  # function-scope import: not provided by the notebook's import cells

    source = pathlib.Path(source)
    target = pathlib.Path(target)
    # Up-to-date target: skip the (slow) extraction and compression.
    if target.exists() and target.stat().st_mtime > source.stat().st_mtime:
        return
    pdf = PdfFileReader(str(source))
    assert end_page > start_page, f'empty page range {start_page}..{end_page}'
    assert end_page <= pdf.getNumPages(), (
        f'{source} has {pdf.getNumPages()} pages, requested up to {end_page}')
    pdf_writer = PdfFileWriter()
    for page in range(start_page, end_page):
        pdf_writer.addPage(pdf.getPage(page))
    target.parent.mkdir(parents=True, exist_ok=True)
    # A private scratch directory replaces the fixed /tmp/tmp.pdf path, so
    # concurrent runs cannot clobber each other's intermediate file.
    with tempfile.TemporaryDirectory() as tmp_dir:
        raw_pdf = pathlib.Path(tmp_dir) / 'tmp.pdf'
        with raw_pdf.open('wb') as out:
            pdf_writer.write(out)
        # compress pdf size
        # refer to https://askubuntu.com/questions/113544/how-can-i-reduce-the-file-size-of-a-scanned-pdf-file
        # Argument list without a shell: paths containing spaces stay intact.
        cmd = ['gs', '-sDEVICE=pdfwrite', '-dCompatibilityLevel=1.4',
               '-dPDFSETTINGS=/prepress', '-dNOPAUSE', '-dBATCH',
               f'-sOutputFile={target}', str(raw_pdf)]
        try:
            process = subprocess.run(cmd, stdout=subprocess.PIPE,
                                     stderr=subprocess.STDOUT)
        except OSError as err:
            # e.g. Ghostscript is not installed; stay best-effort like before.
            print(f'Could not run gs for {target}: {err}')
            return
    if process.returncode != 0:
        # Report the failure (including stderr) and do not claim success.
        print(f'gs exited with code {process.returncode} while writing {target}:')
        print(process.stdout.decode(errors='replace'))
        return
    print(f'Written {end_page-start_page} pages to {target}')

Generate all `_modules/part_*.md` pages from the corresponding `_modules/part_*.json` files.

In [4]:
import datetime
import pathlib
import json
import os


# --- Configuration ----------------------------------------------------------
pdf_dir = '/Users/mli/Google Drive/d2l-zh-v2-slides/'   # source slide decks
slides_dir = 'assets/pdfs/'                              # extracted per-lecture PDFs
notebooks_dir = 'assets/notebooks/'                      # rendered notebook slides
book_url = 'https://zh-v2.d2l.ai/'
notebook_url = 'https://nbviewer.jupyter.org/format/slides/github/d2l-ai/d2l-zh-pytorch-slides/blob/main/'
notbook_repo = '../d2l-zh-pytorch-slides/'
video_url = 'https://www.bilibili.com/video/'
# Date cursor; it is advanced *before* each date header is written, so the
# first `day_break` entry lands on the day after this one.
cur_day = datetime.datetime(2021, 3, 19)

titles = ['深度学习基础', '卷积神经网络', '计算机视觉', '循环神经网络', '注意力机制',]
holidays = [datetime.datetime(2021, m, d) for m, d in ((4,3),(4,4),(4,25),(5,1),(5,2),(5,8),(5,9),(6,12),(6,13),(7,31),(8,1))]
# Maps a slide deck file name to [0, n1, n2, ...]: the page counts already
# consumed from it, so the running sum is the next start page.
slide_pages = {}

# --- Write one `_modules/part_<i>.md` per title ------------------------------
for i, title in enumerate(titles):
    p = pathlib.Path('_modules')
    # Explicit UTF-8: the page contains Chinese text and must not depend on
    # the platform's default encoding.
    with (p/f'part_{i}.md').open('w', encoding='utf-8') as f:
        f.write(f'---\ntitle: {title}\n---\n')
        contents_file = p/f'part_{i}.json'
        if not contents_file.exists():
            contents = []
        else:
            # Context manager so the JSON file handle is closed promptly.
            with contents_file.open('r', encoding='utf-8') as contents_f:
                contents = json.load(contents_f)
        for entry in contents:
            if entry['day_break']:
                # Advance to the next teaching day, emitting a date header for
                # every day visited; holidays get a label and are skipped over.
                while True:
                    if cur_day.weekday() == 6: # sun
                        # Sunday -> jump six days ahead (the following Saturday).
                        cur_day += datetime.timedelta(days=6)
                    else:
                        cur_day += datetime.timedelta(days=1)
                    f.write(f'\n{cur_day.month}月{cur_day.day}日\n\n')
                    if cur_day not in holidays:
                        break
                    f.write(': **休课**{: .label .label-green }\n')
            # title
            f.write(f': {entry["title"]}\n')
            # html page
            if 'book' in entry and entry['book']:
                f.write(f'  : [<span class="iconfont icon-xiaoshuo-copy"></span>]({book_url+entry["book"]})\n')
            else:
                f.write('  : &nbsp; \n')
            # pdf: `slides` is a (deck file name, number of pages) pair; a page
            # count of 0 means this entry has no slides.
            pdf, page = entry['slides']
            if page:
                if pdf not in slide_pages:
                    slide_pages[pdf] = [0,]
                save_pdf = f'{slides_dir}part-{i}_{len(slide_pages[pdf])}.pdf'
                cur_page = sum(slide_pages[pdf])
                extract_pdf(pdf_dir+pdf, cur_page, cur_page+page, save_pdf)
                slide_pages[pdf].append(page)
                f.write(f'  : [<span class="iconfont icon-KeynoteOutline"></span>]({save_pdf})\n')
            else:
                f.write('  : &nbsp; \n')

            # notebook: rendered unless the entry explicitly sets a falsy
            # `notebook` value; requires the .ipynb to exist in the slides repo.
            write_notebook = False
            if 'book' in entry and entry['book'] and ('notebook' not in entry or entry['notebook']):
                notebook_path = entry["book"].replace('.html', '.ipynb')
                notebook_file = notbook_repo + notebook_path
                notebook_output = pathlib.Path(notebooks_dir + notebook_path).with_suffix('.slides.html')
                if os.path.exists(notebook_file):
                    # Convert only once; an existing output is reused.
                    if not notebook_output.exists():
                        os.system(f'jupyter nbconvert {notebook_file} --to slides --output-dir {str(notebook_output.parent)}')
                    if notebook_output.exists():
                        write_notebook = True
                        f.write(f'  : [<span class="iconfont icon-jupyter"></span>]({str(notebook_output)})\n')
            if not write_notebook:
                f.write('  :  &nbsp; \n')
            # Disabled: links to the separate code / Q&A videos.
            #if entry['notebook_video']:
            #    f.write(f' [<span style="font-size:130%"  class="iconfont icon-bilibili-fill"></span> 代码]({entry["notebook_video"]}) &nbsp;')
            #if 'qa_video' in entry and entry['qa_video']:
            #   f.write(f' [<span style="font-size:130%"  class="iconfont icon-bilibili-fill"></span> 问答]({entry["qa_video"]}) &nbsp;')
            if entry['slides_video']:
                f.write(f'  : [<span style="font-size:130%"  class="iconfont icon-bilibili-fill"></span>]({entry["slides_video"]})\n')
            else:
                f.write('  :  &nbsp; \n')
            f.write('\n')


# Pure-Python equivalent of `!touch index.md`: create the file or bump its
# modification time (presumably to make the site generator rebuild -- confirm).
pathlib.Path('index.md').touch()


In [2]:
%%writefile video
6	488
14	813
2	966
5	281
17	875	1
5	446	1
14	543
19	771	1
10	758
15	699
7	402	1
11	897
6	503
8	902	1
6	402	1
11	626
7	384
6	564	1
16	1123	1
4	249	1
11	832
13	1359
7	505	1
7	1116
10	1006
8	407	1
6	781
8	754	1
6	719
6	736	1
10	902
12	1443
13	870	1
2	519
6	807	1
10	1039	1
4	328	1
5	363	1
8	814	1
5	998
9	661
7	899
8	607	1
8	847
3	304	1
10	1172
3	395	1
7	620
4	417
6	510
6	1306	1
13	2115
4	638	1
7	543
5	442	1
6	654
4	600	1
15	1618
4	429	1
6	1078
7	1031	1
9	849
6	564	1
2	396
14	2361
16	1758
5	1300
5	721	1
16	1304
9	1238
14	768	1
9	832
8	534	1
13	1676	1
10	1148	1
6	695
4	227	1
6	585	1
7	1086
18	1400	1
16	2524
5	596	1
14	3165	1
5	431
10	1676	1
3	560
5	760	1
2	262
11	1060	1
3	297
8	1157	1

Overwriting video


In [10]:
# Parse the tab-separated `video` file into rows of integers. Judging by how
# `analy` labels them, column 0 is a page count and column 1 a duration in
# seconds; some rows carry a third flag column.
with open('video') as f:
    lines = f.readlines()
    # Skip blank lines (e.g. a trailing empty line), which would otherwise
    # make int('') raise ValueError.
    tokens = [[int(i) for i in l.strip().split('\t')] for l in lines if l.strip()]
# Rows without the flag column.
nbs = [x for x in tokens if len(x)==2]
# Rows with the flag column; the flag itself is dropped.
code = [x[:2] for x in tokens if len(x)==3]

import numpy as np

def analy(x):
    """Print summary statistics for rows of ``(pages, seconds)``.

    Args:
        x: sequence of two-element rows; column 0 is averaged as a page
            count and column 1 is treated as a duration in seconds.

    Prints the number of rows, the mean page count, the mean duration in
    minutes and the mean time spent per page in minutes. For empty input
    only the row count is printed.
    """
    x = np.array(x)
    print('total:', len(x))
    if x.size == 0:
        # No rows: the column statistics below are undefined.
        return
    print('avg page:', x[:,0].mean())
    print('avg length:', x[:,1].mean()/60)
    # seconds / pages / 60 is minutes per page (the old label said the
    # inverse, "page per min").
    print('min per page:', (x[:,1]/x[:,0]).mean()/60)

# Report the unflagged rows first, then the flagged ones.
for rows in (nbs, code):
    analy(rows)

total: 48
avg page: 8.395833333333334
avg length: 15.159722222222223
page per min: 2.045208144478978
total: 43
avg page: 8.0
avg length: 13.234883720930233
page per min: 1.7313444384536163
