Generate `contents.json` from the `d2l-zh` repo, then manually copy and adapt it into `_modules/*.json`

In [None]:
!pip install PyPDF2

In [2]:
import notedown
import os
import pathlib

# Template for one item of contents.json. The TITLE and URL placeholders are
# substituted for every book page when the entries are generated further down
# in this cell; the remaining fields are left at their defaults.
entry = '''    {
        "title":"TITLE",
        "day_break":false,
        "book":"URL",
        "slides":["part-0.pdf",0],
        "slides_video":"",
        "notebook_video":"",
        "qa_video":""
    }'''


# Root of the local book checkout; its index.md is the starting point of the TOC walk.
# NOTE(review): this is an absolute, machine-specific path, and the notebook's
# intro mentions d2l-zh while the path points at d2l-en — confirm which is intended.
book_repo = pathlib.Path('/Users/mli/repos/d2l-en')

def get_toc(root):
    """Walk the TOC depth-first starting at ``root``.

    Returns a flat list that begins with ``root`` itself, followed by every
    page reachable through its TOC entries, in the order they are declared.
    """
    ordered = [root]
    for child in _get_subpages(root):
        # Each child contributes itself plus its own descendants.
        ordered += get_toc(child)
    return ordered

def _get_subpages(input_fn):
    """Return the markdown files listed in the ``toc`` cells of ``input_fn``.

    The file is parsed with notedown; every code cell whose attribute classes
    include ``toc`` is scanned line by line. Lines starting with ``:`` are
    ignored, every other line is treated as a page name relative to the
    directory of ``input_fn`` (with ``.md`` appended) and kept only when that
    file exists on disk.
    """
    with open(input_fn, 'r', encoding='UTF-8') as src:
        notebook = notedown.MarkdownReader().read(src)

    base_dir = os.path.dirname(input_fn)
    found = []
    for cell in notebook.cells:
        if cell.cell_type != 'code' or 'attributes' not in cell.metadata:
            continue
        if 'toc' not in cell.metadata.attributes['classes']:
            continue
        for raw_line in cell.source.split('\n'):
            name = raw_line.strip()
            if name.startswith(':'):
                continue
            candidate = os.path.join(base_dir, name + '.md')
            if os.path.exists(candidate):
                found.append(candidate)
    return found

def _get_title(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
        for l in lines:
            if l.startswith('#'): return l[1:].strip()

# Build one JSON entry per book page, in TOC order, and dump them to contents.json.
entries = []
notebooks = get_toc(str(book_repo/'index.md'))
for nb in notebooks:
    nb_path = pathlib.Path(nb)
    # Skip index pages. This has to look at the markdown file name: once the
    # suffix has been switched to .html the text 'index.md' can never match.
    if nb_path.name == 'index.md': continue
    title = _get_title(nb)
    if not title: continue
    # The relative path ends up in a URL, so always use forward slashes.
    p = nb_path.relative_to(book_repo).with_suffix('.html').as_posix()
    entries.append(entry.replace('TITLE', title).replace('URL', p))

# Titles may contain non-ASCII text, so do not rely on the platform default encoding.
with open('contents.json', 'w', encoding='UTF-8') as f:
    f.write('[\n' + ',\n'.join(entries) + '\n]\n')


Generate all `_modules/part*.md`

In [42]:
# Scratch check of which weekday 2021-03-20 falls on; weekday() counts Monday
# as 0, so the displayed 5 means Saturday.
# NOTE(review): `datetime` is only imported by a later cell in this notebook,
# so this cell needs that import to have run first — confirm or move the import.
cur_day = datetime.datetime(2021, 3, 20)
cur_day.weekday()
#cur_day + datetime.timedelta(days=1)

5

In [35]:
import datetime
import pathlib 
import json 

# Directory that holds the full slide decks; prefixed to each entry's deck file name.
pdf_dir = '/Users/mli/Google Drive/d2l-zh-v2-slides/'
# Path prefix of the per-lecture PDFs that are extracted and linked from the pages.
slides_dir = 'assets/pdfs/'
# Prefix for the "book" link of an entry.
book_url = 'https://zh-v2.d2l.ai/'
# Prefix for the rendered-notebook link of an entry.
notebook_url = 'https://nbviewer.jupyter.org/format/slides/github/d2l-ai/d2l-zh-pytorch-slides/blob/main/'
# Local checkout used to test whether a notebook file exists before linking it
# (the name is misspelled, but the loop below refers to it by this name).
notbook_repo = '../d2l-zh-pytorch-slides/'
# Not referenced by the code visible in this notebook.
video_url = 'https://www.bilibili.com/video/'
# Running lecture date; advanced by the loop below each time an entry has day_break set.
cur_day = datetime.datetime(2021, 3, 19)

# One `_modules/part_{i}.md` file is generated per title, in this order.
titles = ['深度学习基础', '卷积神经网络', '计算机视觉', '优化算法','循环神经网络', '注意力机制', '自然语言处理']
# Dates that are written out with a "no class" label instead of lecture entries.
holidays = [datetime.datetime(2021, m, d) for m, d in ((4,3),(4,4),(5,1),(5,2),(6,12),(6,13))]
# Maps a deck file name to the list of page counts already extracted from it
# (seeded with 0), so the next extraction starts where the previous one ended.
slide_pages = {}

# Generate `_modules/part_{i}.md` for every module title from the matching
# `_modules/part_{i}.json` (a list of entries shaped like the contents.json items).
# NOTE: this loop calls extract_pdf, which is defined in a separate cell of this
# notebook; that cell has to be executed before this one.
for i, title in enumerate(titles):
    p = pathlib.Path('_modules')
    # Load the entries first so a malformed JSON file does not leave a truncated
    # markdown file behind, and so the JSON handle is closed deterministically.
    json_path = p/f'part_{i}.json'
    if json_path.exists():
        with json_path.open('r', encoding='utf-8') as json_file:
            contents = json.load(json_file)
    else:
        contents = []
    # The pages contain Chinese text; be explicit about the encoding.
    with (p/f'part_{i}.md').open('w', encoding='utf-8') as f:
        f.write(f'---\ntitle: {title}\n---\n')
        for entry in contents:
            if entry['day_break']:
                # Advance to the next lecture day. From a Sunday jump six days
                # ahead (to the following Saturday), otherwise move one day on.
                # Holiday dates are still written out, labelled as "no class",
                # and the search continues with the next candidate day.
                while True:
                    if cur_day.weekday() == 6: # sun
                        cur_day += datetime.timedelta(days=6)
                    else:
                        cur_day += datetime.timedelta(days=1)
                    f.write(f'\n{cur_day.month}月{cur_day.day}日\n\n')
                    if cur_day not in holidays:
                        break
                    f.write(': **长假无课**{: .label .label-green }\n')
            # title
            f.write(f': {entry["title"]}\n  :')
            # html page
            if entry.get('book'):
                f.write(f' [<span class="iconfont icon-xiaoshuo-copy"></span> 书]({book_url+entry["book"]}) &nbsp;')

            # pdf: `page` is the number of pages this entry takes from the deck;
            # 0 means the entry has no slides.
            pdf, page = entry['slides']
            if page:
                if pdf not in slide_pages:
                    slide_pages[pdf] = [0,]
                save_pdf = f'{slides_dir}part-{i}_{len(slide_pages[pdf])}.pdf'
                # Pages consumed so far from this deck give the start offset.
                cur_page = sum(slide_pages[pdf])
                extract_pdf(pdf_dir+pdf, cur_page, cur_page+page, save_pdf)
                slide_pages[pdf].append(page)
                f.write(f' [<span class="iconfont icon-KeynoteOutline"></span> 幻灯片]({save_pdf}) &nbsp;')

            if entry.get('slides_video'):
                f.write(f' [<span style="font-size:130%"  class="iconfont icon-bilibili-fill"></span> 幻灯片 &nbsp;]({entry["slides_video"]})')

            # Link the notebook unless the entry explicitly sets a falsy
            # "notebook" value, and only when the notebook file exists locally.
            if entry.get('book') and entry.get('notebook', True):
                notebook_path = entry["book"].replace('.html', '.ipynb')
                notebook_file = notbook_repo + notebook_path
                # pathlib is imported in this cell; `os` is only imported elsewhere.
                if pathlib.Path(notebook_file).exists():
                    url = notebook_url + notebook_path
                    f.write(f'[<span class="iconfont icon-jupyter"></span> 代码]({url}) &nbsp; ')
                    if entry.get('notebook_video'):
                        f.write(f' [<span style="font-size:130%"  class="iconfont icon-bilibili-fill"></span> 代码]({entry["notebook_video"]}) &nbsp;')
            if entry.get('qa_video'):
                f.write(f' [<span style="font-size:130%"  class="iconfont icon-bilibili-fill"></span> 问答]({entry["qa_video"]}) &nbsp;')
            f.write('\n')


# NOTE(review): presumably refreshes index.md's modification time so the site
# generator picks up the regenerated modules — confirm.
!touch index.md

In [None]:
from autogluon.tabular import TabularPredictor
# Fits a tabular predictor on train.csv (target column 'label') and predicts on test.csv.
# NOTE(review): this snippet looks unrelated to the site-generation steps in the
# rest of the notebook — confirm it belongs here.
predictor = TabularPredictor(label='label').fit(train_data='train.csv')
predictions = predictor.predict('test.csv')

In [3]:
from PyPDF2 import PdfFileReader, PdfFileWriter
import subprocess

def extract_pdf(source, start_page, end_page, target):
    """Copy pages ``[start_page, end_page)`` of ``source`` into a compressed PDF at ``target``.

    Args:
        source: Path of the PDF to read pages from.
        start_page: Zero-based index of the first page to copy.
        end_page: Zero-based index one past the last page to copy.
        target: Path of the PDF to create; missing parent directories are created.

    Returns:
        None. Nothing is done when ``target`` already exists and is newer than
        ``source``.

    Raises:
        AssertionError: if the page range is empty or runs past the end of ``source``.

    The selected pages are written to a scratch file and re-encoded with
    Ghostscript (``gs``) to reduce the file size. ``target`` is only replaced
    when Ghostscript succeeds, so a failed run never leaves a partial file that
    the modification-time check would later mistake for a valid result.
    """
    # Local imports: only this function needs them.
    import shutil
    import tempfile

    source = pathlib.Path(source)
    target = pathlib.Path(target)
    # Cheap cache: an up-to-date target does not need to be regenerated.
    if target.exists() and target.stat().st_mtime > source.stat().st_mtime:
        return
    pdf = PdfFileReader(str(source))
    assert end_page > start_page, f'empty page range {start_page}-{end_page}'
    assert end_page <= pdf.getNumPages(), (
        f'{source} has {pdf.getNumPages()} pages, requested up to {end_page}')
    pdf_writer = PdfFileWriter()
    for page in range(start_page, end_page):
        pdf_writer.addPage(pdf.getPage(page))

    # A private scratch directory avoids clashes on a fixed /tmp/tmp.pdf.
    with tempfile.TemporaryDirectory() as tmp_dir:
        raw_pdf = pathlib.Path(tmp_dir) / 'tmp.pdf'
        compressed_pdf = pathlib.Path(tmp_dir) / 'compressed.pdf'
        with raw_pdf.open('wb') as out:
            pdf_writer.write(out)
        # compress pdf size
        # refer to https://askubuntu.com/questions/113544/how-can-i-reduce-the-file-size-of-a-scanned-pdf-file
        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact.
        cmd = ['gs', '-sDEVICE=pdfwrite', '-dCompatibilityLevel=1.4',
               '-dPDFSETTINGS=/prepress', '-dNOPAUSE', '-dBATCH',
               f'-sOutputFile={compressed_pdf}', str(raw_pdf)]
        try:
            process = subprocess.run(cmd, stdout=subprocess.PIPE,
                                     stderr=subprocess.STDOUT)
        except OSError as err:
            # Typically: Ghostscript is not installed or not on PATH.
            print(f'Failed to run gs: {err}')
            return
        if process.returncode != 0:
            # Report the combined stdout/stderr and leave any existing target untouched.
            print(process.stdout.decode(errors='replace').splitlines())
            return
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(compressed_pdf), str(target))
    print(f'Written {end_page-start_page} pages to {target}')