# Notebook to get textbook chapters and convert to markdown

In [2]:
import tempfile, zipfile, urllib.request, re, nbformat, yaml
from pathlib import Path

## Set paths for the source files and the output directory

- Set source directory for notebooks

In [3]:
# Data 88E textbook repo
REPO_URL  = "https://github.com/data-88e/textbook"
BRANCH    = "master"
SUBDIR    = "content"   # the subtree we want



- Set output directory for markdown files

In [5]:
# Where to put downloaded raw files
RAW_DIR   = Path("~/Documents/Data88E-ForTraining/F24Textbook_RAW").expanduser()
RAW_DIR.mkdir(parents=True, exist_ok=True)

print("Destination:", RAW_DIR)

Destination: /Users/ericvandusen/Documents/Data88E-ForTraining/F24Textbook_RAW


In [6]:
def download_repo_zip(repo_url: str, branch: str, tmp_dir: Path) -> Path:
    zip_url = repo_url.rstrip("/") + f"/archive/refs/heads/{branch}.zip"
    zip_path = tmp_dir / "repo.zip"
    print("Downloading:", zip_url)
    with urllib.request.urlopen(zip_url) as r:
        zip_path.write_bytes(r.read())
    return zip_path

def extract_subdir(zip_path: Path, subdir: str, dest_dir: Path):
    with zipfile.ZipFile(zip_path) as zf:
        # GitHub zips have a top-level folder like "textbook-master/"
        top = zf.namelist()[0].split("/")[0]
        prefix = f"{top}/{subdir.strip('/')}/"
        members = [n for n in zf.namelist() if n.startswith(prefix)]
        if not members:
            raise FileNotFoundError(f"Subdir '{subdir}' not found in archive.")
        zf.extractall(dest_dir, members)
        # move extracted subtree up one level (drop the top/prefix folder)
        (dest_dir / top / subdir).rename(dest_dir / subdir)
        shutil.rmtree(dest_dir / top)
    print(f"Extracted '{subdir}/' → {dest_dir / subdir}")

In [7]:
with tempfile.TemporaryDirectory() as td:
    zip_path = download_repo_zip(REPO_URL, BRANCH, Path(td))
    extract_subdir(zip_path, SUBDIR, RAW_DIR)

print("Done. Raw files at:", RAW_DIR / SUBDIR)

Downloading: https://github.com/data-88e/textbook/archive/refs/heads/master.zip


OSError: [Errno 66] Directory not empty: '/Users/ericvandusen/Documents/Data88E-ForTraining/F24Textbook_RAW/textbook-master/content' -> '/Users/ericvandusen/Documents/Data88E-ForTraining/F24Textbook_RAW/content'

*one note here - there are a lot of extra files, such as all of the csv data files - we have downloaded everything, but we will only convert the .ipynb files to markdown*

## Reformat the files into markdown files and add a summary at the top of the file

- again specify the source and output directories

In [8]:
SRC_DIR  = Path("~/Documents/Data88E-ForTraining/F24Textbook_RAW/content").expanduser()
OUT_DIR  = Path("~/Documents/Data88E-ForTraining/F24Textbook_MD").expanduser()
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [9]:
def ensure_newline(s: str) -> str:
    return s if s.endswith("\n") else s + "\n"

def ipynb_to_markdown(nb_path: Path) -> str:
    nb = nbformat.read(nb_path.open(), as_version=4)
    parts = []
    for cell in nb.cells:
        if cell.cell_type == "markdown":
            parts.append(cell.source.strip())
        elif cell.cell_type == "code":
            parts.append("```python\n" + cell.source.rstrip() + "\n```")
    return ensure_newline("\n\n".join(parts))

def parse_chapter(folder_name: str) -> tuple[int | None, str]:
    # "00-intro" -> (0, "intro")
    m = re.match(r"(\d+)[-_]?(.*)", folder_name)
    if not m:
        return None, folder_name
    chap = int(m.group(1))
    title = m.group(2) or folder_name
    return chap, title

def write_with_frontmatter(out_path: Path, body_md: str, title: str, chapter: int | None, src_rel: Path):
    meta = {"title": title, "type": "textbook", "source_path": str(src_rel)}
    if chapter is not None:
        meta["chapter"] = chapter
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fm = "---\n" + yaml.safe_dump(meta, sort_keys=False, allow_unicode=True) + "---\n\n"
    out_path.write_text(fm + body_md, encoding="utf-8")

def reformat_tree(src_root: Path, out_root: Path, repo_root_label: str = "content"):
    count = 0
    for src in src_root.rglob("*"):
        if not src.is_file():
            continue
        if src.suffix.lower() not in (".md", ".ipynb"):
            continue

        # infer chapter from top-level folder (e.g., 01-demand)
        rel_to_src = src.relative_to(src_root)
        top = rel_to_src.parts[0] if rel_to_src.parts else src_root.name
        chapter, _ = parse_chapter(top)

        # normalize content
        if src.suffix.lower() == ".md":
            body = ensure_newline(src.read_text(encoding="utf-8"))
            title = src.stem
        else:  # .ipynb
            body = ipynb_to_markdown(src)
            title = src.stem

        out_rel  = rel_to_src.with_suffix(".md")
        out_path = out_root / out_rel

        # record a relative source path so you know provenance
        src_rel_for_meta = Path(repo_root_label) / rel_to_src
        write_with_frontmatter(out_path, body, title, chapter, src_rel_for_meta)
        count += 1

    print(f"Normalized {count} files → {out_root}")



In [10]:
reformat_tree(SRC_DIR, OUT_DIR)

Normalized 79 files → /Users/ericvandusen/Documents/Data88E-ForTraining/F24Textbook_MD


## Build week-ordered summary.yaml for lecture notebooks in F24Lec_MD/  

In [11]:
ROOT = Path("~/Documents/Data88E-ForTraining/F24Textbook_MD").expanduser()
SUMMARY = ROOT / "summary.yaml"

In [12]:
def read_front_matter(md_path: Path) -> dict:
    text = md_path.read_text(encoding="utf-8")
    if not text.startswith("---"):
        return {}
    lines = text.splitlines(True)
    try:
        end = lines.index("---\n", 1)
    except ValueError:
        return {}
    return yaml.safe_load("".join(lines[1:end])) or {}

# collect records
chapters = {}
for md in ROOT.rglob("*.md"):
    fm = read_front_matter(md)
    chap = fm.get("chapter")
    if chap is None:   # skip non-chapter files, if any
        continue
    rec = {
        "file": str(md.relative_to(ROOT)),           # e.g., 01-demand/index.md
        "title": fm.get("title") or md.stem,
        "source_path": fm.get("source_path", ""),
        "type": fm.get("type", "textbook"),
    }
    chapters.setdefault(int(chap), []).append(rec)

# order chapters and files (index.md first within each chapter)
def sort_key(rec):
    p = Path(rec["file"])
    return (0 if p.stem == "index" else 1, rec["title"].lower())

ordered = []
for chap in sorted(chapters.keys()):
    items = sorted(chapters[chap], key=sort_key)
    # add per-file order
    for i, it in enumerate(items, start=1):
        it["order"] = i
    ordered.append({
        "chapter": chap,
        "count": len(items),
        "files": items
    })

In [13]:
SUMMARY.write_text(yaml.safe_dump(ordered, sort_keys=False, allow_unicode=True), encoding="utf-8")
print(f"Wrote {SUMMARY} with {sum(c['count'] for c in ordered)} files across {len(ordered)} chapters.")

Wrote /Users/ericvandusen/Documents/Data88E-ForTraining/F24Textbook_MD/summary.yaml with 75 files across 13 chapters.
