# Fetching Lecture Jupyter Notebooks and Converting ipynb to Markdown

In [3]:
import json
import re
import shutil
import tempfile
import urllib.request
import zipfile
from pathlib import Path

import nbformat
import yaml


# ensure PyYAML
# Fallback for environments where PyYAML is missing: on ImportError, install
# it with pip and retry the import.
# NOTE(review): this cell already runs an unconditional `import yaml` before
# reaching this guard, so a missing package raises there first and this
# fallback is never reached — confirm whether that earlier import should go.
try:
    import yaml
except ImportError:
    import sys, subprocess
    # Run pip through the interpreter executing this kernel (sys.executable)
    # rather than whatever `pip` happens to be on PATH.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pyyaml"])
    import yaml

## Build a workflow to fetch and convert notebooks from GitHub
This script fetches Jupyter notebooks from a specified GitHub repository and converts them to Markdown format. 


- First, set the paths to the GitHub repository
- Starting with just the `lec` notebooks 
- Download to a local directory

There are a few steps that need to be done in order 


In [2]:

# GitHub repository to pull from, and the branch whose snapshot is downloaded.
REPO_URL = "https://github.com/data-88e/fa24-materials"
BRANCH = "main"

# Optional filter: copy notebooks only from this subfolder of the repo
# (e.g. "lectures", "labs"). Use "" to copy from the whole repo.
SUBPATH_IN_REPO = "lec"

# Local folder that receives the fetched notebooks; created if it is missing.
DEST_DIR = Path("~/Documents/Data88E-ForTraining/F24Lec_NBs").expanduser()
DEST_DIR.mkdir(parents=True, exist_ok=True)

print("Destination:", DEST_DIR)

Destination: /Users/ericvandusen/Documents/Data88E-ForTraining/F24Lec_NBs


## Build a function to fetch the ipynb files from GitHub

In [5]:
def fetch_notebooks(repo_url: str, branch: str, subpath: str, dest_dir: Path) -> int:
    """Download a branch snapshot from GitHub and copy its notebooks locally.

    The branch is fetched as a zip archive, unpacked into a temporary
    directory, and every ``*.ipynb`` file under ``subpath`` is copied into
    ``dest_dir`` with its folder layout (relative to ``subpath``) preserved.

    Parameters
    ----------
    repo_url : str
        Repository URL; a trailing slash is tolerated.
    branch : str
        Branch name used to build the ``/archive/refs/heads/<branch>.zip`` URL.
    subpath : str
        Folder inside the repository to copy from; ``""`` means the whole
        repository. Leading/trailing slashes are ignored.
    dest_dir : Path
        Destination folder; missing parent folders are created as needed.

    Returns
    -------
    int
        Number of notebooks copied.

    Raises
    ------
    FileNotFoundError
        If ``subpath`` does not exist in the downloaded archive.
    ValueError
        If the downloaded archive contains no entries.
    """
    dest_dir = Path(dest_dir)

    # Build GitHub archive URL
    zip_url = repo_url.rstrip("/") + f"/archive/refs/heads/{branch}.zip"
    print("Downloading:", zip_url)

    with tempfile.TemporaryDirectory() as td:
        zip_path = Path(td) / "repo.zip"
        # urlopen accepts a timeout, so a stalled connection fails instead of
        # hanging the notebook indefinitely.
        with urllib.request.urlopen(zip_url, timeout=60) as response, zip_path.open("wb") as fh:
            shutil.copyfileobj(response, fh)

        with zipfile.ZipFile(zip_path) as zf:
            names = zf.namelist()
            if not names:
                raise ValueError(f"Archive is empty: {zip_url}")
            top = names[0].split("/")[0]   # repo folder inside zip
            zf.extractall(td)

        src_root = Path(td) / top
        # Strip slashes first: joining a Path with an absolute component such
        # as "/lec" would discard the extracted root entirely.
        subpath = subpath.strip("/")
        if subpath:
            src_root = src_root / subpath
        if not src_root.is_dir():
            # Fail loudly instead of reporting "Copied 0 notebooks" on a typo.
            raise FileNotFoundError(
                f"Subpath {subpath!r} not found in branch {branch!r} of {repo_url}"
            )

        copied = 0
        # sorted() keeps the copy order stable across filesystems.
        for nb in sorted(src_root.rglob("*.ipynb")):
            rel = nb.relative_to(src_root)
            out = dest_dir / rel
            out.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(nb, out)
            copied += 1

    print(f"Copied {copied} notebooks → {dest_dir}")
    return copied

## Run the download script 

In [6]:
# Fetch the lecture notebooks, then list up to ten of them as a spot check.
fetch_notebooks(REPO_URL, BRANCH, SUBPATH_IN_REPO, DEST_DIR)

print("\nSample files:")
sample_paths = sorted(DEST_DIR.rglob("*.ipynb"))[:10]
for p in sample_paths:
    print(" -", p.relative_to(DEST_DIR))

Downloading: https://github.com/data-88e/fa24-materials/archive/refs/heads/main.zip
Copied 41 notebooks → /Users/ericvandusen/Documents/Data88E-ForTraining/F24Lec_NBs

Sample files:
 - lec01/lec01.ipynb
 - lec02/Avocados_demand.ipynb
 - lec02/Demand_Steps_24.ipynb
 - lec02/PriceElasticity.ipynb
 - lec02/ScannerData_Beer.ipynb
 - lec02/demand-curve-Fa24.ipynb
 - lec03/3.0-CubicCostCurve.ipynb
 - lec03/3.1-Supply.ipynb
 - lec03/3.2-sympy.ipynb
 - lec03/3.3a-california-energy.ipynb


##  Next we will build a script to convert the ipynb files to markdown 

First set paths for the notebooks and the output directory

In [4]:
# Input: the fetched notebooks. Output: root of the Markdown tree (created if missing).
NB_DIR = Path("~/Documents/Data88E-ForTraining/F24Lec_NBs").expanduser()
OUT_DIR = Path("~/Documents/Data88E-ForTraining/F24Lec_MD").expanduser()
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [6]:

def infer_week(path: Path) -> int | None:
    """
    Look at parent folder name like 'lec01' → 1, 'lec12' → 12.
    """
    m = re.search(r"lec\s*0*([0-9]+)", path.parent.name, flags=re.IGNORECASE)
    return int(m.group(1)) if m else None

def nb_to_md(nb_path: Path, out_path: Path):
    """Convert one notebook into a Markdown file with YAML front matter.

    The front matter records the notebook's stem as ``title``, a fixed
    ``type`` of ``lecture-notebook``, the ``week`` inferred from the parent
    folder (omitted when none is found) and the ``source_path``. Markdown
    cells are written as-is (stripped); code cells are wrapped in a
    ```` ```python ```` fence. Other cell types and all cell outputs are
    dropped.

    Parameters
    ----------
    nb_path : Path
        Notebook to read.
    out_path : Path
        Markdown file to write; its parent folder must already exist.
    """
    # Open explicitly so the handle is closed promptly and the file is decoded
    # as UTF-8 rather than with the platform's default encoding.
    with nb_path.open(encoding="utf-8") as fh:
        nb = nbformat.read(fh, as_version=4)
    week = infer_week(nb_path)

    with out_path.open("w", encoding="utf-8") as f:
        # --- front matter ---
        # json.dumps emits a double-quoted string with quotes and backslashes
        # escaped, which is also a valid YAML double-quoted scalar. Plain
        # f'"{value}"' interpolation produced unparseable YAML for names
        # containing '"' or '\'.
        f.write("---\n")
        f.write(f"title: {json.dumps(nb_path.stem, ensure_ascii=False)}\n")
        f.write("type: lecture-notebook\n")
        if week is not None:
            f.write(f"week: {week}\n")
        f.write(f"source_path: {json.dumps(str(nb_path), ensure_ascii=False)}\n")
        f.write("---\n\n")

        # --- cells ---
        for cell in nb.cells:
            if cell.cell_type == "markdown":
                f.write(cell.source.strip() + "\n\n")
            elif cell.cell_type == "code":
                f.write("```python\n" + cell.source.rstrip() + "\n```\n\n")



In [7]:
# Walk all notebooks recursively, keep lec01/lec02/... structure in OUT_DIR.
# Jupyter's autosave copies (<folder>/.ipynb_checkpoints/*-checkpoint.ipynb)
# appear as soon as a fetched notebook is opened locally; skip them so they
# are not converted as duplicate lectures.
# sorted() makes the conversion order the same on every run.
count = 0
for nb_file in sorted(NB_DIR.rglob("*.ipynb")):
    rel = nb_file.relative_to(NB_DIR)           # preserve folder layout
    if ".ipynb_checkpoints" in rel.parts:
        continue
    out_file = OUT_DIR / rel.with_suffix(".md")
    out_file.parent.mkdir(parents=True, exist_ok=True)
    nb_to_md(nb_file, out_file)
    count += 1

print(f"Converted {count} notebooks → {OUT_DIR}")

Converted 41 notebooks → /Users/ericvandusen/Documents/Data88E-ForTraining/F24Lec_MD


*Now you should be able to check the markdown files in the output directory.*

e.g. `~/Documents/Data88E-ForTraining/F24Lec_MD/lec02/Avocados_demand.md`

## Build week-ordered summary.yaml for lecture notebooks in F24Lec_MD/
Folder pattern: F24Lec_MD/lec01/*.md, lec02/*.md, ...
Each week can have 1..N notebooks.

In [8]:
# Root of the converted Markdown tree, and the summary file written inside it.
MD_ROOT = Path("~/Documents/Data88E-ForTraining/F24Lec_MD").expanduser()
SUMMARY_PATH = MD_ROOT.joinpath("LecNB_summary.yaml")

# Case-insensitive match for week folders such as "lec01" or "Lec 12";
# group 1 captures the week digits (leading zeros are consumed before it).
WEEK_DIR_RE = re.compile(r"lec\s*0*([0-9]+)", re.IGNORECASE)

def infer_week_from_dir(p: Path) -> int | None:
    m = WEEK_DIR_RE.search(p.name)
    return int(m.group(1)) if m else None

def read_front_matter(md_path: Path) -> dict:
    """Parse the YAML front matter at the top of a Markdown file.

    Front matter is the text between a first line consisting of ``---`` and
    the next line consisting of ``---`` (trailing whitespace is ignored on
    both delimiters, and the closing one may be the last line of the file
    without a newline).

    Returns the parsed mapping, or ``{}`` when the file has no front matter,
    the block is never closed, the YAML is malformed, or it does not parse to
    a mapping — so callers can always use ``.get`` on the result.
    """
    lines = md_path.read_text(encoding="utf-8").splitlines(True)
    if not lines or lines[0].rstrip() != "---":
        return {}

    # Compare stripped lines: an exact match on "---\n" misses a closing
    # delimiter at end-of-file or one followed by stray spaces.
    for end in range(1, len(lines)):
        if lines[end].rstrip() == "---":
            break
    else:
        return {}

    try:
        data = yaml.safe_load("".join(lines[1:end]))
    except yaml.YAMLError:
        # Treat unparseable front matter like missing front matter so one bad
        # file does not abort a whole directory scan.
        return {}
    # Scalars and lists are valid YAML but not valid front matter.
    return data if isinstance(data, dict) else {}

# Collect notebooks grouped by week
weeks = {}
for md_file in sorted(MD_ROOT.rglob("*.md")):
    # Parse the front matter once; it feeds both the week fallback and the
    # entry metadata below.
    fm = read_front_matter(md_file)

    # infer week from parent dir like lec01,
    # fall back to front-matter 'week' if present
    week = infer_week_from_dir(md_file.parent)
    if week is None:
        week = fm.get("week")
    try:
        week = int(week)
    except (TypeError, ValueError):
        # No week, or a front-matter value that is not a number: skip anything
        # that doesn't match the pattern
        # (or put into a special bucket if you prefer)
        continue

    title = fm.get("title") or md_file.stem
    entry = {
        # as_posix() keeps lecXX/subpath.md with forward slashes on every OS
        "file": md_file.relative_to(MD_ROOT).as_posix(),
        "title": str(title),
        "type": fm.get("type", "lecture-notebook"),
        "source_path": fm.get("source_path", ""),
    }
    weeks.setdefault(week, []).append(entry)

# Order weeks, and notebooks inside each week.
# The file path breaks ties between equal titles so the output is stable.
ordered = []
for wk in sorted(weeks.keys()):
    notebooks = sorted(weeks[wk], key=lambda e: (e["title"].lower(), e["file"]))
    ordered.append({
        "week": int(wk),
        "count": len(notebooks),
        "notebooks": notebooks,
    })

# Write summary.yaml
SUMMARY_PATH.write_text(
    yaml.safe_dump(ordered, sort_keys=False, allow_unicode=True),
    encoding="utf-8"
)

print(f"Wrote {SUMMARY_PATH} with {sum(w['count'] for w in ordered)} notebooks across {len(ordered)} weeks.")

Wrote /Users/ericvandusen/Documents/Data88E-ForTraining/F24Lec_MD/LecNB_summary.yaml with 41 notebooks across 14 weeks.


*Now you should be able to check the summary YAML file in the output directory.*

e.g. `~/Documents/Data88E-ForTraining/F24Lec_MD/LecNB_summary.yaml`