# Build a summary file across all the markdown files

- Need to build a summary.yaml file that guides all the markdown files
- second part - make a zip of all the markdown files and summary.yaml


## In the case of Data 88E - the exact progression of the topics can vary year to year.

We can use the course calendar from 2024 as a guide - to what topics are covered in which week


In [2]:
from pathlib import Path
import re, sys, subprocess

# deps
def ensure(pkg):
    try:
        __import__(pkg)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

ensure("requests"); ensure("bs4"); ensure("pyyaml")

import requests
from bs4 import BeautifulSoup
import yaml



##  Part 1: Scrape schedule to map Week -> textbook chapters
- (e.g., ["1.0","1.1",...])
 -  Writes: ~/Documents/Data88E-ForTraining/week_to_readings.yaml

In [None]:

SCHEDULE_URL = "https://data88e.org/fa24/"
OUT_YAML = Path("~/Documents/Data88E-ForTraining/week_to_readings.yaml").expanduser()

resp = requests.get(SCHEDULE_URL, timeout=20)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

week_to_readings = {}

# Page structure: each Week is an <h2> with text "Week N", and a nearby line "Reading: ..."
for h2 in soup.find_all("h2"):
    text = h2.get_text(strip=True)
    m = re.match(r"Week\s+(\d+)", text, flags=re.IGNORECASE)
    if not m:
        continue
    week = int(m.group(1))

    # The "Reading:" text sits in the same block; search forward a bit
    readings = []
    # Look within the following siblings until the next h2
    for sib in h2.next_siblings:
        if getattr(sib, "name", None) == "h2":
            break  # stop at next week
        # anchor tags that look like: <a>1.0</a>, <a>1.1</a>, ...
        if hasattr(sib, "find_all"):
            for a in sib.find_all("a"):
                label = a.get_text(strip=True)
                if re.fullmatch(r"\d+(?:\.\d+)?", label):
                    readings.append(label)

    # Deduplicate while preserving order
    seen = set()
    readings = [x for x in readings if not (x in seen or seen.add(x))]
    if readings:
        week_to_readings[week] = readings

# Save YAML
OUT_YAML.write_text(yaml.safe_dump(week_to_readings, sort_keys=True, allow_unicode=True), encoding="utf-8")

print(f"Wrote {OUT_YAML}\n")
for wk in sorted(week_to_readings):
    print(f"Week {wk}: {week_to_readings[wk]}")

Wrote /Users/ericvandusen/Documents/Data88E-ForTraining/week_to_readings.yaml

Week 1: ['0']
Week 2: ['1.0', '1.1', '1.2', '1.3', '1.4']
Week 3: ['2.0', '2.1', '2.2', '2.3', '2.4']
Week 4: ['3.0', '3.1', '3.2', '3.3']
Week 5: ['4.0', '4.1', '4.2']
Week 6: ['5.0', '5.1', '5.2']
Week 7: ['6.0', '6.1', '6.2']
Week 8: ['9.0', '9.1', '9.2', '9.3', '9.4', '9.5']
Week 9: ['7.0', '7.1', '7.2', '7.3', '7.4', '7.5']
Week 10: ['8.0']
Week 11: ['11.0', '11.1', '11.2', '11.3', '11.4']
Week 12: ['10.0', '10.1', '10.2']
Week 13: ['12.0', '12.1', '12.2']


## 2) Merge three summaries → `course_summary.yaml`

**Assumptions:**
- Slides summary at: `~/Documents/Data88E-ForTraining/F24LS_md/summary.yaml`  
- Lecture notebooks summary at: `~/Documents/Data88E-ForTraining/F24Lec_MD/summary.yaml`  
- Textbook summary at: `~/Documents/Data88E-ForTraining/F24Textbook_MD/summary.yaml`  
- Week ↔ readings mapping at: `~/Documents/Data88E-ForTraining/week_to_readings.yaml`  

**Mapping logic:**
- **Slides**: already week-tagged → just drop in by week.  
- **Lecture notebooks**: already grouped by week → drop in.  
- **Textbook**: each file has a `chapter: <int>` in its front-matter.  
  - We’ll map a week’s readings like `['1.0','1.1', ...]` → chapter list `[1]` (the unique integer parts).  
  - Then include all textbook files whose `chapter` matches one of those chapters.  

In [8]:
SLIDES_SUMMARY   = Path("~/Documents/Data88E-ForTraining/F24LS_md/summary.yaml").expanduser()
LECTURE_SUMMARY  = Path("~/Documents/Data88E-ForTraining/F24Lec_MD/LecNB_summary.yaml").expanduser()
TEXTBOOK_SUMMARY = Path("~/Documents/Data88E-ForTraining/F24Textbook_MD/summary.yaml").expanduser()
READINGS_MAP     = Path("~/Documents/Data88E-ForTraining/week_to_readings.yaml").expanduser()
COURSE_SUMMARY   = Path("~/Documents/Data88E-ForTraining/course_summary.yaml").expanduser()


In [10]:

# ---- helpers ----
def load_yaml(p: Path):
    if not p.exists():
        print(f"⚠️ Missing file: {p}")
        return None
    return yaml.safe_load(p.read_text(encoding="utf-8"))

def normalize_readings_map(raw):
    """Dict[str|int, list[str]] -> Dict[int, list[str]]"""
    out = {}
    if isinstance(raw, dict):
        for k, v in raw.items():
            try:
                wk = int(k)
            except Exception:
                continue
            out[wk] = [str(x) for x in (v or [])]
    return out

def chapters_from_readings(readings_list):
    """['1.0','1.1','2.3'] -> [1,2]"""
    chapters = set()
    for r in readings_list or []:
        try:
            chapters.add(int(float(r)))
        except Exception:
            pass
    return sorted(chapters)

def sort_title_file(rec):
    return (str(rec.get("title","")).lower(), str(rec.get("file","")).lower())

def tb_sort_key(rec):
    p = Path(str(rec.get("file","")))
    return (0 if p.stem == "index" else 1, str(rec.get("title","")).lower())

# ---- load inputs ----
slides_idx   = load_yaml(SLIDES_SUMMARY)   or []
lectures_idx = load_yaml(LECTURE_SUMMARY)  or []
textbook_idx = load_yaml(TEXTBOOK_SUMMARY) or []
wk_to_reads  = normalize_readings_map(load_yaml(READINGS_MAP) or {})

# ---- index slides by week ----
slides_by_week = {}
if slides_idx and isinstance(slides_idx, list):
    # support either flat records or grouped {week, slides:[...]}
    grouped = isinstance(slides_idx[0], dict) and "slides" in slides_idx[0] and "week" in slides_idx[0]
    if grouped:
        for bucket in slides_idx:
            wk = bucket.get("week")
            if isinstance(wk, int):
                slides_by_week.setdefault(wk, []).extend(bucket.get("slides", []))
    else:
        for rec in slides_idx:
            wk = rec.get("week")
            if isinstance(wk, int):
                slides_by_week.setdefault(wk, []).append(rec)

# ---- index lecture notebooks by week ----
notebooks_by_week = {}
if lectures_idx and isinstance(lectures_idx, list):
    # expected: list of {week:int, count:int, notebooks:[...]}
    grouped = isinstance(lectures_idx[0], dict) and "notebooks" in lectures_idx[0]
    if grouped:
        for bucket in lectures_idx:
            wk = bucket.get("week")
            if isinstance(wk, int):
                notebooks_by_week[wk] = bucket.get("notebooks", [])
    else:
        # fallback: flat list of notebook records with 'week'
        for rec in lectures_idx:
            wk = rec.get("week") or rec.get("metadata", {}).get("week")
            if isinstance(wk, int):
                notebooks_by_week.setdefault(wk, []).append(rec)

# ---- textbook chapters -> files (filter checkpoints) ----
chap_to_files = {}
for chap_block in (textbook_idx or []):
    chap = chap_block.get("chapter")
    files = chap_block.get("files", [])
    if not isinstance(chap, int):
        continue
    cleaned = [rec for rec in files if ".ipynb_checkpoints" not in str(rec.get("file",""))]
    chap_to_files[chap] = cleaned

# ---- assemble by week ----
all_weeks = set(slides_by_week) | set(notebooks_by_week) | set(wk_to_reads)
if not all_weeks:
    raise SystemExit("No weeks found. Check that your summaries and readings map exist and are populated.")
print(f"Building course summary for {len(all_weeks)} weeks…")

merged = []
for wk in sorted(all_weeks):
    block = {"week": wk}

    if wk in slides_by_week:
        block["slides"] = sorted(slides_by_week[wk], key=sort_title_file)

    if wk in notebooks_by_week:
        block["notebooks"] = sorted(notebooks_by_week[wk], key=sort_title_file)

    chapters = chapters_from_readings(wk_to_reads.get(wk, []))
    if chapters:
        files = []
        for chap in chapters:
            files.extend(chap_to_files.get(chap, []))
        block["textbook"] = {"chapters": chapters, "files": sorted(files, key=tb_sort_key)}

    merged.append(block)

# ---- write output ----
COURSE_SUMMARY.write_text(yaml.safe_dump(merged, sort_keys=False, allow_unicode=True), encoding="utf-8")
print(f"✅ Wrote {COURSE_SUMMARY}")

# ---- quick peek ----
for b in merged[:5]:
    print(f"Week {b['week']}: slides={len(b.get('slides',[]))}, notebooks={len(b.get('notebooks',[]))}, chapters={b.get('textbook',{}).get('chapters',[])}")

Building course summary for 14 weeks…
✅ Wrote /Users/ericvandusen/Documents/Data88E-ForTraining/course_summary.yaml
Week 1: slides=1, notebooks=1, chapters=[0]
Week 2: slides=1, notebooks=5, chapters=[1]
Week 3: slides=1, notebooks=5, chapters=[2]
Week 4: slides=1, notebooks=6, chapters=[3]
Week 5: slides=1, notebooks=2, chapters=[4]


In [11]:
#make a zip of all the markdown files and summary.yaml
%%bash
# Zip up the course materials for Custom GPT upload

cd ~/Documents/Data88E-ForTraining

# Slides (F24LS_md)
zip -r F24LS_md.zip F24LS_md -x "*.ipynb_checkpoints*"

# Lecture Notebooks (F24Lec_MD)
zip -r F24Lec_MD.zip F24Lec_MD -x "*.ipynb_checkpoints*"

# Textbook (F24Textbook_MD)
zip -r F24Textbook_MD.zip F24Textbook_MD -x "*.ipynb_checkpoints*"

# Indices (course_summary.yaml + week_to_readings.yaml)
zip Indices.zip course_summary.yaml week_to_readings.yaml

SyntaxError: invalid syntax (3983456228.py, line 5)

In [12]:
%%bash
cd ~/Documents/Data88E-ForTraining

zip -r F24LS_md.zip F24LS_md -x "*.ipynb_checkpoints*"
zip -r F24Lec_MD.zip F24Lec_MD -x "*.ipynb_checkpoints*"
zip -r F24Textbook_MD.zip F24Textbook_MD -x "*.ipynb_checkpoints*"

zip Indices.zip course_summary.yaml week_to_readings.yaml

  adding: F24LS_md/ (stored 0%)
  adding: F24LS_md/Lecture 10 - Development_.md (deflated 60%)
  adding: F24LS_md/Lecture 12 - Finance.md (deflated 67%)
  adding: F24LS_md/Lecture 8 - Macro.md (deflated 71%)
  adding: F24LS_md/summary.yaml (deflated 82%)
  adding: F24LS_md/Lecture 13 - Environmental Economics.md (deflated 59%)
  adding: F24LS_md/Lecture 5 - Production C-D.md (deflated 55%)
  adding: F24LS_md/Lecture 9 - Game Theory_.md (deflated 63%)
  adding: F24LS_md/Lecture 8  - Macroeconomics.md (deflated 71%)
  adding: F24LS_md/ Lecture 4 - Public.md (deflated 60%)
  adding: F24LS_md/Lecture 1 - Introduction and Overview.md (deflated 57%)
  adding: F24LS_md/Lecture 11 - Econometrics.md (deflated 63%)
  adding: F24LS_md/Lecture 15 - Conclusion.md (deflated 56%)
  adding: F24LS_md/Lecture 3 - Supply.md (deflated 57%)
  adding: F24LS_md/Lecture 6 - Utility and Latex.md (deflated 60%)
  adding: F24LS_md/Lecture 7 - Inequality.md (deflated 66%)
  adding: F24LS_md/Lecture 2 - Demand.md 