# Script to Build Materials for training for Data 6

In [1]:
try:
    from pptx import Presentation
except ImportError:
    !pip install python-pptx
    from pptx import Presentation
import yaml
from pathlib import Path
import re


## Step 1 - Slide Decks 
Find the slide decks in the `Lectures` folder.

For instance [Fall 2024 Slides](https://drive.google.com/drive/u/0/folders/1hEHFGPpbTr8cs7gMrg85Bdg_8FmhOe7P)

Please download a zip file of the entire `Lectures` folder to your computer before proceeding.

::: {.callout-warning}
## Warning
The lectures are formated as `Lectures/Lecture_02/lec02.pptx` when unzipped. Code would need to be edited for a different structure. Add the zip file as `lectures.zip` in the same directory as this notebook.
:::

## Step 2: Script to convert pptx to markdown


In [2]:
# Script to convert pptx to markdown using python-pptx
# Saves each slide as a markdown header with bullet points for text
# Speaker notes are included in a collapsible section


def md_escape(s): 
    return (s or "").replace("*","\\*").replace("_","\\_").replace("#","\\#")

def para_to_md(p):
    txt = "".join(r.text for r in p.runs).strip()
    if not txt: return ""
    indent = "  " * (p.level or 0)
    return f"{indent}- {md_escape(txt)}"

def slide_title(slide):
    if slide.shapes.title and hasattr(slide.shapes.title, "text"):
        return slide.shapes.title.text.strip()
    for shp in slide.shapes:
        if getattr(shp, "has_text_frame", False):
            t = shp.text.strip()
            if t: return t
    return ""

def extract_notes(slide):
    try:
        ns = slide.notes_slide
        if ns and ns.notes_text_frame:
            return ns.notes_text_frame.text.strip()
    except Exception:
        pass
    return ""

def extract_body(slide):
    lines = []
    for shp in slide.shapes:
        if shp == slide.shapes.title: 
            continue
        if getattr(shp, "has_text_frame", False):
            for p in shp.text_frame.paragraphs:
                md = para_to_md(p)
                if md: lines.append(md)
    return lines

WEEK_NUM_RE = re.compile(r"\d+")

def infer_week_from_filename(p: Path, valid_min=1, valid_max=60):
    m = WEEK_NUM_RE.search(p.stem)
    if not m:
        return None
    w = int(m.group(0))
    return w if valid_min <= w <= valid_max else None

def pptx_to_md(pptx_path: Path, out_path: Path):
    prs = Presentation(pptx_path)
    week = infer_week_from_filename(pptx_path)

    with out_path.open("w", encoding="utf-8") as md:
        md.write("---\n")
        md.write(f'title: "{md_escape(pptx_path.stem)}"\n')
        md.write("---\n\n")

        for i, slide in enumerate(prs.slides, start=1):
            title = slide_title(slide) or "(untitled)"
            md.write(f"## Slide {i}: {md_escape(title)}\n\n")

            body = extract_body(slide)
            if body:
                md.write("\n".join(body) + "\n\n")

            notes = extract_notes(slide)
            if notes:
                md.write("<details><summary>Speaker notes</summary>\n\n")
                md.write("\n".join(md_escape(x) for x in notes.splitlines()) + "\n\n</details>\n\n")

    print(f"Converted {pptx_path.name} → {out_path.name} (week={week})")


## Step 3: Script to Convert all Lectures from Zip to Markdown

In [3]:
# End-to-end: unzip fa24/lectures.zip and convert all decks (supports nested folders)
from pathlib import Path
import zipfile

# --- configure paths dynamically ---
# Get the current notebook directory
NOTEBOOK_DIR = Path.cwd()  # This will be the directory containing this notebook
ZIP_PATH = NOTEBOOK_DIR / "lectures.zip"
EXTRACT_DIR = NOTEBOOK_DIR / "lectures_extracted"
OUTPUT_DIR = NOTEBOOK_DIR / "lectures_md"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Unzip only if needed
if ZIP_PATH.exists():
    if not EXTRACT_DIR.exists() or not any(EXTRACT_DIR.iterdir()):
        EXTRACT_DIR.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(ZIP_PATH, 'r') as zf:
            zf.extractall(EXTRACT_DIR)
        print(f"Extracted {ZIP_PATH.name} -> {EXTRACT_DIR}")
    else:
        print(f"Using existing extracted contents at {EXTRACT_DIR}")
else:
    print(f"Zip not found: {ZIP_PATH}. Skipping extraction.")

# Determine root where PPTX files live (handle zips that contain a top-level 'lectures/' folder)
if (EXTRACT_DIR / "lectures").exists():
    INPUT_DIR = EXTRACT_DIR / "lectures"
else:
    INPUT_DIR = EXTRACT_DIR

print(f"Searching for decks under: {INPUT_DIR}")

# Gather all .pptx recursively to handle structures like lectures/lec01/lec01.pptx
pptx_files = sorted(INPUT_DIR.rglob("*.pptx"))

if not pptx_files:
    print(f"No .pptx files found under {INPUT_DIR} (searched recursively)")
else:
    print(f"Found {len(pptx_files)} decks. Converting to: {OUTPUT_DIR}\n")

    failures = []
    for idx, pptx_path in enumerate(pptx_files, start=1):
        # Preserve subfolder structure in the output
        rel_path = pptx_path.relative_to(INPUT_DIR)
        out_md = (OUTPUT_DIR / rel_path).with_suffix(".md")
        out_md.parent.mkdir(parents=True, exist_ok=True)
        try:
            pptx_to_md(pptx_path, out_md)
        except Exception as e:
            failures.append((str(rel_path), str(e)))
            print(f"⚠️ Failed on {rel_path}: {e}")

    if failures:
        print("\nDone with errors on these files:")
        for name, err in failures:
            print(f" - {name}: {err}")
    else:
        print("\nAll decks converted successfully ✅")

Extracted lectures.zip -> /Users/jedwin321/Documents/GitHub/notes/fa24/lectures_extracted
Searching for decks under: /Users/jedwin321/Documents/GitHub/notes/fa24/lectures_extracted/lectures
Found 24 decks. Converting to: /Users/jedwin321/Documents/GitHub/notes/fa24/lectures_md

Converted [Lec 01] Data 6 Fall 2024 - Introduction.pptx → [Lec 01] Data 6 Fall 2024 - Introduction.md (week=1)
Converted [Lec 02] Data 6 Fall 2024 - Jupyter Notebooks, Arithmetic.pptx → [Lec 02] Data 6 Fall 2024 - Jupyter Notebooks, Arithmetic.md (week=2)
Converted [Lab01_Lec03] Data 6 Fall 2024 - Evaluation, Names, and Data Types.pptx → [Lab01_Lec03] Data 6 Fall 2024 - Evaluation, Names, and Data Types.md (week=1)
Converted [Lec 04] Data 6 Fall 2024 - Arrays and Variables.pptx → [Lec 04] Data 6 Fall 2024 - Arrays and Variables.md (week=4)
Converted [Lec 05] Data 6 Fall 2024 - NumPy, Indexing, Variables in Data Science.pptx → [Lec 05] Data 6 Fall 2024 - NumPy, Indexing, Variables in Data Science.md (week=5)
Conv

## Step 4 Make a summary yaml file of the slide decks

In [4]:
# --- Build summary.yaml from Markdown front-matter for lectures_md ---
try:
    import yaml
except ImportError:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pyyaml"])
    import yaml

from pathlib import Path

# Use dynamic path
MD_DIR = OUTPUT_DIR  # Use the OUTPUT_DIR from Step 3
summary_path = MD_DIR / "summary.yaml"

def read_front_matter(md_path: Path) -> dict:
    """Read YAML front matter (between --- markers) and return as dict."""
    lines = md_path.read_text(encoding="utf-8").splitlines(True)
    if not lines or lines[0].strip() != "---":
        return {}
    try:
        end_idx = lines.index("---\n", 1)
    except ValueError:
        return {}
    return yaml.safe_load("".join(lines[1:end_idx])) or {}

records = []
for md_file in sorted(MD_DIR.rglob("*.md")):
    meta = read_front_matter(md_file)
    records.append({
        "file": str(md_file.relative_to(MD_DIR)),
        "title": meta.get("title", md_file.stem),
        "type": meta.get("type", "slides"),
        "source_path": meta.get("source_path", ""),
        "week": meta.get("week")
    })

# Order strictly by week (1..15), fall back to alpha if week missing
with_week    = [r for r in records if isinstance(r["week"], int)]
without_week = [r for r in records if not isinstance(r["week"], int)]

with_week.sort(key=lambda r: r["week"])
without_week.sort(key=lambda r: r["title"].lower())

ordered = with_week + without_week
for i, r in enumerate(ordered, start=1):
    r["order"] = i

# Write summary.yaml
summary_path.write_text(
    yaml.safe_dump(ordered, sort_keys=False, allow_unicode=True),
    encoding="utf-8"
)

print(f"Wrote {summary_path} with {len(ordered)} entries.")
if without_week:
    print("Note: These files had no 'week' in front matter and were placed at the end:")
    for r in without_week:
        print(" -", r["file"])

Wrote /Users/jedwin321/Documents/GitHub/notes/fa24/lectures_md/summary.yaml with 24 entries.
Note: These files had no 'week' in front matter and were placed at the end:
 - Lecture 12/[DATA 6] Quiz 1 Review.md
 - Lecture 03/[Lab01_Lec03] Data 6 Fall 2024 - Evaluation, Names, and Data Types.md
 - Lecture 01/[Lec 01] Data 6 Fall 2024 - Introduction.md
 - Lecture 02/[Lec 02] Data 6 Fall 2024 - Jupyter Notebooks, Arithmetic.md
 - Lecture 04/[Lec 04] Data 6 Fall 2024 - Arrays and Variables.md
 - Lecture 05/[Lec 05] Data 6 Fall 2024 - NumPy, Indexing, Variables in Data Science.md
 - Lecture 6/[Lec 06] Data 6 Fall 2024 - Introduction to Social Science Research.md
 - Lecture 07/[Lec 07] Data 6 Fall 2024 - Table Fundamentals.md
 - Lecture 10/[Lec 10] Data 6 Fall 2024 - Joining and Row Methods.md
 - Lecture 08/[Lec 10] Data 6 Fall 2024 - Taking and Filtering Rows.md
 - Lecture 12 (Actual) - Group and Pivot/[Lec 12 and 13] Data 6 Fall 2024 - Grouping and Pivoting.md
 - Lecture 11/[Lec 14] Data 6 F

## Step 5: Convert all Markdown lectures to QMD format


In [None]:
# Convert all markdown files to QMD format
# Adapted from converter.py logic

import os  # Add missing import

def md_to_qmd(md_file_path, output_qmd_path, title=None):
    """
    Converts a Markdown file to a Quarto QMD file with proper YAML frontmatter.
    
    Args:
        md_file_path (str): The path to the input Markdown file.
        output_qmd_path (str): The desired path for the output QMD file.
        title (str, optional): The title for the document. If None, will try to extract from content.
    """
    try:
        # Read the markdown file
        with open(md_file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        
        # Check if file already has frontmatter
        has_frontmatter = content.startswith('---')
        
        if has_frontmatter:
            # File already has frontmatter, use it as-is
            # This preserves the original lecture text without edits
            qmd_content = content
        else:
            # No frontmatter, add it
            # Try to extract title from first heading if not provided
            if title is None:
                # Look for title in the content
                title_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
                if title_match:
                    title = title_match.group(1).strip()
                else:
                    # Use filename as fallback
                    title = os.path.splitext(os.path.basename(md_file_path))[0]
            
            # Create YAML frontmatter
            yaml_frontmatter = "---\n"
            yaml_frontmatter += f'title: "{title}"\n'
            yaml_frontmatter += "format:\n"
            yaml_frontmatter += "  html:\n"
            yaml_frontmatter += "    toc: true\n"
            yaml_frontmatter += "---\n\n"
            
            # Combine frontmatter with content
            qmd_content = yaml_frontmatter + content
        
        # Write the QMD file
        with open(output_qmd_path, 'w', encoding='utf-8') as file:
            file.write(qmd_content)
        
        print(f"Successfully created QMD: '{output_qmd_path}'")
        return True
            
    except Exception as e:
        print(f"Error during QMD conversion: {e}")
        return False

def process_md_directory(input_dir, output_dir="lectures_qmd"):
    """
    Process all markdown files in a directory structure and convert them to QMD.
    
    Args:
        input_dir (str): Directory containing markdown files
        output_dir (str): Directory to output QMD files
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Find all markdown files
    md_files = []
    for root, dirs, files in os.walk(input_dir):
        for file in files:
            if file.endswith('.md'):
                md_files.append(os.path.join(root, file))
    
    print(f"Found {len(md_files)} markdown files to convert.")
    
    # Process each markdown file
    converted_count = 0
    for md_path in sorted(md_files):
        # Extract lecture number from path or filename
        filename = os.path.basename(md_path)
        dir_name = os.path.basename(os.path.dirname(md_path))
        
        # Try different patterns to extract lecture number
        # Priority: directory name first, then filename
        patterns = [
            r'lec[\s_-]*(\d+)',
            r'lecture[\s_-]*(\d+)',
            r'\[Lec\s*(\d+)\]',
            r'^\d+',
        ]
        
        lec_num = None
        # First try directory name (higher priority)
        for pattern in patterns:
            match = re.search(pattern, dir_name, re.IGNORECASE)
            if match:
                lec_num = match.group(1) if match.groups() else match.group(0)
                break
        
        # If no match in directory, try filename
        if not lec_num:
            for pattern in patterns:
                match = re.search(pattern, filename, re.IGNORECASE)
                if match:
                    lec_num = match.group(1) if match.groups() else match.group(0)
                    break
        
        if lec_num:
            # Pad with zero if single digit
            lec_num = lec_num.zfill(2)
            output_filename = f"lec{lec_num}.qmd"
        else:
            # Fallback to using the original filename
            output_filename = os.path.splitext(filename)[0] + ".qmd"
        
        output_path = os.path.join(output_dir, output_filename)
        
        print(f"\nProcessing: {filename}")
        print(f"Output: {output_filename}")
        
        # Extract title from filename (clean it up)
        title = re.sub(r'\[.*?\]', '', filename)  # Remove brackets
        title = os.path.splitext(title)[0]  # Remove extension
        title = title.strip()
        
        # Convert MD to QMD
        if md_to_qmd(md_path, output_path, title):
            converted_count += 1
        else:
            print(f"Failed to convert: {filename}")
    
    print(f"\n{'='*50}")
    print(f"Conversion complete! Successfully converted {converted_count}/{len(md_files)} files.")
    print(f"Output directory: {output_dir}")
    
    return converted_count

# Convert all markdown files to QMD
QMD_OUTPUT_DIR = NOTEBOOK_DIR / "lectures_qmd"
converted_count = process_md_directory(OUTPUT_DIR, QMD_OUTPUT_DIR)

Found 24 markdown files to convert.

Processing: [Lec 01] Data 6 Fall 2024 - Introduction.md
Output: lec01.qmd
Successfully created QMD: '/Users/jedwin321/Documents/GitHub/notes/fa24/lectures_qmd/lec01.qmd'

Processing: [Lec 02] Data 6 Fall 2024 - Jupyter Notebooks, Arithmetic.md
Output: lec02.qmd
Successfully created QMD: '/Users/jedwin321/Documents/GitHub/notes/fa24/lectures_qmd/lec02.qmd'

Processing: [Lab01_Lec03] Data 6 Fall 2024 - Evaluation, Names, and Data Types.md
Output: lec03.qmd
Successfully created QMD: '/Users/jedwin321/Documents/GitHub/notes/fa24/lectures_qmd/lec03.qmd'

Processing: [Lec 04] Data 6 Fall 2024 - Arrays and Variables.md
Output: lec04.qmd
Successfully created QMD: '/Users/jedwin321/Documents/GitHub/notes/fa24/lectures_qmd/lec04.qmd'

Processing: [Lec 05] Data 6 Fall 2024 - NumPy, Indexing, Variables in Data Science.md
Output: lec05.qmd
Successfully created QMD: '/Users/jedwin321/Documents/GitHub/notes/fa24/lectures_qmd/lec05.qmd'

Processing: [Lec 07] Data 6

## Step 5: Convert Lecture QMD Files to Chapter Structure


In [None]:
# Convert lecture QMD files to chapter structure
# Creates lectures_qmd_complete/lec01/index.qmd + topic files based on roadmap

import re
import os
from pathlib import Path

def extract_roadmap_topics(qmd_content):
    """Extract main topics from 'Today's Roadmap' slide"""
    # Find the roadmap slide - try multiple patterns
    patterns = [
        r"## Slide \d+: Today's Roadmap\s*\n(.*?)(?=\n## |$)",
        r"## Slide \d+: Today.s Roadmap\s*\n(.*?)(?=\n## |$)",
        r"## Slide \d+: Today's Roadmap\s*\n(.*?)(?=\n## |$)",
    ]
    
    roadmap_content = None
    for pattern in patterns:
        roadmap_match = re.search(pattern, qmd_content, re.DOTALL)
        if roadmap_match:
            roadmap_content = roadmap_match.group(1)
            break
    
    if not roadmap_content:
        print("No roadmap slide found")
        return []
    
    topics = []
    
    # Extract main bullet points (not sub-bullets)
    lines = roadmap_content.split('\n')
    for line in lines:
        line = line.strip()
        # Skip empty lines and sub-bullets (indented with spaces)
        if not line or line.startswith('  '):
            continue
        # Skip the "Lecture XX, Data 6 Fall 2024" line
        if 'Lecture' in line and 'Data 6 Fall 2024' in line:
            continue
        # Extract main topics (lines that start with - but aren't indented)
        if line.startswith('- '):
            topic = line[2:].strip()
            if topic and not topic.startswith('Lecture'):
                topics.append(topic)
    
    print(f"Found {len(topics)} topics: {topics}")
    return topics

def clean_topic_name(topic):
    """Convert topic to a clean filename"""
    # Remove special characters and convert to lowercase
    clean = re.sub(r'[^\w\s-]', '', topic)
    clean = re.sub(r'[-\s]+', '-', clean)
    return clean.lower().strip('-')

def clean_lecture_title(title):
    """Clean lecture title by removing Data 6 Fall 2024 and [Lec XX] parts"""
    # Remove [Lec XX] pattern
    title = re.sub(r'\[Lec \d+\]\s*', '', title)
    # Remove "Data 6 Fall 2024" pattern
    title = re.sub(r'Data 6 Fall 2024\s*-\s*', '', title)
    # Remove any remaining "Data 6 Fall 2024" at the end
    title = re.sub(r'\s*Data 6 Fall 2024\s*$', '', title)
    return title.strip()

def extract_lecture_title(qmd_content):
    """Extract and clean title from YAML frontmatter"""
    yaml_match = re.search(r'---\s*\ntitle:\s*"([^"]+)"', qmd_content)
    if yaml_match:
        raw_title = yaml_match.group(1)
        return clean_lecture_title(raw_title)
    return "Untitled Lecture"

def extract_slide_content_for_topic(qmd_content, topic):
    """Extract slide content related to a specific topic"""
    # Find slides that might contain content for this topic
    # Look for slides with similar titles or content
    topic_words = set(re.findall(r'\b\w+\b', topic.lower()))
    
    slides = re.findall(r'## Slide \d+: ([^\n]+)\n(.*?)(?=\n## |$)', qmd_content, re.DOTALL)
    
    relevant_content = []
    for slide_title, slide_content in slides:
        slide_title_lower = slide_title.lower()
        slide_words = set(re.findall(r'\b\w+\b', slide_title_lower))
        
        # Check if slide title has significant overlap with topic
        overlap = len(topic_words.intersection(slide_words))
        if overlap >= 2 or any(word in slide_title_lower for word in topic_words if len(word) > 3):
            # Clean up the slide content
            clean_content = slide_content.strip()
            if clean_content:
                relevant_content.append(f"## {slide_title}\n\n{clean_content}\n")
    
    return '\n'.join(relevant_content)

def create_chapter_structure(lecture_qmd_path, output_dir):
    """Convert a single lecture QMD to chapter structure"""
    # Read the lecture QMD file
    with open(lecture_qmd_path, 'r', encoding='utf-8') as f:
        content = f.read()
    
    # Extract lecture number from filename
    lec_match = re.search(r'lec(\d+)', lecture_qmd_path.stem)
    if not lec_match:
        print(f"Could not extract lecture number from {lecture_qmd_path}")
        return False
    
    lec_num = lec_match.group(1)
    lec_dir = output_dir / f"lec{lec_num}"
    lec_dir.mkdir(parents=True, exist_ok=True)
    
    # Extract title and topics
    title = extract_lecture_title(content)
    topics = extract_roadmap_topics(content)
    
    # Create index.qmd with just title and subtitle
    index_content = f"""---
title: "{title}"
subtitle: "Lecture {lec_num}"
---

This chapter covers the key concepts from Lecture {lec_num}.

"""
    
    # Add links to topic files if they exist
    if topics:
        index_content += "## Topics Covered\n\n"
        for topic in topics:
            clean_name = clean_topic_name(topic)
            index_content += f"* [{topic}]({clean_name}.qmd)\n"
        index_content += "\n"
    
    # Write index.qmd
    index_path = lec_dir / "index.qmd"
    with open(index_path, 'w', encoding='utf-8') as f:
        f.write(index_content)
    
    # Create topic files from roadmap with actual slide content
    for topic in topics:
        clean_name = clean_topic_name(topic)
        slide_content = extract_slide_content_for_topic(content, topic)
        
        topic_content = f"""---
title: "{topic}"
subtitle: "From Lecture {lec_num}"
---

{slide_content}

"""
        topic_path = lec_dir / f"{clean_name}.qmd"
        with open(topic_path, 'w', encoding='utf-8') as f:
            f.write(topic_content)
    
    print(f"Created chapter structure for {lecture_qmd_path.name}")
    print(f"  - {index_path}")
    for topic in topics:
        clean_name = clean_topic_name(topic)
        print(f"  - {lec_dir / f'{clean_name}.qmd'}")
    
    return True

def convert_all_lectures_to_chapters():
    """Convert all lecture QMD files to chapter structure"""
    # Set up paths
    lectures_qmd_dir = NOTEBOOK_DIR / "lectures_qmd"
    output_dir = NOTEBOOK_DIR / "lectures_qmd_complete"
    
    if not lectures_qmd_dir.exists():
        print(f"Lectures QMD directory not found: {lectures_qmd_dir}")
        return
    
    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # Find all lecture QMD files
    qmd_files = sorted(lectures_qmd_dir.glob("lec*.qmd"))
    
    if not qmd_files:
        print(f"No lecture QMD files found in {lectures_qmd_dir}")
        return
    
    print(f"Converting {len(qmd_files)} lecture files to chapter structure...")
    print(f"Output directory: {output_dir}")
    print()
    
    success_count = 0
    for qmd_file in qmd_files:
        if create_chapter_structure(qmd_file, output_dir):
            success_count += 1
        print()
    
    print(f"Successfully converted {success_count}/{len(qmd_files)} lectures to chapter structure")

# Run the conversion
convert_all_lectures_to_chapters()


Converting 24 lecture files to chapter structure...
Output directory: /Users/jedwin321/Documents/GitHub/notes/fa24/lectures_qmd_complete

Created chapter structure for lec01.qmd
  - /Users/jedwin321/Documents/GitHub/notes/fa24/lectures_qmd_complete/lec01/index.qmd

Created chapter structure for lec02.qmd
  - /Users/jedwin321/Documents/GitHub/notes/fa24/lectures_qmd_complete/lec02/index.qmd

Created chapter structure for lec03.qmd
  - /Users/jedwin321/Documents/GitHub/notes/fa24/lectures_qmd_complete/lec03/index.qmd

Created chapter structure for lec04.qmd
  - /Users/jedwin321/Documents/GitHub/notes/fa24/lectures_qmd_complete/lec04/index.qmd

Created chapter structure for lec05.qmd
  - /Users/jedwin321/Documents/GitHub/notes/fa24/lectures_qmd_complete/lec05/index.qmd

Created chapter structure for lec06.qmd
  - /Users/jedwin321/Documents/GitHub/notes/fa24/lectures_qmd_complete/lec06/index.qmd

Created chapter structure for lec07.qmd
  - /Users/jedwin321/Documents/GitHub/notes/fa24/lectu