Instructions:

1. Go to https://directory.churchofjesuschrist.org/
2. Go to the top right corner, click the 🖨️ Printer symbol, and under "Individual Contact Information", click "Show for Everyone". This will make the next step possible.
3. Scroll down to "Photo" and click "Show Photos."
4. Click "Print".
5. On the print screen, instead of sending it to a printer, choose "Save to PDF" as the destination and save the file.
6. Rename that pdf "ward_directory.pdf".
7. Upload it to this project.
8. Click "Run All" in the notebook menu bar above.

In [None]:
# -*- coding: utf-8 -*-
"""flashcards_pipeline.py

End‑to‑end pipeline that
1. Extracts head‑and‑shoulder photos and names from a Ward Directory PDF
2. Builds an Anki‑/Quizlet‑style TSV (tab‑separated) file
3. Renders a two‑sided study‑card PDF where **side A is only the photo** and **side B shows the preferred name in bold with the full name below** (no photo)
4. Writes a TXT (`no_photo.txt`) listing every person who didn’t have a picture
5. Packages all artefacts (TSV, images/, PDF, TXT) into a single ZIP archive

Usage
-----
```bash
python flashcards_pipeline.py ward_directory.pdf  # basic
python flashcards_pipeline.py ward_directory.pdf -o my_cards  # custom base name
```

Requirements
------------
```
pip install pymupdf Pillow matplotlib pandas tqdm
```
"""


import argparse
import csv
import os
import re
import shutil
import zipfile
from datetime import datetime
from pathlib import Path
# (Optional) make sure pip is up to date
!pip install --upgrade pip

# install all the non-stdlib deps in one go
!pip install PyMuPDF matplotlib pandas pillow tqdm
import fitz  # PyMuPDF
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
from tqdm import tqdm

# ──────────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────────
# Whole-line "Last, First" entry: each side starts with an ASCII letter and may
# contain letters, apostrophes, hyphens and spaces; the comma must be followed
# by at least one whitespace character.
NAME_RE = re.compile(r'^([A-Za-z][A-Za-z\'\- ]+,\s+[A-Za-z][A-Za-z\'\- ]+)$')
# Whole-line preferred name, with an optional "Preferred:" prefix.
# NOTE(review): because the prefix is optional, this also matches any other
# line made only of letters/apostrophes/hyphens/spaces — confirm that is
# acceptable for the directory layout being parsed.
PREF_RE = re.compile(r'^(?:Preferred:\s*)?([A-Za-z][A-Za-z\'\- ]+)$')


def extract(pdf_path: Path, out_img_dir: Path):
    """Parse the directory PDF into card rows and save each photo as a PNG.

    For every page, text lines matching ``NAME_RE`` start a new row; a later
    line matching ``PREF_RE`` sets the preferred name of the most recent row
    on that page. Each distinct image on the page is written to
    ``out_img_dir`` and attached to the still-unmatched name whose line top
    is vertically closest to the image top.

    Args:
        pdf_path: Path of the PDF to read.
        out_img_dir: Directory for the extracted PNGs (created if missing).

    Returns:
        A list of dicts with keys ``full_name``, ``pref_name`` (``None`` when
        no preferred name was seen) and ``img_file`` (``""`` when no photo
        was matched, otherwise the PNG path as a string).
    """
    rows = []

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        out_img_dir.mkdir(parents=True, exist_ok=True)

        for page in tqdm(doc, desc="Scanning PDF", unit="page"):
            page_dict = page.get_text("dict")
            names = []  # per-page bookkeeping: name, y-position, index into rows

            # (1) detect lines that are names
            for block in page_dict["blocks"]:
                for line in block.get("lines", []):
                    text = "".join(span["text"] for span in line["spans"]).strip()
                    if not text or "Ward Directory" in text:
                        continue
                    if len(text) == 1 and text.isalpha() and text.isupper():
                        continue  # section header

                    m_full = NAME_RE.match(text)
                    m_pref = PREF_RE.match(text)
                    if m_full:
                        names.append({
                            "full": m_full.group(1),
                            "pref": None,
                            "y": line["bbox"][1],  # top edge of the text line
                            "matched": False,
                            "row_idx": len(rows),
                        })
                        rows.append({
                            "full_name": m_full.group(1),
                            "pref_name": None,
                            "img_file": "",
                        })
                    elif m_pref and names:
                        # A preferred name belongs to the latest name on this page.
                        names[-1]["pref"] = m_pref.group(1)
                        rows[names[-1]["row_idx"]]["pref_name"] = names[-1]["pref"]

            # (2) pull images and match by Y distance
            seen_xrefs = set()
            for img_idx, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                if xref in seen_xrefs:
                    continue
                seen_xrefs.add(xref)

                # An image can be listed for the page without being drawn on
                # it; it then has no rectangle and cannot be positioned.
                rects = page.get_image_rects(xref)
                if not rects:
                    continue
                y0 = rects[0].y0

                pix = fitz.Pixmap(doc, xref)
                if pix.n - pix.alpha >= 4:
                    # CMYK cannot be written as PNG: convert to RGB first.
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                img_name = f"{page.number+1:03d}_{img_idx}.png"
                img_path = out_img_dir / img_name
                pix.save(img_path)

                # match to closest unmatched name
                cands = [n for n in names if not n["matched"]]
                if not cands:
                    continue
                closest = min(cands, key=lambda n: abs(n["y"] - y0))
                closest["matched"] = True
                rows[closest["row_idx"]]["img_file"] = str(img_path)

    return rows


def write_tsv(rows, tsv_path: Path):
    """Write the flashcard rows to a tab-separated file with a header row.

    The "Front" column holds the preferred name, falling back to the full
    name; the "Back" column holds an ``<img>`` tag for the photo (when the
    row has one) immediately followed by the full name.

    Args:
        rows: Iterable of dicts with ``full_name``, ``pref_name`` and
            ``img_file`` keys.
        tsv_path: Destination file; overwritten if it already exists.
    """

    def _card(row):
        # Build the [front, back] pair for one person.
        photo = row["img_file"]
        if photo:
            picture = f'<img src="{photo}">'
        else:
            picture = ""
        name_side = row["pref_name"] or row["full_name"]
        return [name_side, f"{picture}{row['full_name']}"]

    with tsv_path.open("w", newline="", encoding="utf8") as handle:
        writer = csv.writer(handle, delimiter="\t")
        writer.writerow(["Front", "Back"])
        writer.writerows(_card(row) for row in rows)


def save_pdf(rows, pdf_path: Path):
    """Render one PDF page per person who has a photo.

    Each page is a two-panel figure: the left panel shows only the photo, the
    right panel shows the preferred name in bold with the full name below it.
    When no preferred name is stored, the part of the full name after the
    comma is used instead.

    Args:
        rows: List of dicts with ``full_name``, ``pref_name`` and
            ``img_file`` keys.
        pdf_path: Destination PDF. Nothing is written when no row has a
            photo (including when ``rows`` is empty); a notice is printed
            instead.
    """
    # Filter in plain Python: an empty ``rows`` list would produce a
    # DataFrame without an "img_file" column and fail on the column lookup.
    with_photo = [r for r in rows if r["img_file"]]
    if not with_photo:
        print("No photos found – skipping PDF generation.")
        return

    # Imported late so the no-photo path does not need the PDF backend.
    from matplotlib.backends.backend_pdf import PdfPages

    with PdfPages(pdf_path) as pdf:
        for r in with_photo:
            full_name = r["full_name"]
            # partition() tolerates any whitespace after the comma and a
            # name with no comma at all, where split(", ")[1] would raise.
            given = full_name.partition(",")[2].strip()
            front_name = r["pref_name"] or given or full_name

            fig, axes = plt.subplots(1, 2, figsize=(8.5, 5.5))
            try:
                for ax in axes:
                    ax.axis("off")

                # Side A (left): PHOTO ONLY. The file handle is released as
                # soon as the pixels have been handed to the axes.
                with Image.open(r["img_file"]) as img:
                    axes[0].imshow(img)

                # Side B (right): preferred (bold) + full name
                axes[1].text(0.5, 0.6, front_name, ha="center", va="center",
                             fontsize=28, weight="bold", transform=axes[1].transAxes)
                axes[1].text(0.5, 0.3, full_name, ha="center", va="center",
                             fontsize=16, transform=axes[1].transAxes)

                pdf.savefig(fig, bbox_inches="tight")
            finally:
                # Always free the figure, even if rendering this card failed.
                plt.close(fig)


def write_missing_txt(rows, txt_path: Path):
    """List everyone without a photo, one full name per line.

    The file is only created when at least one row has an empty
    ``img_file``; otherwise nothing is written.

    Args:
        rows: Iterable of dicts with ``full_name`` and ``img_file`` keys.
        txt_path: Destination text file; overwritten if it already exists.
    """
    photoless = list(filter(lambda row: not row["img_file"], rows))
    if photoless:
        with txt_path.open("w", encoding="utf8") as out:
            out.writelines(f"{row['full_name']}\n" for row in photoless)


def zip_bundle(bundle_path: Path, artefacts):
    """Pack the given files and directories into one deflated ZIP archive.

    Files are stored under their bare file name. Directories are stored
    recursively, with member paths relative to the directory's parent so the
    directory name is kept inside the archive.

    Args:
        bundle_path: Archive to create; overwritten if it already exists.
        artefacts: Iterable of paths (``str`` or ``Path``). Paths that do
            not exist are skipped rather than aborting the bundle.
    """
    with zipfile.ZipFile(bundle_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for art in artefacts:
            art = Path(art)
            if art.is_dir():
                # Sorted so the archive order does not depend on the
                # filesystem's directory listing order.
                for member in sorted(art.rglob("*")):
                    zf.write(member, member.relative_to(art.parent))
            elif art.exists():
                zf.write(art, art.name)
            # else: an optional artefact was not produced by an earlier
            # stage — leave it out instead of raising FileNotFoundError.


# ──────────────────────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────────────────────

def main():
    """Command-line entry point: run every pipeline stage and report outputs.

    Reads the input PDF path and an optional ``-o/--output_base`` from the
    command line, derives all output paths from that base name, runs
    extraction, TSV/PDF/TXT generation and bundling, then prints the list of
    output paths.
    """
    cli = argparse.ArgumentParser(description="Ward‑directory flashcard pipeline")
    cli.add_argument("pdf", type=Path, help="Input ward_directory.pdf")
    cli.add_argument("-o", "--output_base", default="flashcards", help="Base filename for outputs")
    opts = cli.parse_args()

    # Strip any extension so each output gets its own suffix.
    stem = Path(opts.output_base).with_suffix("")
    out_dir = stem.parent

    tsv_out = stem.with_suffix(".tsv")
    pdf_out = stem.with_suffix(".pdf")
    photos_dir = out_dir / "images"
    no_photo_out = out_dir / "no_photo.txt"
    bundle_out = out_dir / f"{stem.name}_bundle.zip"

    people = extract(opts.pdf, photos_dir)
    write_tsv(people, tsv_out)
    save_pdf(people, pdf_out)
    write_missing_txt(people, no_photo_out)

    outputs = [tsv_out, pdf_out, photos_dir, no_photo_out]
    zip_bundle(bundle_out, outputs)

    print("✨ Done! Generated:")
    for produced in outputs + [bundle_out]:
        print(" •", produced)



In [None]:
# Inputs: the exported directory PDF and the base name shared by the outputs.
pdf_path = Path("ward_directory.pdf")
output_base = Path("flashcards")  # or Path("my_cards")

# Output locations, named once so every stage and the bundler agree on them.
images_dir = Path("images/")
tsv_file = output_base.with_suffix(".tsv")
cards_pdf = output_base.with_suffix(".pdf")
no_photo_file = Path("no_photo.txt")
bundle_file = Path(f"{output_base.name}_bundle.zip")

# Run the stages in order: extract, TSV, card PDF, missing-photo list, ZIP.
rows = extract(pdf_path, images_dir)
write_tsv(rows, tsv_file)
save_pdf(rows, cards_pdf)
write_missing_txt(rows, no_photo_file)
zip_bundle(bundle_file, [tsv_file, cards_pdf, images_dir, no_photo_file])

Scanning PDF: 100%|██████████| 19/19 [00:01<00:00, 15.62page/s]


In [None]:
import os
from pathlib import Path
from IPython.display import display, FileLink

def download_zip(zip_path: Path):
    """Offer the bundle to the user as a download.

    When the ``google.colab`` package can be imported, its ``files.download``
    helper is called on the archive. Otherwise an IPython ``FileLink`` to the
    archive is displayed.

    Args:
        zip_path: Location of the archive (``str`` or ``Path``).

    Raises:
        FileNotFoundError: If ``zip_path`` does not exist.
    """
    target = Path(zip_path)
    if not target.exists():
        raise FileNotFoundError(f"{target} does not exist.")

    location = str(target)
    try:
        # Only importable inside Colab.
        from google.colab import files as colab_files
        colab_files.download(location)
    except ImportError:  # also covers ModuleNotFoundError, its subclass
        # Outside Colab: show a clickable link instead.
        link = FileLink(location, result_html_prefix="Click here to download: ")
        display(link)


In [None]:
# Offer the ZIP named after the output base as a download.
bundle_zip = Path(f"{output_base.name}_bundle.zip")
download_zip(bundle_zip)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>