In [6]:
import pdfplumber
import re
from collections import Counter

def find_chapters_by_font(pdf_path):
    """
    Find potential chapter titles by looking for text set in a larger font.

    The most frequent word font size in the document is taken as the body-text
    size. For every page, the characters more than 2 units larger than that are
    extracted as one string; the string is kept when it starts with a common
    chapter prefix ("Chapter 3 ...", "IV - ...", "3. ...").

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A list of matching strings, one per page that matched. The list is
        empty when no words (and therefore no font sizes) could be extracted,
        so callers can always iterate over the result or test its truthiness.
    """
    chapters = []

    # Compiled once; the pattern does not depend on the page being inspected.
    chapter_pattern = re.compile(r'^(Chapter\s+\d+|[IVXLCDM]+\s*-|\d+\.\s+)[A-Za-z\s]+')

    with pdfplumber.open(pdf_path) as pdf:
        font_sizes = [
            word['size']
            for page in pdf.pages
            for word in page.extract_words(extra_attrs=['fontname', 'size'])
        ]

        if not font_sizes:
            # Nothing extractable: return the (empty) list rather than a
            # message string, which a caller would iterate char by char.
            return chapters

        # Most common font size (likely the body text).
        most_common_size = Counter(font_sizes).most_common(1)[0][0]

        # Characters must exceed the body size by this margin to count as
        # "large"; adjust the tolerance as needed.
        size_cutoff = most_common_size + 2

        for page in pdf.pages:
            large_text = page.filter(
                lambda obj: obj["object_type"] == "char" and obj["size"] > size_cutoff
            ).extract_text()

            # A page without large characters may yield an empty string or
            # None; neither can be a title and None would break the regex.
            if not large_text:
                continue

            if chapter_pattern.match(large_text):
                chapters.append(large_text)

    return chapters

# PDF to scan (absolute, machine-specific path).
pdf_file = r"C:\Users\abiju\Desktop\Project-Velcro\REference_textbook\Charlie and the Chocolate Factory (Roald Dahl).pdf"
found_chapters = find_chapters_by_font(pdf_file)

# Report each candidate, or a single fallback line when nothing was found.
if not found_chapters:
    print("Could not identify chapters based on font size.")
else:
    for chapter in found_chapters:
        print(f"Found potential chapter: {chapter}")

Could not identify chapters based on font size.


In [7]:
import fitz  # PyMuPDF

def get_chapters_from_toc(pdf_path, level=1):
    """
    Extract chapter titles and page numbers from a PDF's table of contents.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        level: Outline level to keep. Defaults to 1 (top-level entries), which
            are often the chapters; pass a deeper level when the chapters are
            nested under a top-level entry such as the book title.

    Returns:
        A list of (title, page_num) tuples in table-of-contents order for the
        entries at the requested level. Empty when there are none.
    """
    # The context manager closes the document once the outline has been read,
    # so the file handle is not left open.
    with fitz.open(pdf_path) as doc:
        toc = doc.get_toc()

    return [
        (title, page_num)
        for entry_level, title, page_num in toc
        if entry_level == level
    ]

# PDF whose outline is inspected (absolute, machine-specific path).
pdf_file = r"C:\Users\abiju\Desktop\Project-Velcro\REference_textbook\Charlie and the Chocolate Factory (Roald Dahl).pdf"
chapter_info = get_chapters_from_toc(pdf_file)

# Print one line per outline entry, or a fallback when the list is empty.
if not chapter_info:
    print("No table of contents found.")
else:
    for title, page in chapter_info:
        print(f"Chapter: {title}, Page: {page}")

Chapter: Dedication, Page: 2
Chapter: About the Author, Page: 6
Chapter: Title Page, Page: 7
Chapter: Copyright Page, Page: 8
Chapter: Contents, Page: 10
Chapter: Charlie and the Chocolate Factory, Page: 12


In [13]:
#!/usr/bin/env python3
"""
Extract chapter titles from a PDF via:
  1) Built-in TOC / bookmarks
  2) Heuristics from page layout (font size + position + regex)

Usage:
  python extract_chapters.py /path/to/book.pdf
  python extract_chapters.py /path/to/book.pdf --csv chapters.csv

Output:
  Prints a JSON array of chapters to stdout and (optionally) writes CSV.

Requires:
  pip install pymupdf
"""

from __future__ import annotations
import argparse
import csv
import json
import os
import re
from dataclasses import dataclass, asdict
from typing import List, Dict, Any

import fitz  # PyMuPDF

# ------------------------------- Data model -------------------------------- #

@dataclass
class Chapter:
    """A single chapter heading found in a PDF, with where and how it was found."""
    title: str            # heading text (the extractors pass it through normalize_heading)
    page: int             # 1-based page index
    level: int | None     # TOC level if available (1 = top level); None for heuristic results
    method: str           # "toc" or "heuristic"

# ------------------------------- Utilities --------------------------------- #

def percentile(values: List[float], q: float) -> float:
    """
    Linear-interpolation quantile of `values` for `q` in 0..1, without numpy.

    An empty input yields 0.0. The input list is not modified.
    """
    if not values:
        return 0.0

    ordered = sorted(values)
    last = len(ordered) - 1

    # Fractional rank of the requested quantile within the sorted data.
    rank = last * q
    lo = int(rank)
    hi = lo + 1 if lo + 1 < last else last

    if lo == hi:
        # Already at the final element: nothing to interpolate with.
        return ordered[lo]

    weight = rank - lo
    return ordered[lo] * (1 - weight) + ordered[hi] * weight


def normalize_heading(text: str) -> str:
    """
    Normalize heading text for deduping.

    Bullet characters ("·" and "•") are removed, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is stripped.

    Args:
        text: Raw heading text.

    Returns:
        The cleaned heading; an empty string if nothing but bullets and
        whitespace was present.
    """
    # Drop bullets first: removing them after the whitespace pass would leave
    # stray leading or doubled spaces ("• Intro" -> " Intro") and make
    # otherwise identical headings compare as different.
    without_bullets = re.sub(r"[·•]+", "", text)
    return re.sub(r"\s+", " ", without_bullets).strip()


def likely_titlecase_or_caps(text: str) -> bool:
    """
    Tell whether `text` looks Title Case-ish or ALL CAPS-ish.

    Text shorter than 3 characters, or containing no alphabetic word, is
    rejected. Otherwise the text qualifies when at least 60% of its words
    start with an uppercase letter, or at least 60% are fully uppercase.
    """
    if len(text) < 3:
        return False

    # A word starts with a letter and may continue with letters, "-" or "'".
    tokens = re.findall(r"[A-Za-z][A-Za-z\-']*", text)
    total = len(tokens)
    if total == 0:
        return False

    capitalized = sum(1 for tok in tokens if tok[0].isupper())
    if capitalized / total >= 0.6:
        return True

    shouting = sum(1 for tok in tokens if tok.isupper())
    return shouting / total >= 0.6


# Regexes for typical chapter-heading shapes; each is anchored at the start of
# the line text and tried in order by the layout heuristic.
CHAPTER_PATTERNS = [
    re.compile(source)
    for source in (
        # "Chapter 3" / "CHAPTER IV"
        r"^\s*(Chapter|CHAPTER)\s+([IVXLCDM]+|\d+)\b",
        # Roman numeral, dot, capitalized text: "I. INTRODUCTION"
        r"^\s*[IVXLCDM]+\.\s+[A-Z]",
        # Arabic numeral, dot, capitalized text: "1. Getting Started"
        r"^\s*\d+\.\s+[A-Z]",
        # Whole line in capitals (spaces, "-" and ":" allowed): ALL CAPS headings
        r"^\s*[A-Z][A-Z \-:]{3,}$",
    )
]

# --------------------------- Extraction methods ---------------------------- #

def extract_from_toc(doc: fitz.Document) -> List[Chapter]:
    """
    Build chapters from the document outline (bookmarks / TOC).

    Every outline entry whose normalized title is non-empty becomes a
    Chapter carrying the entry's level and page, with method='toc'.
    An absent outline yields an empty list.
    """
    # get_toc(simple=True) -> [[level, title, page], ...]
    outline = doc.get_toc(simple=True) or []

    # Normalize each title exactly once, keeping level and page alongside it.
    normalized = (
        (level, normalize_heading(title), page)
        for level, title, page in outline
    )
    return [
        Chapter(title=clean_title, page=page, level=level, method="toc")
        for level, clean_title, page in normalized
        if clean_title
    ]


def _collect_page_lines(page) -> List[tuple]:
    """Return (text, y0, size_max, fontnames) for every non-empty text line of `page`."""
    entries = []
    # "dict" output is nested as blocks -> lines -> spans, with 'size' per span.
    page_dict = page.get_text("dict")

    for block in page_dict.get("blocks", []):
        if block.get("type", 0) != 0:
            continue  # skip non-text blocks
        for line in block.get("lines", []):
            spans = line.get("spans", [])
            if not spans:
                continue
            text = "".join(s.get("text", "") for s in spans).strip()
            if not text:
                continue
            size_max = max(float(s.get("size", 0)) for s in spans)
            # bbox[1] is the line's top edge; 0 when the bbox is missing.
            y0 = float(line.get("bbox", [0, 0, 0, 0])[1])
            fontnames = [s.get("font", "") or "" for s in spans]
            entries.append((text, y0, size_max, fontnames))
    return entries


def extract_by_layout_heuristics(
    doc: fitz.Document,
    top_fraction: float = 0.28,
    size_quantile: float = 0.92,
    max_title_len: int = 140,
) -> List[Chapter]:
    """
    Heuristic heading finder.

    For each page, the font-size distribution of its text lines is computed
    and a line is reported as a heading when all of the following hold:
      * the line starts near the page top (y0 <= page height * top_fraction)
      * the line's largest span size reaches the page's `size_quantile`
        quantile of line sizes (with a 1e-6 tolerance)
      * the line is at most `max_title_len` characters long
      * the text matches one of CHAPTER_PATTERNS, OR looks Title Case /
        ALL CAPS, OR any of its font names contains "bold"

    Args:
        doc: Open PyMuPDF document; iterated page by page.
        top_fraction: Fraction of the page height counted as "top of page".
        size_quantile: Quantile (0..1) of per-page line sizes a heading must reach.
        max_title_len: Longest line, in characters, still accepted as a heading.

    Returns:
        Chapters in page order with 1-based page numbers, level=None and
        method="heuristic". Each normalized title is reported only once, at
        its first occurrence, so no further de-duplication pass is needed.
    """
    found: List[Chapter] = []
    seen_titles: set[str] = set()

    for pno, page in enumerate(doc, start=1):
        line_entries = _collect_page_lines(page)
        if not line_entries:
            continue

        # Size a line must reach to count as "large" on this particular page.
        big_threshold = percentile(
            [size for _text, _y0, size, _fonts in line_entries],
            size_quantile,
        )

        # Lines starting below this y coordinate are not considered.
        top_cutoff = page.rect.height * top_fraction

        for text, y0, size_max, fontnames in line_entries:
            if y0 > top_cutoff:
                continue
            if size_max + 1e-6 < big_threshold:
                continue
            if len(text) > max_title_len:
                continue

            # Any one hint is enough: known pattern, capitalization style,
            # or a bold font.
            looks_like_heading = (
                any(pat.match(text) for pat in CHAPTER_PATTERNS)
                or likely_titlecase_or_caps(text)
                or any("bold" in f.lower() for f in fontnames)
            )
            if not looks_like_heading:
                continue

            title_norm = normalize_heading(text)
            if title_norm and title_norm not in seen_titles:
                seen_titles.add(title_norm)
                found.append(Chapter(title=title_norm, page=pno, level=None, method="heuristic"))

    return found


def extract_chapters(pdf_path: str) -> List[Chapter]:
    """
    Extract chapters from the PDF at `pdf_path`.

    The document outline (TOC) is used when it yields any entry; otherwise
    headings are inferred from the page layout. When more than 80 headings
    are inferred, only the first one found on each page is kept.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The list of Chapter records, from the TOC or from the heuristic.
    """
    # The context manager closes the document on every exit path; the
    # returned Chapter records hold plain values and stay valid afterwards.
    with fitz.open(pdf_path) as doc:
        chapters = extract_from_toc(doc)
        if chapters:
            return chapters

        # Fallback: infer headings
        inferred = extract_by_layout_heuristics(doc)

    # If we inferred *too many* (e.g., section heads), thin them out.
    if len(inferred) > 80:
        # crude down-sampling: keep only the first heading seen on each page
        seen_pages: set[int] = set()
        slim: List[Chapter] = []
        for ch in inferred:
            if ch.page not in seen_pages:
                slim.append(ch)
                seen_pages.add(ch.page)
        return slim

    return inferred


# ---------------------------------- CLI ------------------------------------ #

def main():
    """
    Extract chapters from a hard-coded PDF path and print them as a JSON array.

    The argparse-based CLI and the optional CSV export are currently
    commented out; they are kept below for reference.
    """
    # --- disabled CLI argument handling ---
    # parser = argparse.ArgumentParser(description="Extract chapters from a PDF.")
    # parser.add_argument("pdf", help="Path to PDF")
    # parser.add_argument("--csv", help="Optional CSV output path", default=None)
    # args = parser.parse_args()
    #
    # if not os.path.isfile(args.pdf):
    #     raise SystemExit(f"File not found: {args.pdf}")

    pdf_path = r"C:\Users\abiju\Desktop\Project-Velcro\REference_textbook\Chocolate.pdf"
    records = [asdict(chapter) for chapter in extract_chapters(pdf_path)]

    # JSON goes to stdout; non-ASCII characters are written as-is.
    payload = json.dumps(records, ensure_ascii=False, indent=2)
    print(payload)

    # --- disabled optional CSV export ---
    # if args.csv:
    #     with open(args.csv, "w", newline="", encoding="utf-8") as f:
    #         w = csv.writer(f)
    #         w.writerow(["title", "page", "level", "method"])
    #         for c in chapters:
    #             w.writerow([c.title, c.page, c.level if c.level is not None else "", c.method])


if __name__ == "__main__":
    main()


[
  {
    "title": "Charlie and the",
    "page": 1,
    "level": null,
    "method": "heuristic"
  },
  {
    "title": "Chocolate Factory",
    "page": 1,
    "level": null,
    "method": "heuristic"
  },
  {
    "title": "Charlie and the Chocolate Factory",
    "page": 2,
    "level": null,
    "method": "heuristic"
  },
  {
    "title": "- by Roald Dahl",
    "page": 2,
    "level": null,
    "method": "heuristic"
  },
  {
    "title": "ROALD DAHL",
    "page": 3,
    "level": null,
    "method": "heuristic"
  },
  {
    "title": "AUGUSTUS GLOOP",
    "page": 4,
    "level": null,
    "method": "heuristic"
  },
  {
    "title": "VERUCA SALT",
    "page": 4,
    "level": null,
    "method": "heuristic"
  },
  {
    "title": "VIOLET BEAUREGARDE",
    "page": 4,
    "level": null,
    "method": "heuristic"
  },
  {
    "title": "MIKE TEAVEE",
    "page": 4,
    "level": null,
    "method": "heuristic"
  },
  {
    "title": "2",
    "page": 6,
    "level": null,
    "method": "heuristic

In [15]:
import fitz  # pip install pymupdf

# Open the book and list the title of every outline entry, one per line.
doc = fitz.open(r"C:\Users\abiju\Desktop\Project-Velcro\REference_textbook\Charlie and the Chocolate Factory (Roald Dahl).pdf")

# get_toc(simple=True) -> [[level, title, page], ...]
toc = doc.get_toc(simple=True)

for level, title, page in toc:
    # Only the title is shown; level and page are unpacked but not printed.
    print(title)


Dedication
About the Author
Title Page
Copyright Page
Contents
Charlie and the Chocolate Factory
1 Here Comes Charlie
2 Mr Willy Wonka's Factory
3 Mr Wonka and the Indian Prince
4 The Secret Workers
5 The Golden Tickets
6 The First Two Finders
7 Charlie's Birthday
8 Two More Golden Tickets Found
9 Grandpa Joe Takes a Gamble
10 The Family Begins to Starve
11 The Miracle
12 What It Said on the Golden Ticket
13 The Big Day Arrives
14 Mr Willy Wonka
15 The Chocolate Room
16 The Oompa-Loompas
17 Augustus Gloop Goes up the Pipe
18 Down the Chocolate River
19 The Inventing Room – Everlasting Gobstoppers and Hair Toffee
20 The Great Gum Machine
21 Good-bye Violet
22 Along the Corridor
23 Square Sweets That Look Round
24 Veruca in the Nut Room
25 The Great Glass Lift
26 The Television-Chocolate Room
27 Mike Teavee is Sent by Television
28 Only Charlie Left
29 The Other Children Go Home
30 Charlie's Chocolate Factory
