In [1]:
import pymupdf
from history_book.text_processing.text_processing import replace_ligatures
from history_book.utils.utils import print_with_wrapping

In [2]:
book_file = "../data/penguin_history_6.pdf"

In [3]:
doc = pymupdf.open(book_file)

In [4]:
doc

Document('../data/penguin_history_6.pdf')

## Basic text reading

In [5]:
for page in doc[:10]:
    print(page.get_text("text"))



J. M. Roberts and O. A. Westad
 
T HE PEN GUI N HI ST O RY O F T HE WO RLD
SIXTH EDITION

Contents
List of Maps
Preface to the Sixth Edition
BOOK ONE
BEFORE HISTORY
Introduction
1   The Foundations
2   Homo Sapiens
3   The Possibility of Civilization
BOOK TWO
CIVILIZATIONS
Introduction
1   Early Civilized Life
2   Ancient Mesopotamia
3   Ancient Egypt
4   Intruders and Invaders
5   The Beginnings of Civilization in South Asia
6   Ancient China
7   The Other Worlds of the Ancient Past
8   Transformations
BOOK THREE

THE CLASSICAL AGE
Introduction
1   Remaking the Old World
2   The Greeks
3   The Hellenistic World
4   Rome
5   Christianity and the Western Transition
6   Classical India
7   Classical China
BOOK FOUR
THE AGE OF DIVERGING TRADITIONS
Introduction
1   The Central Eurasian Crossroads
2   Islam and the Arab Empires
3   Byzantium and Its Sphere
4   The New Middle East and the Making of Europe
5   India
6   Imperial China
7   Japan
8   Worlds Apart
9   Europe: The Possibility o

In [6]:
page = doc[20]
print(page.get_text("text"))

1
The Foundations
The roots of history lie in the pre-human past and it is hard (but
important) to grasp just how long ago that was. If we think of a
century on our calendar as a minute on some great clock recording
the passage of time, then Europeans began to settle in the
Americas only about ɹve minutes ago. Slightly less than ɹfteen
minutes before that, Christianity appeared. Rather more than an
hour ago people settled in southern Mesopotamia who were soon
to evolve the oldest civilization known to us. This is already well
beyond the furthest margin of written record; according to our
clock, people began writing down the past much less than an hour
ago, too. Some six or seven hours further back on our scale, and
much more remote, we can discern the ɹrst recognizable human
beings of a modern physiological type already established in
western Europe. Behind them, anything from a fortnight to three
weeks earlier, appear the ɹrst traces of creatures with some human
characteristics whose 

In [7]:
page = doc[21]
text = page.get_text("text", flags=None)

In [8]:
text

'past even more remote than the much shorter period of time – 4½\nmillion years or so – in which creatures with at least some claim to\nhuman qualities are known to have existed. Though it is not our\ndirect concern, we need to try to understand what was in the\nbaggage of advantages and disadvantages with which human\nbeings alone among the primates emerged after these huge tracts\nof time as change-makers. Virtually all the physical and much of\nthe mental formation we still take for granted was by then\ndetermined, ɹxed in the sense that some possibilities were excluded\nand others were not. The crucial process is the evolution of human\ncreatures as a distinct branch among the primates, for it is at this\nfork in the line, as it were, that we begin to look out for the station\nat which we get oʃ for History. It is here that we can hope to ɹnd\nthe ɹrst signs of that positive, conscious impact upon environment\nwhich marks the ɹrst stage of human achievement.\nThe bedrock of the sto

## Ligatures: identify non-latin characters

In [9]:
len(doc)  # pages

1700

In [10]:
text = ""
total_length = 0
for page in doc:
    page_text = page.get_text("text")
    text += page_text
    total_length += len(page_text)
    last_page_text = page_text
print(total_length)

3148719


In [11]:
last_page_text

'ALLEN LANE\nPublished by the Penguin Group\nPenguin Books Ltd, 80 Strand, London WC2R 0RL, England\nPenguin Group (USA) Inc., 375 Hudson Street, New York, New York 10014, USA\nPenguin Group (Canada), 90 Eglinton Avenue East, Suite 700, Toronto, Ontario, Canada M4P\n2Y3 (a division of Pearson Canada Inc.)\nPenguin Ireland, 25 St Stephen’s Green, Dublin 2, Ireland (a division of Penguin Books Ltd)\nPenguin Group (Australia), 707 Collins Street, Melbourne, Victoria 3008, Australia (a\ndivision of Pearson Australia Group Pty Ltd)\nPenguin Books India Pvt Ltd, 11 Community Centre, Panchsheel Park, New Delhi – 110 017,\nIndia\nPenguin Group (NZ), 67 Apollo Drive, Rosedale, Auckland 0632, New Zealand (a division of\nPearson New Zealand Ltd)\nPenguin Books (South Africa) (Pty) Ltd, Block D, Rosebank Oɽce Park, 181 Jan Smuts\nAvenue, Parktown North, Gauteng 2193, South Africa\nPenguin Books Ltd, Registered Oɽces: 80 Strand, London WC2R 0RL, England\nwww.penguin.com\nFirst published in Great Br

In [12]:
len(text)

3148719

In [13]:
type(text)

str

In [14]:
unique_chars = set(text)

In [15]:
unique_chars = sorted(unique_chars)

In [16]:
len(unique_chars)

120

In [17]:
unique_chars

['\n',
 ' ',
 '!',
 '$',
 '%',
 '(',
 ')',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '=',
 '?',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '\xa0',
 '£',
 '©',
 '°',
 '´',
 '¼',
 '½',
 'Ç',
 'Î',
 'Ö',
 'à',
 'á',
 'â',
 'ä',
 'ç',
 'è',
 'é',
 'ê',
 'í',
 'î',
 'ó',
 'ô',
 'ö',
 'ü',
 'ć',
 'ł',
 'ō',
 'š',
 'ɹ',
 'ɻ',
 'ɽ',
 'ʀ',
 'ʃ',
 '–',
 '‘',
 '’',
 '“',
 '”',
 '…',
 '人']

In [18]:
weird_chars = unique_chars[-15:-7] + unique_chars[-1:]
weird_chars

['ł', 'ō', 'š', 'ɹ', 'ɻ', 'ɽ', 'ʀ', 'ʃ', '人']

In [19]:
# get locations of character in string
import re


def get_char_locations(text, char):
    return [m.start() for m in re.finditer(char, text)]

In [20]:
for char in weird_chars:
    print(char)
    weird_char_locs = get_char_locations(text, char)
    for i in range(0, min(10, len(weird_char_locs))):
        print(weird_char_locs[i] - 10, weird_char_locs[i] + 10)
        print("\t" + text[weird_char_locs[i] - 10 : weird_char_locs[i] + 10])

ł
1620775 1620795
	he Radziwiłłs, owned
1620776 1620796
	e Radziwiłłs, owned 
2992388 2992408
	r, Lech Wałesa, a de
2993472 2993492
	ed, but Wałesa strov
2993630 2993650
	ay from Wałesa’s
con
2995485 2995505
	st 1989 Wałesa annou
3004047 3004067
	0, Lech Wałesa becam
ō
1204751 1204771
	itle of
shōgun. They
1204931 1204951
	 in the shōgun’s int
1210878 1210898
	for the shōguns to c
1211239 1211259
	neither shōgun nor
e
1215162 1215182
	 and the
Nō drama.
I
1217628 1217648
	itle
of shōgun and s
1218074 1218094
	ip. The shōguns
them
1218461 1218481
	okugawa
shōgun, was 
1219014 1219034
	 to the shōgun in th
1219373 1219393
	 at the
shōgun’s cou
š
2724295 2724315
	bodan Milošević and 
3027882 3027902
	bodan Milošević, and
3029046 3029066
	erbs, Milošević was 
3084091 3084111
	bodan Milošević, had
ɹ
3293 3313
	published ɹfteen boo
6700 6720
	bably the ɹnest ever
6739 6759
	h. When I ɹrst read 
7112 7132
	 does not ɹt
easily 
7793 7813
	n updated
ɹfth editi
7908 7928
	s to an unɹnished re
84

In [21]:
# LIGATURES = {
#     "ɻ": "fl",
#     "ɹ": "fi",
#     "ɽ": "ffi",
#     "ʀ": "ffl",
#     "ʃ": "ff",
# }
# def replace_ligatures(text: str) -> str:
#     for ligature, replacement in LIGATURES.items():
#         text = text.replace(ligature, replacement)
#     return text

In [22]:
text_fixed = replace_ligatures(text)

In [23]:
# note: ligature replacement increases character count --- doesn't line up with original text
for char in weird_chars:
    print(char)
    weird_char_locs = get_char_locations(text, char)
    for i in range(0, min(10, len(weird_char_locs))):
        print("\t" + text_fixed[weird_char_locs[i] - 20 : weird_char_locs[i] + 20])

ł
	 poll
tax, for example, and being ruled,
	poll
tax, for example, and being ruled, 
	 grew as the USSR showed signs of
growin
	 an agreement over intermediate-range nu
	had held long enough for the first
stand
	ht hung
over intellectual life and polit
	n other Communist countries, whose leade
ō
	t cultural
attraction and dependence had
	eriod of centralization
and major effort
	 of
men seeking security in troubled tim
	 warrior clans in an elementary tie
of s
	no merely passive acceptance of a foreig
	es but then blending with their
traditio
	ese life down to the present day. Buddhi
	rama.
In particular areas, the lawlessne
	rts – particularly the exquisite
example
	able to carry the costs of civil strife 
š
	h phase of a
country’s economy. They oft
	izers with liberalization in the governi
	 of liberalization. This cut both ways, 
	were killed and millions (out of
a total
ɹ
	s. He has published fifteen books on mod
	orld is probably the finest ever produce
	ed in
English. When I fir

## Check first chapter

In [24]:
doc[20].get_text("text")

'1\nThe Foundations\nThe roots of history lie in the pre-human past and it is hard (but\nimportant) to grasp just how long ago that was. If we think of a\ncentury on our calendar as a minute on some great clock recording\nthe passage of time, then Europeans began to settle in the\nAmericas only about ɹve minutes ago. Slightly less than ɹfteen\nminutes before that, Christianity appeared. Rather more than an\nhour ago people settled in southern Mesopotamia who were soon\nto evolve the oldest civilization known to us. This is already well\nbeyond the furthest margin of written record; according to our\nclock, people began writing down the past much less than an hour\nago, too. Some six or seven hours further back on our scale, and\nmuch more remote, we can discern the ɹrst recognizable human\nbeings of a modern physiological type already established in\nwestern Europe. Behind them, anything from a fortnight to three\nweeks earlier, appear the ɹrst traces of creatures with some human\nchar

In [25]:
doc[40].get_text("text")

'of Homo erectus it is his human, not pre-human, characteristics\nwhich are most striking. Physically, he has a brain of an order of\nmagnitude comparable to our own. He makes tools (and does so\nwithin more than one technical tradition), builds shelters, takes\nover natural refuges by exploiting ɹre, and sallies out of them to\nhunt and gather his food. He does this in groups with a discipline\nwhich can sustain complicated operations; he therefore has some\nability to exchange ideas by speech. The basic biological units of\nhis hunting groups probably preɹgure the nuclear human family,\nbeing founded on the institutions of the home base and a sexual\ndiʃerentiation of activity. There may even be some complexity of\nsocial organization in so far as ɹre-bearers and gatherers or old\ncreatures whose memories made them the databanks of their\n‘societies’ could be supported by the labour of others. There has to\nbe some social organization to permit the sharing of co-operatively\nobtained

In [26]:
# get text from chapter 1 and concat into one string
chapter_1_text = ""
for page in doc[20:41]:
    chapter_1_text += page.get_text("text")
print(chapter_1_text)

1
The Foundations
The roots of history lie in the pre-human past and it is hard (but
important) to grasp just how long ago that was. If we think of a
century on our calendar as a minute on some great clock recording
the passage of time, then Europeans began to settle in the
Americas only about ɹve minutes ago. Slightly less than ɹfteen
minutes before that, Christianity appeared. Rather more than an
hour ago people settled in southern Mesopotamia who were soon
to evolve the oldest civilization known to us. This is already well
beyond the furthest margin of written record; according to our
clock, people began writing down the past much less than an hour
ago, too. Some six or seven hours further back on our scale, and
much more remote, we can discern the ɹrst recognizable human
beings of a modern physiological type already established in
western Europe. Behind them, anything from a fortnight to three
weeks earlier, appear the ɹrst traces of creatures with some human
characteristics whose 

In [27]:
chapter_1_text = replace_ligatures(chapter_1_text)

In [28]:
print(chapter_1_text)

1
The Foundations
The roots of history lie in the pre-human past and it is hard (but
important) to grasp just how long ago that was. If we think of a
century on our calendar as a minute on some great clock recording
the passage of time, then Europeans began to settle in the
Americas only about five minutes ago. Slightly less than fifteen
minutes before that, Christianity appeared. Rather more than an
hour ago people settled in southern Mesopotamia who were soon
to evolve the oldest civilization known to us. This is already well
beyond the furthest margin of written record; according to our
clock, people began writing down the past much less than an hour
ago, too. Some six or seven hours further back on our scale, and
much more remote, we can discern the first recognizable human
beings of a modern physiological type already established in
western Europe. Behind them, anything from a fortnight to three
weeks earlier, appear the first traces of creatures with some human
characteristics wh

In [29]:
set(chapter_1_text)

{'\n',
 ' ',
 '(',
 ')',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'Y',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '½',
 '–',
 '‘',
 '’'}

In [30]:
chapter_1_text

'1\nThe Foundations\nThe roots of history lie in the pre-human past and it is hard (but\nimportant) to grasp just how long ago that was. If we think of a\ncentury on our calendar as a minute on some great clock recording\nthe passage of time, then Europeans began to settle in the\nAmericas only about five minutes ago. Slightly less than fifteen\nminutes before that, Christianity appeared. Rather more than an\nhour ago people settled in southern Mesopotamia who were soon\nto evolve the oldest civilization known to us. This is already well\nbeyond the furthest margin of written record; according to our\nclock, people began writing down the past much less than an hour\nago, too. Some six or seven hours further back on our scale, and\nmuch more remote, we can discern the first recognizable human\nbeings of a modern physiological type already established in\nwestern Europe. Behind them, anything from a fortnight to three\nweeks earlier, appear the first traces of creatures with some human\n

## Find book and chapter starts

In [None]:
book_start_pages = []
chapter_start_pages = []
for i, page in enumerate(doc):
    blocks = page.get_text("blocks")
    for block in blocks:
        if block[6] == 0:  # only text blocks
            text = block[4].strip()
            # check if text is a book header, e.g. 'Book One', 'Book Two', etc.
            if text.startswith("Book ") and len(text.split()) == 2:
                print(f"Found book header: {text}, on page {i + 1}")
                book_start_pages.append(i)
                chapter_start_pages.append(i)  # all book start are intro chapters too
            # for chapters: if first line is only a number, it's a chapter header
            elif text.isdigit() and len(text) < 3:  # assuming chapter numbers are short
                print(f"Found chapter header: {text}, on page {i + 1}")
                chapter_start_pages.append(i)

Found book header: Book One, on page 19
Found chapter header: 1, on page 21
Found chapter header: 2, on page 42
Found chapter header: 3, on page 59
Found book header: Book Two, on page 74
Found chapter header: 1, on page 76
Found chapter header: 2, on page 87
Found chapter header: 3, on page 110
Found chapter header: 4, on page 139
Found chapter header: 5, on page 173
Found chapter header: 6, on page 192
Found chapter header: 7, on page 219
Found chapter header: 8, on page 233
Found book header: Book Three, on page 242
Found chapter header: 1, on page 244
Found chapter header: 2, on page 249
Found chapter header: 3, on page 297
Found chapter header: 4, on page 317
Found chapter header: 5, on page 361
Found chapter header: 6, on page 438
Found chapter header: 7, on page 454
Found book header: Book Four, on page 474
Found chapter header: 1, on page 476
Found chapter header: 2, on page 484
Found chapter header: 3, on page 519
Found chapter header: 4, on page 557
Found chapter header: 5, o

In [111]:
# try with nested list
book_start_pages = []
chapter_start_pages = []
for i, page in enumerate(doc):
    blocks = page.get_text("blocks")
    for block in blocks:
        if block[6] == 0:  # only text blocks
            text = block[4].strip()
            # check if text is a book header, e.g. 'Book One', 'Book Two', etc.
            if text.startswith("Book ") and len(text.split()) == 2:
                print(f"Found book header: {text}, on page {i + 1}")
                if len(chapter_start_pages) > 0:
                    # dont' add empty list at the start
                    book_start_pages.append(chapter_start_pages)
                chapter_start_pages = [i]
            # for chapters: if first line is only a number, it's a chapter header
            elif text.isdigit() and len(text) < 3:  # assuming chapter numbers are short
                print(f"Found chapter header: {text}, on page {i + 1}")
                chapter_start_pages.append(i)
book_start_pages.append(chapter_start_pages)

Found book header: Book One, on page 19
Found chapter header: 1, on page 21
Found chapter header: 2, on page 42
Found chapter header: 3, on page 59
Found book header: Book Two, on page 74
Found chapter header: 1, on page 76
Found chapter header: 2, on page 87
Found chapter header: 3, on page 110
Found chapter header: 4, on page 139
Found chapter header: 5, on page 173
Found chapter header: 6, on page 192
Found chapter header: 7, on page 219
Found chapter header: 8, on page 233
Found book header: Book Three, on page 242
Found chapter header: 1, on page 244
Found chapter header: 2, on page 249
Found chapter header: 3, on page 297
Found chapter header: 4, on page 317
Found chapter header: 5, on page 361
Found chapter header: 6, on page 438
Found chapter header: 7, on page 454
Found book header: Book Four, on page 474
Found chapter header: 1, on page 476
Found chapter header: 2, on page 484
Found chapter header: 3, on page 519
Found chapter header: 4, on page 557
Found chapter header: 5, o

In [112]:
book_start_pages

[[18, 20, 41, 58],
 [73, 75, 86, 109, 138, 172, 191, 218, 232],
 [241, 243, 248, 296, 316, 360, 437, 453],
 [473, 475, 483, 518, 556, 623, 633, 663, 679, 697, 740],
 [771, 773, 794, 825, 865, 911, 950, 974],
 [1005, 1007, 1035, 1071, 1102, 1132, 1166, 1192],
 [1233, 1235, 1262, 1308, 1333, 1352, 1384],
 [1420, 1424, 1480, 1557, 1604, 1635, 1681]]

In [73]:
# # book starts are missing text for books 7 and 8, add by hand: -- fixed with new pdf
# book_start_pages.append(1229)  # Book Seven starts on page 19
# book_start_pages.append(1417)  # Book Eight starts on page 41

In [115]:
# book_start_pages

In [116]:
# chapter_start_pages

In [113]:
# add start of pages after last chapter
# chapter_start_pages.append(1699)
book_start_pages.append([1699])

In [117]:
book_start_pages

[[18, 20, 41, 58],
 [73, 75, 86, 109, 138, 172, 191, 218, 232],
 [241, 243, 248, 296, 316, 360, 437, 453],
 [473, 475, 483, 518, 556, 623, 633, 663, 679, 697, 740],
 [771, 773, 794, 825, 865, 911, 950, 974],
 [1005, 1007, 1035, 1071, 1102, 1132, 1166, 1192],
 [1233, 1235, 1262, 1308, 1333, 1352, 1384],
 [1420, 1424, 1480, 1557, 1604, 1635, 1681],
 [1699]]

## Get book and chapter titles from ToC

In [77]:
toc_page_start = 4
toc_page_end = 7
# concat blocks into one list
toc_blocks = []
for i in range(toc_page_start - 1, toc_page_end):
    toc_blocks.extend(doc[i].get_text("blocks"))

In [78]:
# book 5 chapter block is split across two pages. handle by combining blocks from both pages
# need to concat text from 2nd block into first, and delete the second block
def combine_blocks(blocks, index1, index2):
    blocks[index1][4] += " " + blocks[index2][4]  # concatenate text
    del blocks[index2]  # remove the second block

In [79]:
# convert list of tuples to list of lists
def convert_blocks_to_lists(blocks):
    return [list(block) for block in blocks]

In [80]:
toc_blocks = convert_blocks_to_lists(toc_blocks)

In [81]:
# also: book 2 chapter block has "introduction" in it's ownb block
combine_blocks(toc_blocks, 7, 8)

In [82]:
# book 4 chapters
combine_blocks(toc_blocks, 13, 14)

In [83]:
# book 5 chapters
combine_blocks(toc_blocks, 16, 17)

In [84]:
# book 8 chapters
combine_blocks(toc_blocks, 25, 26)

In [85]:
# chapter title block format:
# 'Introduction\n1\xa0\xa0 [chapter title]\n2\xa0\xa0 [chapter title]\n3\xa0\xa0 [chapter title]\n'
def extract_chapter_titles(block):
    chapter_titles = []
    text = block[4].strip()
    # split by newlines and take the first part
    parts = text.split("\n")
    for part in parts:
        if part == "Introduction":
            # introductions are considered chapter 0
            chapter_titles.append("Introduction")
        # split by whitespace and take the second part (the chapter title)
        elif "\xa0\xa0" in part:
            title_part = part.split("\xa0\xa0")[1]
            chapter_titles.append(title_part.strip())
        else:
            # double digit chapter numbers might not have the '\xa0\xa0' separator
            # in that case remove the leading number and any whitespace
            title_part = part.lstrip("0123456789 ").strip()
            if title_part:
                chapter_titles.append(title_part)
    return chapter_titles

In [86]:
# check
extract_chapter_titles(toc_blocks[4])

['Introduction',
 'The Foundations',
 'Homo Sapiens',
 'The Possibility of Civilization']

In [87]:
# process toc blocks to extract book and chapter titles
# format:
# block 0: 'Contents'
# block 1: [filler...]
# block 2: 'Book [n]'
# block 3: '[book title]'
# block 4: 'Introduction\n1\xa0\xa0 [chapter title]\n2\xa0\xa0 [chapter title]\n3\xa0\xa0 [chapter title]\n'
# blocks 2-4 repeat for each book...
book_titles = []
chapter_titles = []

for i, block in enumerate(toc_blocks):
    if i == 0:
        continue
    elif i == 1:
        continue
    elif i % 3 == 0:
        text = block[4].strip()
        book_titles.append(text)
    elif i % 3 == 1:
        chapter_titles.extend(extract_chapter_titles(block))

In [88]:
book_titles

['BEFORE HISTORY',
 'CIVILIZATIONS',
 'THE CLASSICAL AGE',
 'THE AGE OF DIVERGING TRADITIONS',
 'THE MAKING OF THE EUROPEAN AGE',
 'THE GREAT ACCELERATION',
 'THE END OF THE EUROPEAN AGE',
 'OUR OWN TIME']

In [89]:
chapter_titles

['Introduction',
 'The Foundations',
 'Homo Sapiens',
 'The Possibility of Civilization',
 'Introduction',
 'Early Civilized Life',
 'Ancient Mesopotamia',
 'Ancient Egypt',
 'Intruders and Invaders',
 'The Beginnings of Civilization in South Asia',
 'Ancient China',
 'The Other Worlds of the Ancient Past',
 'Transformations',
 'Introduction',
 'Remaking the Old World',
 'The Greeks',
 'The Hellenistic World',
 'Rome',
 'Christianity and the Western Transition',
 'Classical India',
 'Classical China',
 'Introduction',
 'The Central Eurasian Crossroads',
 'Islam and the Arab Empires',
 'Byzantium and Its Sphere',
 'The New Middle East and the Making of Europe',
 'India',
 'Imperial China',
 'Japan',
 'Worlds Apart',
 'Europe: The Possibility of Change',
 'New Limits, New Horizons',
 'Introduction',
 'Qing China and Mughal India',
 'A New Kind of Society: Early Modern Europe',
 'Authority and Its Challengers in Europe',
 'The New World of Great Powers',
 'Europe’s Assault on the World',


In [90]:
len(book_titles), len(chapter_titles)

(8, 62)

In [91]:
len(book_start_pages), len(chapter_start_pages)

(8, 63)

In [119]:
def organize_chapter_titles_by_book():
    """Group chapter titles by book"""
    chapters_by_book = []

    # Find where each book's chapters start in the flat chapter_titles list
    book_chapter_indices = []
    current_idx = 0

    # Each book starts with "Introduction"
    for i, title in enumerate(chapter_titles):
        if title == "Introduction" and i > current_idx:
            book_chapter_indices.append(current_idx)
            current_idx = i

    # Add the last section
    book_chapter_indices.append(current_idx)
    # Add the end index
    book_chapter_indices.append(len(chapter_titles))

    # Create lists of chapter titles for each book
    for i in range(len(book_chapter_indices) - 1):
        start_idx = book_chapter_indices[i]
        end_idx = book_chapter_indices[i + 1]
        chapters_by_book.append(chapter_titles[start_idx:end_idx])

    return chapters_by_book

In [120]:
chapters_by_book = organize_chapter_titles_by_book()

In [121]:
chapters_by_book

[['Introduction',
  'The Foundations',
  'Homo Sapiens',
  'The Possibility of Civilization'],
 ['Introduction',
  'Early Civilized Life',
  'Ancient Mesopotamia',
  'Ancient Egypt',
  'Intruders and Invaders',
  'The Beginnings of Civilization in South Asia',
  'Ancient China',
  'The Other Worlds of the Ancient Past',
  'Transformations'],
 ['Introduction',
  'Remaking the Old World',
  'The Greeks',
  'The Hellenistic World',
  'Rome',
  'Christianity and the Western Transition',
  'Classical India',
  'Classical China'],
 ['Introduction',
  'The Central Eurasian Crossroads',
  'Islam and the Arab Empires',
  'Byzantium and Its Sphere',
  'The New Middle East and the Making of Europe',
  'India',
  'Imperial China',
  'Japan',
  'Worlds Apart',
  'Europe: The Possibility of Change',
  'New Limits, New Horizons'],
 ['Introduction',
  'Qing China and Mughal India',
  'A New Kind of Society: Early Modern Europe',
  'Authority and Its Challengers in Europe',
  'The New World of Great Po

## Set up data models

In [131]:
from pydantic import BaseModel, Field
from typing import List, Optional
import uuid

In [132]:
# class Paragraph(BaseModel):
#     text: str
#     page: int
#     paragraph_index: int  # Position in the chapter
#     chapter_index: int  # Reference to its chapter
#     book_index: int     # Reference to its book

# class Chapter(BaseModel):
#     title: str
#     start_page: int
#     end_page: int = None
#     chapter_index: int  # Position within the book
#     book_index: int     # Reference to its book
#     paragraphs: list[Paragraph] = Field(default_factory=list)

# class Book(BaseModel):
#     title: str
#     start_page: int
#     end_page: int = None
#     book_index: int     # Position in the overall history book
#     chapters: list[Chapter] = Field(default_factory=list)

# class HistoryBook(BaseModel):
#     title: str
#     start_page: int
#     books: list[Book] = Field(default_factory=list)

In [133]:
# Database tables/collections


class ParagraphDBModel(BaseModel):
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    text: str
    embedding: Optional[List[float]] = (
        None  # Will be populated later during embedding generation
    )
    page: int
    paragraph_index: int
    chapter_id: str  # Foreign key reference
    book_id: str  # Foreign key reference


class ChapterDBModel(BaseModel):
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    title: str
    start_page: int
    end_page: int
    book_id: str  # Foreign key reference
    chapter_index: int


class BookDBModel(BaseModel):
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    title: str
    start_page: int
    end_page: int
    book_index: int

## Get paragraphs using blocks

In [33]:
# convert newlines to spaces, and remove extra spaces
# process ligatures here too
def clean_text(text: str) -> str:
    text = text.replace("\n", " ")
    text = re.sub(r"\s+", " ", text)  # replace multiple spaces with a single space
    text = replace_ligatures(text)
    return text.strip()

In [34]:
# # get list of lists: for eah page, get the blocks of text
# def get_text_blocks(doc):
#     blocks = []
#     for page in doc:
#         page_blocks = page.get_text("blocks")
#         cleaned_blocks = [clean_text(block[4]) for block in page_blocks if block[6] == 0]  # only text blocks
#         blocks.append(cleaned_blocks)
#     return blocks

In [None]:
# get blocks from chapter 1
def get_chapter_text_blocks(doc, start_page, end_page):
    blocks = []
    block_pages = []
    # page numbers are 1-indexed, so we need to adjust accordingly
    for n, page in enumerate(doc[start_page - 1 : end_page]):
        page_blocks = page.get_text("blocks")
        cleaned_blocks = [
            clean_text(block[4]) for block in page_blocks if block[6] == 0
        ]  # only text blocks
        if n == 0:
            # if this is the first page, skip chapter header blocks
            # TODO: this works for chapter starts but not for book starts i.e. introductions
            cleaned_blocks = cleaned_blocks[2:]
        blocks.extend(cleaned_blocks)
        block_pages.extend([n + start_page] * len(cleaned_blocks))
    return blocks, block_pages

In [None]:
# actual page numbers - these are 1-indexed
start_page = 21
end_page = 41
chapter_blocks, block_pages = get_chapter_text_blocks(doc, start_page, end_page)

In [50]:
# # print long text with text wrapping
# import textwrap

# def print_with_wrapping(text, width=80):
#     # Split the text by newlines to preserve paragraph structure
#     paragraphs = text.split('\n')

#     # Process each paragraph separately
#     for paragraph in paragraphs:
#         if paragraph.strip():  # Skip empty paragraphs
#             # Use textwrap to wrap each paragraph to the specified width
#             wrapped_lines = textwrap.wrap(paragraph.strip(), width=width)

#             # Print each wrapped line
#             for line in wrapped_lines:
#                 print(line)


In [51]:
for block in chapter_blocks:
    print()
    print_with_wrapping(block)


The roots of history lie in the pre-human past and it is hard (but important) to
grasp just how long ago that was. If we think of a century on our calendar as a
minute on some great clock recording the passage of time, then Europeans began
to settle in the Americas only about five minutes ago. Slightly less than
fifteen minutes before that, Christianity appeared. Rather more than an hour ago
people settled in southern Mesopotamia who were soon to evolve the oldest
civilization known to us. This is already well beyond the furthest margin of
written record; according to our clock, people began writing down the past much
less than an hour ago, too. Some six or seven hours further back on our scale,
and much more remote, we can discern the first recognizable human beings of a
modern physiological type already established in western Europe. Behind them,
anything from a fortnight to three weeks earlier, appear the first traces of
creatures with some human characteristics whose contribution 

In [52]:
len(chapter_blocks), len(block_pages)

(53, 53)

## Get paragraphs using get_text("dict")

Reading by block does not tell you whether the paragraph break coincides with a page break. By reading into a dict, text can be read line-by-line with indentation info to find paragraph start

In [None]:
page_dict = doc[26].get_text("dict")

In [None]:
len(page_dict["blocks"])

3

In [None]:
page_dict["blocks"][0].keys()

dict_keys(['number', 'type', 'bbox', 'lines'])

In [None]:
page_dict["blocks"][1]

{'number': 1,
 'type': 0,
 'bbox': (76.9921875, 225.055908203125, 529.962158203125, 620.092041015625),
 'lines': [{'spans': [{'size': 15.0,
     'flags': 4,
     'bidi': 0,
     'char_flags': 16,
     'font': 'CharisSIL',
     'color': 0,
     'alpha': 255,
     'ascender': 1.1962890625,
     'descender': -0.439453125,
     'text': 'Some 25 or 30 million years ago, as desiccation began to reduce',
     'origin': (91.9921875, 243.000244140625),
     'bbox': (91.9921875,
      225.055908203125,
      520.0921630859375,
      249.592041015625)}],
   'wmode': 0,
   'dir': (1.0, 0.0),
   'bbox': (91.9921875,
    225.055908203125,
    520.0921630859375,
    249.592041015625)},
  {'spans': [{'size': 15.0,
     'flags': 4,
     'bidi': 0,
     'char_flags': 16,
     'font': 'CharisSIL',
     'color': 0,
     'alpha': 255,
     'ascender': 1.1962890625,
     'descender': -0.439453125,
     'text': 'the area of the forests, competition for diminishing forest resources',
     'origin': (76.992187

In [None]:
# blocks don't have tabs a paragraph start - extract text using get_text("dict")



## Read pages and join continuing paragraphs

In [65]:
# given paragraphs from consecutive pages, concatenate them if they are a continuation of the same paragraph
# Assume that if a paragraph ends with a period, the next paragraph is a new one and not a continuation.
def concatenate_paragraphs(paragraphs, page_numbers):
    concatenated = []
    concatenated_page_numbers = []
    previous_paragraph = ""
    previous_page = None

    for paragraph, page in zip(paragraphs, page_numbers):
        if previous_paragraph:
            # Check if the last character of the current paragraph is a period
            if previous_paragraph[-1] == ".":
                concatenated.append(previous_paragraph)
                concatenated_page_numbers.append(previous_page)
                previous_paragraph = paragraph
                previous_page = page
            else:
                previous_paragraph += " " + paragraph
        else:  # initialize
            previous_paragraph = paragraph
            previous_page = page

    if previous_paragraph:  # always append the last paragraph
        concatenated.append(previous_paragraph)
        concatenated_page_numbers.append(previous_page)

    return concatenated, concatenated_page_numbers

In [66]:
# run on chapter 1
concatenated_chapter_blocks, concatenated_page_numbers = concatenate_paragraphs(
    chapter_blocks, block_pages
)

In [67]:
# print
for block, page in zip(concatenated_chapter_blocks, concatenated_page_numbers):
    print(f"Page {page}:")
    print_with_wrapping(block)

Page 21:
The roots of history lie in the pre-human past and it is hard (but important) to
grasp just how long ago that was. If we think of a century on our calendar as a
minute on some great clock recording the passage of time, then Europeans began
to settle in the Americas only about five minutes ago. Slightly less than
fifteen minutes before that, Christianity appeared. Rather more than an hour ago
people settled in southern Mesopotamia who were soon to evolve the oldest
civilization known to us. This is already well beyond the furthest margin of
written record; according to our clock, people began writing down the past much
less than an hour ago, too. Some six or seven hours further back on our scale,
and much more remote, we can discern the first recognizable human beings of a
modern physiological type already established in western Europe. Behind them,
anything from a fortnight to three weeks earlier, appear the first traces of
creatures with some human characteristics whose contr

In [61]:
len(concatenated_chapter_blocks), len(concatenated_page_numbers)

(36, 36)

In [None]:
# TODO: this is recognizing chapter headers as incomplete paragraphs. Need to identify chapter headers and handle them separately.

## Build paragraph objects

outline data building process:

starting data:
- chapter start pages, as nested list with list for each book `book_start_pages`
- book titles `book_titles`
- chapter titile (nested list by book) `chapters_by_book`
- paragraph text blocks and page numbers

outline:
- Build books:
    - Get book indes, start/end pages, book title, chapter titles/start pages
    - make chapters:
        - get chapter title, start/end page, chapter index, book_id
        - make paragraphs
            - get book and chapter id, paragraph index in chapter
            - extract text and page numbers from 

In [124]:
def create_paragraphs_db(text_blocks, page_numbers, chapter_id, book_id, start_index=0):
    """Create paragraph DB models from text blocks"""
    paragraphs = []

    for i, (block, page) in enumerate(
        zip(text_blocks, page_numbers), start=start_index
    ):
        paragraph = ParagraphDBModel(
            text=block,
            page=page,
            paragraph_index=i,
            chapter_id=chapter_id,
            book_id=book_id,
            embedding=None,  # Will be generated later
        )
        paragraphs.append(paragraph)

    return paragraphs

In [None]:
# e.g. for book 1, chapter 1
book_index = 0
chapter_index = 0

paragraphs = create_paragraphs_db(blocks, block_pages, chapter.id, book_id)

NameError: name 'create_paragraphs' is not defined

## Create paragraphs and chapters

In [None]:
def process_chapter(doc, chapter_title, chapter_index, start_page, end_page, book_id):
    """Process a single chapter and extract its paragraphs

    Args:
        doc: PyMuPDF document
        chapter_title: Title of the chapter
        chapter_index: Index of chapter within its book
        start_page: 1-indexed start page
        end_page: 1-indexed end page (exclusive)

    Returns:
        tuple: (chapter_model, paragraph_models)
    """
    # Get text blocks from chapter pages
    blocks, block_pages = get_chapter_text_blocks(doc, start_page, end_page)

    # Join paragraphs that span across multiple blocks
    concatenated_blocks, concatenated_pages = concatenate_paragraphs(
        blocks, block_pages
    )

    # Create chapter DB model (without book_id for now)
    chapter = ChapterDBModel(
        title=chapter_title,
        start_page=start_page,
        end_page=end_page - 1,  # Store as inclusive end page
        chapter_index=chapter_index,
        book_id="",  # Will be set after book is created
    )

    # Create paragraph DB models (without book_id and chapter_id for now)
    paragraphs = create_paragraphs_db(blocks, block_pages, chapter.id, book_id)

    return chapter, paragraphs

In [149]:
book_index = 0  # assuming this is the first book
chapter_index = 0  # assuming this is the first chapter of the book
chapter_title = chapter_titles[chapter_index]  # e.g. "Introduction"
start_page = book_start_pages[book_index][chapter_index] + 1  # convert to 1-indexed
end_page = book_start_pages[book_index][chapter_index + 1]
chapter_test, paragraphs_test = process_chapter(
    doc, chapter_title, chapter_index, start_page, end_page
)

In [150]:
chapter_test

ChapterDBModel(id='fc615243-6b2c-4e06-952e-ba32b6ec1216', title='Introduction', start_page=19, end_page=19, book_id='', chapter_index=0)

In [151]:
paragraphs_test

[ParagraphDBModel(id='b83c7dc5-ac91-4ca2-865a-fed467c7d31e', text='BEFO RE HI ST O RY When does History begin? It is tempting to reply ‘in the beginning’, but like many obvious answers, this soon turns out to be unhelpful. As a great Swiss historian once pointed out in another connection, history is the one subject where you cannot begin at the beginning. We can trace the chain of human descent back to the appearance of vertebrates, or even to the photosynthetic cells and other basic structures which lie at the start of life itself. We can go back further still, to the almost unimaginable upheavals which formed this planet and even to the origins of the universe. Yet this is not ‘history’.', embedding=None, page=19, paragraph_index=0, chapter_id='', book_id=''),
 ParagraphDBModel(id='8634d72e-8750-4180-b83d-e7b81e0f8fd6', text='Common sense helps here: history is the story of mankind, of what it has done, suffered or enjoyed. We all know that dogs and cats do not have histories, while 

## Create books with chapters

In [153]:
book_start_pages

[[18, 20, 41, 58],
 [73, 75, 86, 109, 138, 172, 191, 218, 232],
 [241, 243, 248, 296, 316, 360, 437, 453],
 [473, 475, 483, 518, 556, 623, 633, 663, 679, 697, 740],
 [771, 773, 794, 825, 865, 911, 950, 974],
 [1005, 1007, 1035, 1071, 1102, 1132, 1166, 1192],
 [1233, 1235, 1262, 1308, 1333, 1352, 1384],
 [1420, 1424, 1480, 1557, 1604, 1635, 1681],
 [1699]]

In [None]:
def process_book(doc, book_index, book_title, book_start_pages, chapter_titles):
    """Process a book and all its chapters

    Args:
        doc: PyMuPDF document
        book_index: Index of this book in the collection
        book_title: Title of the book
        chapter_titles: List of chapter titles

    Returns:
        tuple: (book_model, chapter_models, paragraph_models)
    """
    # Calculate start and end pages for the book
    chapter_pages = book_start_pages[book_index]
    start_page = chapter_pages[0] + 1  # Convert 0-index to 1-index

    # # End page is start of next book or end of document
    # if book_index < len(book_start_pages) - 2:  # -2 because last item is end marker
    #     end_page = book_start_pages[book_index + 1][0] + 1  # +1 for 1-indexing
    # else:
    #     # end_page = len(doc)  # Last page of document (1-indexed already)
    #     end_page = book_start_pages[-1][0] + 1  # Last book's start page + 1 for 1-indexing
    # there is an extra entry in book_start_pages for the end of the document
    end_page = (
        book_start_pages[book_index + 1][0] + 1
    )  # Next book's start page (1-indexed)

    # Create book model
    book = BookDBModel(
        title=book_title,
        start_page=start_page,
        end_page=end_page - 1,  # Store as inclusive end page
        book_index=book_index,
    )

    # Process each chapter in this book
    all_chapters = []
    all_paragraphs = []

    for i, (chapter_title, chapter_page) in enumerate(
        zip(chapter_titles, chapter_pages)
    ):
        # Calculate chapter end page (start of next chapter or end of book)
        chapter_start = chapter_page + 1  # Convert to 1-indexed

        if i < len(chapter_pages) - 1:
            chapter_end = chapter_pages[i + 1] + 1  # Next chapter start (1-indexed)
        else:
            chapter_end = end_page  # End of book

        # Process the chapter
        chapter, paragraphs = process_chapter(
            doc, chapter_title, i, chapter_start, chapter_end, book.id
        )

        if chapter:
            # Set book_id on chapter
            chapter.book_id = book.id
            all_chapters.append(chapter)

            # Set chapter_id and book_id on paragraphs
            for p in paragraphs:
                p.chapter_id = chapter.id
                p.book_id = book.id
                all_paragraphs.append(p)

    return book, all_chapters, all_paragraphs

In [155]:
book_index = 0  # assuming this is the first book
book_title = book_titles[book_index]  # e.g. "Book One"
chapter_titles = chapters_by_book[book_index]  # List of chapter titles for this book
book_test, all_chapters_test, all_paragraphs_test = process_book(
    doc, book_index, book_title, book_start_pages, chapter_titles
)

In [156]:
book_test

BookDBModel(id='15538fa1-a067-42ee-92c8-38fe10257ae5', title='BEFORE HISTORY', start_page=19, end_page=73, book_index=0)

In [157]:
all_chapters_test

[ChapterDBModel(id='445a6190-1d6d-4384-8518-1ab23a8ecd01', title='Introduction', start_page=19, end_page=20, book_id='15538fa1-a067-42ee-92c8-38fe10257ae5', chapter_index=0),
 ChapterDBModel(id='77023e4f-aa69-403f-89ec-9a4be1f5a579', title='The Foundations', start_page=21, end_page=41, book_id='15538fa1-a067-42ee-92c8-38fe10257ae5', chapter_index=1),
 ChapterDBModel(id='e401498f-6c1e-4b78-b794-9b4034a43ff6', title='Homo Sapiens', start_page=42, end_page=58, book_id='15538fa1-a067-42ee-92c8-38fe10257ae5', chapter_index=2),
 ChapterDBModel(id='e03d68f5-ad4d-4e45-aa31-ab73d246ba4d', title='The Possibility of Civilization', start_page=59, end_page=73, book_id='15538fa1-a067-42ee-92c8-38fe10257ae5', chapter_index=3)]

In [159]:
len(all_paragraphs_test)

101

## Build the full history book

In [171]:
print([item for item in zip(book_titles, chapters_by_book)])

[('BEFORE HISTORY', ['Introduction', 'The Foundations', 'Homo Sapiens', 'The Possibility of Civilization']), ('CIVILIZATIONS', ['Introduction', 'Early Civilized Life', 'Ancient Mesopotamia', 'Ancient Egypt', 'Intruders and Invaders', 'The Beginnings of Civilization in South Asia', 'Ancient China', 'The Other Worlds of the Ancient Past', 'Transformations']), ('THE CLASSICAL AGE', ['Introduction', 'Remaking the Old World', 'The Greeks', 'The Hellenistic World', 'Rome', 'Christianity and the Western Transition', 'Classical India', 'Classical China']), ('THE AGE OF DIVERGING TRADITIONS', ['Introduction', 'The Central Eurasian Crossroads', 'Islam and the Arab Empires', 'Byzantium and Its Sphere', 'The New Middle East and the Making of Europe', 'India', 'Imperial China', 'Japan', 'Worlds Apart', 'Europe: The Possibility of Change', 'New Limits, New Horizons']), ('THE MAKING OF THE EUROPEAN AGE', ['Introduction', 'Qing China and Mughal India', 'A New Kind of Society: Early Modern Europe', 'Au

In [176]:
book_titles

['BEFORE HISTORY',
 'CIVILIZATIONS',
 'THE CLASSICAL AGE',
 'THE AGE OF DIVERGING TRADITIONS',
 'THE MAKING OF THE EUROPEAN AGE',
 'THE GREAT ACCELERATION',
 'THE END OF THE EUROPEAN AGE',
 'OUR OWN TIME']

In [177]:
chapters_by_book

[['Introduction',
  'The Foundations',
  'Homo Sapiens',
  'The Possibility of Civilization'],
 ['Introduction',
  'Early Civilized Life',
  'Ancient Mesopotamia',
  'Ancient Egypt',
  'Intruders and Invaders',
  'The Beginnings of Civilization in South Asia',
  'Ancient China',
  'The Other Worlds of the Ancient Past',
  'Transformations'],
 ['Introduction',
  'Remaking the Old World',
  'The Greeks',
  'The Hellenistic World',
  'Rome',
  'Christianity and the Western Transition',
  'Classical India',
  'Classical China'],
 ['Introduction',
  'The Central Eurasian Crossroads',
  'Islam and the Arab Empires',
  'Byzantium and Its Sphere',
  'The New Middle East and the Making of Europe',
  'India',
  'Imperial China',
  'Japan',
  'Worlds Apart',
  'Europe: The Possibility of Change',
  'New Limits, New Horizons'],
 ['Introduction',
  'Qing China and Mughal India',
  'A New Kind of Society: Early Modern Europe',
  'Authority and Its Challengers in Europe',
  'The New World of Great Po

In [179]:
def build_history_book_db():
    """Build database-ready models for the complete history book"""
    # Final collections for database
    all_books = []
    all_chapters = []
    all_paragraphs = []

    # Organize chapter titles by book
    # chapters_by_book = organize_chapter_titles_by_book()

    # Process each book (except last entry which is end marker)
    for book_index, (title, chapter_titles) in enumerate(
        zip(book_titles, chapters_by_book)
    ):
        # Process this book
        book, chapters, paragraphs = process_book(
            doc, book_index, title, book_start_pages, chapter_titles
        )

        # Add to our collections
        all_books.append(book)
        all_chapters.extend(chapters)
        all_paragraphs.extend(paragraphs)

        # Print progress
        print(f"Processed book {book_index + 1}/{len(book_titles)}: {title}")
        print(f"  Chapters: {len(chapters)}, Paragraphs: {len(paragraphs)}")

    return all_books, all_chapters, all_paragraphs

In [180]:
books, chapters, paragraphs = build_history_book_db()

Processed book 1/8: BEFORE HISTORY
  Chapters: 4, Paragraphs: 101
Processed book 2/8: CIVILIZATIONS
  Chapters: 9, Paragraphs: 336
Processed book 3/8: THE CLASSICAL AGE
  Chapters: 8, Paragraphs: 453
Processed book 4/8: THE AGE OF DIVERGING TRADITIONS
  Chapters: 11, Paragraphs: 607
Processed book 5/8: THE MAKING OF THE EUROPEAN AGE
  Chapters: 8, Paragraphs: 459
Processed book 6/8: THE GREAT ACCELERATION
  Chapters: 8, Paragraphs: 414
Processed book 7/8: THE END OF THE EUROPEAN AGE
  Chapters: 7, Paragraphs: 349
Processed book 8/8: OUR OWN TIME
  Chapters: 7, Paragraphs: 536


In [183]:
len(chapters)

62

In [184]:
len(paragraphs)

3255

## Cost estimate: how many tokens in the book?

In [107]:
import tiktoken

In [None]:
# Question: how many tokens in the book?
def count_tokens(text: str, model_name: str = "gpt-3.5-turbo") -> int:
    encoding = tiktoken.encoding_for_model(model_name)
    tokens = encoding.encode(text)
    return len(tokens)


# Count tokens in the entire book text