In [1]:
import os
import pathlib

current = pathlib.Path().resolve()

while not (current / "data").exists():
    current = current.parent

BASE_DIR = current
RAW_DIR = BASE_DIR / "data" / "raw"

file_path = RAW_DIR / "the_count_of_monte_christo.txt"

with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

raw_text[:1500]

'\ufeffThe Project Gutenberg eBook of The Count of Monte Cristo\n    \nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online\nat www.gutenberg.org. If you are not located in the United States,\nyou will have to check the laws of the country where you are located\nbefore using this eBook.\n\nTitle: The Count of Monte Cristo\n\nAuthor: Alexandre Dumas\n        Auguste Maquet\n\nRelease date: January 1, 1998 [eBook #1184]\n                Most recently updated: August 9, 2025\n\nLanguage: English\n\nCredits: Anonymous Project Gutenberg Volunteers, Dan Muller and David Widger\n\n\n*** START OF THE PROJECT GUTENBERG EBOOK THE COUNT OF MONTE CRISTO ***\n\n\n\n\nTHE COUNT OF MONTE CRISTO\n\nby Alexandre Dumas [père]\n\n\n\n\nContents\n\n\n VOLUME ONE\nChapter 1. Mar

In [2]:
def strip_gutenberg_metadata(text: str) -> str:
    start = "*** START OF THE PROJECT GUTENBERG"
    end = "*** END OF THE PROJECT GUTENBERG"

    start_idx = text.find(start)

    if start_idx == -1:
        raise ValueError("Could not find Gutenberg header")
    
    cleaned_text = text[start_idx:].split('\n', 1)[1]
    cleaned_text = cleaned_text[:cleaned_text.find(end)]

    return cleaned_text.strip()

clean_text = strip_gutenberg_metadata(raw_text)
clean_text[:1500]

'THE COUNT OF MONTE CRISTO\n\nby Alexandre Dumas [père]\n\n\n\n\nContents\n\n\n VOLUME ONE\nChapter 1. Marseilles—The Arrival\nChapter 2. Father and Son\nChapter 3. The Catalans\nChapter 4. Conspiracy\nChapter 5. The Marriage Feast\nChapter 6. The Deputy Procureur du Roi\nChapter 7. The Examination\nChapter 8. The Château d’If\nChapter 9. The Evening of the Betrothal\nChapter 10. The King’s Closet at the Tuileries\nChapter 11. The Corsican Ogre\nChapter 12. Father and Son\nChapter 13. The Hundred Days\nChapter 14. The Two Prisoners\nChapter 15. Number 34 and Number 27\nChapter 16. A Learned Italian\nChapter 17. The Abbé’s Chamber\nChapter 18. The Treasure\nChapter 19. The Third Attack\nChapter 20. The Cemetery of the Château d’If\nChapter 21. The Island of Tiboulen\nChapter 22. The Smugglers\nChapter 23. The Island of Monte Cristo\nChapter 24. The Secret Cave\nChapter 25. The Unknown\nChapter 26. The Pont du Gard Inn\nChapter 27. The Story\n\n VOLUME TWO\nChapter 28. The Prison Registe

In [5]:
import re

sample = clean_text[:20000]  # first 20k characters
matches = re.findall(r"(Chapter\s+.*|CHAPTER\s+.*)", sample)
matches[:20]


['Chapter 1. Marseilles—The Arrival',
 'Chapter 2. Father and Son',
 'Chapter 3. The Catalans',
 'Chapter 4. Conspiracy',
 'Chapter 5. The Marriage Feast',
 'Chapter 6. The Deputy Procureur du Roi',
 'Chapter 7. The Examination',
 'Chapter 8. The Château d’If',
 'Chapter 9. The Evening of the Betrothal',
 'Chapter 10. The King’s Closet at the Tuileries',
 'Chapter 11. The Corsican Ogre',
 'Chapter 12. Father and Son',
 'Chapter 13. The Hundred Days',
 'Chapter 14. The Two Prisoners',
 'Chapter 15. Number 34 and Number 27',
 'Chapter 16. A Learned Italian',
 'Chapter 17. The Abbé’s Chamber',
 'Chapter 18. The Treasure',
 'Chapter 19. The Third Attack',
 'Chapter 20. The Cemetery of the Château d’If']

In [7]:
def split_chapters_cmc(text: str):

    pattern = re.compile(
        r"\n\s*(Chapter\s+[0-9IVXLC]+\.?(?:\s+.+)?)",
        re.IGNORECASE
    )

    parts = pattern.split(text)

    chapters = []
    for i in range(1, len(parts), 2):
        title = parts[i].strip()
        body = parts[i + 1].strip()
        chapters.append((title, body))

    return chapters

cmc_chapters_raw = split_chapters_cmc(clean_text)
len(cmc_chapters_raw)

234

In [8]:
for i in range(5):
    print(i, "->", cmc_chapters_raw[i][0], "| len(text) =", len(cmc_chapters_raw[i][1]))

0 -> Chapter 1. Marseilles—The Arrival | len(text) = 0
1 -> Chapter 2. Father and Son | len(text) = 0
2 -> Chapter 3. The Catalans | len(text) = 0
3 -> Chapter 4. Conspiracy | len(text) = 0
4 -> Chapter 5. The Marriage Feast | len(text) = 0


In [9]:
def filter_short_chapters(chapters, min_length: int = 500):
    return [
        (title, text)
        for title, text in chapters
        if len(text) >= min_length
    ]

cmc_chapters = filter_short_chapters(cmc_chapters_raw, min_length=500)
len(cmc_chapters), cmc_chapters[0][0], cmc_chapters[0][1][:400]


(117,
 'Chapter 1. Marseilles—The Arrival',
 'On the 24th of February, 1815, the look-out at Notre-Dame de la Garde\nsignalled the three-master, the _Pharaon_ from Smyrna, Trieste, and\nNaples.\n\nAs usual, a pilot put off immediately, and rounding the Château d’If,\ngot on board the vessel between Cape Morgiou and Rion island.\n\nImmediately, and according to custom, the ramparts of Fort Saint-Jean\nwere covered with spectators; it is always an even')

In [14]:
import json

PROCESSED_DIR = BASE_DIR / "data" / "processed"
out_path = PROCESSED_DIR / "the_count_of_monte_christo_clean.json"

cmc_data = {
    "book": "The Count of Monte Cristo",
    "chapters": cmc_chapters,
}

with open(out_path, "w", encoding="utf-8") as f:
    json.dump(cmc_data, f, ensure_ascii=False, indent=2)

out_path


WindowsPath('C:/Users/ThinkPad T450/Desktop/ai-literary-companion/data/processed/the_count_of_monte_christo_clean.json')