In [2]:
import os
import pathlib

# Locate the project root: start at the current working directory (made
# absolute by resolve()) and walk upward until a directory that contains a
# "data" entry is found.
current = pathlib.Path().resolve()

while not (current / "data").exists():
    # The filesystem root is its own parent, so without this guard the loop
    # would spin forever when no "data" directory exists on the way up.
    if current.parent == current:
        raise FileNotFoundError(
            'Could not find a "data" directory in the working directory '
            "or any of its parents"
        )
    current = current.parent

BASE_DIR = current
RAW_DIR = BASE_DIR / "data" / "raw"

file_path = RAW_DIR / "great_expectations.txt"

# NOTE: the "utf-8" codec keeps a leading byte-order mark as "\ufeff" in
# raw_text; "utf-8-sig" would drop it if that is ever unwanted.
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

# Last expression of the cell: preview the first 1500 characters.
raw_text[:1500]

'\ufeffThe Project Gutenberg eBook of Great Expectations\n    \nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online\nat www.gutenberg.org. If you are not located in the United States,\nyou will have to check the laws of the country where you are located\nbefore using this eBook.\n\nTitle: Great Expectations\n\nAuthor: Charles Dickens\n\nRelease date: July 1, 1998 [eBook #1400]\n                Most recently updated: December 17, 2024\n\nLanguage: English\n\nCredits: An Anonymous Volunteer and David Widger\n\n\n*** START OF THE PROJECT GUTENBERG EBOOK GREAT EXPECTATIONS ***\n\n[Illustration]\n\n\n\n\nGreat Expectations\n\n[1867 Edition]\n\nby Charles Dickens\n\n\nContents\n\n Chapter I.\n Chapter II.\n Chapter III.\n Chapter IV.\n Chapter V.\n Chapter VI.\n 

In [3]:
def strip_gutenberg_metadata(text: str) -> str:
    """Return the text located between the Project Gutenberg START and END markers.

    Everything up to and including the line that contains the START marker is
    discarded, as is everything from the END marker onwards. The remaining
    text is returned with leading and trailing whitespace stripped.

    Args:
        text: Full text of a Project Gutenberg plain-text file.

    Returns:
        The text between the two marker lines, stripped of surrounding whitespace.

    Raises:
        ValueError: If the START marker is missing, or if no END marker is
            found after the START marker line.
    """
    start = "*** START OF THE PROJECT GUTENBERG"
    end = "*** END OF THE PROJECT GUTENBERG"

    start_idx = text.find(start)
    if start_idx == -1:
        raise ValueError("Could not find Gutenberg header")

    # Drop the remainder of the START marker line. partition() yields an empty
    # body when nothing follows the marker line instead of raising IndexError.
    _, _, body = text[start_idx:].partition("\n")

    end_idx = body.find(end)
    if end_idx == -1:
        # Slicing with the -1 returned by find() would silently cut off the
        # last character of the text, so a missing footer is reported instead.
        raise ValueError("Could not find Gutenberg footer")

    return body[:end_idx].strip()

# Remove the Project Gutenberg header/footer, then preview the first 1500
# characters of what is left (last expression of the cell is displayed).
clean_text = strip_gutenberg_metadata(text=raw_text)
clean_text[0:1500]

'[Illustration]\n\n\n\n\nGreat Expectations\n\n[1867 Edition]\n\nby Charles Dickens\n\n\nContents\n\n Chapter I.\n Chapter II.\n Chapter III.\n Chapter IV.\n Chapter V.\n Chapter VI.\n Chapter VII.\n Chapter VIII.\n Chapter IX.\n Chapter X.\n Chapter XI.\n Chapter XII.\n Chapter XIII.\n Chapter XIV.\n Chapter XV.\n Chapter XVI.\n Chapter XVII.\n Chapter XVIII.\n Chapter XIX.\n Chapter XX.\n Chapter XXI.\n Chapter XXII.\n Chapter XXIII.\n Chapter XXIV.\n Chapter XXV.\n Chapter XXVI.\n Chapter XXVII.\n Chapter XXVIII.\n Chapter XXIX.\n Chapter XXX.\n Chapter XXXI.\n Chapter XXXII.\n Chapter XXXIII.\n Chapter XXXIV.\n Chapter XXXV.\n Chapter XXXVI.\n Chapter XXXVII.\n Chapter XXXVIII.\n Chapter XXXIX.\n Chapter XL.\n Chapter XLI.\n Chapter XLII.\n Chapter XLIII.\n Chapter XLIV.\n Chapter XLV.\n Chapter XLVI.\n Chapter XLVII.\n Chapter XLVIII.\n Chapter XLIX.\n Chapter L.\n Chapter LI.\n Chapter LII.\n Chapter LIII.\n Chapter LIV.\n Chapter LV.\n Chapter LVI.\n Chapter LVII.\n Chapter LVII

In [5]:
import re

def split_chapters_ge(text: str):
    """Split ``text`` into ``(heading, body)`` pairs at "Chapter <number>" headings.

    A heading is the word "Chapter" (case-insensitive) followed by whitespace
    and a run of digits or the Roman-numeral letters I, V, X, L, C, with an
    optional trailing period. It must sit at the very start of the text or at
    the start of a line (leading whitespace allowed). Any text before the
    first heading is discarded.

    Args:
        text: Text to split.

    Returns:
        A list of ``(heading, body)`` tuples in order of appearance, both
        stripped of surrounding whitespace. Empty when no heading is found.
    """
    # (?:\A|\n) also accepts a heading at the very beginning of the text,
    # where there is no preceding newline.
    # \b after the numeral rejects prose lines such as "chapter in which ...",
    # which would otherwise match as the heading "chapter i" under IGNORECASE.
    pattern = re.compile(
        r"(?:\A|\n)\s*(Chapter\s+[0-9IVXLC]+\b\.?)", re.IGNORECASE
    )

    # With one capturing group, split() returns
    # [preamble, heading1, body1, heading2, body2, ...].
    parts = pattern.split(text)

    return [
        (heading.strip(), body.strip())
        for heading, body in zip(parts[1::2], parts[2::2])
    ]

# Split the cleaned text into (heading, body) pairs and display how many
# pairs were produced.
ge_chapters = split_chapters_ge(text=clean_text)
len(ge_chapters)

118

In [7]:
def filter_short_chapters(chapters, min_length:int):
    """Return the chapters whose text has at least ``min_length`` characters.

    Args:
        chapters: Iterable of ``(title, text)`` pairs.
        min_length: Minimum ``len(text)`` for a pair to be kept.

    Returns:
        A new list of ``(title, text)`` tuples, in the original order,
        without the pairs whose text is shorter than ``min_length``.
    """
    kept = []
    for heading, body in chapters:
        if len(body) < min_length:
            continue
        kept.append((heading, body))
    return kept

# Keep only entries whose body has at least 500 characters, then display the
# remaining count together with the title and the first 300 characters of the
# first entry.
ge_chapters = filter_short_chapters(chapters=ge_chapters, min_length=500)
(len(ge_chapters), ge_chapters[0][0], ge_chapters[0][1][:300])

(59,
 'Chapter I.',
 'My father’s family name being Pirrip, and my Christian name Philip, my\ninfant tongue could make of both names nothing longer or more explicit\nthan Pip. So, I called myself Pip, and came to be called Pip.\n\nI give Pirrip as my father’s family name, on the authority of his\ntombstone and my sister,—Mrs.')

In [10]:
import json

PROCESSED_DIR = BASE_DIR / "data" / "processed"
# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists before writing (no-op when it is already there).
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
out_path = PROCESSED_DIR / "great_expectations_clean.json"

# Write the book title and the chapter list as JSON; each (title, text) tuple
# is serialized as a two-element JSON array.
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(
        {
            "book": "Great Expectations",
            "chapters": ge_chapters,
        },
        f,
        ensure_ascii=False,  # keep non-ASCII characters instead of \u escapes
        indent=2,
    )