In [42]:
import os
import pathlib

# Walk upward from the working directory until a folder containing "data" is
# found; that folder is treated as the project root.
current = pathlib.Path().resolve()

while not (current / "data").exists():
    # The parent of the filesystem root is the root itself, so without this
    # check the loop would spin forever when no "data" folder exists.
    if current.parent == current:
        raise FileNotFoundError(
            'Could not find a "data" directory in the working directory '
            "or any of its parents"
        )
    current = current.parent

BASE_DIR = current
RAW_DIR = BASE_DIR / "data" / "raw"

file_path = RAW_DIR / "crime_and_punishment.txt"

# Read with plain "utf-8": the file's leading byte-order mark is kept and
# shows up as '\ufeff' at the start of raw_text.
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

# Preview the beginning of the raw file.
raw_text[:1500]

'\ufeffThe Project Gutenberg eBook of Crime and Punishment\n    \nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online\nat www.gutenberg.org. If you are not located in the United States,\nyou will have to check the laws of the country where you are located\nbefore using this eBook.\n\nTitle: Crime and Punishment\n\nAuthor: Fyodor Dostoyevsky\n\nTranslator: Constance Garnett\n\nRelease date: March 28, 2006 [eBook #2554]\n                Most recently updated: June 10, 2025\n\nLanguage: English\n\nCredits: John Bickers, Dagny and David Widger\n\n\n*** START OF THE PROJECT GUTENBERG EBOOK CRIME AND PUNISHMENT ***\n\n\n\n\nCRIME AND PUNISHMENT\n\nBy Fyodor Dostoevsky\n\n\n\nTranslated By Constance Garnett\n\n\n\n\nTRANSLATOR’S PREFACE\n\nA few words about Dostoe

In [43]:
def strip_gutenberg_metadata(text: str) -> str:
    """Return the book body found between the Project Gutenberg START and END markers.

    Args:
        text: Full text of a Project Gutenberg ebook, header and footer included.

    Returns:
        The text after the START marker line and before the END marker,
        with leading and trailing whitespace removed.

    Raises:
        ValueError: If the START marker is missing, or if no END marker
            follows the START marker line.
    """
    start = "*** START OF THE PROJECT GUTENBERG"
    end = "*** END OF THE PROJECT GUTENBERG"

    start_idx = text.find(start)

    if start_idx == -1:
        raise ValueError("Could not find Gutenberg header")

    # The marker line continues past the matched prefix, so skip to the end of
    # that line. partition() yields an empty body (instead of an IndexError
    # from split(...)[1]) when nothing follows the marker line.
    _, _, body = text[start_idx:].partition("\n")

    end_idx = body.find(end)

    # find() returns -1 when the marker is absent; slicing with [:-1] would
    # silently drop the last character instead of reporting the problem.
    if end_idx == -1:
        raise ValueError("Could not find Gutenberg footer")

    return body[:end_idx].strip()

# Remove the Project Gutenberg header and footer, then preview the start of
# what remains.
clean_text = strip_gutenberg_metadata(text=raw_text)
clean_text[0:1500]

'CRIME AND PUNISHMENT\n\nBy Fyodor Dostoevsky\n\n\n\nTranslated By Constance Garnett\n\n\n\n\nTRANSLATOR’S PREFACE\n\nA few words about Dostoevsky himself may help the English reader to\nunderstand his work.\n\nDostoevsky was the son of a doctor. His parents were very hard-working\nand deeply religious people, but so poor that they lived with their five\nchildren in only two rooms. The father and mother spent their evenings\nin reading aloud to their children, generally from books of a serious\ncharacter.\n\nThough always sickly and delicate Dostoevsky came out third in the\nfinal examination of the Petersburg school of Engineering. There he had\nalready begun his first work, “Poor Folk.”\n\nThis story was published by the poet Nekrassov in his review and\nwas received with acclamations. The shy, unknown youth found himself\ninstantly something of a celebrity. A brilliant and successful career\nseemed to open before him, but those hopes were soon dashed. In 1849 he\nwas arrested.\n\nTh

In [44]:
import re

def split_into_chapters(text: str):
    """Split book text into ``(chapter_title, chapter_text)`` pairs.

    A heading is the word "Chapter" (any letter case) followed by an Arabic or
    Roman numeral and an optional period. It must open the text or follow a
    line break (leading whitespace allowed). Anything before the first
    heading is discarded.

    Args:
        text: Book text to split.

    Returns:
        A list of ``(title, body)`` tuples in order of appearance, each part
        stripped of surrounding whitespace. Empty when no heading is found.
    """
    pattern = re.compile(
        # \A lets a heading that opens the text be found; the original pattern
        # required a preceding newline and missed it.
        # (?!\w): with IGNORECASE the numeral class also matches ordinary
        # letters, so without the lookahead a prose line such as
        # "chapter in which ..." would be split as the heading "chapter i".
        r"(?:\A|\n)\s*(Chapter\s+[0-9IVXLC]+\.?)(?!\w)",
        re.IGNORECASE,
    )

    # Because the pattern has one capture group, split() returns
    # [preamble, title1, body1, title2, body2, ...].
    parts = pattern.split(text)

    return [
        (parts[i].strip(), parts[i + 1].strip())
        for i in range(1, len(parts), 2)
    ]

# Split the cleaned text into (title, body) pairs, then show the chapter
# count together with the first chapter's title and opening characters.
chapters = split_into_chapters(text=clean_text)
(len(chapters), chapters[0][0], chapters[0][1][0:300])

(39,
 'CHAPTER I',
 'On an exceptionally hot evening early in July a young man came out of\nthe garret in which he lodged in S. Place and walked slowly, as though\nin hesitation, towards K. bridge.\n\nHe had successfully avoided meeting his landlady on the staircase. His\ngarret was under the roof of a high, five-storied hou')

In [45]:
import json

# Processed artifacts live next to the raw data, under data/processed.
PROCESSED_DIR = BASE_DIR / "data" / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

output_path = os.path.join(PROCESSED_DIR, "crime_and_punishment_clean.json")

# Serialize the book title and the (title, body) chapter pairs as
# human-readable JSON; ensure_ascii=False keeps non-ASCII characters as-is.
with open(output_path, "w", encoding="utf-8") as out_file:
    out_file.write(
        json.dumps(
            {"book": "Crime and Punishment", "chapters": chapters},
            ensure_ascii=False,
            indent=2,
        )
    )

# Show where the file was written.
output_path

'C:\\Users\\ThinkPad T450\\Desktop\\ai-literary-companion\\data\\processed\\crime_and_punishment_clean.json'