In [1]:
import os
import re

In [2]:
# Per-book cleaning configuration. Each key doubles as the name of the output directory
# created under ../data/ by the processing loop.
#
# Fields of every entry:
#   path                    - location of the raw book text.
#   chapter_divider_pattern - regex handed to re.split to cut the body into chapters.
#   start_at                - regex; everything up to the END of its first match is discarded.
#   end_at                  - regex; everything from the START of its first match (searched after
#                             the start_at trim) is discarded.
#   extra_removal_regexes   - optional list of regexes whose matches are deleted from the body.
#
# NOTE: start_at / end_at / chapter_divider_pattern are all used as regular expressions, so the
# unescaped '.' after the chapter numerals matches any single character, not only a full stop.
BOOKS = {
    'DorianGray': {
        'path': '../data/the-picture-of-dorian-gray.txt',
        'chapter_divider_pattern': r'CHAPTER [IVX]+.\n\n',
        'start_at': '\n\n\nCHAPTER I.',
        'end_at': 'THE END',
    },
    'ChristmasCarol': {
        'path': '../data/a-christmas-carol.txt',
        'chapter_divider_pattern': r'\n\nSTAVE [A-Z]+\n\n',
        'start_at': '\n\nSTAVE ONE',
        'end_at': 'End of the Project Gutenberg EBook of A Christmas Carol, by Charles Dickens',
        'extra_removal_regexes': [
            # Square-bracketed spans that contain no nested brackets.
            r'\[[A-Za-z0-9_,\-.\'?!;": \n]+\]',
            # Horizontal rules of an ASCII box: +-----+
            r'\+-+\+',
            # Text rows of an ASCII box: |some text|
            r'\|[A-Za-z0-9_,.\-\'?!;": ]+\|',
        ]
    },
    # (sic) The misspelled key is kept on purpose: it is used verbatim as the output directory
    # name, so renaming it would move the generated files.
    'PrideAndPredjudice': {
        'path': '../data/pride-and-prejudice.txt',
        # Raw strings: '\[' and '\*' are invalid escape sequences in ordinary string literals
        # (a SyntaxWarning since Python 3.12). The regexes themselves are unchanged.
        'chapter_divider_pattern': r'\n\nChapter [IVX]+.\[*|\n\nCHAPTER [IVX]+.\[*',
        'start_at': '\n\nChapter I.',
        'end_at': r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE \*\*\*',
        'extra_removal_regexes': [
            # Square-bracketed spans that contain no nested brackets.
            r'\[[A-Za-z0-9_,\-.\'"?!;{}^&\n: ]+\]',
        ],
    },
    'SilasMarner': {
        'path': '../data/silas-marner.txt',
        'chapter_divider_pattern': r'CHAPTER [IVX]+.\n\n',
        'start_at': '\n\nCHAPTER I.',
        'end_at': 'End of the Project Gutenberg EBook of Silas Marner, by George Eliot'
    }
}

In [3]:
# Clean every book listed in BOOKS and write the result under ../data/<book>/:
# one chapter-NN.txt per chapter (numbered from 00) plus a cleaned.txt with all chapters.

# Typographic / accented characters and their one-to-one ASCII stand-ins. Defined once, outside
# the loop, because they do not depend on the book being processed.
unicode = '“”‘’—æáàéèêíôóúÁÉÍÓÚç'
mapping = '""\'\'-eaaeeeioouAEIOUc'

for book, params in BOOKS.items():
    with open(params['path'], encoding='utf-8') as book_text:
        full_text = book_text.read()

    # Distinct loop names so the `mapping` string above is not overwritten while iterating.
    for fancy_char, ascii_char in zip(unicode, mapping):
        full_text = full_text.replace(fancy_char, ascii_char)

    # Drop everything up to and including the start marker ...
    start_match = re.search(params['start_at'], full_text)
    if start_match is None:
        raise ValueError(
            f"{book}: start marker {params['start_at']!r} not found in {params['path']}"
        )
    full_text = full_text[start_match.end():]

    # ... and everything from the end marker onwards.
    end_match = re.search(params['end_at'], full_text)
    if end_match is None:
        raise ValueError(
            f"{book}: end marker {params['end_at']!r} not found in {params['path']}"
        )
    full_text = full_text[:end_match.start()]

    for regex in params.get('extra_removal_regexes', []):
        # Keep substituting until a pass removes nothing: deleting one match can leave behind
        # text that matches again (e.g. an outer bracket pair once the inner pair is gone).
        removed = 1
        while removed:
            full_text, removed = re.subn(regex, '', full_text)

    full_text = full_text.strip()

    chapters = [text.strip() for text in re.split(params['chapter_divider_pattern'], full_text)]

    os.makedirs(f'../data/{book}', exist_ok=True)

    # Write with an explicit encoding so the output does not depend on the platform default
    # (the input is read as UTF-8 and may still contain characters outside the table above).
    for i, chapter in enumerate(chapters):
        with open(f'../data/{book}/chapter-{i:02}.txt', 'w', encoding='utf-8') as f:
            f.write(chapter)

    with open(f'../data/{book}/cleaned.txt', 'w', encoding='utf-8') as f:
        f.write('\n\n\n\n'.join(chapters))
        
