In [3]:
from collections import Counter
import numpy as np
import pandas as pd

In [5]:
with open('./data/pg12242.txt', 'r') as file:
    text = file.read()

head = text[:10000]
tail = text[-25000:]

#identify starting point for prefaces

start_preface_1 = text.index('POEMS\n')
start_preface_2 = text.index('POEMS\n', start_preface_1 + 1)
start_preface_3 = text.index('POEMS\n', start_preface_2 + 1)
# what this does for start_preface_2 is it starts looking for the string "poems" after the previous instance plus 1

# identify starting point for poems

poems_start_1 = text.index('I.', start_preface_1)
poems_start_2 = text.index('I.', start_preface_2)
poems_start_3 = text.index('I.', start_preface_3)

# poems 1 = series 1 poems to series two preface. identify ending sections

end_poem_1 = start_preface_2
end_poem_2 = start_preface_3
end_project_gutenberg = text.index('End of Project Gutenberg')
# this is a string
end_poem_3 = end_project_gutenberg

#Define series

series1 = text[poems_start_1:end_poem_1].strip()
series2 = text[poems_start_2:end_poem_2].strip()
series3 = text[poems_start_3:end_poem_3].strip()

# join series into single string connected by "\n\n\n\n\n\n"
poems_text = ('\n'*6).join([series1.strip(),
                            series2.strip(),
                            series3.strip()])

# Remove annotation

poems_text_clean = poems_text.split('[')[0][:-1] + poems_text.split(']')[1][1:]

# identify sections

import re

def startswith_rn(line):
    # Pattern to match lines starting with a Roman numeral followed by a period
    pattern = r"^(I|V|X|L|C|D|M)+\."
    
    # Return True if the pattern is found at the start of the line, False otherwise
    return bool(re.match(pattern, line))

# Remove sections and numbering

lines_without_numbers = [line for line in poems_text_clean.split('\n') if not startswith_rn(line)]
        
poems_text_nonum = '\n'.join(lines_without_numbers).strip()

from pprint import pprint

# Create list of poems

def extract_poems(text):
    raw_poems = text.split('\n'*6)
    poems = [poem.strip() for poem in raw_poems]
    return poems

poem_list = extract_poems(poems_text_nonum)

# create poem dictionary

# helper functions
# Answers: what number should I increment to having now seen a duplicate?
next_title_num = lambda x: x[-2] + 1 if x[-2].isnumeric() else 2 
# Answers: what was the number of the previous poem with this title?
prev_title = lambda d, k: sorted([k for k in d.keys() if k.startswith(k)])[-1]

def update(d: dict, k: str, v: str) -> None:
    '''
    Adds key-value pair 'k' & 'v' to dictionary 'd'
    Uses helper functions to increment key string if key already exists
    Dictionary is changed inplace; Returns None.
    '''
    if d.get(k):
        k = f'{k} ({next_title_num(prev_title(d,k))})'
    d[k] = v  

is_editor_title = lambda x: x.endswith('.') and x.isupper()
has_editor_title = lambda x: is_editor_title(x.split('\n')[0]) # check 1st line for editor title

d = {}
for p in poem_list:
    # first line will always be the key 
    k = p.split('\n')[0]
    v = p
    if has_editor_title(p):
        # find string that should be the value (poem minus title)
        v = '\n'.join(p.split('\n')[1:]).strip()
        # reminder that once you pass in variable p, you should be using variable p inside not another variable
        # this is called local variable
    # add new new pair to dictionary
    # update function handles altering the key if neccessary
    # (i.e., incrementing the numerical suffix of the title)
    update(d, k, p)

# Random Poem Generator

class PoetryCollection():
    def __init__(self, aname, poemdict):
        self.author = aname
        self.collection = poemdict
        self.size = len(poemdict)
    """
    Attributes
    ----------
    author : str
        full name of author
    collection : dict
        dictionary of (title, poem) key-value pairs
    size : int
        number of poems in collection
    
    Methods
    -------
    random_poem(seed: int = None) -> str
        returns random poem; use seed for reproducibility (default seed=None)
    """
    ...
    
    def random_poem(self, seed : int = None) -> str:
        rng = np.random.default_rng(seed) 
        return str(rng.choice(list(self.collection.values())))

poems = PoetryCollection('Emily Dickinson', d)

print('Author:', poems.author)

print('Number of poems in collection:', poems.size)

print(poems.random_poem())

Author: Emily Dickinson
Number of poems in collection: 446
DAY'S PARLOR.

The day came slow, till five o'clock,
Then sprang before the hills
Like hindered rubies, or the light
A sudden musket spills.

The purple could not keep the east,
The sunrise shook from fold,
Like breadths of topaz, packed a night,
The lady just unrolled.

The happy winds their timbrels took;
The birds, in docile rows,
Arranged themselves around their prince
(The wind is prince of those).

The orchard sparkled like a Jew, --
How mighty 't was, to stay
A guest in this stupendous place,
The parlor of the day!
