In [398]:
from bs4 import BeautifulSoup, NavigableString
import requests

import pandas as pd
import re

# Download the Project Gutenberg HTML page and parse it into a BeautifulSoup tree.
URL = 'https://www.gutenberg.org/files/10031/10031-h/10031-h.htm'
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as the book.
page = requests.get(URL, timeout=30)
page.raise_for_status()
# NOTE(review): the raw bytes are decoded as latin-1 -- confirm this matches
# the charset the page declares.
page_decode = page.content.decode('latin-1')
# Turn literal <br> tags into newlines before parsing so that line breaks
# are still present in the text extracted from the tree later on.
page_decode = re.sub('<br>', '\n', page_decode)
soup = BeautifulSoup(page_decode, 'html.parser')

In [399]:
# Strip inline formatting tags (bold / italic / underline) while keeping
# their contents, so styled words stay part of the surrounding text.
invalid_tags = ['b', 'i', 'u']
for tag in invalid_tags:
    for match in soup.find_all(tag):
        match.unwrap()

# Each <h3> heading is treated as a title. `.text` concatenates all strings
# inside the heading, so no separate joining pass over the headings is needed.
poem_titles = [heading.text for heading in soup.find_all('h3')]

In [400]:
def extract_between(current, end):
    """Yield the non-blank text pieces found between two parse-tree nodes.

    Starting at ``current``, the tree is walked through each node's
    ``next_element`` link until ``end`` is reached or the links run out.

    Parameters
    ----------
    current : node or None
        First node to visit. ``None`` produces no output.
    end : node or None
        Node at which to stop; it is not visited. ``None`` means walk to the
        end of the chain.

    Yields
    ------
    str
        The whitespace-stripped text of every ``NavigableString`` visited
        whose stripped text is not empty.
    """
    # Identity checks are used on purpose: an empty string node is falsy and
    # would end a truthiness-based loop early, and `!=` on parse-tree objects
    # compares markup rather than asking "is this the stop node itself?".
    while current is not None and current is not end:
        if isinstance(current, NavigableString):
            text = current.strip()
            if text:
                yield text
        current = current.next_element

In [401]:
# Collect the text that sits between each <h3> heading and the next one.
# The headings are looked up once; repeating soup.find_all('h3') inside the
# loop would rescan the whole document twice per heading.
h3_tags = soup.find_all('h3')
poem_text = []
for start_tag, next_tag in zip(h3_tags, h3_tags[1:]):
    # Pairing each heading with its successor leaves the last heading without
    # a text entry, i.e. one fewer entry than there are headings.
    pieces = extract_between(start_tag.next_sibling, next_tag)
    poem_text.append(' '.join(pieces))

In [402]:
# One row per heading: its title and the text collected after it.
# The final title is left out, matching the one-shorter poem_text list.
df = pd.DataFrame(
    data=list(zip(poem_titles[:-1], poem_text)),
    columns=['title', 'text'],
)

In [404]:
# Heading titles whose rows are dropped from the table of poems.
# Order and contents are kept exactly as listed, including the repeated
# 'Note on To Helen' entry.
remove_rows = [
    'Poems of Later Life',
    'Preface',
    'Note on The Raven',
    'Note on The Bells',
    'Note on Ulalume',
    'Note on To Helen',
    'Note on Annabel Lee',
    'Note on A Valentine',
    'Note on An Enigma',
    'Note on To My Mother',
    'Note on For Annie',
    'Note on To F——',
    'Note on To Frances S. Osgood',
    'Note on Eldorado',
    'Note on Eulalie',
    'Note on A Dream within a Dream',
    'Note on To Marie Louise (Shew)',
    'Note on the second poem entitled \xa0To Marie Louise (Shew)',
    'Note on The City in the Sea',
    'Note on The Sleeper',
    'Note on The Bridal Ballad',
    'Note on Lenore',
    'Note on To One in Paradise',
    'Note on The Coliseum',
    'Note on The Haunted Palace',
    'Note on The Conqueror Worm',
    'Note on Silence',
    'Note on Dreamland',
    'Note on To Zante',
    'Note on Hymn',
    'Note on Politian',
    'Introduction (1831)',
    'Note on Al Aaraaf',
    'Note on Tamerlane',
    'Note on To Helen',
    'Note on Romance',
    'Note on Alone',
    'Note on To Isadore etc.',
    # NOTE(review): the remaining titles are presumably prose pieces rather
    # than poems -- confirm against the source page.
    'The Island of the Fay',
    'The Power of Words',
    'The Colloquy of Monos and Una',
    'The Conversation of Eiros and Charmion',
    'Shadow — a Parable',
    'Silence — a Fable',
    'The Poetic Principle',
    'The Philosophy of Composition',
]

In [405]:
# Keep only the rows whose title is not on the removal list, then renumber
# the index from zero.
df = (
    df.loc[lambda frame: ~frame['title'].isin(remove_rows)]
    .reset_index(drop=True)
)

In [407]:
def clean_poem_text(poem_text):
    """Normalise the spacing and line breaks of one poem's raw text.

    Steps, in order: trim surrounding whitespace; squeeze runs of spaces to a
    single space; rewrite the ``'\\n\\n\\r\\n\\r\\n'`` sequence so that it ends
    up as ``'\\n \\n'``; collapse every ``'\\n\\r\\n'`` to ``'\\n'``; squeeze
    runs of spaces once more.

    Returns the cleaned string.
    """
    cleaned = re.sub(' +', ' ', poem_text.strip())
    # Both patterns are plain character sequences, so literal replacement
    # gives the same result as a regex substitution.
    line_break_rewrites = (
        ('\n\n\r\n\r\n', '\n\r\n \n\r\n'),
        ('\n\r\n', '\n'),
    )
    for sequence, replacement in line_break_rewrites:
        cleaned = cleaned.replace(sequence, replacement)
    return re.sub(' +', ' ', cleaned)


In [409]:
def split_into_stanzas(poem_title_list, poem_text_list):
    """Break each poem into stanzas, giving two parallel lists.

    Every text is first passed through ``clean_poem_text``. A poem titled
    'The Bells' is divided at its 'II', 'III' and 'IV' markers; every other
    poem is divided wherever ``'\\n \\n'`` occurs.

    Parameters
    ----------
    poem_title_list : sequence of str
        Poem titles.
    poem_text_list : sequence of str
        Raw poem texts, indexed in step with ``poem_title_list``.

    Returns
    -------
    tuple of (list, list)
        ``(titles, stanzas)`` where ``titles[k]`` is the title of the poem
        that ``stanzas[k]`` came from.

    Raises
    ------
    IndexError
        If 'The Bells' lacks one of its section markers, or if
        ``poem_text_list`` is shorter than ``poem_title_list``.
    """
    return_title_list = []
    return_stanza_list = []
    for poem_index, poem_title in enumerate(poem_title_list):
        poem_text = clean_poem_text(poem_text_list[poem_index])
        if poem_title == 'The Bells':
            poem_stanzas = _split_the_bells(poem_text)
        else:
            poem_stanzas = poem_text.split('\n \n')
        return_title_list.extend([poem_title] * len(poem_stanzas))
        return_stanza_list.extend(poem_stanzas)
    return return_title_list, return_stanza_list


def _split_the_bells(poem_text):
    """Divide 'The Bells' into four parts at the 'II', 'III' and 'IV' markers."""
    parts_in_reverse = []
    remaining = poem_text
    # Work backwards from 'IV': 'II' is a substring of 'III', so the longer
    # markers have to be cut away before the shorter one is searched for.
    for marker in ('IV', 'III', 'II'):
        # partition() keeps everything after the first marker, so a later
        # repeat of the same letters inside the section is not thrown away.
        remaining, found, section = remaining.partition(marker)
        if not found:
            raise IndexError(
                "section marker %r not found in 'The Bells'" % marker
            )
        parts_in_reverse.append(section)
    # Whatever precedes 'II' is the first part.
    parts_in_reverse.append(remaining)
    return parts_in_reverse[::-1]

In [410]:
# Expand the poem table into parallel lists holding one entry per stanza.
stanza_title, stanza_text = split_into_stanzas(
    poem_title_list=df['title'].values,
    poem_text_list=df['text'].values,
)

In [411]:
# One row per stanza, labelled with the title of the poem it came from.
stanza_df = pd.DataFrame(
    data=list(zip(stanza_title, stanza_text)),
    columns=['title', 'stanza_text'],
)

In [413]:
def split_into_lines(poem_title_list, poem_text_list):
    """Split each text on newlines, returning parallel title and line lists.

    Each entry of ``poem_text_list`` is cut at every ``'\\n'``; every piece is
    stripped of surrounding whitespace and paired with the title found at the
    same index of ``poem_title_list``. Pieces that are empty after stripping
    are kept.

    Returns a ``(titles, lines)`` tuple of equal-length lists.
    """
    titles_out, lines_out = [], []
    for position, title in enumerate(poem_title_list):
        stripped_lines = [piece.strip() for piece in poem_text_list[position].split('\n')]
        lines_out.extend(stripped_lines)
        # Repeat the title once for each line produced from this text.
        titles_out.extend([title] * len(stripped_lines))
    return titles_out, lines_out
        

In [414]:
# Expand the stanza table into parallel lists holding one entry per line.
line_title, line_text = split_into_lines(
    poem_title_list=stanza_df['title'].values,
    poem_text_list=stanza_df['stanza_text'].values,
)

In [415]:
# One row per line, labelled with the title of the poem it came from.
line_df = pd.DataFrame(
    data=list(zip(line_title, line_text)),
    columns=['title', 'line_text'],
)

In [420]:
# Write both tables to CSV files in the working directory, leaving out the
# DataFrame index column.
line_df.to_csv(path_or_buf='poe_poems_lines.csv', index=False)
stanza_df.to_csv(path_or_buf='poe_poems_stanzas.csv', index=False)