In [8]:
# Imports.
from bs4 import BeautifulSoup
import pandas as pd
from requests import get
from contextlib import closing
import re
from time import time
from pyspark import SparkContext

In [2]:
# Reuse the active SparkContext if the kernel already has one (e.g. a pyspark
# shell kernel), otherwise create it, so this cell does not depend on a
# pre-injected `sc` variable when the notebook is run from a fresh kernel.
sc = SparkContext.getOrCreate()
sc

In [3]:
# Url contains list of all poets on site.
# Call the `get` function imported at the top of the notebook: the bare
# `requests` module name is never imported, so `requests.get` raises NameError
# on a fresh kernel.
all_poets_response = get('http://www.famouspoetsandpoems.com/poets.html')
# Fail fast on an HTTP error rather than parsing an error page as the poet list.
all_poets_response.raise_for_status()
all_poets_page = BeautifulSoup(all_poets_response.text, 'lxml')

def grab_poets_info(all_poets_page):
    '''Collect the raw "name (..) (..)" strings for every poet on the index page.

    Input - parsed page listing all poets (must provide findAll('td'))
    Output - list of stripped cell texts, one per poet

    Only <td> cells whose text contains '(' are considered. The first three
    such cells are skipped and then every second one is kept (presumably page
    chrome followed by duplicated cells - verify against the live page).
    '''
    cell_texts = (cell.get_text() for cell in all_poets_page.findAll('td'))
    candidates = [text.strip() for text in cell_texts if '(' in text]
    # Drop the three leading matches, then keep every other remaining entry.
    return candidates[3::2]

def extract_poet_info(poet_string):
    '''Extract name, number of poems, and years of poet's life from string.

    Input - string shaped like "<name> (<number of poems>) (<years>)"
    Output - tuple (poet_name, number_of_poems, poet_years), all strings

    The name is everything before the first '(' with surrounding whitespace
    removed; the other two values are the contents of the first and second
    parenthesised groups. Raises IndexError when the string starts with '('
    (or is empty) or contains fewer than two parenthesised groups.
    '''
    # Raw strings: '\(' is not a valid escape in a normal string literal and
    # triggers an invalid-escape warning on current Python 3 releases.
    poet_name = re.findall(r'^[^\(]+', poet_string)[0].strip()
    # Scan for the parenthesised groups once instead of once per field.
    groups = re.findall(r'\((.*?)\)', poet_string)
    number_of_poems, poet_years = groups[0], groups[1]
    return poet_name, number_of_poems, poet_years

poets = grab_poets_info(all_poets_page)
# Materialise the results as a list: on Python 3 `map` returns a one-shot
# iterator, which cannot be inspected or reused after the DataFrame is built
# (and which some pandas versions refuse as constructor input).
poets_info = list(map(extract_poet_info, poets))
poets_df = pd.DataFrame(poets_info, columns=['name', 'number', 'years'])
#poets_df.to_csv('poets_years.csv', index=False)

In [27]:
def extract_poet_links(all_poets_page):
    '''Extract links to each poet's page from page which lists them all.

    Input - parsed page listing all poets (must provide findAll('td'))
    Output - list of absolute URLs of each poet's "poems" index page,
             de-duplicated, in order of first appearance on the page

    For every <td> cell the first <a> inside it is inspected; its href is kept
    when it contains '/poets/'. Cells without a link, and links without an
    href, are skipped.
    '''
    base = 'http://www.famouspoetsandpoems.com'
    poet_pages = []
    seen = set()
    for cell in all_poets_page.findAll('td'):
        # Explicit checks replace the former bare `except: pass`, which also
        # hid unrelated failures (including KeyboardInterrupt).
        anchor = cell.find('a')
        if anchor is None:
            continue
        href = anchor.get('href')
        if not href or '/poets/' not in href:
            continue
        # De-duplicate while keeping page order so repeated runs yield the
        # same list (a set round-trip gives an arbitrary order).
        if href in seen:
            continue
        seen.add(href)
        poet_pages.append(base + href + '/poems')
    return poet_pages

def get_poems(poet_page):
    '''Extract all links to individual poem pages from one poet's page.

    Input - URL of a poet's "poems" index page
    Output - list of absolute poem-page URLs found on it, de-duplicated,
             in order of first appearance

    Performs one HTTP GET per call. (The original note said "takes about
    4.5 minutes" - presumably for the whole set of poets, not a single call.)
    '''
    # `get` is the function imported at the top of the notebook; the bare
    # `requests` module name is never imported, so `requests.get` would raise
    # NameError on a fresh kernel and on Spark workers.
    response = get(poet_page)
    bib_soup = BeautifulSoup(response.text, 'lxml')

    base = 'http://www.famouspoetsandpoems.com'
    poem_links = []
    seen = set()
    for cell in bib_soup.findAll('td'):
        # Explicit checks replace the former bare `except: pass`.
        anchor = cell.find('a')
        if anchor is None:
            continue
        href = anchor.get('href')
        if not href or '/poems/' not in href:
            continue
        # Keep first-seen order so the result is deterministic across runs.
        if href in seen:
            continue
        seen.add(href)
        poem_links.append(base + href)
    return poem_links

def scrape_poem(url):
    '''Extract one poem, and its poet, from its page.

    Input - URL of a single poem page
    Output - tuple (poem, poet): the poem text with <br/> tags turned into
             spaces, and the text of the last <span> containing 'by'.
             Either element is '' when it cannot be found on the page.
    '''
    with closing(get(url, stream=True)) as resp:
        page = resp.text
    soup = BeautifulSoup(page, 'lxml')

    poem_div = soup.find('div', style="padding-left:14px;padding-top:20px;font-family:Arial;font-size:13px;")
    if poem_div is None:
        # Previously str(None) was parsed, yielding the literal text 'None'.
        poem = ''
    else:
        # Re-parse with the same explicit parser used for the page itself;
        # omitting it makes BeautifulSoup guess a parser and emit a warning.
        poem_markup = str(poem_div).replace('<br/>', ' ')
        poem = BeautifulSoup(poem_markup, 'lxml').get_text().strip()

    # Start from '' and only overwrite on a match: the former `else: poet = ''`
    # erased an earlier match whenever a later span lacked 'by', and `poet`
    # was unbound (UnboundLocalError) on pages without any <span>.
    poet = ''
    for tag in soup('span'):
        text = tag.get_text()
        if 'by' in text:
            poet = text.strip()
    return (poem, poet)

# Absolute URLs of every poet's "poems" index page, taken from the parsed poets list.
all_poet_pages = extract_poet_links(all_poets_page)

In [29]:
# Distributed scrape: poet pages -> poem links -> (poem, poet) tuples.
poets_rdd = (
    sc.parallelize(all_poet_pages, 30)  # spread the poet pages over 30 partitions
    .flatMap(get_poems)                 # one poet page yields many poem URLs
    .repartition(400)                   # rebalance the poem URLs before scraping
    .map(scrape_poem)                   # fetch and parse each poem page
)
# Bring every scraped (poem, poet) tuple back to the driver as a list.
result = poets_rdd.collect()

In [30]:
# Preview the first ten (poem, poet) tuples returned by the scrape.
result[:10]

[(u'A new thing, coming forth Don\u2019t you see it?  Can\u2019t you see the signs? the Lord doing a new thing,  emerging now In these words, in the story, in the hope of the coming the return of the king fulfillment of prophecy the grace of God revealed Oh a new thing coming Let it come, wash over us joy in the morning!   July 28, 2009 Psalm 92:1-4 Isaiah 43:18-21 Luke 12:54-56 and sermon, \u201cThe View from Seven Decades: The Power and Danger of Paradigms\u201d, by the Reverend Huntley Halvorson Suncook United Methodist Church July 19, 2009 and Psalm 100 Luke 4:16-21 Isaiah 43:1-3a Isaiah 43:18-21 and sermon, \u201cThe Power of Vision\u201d, by the Reverend Huntley Halvorson, including, The Star Thrower Story by Joel Barker, inspired by the writing of Loren Eiseley, Suncook United Methodist Church July 26, 2009',
  u'by Raymond A. Foss'),
 (u'In challenging times, assurance the loving presence of God fulfillment of his promises comfort in his indwelling a new perspective on life He 

In [31]:
# One row per scraped poem. No column names are given, so the columns are
# positional: 0 holds the poem text and 1 the poet string from scrape_poem.
df = pd.DataFrame(result)

In [34]:
# Persist the scraped poems as UTF-8 CSV, leaving out the DataFrame index.
df.to_csv('clean_poems.csv', index=False, encoding='utf-8')

# Old way below

In [None]:
# This would take 3-4 hours.
# NOTE(review): `tqdm` and `all_poem_links_list` are not defined anywhere in the
# visible notebook; both must exist before this legacy cell can run.
start = time()
poems_soup = []
for link in tqdm(all_poem_links_list, desc='Scrape all poems'):
    # Use the imported `get`; the bare `requests` module name is never imported.
    response = get(link)
    soup = BeautifulSoup(response.text, 'lxml')
    poem = soup.find('div', style="padding-left:14px;padding-top:20px;font-family:Arial;font-size:13px;")
    # Reset for every page: otherwise a page without a matching <span> silently
    # reuses the previous page's poet (or raises NameError on the first page).
    poet = ''
    for tag in soup('span'):
        if 'by' in tag.get_text():
            poet = tag.get_text().strip()
    poems_soup.append([poem, poet])
# Elapsed wall-clock time in minutes.
print((time() - start) / 60)

## Creating a DataFrame from all the poems and poets.
# Each row is [poem, poet]; `poem` here is whatever soup.find returned for the
# poem <div> (not stripped text), and `poet` is the matched span text.
df_all = pd.DataFrame(poems_soup)
# Written with pandas defaults, so the index column is included in the file.
df_all.to_csv('all_poets.csv')