In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def _split_title_and_body(raw_text, end_marker="Text"):
    """Clean scraped page text and split it into ``(title, body)``.

    Args:
        raw_text: Text of the poem container, one text node per line.
        end_marker: Substring that marks the end of the poem; everything from
            its first occurrence in the body onwards is discarded.

    Returns:
        A ``(title, body)`` tuple. The title is the first line of the cleaned
        text and the body is everything after it, cut at ``end_marker``.
        When nothing is left after cleaning, ``("No title found", "")``.
    """
    # The audio player's fallback message is scraped as ordinary text.
    cleaned = raw_text.replace("Your browser does not support the audio element.", "")

    # Collapse runs of newlines (including the gap left by the removal above)
    # and trim surrounding whitespace.
    cleaned = re.sub(r'\n+', '\n', cleaned).strip()

    # str.split never returns an empty list, so emptiness has to be tested on
    # the text itself for the "No title found" fallback to be reachable.
    if not cleaned:
        return "No title found", ""

    title, _, body = cleaned.partition("\n")

    # NOTE(review): this is a plain substring search, so a poem line that
    # itself contains the marker is truncated too - confirm against the page
    # layout before tightening it.
    marker_index = body.find(end_marker)
    if marker_index != -1:
        body = body[:marker_index]

    return title, body


def extract_poem(url: str, author: str, timeout: float = 10) -> tuple:
    """Download a poem page and return its title and text.

    Args:
        url: Address of the poem page.
        author: Accepted for call-site compatibility; not used by the function.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``(title, poem_text)`` on success. On failure the first element is a
        human-readable message and the second is ``""``, so callers can treat
        an empty second element as "nothing extracted".
    """
    # Without a timeout a stalled server would block the scrape indefinitely;
    # connection-level errors are reported the same way as bad status codes.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        return f"Failed to fetch the page. Error: {exc}", ""

    if response.status_code != 200:
        return f"Failed to fetch the page. Status code: {response.status_code}", ""

    soup = BeautifulSoup(response.text, "html.parser")

    # The poem is looked up in <div id="maincontent" class="main-content">.
    poem = soup.find("div", id="maincontent", class_="main-content")
    if not poem:
        return "Could not locate the poem content on the page.", ""

    return _split_title_and_body(poem.get_text(separator="\n", strip=True))

# Every poem page lives under the same base address, in a folder named after
# the heteronym; only the author slug and the poem slug vary.
POEM_BASE_URL = "https://www.casafernandopessoa.pt/pt/cfp/visita/museu/poemas-e-textos-escolhidos/english"

# (author, author slug, [(title, poem slug), ...]) in scraping order.
_POEMS_BY_AUTHOR = [
    ("Alvaro de Campos", "alvaro-de-campos", [
        ("Lisbon Revisited", "lisbon-revisited-1926"),
        ("Ah the sound of the maid’s iron passing back and forth", "ah-sound-maids-iron-passing-back-and-forth"),
        ("Ah, open my eyes to another reality!", "ah-open-my-eyes-another-reality"),
        ("Ah Margarida", "ah-margarida"),
    ]),
    ("Alberto Caeiro", "alberto-caeiro", [
        ("Iʾm a keeper of sheep", "im-keeper-sheep"),
        ("My gaze is clear like a sunflower", "my-gaze-clear-sunflower"),
        ("The Tagus is more beautiful than the river that flows through my village", "tagus-more-beautiful-river-flows-through-my-village"),
        ("If I could sink my teeth into the whole earth", "if-i-could-sink-my-teeth-whole-earth"),
        ("Before I Had You", "i-had-you"),
    ]),
    ("Ricardo Reis", "ricardo-reis", [
        ("I love the roses of Adonis’s gardens", "i-love-roses-adoniss-gardens"),
        ("Come sit with me, Lydia, next to the river", "come-sit-me-lydia-next-river"),
        ("Countless lives inhabit us", "countless-lives-inhabit-us"),
    ]),
]

# Flat list of (Author, Title, URL) tuples consumed by the scraping loop.
urls = [
    (author_name, poem_name, f"{POEM_BASE_URL}/{author_slug}/{poem_slug}?eID=")
    for author_name, author_slug, author_poems in _POEMS_BY_AUTHOR
    for poem_name, poem_slug in author_poems
]

# One record per successfully scraped poem.
poems_data = []

# Scrape every (author, title, url) entry in turn.
for author, title, url in urls:
    poem_title, poem_content = extract_poem(url, author)

    if poem_content:
        poems_data.append({"Author": author, "Title": poem_title, "Poem": poem_content})
    else:
        # extract_poem signals failure with empty content and puts its
        # message in the first slot; surface it instead of dropping the poem
        # without a trace.
        print(f"Skipped {title!r} ({url}): {poem_title}")

# Name the columns explicitly so the frame has the expected schema even when
# nothing was scraped; otherwise later cells that read poems_df['Poem'] would
# fail with a KeyError on an empty result.
poems_df = pd.DataFrame(poems_data, columns=["Author", "Title", "Poem"])

# Show the DataFrame
print(poems_df)

              Author                                              Title  \
0   Alvaro de Campos                            Lisbon Revisited (1926)   
1   Alvaro de Campos  Ah the sound of the maid’s iron passing back a...   
2   Alvaro de Campos               Ah, open my eyes to another reality!   
3   Alvaro de Campos                                       Ah Margarida   
4     Alberto Caeiro                              Iʾm a keeper of sheep   
5     Alberto Caeiro                  My gaze is clear like a sunflower   
6     Alberto Caeiro  The Tagus is more beautiful than the river tha...   
7     Alberto Caeiro      If I could sink my teeth into the whole earth   
8     Alberto Caeiro                                   Before I Had You   
9       Ricardo Reis               I love the roses of Adonis’s gardens   
10      Ricardo Reis         Come sit with me, Lydia, next to the river   
11      Ricardo Reis                         Countless lives inhabit us   

                        

In [2]:
# Flatten each poem onto a single line: every run of newline characters
# becomes one space, then leading/trailing whitespace is trimmed.
poems_df['Poem'] = (
    poems_df['Poem']
    .str.replace(r'\n+', ' ', regex=True)
    .str.strip()
)

In [3]:
# Preview the first five rows (five is also head()'s default).
poems_df.head(n=5)

Unnamed: 0,Author,Title,Poem
0,Alvaro de Campos,Lisbon Revisited (1926),Nothing holds me. I want fifty things at the s...
1,Alvaro de Campos,Ah the sound of the maid’s iron passing back a...,Ah the sound of the maid's iron passing back a...
2,Alvaro de Campos,"Ah, open my eyes to another reality!","Ah, open my eyes to another reality! I want to..."
3,Alvaro de Campos,Ah Margarida,"Ah Margarida, If I gave you my life, What woul..."
4,Alberto Caeiro,Iʾm a keeper of sheep,IX Iʾm a keeper of sheep. The sheep are my tho...


In [4]:
# Write the poems to a UTF-8 CSV file, leaving out the DataFrame index.
poems_df.to_csv(path_or_buf='poems_data.csv', encoding='utf-8', index=False)