In [8]:
import re
from pypdf import PdfReader
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Ensure you have the NLTK stopwords set downloaded
nltk.download('stopwords')

# Importing the PDF.
reader = PdfReader("Pessoa.pdf")
reis_pages = range(128, 189)

reis = []
# Formatting the PDF.
for page in reis_pages:
    page = reader.pages[page]
    text = page.extract_text(extraction_mode="layout")
    reis.append(text)

# Stripping the top of the pages and other unnecessary details.
for page in reis:
    if "a little larger than the entire universe" in page:
        reis[reis.index(page)] = page[2:].lstrip(" 0123456789 a little larger than the entire universe")
    if "ricardo reis" in page[:12]:
        reis[reis.index(page)] = page.lstrip("Ricardo reis")[2:]
    if len(page) == 0:
        reis.remove(page)

# Define a function to remove footnotes and trailing unwanted text
def remove_footnotes_and_extra_lines(text):
    # Match only footnotes that are separate from the main content
    footnote_pattern = r'\n\*.*'  # Matches lines starting with '*' on a new line
    explanation_pattern = r'The penultimate stanza.*'  # Example specific phrase
    # Remove footnotes and explanations
    text = re.sub(footnote_pattern, '', text, flags=re.DOTALL).strip()
    text = re.sub(explanation_pattern, '', text, flags=re.DOTALL).strip()
    return text

# Finding the indices of each poem split across two or more pages.
indeces = []
for page in reis:
    if page[:3] == "\n\n ":
        indeces.append(reis.index(page))

reis_joined = []
pages_to_join = []
for index in indeces:
    # Handling multi-page poems
    if index - 1 not in indeces and index + 1 not in indeces:
        # Single split
        pages_to_join.append(reis[index - 1])  # First page
        pages_to_join.append(reis[index])     # Second page
        poem = "".join(pages_to_join)
        reis_joined.append(remove_footnotes_and_extra_lines(poem))  # Apply footnote removal
        pages_to_join = []
    elif index - 1 not in indeces and index + 1 in indeces:
        # First page of a multi-page poem
        pages_to_join.append(reis[index - 1])
        pages_to_join.append(reis[index])
    elif index - 1 in indeces and index + 1 in indeces:
        # Middle pages
        pages_to_join.append(reis[index])
    elif index - 1 in indeces and index + 1 not in indeces:
        # Last page
        pages_to_join.append(reis[index])
        poem = "".join(pages_to_join)
        reis_joined.append(remove_footnotes_and_extra_lines(poem))  # Apply footnote removal
        pages_to_join = []

for poem in reis:
    # Handle single-page poems
    if reis.index(poem) not in indeces and reis.index(poem) + 1 not in indeces:
        reis_joined.append(remove_footnotes_and_extra_lines(poem))

# Removing empty list entries
reis_joined = [poem for poem in reis_joined if len(poem.strip()) > 0]

# Loading NLTK stopwords
stop_words = set(stopwords.words('english'))

# Cleaning and tokenizing the poems
reis_cleaned = []
reis_tokenized = []

all_tokens = []  # A list to hold all tokens for total count
for poem in reis_joined:
    poem = re.sub(r'\n', ' ', poem)  # Remove newlines
    poem = re.sub(r'\s+', ' ', poem)  # Collapse multiple spaces
    poem = re.sub(r'[^\w\s]', '', poem)  # Remove punctuation
    # The following two lines of code will remove the months formatted with a first capitalized letter
    poem = re.sub(r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\b', '', poem)  # Remove all the months
    poem = re.sub(r'\d+', '', poem)  # Remove all the digits
    reis_cleaned.append(poem)
    
    # Tokenization and stopwords removal
    tokens = re.findall(r'\w+', poem)  # Tokenize without punctuation
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words]  # Remove stopwords
    reis_tokenized.append(filtered_tokens)
    
    all_tokens.extend(filtered_tokens)  # Add filtered tokens to the all_tokens list
reis_cleaned

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gheorghe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['To Alberto Caeiro Peaceful Master Are all the hours We lose if we place As in a vase Flowers on our Losing them There are in our life No sorrows or joys So let us learn Wisely unworried Not how to live life But to let it go by Keeping forever Peaceful and calm Taking children For our teachers And letting Nature Fill our eyes Along the river Or along the road Wherever we are Always remaining In the same easy Repose of living Time passes And tells us nothing We grow old Let us know how With a certain mischief To feel ourselves go Taking action Serves no purpose No one can resist The atrocious god Who always devours His own children Let us pick ﬂowers Let us lightly Wet our hands In the calm rivers So as to learn Some of their calmness Sunﬂowers forever Beholding the sun We will serenely Depart from life Without even the regret Of having lived   ',
 'Each thing in its time has its time The trees do not blossom in winter Nor does the white cold Cover the ﬁelds in spring The heat that the

In [2]:
# Giving each poem a title
reis_dict = {}
n = 1
for poem in reis_tokenized:
    reis_dict["untitled_"+str(n)] = poem
    n += 1

In [3]:
# Adding Caeiro's poems into a Df.
df = pd.DataFrame(list(reis_dict.items()), columns=["Title", "Tokenized_text"])
df["Raw_text"] = reis_cleaned
df["Heteronym"] = "Reis"
df

Unnamed: 0,Title,Tokenized_text,Raw_text,Heteronym
0,untitled_1,"[To, Alberto, Caeiro, Peaceful, Master, Are, a...",To Alberto Caeiro Peaceful Master Are all the ...,Reis
1,untitled_2,"[Each, thing, in, its, time, has, its, time, T...",Each thing in its time has its time The trees ...,Reis
2,untitled_3,"[THE, CHESS, PLAYERS, Ive, heard, that, once, ...",THE CHESS PLAYERS Ive heard that once during I...,Reis
3,untitled_4,"[I, love, the, roses, of, Adoniss, gardens, Ye...",I love the roses of Adoniss gardens Yes Lydia ...,Reis
4,untitled_5,"[The, god, Pan, isnt, dead, In, each, ﬁeld, th...",The god Pan isnt dead In each ﬁeld that shows ...,Reis
5,untitled_6,"[Snow, covers, the, sunlit, hills, in, the, di...",Snow covers the sunlit hills in the distance B...,Reis
6,untitled_7,"[The, days, paleness, is, tinged, with, gold, ...",The days paleness is tinged with gold The curv...,Reis
7,untitled_8,"[Wise, the, man, whos, content, with, the, wor...",Wise the man whos content with the worlds spec...,Reis
8,untitled_9,"[Bearing, in, mind, our, likeness, with, the, ...",Bearing in mind our likeness with the gods Let...,Reis
9,untitled_10,"[The, only, freedom, the, gods, grant, us, Is,...",The only freedom the gods grant us Is this to ...,Reis


In [11]:
import os

# Create a directory to store the text files if it doesn't exist
output_dir = "Reis"
os.makedirs(output_dir, exist_ok=True)

# Iterate through the DataFrame and save each poem as a text file
for index, row in df.iterrows():
    title = row['Title']
    raw_text = row['Raw_text']
    
    # Create a valid filename by sanitizing the title
    filename = f"{title}.txt".replace(" ", "_").replace("/", "_")
    
    # Write the poem to a text file in the output directory
    with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as file:
        file.write(raw_text)

print(f"All poems have been exported to the '{output_dir}' directory.")


All poems have been exported to the 'Reis' directory.
