In [2]:
# Importing modules.
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
from pypdf import PdfReader
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Ensure you have the NLTK stopwords set downloaded
nltk.download('stopwords')

# Importing the PDF.
reader = PdfReader("Pessoa.pdf")
pessoa_pages = range(322, 428)

pessoa = []
# Formatting the PDF.
for page in pessoa_pages:
    page = reader.pages[page]
    text = page.extract_text(extraction_mode="layout")
    pessoa.append(text)

# stripping the top of the pages, and dates at the end.
# Due to how the PDF was imported, I had to apply several different parameters here.
for page in pessoa:
    if "a little larger than the entire universe" in page:
        pessoa[pessoa.index(page)] = page.lstrip(" 0123456789 a little larger than the entire universe")
    if "fernando pessoa–himself" in page[:23]:
        pessoa[pessoa.index(page)] = page.lstrip("fernando pessoa–himself")[3:]
    if len(page) == 0:
        pessoa.remove(page)

# Finding the indeces of each poem which was split across two or more pages.
indeces = []
marker1 = re.compile(r"^[\n\n]")
marker2 = re.compile(r"^[A-Z]")
for index, page in enumerate(pessoa):
    if marker1.match(page):
        if page[2] == " " or marker2.match(page[2]):
            indeces.append(index)

pessoa_joined = []
pages_to_join = []
for index in indeces:
    # This if condition applies when a poem is only two pages long.
    if index - 1 not in indeces and index + 1 not in indeces:
        # The two pages can be joined and directly appended to the list of poems.
        pages_to_join.append(pessoa[index - 1]) # first page
        pages_to_join.append(pessoa[index]) # second page
        pessoa_joined.append("".join(pages_to_join))
        pages_to_join = []
        
    # The next two conditions apply when a poem is more than two pages long.
    elif index - 1 not in indeces and index + 1 in indeces:
        # This condition adds the first two pages of a poem to a separate list, where all pages can be compiled and later joined.
        pages_to_join.append(pessoa[index - 1]) # first page
        pages_to_join.append(pessoa[index]) # second page
    elif index - 1 in indeces and index + 1 in indeces:
        pages_to_join.append(pessoa[index]) # each subsequent page
    elif index - 1 in indeces and index + 1 not in indeces:
        pages_to_join.append(pessoa[index]) # last page
        pessoa_joined.append("".join(pages_to_join))
        pages_to_join = []

for poem in pessoa:
    # This tests whether a poem is either the first, or later page of a multi-page poem, and if not, adds them to the final list.
    if pessoa.index(poem) not in indeces and pessoa.index(poem) + 1 not in indeces:
        pessoa_joined.append(poem)

# Joining the poems results in a few more empty list entries, which are removed here.
pessoa_joined = [poem for poem in pessoa_joined if len(poem.strip()) > 0]

# Loading NLTK stopwords
stop_words = set(stopwords.words('english'))

# Cleaning and tokenizing the poems
pessoa_cleaned = []
pessoa_tokenized = []

all_tokens = []  # A list to hold all tokens for total count
for poem in pessoa_joined:
    poem = re.sub(r'\n', ' ', poem)  # Remove newlines
    poem = re.sub(r'\s+', ' ', poem)  # Collapse multiple spaces
    poem = poem.lstrip(" 0123456789")  # Remove unwanted leading characters (numbers, etc.)
    poem = re.sub(r'[^\w\s]', '', poem)  # Remove punctuation
    # The following two lines of code will remove the months formatted with a first capitalized letter
    poem = re.sub(r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\b', '', poem)  # Remove all the months
    poem = re.sub(r'\d+', '', poem)  # Remove all the digits
    
    pessoa_cleaned.append(poem)
    
    # Tokenization and stopwords removal
    tokens = re.findall(r'\w+', poem)  # Tokenize without punctuation
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words]  # Remove stopwords
    pessoa_tokenized.append(filtered_tokens)
    
    all_tokens.extend(filtered_tokens)  # Add filtered tokens to the all_tokens list

pessoa_cleaned

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gheorghe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['Swamps of yearnings brushing against my gilded soul Distant tolling of Other Bells The blond wheat p aling In the ashen sunset My soul is seized by a bodily chill How forever equal the Hour The tops of the p alms swaying The leaves staring at the silence inside us Wi spy autumn Of a hazy birds singing    Stagnant forgotten blue How quiet the shout of yearning that gives this Hour claws How my selfdread longs for something that doesnt weep My hands reach out to the beyond but even as theyre reaching I see that what I desire is not what I want Cymbals of Imperfection O distan tly ancient Hour Banished from its own Timeself Receding wave that invades My ceaseless retreating into myself until I faint So intent on the present I that I seem to forget myself Liquid of halos with no Was behind it no Self inside it The Mystery smacks of my being other Bur sting in the moonlight The sentry stands very straight but his lance planted on the ground Is still taller than heWhats all this for The ﬂa

In [4]:
# Giving each poem a title
pessoa_dict = {}
title = re.compile(r"\S[ÀA-Z0-9()-]{2,}\b")
n = 1
for index, poem in enumerate(pessoa_cleaned):
#    if title.match(poem) != None:
#        if title.match(poem).group() not in pessoa_dict.keys():
#            pessoa_dict[title.match(poem).group()] = re.sub(title, "", poem)
#        else:
#            pessoa_dict[title.match(poem).group()+"_1"] = re.sub(title, "", poem)
#    else:
        pessoa_dict["untitled_"+str(n)] = poem
        n += 1

for index, poem in enumerate(pessoa_tokenized):
    for token in poem[:5]:
        if title.match(token) != None:
            poem.remove(token)
            pessoa_tokenized[index] = poem

pessoa_dict

# Adding Caeiro's poems into a Df.
df = pd.DataFrame(list(pessoa_dict.items()), columns=["Title", "Raw_text"])
df["Tokenized_text"] = pessoa_tokenized
df["Heteronym"] = "Pessoa"
df

Unnamed: 0,Title,Raw_text,Tokenized_text,Heteronym
0,untitled_1,Swamps of yearnings brushing against my gilded...,"[swamps, yearnings, brushing, gilded, soul, di...",Pessoa
1,untitled_2,from SLANTING RAIN I My dream of an inﬁnite po...,"[slanting, rain, dream, inﬁnite, port, crosses...",Pessoa
2,untitled_3,She sings poor reaper perhaps Believing hersel...,"[sings, poor, reaper, perhaps, believing, happ...",Pessoa
3,untitled_4,DIARY IN THE SHADE Do you still remember me Yo...,"[diary, shade, still, remember, knew, long, ti...",Pessoa
4,untitled_5,Wheres my life going and whos taking it there ...,"[wheres, life, going, whos, taking, always, di...",Pessoa
...,...,...,...,...
72,untitled_73,ULYSSES Myth is the nothing that is everything...,"[ulysses, myth, nothing, everything, sun, brea...",Pessoa
73,untitled_74,VIRIATO If our feeling and acting soul has kno...,"[viriato, feeling, acting, soul, knowledge, re...",Pessoa
74,untitled_75,HENRY COUNT OF BURGUNDY Every beginning is inv...,"[henry, count, burgundy, every, beginning, inv...",Pessoa
75,untitled_76,THE COLUMBUSES Others are bound to have What w...,"[columbuses, others, bound, bound, lose, other...",Pessoa


In [5]:
import os

# Create a directory to store the text files if it doesn't exist
output_dir = "Pessoa"
os.makedirs(output_dir, exist_ok=True)

# Iterate through the DataFrame and save each poem as a text file
for index, row in df.iterrows():
    title = row['Title']
    raw_text = row['Raw_text']
    
    # Create a valid filename by sanitizing the title
    filename = f"{title}.txt".replace(" ", "_").replace("/", "_")
    
    # Write the poem to a text file in the output directory
    with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as file:
        file.write(raw_text)

print(f"All poems have been exported to the '{output_dir}' directory.")


All poems have been exported to the 'Pessoa' directory.
