In [4]:
# Importing modules.
import os
import pandas as pd
import re
from pypdf import PdfReader
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Ensure you have the NLTK stopwords set downloaded
nltk.download('stopwords')

# Importing the PDF.
reader = PdfReader("Pessoa.pdf")
campos_pages = range(192, 318)

campos = []
# Formatting the PDF.
for page in campos_pages:
    page = reader.pages[page]
    text = page.extract_text(extraction_mode="layout")
    campos.append(text)

# stripping the top of the pages, and dates at the end.
# Due to how the PDF was imported, I had to apply several different parameters here.
for page in campos:
    if "a little larger than the entire universe" in page:
        campos[campos.index(page)] = page.lstrip(" 0123456789 a little larger than the entire universe")
    if "álvaro de campos" in page[:16]:
        campos[campos.index(page)] = page.lstrip("álvaro de campos")[3:]
    if "álva r o de campos" in page[:18]:
        campos[campos.index(page)] = page.lstrip("álvaro de campos")[3:]
    if len(page) == 0:
        campos.remove(page)

# Finding the indeces of each poem which was split across two or more pages.
indeces = []
marker1 = re.compile(r"^[\n\n]")
marker2 = re.compile(r"^[A-Z]")
for index, page in enumerate(campos):
    if marker1.match(page):
        if page[2] == " " or marker2.match(page[2]):
            indeces.append(index)

campos_joined = []
pages_to_join = []
for index in indeces:
    # This if condition applies when a poem is only two pages long.
    if index - 1 not in indeces and index + 1 not in indeces:
        # The two pages can be joined and directly appended to the list of poems.
        pages_to_join.append(campos[index - 1]) # first page
        pages_to_join.append(campos[index]) # second page
        campos_joined.append("".join(pages_to_join))
        pages_to_join = []
        
    # The next two conditions apply when a poem is more than two pages long.
    elif index - 1 not in indeces and index + 1 in indeces:
        # This condition adds the first two pages of a poem to a separate list, where all pages can be compiled and later joined.
        pages_to_join.append(campos[index - 1]) # first page
        pages_to_join.append(campos[index]) # second page
    elif index - 1 in indeces and index + 1 in indeces:
        pages_to_join.append(campos[index]) # each subsequent page
    elif index - 1 in indeces and index + 1 not in indeces:
        pages_to_join.append(campos[index]) # last page
        campos_joined.append("".join(pages_to_join))
        pages_to_join = []

for poem in campos:
    # This tests whether a poem is either the first, or later page of a multi-page poem, and if not, adds them to the final list.
    if campos.index(poem) not in indeces and campos.index(poem) + 1 not in indeces:
        campos_joined.append(poem)

# Joining the poems results in a few more empty list entries, which are removed here.
campos_joined = [poem for poem in campos_joined if len(poem.strip()) > 0]

# Loading NLTK stopwords
stop_words = set(stopwords.words('english'))

# Cleaning and tokenizing the poems
campos_cleaned = []
campos_tokenized = []

all_tokens = []  # A list to hold all tokens for total count
for poem in campos_joined:
    poem = re.sub(r'\n', ' ', poem)  # Remove newlines
    poem = re.sub(r'\s+', ' ', poem)  # Collapse multiple spaces
    poem = poem.lstrip(" 0123456789")  # Remove unwanted leading characters (numbers, etc.)
    poem = re.sub(r'[^\w\s]', '', poem)  # Remove punctuation
    # The following two lines of code will remove the months formatted with a first capitalized letter
    poem = re.sub(r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\b', '', poem)  # Remove all the months
    poem = re.sub(r'\d+', '', poem)  # Remove all the digits
    
    campos_cleaned.append(poem)
    
    # Tokenization and stopwords removal
    tokens = re.findall(r'\w+', poem)  # Tokenize without punctuation
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words]  # Remove stopwords
    campos_tokenized.append(filtered_tokens)
    
    all_tokens.extend(filtered_tokens)  # Add filtered tokens to the all_tokens list

# Count the total number of tokens
len(all_tokens)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gheorghe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


12196

In [5]:
# Giving each poem a title
campos_dict = {}
title = re.compile(r"\S[ÀA-Z0-9()-]{2,}\b")
n = 1
for index, poem in enumerate(campos_cleaned):
    if title.match(poem) != None:
        if title.match(poem).group() not in campos_dict.keys():
            campos_dict[title.match(poem).group()] = re.sub(title, "", poem)
        else:
            campos_dict[title.match(poem).group()+"_1"] = re.sub(title, "", poem)
    else:
        campos_dict["untitled_"+str(n)] = poem
        n += 1

for index, poem in enumerate(campos_tokenized):
    for token in poem[:5]:
        if title.match(token) != None:
            poem.remove(token)
            campos_tokenized[index] = poem

# Adding Caeiro's poems into a Df.
df = pd.DataFrame(list(campos_dict.items()), columns=["Title", "Raw_text"])
df["Tokenized_text"] = campos_tokenized
df["Heteronym"] = "Campos"
df

Unnamed: 0,Title,Raw_text,Tokenized_text,Heteronym
0,OPIARY,Its before I take opium that my soul is sick ...,"[opiary, take, opium, soul, sick, feel, life, ...",Campos
1,TRIUMPHAL,By the painful light of the factorys huge el...,"[triumphal, ode, painful, light, factorys, hug...",Campos
2,EXCERPTS,I Come ancient and unchanging Night Queen ...,"[excerpts, two, odes, come, ancient, unchangin...",Campos
3,MARITIME,Alone this summer morning on the deserted wh...,"[maritime, ode, alone, summer, morning, desert...",Campos
4,SALUTATION,TO Portugal InﬁnityJune eleventh nineteen h...,"[salutation, walt, whitman, portugal, inﬁnityj...",Campos
5,LISBON,No I dont want anything I already said I ...,"[lisbon, revisited, dont, want, anything, alre...",Campos
6,LISBON_1,Nothing holds me I want ﬁfty things at th...,"[lisbon, revisited, nothing, holds, want, ﬁfty...",Campos
7,untitled_1,If you want to kill yourself why dont you want...,"[want, kill, dont, want, kill, nows, chance, g...",Campos
8,untitled_2,At the wheel of the Chevrolet on the road to S...,"[wheel, chevrolet, road, sintra, moonlight, dr...",Campos
9,CLOUDS,On this sad day my heart sadder than the day ...,"[clouds, sad, day, heart, sadder, day, moral, ...",Campos


In [6]:
import os

# Create a directory to store the text files if it doesn't exist
output_dir = "Campos"
os.makedirs(output_dir, exist_ok=True)

# Iterate through the DataFrame and save each poem as a text file
for index, row in df.iterrows():
    title = row['Title']
    raw_text = row['Raw_text']
    
    # Create a valid filename by sanitizing the title
    filename = f"{title}.txt".replace(" ", "_").replace("/", "_")
    
    # Write the poem to a text file in the output directory
    with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as file:
        file.write(raw_text)

print(f"All poems have been exported to the '{output_dir}' directory.")


All poems have been exported to the 'Campos' directory.
