In [12]:
import re
from pypdf import PdfReader
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Ensure you have the NLTK stopwords set downloaded
nltk.download('stopwords')

# Importing the PDF.
reader = PdfReader("Pessoa.pdf")
caeiro_pages = range(56, 126)

caeiro = []
# Formatting the PDF.
for page in caeiro_pages:
    page = reader.pages[page]
    text = page.extract_text(extraction_mode="layout")
    caeiro.append(text)

# Stripping the top of the pages and other unnecessary details.
for page in caeiro:
    if "a little larger than the entire universe" in page:
        caeiro[caeiro.index(page)] = page[2:].lstrip("a little larger than the entire universe")
    if "alberto caeiro" in page:
        caeiro[caeiro.index(page)] = page.strip("alberto caeiro")[2:]
    if "alber t o caeiro" in page:
        caeiro[caeiro.index(page)] = page.strip("alber t o caeiro")[2:]
    if len(page) == 0:
        caeiro.remove(page)
    if "            from\nTHE SHEPHERD IN LOVE" in page or "           from\nUNCOLLECTED POEMS" in page:
        caeiro.remove(page)

# Define a function to remove footnotes and trailing unwanted text
def remove_footnotes(text):
    # Match only footnotes that are separate from the main content
    footnote_pattern = r'^\*.*$'  # Matches lines starting with '*' on a new line
    # Remove footnotes
    text = re.sub(footnote_pattern, '', text, flags=re.MULTILINE).strip()
    return text

# Finding the indices of each poem split across two or more pages.
indeces = []
for page in caeiro:
    if page[:3] == "\n\n ":
        indeces.append(caeiro.index(page))

caeiro_joined = []
pages_to_join = []
for index in indeces:
    # Handling multi-page poems
    if index - 1 not in indeces and index + 1 not in indeces:
        # Single split
        pages_to_join.append(caeiro[index - 1])  # First page
        pages_to_join.append(caeiro[index])     # Second page
        poem = "".join(pages_to_join)
        poem = remove_footnotes(poem)  # Apply footnote removal
        caeiro_joined.append(poem)
        pages_to_join = []
    elif index - 1 not in indeces and index + 1 in indeces:
        # First page of a multi-page poem
        pages_to_join.append(caeiro[index - 1])
        pages_to_join.append(caeiro[index])
    elif index - 1 in indeces and index + 1 in indeces:
        # Middle pages
        pages_to_join.append(caeiro[index])
    elif index - 1 in indeces and index + 1 not in indeces:
        # Last page
        pages_to_join.append(caeiro[index])
        poem = "".join(pages_to_join)
        poem = remove_footnotes(poem)  # Apply footnote removal
        caeiro_joined.append(poem)
        pages_to_join = []

for poem in caeiro:
    # Handle single-page poems
    if caeiro.index(poem) not in indeces and caeiro.index(poem) + 1 not in indeces:
        poem = remove_footnotes(poem)  # Apply footnote removal
        caeiro_joined.append(poem)

# Removing empty list entries
caeiro_joined = [poem for poem in caeiro_joined if len(poem.strip()) > 0]

# Loading NLTK stopwords
stop_words = set(stopwords.words('english'))

# Cleaning and tokenizing the poems
caeiro_cleaned = []
caeiro_tokenized = []

all_tokens = []  # A list to hold all tokens for total count
for poem in caeiro_joined:
    poem = re.sub(r'\n', ' ', poem)  # Remove newlines
    poem = re.sub(r'\s+', ' ', poem)  # Collapse multiple spaces
    poem = re.sub(r'[^\w\s]', '', poem)  # Remove punctuation
    # The following two lines of code will remove the months formatted with a first capitalized letter
    poem = re.sub(r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\b', '', poem)  # Remove all the months
    poem = re.sub(r'\d+', '', poem)  # Remove all the digits
    caeiro_cleaned.append(poem)
    
    # Tokenization and stopwords removal
    tokens = re.findall(r'\w+', poem)  # Tokenize without punctuation
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words]  # Remove stopwords
    caeiro_tokenized.append(filtered_tokens)
    
    all_tokens.extend(filtered_tokens)  # Add filtered tokens to the all_tokens list

# Count the total number of tokens
len(all_tokens)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gheorghe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


3744

In [11]:
all_tokens

['II',
 'My',
 'gaze',
 'is',
 'clear',
 'like',
 'a',
 'sunﬂower',
 'It',
 'is',
 'my',
 'custom',
 'to',
 'walk',
 'the',
 'roads',
 'Looking',
 'right',
 'and',
 'left',
 'And',
 'sometimes',
 'looking',
 'behind',
 'me',
 'And',
 'what',
 'I',
 'see',
 'at',
 'each',
 'moment',
 'Is',
 'what',
 'I',
 'never',
 'saw',
 'before',
 'And',
 'Im',
 'very',
 'good',
 'at',
 'noticing',
 'things',
 'Im',
 'capable',
 'of',
 'feeling',
 'the',
 'same',
 'wonder',
 'A',
 'newborn',
 'child',
 'would',
 'feel',
 'If',
 'he',
 'noticed',
 'that',
 'hed',
 'really',
 'and',
 'truly',
 'been',
 'born',
 'I',
 'feel',
 'at',
 'each',
 'moment',
 'that',
 'Ive',
 'just',
 'been',
 'born',
 'Into',
 'a',
 'completely',
 'new',
 'world',
 'I',
 'believe',
 'in',
 'the',
 'world',
 'as',
 'in',
 'a',
 'daisy',
 'Because',
 'I',
 'see',
 'it',
 'But',
 'I',
 'dont',
 'think',
 'about',
 'it',
 'Because',
 'to',
 'think',
 'is',
 'to',
 'not',
 'understand',
 'The',
 'world',
 'wasnt',
 'made',
 'for'

In [3]:
# Next, a dictionary is created, in which, if a poem begins with a numeral, it is assigned as its title, or it is...
# ...otherwise labelled as untitled.
caeiro_dict = {}
numerals = "IVXLCDM"
n = 1
for poem in caeiro_tokenized:
    n_count = 0
    for char in poem[0]:
        if char in numerals:
            n_count += 1
    if n_count == len(poem[0]) and len(poem[0]) != 1:
        caeiro_dict[poem[0]] = poem[1:]
    else:
        caeiro_dict["untitled_"+str(n)] = poem
        n += 1

In [4]:
# Based on the above code, I created a function which tests whether a word is a numeral or not.

def is_numeral(text):
    numerals = "IVXLCDM "
    n_count = 0
    for char in text:
        if char in numerals:
            n_count += 1
    if n_count == len(text):
        return True
    else:
        return False
        
numeral = re.compile(r'^[IVXLCDM]+\S')
for poem in caeiro_cleaned:
    if is_numeral(poem.split(" ")[0]) == True:
        caeiro_cleaned[caeiro_cleaned.index(poem)] = numeral.sub('', poem)

In [5]:
# Adding Caeiro's poems into a Df.
df = pd.DataFrame(list(caeiro_dict.items()), columns=["Title", "Tokenized_text"])
df["Raw_text"] = caeiro_cleaned
df

Unnamed: 0,Title,Tokenized_text,Raw_text
0,II,"[My, gaze, is, clear, like, a, sunﬂower, It, i...",My gaze is clear like a sunﬂower It is my cus...
1,IV,"[This, afternoon, a, thunderstorm, Rolled, dow...",This afternoon a thunderstorm Rolled down fro...
2,VIII,"[One, midday, in, late, spring, I, had, a, dre...",One midday in late spring I had a dream that ...
3,XXVIII,"[Today, I, read, nearly, two, pages, In, the, ...",Today I read nearly two pages In the book of ...
4,XLVI,"[In, this, way, or, that, way, As, it, may, ha...",In this way or that way As it may happen or n...
5,XLVIII,"[From, the, highest, window, of, my, house, I,...",From the highest window of my house I wave fa...
6,untitled_1,"[The, astonishing, reality, of, things, Is, my...",The astonishing reality of things Is my discov...
7,untitled_2,"[Whoever, or, whatever, is, at, the, center, o...",Whoever or whatever is at the center of the wo...
8,untitled_3,"[War, which, inﬂicts, suffering, on, the, worl...",War which inﬂicts suffering on the world with ...
9,untitled_4,"[You, say, Im, something, more, Than, a, stone...",You say Im something more Than a stone or a pl...


In [14]:
import os

# Create a directory to store the text files if it doesn't exist
output_dir = "Caeiro"
os.makedirs(output_dir, exist_ok=True)

# Iterate through the DataFrame and save each poem as a text file
for index, row in df.iterrows():
    title = row['Title']
    raw_text = row['Raw_text']
    
    # Create a valid filename by sanitizing the title
    filename = f"{title}.txt".replace(" ", "_").replace("/", "_")
    
    # Write the poem to a text file in the output directory
    with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as file:
        file.write(raw_text)

print(f"All poems have been exported to the '{output_dir}' directory.")


All poems have been exported to the 'Caeiro' directory.
