In [1]:
import pandas as pd

In [2]:
path = "../data/PoetryFoundationData.csv"
df = pd.read_csv(path)
print("Loaded {} poems.".format(len(df)))

Loaded 13854 poems.


In [3]:
# select and rename columns
df = df[['Title', 'Poem', 'Poet', 'Tags']]
df.columns = df.columns.str.lower()

In [4]:
# print a few poems
for i in range(15):
    print(df['title'][i])
    print(df['poem'][i])
    print("By {}.".format(df['poet'][i]))
    print('=' * 20)
    


                    Objects Used to Prop Open a Window
                

Dog bone, stapler,
cribbage board, garlic press
     because this window is loose—lacks
suction, lacks grip.
Bungee cord, bootstrap,
dog leash, leather belt
     because this window had sash cords.
They frayed. They broke.
Feather duster, thatch of straw, empty
bottle of Elmer's glue
     because this window is loud—its hinges clack
open, clack shut.
Stuffed bear, baby blanket,
single crib newel
     because this window is split. It's dividing
in two.
Velvet moss, sagebrush,
willow branch, robin's wing
     because this window, it's pane-less. It's only
a frame of air.

By Michelle Menting.

                    The New Church
                

The old cupola glinted above the clouds, shone
among fir trees, but it took him an hour
for the half mile all the way up the hill. As he trailed,
the village passed him by, greeted him,
asked about his health, but 

In [5]:
def clean_column(col: pd.Series) -> pd.Series:
    col_clean = col.str.replace('\r', '')
    col_clean = col_clean.str.replace('\n', ' ')  # replace new line by space
    return col_clean

In [6]:
# clean poem string
df['title_clean'] = clean_column(df['title'])
df['poem_clean'] = clean_column(df['poem'])

# remove whitespace
df['title_clean'] = df['title_clean'].str.strip()
df = df[df['title_clean'] != ""]
df['poem_clean'] = df['poem_clean'].str.strip()
df = df[df['poem_clean'] != ""]

# concat title and poem
df['poem_clean'] = df['title_clean'] + " \n" + df['poem_clean']

In [7]:
# get word count
df['poem_len'] = df['poem_clean'].str.split(' ').apply(len)
print("Poems length stats")
print(df['poem_len'].describe())

# keep poems that are between 20 and 500 words
df = df[(df['poem_len'] >= 20) & (df['poem_len'] <= 500)]

Poems length stats
count    13753.000000
mean       317.269687
std        588.602764
min          3.000000
25%        111.000000
50%        179.000000
75%        323.000000
max      18875.000000
Name: poem_len, dtype: float64


In [8]:
# get id as column
df = df.reset_index(drop=True)
df['id'] = df.index

In [9]:
# there can be several tags per poem. 'Explode' dataframe in these cases.
# df['tags'] = df['tags'].fillna("")
# df['tags'] = df['tags'].str.split(",")
# df = df.explode('tags')
# df['tags'] = df['tags'].fillna("")
# df['tags'] = df['tags'].str.strip()


In [10]:
# save data for training
# cols = ['id', 'title_clean', 'poem_clean', 'tags']
cols = ['id', 'title_clean', 'poem_clean']
df[cols].to_csv("../data/poems.csv", index=False)

In [13]:
df

Unnamed: 0,title,poem,poet,tags,title_clean,poem_clean,poem_len,id
0,\r\r\n Objects Used to Prop...,"\r\r\nDog bone, stapler,\r\r\ncribbage board, ...",Michelle Menting,,Objects Used to Prop Open a Window,"Objects Used to Prop Open a Window \nDog bone,...",118,0
1,\r\r\n The New Church\r\r\n...,"\r\r\nThe old cupola glinted above the clouds,...",Lucia Cherciu,,The New Church,The New Church \nThe old cupola glinted above ...,125,1
2,\r\r\n Look for Me\r\r\n ...,\r\r\nLook for me under the hood\r\r\nof that ...,Ted Kooser,,Look for Me,Look for Me \nLook for me under the hood of th...,77,2
3,\r\r\n Wild Life\r\r\n ...,"\r\r\nBehind the silo, the Mother Rabbit\r\r\n...",Grace Cavalieri,,Wild Life,"Wild Life \nBehind the silo, the Mother Rabbit...",168,3
4,\r\r\n Umbrella\r\r\n ...,\r\r\nWhen I push your button\r\r\nyou fly off...,Connie Wanek,,Umbrella,Umbrella \nWhen I push your button you fly off...,117,4
...,...,...,...,...,...,...,...,...
11770,\r\r\n !\r\r\n,"\r\r\nDear Writers, I’m compiling the first in...",Wendy Videlock,"Relationships,Gay, Lesbian, Queer,Arts & Scien...",!,"! \nDear Writers, I’m compiling the first in w...",40,11770
11771,\r\r\n 1 January 1965\r\r\n...,\r\r\nThe Wise Men will unlearn your name.\r\r...,Joseph Brodsky,"Living,Death,Growing Old,Time & Brevity,Nature...",1 January 1965,1 January 1965 \nThe Wise Men will unlearn you...,154,11771
11772,\r\r\n 1-800-FEAR\r\r\n ...,\r\r\nWe'd like to talk with you about ...,Jody Gladding,"Living,Social Commentaries,Popular Culture",1-800-FEAR,1-800-FEAR \nWe'd like to talk with you ...,210,11772
11773,\r\r\n 0\r\r\n,\r\r\n Philosophic\r\r\nin its comple...,Hailey Leithauser,"Arts & Sciences,Philosophy",0,"0 \nPhilosophic in its complex, ovoid emptines...",136,11773
