In [1]:
import json
from pathlib import Path
import re

import inflect
import numpy as np
import pandas as pd

In [2]:
# Directory layout, expressed relative to the current working directory
root_path = Path('../..')
input_path = root_path.joinpath('input')
dictionary_path = input_path.joinpath('dictionaries')
poem_path = input_path.joinpath('poems')
raw_path = poem_path.joinpath('raw')

# Each source-loading cell appends its DataFrame to this list
all_haikus = list()

# Load Tempslibres Poems

Load haikus from tempslibres, which were downloaded to `tempslibres.csv` using `Download Haikus From Tempslibres.ipynb`

In [3]:
# Read the downloaded tempslibres export and restrict it to English entries
df = pd.read_csv(str(raw_path / 'tempslibres.csv'), encoding='latin1')
df = df[df['lang'] == 'en'].copy()

# Keep haikus that have all of their first three lines, and only those three columns
line_columns = ['0', '1', '2']
df = df.dropna(subset=line_columns).copy()
df = df[line_columns]
# Use integer labels for the line columns, matching the other sources
df.columns = [0, 1, 2]

df['source'] = 'tempslibres'
all_haikus.append(df)
df.head()

Unnamed: 0,0,1,2,source
3,Memorial Day --,a shadow for each,white cross,tempslibres
4,spring rain -,as the doctor speaks,i think of lilacs,tempslibres
5,spring moonset --,a rice ball for,breakfast,tempslibres
6,sunny afternoon,an old man lingers,near the mailbox,tempslibres
7,cinco de mayo,horses roll,in the shallows,tempslibres


In [4]:
# Inspect the column labels: the three line columns (relabelled 0, 1, 2) plus 'source'
df.columns

Index([0, 1, 2, 'source'], dtype='object')

# Load img2poem

This data comes from the paper [Beyond Narrative Description: Generating Poetry from Images by Multi-Adversarial Training](https://arxiv.org/abs/1804.08473). 

From: https://github.com/bei21/img2poem

The dataset contains many different types of poems, so we keep only the three-line poems.

In [5]:
# Read the JSON file. JSON text is UTF-8 by specification, so decode it explicitly
# instead of relying on the platform's default text encoding.
with (raw_path / 'unim_poem.json').open('r', encoding='utf-8') as f:
    poems = json.load(f)

# Keep only the three line poems (exactly two newline separators)
three_line_poems = [entry['poem'] for entry in poems if entry['poem'].count('\n') == 2]
df = pd.DataFrame({'haiku': three_line_poems})

# Split them into lines, one column per line (columns 0, 1, 2)
df = df['haiku'].str.split('\n', expand=True)

df['source'] = 'img2poems'
all_haikus.append(df)
df.head()

Unnamed: 0,0,1,2,source
0,jesus smiles grabs his guitar and plays,a couple of familiar riffs crooning take it,take another little piece of my heart now baby,img2poems
1,after a flying ovation she realizes,she is not with the kozmic blues band or,big brother and the holding company,img2poems
2,you do a lot more talking in your sleep,than you used to do now you're growing old,i wonder what there is for me to keep,img2poems
3,look back: you see the path we chose was steep,and needed all our strength but we were bold,it took our breath and left us only sleep,img2poems
4,there was a time when talk of love came cheap,and what was bought was what the other sold,there was so much we didn't think to keep,img2poems


# PoetRNN

Training data from [Sam Ballas's PoetRNN](https://github.com/sballas8/PoetRNN)

In [6]:
# Read the CSV and split the haikus into lines (one column per line)
df = pd.read_csv(str(raw_path / 'sballas8.csv'), names=['haiku'])
df = df['haiku'].str.split('\n', expand=True)

# Drop ones without at least three lines
df = df.dropna(subset=[0, 1, 2])

# Keep only the ones with exactly three lines: a fourth "line" is tolerated only
# when it is missing or empty (a trailing newline), and no later line may exist.
# The columns are looked up dynamically so the filter does not depend on how
# many lines the longest entry in the file happens to have.
if 3 in df.columns:
    fourth_line_blank = (df[3] == '') | pd.isnull(df[3])
else:
    fourth_line_blank = pd.Series(True, index=df.index)
# Columns are labelled 0..k-1 by the split, so position 4 onward is line 5 onward;
# with no such columns, .all(axis=1) is True for every row.
no_later_lines = pd.isnull(df.iloc[:, 4:]).all(axis=1)
df = df[fourth_line_blank & no_later_lines]

# Copy the selection so adding a column below does not write to a slice of the filtered frame
df = df[[0, 1, 2]].copy()
df['source'] = 'sballas'
all_haikus.append(df)
df.head()

Unnamed: 0,0,1,2,source
0,rectory roofers,their ladders,take them higher,sballas
1,summer cabin,the ants,do the dishes,sballas
2,lagoon at sunrise?,the shadow,chases its pelican,sballas
3,barren trees,even the tiniest twig,embraced by the mist,sballas
4,windfall apples,bees tango,to a waltz,sballas


# Haikuzao

https://github.com/herval/creative_machines/tree/master/haikuzao

In [7]:
# Read the whole file; poems are separated by blank lines
text = (raw_path / 'haikuzao.txt').read_text()

# Keep only the stanzas made of exactly three lines (two newline separators)
three_line_stanzas = [stanza for stanza in text.split('\n\n') if stanza.count('\n') == 2]
df = pd.DataFrame({'haiku': three_line_stanzas})

# One column per line
df = df['haiku'].str.split('\n', expand=True)

df['source'] = 'haikuzao'
all_haikus.append(df)
df.head()

Unnamed: 0,0,1,2,source
0,a skein of birds,twines across the sky,the northbound train departs,haikuzao
1,dawn chorus begins,I reach for,the snooze button,haikuzao
2,en haut des cuisses,dans l'espace sous le slip,un morceau de mer,haikuzao
3,new March snow,the grouse with a missing toe,still around,haikuzao
4,Remembrance Day-,even the traffic,pauses for 2 minutes,haikuzao


In [8]:
# Combine the per-source frames. NOTE: this rebinds `all_haikus` from a list of
# DataFrames to a single DataFrame, so the cell is meant to run once per kernel session.
all_haikus = pd.concat(all_haikus, sort=False)

# Drop duplicates because there are some poems in multiple sources.
# The key is the three lines joined, reduced to upper-case ASCII letters only, so
# differences in spacing, punctuation and case do not hide a duplicate.
# regex=True is passed explicitly: the default changed to a literal match in
# pandas 2.0, which would leave the text unstripped.
joined_lines = all_haikus[0] + all_haikus[1] + all_haikus[2]
all_haikus['hash'] = joined_lines.str.replace(r'[^A-Za-z]', '', regex=True).str.upper()
all_haikus = all_haikus.drop_duplicates(subset=['hash'])

all_haikus

Unnamed: 0,0,1,2,source,hash
3,Memorial Day --,a shadow for each,white cross,tempslibres,MEMORIALDAYASHADOWFOREACHWHITECROSS
4,spring rain -,as the doctor speaks,i think of lilacs,tempslibres,SPRINGRAINASTHEDOCTORSPEAKSITHINKOFLILACS
5,spring moonset --,a rice ball for,breakfast,tempslibres,SPRINGMOONSETARICEBALLFORBREAKFAST
6,sunny afternoon,an old man lingers,near the mailbox,tempslibres,SUNNYAFTERNOONANOLDMANLINGERSNEARTHEMAILBOX
7,cinco de mayo,horses roll,in the shallows,tempslibres,CINCODEMAYOHORSESROLLINTHESHALLOWS
8,quitting time,the smell of rain,in the lobby,tempslibres,QUITTINGTIMETHESMELLOFRAININTHELOBBY
9,waves,slowly cresting towards shore,a faint moon,tempslibres,WAVESSLOWLYCRESTINGTOWARDSSHOREAFAINTMOON
10,overnight rain --,the scent of orange blossoms,in a desert town,tempslibres,OVERNIGHTRAINTHESCENTOFORANGEBLOSSOMSINADESERT...
13,misty summer rain,calling pheasant,in Zen temple,tempslibres,MISTYSUMMERRAINCALLINGPHEASANTINZENTEMPLE
14,day is done,poppies amidst,the dying grass,tempslibres,DAYISDONEPOPPIESAMIDSTTHEDYINGGRASS


# Get Syllable Count For Each Row

In [9]:
# Load Phonemes
#
# Both dictionaries map a word to a list of pronunciations; each pronunciation is a
# dict with its 'phonemes' (list of str) and its 'syllables' (int).

# Standard Dict: "<word> <phoneme> <phoneme> ..." separated by spaces
WORDS = {}
with (dictionary_path / 'cmudict.dict.txt').open('r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines instead of failing on the unpack below
            continue
        word, phonemes = line.split(' ', 1)
        # Drop a trailing "(n)" marker so alternate pronunciations share one key
        word = re.match(r'([^\(\)]*)(\(\d\))*', word).groups()[0]
        phonemes = phonemes.split(' ')
        # One syllable per phoneme that contains a digit
        # (presumably the stress-marked vowels of the CMU notation -- confirm against the file)
        syllables = sum(re.match(r'.*\d', p) is not None for p in phonemes)
        WORDS.setdefault(word, []).append({
            'phonemes': phonemes,
            'syllables': syllables
        })

# Load custom phonemes: "<word>\t<phoneme> <phoneme> ...", no digits on the phonemes,
# so syllables are counted as the number of vowel phonemes instead
CUSTOM_WORDS = {}
vowels = ['AA', 'AE', 'AH', 'AO', 'AW', 'AX', 'AXR', 'AY', 'EH', 'ER', 'EY', 'IH', 'IX', 'IY', 'OW', 'OY', 'UH', 'UW', 'UX']
with (dictionary_path / 'custom.dict.txt').open('r') as f:
    for line in f:
        try:
            word, phonemes = line.strip().split('\t', 1)
        except ValueError:
            # No tab on this line: show it and move on rather than abort the load
            print(line)
            continue
        # Same "(n)" handling as above; custom entries are additionally lower-cased
        word = re.match(r'([^\(\)]*)(\(\d\))*', word).groups()[0].lower()
        phonemes = phonemes.split(' ')
        syllables = sum((p in vowels) for p in phonemes)

        CUSTOM_WORDS.setdefault(word, []).append({
            'phonemes': phonemes,
            'syllables': syllables
        })

In [10]:
# Engine used by get_words to spell out tokens that contain digits
inflect_engine = inflect.engine()

# Words that get_syllable_count could not find in either dictionary.
# Their phonemes must be fetched separately, e.g. from
# http://www.speech.cs.cmu.edu/tools/lextool.html
NOT_FOUND = set()

def get_words(line):
    """
    Get a list of the words in a line

    The line is lower-cased and split on single spaces. Tokens containing a digit
    are spelled out with the inflect engine. Hyphens are treated as word
    separators for every token (previously only for spelled-out numbers), so a
    compound such as "well-known" yields both "well" and "known" instead of
    losing everything after the hyphen. Surrounding quotes, trailing punctuation
    and underscores are removed from each word.

    :param line: one line of a poem (str)
    :return: list of non-empty lower-case words, in order of appearance
    """
    tokens = []
    for token in line.lower().split(' '):
        # Replace numeric words with the words written out
        if re.search(r'\d', token):
            token = inflect_engine.number_to_words(token)
        # "twenty-one", "well-known" and "rain--the" each contain several words
        tokens.extend(token.replace('-', ' ').split(' '))

    words = []
    for token in tokens:
        # Group 1 is the leading run of word characters/apostrophes, after any
        # opening quotes; whatever follows (punctuation etc.) is discarded
        word = re.match(r'[\'"]*([\w\']*)[\'"]*(.*)', token).groups()[0]
        word = word.replace('_', '')
        # Pure-punctuation tokens (e.g. a lone dash) leave nothing behind
        if word:
            words.append(word)

    return words

def count_non_standard_words(line):
    """
    Return how many non-empty words of the line are missing from the default CMU Dictionary.
    """
    return sum(1 for word in get_words(line) if word and word not in WORDS)

def get_syllable_count(line):
    """
    Get the possible syllable counts for the line

    A word may have several pronunciations with different syllable counts, so a
    line can have several possible totals.

    :param line: one line of a poem (str)
    :return: the distinct possible totals as a comma-separated string in
        ascending order (e.g. "4,5"), or None if any word is in neither
        dictionary. Every unknown word is recorded in NOT_FOUND.
    """
    # A set keeps only distinct running totals, so the work does not multiply
    # with every word that has more than one pronunciation
    counts = {0}
    has_unknown_word = False
    for word in get_words(line):
        if not word:
            continue

        if (word not in WORDS) and (word not in CUSTOM_WORDS):
            # Retry without leading/trailing apostrophes (e.g. a quoted word)
            word = word.strip('\'')

        # The standard dictionary takes precedence over the custom one
        if word in WORDS:
            pronunciations = WORDS[word]
        elif word in CUSTOM_WORDS:
            pronunciations = CUSTOM_WORDS[word]
        else:
            # Keep scanning so every unknown word on the line gets recorded
            NOT_FOUND.add(word)
            has_unknown_word = True
            continue

        syllables = {p['syllables'] for p in pronunciations}
        counts = {c + s for c in counts for s in syllables}

    if has_unknown_word:
        return None

    # Sorted so the string does not depend on set iteration order
    return ','.join(str(c) for c in sorted(counts))

In [11]:
# Remove haikus with lots of unknown words
# Likely either non-english or just lots of typos
# (a haiku is kept when fewer than 3 of its words are missing from the CMU dictionary)
all_haikus['unknown_word_count'] = np.sum([all_haikus[i].apply(count_non_standard_words) for i in range(3)], axis=0)
all_haikus = all_haikus[all_haikus['unknown_word_count'] < 3].copy()

# Possible syllable totals for each of the three lines
for i in range(3):
    all_haikus['%s_syllables' % i] = all_haikus[i].apply(get_syllable_count)

print("Unknown Words: ", len(NOT_FOUND))

# Write the unknown words one per line. They are sorted so the file is identical
# from run to run (set order is not), and encoded as UTF-8 so words with
# non-ASCII letters can always be written.
with open('unrecognized_words.txt', 'w', encoding='utf-8') as f:
    f.writelines(w + '\n' for w in sorted(NOT_FOUND))

all_haikus

Unknown Words:  0


Unnamed: 0,0,1,2,source,hash,unknown_word_count,0_syllables,1_syllables,2_syllables
3,Memorial Day --,a shadow for each,white cross,tempslibres,MEMORIALDAYASHADOWFOREACHWHITECROSS,0,5,5,2
4,spring rain -,as the doctor speaks,i think of lilacs,tempslibres,SPRINGRAINASTHEDOCTORSPEAKSITHINKOFLILACS,0,23,5,5
5,spring moonset --,a rice ball for,breakfast,tempslibres,SPRINGMOONSETARICEBALLFORBREAKFAST,1,34,4,2
6,sunny afternoon,an old man lingers,near the mailbox,tempslibres,SUNNYAFTERNOONANOLDMANLINGERSNEARTHEMAILBOX,0,5,5,4
7,cinco de mayo,horses roll,in the shallows,tempslibres,CINCODEMAYOHORSESROLLINTHESHALLOWS,1,5,3,4
8,quitting time,the smell of rain,in the lobby,tempslibres,QUITTINGTIMETHESMELLOFRAININTHELOBBY,0,3,4,4
9,waves,slowly cresting towards shore,a faint moon,tempslibres,WAVESSLOWLYCRESTINGTOWARDSSHOREAFAINTMOON,0,1,67,3
10,overnight rain --,the scent of orange blossoms,in a desert town,tempslibres,OVERNIGHTRAINTHESCENTOFORANGEBLOSSOMSINADESERT...,0,4,7,5
13,misty summer rain,calling pheasant,in Zen temple,tempslibres,MISTYSUMMERRAINCALLINGPHEASANTINZENTEMPLE,0,5,4,4
14,day is done,poppies amidst,the dying grass,tempslibres,DAYISDONEPOPPIESAMIDSTTHEDYINGGRASS,0,3,4,4


In [15]:
# Save the final dataset without the helper columns used for filtering and de-duplication
haikus_out = all_haikus.drop(columns=['unknown_word_count', 'hash'])
haikus_out.to_csv(str(poem_path / 'haikus.csv'), index=False)