In [1]:
import re
import pandas as pd
import inflect
# Shared inflect engine; used when counting syllables to spell out tokens
# that contain digits (via `number_to_words`).
inflect_engine = inflect.engine()

In [2]:
# Load Phonemes

# Maps a word to the list of its known pronunciations. Each pronunciation is
# a dict with 'phonemes' (list of phoneme symbols) and 'syllables' (int).
WORDS = {}

# Phonemes that count as one syllable in the custom dictionary, where a
# syllable is detected by membership in this list rather than by a digit.
vowels = ['AA', 'AE', 'AH', 'AO', 'AW', 'AX', 'AXR', 'AY', 'EH', 'ER', 'EY', 'IH', 'IX', 'IY', 'OW', 'OY', 'UH', 'UW', 'UX']


def _load_pronunciations(path, separator, is_syllabic, lowercase=False):
    """Read one pronunciation dictionary file into WORDS.

    Each non-blank line must look like ``<word><separator><phonemes>`` where
    the phonemes are whitespace separated. A ``(n)`` variant suffix on the
    word is dropped so that all variants are stored under the same key.

    Args:
        path: File to read.
        separator: String that separates the word from its phonemes.
        is_syllabic: Callable taking a phoneme symbol and returning True
            when that phoneme contributes one syllable.
        lowercase: When True the word is lower-cased before being stored.

    Raises:
        ValueError: If a non-blank line does not contain `separator`.
    """
    with open(path, 'r') as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # A blank line (e.g. a trailing newline) is not an entry.
                continue

            fields = line.split(separator, 1)
            if len(fields) != 2:
                raise ValueError(
                    'malformed entry at %s:%d: %r' % (path, line_number, line)
                )
            word, phonemes = fields

            word = re.match(r'([^\(\)]*)(\(\d\))*', word).groups()[0]
            if lowercase:
                word = word.lower()

            # Assumption: some CMUdict-style files annotate entries with a
            # trailing "# comment"; anything after '#' is not a phoneme.
            phonemes = phonemes.split('#', 1)[0].split()
            syllables = sum(1 for p in phonemes if is_syllabic(p))

            WORDS.setdefault(word, []).append({
                'phonemes': phonemes,
                'syllables': syllables
            })


# Standard Dict: a phoneme containing a digit counts as one syllable.
_load_pronunciations(
    'cmudict.dict.txt', ' ',
    is_syllabic=lambda p: re.search(r'\d', p) is not None,
)

# Load custom phonemes: tab separated, syllables counted via `vowels`.
_load_pronunciations(
    '8659.dict.txt', '\t',
    is_syllabic=lambda p: p in vowels,
    lowercase=True,
)

In [5]:
# Load Haikus
# Read the corpus, keep the rows tagged as English, then keep only the rows
# whose first three line columns ('0', '1', '2') are all present.
df = pd.read_csv('haikus.csv', encoding='latin1')
df = df.loc[df['lang'].eq('en')].copy()
df = df.dropna(subset=['0', '1', '2']).copy()
df

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,haiku,lang
3,3,Memorial Day --,a shadow for each,white cross,,,,,,,Memorial Day --\r\na shadow for each\r\nwhite ...,en
4,4,spring rain -,as the doctor speaks,i think of lilacs,,,,,,,spring rain -\r\nas the doctor speaks\r\ni thi...,en
5,5,spring moonset --,a rice ball for,breakfast,,,,,,,spring moonset --\r\na rice ball for\r\nbreakfast,en
6,6,sunny afternoon,an old man lingers,near the mailbox,,,,,,,sunny afternoon\r\nan old man lingers\r\nnear ...,en
7,7,cinco de mayo,horses roll,in the shallows,,,,,,,cinco de mayo\r\nhorses roll\r\nin the shallows,en
8,8,quitting time,the smell of rain,in the lobby,,,,,,,quitting time\r\nthe smell of rain\r\nin the l...,en
9,9,waves,slowly cresting towards shore,a faint moon,,,,,,,waves\r\nslowly cresting towards shore\r\na fa...,en
10,10,overnight rain --,the scent of orange blossoms,in a desert town,,,,,,,overnight rain --\r\nthe scent of orange bloss...,en
13,13,misty summer rain,calling pheasant,in Zen temple,,,,,,,misty summer rain\r\ncalling pheasant\r\nin Ze...,en
14,14,day is done,poppies amidst,the dying grass,,,,,,,day is done\r\npoppies amidst\r\nthe dying grass,en


In [18]:
# Dictionary of words not found, must go get the phonemes
# http://www.speech.cs.cmu.edu/tools/lextool.html
NOT_FOUND = set()

def get_syllable_count(line):
    """Return the possible syllable totals of one line of text.

    The line is lower-cased and split into words; every run of word
    characters/apostrophes counts as a word, so punctuation such as hyphens,
    dashes or brackets separates words instead of hiding them. Tokens that
    contain a digit are first spelled out with `inflect_engine`. Each word is
    looked up in `WORDS`; because a word may have several pronunciations with
    different syllable counts, every reachable total is tracked.

    Args:
        line: The text to analyse, or a null value (None/NaN).

    Returns:
        A comma-separated string of the distinct possible totals in
        ascending order (e.g. '5' or '4,5'), or None when `line` is null or
        when some word is missing from `WORDS`. A missing word is recorded in
        `NOT_FOUND` as a side effect.
    """
    if pd.isnull(line):
        return None

    # Tokenise; numerals are replaced by their written-out form first.
    tokens = []
    for chunk in line.lower().split():
        if re.search(r'\d', chunk):
            chunk = inflect_engine.number_to_words(chunk)
        tokens.extend(re.findall(r"[\w']+", chunk))

    # A set keeps the running totals de-duplicated, so the work stays small
    # even when many words have more than one pronunciation.
    counts = {0}
    for token in tokens:
        # Try the token as written (keeps entries that contain an
        # apostrophe), then fall back to it without surrounding quotes.
        word = token if token in WORDS else token.strip("'")
        if not word:
            # The token consisted of quote characters only.
            continue

        pronunciations = WORDS.get(word)
        if not pronunciations:
            NOT_FOUND.add(word)
            return None

        options = {p['syllables'] for p in pronunciations}
        counts = {total + extra for total in counts for extra in options}

    # Sorted so the result does not depend on set iteration order.
    return ','.join(str(total) for total in sorted(counts))

In [19]:
# Add an '<n>_syllables' column next to each of the line columns '0'..'8'.
for column in map(str, range(9)):
    df[column + '_syllables'] = df[column].apply(get_syllable_count)

# Words for which no pronunciation was available during this pass.
print(NOT_FOUND)

df

{'à', 'éclairs'}


Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,lang,0_syllables,1_syllables,2_syllables,3_syllables,4_syllables,5_syllables,6_syllables,7_syllables,8_syllables
3,3,Memorial Day --,a shadow for each,white cross,,,,,,,...,en,5,5,2,,,,,,
4,4,spring rain -,as the doctor speaks,i think of lilacs,,,,,,,...,en,23,5,5,,,,,,
5,5,spring moonset --,a rice ball for,breakfast,,,,,,,...,en,34,4,2,,,,,,
6,6,sunny afternoon,an old man lingers,near the mailbox,,,,,,,...,en,5,5,4,,,,,,
7,7,cinco de mayo,horses roll,in the shallows,,,,,,,...,en,5,3,4,,,,,,
8,8,quitting time,the smell of rain,in the lobby,,,,,,,...,en,3,4,4,,,,,,
9,9,waves,slowly cresting towards shore,a faint moon,,,,,,,...,en,1,67,3,,,,,,
10,10,overnight rain --,the scent of orange blossoms,in a desert town,,,,,,,...,en,4,7,5,,,,,,
13,13,misty summer rain,calling pheasant,in Zen temple,,,,,,,...,en,5,4,4,,,,,,
14,14,day is done,poppies amidst,the dying grass,,,,,,,...,en,3,4,4,,,,,,


In [20]:
# Write the annotated haikus to disk, leaving out the DataFrame index.
output_path = 'haikus_with_syllables.csv'
df.to_csv(output_path, index=False)

In [23]:
# Select haikus whose first three lines have exactly '5', '7' and '5' as
# their syllable strings (lines with more than one possible total are excluded).
is_five_seven_five = (
    df['0_syllables'].eq('5')
    & df['1_syllables'].eq('7')
    & df['2_syllables'].eq('5')
)
df[is_five_seven_five]

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,lang,0_syllables,1_syllables,2_syllables,3_syllables,4_syllables,5_syllables,6_syllables,7_syllables,8_syllables
49,49,last red in the sky,a small girl's moon face rises,over the counter,,,,,,,...,en,5,7,5,,,,,,
284,284,christmas services,a cellular phone rings out,handel's messiah,,,,,,,...,en,5,7,5,,,,,,
560,560,Passover darkness -,before the buds burst open;,a child's eyes in death.,,,,,,,...,en,5,7,5,,,,,,
635,635,Last night of Summer,the bright full moon of last night,hidden by a cloud,,,,,,,...,en,5,7,5,,,,,,
639,639,Midnight and full moon,my neighbour asks to borrow,the vacum cleaner,,,,,,,...,en,5,7,5,,,,,,
718,718,yellow walnut leaves,slowly appear on the lawn--,early morning light,,,,,,,...,en,5,7,5,,,,,,
956,956,after its first flight,the young gerfalcon's talons,tighter on my glove,,,,,,,...,en,5,7,5,,,,,,
1010,1010,sultry afternoon,only the mailbox shadow,crosses the dirt road,,,,,,,...,en,5,7,5,,,,,,
1011,1011,long journey back home --,a forgotten bale of hay,slowly rots away,,,,,,,...,en,5,7,5,,,,,,
1046,1046,Autumn mist obscures,the island in the distance,she cleans her glasses,,,,,,,...,en,5,7,5,,,,,,
