In [35]:
import re
import pandas as pd
import inflect
inflect_engine = inflect.engine()

In [57]:
# Load Phonemes
#
# WORDS maps a word to the list of its known pronunciations. Each pronunciation
# is a dict holding the 'phonemes' (list of str) and the 'syllables' count
# derived from them.

WORDS = {}

# Captures the word itself, leaving out any "(2)"-style variant suffix.
_VARIANT_SUFFIX = re.compile(r'([^\(\)]*)(\(\d\))*')


def _add_pronunciation(word, phonemes, syllables):
    """Append one pronunciation of `word` to WORDS, creating the entry if needed."""
    WORDS.setdefault(word, []).append({
        'phonemes': phonemes,
        'syllables': syllables
    })


# Standard Dict
with open('cmudict.dict.txt', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line would otherwise fail the two-way unpack below.
            continue
        word, phonemes = line.split(' ', 1)
        word = _VARIANT_SUFFIX.match(word).groups()[0]
        # Drop a trailing "# comment" so its words are not stored as phonemes,
        # and split on any whitespace so doubled spaces yield no empty tokens.
        phonemes = phonemes.split('#', 1)[0].split()
        # A phoneme containing a digit is counted as one syllable.
        syllables = sum(re.match(r'.*\d', p) is not None for p in phonemes)
        _add_pronunciation(word, phonemes, syllables)

# Load custom phonemes
vowels = ['AA', 'AE', 'AH', 'AO', 'AW', 'AX', 'AXR', 'AY', 'EH', 'ER', 'EY', 'IH', 'IX', 'IY', 'OW', 'OY', 'UH', 'UW', 'UX']
# Set copy for constant-time membership tests; `vowels` itself stays a list.
_VOWEL_SET = frozenset(vowels)

with open('8659.dict.txt', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        word, phonemes = line.split('\t', 1)
        # Words from this file are lower-cased before being stored.
        word = _VARIANT_SUFFIX.match(word).groups()[0].lower()
        phonemes = phonemes.split()
        # These phonemes are matched against the vowel list verbatim, so each
        # vowel phoneme is one syllable.
        syllables = sum(p in _VOWEL_SET for p in phonemes)
        _add_pronunciation(word, phonemes, syllables)

In [58]:
df = pd.read_csv('haikus_sballas8.csv', names=['haiku'])

# One column per text line of the haiku (0, 1, 2, ...).
haiku_lines = df['haiku'].str.split('\n', expand=True)
# Make sure columns 0..3 exist even when no haiku has that many lines, so the
# checks below never hit a missing column.
haiku_lines = haiku_lines.reindex(columns=range(max(4, haiku_lines.shape[1])))

# Get ones with exactly 3 lines: lines 0-2 present, line 3 absent or empty
# (a trailing newline produces an empty fourth piece), nothing after that.
has_three_lines = haiku_lines[[0, 1, 2]].notna().all(axis=1)
fourth_is_blank = haiku_lines[3].isna() | (haiku_lines[3] == '')
nothing_after = haiku_lines.iloc[:, 4:].isna().all(axis=1)
keep = has_three_lines & fourth_is_blank & nothing_after

# .copy() so that adding 'lang' below writes to an independent frame rather
# than to a filtered slice.
df = df.join(haiku_lines[[0, 1, 2]])[keep].copy()
df['lang'] = 'en'
df.head(10)

Unnamed: 0,haiku,0,1,2,lang
0,rectory roofers\ntheir ladders\ntake them high...,rectory roofers,their ladders,take them higher,en
1,summer cabin\nthe ants\ndo the dishes\n,summer cabin,the ants,do the dishes,en
2,lagoon at sunrise?\nthe shadow\nchases its pel...,lagoon at sunrise?,the shadow,chases its pelican,en
3,barren trees\neven the tiniest twig\nembraced ...,barren trees,even the tiniest twig,embraced by the mist,en
4,windfall apples\nbees tango\nto a waltz\n,windfall apples,bees tango,to a waltz,en
5,that foghorn bawl?\ncalf separated\nfrom its m...,that foghorn bawl?,calf separated,from its mother,en
6,spray art\nthe cherry knows\nno bounds\n,spray art,the cherry knows,no bounds,en
7,cold night\nthe escalating heat\nof my old not...,cold night,the escalating heat,of my old notebook,en
8,fly fishing\nmy thoughts\nuntangle\n,fly fishing,my thoughts,untangle,en
9,arrows of geese\nthe puppy chases\nleaves on t...,arrows of geese,the puppy chases,leaves on the wind,en


In [59]:
# Dictionary of words not found, must go get the phonemes
# http://www.speech.cs.cmu.edu/tools/lextool.html
NOT_FOUND = set()

def get_syllable_count(line):
    """Return the possible syllable totals for one line of text.

    Each word is looked up in WORDS. A word may have several pronunciations
    with different syllable counts, so every combination is summed and all
    distinct totals are reported.

    Parameters:
        line: the text to measure, or a null value.

    Returns:
        A string of the distinct totals in ascending order joined by ','
        (for example '5' or '4,5'), or None when `line` is null or when at
        least one word is not in WORDS.

    Side effects:
        Every word of the line that is missing from WORDS is added to
        NOT_FOUND.
    """
    if pd.isnull(line):
        return None

    # Replace numeric words with the words written out
    tokens = []
    for token in line.lower().split():
        if re.search(r'\d', token):
            spelled = inflect_engine.number_to_words(token).replace('-', ' ')
            tokens.extend(spelled.split())
        else:
            tokens.append(token)

    counts = {0}
    missing = []
    for token in tokens:
        # Keep the run of word characters/apostrophes that follows any leading
        # quotes; trailing punctuation is discarded. The pattern can match the
        # empty string, so the match is never None.
        word = re.match(r'[\'"]*([\w\']*)[\'"]*(.*)', token).groups()[0]
        if not word:
            continue

        if word not in WORDS:
            # Retry without surrounding apostrophes (e.g. a possessive "dogs'").
            word = word.strip('\'')
        if word not in WORDS:
            # Keep scanning so all unknown words of the line get recorded.
            missing.append(word)
            continue

        options = {p['syllables'] for p in WORDS[word]}
        # A set keeps only distinct running totals instead of one per combination.
        counts = {c + s for c in counts for s in options}

    if missing:
        NOT_FOUND.update(missing)
        return None

    return ','.join(str(c) for c in sorted(counts))

In [60]:
# Add one '<n>_syllables' column per haiku line column (0, 1, 2), holding the
# value get_syllable_count returns for that line.
for line_no in (0, 1, 2):
    column_name = '%s_syllables' % line_no
    df[column_name] = df[line_no].apply(get_syllable_count)

# Words that could not be found while counting.
print(NOT_FOUND)

df

set()


Unnamed: 0,haiku,0,1,2,lang,0_syllables,1_syllables,2_syllables
0,rectory roofers\ntheir ladders\ntake them high...,rectory roofers,their ladders,take them higher,en,5,3,4
1,summer cabin\nthe ants\ndo the dishes\n,summer cabin,the ants,do the dishes,en,4,2,4
2,lagoon at sunrise?\nthe shadow\nchases its pel...,lagoon at sunrise?,the shadow,chases its pelican,en,5,3,6
3,barren trees\neven the tiniest twig\nembraced ...,barren trees,even the tiniest twig,embraced by the mist,en,3,7,5
4,windfall apples\nbees tango\nto a waltz\n,windfall apples,bees tango,to a waltz,en,4,3,3
5,that foghorn bawl?\ncalf separated\nfrom its m...,that foghorn bawl?,calf separated,from its mother,en,4,5,4
6,spray art\nthe cherry knows\nno bounds\n,spray art,the cherry knows,no bounds,en,2,4,2
7,cold night\nthe escalating heat\nof my old not...,cold night,the escalating heat,of my old notebook,en,2,6,5
8,fly fishing\nmy thoughts\nuntangle\n,fly fishing,my thoughts,untangle,en,3,2,3
9,arrows of geese\nthe puppy chases\nleaves on t...,arrows of geese,the puppy chases,leaves on the wind,en,4,5,4


In [52]:
# Save the words collected in NOT_FOUND, sorted, one per line.
with open('words3.txt', 'w') as out_file:
    out_file.writelines(missing + '\n' for missing in sorted(NOT_FOUND))

In [61]:
# Write the annotated frame to CSV without the index column.
output_csv = 'haikus_sballas8_with_syllables.csv'
df.to_csv(output_csv, index=False)