## Load lines of text

In [20]:
f = open("demo.txt", "r")
lines = f.readlines()

## Phonemes and Carnegie Mellon Pronouncing Dictionary

(See https://github.com/cmusphinx/cmudict/tree/4c6a365cea2c34340ffc218d5af7a38920fa7e37)

From https://www.nltk.org/_modules/nltk/corpus/reader/cmudict.html:

The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
Copyright 1998 Carnegie Mellon University

File Format: Each line consists of an uppercased word, a counter
(for alternative pronunciations), and a transcription.  Vowels are
marked for stress (1=primary, 2=secondary, 0=no stress).  E.g.:
NATURAL 1 N AE1 CH ER0 AH0 L

The dictionary contains 127069 entries.  Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations.  Many of these are fast-speech variants.

Phonemes: There are 39 phonemes, as shown below:

    Phoneme Example Translation    Phoneme Example Translation
    ------- ------- -----------    ------- ------- -----------
    AA      odd     AA D           AE      at      AE T
    AH      hut     HH AH T        AO      ought   AO T
    AW      cow     K AW           AY      hide    HH AY D
    B       be      B IY           CH      cheese  CH IY Z
    D       dee     D IY           DH      thee    DH IY
    EH      Ed      EH D           ER      hurt    HH ER T
    EY      ate     EY T           F       fee     F IY
    G       green   G R IY N       HH      he      HH IY
    IH      it      IH T           IY      eat     IY T
    JH      gee     JH IY          K       key     K IY
    L       lee     L IY           M       me      M IY
    N       knee    N IY           NG      ping    P IH NG
    OW      oat     OW T           OY      toy     T OY
    P       pee     P IY           R       read    R IY D
    S       sea     S IY           SH      she     SH IY
    T       tea     T IY           TH      theta   TH EY T AH
    UH      hood    HH UH D        UW      two     T UW
    V       vee     V IY           W       we      W IY
    Y       yield   Y IY L D       Z       zee     Z IY
    ZH      seizure S IY ZH ER
    
From https://www.pythonstudio.us/language-processing/a-pronouncing-dictionary.html:

For each word, this lexicon provides a list of phonetic codes—distinct labels for each contrastive sound—known as phones. Observe that fire has two pronunciations (in U.S. English): the one-syllable F AY1 R, and the two-syllable F AY1 ER0. The symbols in the CMU Pronouncing Dictionary are from the Arpabet, described in more detail at http://en.wikipedia.org/wiki/Arpabet.

In [21]:
import nltk

entries = nltk.corpus.cmudict.entries()

phoneme_list = ['AA','AH','AW','B','D','EH','EY','G','IH','JH','L','N','OW','P','S','T','UH','V','Y','ZH',
                'AE','AO','AY','CH','DH','ER','F','HH','IY','K','M','NG','OY','R','SH','TH','UW','W','Z']
phoneme_vowel_list = ['AA','AH','AW','EH','EY','IH','OW','UH','AE','AO','AY','ER','IY','OY','UW']
phoneme_consonant_list = ['B','D','G','JH','L','N','P','S','T','V','Y','ZH','CH',
                          'DH','F','HH','K','M','NG','R','SH','TH','W','Z']

## Code for converting words to phonemes

In [22]:
from g2p_en import G2p
g2p = G2p()

# Attempt to convert phonemes to graphemes:
# https://github.com/wassname/phoneme2grapheme/blob/master/main.ipynb

## Code for converting phonemes to words

In [23]:
def get_unique_numbers(numbers):
    unique = []
    for number in numbers:
        if number not in unique:
            unique.append(number)
    return unique

def p2g(phoneme_list, istart=0): 
    '''
    Generate a list of words from a list of phonemes,
    by concatenating sequences of the phonemes 
    and searching in CMU's Pronunciation Dictionary.
    '''
    words_from_phonemes = []
    for istop in range(istart + 1, len(phoneme_list) + 1):
        phoneme_subset = phoneme_list[istart:istop]
        #print(phoneme_subset)
        for word, pron in entries:
            if len(pron) == len(phoneme_subset):
                match = 0
                for index, p in enumerate(pron):
                    if re.sub(r'\d+', '', p) == re.sub(r'\d+', '', phoneme_subset[index]):
                        match += 1
                if match == len(pron):
                    words_from_phonemes.append([word, istart, istart + index])
                    
    unique_stops = get_unique_numbers([i2 for x,i1,i2 in words_from_phonemes])

    return words_from_phonemes, unique_stops

## Code for counting syllables

In [24]:
# https://datascience.stackexchange.com/questions/23376/how-to-get-the-number-of-syllables-in-a-word

import re

VOWEL_RUNS = re.compile("[aeiouy]+", flags=re.I)
EXCEPTIONS = re.compile(
    # fixes trailing e issues:
    # smite, scared
    "[^aeiou]e[sd]?$|"
    # fixes adverbs:
    # nicely
    + "[^e]ely$",
    flags=re.I
)
ADDITIONAL = re.compile(
    # fixes incorrect subtractions from exceptions:
    # smile, scarred, raises, fated
    "[^aeioulr][lr]e[sd]?$|[csgz]es$|[td]ed$|"
    # fixes miscellaneous issues:
    # flying, piano, video, prism, fire, evaluate
    + ".y[aeiou]|ia(?!n$)|eo|ism$|[^aeiou]ire$|[^gq]ua",
    flags=re.I
)

def count_syllables(word):
    vowel_runs = len(VOWEL_RUNS.findall(word))
    exceptions = len(EXCEPTIONS.findall(word))
    additional = len(ADDITIONAL.findall(word))
    return max(1, vowel_runs - exceptions + additional)

## Find phonemes, stresses, and number of syllables per input line of text

In [25]:
phonemes_per_line = []
stresses_per_line = []
syllables_per_line = []
consonants_per_line = []

for line in lines:
    if line.strip() != "":
        words = line.split()
        phonemes_for_line = []
        stresses_for_line = []
        syllables_for_line = 0
        for word in words:
            
            # Extract phonemes per word (choose the first version of the phoneme)
            #     :: multiple pronunciations: pronouncing.phones_for_word(word) 
            phonemes_and_stresses_for_word = g2p(word)
                      
            phonemes_for_word = [re.sub(r'\d+', '', x) for x in phonemes_and_stresses_for_word]
            stresses_blanks_for_word = [re.sub(r"(?:[A-Z])",'', x) for x in phonemes_and_stresses_for_word]
            stresses_for_word = []
            syllables_for_word = 0
            for i,p in enumerate(phonemes_for_word):
                if p in phoneme_list:
                    if stresses_blanks_for_word[i] == '':
                        stresses_for_word.append(0)
                    elif stresses_blanks_for_word[i] == '0':
                        stresses_for_word.append(0)
                    elif stresses_blanks_for_word[i] == '1':
                        stresses_for_word.append(1)
                    elif stresses_blanks_for_word[i] == '2':
                        stresses_for_word.append(2)                        
            phonemes_for_word = [x for x in phonemes_for_word if x in phoneme_list]                  
            phonemes_for_line += phonemes_for_word  
            stresses_for_line += stresses_for_word
            syllables_for_line += count_syllables(word)

        phonemes_per_line.append(phonemes_for_line)
        stresses_per_line.append(stresses_for_line)
        syllables_per_line.append(syllables_for_line)
        consonants_per_line.append([x for x in phonemes_for_line if x in phoneme_consonant_list]) 

print(phonemes_per_line)
print(consonants_per_line)
print(stresses_per_line)
print(syllables_per_line)

[['L', 'AY', 'K', 'AH', 'D', 'AH', 'K', 'T', 'UW', 'W', 'AO', 'T', 'ER']]
[['L', 'K', 'D', 'K', 'T', 'W', 'T']]
[[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0]]
[6]


## Find all words that sound like each segment of a phoneme list

In [26]:
new_words_per_line = []
for phonemes_for_line in phonemes_per_line:   
    new_words = []
    istart = 0
    unique_stops = [-1]
    while istart < len(phonemes_for_line):
        if len(unique_stops) == 0:
            unique_stops = [istart + 1]
        for istop in unique_stops:
            istart = istop + 1
            if istart < len(phonemes_for_line):
                words_from_phonemes, unique_stops = p2g(phonemes_for_line, istart)
                new_words += words_from_phonemes
    new_words_per_line.append(new_words)
    
print(new_words_per_line)

[[['lai', 0, 1], ['lie', 0, 1], ['ly', 0, 1], ['lye', 0, 1], ['like', 0, 2], ['lyke', 0, 2], ['ca', 2, 3], ['cudd', 2, 4], ['a', 3, 3], ['uh', 3, 3], ['uhh', 3, 3], ['uhde', 3, 4], ['dah', 4, 5], ['de', 4, 5], ['du', 4, 5], ['duh', 4, 5], ['duc', 4, 6], ['duck', 4, 6], ['duk', 4, 6], ['ducked', 4, 7], ['duct', 4, 7], ['a', 5, 5], ['uh', 5, 5], ['uhh', 5, 5], ['ooh', 8, 8], ['oooh', 8, 8], ['ou', 8, 8], ['waugh', 9, 10], ['wat', 9, 11], ['water', 9, 12], ['ter', 11, 12], ['are', 12, 12], ['er', 12, 12], ['err', 12, 12], ['eure', 12, 12], ['or', 12, 12], ['ur', 12, 12]]]


## Construct word sequences that sound like each line of input text

In [55]:
def find_words_with_istart_index(word_istart_istop_list, istart_index):
    # store each word that begins at istart
    istart_words = []
    istart_istarts = []
    istart_istops = []
    for word, istart, istop in word_istart_istop_list:
        if istart == istart_index:
            istart_words.append(word)
            istart_istarts.append(istart)
            istart_istops.append(istop)
    
    return istart_words, istart_istarts, istart_istops







candidates_per_line = []
for new_words_for_line in new_words_per_line:
    new_words = []
    istarts = []
    istops = []
    for new_word, istart, istop in new_words_for_line:
        new_words.append(new_word)
        istarts.append(istart)
        istops.append(istop)        
    unique_istarts = get_unique_numbers(istarts)
    unique_istops = get_unique_numbers(istops)

    #print(unique_istarts)
    #print(unique_istops)
    
    new_line = []
    running_start = 0
    print(new_words_for_line)
    
#        find words that begin at that word's istop + 1
#        advance to next word
#            for each word that begins at that word's istop
#                find words that begin at that word's istop + 1
#
#          add word if (1 + istop for word) in unique_istarts

        
#    unique_stops = [-1]
#    while istart < len(phonemes_for_line):
#        if len(unique_stops) == 0:
#            unique_stops = [istart + 1]
#        for istop in unique_stops:
#            istart = istop + 1
#            if istart < len(phonemes_for_line):
#                words_from_phonemes, unique_stops = p2g(phonemes_for_line, istart)
#                new_words += words_from_phonemes
#    new_words_per_line.append(new_words)
            
    
    istart_index = 0
    running_start = 0
    istart_words, istart_istarts, istart_istops = find_words_with_istart_index(new_words_for_line, istart_index)
    print(istart_words, istart_istarts, istart_istops)
    for i_istart_word, istart_word in enumerate(istart_words):
        if running_start in unique_istarts:
            if istart == running_start:
                new_line.append(new_word)
                running_start = istop + 1
                #print(new_line)
                #print(running_start)
        else: 
            istart0 += 1
            break
    
    
    
    
    for new_word, istart, istop in new_words_for_line:
        #print(new_words_for_line[istart0::])
        if running_start in unique_istarts:
            if istart == running_start:
                new_line.append(new_word)
                running_start = istop + 1
                #print(new_line)
                #print(running_start)
        else: 
            istart0 += 1
            break
    

[['lai', 0, 1], ['lie', 0, 1], ['ly', 0, 1], ['lye', 0, 1], ['like', 0, 2], ['lyke', 0, 2], ['ca', 2, 3], ['cudd', 2, 4], ['a', 3, 3], ['uh', 3, 3], ['uhh', 3, 3], ['uhde', 3, 4], ['dah', 4, 5], ['de', 4, 5], ['du', 4, 5], ['duh', 4, 5], ['duc', 4, 6], ['duck', 4, 6], ['duk', 4, 6], ['ducked', 4, 7], ['duct', 4, 7], ['a', 5, 5], ['uh', 5, 5], ['uhh', 5, 5], ['ooh', 8, 8], ['oooh', 8, 8], ['ou', 8, 8], ['waugh', 9, 10], ['wat', 9, 11], ['water', 9, 12], ['ter', 11, 12], ['are', 12, 12], ['er', 12, 12], ['err', 12, 12], ['eure', 12, 12], ['or', 12, 12], ['ur', 12, 12]]
['lai', 'lie', 'ly', 'lye', 'like', 'lyke'] [0, 0, 0, 0, 0, 0] [1, 1, 1, 1, 2, 2]
lai
lie
ly
lye
like
lyke
