## Phonemes and Carnegie Mellon Pronouncing Dictionary

(See https://github.com/cmusphinx/cmudict/tree/4c6a365cea2c34340ffc218d5af7a38920fa7e37)

From https://www.nltk.org/_modules/nltk/corpus/reader/cmudict.html:

The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
Copyright 1998 Carnegie Mellon University

File Format: Each line consists of an uppercased word, a counter
(for alternative pronunciations), and a transcription.  Vowels are
marked for stress (1=primary, 2=secondary, 0=no stress).  E.g.:
NATURAL 1 N AE1 CH ER0 AH0 L

The dictionary contains 127069 entries.  Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations.  Many of these are fast-speech variants.

Phonemes: There are 39 phonemes, as shown below:

    Phoneme Example Translation    Phoneme Example Translation
    ------- ------- -----------    ------- ------- -----------
    AA      odd     AA D           AE      at      AE T
    AH      hut     HH AH T        AO      ought   AO T
    AW      cow     K AW           AY      hide    HH AY D
    B       be      B IY           CH      cheese  CH IY Z
    D       dee     D IY           DH      thee    DH IY
    EH      Ed      EH D           ER      hurt    HH ER T
    EY      ate     EY T           F       fee     F IY
    G       green   G R IY N       HH      he      HH IY
    IH      it      IH T           IY      eat     IY T
    JH      gee     JH IY          K       key     K IY
    L       lee     L IY           M       me      M IY
    N       knee    N IY           NG      ping    P IH NG
    OW      oat     OW T           OY      toy     T OY
    P       pee     P IY           R       read    R IY D
    S       sea     S IY           SH      she     SH IY
    T       tea     T IY           TH      theta   TH EY T AH
    UH      hood    HH UH D        UW      two     T UW
    V       vee     V IY           W       we      W IY
    Y       yield   Y IY L D       Z       zee     Z IY
    ZH      seizure S IY ZH ER
    
From https://www.pythonstudio.us/language-processing/a-pronouncing-dictionary.html:

For each word, this lexicon provides a list of phonetic codes—distinct labels for each contrastive sound—known as phones. Observe that fire has two pronunciations (in U.S. English): the one-syllable F AY1 R, and the two-syllable F AY1 ER0. The symbols in the CMU Pronouncing Dictionary are from the Arpabet, described in more detail at http://en.wikipedia.org/wiki/Arpabet.


## Process and filter CMU words

In [1]:
import nltk
import re
import pickle
import enchant

# Notebook switches.
# process_dictionary_words: when True, the cell further down re-reads the CMU
# entries through nltk, filters them and pickles the result; when False the
# previously pickled lists are loaded instead.
process_dictionary_words = False
# Name of the word-list backend used for filtering. It also becomes part of
# the pickle file names ('words_<filter_dictionary>.pkl', ...).
filter_dictionary = 'pyenchant' #'english_words_py'

# The 39 phoneme symbols of the CMU dictionary, without stress digits.
phoneme_list = ['AA','AH','AW','B','D','EH','EY','G','IH','JH','L','N','OW','P','S','T','UH','V','Y','ZH',
                'AE','AO','AY','CH','DH','ER','F','HH','IY','K','M','NG','OY','R','SH','TH','UW','W','Z']
# The 15 vowel symbols (the ones that carry a stress digit in the dictionary).
phoneme_vowel_list = ['AA','AH','AW','EH','EY','IH','OW','UH','AE','AO','AY','ER','IY','OY','UW']
# The remaining 24 consonant symbols.
phoneme_consonant_list = ['B','D','G','JH','L','N','P','S','T','V','Y','ZH','CH',
                          'DH','F','HH','K','M','NG','R','SH','TH','W','Z']

# Only the selected backend is loaded, so only that backend's global
# (enchant_dict or english_words_set) exists after this cell has run.
if filter_dictionary == 'pyenchant':
    enchant_dict = enchant.Dict("en_US")
    # Setup notes:
    #pip install cmudict
    #nltk.download('cmudict')
    #pip install pyenchant
elif filter_dictionary == 'english_words_py':
    # english-words-py (https://pypi.org/project/english-words/)
    # "Contains sets of English words from svnweb.freebsd.org/csrg/share/dict/.
    # This is up to date with revision 61569 of their words list."
    from english_words import english_words_set
        

def filter_dictionary_words(words, pronunciations, pronunciations_no_stresses,
                            stresses, filter_words, filter_strings,
                            filter_dictionary='english_words_py', verbose=False):
    '''
    Keep only the entries whose word is accepted by the chosen word list.

    A word is retained when all of the following hold:
      * the backend knows it, either as given or capitalized
        ('english_words_py' additionally accepts a known word plus a
        trailing "s");
      * it is not listed in filter_words;
      * it contains none of the substrings in filter_strings.

    Only the first entry of each distinct word is considered; later entries
    of the same word (alternative pronunciations) are skipped and are not
    reported as removed.

    Parameters
    ----------
    words : list of str
    pronunciations, pronunciations_no_stresses, stresses : lists
        Parallel to words; the items at a retained word's index are copied
        to the corresponding output list.
    filter_words : iterable of str
        Words to reject outright.
    filter_strings : iterable of str
        Substrings that disqualify a word.
    filter_dictionary : str
        'pyenchant' (uses the module-level enchant_dict) or
        'english_words_py' (uses the module-level english_words_set).
        With any other value no word is accepted.
    verbose : bool
        Print the retained/removed counts when at least one word was removed.

    Returns
    -------
    filtered_words, filtered_pronunciations,
    filtered_pronunciations_no_stresses, filtered_stresses, removed_words
    '''
    def is_known_word(word):
        # Backend lookup, trying the word as given and capitalized.
        if filter_dictionary == 'pyenchant':
            return enchant_dict.check(word) or enchant_dict.check(word.capitalize())
        if filter_dictionary == 'english_words_py':
            for candidate in (word, word.capitalize()):
                if candidate in english_words_set:
                    return True
                # Accept simple plurals of known words.
                if candidate[-1:] == 's' and candidate[:-1] in english_words_set:
                    return True
        return False

    excluded = set(filter_words)
    seen = set()  # set membership instead of rescanning the output list

    filtered_words = []
    filtered_pronunciations = []
    filtered_pronunciations_no_stresses = []
    filtered_stresses = []
    removed_words = []
    for iword, word in enumerate(words):
        if word in seen:
            continue
        seen.add(word)

        # The exclusions apply to every word, however the backend matched it.
        if word in excluded or any(x in word for x in filter_strings) \
                or not is_known_word(word):
            removed_words.append(word)
            continue

        filtered_words.append(word)
        filtered_pronunciations.append(pronunciations[iword])
        filtered_pronunciations_no_stresses.append(pronunciations_no_stresses[iword])
        filtered_stresses.append(stresses[iword])

    if verbose and removed_words != []:
        print('{0} retained words, {1} removed words'.format(len(filtered_words),len(removed_words)))

    return filtered_words, filtered_pronunciations, filtered_pronunciations_no_stresses, filtered_stresses, removed_words


def save_object(obj, pickle_file):
    """
    Pickle obj to the file at pickle_file using the highest protocol.

    Best effort: any exception raised while opening or writing is caught
    and reported with print(); nothing is re-raised and None is returned.
    """
    try:
        target = open(pickle_file, "wb")
        with target:
            pickle.dump(obj, target, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception as error:
        print("Error during pickling object (Possibly unsupported):", error)

        
def load_object(pickle_file):
    """
    Unpickle and return the object stored in the file at pickle_file.

    Best effort: any exception raised while opening or reading is caught
    and reported with print(); in that case the function returns None.

    NOTE(review): pickle can execute code on load, so this should only be
    pointed at files this notebook wrote itself.
    """
    try:
        with open(pickle_file, "rb") as source:
            loaded = pickle.load(source)
    except Exception as error:
        print("Error during unpickling object (Possibly unsupported):", error)
    else:
        return loaded
 

# CMU Pronunciation dictionary words and pronunciations:
# either rebuild the filtered word lists from the nltk corpus and pickle them,
# or load the lists pickled by an earlier run with the same filter_dictionary.
if process_dictionary_words:
    cmu_entries = nltk.corpus.cmudict.entries()
    cmu_words = []
    cmu_pronunciations = []
    cmu_pronunciations_no_stresses = []
    cmu_stresses = []
    for cmu_word, cmu_pronunciation in cmu_entries:
        cmu_words.append(cmu_word.strip())
        cmu_pronunciations.append(cmu_pronunciation)
        # Phoneme symbols with the stress digits removed, e.g. 'AE1' -> 'AE'
        cmu_pronunciations_no_stresses.append([re.sub(r'\d+', '', x) for x in cmu_pronunciation])
        # Stress digits with the letters removed, e.g. 'AE1' -> '1', 'N' -> ''
        cmu_stresses.append([re.sub(r'[A-Za-z]', '', x) for x in cmu_pronunciation])

    print('Filter the CMU dictionary...')

    filter_strings = ['.',',']

    # Load filter words (one per line); the context manager closes the file
    # even if reading fails.
    with open("data/dictionaries/filter_words.txt", "r") as fread_filter:
        filter_words = [x.strip() for x in fread_filter]

    all_words, all_pronunciations, all_pronunciations_no_stresses, all_stresses, nonwords = filter_dictionary_words(
        cmu_words, cmu_pronunciations, cmu_pronunciations_no_stresses, cmu_stresses,
        filter_words, filter_strings, filter_dictionary, verbose=False)

    save_object(all_words, 'data/dictionaries/words_{0}.pkl'.format(filter_dictionary))
    save_object(all_pronunciations, 'data/dictionaries/pronunciations_{0}.pkl'.format(filter_dictionary))
    save_object(all_pronunciations_no_stresses, 'data/dictionaries/pronunciations_no_stresses_{0}.pkl'.format(filter_dictionary))
    save_object(all_stresses, 'data/dictionaries/stresses_{0}.pkl'.format(filter_dictionary))
else:
    # load_object() returns None (after printing the error) if a file is missing.
    all_words = load_object('data/dictionaries/words_{0}.pkl'.format(filter_dictionary))
    all_pronunciations = load_object('data/dictionaries/pronunciations_{0}.pkl'.format(filter_dictionary))
    all_pronunciations_no_stresses = load_object('data/dictionaries/pronunciations_no_stresses_{0}.pkl'.format(filter_dictionary))
    all_stresses = load_object('data/dictionaries/stresses_{0}.pkl'.format(filter_dictionary))

## Test different dictionaries

In [2]:
# Optional comparison of several English word lists: prints the size of each
# list and whether a handful of probe words is found in each of them.
test_dictionaries = False
if test_dictionaries:

    enchant_dict = enchant.Dict("en_US")
    #pip install cmudict
    #nltk.download('cmudict')
    #pip install pyenchant

    # english-words-py (https://pypi.org/project/english-words/)
    # "Contains sets of English words from svnweb.freebsd.org/csrg/share/dict/.
    # This is up to date with revision 61569 of their words list."
    from english_words import english_words_set

    # Most Common English Words (https://github.com/dolph/dictionary)
    # "enable1.txt (172,819), the more verbose version of the Official Scrabble Player's Dictionary
    # (which is limited to words of 8 letters or less)"
    # "popular.txt (25,322) represents the common subset of words found in both enable1.txt and Wiktionary's
    # word frequency lists, which are in turn compiled by statistically analyzing a sample of 29 million
    # words used in English TV and movie scripts."
    # Context managers make sure both files are closed again.
    with open('data/dictionaries/enable1.txt') as enable1_file:
        enable1 = [line.rstrip() for line in enable1_file]
    with open('data/dictionaries/popular.txt') as popular_file:
        popular = [line.rstrip() for line in popular_file]

    # NLTK words corpus:
    #nltk.download('words')
    from nltk.corpus import words
    nltk_wordset = set(words.words())

    # Wiktionary Word Frequency_lists (https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists#English)
    #https://gist.github.com/h3xx/1976236

    print('nltk_wordset:      {0}'.format(len(nltk_wordset)))
    print('enable1:           {0}'.format(len(enable1)))
    print('pyenchant:         {0}'.format('?')) #len(enchant_dict.values())))
    print('filtered CMU:      {0}'.format(len(all_words)))
    print('english-words-py:  {0}'.format(len(english_words_set)))
    print('popular:           {0}'.format(len(popular)))
    print()
    test_words = ["can't", 'geese', 'shelves', 'Thai', 'thai', 'e.', 'eure', 'bott', 'bitter', 'used']
    for test_word in test_words:
        print(test_word)
        print('        NLTK words corpus:  {0}'.format(test_word in nltk_wordset))
        print('        enable frequency:   {0}'.format(test_word in enable1))
        print('        pyenchant spelling: {0}'.format(enchant_dict.check(test_word)))
        print('        filtered CMU:       {0}'.format(test_word in all_words))
        print('        english-words-py:   {0}'.format(test_word in english_words_set))
        print('        popular frequency:  {0}'.format(test_word in popular))
        print()

    nltk_wordset:      235892
    enable1:           172823
    pyenchant:         ?
    filtered CMU:      59539
    english-words-py:  25487
    popular:           25322

    can't
            NLTK words corpus:  False
            enable frequency:   False
            pyenchant spelling: True
            filtered CMU:       True
            english-words-py:   True
            popular frequency:  False

    geese
            NLTK words corpus:  False
            enable frequency:   True
            pyenchant spelling: True
            filtered CMU:       True
            english-words-py:   True
            popular frequency:  True

    shelves
            NLTK words corpus:  False
            enable frequency:   True
            pyenchant spelling: True
            filtered CMU:       True
            english-words-py:   False
            popular frequency:  True

    Thai
            NLTK words corpus:  True
            enable frequency:   False
            pyenchant spelling: True
            filtered CMU:       False
            english-words-py:   True
            popular frequency:  False

    thai
            NLTK words corpus:  False
            enable frequency:   False
            pyenchant spelling: False
            filtered CMU:       True
            english-words-py:   False
            popular frequency:  False

    e.
            NLTK words corpus:  False
            enable frequency:   False
            pyenchant spelling: True
            filtered CMU:       False
            english-words-py:   False
            popular frequency:  False

    eure
            NLTK words corpus:  False
            enable frequency:   False
            pyenchant spelling: False
            filtered CMU:       True
            english-words-py:   False
            popular frequency:  False

    bott
            NLTK words corpus:  True
            enable frequency:   True
            pyenchant spelling: True
            filtered CMU:       True
            english-words-py:   False
            popular frequency:  False

    bitter
            NLTK words corpus:  True
            enable frequency:   True
            pyenchant spelling: True
            filtered CMU:       True
            english-words-py:   False
            popular frequency:  True

    used
            NLTK words corpus:  True
            enable frequency:   True
            pyenchant spelling: True
            filtered CMU:       True
            english-words-py:   False
            popular frequency:  True

## Code to convert text to phonemes (and stresses and number of syllables)

In [3]:
# Grapheme-to-phoneme converter from the g2p_en package.
# words_to_sounds() calls it once per word and treats every returned token as
# a phoneme symbol with an optional stress digit; tokens that are not in
# phoneme_list are discarded there.
from g2p_en import G2p
word_to_phonemes = G2p()


# Heuristic, spelling-based syllable counter
# https://datascience.stackexchange.com/questions/23376/how-to-get-the-number-of-syllables-in-a-word

# Every run of consecutive vowel letters (y included) is one candidate syllable.
VOWEL_RUNS = re.compile("[aeiouy]+", flags=re.IGNORECASE)

# Spellings whose vowel run is silent; each match is subtracted.
EXCEPTIONS = re.compile(
    "|".join([
        "[^aeiou]e[sd]?$",  # trailing e issues: smite, scared
        "[^e]ely$",         # adverbs: nicely
    ]),
    flags=re.IGNORECASE,
)

# Spellings that hide a syllable; each match is added.
ADDITIONAL = re.compile(
    "|".join([
        # incorrect subtractions from EXCEPTIONS: smile, scarred, raises, fated
        "[^aeioulr][lr]e[sd]?$",
        "[csgz]es$",
        "[td]ed$",
        # miscellaneous: flying, piano, video, prism, fire, evaluate
        ".y[aeiou]",
        "ia(?!n$)",
        "eo",
        "ism$",
        "[^aeiou]ire$",
        "[^gq]ua",
    ]),
    flags=re.IGNORECASE,
)


def count_syllables(word):
    """
    Estimate the number of syllables in word from its spelling.

    The estimate is the number of vowel runs, minus the EXCEPTIONS matches,
    plus the ADDITIONAL matches, and is never less than 1.
    """
    estimate = len(VOWEL_RUNS.findall(word))
    estimate -= len(EXCEPTIONS.findall(word))
    estimate += len(ADDITIONAL.findall(word))
    return estimate if estimate > 1 else 1


def words_to_sounds(words):
    '''
    Convert a list of words to phonemes, consonants, stresses and a
    syllable count.

    Each word is passed to word_to_phonemes(); every returned token is split
    into its symbol (digits removed) and its stress (capital letters
    removed). Tokens whose symbol is not in phoneme_list are dropped.
    A missing stress digit counts as 0.

    Parameters
    ----------
    words : list of str

    Returns
    -------
    phonemes : list of str
        Phoneme symbols without stress digits, for all words in order.
    consonants : list of str
        The members of phonemes that are in phoneme_consonant_list.
    stresses : list of int
        One of 0, 1 or 2 per retained phoneme.
    syllables : int
        Sum of count_syllables(word) over all words.

    An empty list of words yields ([], [], [], 0). The result depends only
    on the argument, not on any module-level variable holding the current
    input line.
    '''
    stress_values = {'': 0, '0': 0, '1': 1, '2': 2}

    phonemes = []
    stresses = []
    syllables = 0
    for word in words:

        # Extract phonemes per word (the converter returns one pronunciation)
        for token in word_to_phonemes(word):
            phoneme = re.sub(r'\d+', '', token)
            if phoneme not in phoneme_list:
                continue
            phonemes.append(phoneme)

            stress = re.sub(r"(?:[A-Z])", '', token)
            # Any other leftover is ignored, so no stress is recorded for it.
            if stress in stress_values:
                stresses.append(stress_values[stress])

        syllables += count_syllables(word)

    consonants = [x for x in phonemes if x in phoneme_consonant_list]

    return phonemes, consonants, stresses, syllables

## Code to convert phonemes to candidate words

In [4]:
def get_unique_numbers(numbers):
    """
    Return the distinct items of numbers in order of first appearance.

    Items are compared with ==, so they do not need to be hashable.
    """
    distinct = []
    for candidate in numbers:
        if candidate in distinct:
            continue
        distinct.append(candidate)
    return distinct


def phonemes_to_candidate_words(phonemes, all_words, all_pronunciations, start=0):
    '''
    Generate a list of words from a list of phonemes,
    by concatenating sequences of the phonemes
    and searching in CMU's Pronunciation Dictionary.

    Every subsequence phonemes[start:stop] (stop = start + 1 ... len(phonemes))
    is compared, with stress digits ignored, against every dictionary entry
    that has the same number of phonemes.

    Parameters
    ----------
    phonemes : list of str
        Phoneme symbols, with or without stress digits.
    all_words : list of str
    all_pronunciations : list of lists of str
        Parallel to all_words.
    start : int
        Index in phonemes at which every subsequence begins.

    Returns
    -------
    words_from_phonemes : list of [word, start, stop_index]
        Entries whose phonemes equal the subsequence; stop_index is the
        index of the last phoneme of the subsequence (inclusive).
    unique_stops : list of int
        Distinct stop indices in words_from_phonemes, in order of appearance.
    words_from_consonants : list of [word, start, stop_index]
        Entries of the same length as the subsequence whose consonants
        (per phoneme_consonant_list) are non-empty and equal, in order,
        to the consonants of the subsequence.
    unique_stops_consonants : list of int
        Distinct stop indices in words_from_consonants.
    '''
    consonant_symbols = set(phoneme_consonant_list)
    longest = len(phonemes) - start

    # Strip the stresses of each dictionary entry once, instead of once per
    # stop index, and group the entries by length so that each subsequence
    # is only compared with entries that can match. Dictionary order is
    # preserved within each group.
    entries_by_length = {}
    for word, pronunciation in zip(all_words, all_pronunciations):
        if len(pronunciation) > longest:
            continue
        plain = [re.sub(r'\d+', '', p) for p in pronunciation]
        consonants = [p for p in plain if p in consonant_symbols]
        entries_by_length.setdefault(len(plain), []).append((word, plain, consonants))

    words_from_phonemes = []
    words_from_consonants = []

    # For each subsequence of phonemes
    for stop in range(start + 1, len(phonemes) + 1):

        # Remove stresses from the subsequence of phonemes
        phoneme_subset = [re.sub(r'\d+', '', p) for p in phonemes[start:stop]]
        consonant_subset = [p for p in phoneme_subset if p in consonant_symbols]

        for word, plain, consonants in entries_by_length.get(len(phoneme_subset), ()):

            # Same phonemes (homonym)
            if plain == phoneme_subset:
                words_from_phonemes.append([word, start, stop - 1])

            # Same (non-empty) consonant sequence
            if consonants and consonants == consonant_subset:
                words_from_consonants.append([word, start, stop - 1])

    unique_stops = get_unique_numbers([i2 for x,i1,i2 in words_from_phonemes])
    unique_stops_consonants = get_unique_numbers([i2 for x,i1,i2 in words_from_consonants])

    return words_from_phonemes, unique_stops, words_from_consonants, unique_stops_consonants


# Code to find all words that sound like each segment of each phoneme list
def phoneme_subsets_to_words(phonemes, all_words, all_pronunciations):
    '''
    Collect candidate words for successive segments of a phoneme list.

    phonemes_to_candidate_words() is called repeatedly with increasing
    start indices: the first search starts at index 0, and every stop index
    reported by a search seeds a new search at stop + 1.

    Returns
    -------
    phoneme_words, consonant_words : lists of [word, start, stop]
        The concatenated words_from_phonemes and words_from_consonants of
        all searches that were run.
    '''

    phoneme_words = []
    consonant_words = []
    start = 0
    # -1 makes the first pass of the for loop search from index 0
    unique_stops = [-1]
    while start < len(phonemes):
        # The previous search found no word: step past the current position
        # (the loop below then continues at start + 2).
        if len(unique_stops) == 0:
            unique_stops = [start + 1]
        # NOTE: unique_stops is rebound inside this loop. The for statement
        # keeps iterating over the list it started with; the next pass of the
        # while loop uses the stops returned by the last search that ran.
        for stop in unique_stops:
            start = stop + 1
            if start < len(phonemes):
                # unique_stops_consonants is returned but not used here
                words_from_phonemes, unique_stops, words_from_consonants, unique_stops_consonants = phonemes_to_candidate_words(phonemes, 
                    all_words, all_pronunciations, start)
                phoneme_words += words_from_phonemes
                consonant_words += words_from_consonants

    return phoneme_words, consonant_words

## Code to construct word sequences with matching phoneme stop and start indices

In [5]:
def copy_list(list_to_copy, ncopies):
    """
    Return a new list with the items of list_to_copy repeated ncopies times.

    The items themselves are not copied; a non-positive ncopies gives [].
    """
    return [item for _ in range(ncopies) for item in list_to_copy]


def flatten_list(nested_list):
    '''
    Return the non-list, non-tuple items of nested_list in one flat list,
    descending into lists and tuples at any depth.

    >>> nested_list = [('e1d1', ('e1d2'), ['e2d1']), 'e3d0', [], ['e5d1']]
    >>> flatten_list(nested_list)
    ['e1d1', 'e1d2', 'e2d1', 'e3d0', 'e5d1']
    '''
    if nested_list == []:
        return []

    flat = []
    for item in nested_list:
        if isinstance(item, (list, tuple)):
            flat += flatten_list(item)
            continue
        flat.append(item)
    return flat

            
def flatten_to_sublists_of_strings(nested_list):
    '''
    Flatten list to strings and sublists of strings.

    Rules, applied recursively:
      * an empty list contributes one empty list;
      * a list or tuple that contains no list or tuple is kept as one
        sublist (tuples are converted to lists);
      * in a list or tuple that does contain lists or tuples, strings are
        kept whole, nested lists/tuples are flattened by the same rules,
        and items of any other type are dropped.

    >>> nested_list = [[[], '0', ('1',11,12), ('2',21,22), ['3',31,32]], [['4',41,42]]]
    >>> flatten_to_sublists_of_strings(nested_list)
    [[], '0', ['1', 11, 12], ['2', 21, 22], ['3', 31, 32], ['4', 41, 42]]
    >>> flatten_to_sublists_of_strings([[], 'two', ('3', 31, 32)])
    [[], 'two', ['3', 31, 32]]
    '''
    if nested_list == []:
        return [[]]

    # Innermost level: nothing left to descend into
    if not any(isinstance(x, (list, tuple)) for x in nested_list):
        return [list(nested_list)]

    result = []
    for element in nested_list:
        if isinstance(element, str):
            # append, not extend: extend would split the string into
            # single characters
            result.append(element)
        elif isinstance(element, (list, tuple)):
            result.extend(flatten_to_sublists_of_strings(element))
    return result

            
def find_words_with_start_index(word_start_stop_list, start_index):
    """
    Select the (word, start, stop) triples whose start equals start_index.

    Returns three parallel lists: the words, their starts and their stops,
    in the order in which the triples appear in word_start_stop_list.
    """
    selected = [
        (word, begin, end)
        for word, begin, end in word_start_stop_list
        if begin == start_index and begin != []
    ]
    start_words = [triple[0] for triple in selected]
    starts = [triple[1] for triple in selected]
    stops = [triple[2] for triple in selected]
    return start_words, starts, stops


def organize_words_by_start(words_list):
    """
    Group [word, start, stop] triples by their start index.

    A single triple (first item neither list nor tuple) is wrapped in a
    list first. words_list must not be empty.

    Returns
    -------
    words_by_start : list of lists of str
        words_by_start[i] holds the words that start at index i, for
        i = 0 ... max_start (empty list where no word starts).
    stops : list of lists of int
        Stop indices, parallel to words_by_start.
    unique_starts, unique_stops : list of int
        Distinct start / stop indices in order of first appearance.
    max_start, max_stop : int
    """
    if not isinstance(words_list[0], (list, tuple)):
        words_list = [words_list]

    # Collect all start and stop indices
    starts2 = []
    stops2 = []
    for _word, begin, end in words_list:
        starts2.append(begin)
        stops2.append(end)
    unique_starts = get_unique_numbers(starts2)
    unique_stops = get_unique_numbers(stops2)
    max_start = max(unique_starts)
    max_stop = max(unique_stops)

    # One bucket of words (and of their stops) per start index
    words_by_start = []
    stops = []
    for start_index in range(max_start + 1):
        found = find_words_with_start_index(words_list, start_index)
        words_by_start.append(found[0])
        stops.append(found[2])

    return words_by_start, stops, unique_starts, unique_stops, max_start, max_stop


def concatenate_lists(list_of_lists1, list_of_lists2):
    """
    Combine the items of two sequences pairwise.

    For each pair (item1, item2) where item2 is a list:
      * item1 a str: one (item1, element) tuple per element of item2;
      * item1 a list: the single list item1 + item2;
      * item1 a tuple: the single list list(item1) + item2.
    All other pairs contribute nothing.
    """
    combined = []
    for left, right in zip(list_of_lists1, list_of_lists2):
        if not isinstance(right, list):
            continue
        if isinstance(left, str):
            combined.extend((left, element) for element in right)
        elif isinstance(left, list):
            combined.append(left + right)
        elif isinstance(left, tuple):
            combined.append(list(left) + right)
    return combined


def concatenate_words(prev_words, prev_stops, words_by_start, stops_by_start, unique_starts):
    '''
    Extend each previous word (or word sequence) by every word that starts
    right after it stops.

    prev_words[i] stops at prev_stops[i]; it is combined with each word in
    words_by_start[prev_stops[i] + 1], provided that index is listed in
    unique_starts. Entries without a continuation are dropped.

    Returns the extended sequences (after flatten_to_sublists_of_strings)
    and the stop indices of the appended words (after flatten_list).
    '''
    extended_groups = []
    stop_groups = []

    for position, head in enumerate(prev_words):

        # Index at which a following word has to start
        next_start = prev_stops[position] + 1
        if next_start not in unique_starts:
            continue

        followers = words_by_start[next_start]
        follower_stops = stops_by_start[next_start]
        if len(followers) == 0:
            continue

        # Pair one copy of the head with each follower
        heads = [head] * len(followers)
        tails = [[follower] for follower in followers]
        extended_groups.append(concatenate_lists(heads, tails))
        stop_groups.append(follower_stops)

    new_words = flatten_to_sublists_of_strings(extended_groups)
    new_stops = flatten_list(stop_groups)

    return new_words, new_stops


def _iter_complete_lines(words_by_start, stops_by_start, unique_starts, max_stop):
    # Yield every word sequence, joined by spaces, that ends at max_stop.
    prev_words = words_by_start[0]
    prev_stops = stops_by_start[0]
    while True:
        new_words, new_stops = concatenate_words(prev_words, prev_stops,
                                                 words_by_start, stops_by_start, unique_starts)
        for istop, stop in enumerate(new_stops):
            if stop == max_stop:
                yield ' '.join(new_words[istop])

        # Stop when all stops equal max_stop (also true when nothing is left)
        if all(x == max_stop for x in new_stops):
            return

        prev_words = new_words
        prev_stops = new_stops


def words_stop_to_start(words_by_start, stops_by_start, unique_starts, max_stop, output_file=None):
    '''
    Chain words so that each word starts right after the previous one stops,
    beginning with the words in words_by_start[0], and collect the chains
    that end at max_stop.

    Parameters
    ----------
    words_by_start, stops_by_start : lists of lists
        Words, and their stop indices, grouped by start index.
    unique_starts : list of int
        Start indices at which at least one word begins.
    max_stop : int
        Stop index a chain must reach to be emitted.
    output_file : str, optional
        If given, every complete chain is written to this file, one per
        line, as it is found (duplicates included). None or [] (the former
        default) means: do not write a file.

    Returns
    -------
    list of str
        Without output_file: the distinct complete chains in order of first
        appearance. With output_file: an empty list, since the chains are
        only written to the file.
    '''
    lines = _iter_complete_lines(words_by_start, stops_by_start, unique_starts, max_stop)

    if output_file is None or output_file == []:
        # dict keys keep insertion order, so this drops repeats in one pass
        return list(dict.fromkeys(lines))

    # "w" truncates an existing file; the context manager closes the file
    # even if building the chains fails part-way.
    with open(output_file, "w") as fwrite:
        fwrite.writelines(line + '\n' for line in lines)
    return []


def remove_duplicates(infile, outfile):
    '''
    Copy infile to outfile, keeping only the first occurrence of each line.

    Lines are compared including their line ending, so a final line without
    a newline differs from the same text followed by one. The input is read
    completely before the output is opened, so infile and outfile may be the
    same path. Both files are closed before returning.
    '''
    with open(infile) as source:
        # dict keys keep insertion order: a stable result, unlike a set
        unique_lines = list(dict.fromkeys(source))
    with open(outfile, 'w') as target:
        target.writelines(unique_lines)

## Run all code on input text

In [None]:
# Load text (the context manager closes the file again)
with open("data/demo.txt", "r") as fread:
    lines = fread.readlines()

verbose = True
verbose2 = False

# For every input line: convert it to phonemes, look up dictionary words
# matching segments of the phoneme sequence, and chain those words into
# alternative lines that are written to one text file per input line.
odd_even = 0
for line in lines:
    odd_even += 1
    if (odd_even % 2) == 0:
        spaces = '    '
    else:
        spaces = ''

    words = line.split()

    # Nothing to convert on a blank line
    if not words:
        continue

    filename_base = '_'.join(words)
    filename_phoneme_morph = filename_base + '_PHONEME_MORPH.txt'
    filename_consonant_morph = filename_base + '_CONSONANT_MORPH.txt'

    if verbose:
        print('')
        print('===============================================================================')
        print('Input line:  "{0}"'.format(line.strip()))
        print('===============================================================================')

    phonemes, consonants, stresses, syllables = words_to_sounds(words)

    if verbose:
        print('Syllables:   {0}'.format(syllables))
        print('Stresses:    {0}'.format(stresses))
        print('Phonemes:    {0}'.format(', '.join(phonemes)))
        print('Consonants:  {0}'.format(', '.join(consonants)), end='\n\n')

    if verbose and not verbose2:
        print('Convert phoneme subsets and consonant subsets to words...', end='\n\n')

    all_phoneme_words, all_consonant_words = phoneme_subsets_to_words(phonemes, all_words,
                                                                      all_pronunciations)

    if verbose:
        print('Filter out any original words from phoneme-generated words...', end='\n\n')

    phoneme_words = [x for x in all_phoneme_words if x[0] not in words]

    if verbose:
        print('Filter out any original words from consonant-generated words...', end='\n\n')

    # Keep each [word, start, stop] entry whose word is not in the input line
    consonant_words = [x for x in all_consonant_words if x[0] not in words]

    if verbose2:
        print('Phoneme words:  {0}'.format(', '.join([x[0] for x in phoneme_words])), end='\n\n')
        print('Consonant words:  {0}'.format(', '.join([x[0] for x in consonant_words])), end='\n\n')

    if verbose and not verbose2:
        print('Organize words by start index...', end='\n\n')

    if phoneme_words:
        words_by_start1, stops1, unique_starts1, unique_stops1, max_start1, max_stop1 = organize_words_by_start(flatten_to_sublists_of_strings(phoneme_words))

        if verbose2:
            print('Phoneme-generated words sorted by start index:  {0}'.format(flatten_to_sublists_of_strings(words_by_start1)), end='\n\n')

        candidate_lines1 = words_stop_to_start(words_by_start1, stops1, unique_starts1, max_stop1, filename_phoneme_morph)

        if verbose:
            print('Phoneme-generated words written to {0}'.format(filename_phoneme_morph), end='\n')

            if candidate_lines1 != []:
                for candidate_line1 in flatten_list(candidate_lines1):
                    print('    {0}{1}'.format(candidate_line1, spaces), end='\n')
                print('')

    if consonant_words:
        words_by_start2, stops2, unique_starts2, unique_stops2, max_start2, max_stop2 = organize_words_by_start(flatten_to_sublists_of_strings(consonant_words))

        if verbose2:
            print('Consonant-generated words sorted by start index:  {0}'.format(flatten_to_sublists_of_strings(words_by_start2)), end='\n\n')

        candidate_lines2 = words_stop_to_start(words_by_start2, stops2, unique_starts2, max_stop2, filename_consonant_morph)

        if verbose:
            print('Consonant-generated words written to {0}'.format(filename_consonant_morph), end='\n\n')

            if candidate_lines2 != []:
                for candidate_line2 in flatten_list(candidate_lines2):
                    print('    {0}{1}'.format(candidate_line2, spaces), end='\n')


Input line:  "I used to work for the government, but now I work for the public."
Syllables:   17
Stresses:    [1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0]
Phonemes:    AY, Y, UW, Z, D, T, UW, W, ER, K, F, AO, R, DH, AH, G, AH, V, ER, M, AH, N, T, B, AH, T, N, AW, AY, W, ER, K, F, AO, R, DH, AH, P, AH, B, L, IH, K
Consonants:  Y, Z, D, T, W, K, F, R, DH, G, V, M, N, T, B, T, N, W, K, F, R, DH, P, B, L, K

Convert phoneme subsets and consonant subsets to words...



In [11]:
phoneme_words

[['ai', 0, 0],
 ['aye', 0, 0],
 ['eye', 0, 0],
 ['i', 0, 0],
 ['ewe', 1, 2],
 ['yew', 1, 2],
 ['you', 1, 2],
 ['ewes', 1, 3],
 ["u's", 1, 3],
 ["yew's", 1, 3],
 ['used', 1, 4],
 ['to', 5, 6],
 ['too', 5, 6],
 ['tu', 5, 6],
 ['tue', 5, 6],
 ['two', 5, 6],
 ['were', 7, 8],
 ['work', 7, 9],
 ['faure', 10, 12],
 ['for', 10, 12],
 ['fore', 10, 12],
 ['four', 10, 12],
 ['the', 13, 14],
 ['gov', 15, 17],
 ['government', 15, 22],
 ['er', 18, 18],
 ['eure', 18, 18],
 ['ur', 18, 18],
 ['erma', 18, 20],
 ['irma', 18, 20],
 ['ermine', 18, 21],
 ['but', 23, 25],
 ['butt', 23, 25],
 ['now', 26, 27],
 ['ai', 28, 28],
 ['aye', 28, 28],
 ['eye', 28, 28],
 ['i', 28, 28],
 ['were', 29, 30],
 ['work', 29, 31],
 ['faure', 32, 34],
 ['for', 32, 34],
 ['fore', 32, 34],
 ['four', 32, 34],
 ['the', 35, 36],
 ['pub', 37, 39],
 ['public', 37, 42],
 ['lick', 40, 42]]