## Phonemes and Carnegie Mellon Pronouncing Dictionary

(See https://github.com/cmusphinx/cmudict/tree/4c6a365cea2c34340ffc218d5af7a38920fa7e37)

From https://www.nltk.org/_modules/nltk/corpus/reader/cmudict.html:

The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
Copyright 1998 Carnegie Mellon University

File Format: Each line consists of an uppercased word, a counter
(for alternative pronunciations), and a transcription.  Vowels are
marked for stress (1=primary, 2=secondary, 0=no stress).  E.g.:
NATURAL 1 N AE1 CH ER0 AH0 L

The dictionary contains 127069 entries.  Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations.  Many of these are fast-speech variants.

Phonemes: There are 39 phonemes, as shown below:

    Phoneme Example Translation    Phoneme Example Translation
    ------- ------- -----------    ------- ------- -----------
    AA      odd     AA D           AE      at      AE T
    AH      hut     HH AH T        AO      ought   AO T
    AW      cow     K AW           AY      hide    HH AY D
    B       be      B IY           CH      cheese  CH IY Z
    D       dee     D IY           DH      thee    DH IY
    EH      Ed      EH D           ER      hurt    HH ER T
    EY      ate     EY T           F       fee     F IY
    G       green   G R IY N       HH      he      HH IY
    IH      it      IH T           IY      eat     IY T
    JH      gee     JH IY          K       key     K IY
    L       lee     L IY           M       me      M IY
    N       knee    N IY           NG      ping    P IH NG
    OW      oat     OW T           OY      toy     T OY
    P       pee     P IY           R       read    R IY D
    S       sea     S IY           SH      she     SH IY
    T       tea     T IY           TH      theta   TH EY T AH
    UH      hood    HH UH D        UW      two     T UW
    V       vee     V IY           W       we      W IY
    Y       yield   Y IY L D       Z       zee     Z IY
    ZH      seizure S IY ZH ER
    
From https://www.pythonstudio.us/language-processing/a-pronouncing-dictionary.html:

For each word, this lexicon provides a list of phonetic codes—distinct labels for each contrastive sound—known as phones. Observe that fire has two pronunciations (in U.S. English): the one-syllable F AY1 R, and the two-syllable F AY1 ER0. The symbols in the CMU Pronouncing Dictionary are from the Arpabet, described in more detail at http://en.wikipedia.org/wiki/Arpabet.

In [14]:
import nltk
import enchant

# (word, pronunciation) entries from NLTK's CMU Pronouncing Dictionary corpus;
# the code below iterates them as ``for word, pron in entries``.
entries = nltk.corpus.cmudict.entries()
# enchant dictionary for "en_US"; its ``check`` method is used further down
# to decide which candidate words are kept.
my_dict = enchant.Dict("en_US")

# All 39 phoneme symbols from the table above, without stress digits.
phoneme_list = [
    'AA', 'AH', 'AW', 'B', 'D', 'EH', 'EY', 'G', 'IH', 'JH',
    'L', 'N', 'OW', 'P', 'S', 'T', 'UH', 'V', 'Y', 'ZH',
    'AE', 'AO', 'AY', 'CH', 'DH', 'ER', 'F', 'HH', 'IY', 'K',
    'M', 'NG', 'OY', 'R', 'SH', 'TH', 'UW', 'W', 'Z',
]

# The 15 vowel phonemes.
phoneme_vowel_list = [
    'AA', 'AH', 'AW', 'EH', 'EY', 'IH', 'OW', 'UH',
    'AE', 'AO', 'AY', 'ER', 'IY', 'OY', 'UW',
]

# The 24 consonant phonemes.
phoneme_consonant_list = [
    'B', 'D', 'G', 'JH', 'L', 'N', 'P', 'S', 'T', 'V', 'Y', 'ZH',
    'CH', 'DH', 'F', 'HH', 'K', 'M', 'NG', 'R', 'SH', 'TH', 'W', 'Z',
]

## Code to convert words to phonemes and phonemes to words

In [23]:
from g2p_en import G2p
# Grapheme-to-phoneme converter instance; words_to_sounds() calls it with one
# word at a time and strips digits / uppercase letters from the returned items.
word_to_phonemes = G2p()


def get_unique_numbers(numbers):
    '''
    Return the distinct values of ``numbers`` in order of first appearance.

    Values are compared with ``in`` (equality), so they need not be hashable.
    '''
    seen = []
    for value in numbers:
        if value in seen:
            continue
        seen.append(value)
    return seen


def phonemes_to_candidate_words(phonemes, start=0):
    '''
    Generate a list of words from a list of phonemes,
    by concatenating sequences of the phonemes
    and searching in CMU's Pronunciation Dictionary.

    Every slice ``phonemes[start:stop]`` (for each ``stop`` after ``start``)
    is compared against every dictionary pronunciation in two ways:

    * phoneme match: same length and the same phonemes once stress digits
      are removed from both sides;
    * consonant match: the pronunciation has at least as many phonemes as
      the slice, and both have the same non-empty sequence of consonants
      (members of ``phoneme_consonant_list``).

    Parameters
    ----------
    phonemes : list of str
        Phoneme symbols, optionally carrying stress digits.
    start : int
        Index in ``phonemes`` at which every candidate word begins.

    Returns
    -------
    words_from_phonemes : list of [word, start, stop]
        Phoneme matches; ``stop`` is the index of the last phoneme covered.
    unique_stops : list of int
        Distinct ``stop`` values of the phoneme matches, in first-seen order.
    words_from_consonants : list of [word, start, stop]
        Consonant matches (see the review note in the body about ``stop``).
    unique_stops_consonants : list of int
        Distinct ``stop`` values of the consonant matches.
    '''
    # Imported here so the function works even when the part of the file
    # that does ``import re`` has not been executed yet.
    import re

    stress_digits = re.compile(r'\d+')

    # Walk the dictionary once and cache what every comparison below needs,
    # instead of re-reading it and re-stripping stress digits for each stop.
    prepared_entries = []
    for word, pron in entries:
        plain_pron = [stress_digits.sub('', p) for p in pron]
        pron_consonants = [p for p in pron if p in phoneme_consonant_list]
        prepared_entries.append((word, len(pron), plain_pron, pron_consonants))

    words_from_phonemes = []
    words_from_consonants = []
    for stop in range(start + 1, len(phonemes) + 1):
        phoneme_subset = phonemes[start:stop]
        subset_length = len(phoneme_subset)
        plain_subset = [stress_digits.sub('', p) for p in phoneme_subset]
        consonant_subset = [p for p in phoneme_subset if p in phoneme_consonant_list]

        for word, pron_length, plain_pron, pron_consonants in prepared_entries:

            # Find matching phonemes (homonym)
            if pron_length == subset_length and plain_pron == plain_subset:
                words_from_phonemes.append([word, start, start + subset_length - 1])

            # Find matching consonants
            if (pron_length >= subset_length
                    and pron_consonants
                    and consonant_subset
                    and pron_consonants == consonant_subset):
                # NOTE(review): this stop counts matched consonants, not
                # phonemes, so it is not the index of the slice's last phoneme
                # (stop - 1) whenever the slice contains a vowel.  Kept as in
                # the original; confirm whether ``stop - 1`` was intended.
                words_from_consonants.append(
                    [word, start, start + len(consonant_subset) - 1])

    unique_stops = get_unique_numbers([i2 for x, i1, i2 in words_from_phonemes])
    unique_stops_consonants = get_unique_numbers([i2 for x, i1, i2 in words_from_consonants])

    return words_from_phonemes, unique_stops, words_from_consonants, unique_stops_consonants

## Code to count syllables

In [24]:
# https://datascience.stackexchange.com/questions/23376/how-to-get-the-number-of-syllables-in-a-word

import re

# One match per run of consecutive vowel letters (y counts as a vowel).
VOWEL_RUNS = re.compile("[aeiouy]+", flags=re.IGNORECASE)

# Endings whose vowel run should NOT be counted as a syllable.
EXCEPTIONS = re.compile(
    # trailing e, e.g. smite, scared
    "[^aeiou]e[sd]?$|"
    # adverbs, e.g. nicely
    "[^e]ely$",
    flags=re.IGNORECASE,
)

# Patterns that add a syllable back.
ADDITIONAL = re.compile(
    # undo over-eager subtractions made by EXCEPTIONS,
    # e.g. smile, scarred, raises, fated
    "[^aeioulr][lr]e[sd]?$|[csgz]es$|[td]ed$|"
    # miscellaneous cases, e.g. flying, piano, video, prism, fire, evaluate
    ".y[aeiou]|ia(?!n$)|eo|ism$|[^aeiou]ire$|[^gq]ua",
    flags=re.IGNORECASE,
)


def count_syllables(word):
    '''
    Estimate the number of syllables in ``word`` from its spelling.

    The estimate is (vowel runs) - (EXCEPTIONS matches) + (ADDITIONAL
    matches), and is never less than 1.
    '''
    estimate = 0
    for pattern, weight in ((VOWEL_RUNS, 1), (EXCEPTIONS, -1), (ADDITIONAL, 1)):
        estimate += weight * len(pattern.findall(word))
    return estimate if estimate > 1 else 1

## Code to extract phonemes, stresses, and number of syllables per line of text

In [25]:
def words_to_sounds(line):
    '''
    Extract phonemes, consonants, stresses and a syllable count from a line.

    Each whitespace-separated word is passed to ``word_to_phonemes``; items
    whose digit-stripped form is not in ``phoneme_list`` are discarded.

    Parameters
    ----------
    line : str
        A line of text.

    Returns
    -------
    phonemes : list of str
        Phonemes of all words, in order, without stress digits.
    consonants : list of str
        The members of ``phonemes`` found in ``phoneme_consonant_list``.
    stresses : list of int
        One value per kept phoneme whose stress marker is '', '0', '1' or
        '2' (a missing marker counts as 0); other markers add no value.
    syllables : int
        Sum of ``count_syllables(word)`` over the words.

    A blank line yields ``([], [], [], 0)``; previously it raised
    UnboundLocalError because nothing was assigned before the return.
    '''
    stress_values = {'': 0, '0': 0, '1': 1, '2': 2}

    phonemes = []
    stresses = []
    syllables = 0

    # str.split() returns [] for an empty or whitespace-only line, so the
    # loop is skipped and the empty defaults above are returned.
    for word in line.split():

        # Extract phonemes per word (choose the first version of the phoneme)
        #     :: multiple pronunciations: pronouncing.phones_for_word(word)
        for item in word_to_phonemes(word):
            phoneme = re.sub(r'\d+', '', item)
            if phoneme not in phoneme_list:
                continue
            phonemes.append(phoneme)

            # What is left after removing the uppercase letters is the
            # stress marker.
            stress_marker = re.sub(r"(?:[A-Z])", '', item)
            if stress_marker in stress_values:
                stresses.append(stress_values[stress_marker])

        syllables += count_syllables(word)

    consonants = [x for x in phonemes if x in phoneme_consonant_list]

    return phonemes, consonants, stresses, syllables

## Find all words that sound like each segment of each phoneme list

In [26]:
def phoneme_subsets_to_words(phonemes):
    '''
    Collect candidate words for every start position reachable from index 0.

    Position 0 is always searched.  Each phoneme-matched word found at a
    start position makes the position right after its stop index reachable,
    and every reachable position is searched exactly once with
    ``phonemes_to_candidate_words``.

    The earlier implementation overwrote its list of stops while sweeping
    over it, so only the stops of the last search in a sweep were followed
    and the same position could be searched more than once.

    When a position yields no phoneme-matched word, the search resumes at
    the next phoneme (the earlier code resumed two positions later, so one
    phoneme was never tried as a start).

    Parameters
    ----------
    phonemes : list of str
        Phoneme symbols for one line of text.

    Returns
    -------
    phoneme_words : list of [word, start, stop]
        Phoneme matches from all searched positions, by ascending start.
    consonant_words : list of [word, start, stop]
        Consonant matches from the same searches.
    '''
    phoneme_words = []
    consonant_words = []

    # A word's stop is never before its start, so reachable positions only
    # ever lie ahead of the current one and a single forward pass suffices.
    reachable_starts = {0}
    for start in range(len(phonemes)):
        if start not in reachable_starts:
            continue

        (words_from_phonemes, unique_stops,
         words_from_consonants, _unique_stops_consonants) = \
            phonemes_to_candidate_words(phonemes, start)
        phoneme_words += words_from_phonemes
        consonant_words += words_from_consonants

        if unique_stops:
            reachable_starts.update(stop + 1 for stop in unique_stops)
        else:
            # Dead end: carry on from the following phoneme.
            reachable_starts.add(start + 1)

    return phoneme_words, consonant_words

## Filter words by another English dictionary

In [27]:
#print(my_dict.check("Thai"))

def filter_dictionary_words(words, verbose=False):
    '''
    Split ``words`` by whether ``my_dict.check`` accepts each entry's word.

    Parameters
    ----------
    words : list
        Entries whose first element is the word string.
    verbose : bool
        If True and at least one entry was rejected, print the rejected words.

    Returns
    -------
    (filtered_words, removed_words) : tuple of two lists
        Accepted entries and rejected entries, each in input order.
    '''
    kept, dropped = [], []
    for entry in words:
        bucket = kept if my_dict.check(entry[0]) else dropped
        bucket.append(entry)

    if verbose and dropped != []:
        rejected = ', '.join([entry[0] for entry in dropped])
        print('Removed words:  {0}'.format(rejected))

    return kept, dropped

## Organize words by their phoneme start and stop indices

In [28]:
def copy_list(list_to_copy, ncopies):
    '''
    Return a new list holding the items of ``list_to_copy`` repeated
    ``ncopies`` times, one full pass after another.
    '''
    return [item for _ in range(ncopies) for item in list_to_copy]


def flatten_list(nested_list):
    '''
    Return the non-list, non-tuple items of ``nested_list`` as one flat
    list, descending into lists and tuples at any depth.

    >>> nested_list = [(['tye', 'a'], 'ja')]
    >>> flatten_list(nested_list)
    ['tye', 'a', 'ja']
    '''
    flat = []
    if nested_list == []:
        return flat
    for item in nested_list:
        if isinstance(item, (list, tuple)):
            flat += flatten_list(item)
        else:
            flat.append(item)
    return flat

            
def flatten_sublists(nested_list):
    '''
    Return the innermost lists/tuples of ``nested_list`` as one flat list.

    A list or tuple whose first item is itself a list or tuple is descended
    into; any other non-empty list or tuple is kept whole.  Empty lists and
    items that are neither lists nor tuples are dropped.

    >>> nested_list = [[('pty', 'a'), ('pty', 'uh'), ('pty', 'uhh')], [('tae', 'a'), ('tae', 'uh'), ('tae', 'uhh')]]
    >>> flatten_sublists(nested_list)
    [('pty', 'a'),
     ('pty', 'uh'),
     ('pty', 'uhh'),
     ('tae', 'a'),
     ('tae', 'uh'),
     ('tae', 'uhh')]
    '''
    flattened = []
    if nested_list == []:
        return flattened
    for item in nested_list:
        if not isinstance(item, (list, tuple)):
            continue
        if item == []:
            continue
        if isinstance(item[0], (list, tuple)):
            flattened.extend(flatten_sublists(item))
        else:
            flattened.append(item)
    return flattened

            
def find_words_with_start_index(word_start_stop_list, start_index):
    '''
    Select the (word, start, stop) entries whose start equals ``start_index``.

    Returns three parallel lists: the matching words, their starts and
    their stops, in input order.
    '''
    matches = [
        (word, start, stop)
        for word, start, stop in word_start_stop_list
        if start == start_index and start != []
    ]
    start_words = [match[0] for match in matches]
    starts = [match[1] for match in matches]
    stops = [match[2] for match in matches]

    return start_words, starts, stops


def organize_words_by_start(words_list):
    '''
    Group (word, start, stop) entries by their start index.

    Parameters
    ----------
    words_list : list
        Entries of the form [word, start, stop].  A single flat entry
        (``['word', 0, 3]``) is also accepted and treated as a one-item list.

    Returns
    -------
    words_by_start : list of list of str
        ``words_by_start[i]`` holds the words starting at index ``i``, for
        ``i`` from 0 to ``max_start`` (empty where no word starts).
    stops : list of list of int
        Stop indices parallel to ``words_by_start``.
    unique_starts, unique_stops : list of int
        Distinct start / stop values in first-seen order.
    max_start, max_stop : int
        Largest start / stop value.

    An empty ``words_list`` returns ``([], [], [], [], -1, -1)``; previously
    it raised IndexError.
    '''
    if not words_list:
        return [], [], [], [], -1, -1

    if not isinstance(words_list[0], (list, tuple)):
        words_list = [words_list]

    # One pass: remember every start/stop and bucket words by start, rather
    # than rescanning the whole list once per start index.
    all_starts = []
    all_stops = []
    words_at = {}
    stops_at = {}
    for word, start, stop in words_list:
        all_starts.append(start)
        all_stops.append(stop)
        words_at.setdefault(start, []).append(word)
        stops_at.setdefault(start, []).append(stop)

    # dict.fromkeys keeps first-seen order while dropping repeats.
    unique_starts = list(dict.fromkeys(all_starts))
    unique_stops = list(dict.fromkeys(all_stops))
    max_start = max(unique_starts)
    max_stop = max(unique_stops)

    # Words organized by start index
    words_by_start = [words_at.get(index, []) for index in range(max_start + 1)]
    stops = [stops_at.get(index, []) for index in range(max_start + 1)]

    return words_by_start, stops, unique_starts, unique_stops, max_start, max_stop

## Construct word sequences with matching phoneme stop and start indices

In [29]:
def concatenate_lists(list_of_lists1, list_of_lists2):
    '''
    Pair up the items of two sequences position by position.

    For each (left, right) pair where ``right`` is a list:
      * if ``left`` is a str, add one ``(left, member)`` tuple per member
        of ``right``;
      * if ``left`` is a tuple, add the single list ``left + right``.
    Every other pair contributes nothing.
    '''
    combined = []
    for left, right in zip(list_of_lists1, list_of_lists2):
        if not isinstance(right, list):
            continue
        if isinstance(left, str):
            combined.extend((left, member) for member in right)
        elif isinstance(left, tuple):
            combined.append([*left, *right])
    return combined


def concatenate_words(new_words, new_stops, words, stops, unique_starts):
    '''
    Concatenate words where the stop index of one matches the start index of the next.

    ``new_words`` and ``new_stops`` are extended in place and also returned.
    On the first call (``new_words == []``) the words starting at index 0
    are used as the left-hand side; afterwards the flattened contents of
    ``new_words`` / ``new_stops`` are used.  Processing stops at the first
    left-hand item that cannot be continued.
    '''
    # Choose the left-hand items and their stop indices
    if new_words == []:
        heads, head_stops = words[0], stops[0]
    else:
        heads = flatten_sublists(new_words)
        head_stops = flatten_list(new_stops)

    # A bare int stop is treated as a one-element list
    if isinstance(head_stops, int):
        head_stops = [head_stops]

    for position, head in enumerate(heads):

        # The follower must start right after this item stops
        follower_start = head_stops[position] + 1
        if follower_start not in unique_starts:
            break

        followers = words[follower_start]
        follower_stops = stops[follower_start]
        if len(followers) == 0:
            break

        # Pair the item with each follower and record the followers' stops
        repeated_head = [head] * len(followers)
        wrapped_followers = [[follower] for follower in followers]
        new_words.append(concatenate_lists(repeated_head, wrapped_followers))
        new_stops.append(follower_stops)

    return new_words, new_stops
            

def words_stop_to_start(words, stops, unique_starts, max_stop):
    '''
    Build every candidate line: a sequence of words that begins at index 0,
    in which each word starts right after the previous one stops, and
    whose last word stops at ``max_stop``.

    Parameters
    ----------
    words : list of list of str
        ``words[i]`` holds the words that start at index ``i``.
    stops : list of list of int
        ``stops[i][k]`` is the stop index of ``words[i][k]``.
    unique_starts : list of int
        Start indices at which a following word may begin.
    max_stop : int
        Stop index a sequence must reach to count as a complete line.

    Returns
    -------
    list of str
        The candidate lines (words joined by single spaces), ordered by the
        position of their words in ``words``.  A single word spanning the
        whole range is a valid line.  Repeated (word, stop) pairs at the
        same start are used once, so the result has no duplicate lines.

    Notes
    -----
    The earlier loop-until-empty implementation never terminated once a pass
    added nothing new (e.g. ``words=[['a'], ['b']]``), appending the same
    line forever.  This version explores each sequence exactly once.
    '''
    if not words:
        return []

    allowed_starts = set(unique_starts)
    options_cache = {}

    def options_at(start):
        # Distinct (word, stop) pairs beginning at ``start``, in input order.
        if start not in options_cache:
            options = []
            if 0 <= start < len(words):
                seen = set()
                for word, stop in zip(words[start], stops[start]):
                    # A stop before its own start would let a sequence move
                    # backwards and loop forever; ignore such entries.
                    if stop < start or (word, stop) in seen:
                        continue
                    seen.add((word, stop))
                    options.append((word, stop))
            options_cache[start] = options
        return options_cache[start]

    candidate_lines = []

    # Depth-first search with an explicit stack; pushing in reverse order
    # makes the pops follow the original word order.
    pending = [([word], stop) for word, stop in reversed(options_at(0))]
    while pending:
        sequence, stop = pending.pop()

        # Store list of words if reached max_stop
        if stop == max_stop:
            candidate_lines.append(' '.join(sequence))

        next_start = stop + 1
        if next_start in allowed_starts:
            for word, next_stop in reversed(options_at(next_start)):
                pending.append((sequence + [word], next_stop))

    return candidate_lines

## Run all code on input text

In [None]:
# Load text (the context manager closes the file once the lines are read)
with open("demo.txt", "r") as f:
    lines = f.readlines()


verbose = True
verbose2 = False

for line in lines:

    # A blank line has no words to convert.
    if not line.strip():
        continue

    if verbose:
        print('')
        print('===============================================================================')
        print('Input line:  "{0}"'.format(line.strip()))
        print('===============================================================================')

    phonemes, consonants, stresses, syllables = words_to_sounds(line)

    if verbose:
        print('Syllables:   {0}'.format(syllables))
        print('Stresses:    {0}'.format(stresses))
        print('Phonemes:    {0}'.format(', '.join(phonemes)))
        print('Consonants:  {0}'.format(', '.join(consonants)), end='\n\n')

    phoneme_words, consonant_words = phoneme_subsets_to_words(phonemes)

    if verbose2:
        print('Phoneme words:  {0}'.format(', '.join([x[0] for x in flatten_sublists(phoneme_words)])))
        print('Consonant words:  {0}'.format(', '.join([x[0] for x in flatten_sublists(consonant_words)])))

    # filter_dictionary_words returns (kept, removed).  Only the kept words
    # may go on: passing the whole tuple to flatten_sublists merged the
    # removed words back in, which cancelled the filter.
    filtered_words_from_phonemes, _ = filter_dictionary_words(phoneme_words, verbose=False)
    filtered_words_from_consonants, _ = filter_dictionary_words(consonant_words, verbose=False)

    if verbose2:
        if filtered_words_from_phonemes != []:
            print('Remaining words from phonemes:  {0}'.format(', '.join([x[0] for x in filtered_words_from_phonemes])), end='\n\n')
        if filtered_words_from_consonants != []:
            print('Remaining words from consonants:  {0}'.format(', '.join([x[0] for x in filtered_words_from_consonants])), end='\n\n')

    # With no surviving words there is nothing to organize or chain, so the
    # empty defaults below are kept.
    words_by_start1, candidate_lines1 = [], []
    words_by_start2, candidate_lines2 = [], []

    if filtered_words_from_phonemes:
        words_by_start1, stops1, unique_starts1, unique_stops1, max_start1, max_stop1 = organize_words_by_start(filtered_words_from_phonemes)
    if filtered_words_from_consonants:
        words_by_start2, stops2, unique_starts2, unique_stops2, max_start2, max_stop2 = organize_words_by_start(filtered_words_from_consonants)

    if verbose2:
        print('Phoneme-generated words organized by start index:')
        print('{0}'.format(words_by_start1), end='\n\n')
        print('Consonant-generated words organized by start index:')
        print('{0}'.format(words_by_start2), end='\n\n')

    if filtered_words_from_phonemes:
        candidate_lines1 = words_stop_to_start(words_by_start1, stops1, unique_starts1, max_stop1)
    if filtered_words_from_consonants:
        candidate_lines2 = words_stop_to_start(words_by_start2, stops2, unique_starts2, max_stop2)

    if verbose:
        if candidate_lines1 != []:
            print('Candidate lines from phonemes:', end='\n\n')
            for candidate_line1 in flatten_list(candidate_lines1):
                print('    {0}'.format(candidate_line1), end='\n')
            print('')

        if candidate_lines2 != []:
            print('Candidate lines from consonants:', end='\n\n')
            for candidate_line2 in flatten_list(candidate_lines2):
                print('    {0}'.format(candidate_line2), end='\n')


Input line:  "Happy New Year"
Syllables:   4
Stresses:    [0, 1, 0, 0, 0, 1, 0, 1, 0]
Phonemes:    HH, AE, P, IY, N, UW, Y, IH, R
Consonants:  HH, P, N, Y, R

