## Load lines of text

In [10]:
# Read every line of the input text. The context manager closes the file
# handle as soon as the lines have been read.
with open("demo.txt", "r") as f:
    lines = f.readlines()

## Phonemes and Carnegie Mellon Pronouncing Dictionary

(See https://github.com/cmusphinx/cmudict/tree/4c6a365cea2c34340ffc218d5af7a38920fa7e37)

From https://www.nltk.org/_modules/nltk/corpus/reader/cmudict.html:

The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
Copyright 1998 Carnegie Mellon University

File Format: Each line consists of an uppercased word, a counter
(for alternative pronunciations), and a transcription.  Vowels are
marked for stress (1=primary, 2=secondary, 0=no stress).  E.g.:
NATURAL 1 N AE1 CH ER0 AH0 L

The dictionary contains 127069 entries.  Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations.  Many of these are fast-speech variants.

Phonemes: There are 39 phonemes, as shown below:

    Phoneme Example Translation    Phoneme Example Translation
    ------- ------- -----------    ------- ------- -----------
    AA      odd     AA D           AE      at      AE T
    AH      hut     HH AH T        AO      ought   AO T
    AW      cow     K AW           AY      hide    HH AY D
    B       be      B IY           CH      cheese  CH IY Z
    D       dee     D IY           DH      thee    DH IY
    EH      Ed      EH D           ER      hurt    HH ER T
    EY      ate     EY T           F       fee     F IY
    G       green   G R IY N       HH      he      HH IY
    IH      it      IH T           IY      eat     IY T
    JH      gee     JH IY          K       key     K IY
    L       lee     L IY           M       me      M IY
    N       knee    N IY           NG      ping    P IH NG
    OW      oat     OW T           OY      toy     T OY
    P       pee     P IY           R       read    R IY D
    S       sea     S IY           SH      she     SH IY
    T       tea     T IY           TH      theta   TH EY T AH
    UH      hood    HH UH D        UW      two     T UW
    V       vee     V IY           W       we      W IY
    Y       yield   Y IY L D       Z       zee     Z IY
    ZH      seizure S IY ZH ER
    
From https://www.pythonstudio.us/language-processing/a-pronouncing-dictionary.html:

For each word, this lexicon provides a list of phonetic codes—distinct labels for each contrastive sound—known as phones. Observe that fire has two pronunciations (in U.S. English): the one-syllable F AY1 R, and the two-syllable F AY1 ER0. The symbols in the CMU Pronouncing Dictionary are from the Arpabet, described in more detail at http://en.wikipedia.org/wiki/Arpabet.

In [11]:
import re

import enchant
import nltk

# CMU Pronouncing Dictionary entries from NLTK; iterated later as
# (word, pronunciation) pairs.
entries = nltk.corpus.cmudict.entries()
# enchant dictionary for US English; its check() method is used later to
# filter candidate words.
my_dict = enchant.Dict("en_US")

# Phoneme symbols without stress digits: the full set, followed by its
# vowel and consonant subsets.
phoneme_list = (
    'AA AH AW B D EH EY G IH JH L N OW P S T UH V Y ZH '
    'AE AO AY CH DH ER F HH IY K M NG OY R SH TH UW W Z'
).split()
phoneme_vowel_list = 'AA AH AW EH EY IH OW UH AE AO AY ER IY OY UW'.split()
phoneme_consonant_list = (
    'B D G JH L N P S T V Y ZH CH '
    'DH F HH K M NG R SH TH W Z'
).split()

## Code for converting words to phonemes and phonemes to words

In [12]:
from g2p_en import G2p
# Grapheme-to-phoneme converter; later cells call g2p(word) to get the
# phoneme tokens of a word.
g2p = G2p()


def get_unique_numbers(numbers):
    '''
    Return the distinct items of ``numbers`` in order of first appearance.

    Hashable items are de-duplicated in linear time with an insertion-ordered
    dict. If an item is unhashable, the function falls back to pairwise
    comparison, so any iterable that worked before still works.

    >>> get_unique_numbers([3, 1, 3, 2, 1])
    [3, 1, 2]
    '''
    # Materialize first so a one-shot iterable survives the fallback path.
    numbers = list(numbers)
    try:
        return list(dict.fromkeys(numbers))
    except TypeError:
        unique = []
        for number in numbers:
            if number not in unique:
                unique.append(number)
        return unique


def p2g(phoneme_list, istart=0):
    '''
    Find dictionary words whose pronunciation matches a run of phonemes.

    Every run ``phoneme_list[istart:istop]`` that begins at ``istart`` is
    compared with the pronunciations in the global ``entries`` sequence of
    ``(word, pronunciation)`` pairs (the CMU Pronouncing Dictionary).
    Stress digits are removed from both sides before comparing.

    Parameters
    ----------
    phoneme_list : list of str
        Phonemes, with or without stress digits.
    istart : int
        Non-negative index of the first phoneme of every run.

    Returns
    -------
    words_from_phonemes : list of [word, istart, istop]
        One item per matching word; ``istop`` is the index of the last
        phoneme the word covers (inclusive). Shorter runs come first, and
        words matching the same run keep their order in ``entries``.
    unique_stops : list of int
        The distinct ``istop`` values, in ascending order.
    '''
    # Strip stress digits from the target once, not once per dictionary entry.
    target = [re.sub(r'\d+', '', p) for p in phoneme_list[istart:]]

    # One pass over the dictionary is enough: a pronunciation of n phonemes
    # can only match the single run of length n that starts at istart.
    words_by_length = {}
    for word, pron in entries:
        n = len(pron)
        if n == 0 or n > len(target):
            continue
        if all(re.sub(r'\d+', '', p) == target[i] for i, p in enumerate(pron)):
            words_by_length.setdefault(n, []).append(word)

    words_from_phonemes = []
    unique_stops = []
    for n in sorted(words_by_length):
        # Inclusive index of the last phoneme of an n-phoneme word, derived
        # from the run length rather than from a leftover loop index.
        istop = istart + n - 1
        unique_stops.append(istop)
        words_from_phonemes.extend([word, istart, istop] for word in words_by_length[n])

    return words_from_phonemes, unique_stops

## Code for counting syllables

In [13]:
# https://datascience.stackexchange.com/questions/23376/how-to-get-the-number-of-syllables-in-a-word

import re

# Each run of consecutive vowel letters (y included) is a candidate syllable.
VOWEL_RUNS = re.compile("[aeiouy]+", re.IGNORECASE)

# Matches that are subtracted from the vowel-run count.
EXCEPTIONS = re.compile(
    # fixes trailing e issues:
    # smite, scared
    "[^aeiou]e[sd]?$|"
    # fixes adverbs:
    # nicely
    "[^e]ely$",
    re.IGNORECASE,
)

# Matches that are added back to the vowel-run count.
ADDITIONAL = re.compile(
    # fixes incorrect subtractions from exceptions:
    # smile, scarred, raises, fated
    "[^aeioulr][lr]e[sd]?$|[csgz]es$|[td]ed$|"
    # fixes miscellaneous issues:
    # flying, piano, video, prism, fire, evaluate
    ".y[aeiou]|ia(?!n$)|eo|ism$|[^aeiou]ire$|[^gq]ua",
    re.IGNORECASE,
)

def count_syllables(word):
    '''
    Estimate the number of syllables in ``word`` from its spelling.

    The estimate is the number of VOWEL_RUNS matches, minus the number of
    EXCEPTIONS matches, plus the number of ADDITIONAL matches. The result
    is never less than 1.
    '''
    tally = 0
    for pattern, weight in ((VOWEL_RUNS, 1), (EXCEPTIONS, -1), (ADDITIONAL, 1)):
        tally += weight * len(pattern.findall(word))
    return tally if tally > 1 else 1

## Find phonemes, stresses, and number of syllables per input line of text

In [14]:
# Numeric stress for the marker that remains after removing the capital
# letters from a g2p token; a token without a digit is recorded as stress 0.
STRESS_BY_MARKER = {'': 0, '0': 0, '1': 1, '2': 2}
# Characters trimmed from both ends of a word before its syllables are counted.
EDGE_PUNCTUATION = '.,;:!?"\'()[]{}'

phonemes_per_line = []
stresses_per_line = []
syllables_per_line = []
consonants_per_line = []

for line in lines:
    # Blank lines contribute nothing.
    if line.strip() == "":
        continue

    phonemes_for_line = []
    stresses_for_line = []
    syllables_for_line = 0
    for word in line.split():
        # Only the single pronunciation returned by g2p is used
        #     :: multiple pronunciations: pronouncing.phones_for_word(word)
        for token in g2p(word):
            phoneme = re.sub(r'\d+', '', token)
            if phoneme not in phoneme_list:
                # Tokens that are not one of the known phonemes are skipped.
                continue
            phonemes_for_line.append(phoneme)
            marker = re.sub(r"(?:[A-Z])", '', token)
            if marker in STRESS_BY_MARKER:
                stresses_for_line.append(STRESS_BY_MARKER[marker])

        # The syllable patterns are anchored at the end of the word, so
        # punctuation attached to it ("smile,") would stop them matching.
        syllables_for_line += count_syllables(word.strip(EDGE_PUNCTUATION))

    phonemes_per_line.append(phonemes_for_line)
    stresses_per_line.append(stresses_for_line)
    syllables_per_line.append(syllables_for_line)
    consonants_per_line.append([x for x in phonemes_for_line if x in phoneme_consonant_list])

print(phonemes_per_line)
print(consonants_per_line)
print(stresses_per_line)
print(syllables_per_line)

[['HH', 'AE', 'P', 'IY', 'N', 'UW', 'Y', 'IH', 'R']]
[['HH', 'P', 'N', 'Y', 'R']]
[[0, 1, 0, 0, 0, 1, 0, 1, 0]]
[4]


## Find all words that sound like each segment of each phoneme list

In [15]:
# For every line, collect the dictionary words returned by p2g, walking
# forward through the phonemes from the stop indices of the words found so far.
# Each collected item is [word, istart, istop].
words_per_line = []
for phonemes_for_line in phonemes_per_line:   
    new_words = []
    istart = 0
    # Seed with a stop of -1 so that the first search starts at phoneme index 0.
    unique_stops = [-1]
    while istart < len(phonemes_for_line):
        # The last search returned no stops: continue from a single stop of
        # istart + 1, i.e. the next search begins at istart + 2.
        if len(unique_stops) == 0:
            unique_stops = [istart + 1]
        # NOTE: unique_stops is rebound inside this loop, but the for statement
        # keeps iterating over the list it started with. The stops returned by
        # the last p2g call are what the next pass of the while loop iterates.
        for istop in unique_stops:
            # The next word has to begin right after this stop index.
            istart = istop + 1
            if istart < len(phonemes_for_line):
                words_from_phonemes, unique_stops = p2g(phonemes_for_line, istart)
                new_words += words_from_phonemes
    words_per_line.append(new_words)

print(words_per_line)

[[['hap', 0, 2], ['happ', 0, 2], ['happe', 0, 2], ['happy', 0, 3], ['e', 3, 3], ['e.', 3, 3], ['ee', 3, 3], ['gnu', 4, 5], ['knew', 4, 5], ['new', 4, 5], ['nu', 4, 5], ['year', 6, 8]]]


## Filter words by another English dictionary

In [16]:
#print(my_dict.check("Thai"))

# Keep only the candidate words that the enchant dictionary accepts.
# Set filter2 to False to pass every candidate through unfiltered.
filter2 = True
if filter2:
    words_per_line2 = []
    for tentative_words in words_per_line:
        words_for_line2 = []
        for tentative_word in tentative_words:
            # tentative_word is [word, istart, istop]; only the word is checked.
            if my_dict.check(tentative_word[0]):
                words_for_line2.append(tentative_word)
            else:
                print('Removed:  ', tentative_word[0])
        words_per_line2.append(words_for_line2)
else:
    # Without this branch the assignment below raised NameError whenever
    # filtering was switched off.
    words_per_line2 = words_per_line

words_per_line = words_per_line2
print(words_per_line2)

Removed:   happ
Removed:   happe
Removed:   ee
[[['hap', 0, 2], ['happy', 0, 3], ['e', 3, 3], ['e.', 3, 3], ['gnu', 4, 5], ['knew', 4, 5], ['new', 4, 5], ['nu', 4, 5], ['year', 6, 8]]]


## Organize words by their phoneme start and stop indices

In [17]:
def copy_list(list_to_copy, ncopies):
    '''
    Return a new list with the items of ``list_to_copy`` repeated ``ncopies`` times.

    >>> copy_list(['a', 'b'], 2)
    ['a', 'b', 'a', 'b']
    '''
    return [item for _ in range(ncopies) for item in list_to_copy]


def flatten_list(nested_list):
    '''
    Recursively unpack every list and tuple so that only leaf items remain,
    in their original order.

    >>> flatten_list([(['tye', 'a'], 'ja')])
    ['tye', 'a', 'ja']
    '''
    flat = []
    for item in nested_list:
        if not isinstance(item, (list, tuple)):
            flat.append(item)
            continue
        flat += flatten_list(item)
    return flat

            
def flatten_sublists(nested_list):
    '''
    Flatten so that there are no subsublists within the sublists.

    Lists and tuples whose first item is itself a list or tuple are unpacked
    recursively; any other list or tuple is kept as a single item. Items that
    are neither lists nor tuples are dropped, and empty lists or tuples are
    skipped (they used to raise IndexError).

    >>> nested_list = [[('pty', 'a'), ('pty', 'uh')], [('tae', 'a')]]
    >>> flatten_sublists(nested_list)
    [('pty', 'a'), ('pty', 'uh'), ('tae', 'a')]
    >>> flatten_sublists([[], [('pty', 'a')]])
    [('pty', 'a')]
    '''
    result = []
    for element in nested_list:
        if not isinstance(element, (list, tuple)):
            continue
        if len(element) == 0:
            # Nothing to contribute, and element[0] below would fail.
            continue
        if isinstance(element[0], (list, tuple)):
            result.extend(flatten_sublists(element))
        else:
            result.append(element)
    return result

            
def find_words_with_start_index(word_istart_istop_list, start_index):
    '''
    Select the (word, istart, istop) triples whose istart equals ``start_index``.

    Returns three parallel lists, in input order: the words, their start
    indices and their stop indices.
    '''
    hits = [
        (word, istart, istop)
        for word, istart, istop in word_istart_istop_list
        if istart == start_index
    ]
    istart_words = [hit[0] for hit in hits]
    istart_istarts = [hit[1] for hit in hits]
    istart_istops = [hit[2] for hit in hits]
    return istart_words, istart_istarts, istart_istops


# Distinct start and stop phoneme indices of the words found for each line,
# plus the largest start and stop index per line.
unique_starts_per_line = []
unique_stops_per_line = []
max_starts_per_line = []
max_stops_per_line = []
for words_for_line in words_per_line:
    # A bare [word, istart, istop] triple is wrapped so that it is handled
    # like a list of triples.
    # NOTE(review): a line without any words raises IndexError on the [0] below.
    if not isinstance(words_for_line[0], (list, tuple)):
        words_for_line = [words_for_line]

    # De-duplicate once and reuse the result for the maxima.
    line_unique_starts = get_unique_numbers([istart for _, istart, _ in words_for_line])
    line_unique_stops = get_unique_numbers([istop for _, _, istop in words_for_line])

    unique_starts_per_line.append(line_unique_starts)
    unique_stops_per_line.append(line_unique_stops)
    max_starts_per_line.append(max(line_unique_starts))
    max_stops_per_line.append(max(line_unique_stops))

# For each line, bucket the words and their stop indices by start index:
# position k holds what starts at phoneme k (an empty list when nothing does),
# for k from 0 up to the line's largest start index.
words_per_line_by_start_index = []
stops_per_line_by_start_index = []
for iline, words_for_line in enumerate(words_per_line):
    lookups = [
        find_words_with_start_index(words_for_line, start_index)
        for start_index in range(max_starts_per_line[iline] + 1)
    ]
    words_per_line_by_start_index.append([found[0] for found in lookups])
    stops_per_line_by_start_index.append([found[2] for found in lookups])

print(words_per_line_by_start_index)
print(stops_per_line_by_start_index)

[[['hap', 'happy'], [], [], ['e', 'e.'], ['gnu', 'knew', 'new', 'nu'], [], ['year']]]
[[[2, 3], [], [], [3, 3], [5, 5, 5, 5], [], [8]]]


## Construct word sequences with matching phoneme stop and start indices

In [22]:
def concatenate_lists(list_of_lists1, list_of_lists2):
    '''
    Pair up the items of two sequences and join each pair.

    For a pair whose second item is a list:
      - a string first item yields one (string, element) tuple per element;
      - a tuple first item yields one list: the tuple's items followed by
        the elements of the second item.
    Every other pair contributes nothing.
    '''
    combined = []
    for left, right in zip(list_of_lists1, list_of_lists2):
        if not isinstance(right, list):
            continue
        if isinstance(left, str):
            combined.extend((left, item) for item in right)
        elif isinstance(left, tuple):
            combined.append(list(left) + list(right))
    return combined


def concatenate_words(word_lists, word_stops, words_for_line, stops_for_line, unique_starts):
    '''
    Extend word sequences with the words that start right after they stop.

    On the first call (``word_lists`` empty) the sequences to extend are the
    words at start index 0. Afterwards they are all sequences gathered so far
    (``word_lists`` flattened), with their stops taken from ``word_stops``.
    For each sequence in turn, the words starting at its stop index + 1 are
    looked up. The combined sequences are appended to ``word_lists`` and
    their stop indices to ``word_stops``. Processing ends at the first
    sequence that has no following words.

    Both ``word_lists`` and ``word_stops`` are modified in place and returned.
    '''
    if word_lists == []:
        heads = words_for_line[0]
        head_stops = stops_for_line[0]
    else:
        heads = flatten_sublists(word_lists)
        head_stops = flatten_list(word_stops)

    for position, head in enumerate(heads):
        # A following word has to begin immediately after this one stops.
        next_start = head_stops[position] + 1
        if next_start not in unique_starts:
            break

        followers = words_for_line[next_start]
        follower_stops = stops_for_line[next_start]
        if len(followers) == 0:
            break

        # Pair one copy of the head with each follower.
        repeated_head = copy_list([head], len(followers))
        wrapped_followers = [[follower] for follower in followers]
        word_lists.append(concatenate_lists(repeated_head, wrapped_followers))
        word_stops.append(follower_stops)

    return word_lists, word_stops
            

# For each line, build the word sequences that run from phoneme 0 to the
# line's largest stop index, and print every sequence found.
candidate_lines = []
for iline, words_for_line in enumerate(words_per_line_by_start_index):
    # Per-line lookup tables: stop indices by start index, the distinct start
    # indices, and the largest stop index.
    stops_for_line = stops_per_line_by_start_index[iline]
    unique_starts = unique_starts_per_line[iline]
    max_stop = max_stops_per_line[iline]
    # Word sequences gathered so far and, in parallel, their stop indices.
    words_lists = []
    words_stops = []
    candidate_line = []
    halt = 0
    # NOTE(review): halt is only set when words_lists contains an empty
    # sublist, so this loop ends only if concatenate_words eventually appends
    # one - confirm that this holds for every input (e.g. a one-word line).
    while(halt == 0):

        # Extend the sequences gathered so far by one more word each.
        words_lists, words_stops = concatenate_words(words_lists, words_stops, 
                                                     words_for_line, stops_for_line, 
                                                     unique_starts)
        for sublist in words_lists:
            if sublist == []:
                halt = 1
        
        # Store the sequences that reach max_stop, joined into one string.
        # NOTE(review): every pass rescans all sequences gathered so far, not
        # only the new ones - check whether duplicates can be stored.
        if halt == 0:
            all_words_lists = flatten_sublists(words_lists)
            all_words_stops = flatten_list(words_stops)
            for iword_stop, word_stop in enumerate(all_words_stops):
                if word_stop == max_stop:
                    candidate_line.append(' '.join(all_words_lists[iword_stop]))
                
    candidate_lines.append(candidate_line)
     
# Print the candidates of all lines, one per row.
for candidate_line in flatten_list(candidate_lines):
    print(candidate_line)


happy gnu year
happy knew year
happy new year
happy nu year
