In [1]:
from phonemes import *
import nltk
from nltk.corpus import cmudict
from wordfreq import top_n_list
from collections.abc import Iterable
import re
from enum import Enum, auto

# Number of most-frequent English words requested from wordfreq.
TOP_N = 100000
# Letters treated as vowels by the spelling checks ('y' is not included).
VOWELS = {'a', 'e', 'i', 'o', 'u'}


In [8]:
# Fetch the 'cmudict' corpus that nltk.corpus.cmudict reads from.
nltk.download('cmudict')

[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/cpleasants/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [2]:
def strip_emphasis(word_phonemes: list):
    """
    Return the phonemes with every non-alphabetic character removed.

    Args:
        word_phonemes (list): Phoneme strings, e.g. ['AH0', 'K'].

    Returns:
        list: A new list of the same length in which each phoneme keeps only
        its alphabetic characters (so 'AH0' becomes 'AH').
    """
    return [''.join(filter(str.isalpha, phoneme)) for phoneme in word_phonemes]

In [3]:
# The TOP_N most frequent English words, and the full CMU pronunciation table.
top_n = top_n_list('en', TOP_N)
full_cmudict = cmudict.dict()

# Keep only the frequent words that have a CMU entry, using the first listed
# pronunciation with non-alphabetic characters stripped from every phoneme.
simplified_cmudict = dict(
    (word, strip_emphasis(full_cmudict[word][0]))
    for word in top_n
    if word in full_cmudict
)

In [4]:
def get_flat_list(li:Iterable):
    """
    Flattens one level of nesting out of an iterable.

    Every element that is itself an iterable other than a string has its items
    added individually to the output; strings and non-iterables are added as a
    single element. Only one level is unpacked: items of a nested element are
    copied as they are, even if they are iterables themselves.

    Args:
        li (Iterable): The input iterable, which may contain nested iterables.

    Returns:
        list: A new list holding the unpacked elements in their original order.
    """
    flattened = []
    for item in li:
        is_nested = isinstance(item, Iterable) and not isinstance(item, str)
        if is_nested:
            flattened += item
        else:
            flattened.append(item)
    return flattened

# Flatten the sounds of each vowel table into a list, then pool the three lists
# into one set covering every vowel sound.
short_vowel_sounds, long_vowels_sounds, vowel_team_sounds = (
    get_flat_list(table.values())
    for table in (short_vowels, long_vowels, vowel_teams)
)
all_vowel_sounds = {*short_vowel_sounds, *long_vowels_sounds, *vowel_team_sounds}

In [5]:
# Merge the multi-letter spelling tables into one lookup. With `|`, a key that
# appears in more than one table keeps the value from the right-most table.
letter_combinations = vowel_teams | digraphs | double_letters

In [6]:
def is_prefix(blend_or_digraph:str):
    """
    Tell whether a spelling key is marked as word-initial.

    Args:
        blend_or_digraph (str): A table key such as 'bl-' or '-st'.

    Returns:
        bool: True when the key ends with '-'. An empty key returns False
        rather than raising IndexError.
    """
    return blend_or_digraph.endswith('-')

def is_suffix(blend_or_digraph:str):
    """
    Tell whether a spelling key is marked as word-final.

    Args:
        blend_or_digraph (str): A table key such as '-st' or 'bl-'.

    Returns:
        bool: True when the key starts with '-'. An empty key returns False
        rather than raising IndexError.
    """
    return blend_or_digraph.startswith('-')

# Split the spelling tables into word-initial keys (trailing '-') and
# word-final keys (leading '-'). Later tables overwrite earlier ones on a
# shared key.
prefixes = {}
suffixes = {}

for d in (vowel_teams, digraphs, prefix_blends, suffix_blends, common_endings):
    prefixes.update({k: v for k, v in d.items() if is_prefix(k)})
    suffixes.update({k: v for k, v in d.items() if is_suffix(k)})

print(prefixes)
print(suffixes)

{'bl-': [('B', 'L')], 'cl-': [('K', 'L')], 'fl-': [('F', 'L')], 'gl-': [('G', 'L')], 'pl-': [('P', 'L')], 'sl-': [('S', 'L')], 'br-': [('B', 'R')], 'cr-': [('K', 'R')], 'dr-': [('D', 'R')], 'fr-': [('F', 'R')], 'gr-': [('G', 'R')], 'pr-': [('P', 'R')], 'tr-': [('T', 'R')], 'sc-': [('S', 'C')], 'shr-': [('SH', 'R')], 'sk-': [('S', 'K')], 'sm-': [('S', 'M')], 'sn-': [('S', 'N')], 'sp-': [('S', 'P')], 'squ-': [('S', 'K', 'W')], 'st-': [('S', 'T')], 'sw-': [('S', 'W')]}
{'-lp': [('L', 'P')], '-st': [('S', 'T')], '-ct': [('K', 'T')], '-pt': [('P', 'T')], '-sk': [('S', 'K')], '-lk': [('K',)], '-lf': [('L', 'F')], '-xt': [('K', 'S', 'T')], '-ft': [('F', 'T')], '-nd': [('N', 'D')], '-mp': [('M', 'P')], '-lt': [('L', 'T')], '-nch': [('N', 'CH')], '-mb': [('M', 'B')], '-tch': [('CH',)], '-dge': [('JH',)], '-ing': [('IH', 'NG')], '-ang': [('AE', 'NG')], '-ong': [('AO', 'NG')], '-ung': [('AH', 'NG')], '-ank': [('AE', 'NG', 'K')], '-ink': [('IH', 'NG', 'K')], '-onk': [('AA', 'NG', 'K')], '-unk': [(

In [7]:
# Spot-check one entry of the simplified dictionary.
simplified_cmudict['accent']

['AH', 'K', 'S', 'EH', 'N', 'T']

In [18]:
class Indicator(Enum):
    """Label attached to each letter part that decode() emits."""
    SHORT_VOWEL = 1     # single letter matched in the short-vowel table
    LONG_VOWEL = 2      # single letter matched in the long-vowel table
    HARD_CONSONANT = 3  # single letter matched in the hard-consonant table
    SOFT_CONSONANT = 4  # single letter matched in the soft-consonant table
    LETTER_COMBO = 5    # multi-letter spelling, prefix or suffix
    SILENT_E = 6        # an 'e' recorded with no sound
    UNDECODABLE = 7     # remainder that no rule could match
    
def decode(word:str):
    """
    Decodes a word into its phonetic components using a simplified CMU dictionary.

    The word is lower-cased and looked up in `simplified_cmudict`. At most one
    prefix (from `prefixes`) and one suffix (from `suffixes`) is peeled off, and
    the letters in between are consumed left to right:

    1. multi-letter entries of `letter_combinations` are tried first;
    2. non-alphabetic characters (e.g. the apostrophe in "won't") are skipped;
    3. an 'e' that is not followed by a vowel sound is recorded as silent;
    4. any other letter is matched against the short-vowel, long-vowel,
       hard-consonant and soft-consonant tables, in that order.

    The first letter that cannot be matched ends decoding: the rest of the word
    and of its sounds is emitted as one UNDECODABLE part.

    Args:
        word (str): The word to decode.

    Returns:
        dict: A dictionary containing:
            - 'letter_parts': List of identified letter components. Prefixes and
              suffixes keep their '-' marker (e.g. 'st-').
            - 'indicators': List of Indicator values, parallel to 'letter_parts'.
            - 'sound_parts': Corresponding phonetic components, parallel to
              'letter_parts'. A silent 'e' contributes '' and an undecodable
              remainder contributes a tuple of the leftover phonemes.
            - 'decodable': Boolean indicating if the word is fully decodable.

    Raises:
        Exception: If the word is not in `simplified_cmudict`.
    """
    word = word.lower()
    if word not in simplified_cmudict:
        raise Exception("Word not found")

    word_phonemes = simplified_cmudict[word]
    remaining_letters = word
    # Work on a copy so the dictionary entry can never be modified through here.
    remaining_sounds = list(word_phonemes)
    letter_parts = []
    sound_parts = []
    indicators = []
    decodable = True

    # ---- Helper Functions ----

    def handle_undecodable():
        """Emits everything still unconsumed as a single UNDECODABLE part."""
        nonlocal decodable, remaining_letters, remaining_sounds
        decodable = False
        letter_parts.append(remaining_letters)
        sound_parts.append(tuple(remaining_sounds))
        indicators.append(Indicator.UNDECODABLE)
        remaining_letters = ''
        remaining_sounds = []

    def process_single_letter_sound(letter, sound, indicator):
        """Records one letter with its sound and consumes both."""
        nonlocal remaining_letters, remaining_sounds
        letter_parts.append(letter)
        sound_parts.append(sound)
        indicators.append(indicator)
        remaining_letters = remaining_letters[1:]
        remaining_sounds = remaining_sounds[len(sound) : ] if remaining_sounds else []

    def next_sound_is_vowel():
        """Tells whether the next unconsumed phoneme is a vowel sound."""
        if not remaining_sounds:
            return False
        next_sound = remaining_sounds[0]
        # all_vowel_sounds is built from the same tables whose entries are
        # compared below as tuples of phonemes, so a bare phoneme string has to
        # be tested in its one-element tuple form as well.
        return next_sound in all_vowel_sounds or (next_sound,) in all_vowel_sounds

    def process_affixes(affixes_dict, is_prefix = True):
        """
        Peels the first matching prefix (or suffix) off the remaining word.

        An affix matches when both its letters and one of its sounds sit at the
        relevant end of what is still unconsumed. Matching against the
        unconsumed part (not the whole word) keeps a suffix from overlapping a
        prefix that was already removed, and stopping at the first match keeps
        overlapping table entries from removing the same sounds twice.

        Returns three parallel lists (letters, sounds, indicators), each holding
        zero or one element.
        """
        nonlocal remaining_letters, remaining_sounds

        for affix, affix_sounds in affixes_dict.items():
            affix_letters = affix.replace('-', '')
            if not affix_letters:
                continue
            if is_prefix:
                letters_match = remaining_letters.startswith(affix_letters)
            else:
                letters_match = remaining_letters.endswith(affix_letters)
            if not letters_match:
                continue

            for affix_sound in affix_sounds:
                num_sounds = len(affix_sound)
                if num_sounds == 0:
                    # A slice of [-0:] would select every sound, not none.
                    continue
                if is_prefix:
                    candidate = remaining_sounds[ : num_sounds]
                else:
                    candidate = remaining_sounds[-num_sounds : ]
                if tuple(candidate) != affix_sound:
                    continue

                # Slice by length: str.lstrip/rstrip remove a *set of
                # characters*, e.g. 'ringing'.rstrip('ing') == 'r'.
                if is_prefix:
                    remaining_letters = remaining_letters[len(affix_letters) : ]
                    remaining_sounds = remaining_sounds[num_sounds : ]
                else:
                    remaining_letters = remaining_letters[ : -len(affix_letters)]
                    remaining_sounds = remaining_sounds[ : -num_sounds]
                return [affix], [affix_sound], [Indicator.LETTER_COMBO]

        return [], [], []

    def consume_letter_combo():
        """Consumes a multi-letter spelling at the front, if one matches."""
        nonlocal remaining_letters, remaining_sounds
        for letters, sounds in letter_combinations.items():
            if not letters or not remaining_letters.startswith(letters):
                continue
            for sound in sounds:
                if tuple(remaining_sounds[ : len(sound)]) == sound:
                    letter_parts.append(letters)
                    sound_parts.append(sound)
                    indicators.append(Indicator.LETTER_COMBO)
                    # Slice by length; lstrip would also eat repeated letters.
                    remaining_letters = remaining_letters[len(letters) : ]
                    remaining_sounds = remaining_sounds[len(sound) : ]
                    return True
        return False

    # ---- Main Logic ----

    # Process prefixes and suffixes
    prefix_letter_parts, prefix_sound_parts, prefix_indicators = process_affixes(prefixes)
    suffix_letters, suffix_sound_parts, suffix_indicators = process_affixes(suffixes, is_prefix = False)

    # Tried in this order for every single letter.
    SOUND_CATEGORIES = {
        Indicator.SHORT_VOWEL: short_vowels,
        Indicator.LONG_VOWEL: long_vowels,
        Indicator.HARD_CONSONANT: hard_consonants,
        Indicator.SOFT_CONSONANT: soft_consonants,
    }

    # Process through the remaining_letters:
    while len(remaining_letters) > 0:
        # First search through all letter combinations
        if consume_letter_combo():
            continue

        this_letter = remaining_letters[0]

        # Some words contain punctuation (e.g. "won't") -- skip the punctuation
        if not this_letter.isalpha():
            remaining_letters = remaining_letters[1:]
            continue

        # Silent E: an 'e' with no vowel sound left to carry. Move on right
        # away so the same letter cannot also be matched as a vowel below.
        if this_letter == 'e' and not next_sound_is_vowel():
            process_single_letter_sound(this_letter, '', Indicator.SILENT_E)
            continue

        matched = False
        for indicator, sound_dict in SOUND_CATEGORIES.items():
            for sound in sound_dict.get(this_letter, []):
                if tuple(remaining_sounds[ : len(sound)]) == sound:
                    process_single_letter_sound(this_letter, sound, indicator)
                    matched = True
                    break
            if matched:
                break

        if not matched:
            handle_undecodable()

    # Sounds left over after the letters ran out were never explained by any
    # letter, so the word is not fully decodable; keep them in the output.
    if remaining_sounds:
        handle_undecodable()

    # Add back in the prefixes and suffixes
    letter_parts = prefix_letter_parts + letter_parts + suffix_letters
    sound_parts = prefix_sound_parts + sound_parts + suffix_sound_parts
    indicators = prefix_indicators + indicators + suffix_indicators

    return {
        'letter_parts' : letter_parts,
        'indicators' : indicators,
        'sound_parts' : sound_parts,
        'decodable' : decodable
    }

In [21]:
# Example: decode a word whose 'e' carries no sound of its own.
decode('lakes')

{'letter_parts': ['l', 'a', 'k', 'e', 's'],
 'indicators': [<Indicator.HARD_CONSONANT: 3>,
  <Indicator.LONG_VOWEL: 2>,
  <Indicator.HARD_CONSONANT: 3>,
  <Indicator.SILENT_E: 6>,
  <Indicator.HARD_CONSONANT: 3>],
 'sound_parts': [('L',), ('EY',), ('K',), '', ('S',)],
 'decodable': True}

In [24]:
# Seed every dictionary word with a placeholder entry that treats the whole
# word as one undecodable part.
decoded_dict = {}
for word in simplified_cmudict:
    decoded_dict[word] = dict(
        letter_parts=[word],
        sound_parts=[tuple(simplified_cmudict[word])],
        indicators=[Indicator.UNDECODABLE],
        decodable=False,
    )
        
        

In [26]:
import random

In [27]:
# Decode ten words drawn at random from the frequency list as a spot check.
for i in range(10):
    # randrange excludes its upper bound; randint(0, len(top_n)) could return
    # len(top_n) itself and make top_n[ind] raise IndexError.
    ind = random.randrange(len(top_n))
    print(top_n[ind])
    if top_n[ind] in simplified_cmudict:
        print(decode(top_n[ind]))
    else:
        # Words without a dictionary entry get a blank line instead.
        print()

meanness
{'letter_parts': ['m', 'ea', 'nn', 'e', 'ss'], 'indicators': [<Indicator.HARD_CONSONANT: 3>, <Indicator.LETTER_COMBO: 5>, <Indicator.LETTER_COMBO: 5>, <Indicator.SILENT_E: 6>, <Indicator.UNDECODABLE: 7>], 'sound_parts': [('M',), ('IY',), ('N',), '', ('N', 'AH', 'S')], 'decodable': False}
manically

milfs

statistically
{'letter_parts': ['st-', 'atistically'], 'indicators': [<Indicator.LETTER_COMBO: 5>, <Indicator.UNDECODABLE: 7>], 'sound_parts': [('S', 'T'), ('AH', 'T', 'IH', 'S', 'T', 'IH', 'K', 'AH', 'L', 'IY')], 'decodable': False}
maughan
{'letter_parts': ['m', 'aughan'], 'indicators': [<Indicator.HARD_CONSONANT: 3>, <Indicator.UNDECODABLE: 7>], 'sound_parts': [('M',), ('AO', 'G', 'AH', 'N')], 'decodable': False}
hollowed
{'letter_parts': ['h', 'o', 'll', 'ow', 'e', 'd'], 'indicators': [<Indicator.HARD_CONSONANT: 3>, <Indicator.SHORT_VOWEL: 1>, <Indicator.LETTER_COMBO: 5>, <Indicator.LETTER_COMBO: 5>, <Indicator.SILENT_E: 6>, <Indicator.HARD_CONSONANT: 3>], 'sound_parts': 

In [None]:
# Look up the phonemes stored for "pestered".
simplified_cmudict['pestered']

['P', 'EH', 'S', 'T', 'ER', 'D']

In [None]:
# Scratch check: a negative slice start keeps the last two characters.
'123'[-2:]

'23'

In [None]:
# Spot-check decode on a word that starts with the letters 'bl'.
decode('blanket')

anket ['AE', 'NG', 'K', 'AH', 'T']


{'letter_parts': ['bl-'], 'sound_parts': [('B', 'L')]}

In [None]:
# Scratch value pairing letters with sounds; nothing else in the visible
# notebook reads it.
parts = [('bl', ('B', 'L')), '']

In [None]:
# TODO: There is a challenge with "qu" digraphs

In [None]:
# Look up the phonemes stored for "book".
simplified_cmudict["book"]

['B', 'UH', 'K']

In [None]:
# Print every word that contains the UH phoneme but is not spelled with 'oo'.
for word in simplified_cmudict:
    if 'UH' not in simplified_cmudict[word]:
        continue
    if 'oo' not in word:
        print(word)

would
could
should
you're
during
sure
put
full
woman
security
wouldn't
couldn't
european
europe
tour
insurance
fully
pull
push
shouldn't
yours
ensure
sugar
pulled
schedule
pure
secure
rural
tournament
surely
puts
bush
jury
bullshit
curious
scheduled
pushing
output
pushed
bureau
would've
input
pulling
wolf
bull
cure
mature
tourism
missouri
bullet
pussy
securities
secured
tourist
woman's
duration
tours
handful
jurisdiction
purely
should've
assured
assure
could've
tourists
bullets
pulls
wolves
ensuring
furious
bulls
bullying
curiosity
superb
securing
fury
touring
bully
europeans
maturity
muhammad
fulfill
yourselves
obscure
endure
assurance
schedules
bulletin
fuller
jung
fulfilled
neural
pushes
butcher
unsure
cured
miniature
fulfilling
neurons
boulevard
premature
bullied
insured
purity
scheduling
durable
procurement
enduring
lure
immature
samurai
bulldogs
inputs
yuri
pudding
ensures
europa
honduras
jurisdictions
worcester
zurich
bushes
cushion
endured
insecurity
ambush
blvd
bureaucracy
cur

In [None]:
# Look up the phonemes stored for "cook".
simplified_cmudict["cook"]

['K', 'UH', 'K']

In [None]:
# Scratch check: splitting on the vowel letters leaves the consonant runs
# on either side of them.
re.split("a|e|i|o|u", "tick")

['t', 'ck']

In [None]:
def is_vowel(letter:str) -> bool:
    """Return True when `letter` is a member of the module-level VOWELS set."""
    return letter in VOWELS

def is_vowel_sound(phoneme:str):
    """
    Tell whether a phoneme is one of the known vowel sounds.

    `all_vowel_sounds` is flattened from the vowel tables, so its members have
    whatever form those tables store. The phoneme is therefore accepted both as
    a bare string and as a one-element tuple; testing only the bare string
    never matches when the tables hold tuples such as ('EY',).

    Args:
        phoneme (str): A single phoneme, e.g. 'IH'.

    Returns:
        bool: True if the phoneme is a vowel sound.
    """
    return phoneme in all_vowel_sounds or (phoneme,) in all_vowel_sounds

def only_short_vowels(word:str):
    """
    Tell whether every vowel letter of a word makes one of its short sounds.

    The vowel letters of the spelling are paired, in order, with the vowel
    phonemes of the pronunciation in `simplified_cmudict`. The word qualifies
    when the two sequences have the same length and each phoneme is listed for
    its letter in `short_vowels`. A word with no vowel letters and no vowel
    sounds qualifies trivially.

    Table entries are accepted both as bare phoneme strings and as one-element
    tuples, because the sound tables may store either form.

    Args:
        word (str): A key of `simplified_cmudict`.

    Returns:
        bool: True if all vowels are short, False otherwise.

    Raises:
        KeyError: If the word is not in `simplified_cmudict`.
    """
    # Extract the vowels and the vowel phonemes
    word_vowels = [c for c in word if is_vowel(c)]
    word_vowel_sounds = [
        phon for phon in simplified_cmudict[word]
        if phon in all_vowel_sounds or (phon,) in all_vowel_sounds
    ]

    # Check that the number of vowels is equal to the number of vowel sounds
    if len(word_vowels) != len(word_vowel_sounds):
        return False

    # Check that each vowel corresponds, in order, to (one of) its sounds
    for letter, sound in zip(word_vowels, word_vowel_sounds):
        short_sounds = short_vowels[letter]
        if sound not in short_sounds and (sound,) not in short_sounds:
            return False

    return True
    

def is_vc(word: str, allowed_blends_and_digraphs=None, include_long_vowels=False):
    """
    Checks whether a word has the "VC" (vowel-consonant) shape.

    The pronunciation must consist of exactly two sounds, a vowel sound followed
    by a non-vowel sound. Unless `include_long_vowels` is True the word must
    also pass `only_short_vowels`. Everything after the first letter is taken
    as the consonant spelling, which must be a single letter or appear in
    `allowed_blends_and_digraphs`.

    Parameters:
        word (str): The word to check; must be a key of `simplified_cmudict`.
        allowed_blends_and_digraphs (list, optional): Multi-letter consonant
            spellings to accept. Defaults to an empty list.
        include_long_vowels (bool, optional): Skip the short-vowel check.
            Defaults to False.

    Returns:
        bool: True if the word follows the VC pattern, False otherwise.

    Raises:
        Exception: If a vowel letter appears after the first letter.
    """
    allowed = [] if allowed_blends_and_digraphs is None else allowed_blends_and_digraphs
    phonemes = simplified_cmudict[word]

    # Sound shape: exactly one vowel sound followed by one consonant sound.
    has_vc_sounds = (
        len(phonemes) == 2
        and is_vowel_sound(phonemes[0])
        and not is_vowel_sound(phonemes[1])
    )
    if not has_vc_sounds:
        return False

    # Short-vowel requirement, unless the caller opted out of it.
    if not (include_long_vowels or only_short_vowels(word)):
        return False

    # Everything past the first letter is expected to spell the consonant.
    tail = word[1:]
    for letter in tail:
        if is_vowel(letter):
            raise Exception(f"The word '{word}' appears to have more than one vowel!")

    if len(tail) == 1:
        return True
    return tail in allowed
    
def is_cvc(word:str, allowed_blends_and_digraphs=None, include_long_vowels=False):
    """
    Determines if a given word follows the "CVC" (consonant-vowel-consonant) pattern.

    A valid CVC word must:
    - Be pronounced with exactly one vowel sound that has at least one consonant
      sound before it and at least one after it.
    - Only include short vowels unless `include_long_vowels` is set to True.
    - Be spelled with a single vowel letter, where the letters before it and the
      letters after it are each a single letter or one of
      `allowed_blends_and_digraphs`.

    Parameters:
        word (str): The word to check; must be a key of `simplified_cmudict`.
        allowed_blends_and_digraphs (list, optional): A list of allowed consonant blends or digraphs,
            given as plain letters (e.g. 'ck'). Defaults to an empty list.
        include_long_vowels (bool, optional): Whether to allow long vowels. Defaults to False.

    Returns:
        bool: True if the word follows the CVC pattern, False otherwise.

    Raises:
        Exception: If the sounds fit the pattern but the spelling contains more
            than one vowel letter (mirrors `is_vc`).
    """
    if allowed_blends_and_digraphs is None:
        allowed_blends_and_digraphs = []

    sounds = simplified_cmudict[word]

    # Ensure there is exactly one vowel sound, with consonant sounds on both sides
    vowel_sound_positions = [i for i, sound in enumerate(sounds) if is_vowel_sound(sound)]
    if len(vowel_sound_positions) != 1:
        return False
    vowel_sound_position = vowel_sound_positions[0]
    if vowel_sound_position == 0 or vowel_sound_position == len(sounds) - 1:
        return False

    # If long vowels shouldn't be included, verify the vowel is short
    if not include_long_vowels and not only_short_vowels(word):
        return False

    # Locate the vowel letter that separates the two consonant spellings
    vowel_letter_positions = [i for i, letter in enumerate(word) if is_vowel(letter)]
    if not vowel_letter_positions:
        return False
    if len(vowel_letter_positions) > 1:
        raise Exception(f"The word '{word}' appears to have more than one vowel!")

    vowel_letter_position = vowel_letter_positions[0]
    leading_consonants = word[ : vowel_letter_position]
    trailing_consonants = word[vowel_letter_position + 1 : ]

    def is_allowed(consonant_part):
        """A consonant spelling is one letter or an explicitly allowed group."""
        if not consonant_part:
            return False
        return len(consonant_part) == 1 or consonant_part in allowed_blends_and_digraphs

    return is_allowed(leading_consonants) and is_allowed(trailing_consonants)

def is_cvcc(word:str):
    """Placeholder for a CVCC check; not implemented yet, always returns None."""
    return None
def is_ccvc():
    """Placeholder for a CCVC check; not implemented yet, always returns None."""
    return None
def is_cvce(word:str):
    """Placeholder for a CVCe check; not implemented yet, always returns None."""
    return None

In [None]:
# Display the suffix table.
# NOTE(review): the saved output below is a set of keys ending in '-', which
# does not look like the dict built earlier -- re-run to confirm.
suffixes

{'bl-',
 'br-',
 'cl-',
 'cr-',
 'dr-',
 'fl-',
 'fr-',
 'gh-',
 'gl-',
 'gn-',
 'gr-',
 'kn-',
 'ph-',
 'pl-',
 'pr-',
 'sc-',
 'shr-',
 'sk-',
 'sl-',
 'sm-',
 'sn-',
 'sp-',
 'squ-',
 'st-',
 'sw-',
 'tr-',
 'wr-'}

In [None]:
# List every VC word together with its position in the frequency list.
for word in simplified_cmudict:
    if not is_vc(word):
        continue
    print(word, top_n.index(word))

in 5
is 7
it 11
on 12
as 17
at 20
an 29
if 36
up 43
us 94
am 159
al 701
ed 1690
ad 1778
im 2076
et 2199
un 2414
id 2447
em 2622
el 2672
en 2989
op 3515
il 4163
um 5845
os 6076
ab 6428
ip 6699
oz 7036
ag 7642
es 7918
og 10898
ev 12041
ul 14239
ib 16045
eb 20101
ek 23254


In [None]:
# Look up the phonemes stored for "boat".
simplified_cmudict['boat']

['B', 'OW', 'T']

In [None]:
# Check a word spelled with two vowel letters ('o' and 'u').
only_short_vowels("ought")

False

In [None]:
# Strip out the emphasis because it doesn't matter.