## Input Prep

This notebook prepares the CMU Dictionary for input into the Sublexical Toolkit for analysis.

Author: Caleb Solomon

In [1]:
# Imports
import pandas as pd
import cambridge_parser as parser
import re

### Task 1: Initialize the CMU Dictionary and trim it.

The first many lines of the dictionary file are useless, containing simple text. There are also a significant number of words containing numbers, parentheses, or other features that are unnecessary for input to the sublexical toolkit. Furthermore, we want to keep only words whose 

In [2]:
# Display the first 50 or so lines for reference to above.
fcmu = open('cmudict-0.7b-2024-4-6.txt')
for line in fcmu.readlines()[:20]:
    print(line.strip())
fcmu.close()

;;; # CMUdict  --  Major Version: 0.07
;;;
;;; # $HeadURL$
;;; # $Date::                                                   $:
;;; # $Id::                                                     $:
;;; # $Rev::                                                    $:
;;; # $Author::                                                 $:
;;;
;;; #
;;; # Copyright (C) 1993-2015 Carnegie Mellon University. All rights reserved.
;;; #
;;; # Redistribution and use in source and binary forms, with or without
;;; # modification, are permitted provided that the following conditions
;;; # are met:
;;; #
;;; # 1. Redistributions of source code must retain the above copyright
;;; #    notice, this list of conditions and the following disclaimer.
;;; #    The contents of this file are deemed to be source code.
;;; #


In [3]:
# This block takes ~1 min to run.
# Create a list of words to pronunciations. Duplicates allowed to introduce alternate pronunciations.
# List[(str, str)] duplicates allowed (word, cmu pronunciation)
cmu_dict = []

# Import the SUBTLEXUS csv to a pandas dataframe.
subtlexus = pd.read_csv('SUBTLEXusExcel2007.csv')

# Convert all words to lowercase
subtlexus['Word'] = subtlexus['Word'].str.lower()

# Regex for finding alternate pronunciations of words (which are structured as
# "word(int)")
ralt = r"(\w+)\(\d+\)"

# Regex for finding unwanted punctuation in words (essentially any non-word)
rpunc = r".*(\W|\d).*"
# Regex for three-peated characters (any word with three or more of the same
# letter in a row should be omitted, as none are valid English words for the
# purposes of the toolkit)
rpeat = r".*(.)\1\1.*"

altpronsct = 0

# Iterate through the lines of the dictionary. Add only such words containing
# no parentheses and with a corresponding entry in the SUBTLEXUS to the
# dictionary of cmu words that will be kept for analysis.
with open('cmudict-0.7b-2024-4-6.txt') as file:
    # Skip the first 56 lines as these contain text we are not interested in
    for line in file.readlines()[56:]:
        word, pronunciation = line.strip().split(maxsplit=1)
        word = word.lower()
        alt = re.match(ralt, word)

        # First check if this is an alt pronunciation for a word
        if alt is not None:
            alt_text = alt.group(1)
            # The do the checks on the root
            if re.match(rpeat, alt_text) is None \
                and alt_text in subtlexus['Word'].values \
                and re.match(rpunc, alt_text) is None:
                cmu_dict.append((alt_text, pronunciation))
                altpronsct += 1 # Keep track of num alts
        else:
            # Otherwise just check general critera
            if re.match(rpeat, word) is None \
                and re.match(rpunc, word) is None \
                and word in subtlexus['Word'].values:
                cmu_dict.append((word, pronunciation))

In [4]:
# Display the final number of words in the dataset
print(len(cmu_dict))
# Check how many alternate pronunciations appeared
print(altpronsct)

53388
5035


### Task 2: Cross-reference CMU Dictionary Pronunciations with Cambridge Prounciations

First, the CMU dictionary pronunciations will need to be converted to reflect the Cambridge dictionary pronunciation format. The transcriptions csv aids in these conversions.

In [5]:
# Generates a list of all possible transcriptions of a cmu word in IPA
# form recursively.
def possible_transcriptions(cmu_word, replacements):
    def helper(index, current_transcription):
        # If we are at the end of the word, return the constructed result
        if index >= len(cmu_word):
            transcriptions.append(current_transcription)
            return

        # Grab the current phoneme by checking to see if we are at the last
        # phoneme (end of the word) or the next whitespace
        pend = cmu_word[index:].find(" ")
        if pend != -1:
            phoneme = cmu_word[index:index + pend]
        else:
            phoneme = cmu_word[index:]
            pend = len(cmu_word)

        # Determine if this is a phoneme that is sensitive to the number at the
        # end (i.e. AH0 is differentiated from AH1)
        m = re.match(r"(\w+)\d+", phoneme)
        
        # If it isn't, remove the number for consideration
        if m != None and m.group(1) != "AH":
            phoneme = m.group(1)

        # Recursively generate all possible combinations of phonemes
        if phoneme in replacements:
            for option in replacements[phoneme]:
                helper(index + pend + 1, current_transcription + option)
        else:
            helper(index + pend + 1, current_transcription + phoneme)

    # Call for the word
    transcriptions = []
    helper(0, "")

    # Remove all whitespace and extra numbers from the resultant transcription
    for t in transcriptions:
        t = t.replace(" ", "")
        t = re.sub(r"\d", "", t)

    return transcriptions

In [6]:
# Load the transcriptions csv
transcriptions = pd.read_csv('transcriptions/transcriptions.csv')

# Convert the cmu_dict dictionary to a pandas dataframe
cmu_df = pd.DataFrame(cmu_dict, columns=['Word', 'Pronunciation'])

# Iterate through the transcriptions and generate a dict of transcriptions
# There are two special cases: ER and AA, where each have two different
# representation possibilities. These cases need to be handled separately.
# Furthermore, sometimes "AA" is followed by a number of the format "AAn". In
# such cases we ignore the number and just replace as AA. To do so after we 
# apply all pronunciation transcriptions we just remove the remaining numbers
# from the transcription. This is done below.
replacements = {}  # Dict{str : [str]}
special_replacements_ER = ["ɝ", "ɚ"]
special_replacements_AA = ["ɑ", "ɒ"]

for index, row in transcriptions.iterrows():
    cmu_p = row['CMU']
    ipa_p = row['IPA']

    # Skip the special cases where the CMU pronunciation is "ER" or "AA"
    if cmu_p == "ER" or cmu_p == "AA":
        continue
    
    # Add the IPA representation transcription to the dictionary
    replacements[cmu_p] = [ipa_p]

replacements["ER"] = special_replacements_ER
replacements["AA"] = special_replacements_AA

# Iterate through the cmu_dict dictionary and replace all CMU pronunciations
# with a list of all possible corresponding pronunciation transcriptions in
# IPA.
for index, row in cmu_df.iterrows():
    ts = possible_transcriptions(row['Pronunciation'], replacements)

    cmu_df.at[index, 'Pronunciation'] = ts

# Observe some of the results
print(cmu_df[:5])

       Word                         Pronunciation
0         a                                   [ə]
1         a                                  [eɪ]
2        aa                                [eɪeɪ]
3       aah                                [ɑ, ɒ]
4  aardvark  [ɑɹdvɑɹk, ɑɹdvɒɹk, ɒɹdvɑɹk, ɒɹdvɒɹk]


As another special case we want to handle for consistency, after we produce all of the IPA pronunciations, we want to make sure all of the "un"s are stressed (for example, words like "undo" or "understanding") (becuase this is the convention the Toolkit has taken). The CMU dictionary uses the unstressed version, so we will just have to check for all words that start with "un" and whose corresponding pronunciation starts with "ə" and replace the "ə" with a "ʌ" to indicate stressed in the IPA representation.

In [7]:
for index, row in cmu_df.iterrows():
    # Check if the first two characters are "un"
    if row['Word'][:2] == "un":
        newps = []
        for p in row['Pronunciation']:
            if p[0] == "ə":
                newps.append("ʌ" + p[1:])
        row['Pronunciation'] = newps

In [8]:
# Also just get an output of the results temporarily
cmu_df.to_csv("temp/cmu_pronuns_firstpass.csv")

Now that all CMU dictionary pronunciations have been updated to IPA format, we go through the pronunciations obtained from the Cambridge dictionary and compare.

In [9]:
# CHANGE: read in cambridge_ipas which is now a df with four columns:
# DFof str str int int
# with: str: word 
#       str: space-separated pronunciation list
#       int: root word returned (if we searched for a conjugation)? 0 no 1 yes
#       int: missing from cambridge dictionary? 0 no 1 yes

# Now, cmu df is a df of int str List[str]
# So our final df should be:
# cmu_word cmu_ipas cam_ipas match_pron*_ipa match_pron*_toolkit (cmuipa, cambipa) alt_pron? cambridge_root_returned missing_from_cambrdige present_and_discrepant
# Dfof str List[str] List[str] List[str] List[str] List[(int, int)] int int int
# all ints are 1 true 0 false
# something is discrepant if the matched pronunciations are empty
# with: str: cmu word
#       List[str]: list of possible pronunciations of the cmu word in ipa format 
#       List[str]: list (possibly empty) of pronunciations for the word returned by cambridge
#       List[str]: list (possibly empty) of matched pronunciations, in ipa format 
#       List[str]: list (possibly empty) of the same matched prons in toolkit format 
#       List[(int, int)]: list of 1-indexed tuples corresponding to the matched* pronunciations (cmu, cambridge)
#       int: was this cmu word an alternate pronunciation? for example, is this the second valid entry in the dictionary for a word?
#       int: were the pronunciations from cambridge given as the pronunciations from the root of this word as opposed to the word (likely a conjugate) itself?
#       int: was this word completely missing from the cambridge dictionary?

# Read in the file, cambridge_ipas.csv, containing all Cambridge words and their potential (space-separated) pronunciations.
cambridge_ipas = pd.read_csv("cambridge_ipas.csv")

# Convert all of the pronunciations, which are currently space-separated words, into a list of such words
for index, row in cambridge_ipas.iterrows():
    if isinstance(row['Pronunciation'], str):
        cambridge_ipas.at[index, 'Pronunciation'] = row['Pronunciation'].split()
    else:
        cambridge_ipas.at[index, 'Pronunciation'] = []

In [10]:
cambridge_ipas.loc[cambridge_ipas['Word'] == "abacus", 'Root Word Returned'].iloc[0]

0

In [11]:
# NOTE: This cell takes a few minutes to run!

# Initialize the final data frame
df = pd.DataFrame(columns=['CMU Word', 'CMU IPAs', 'Cambridge IPAs', 'Matched Pronunciations (IPA)', 'Matched Pronunciations (Toolkit)', 'Pronunciation Matches', 'Alternate Pronunciation', 'Root From Cambridge', 'Missing From Cambridge'])

prev_word = ""
# Iterate through all words in cmu_df.
for index, row in cmu_df.iterrows():
    # Initialize the row and grab the current word
    cmu_word = row['Word']
    new_row = {
        'CMU Word': cmu_word,
        'CMU IPAs': row['Pronunciation'],
        'Cambridge IPAs': [],
        'Matched Pronunciations (IPA)': [],
        'Matched Pronunciations (Toolkit)': [],
        'Pronunciation Matches': [],
        'Alternate Pronunciation': 0,
        'Root From Cambridge': 0,
        'Missing From Cambridge': 0
    }

    # First check if the word was not present in cambridge
    if cmu_word not in cambridge_ipas['Word'].values or cambridge_ipas.loc[cambridge_ipas['Word'] == cmu_word, 'Missing'].iloc[0] == 1:
        new_row['Missing From Cambridge'] = 1
        df.loc[len(df)] = new_row
        prev_word = cmu_word
        continue
    
    # Check if it is the root word that was returned
    # At this point we know the cmu word exists in cambridge
    new_row['Root From Cambridge'] = cambridge_ipas.loc[cambridge_ipas['Word'] == cmu_word, 'Root Word Returned'].iloc[0]
    # Check if it is an alternate pronunciation
    new_row['Alternate Pronunciation'] = 1 if prev_word == cmu_word else 0

    # Next grab all the pronunciations from cambridge_ipas and pairwise compare
    new_row['Cambridge IPAs'] = cambridge_ipas.loc[cambridge_ipas['Word'] == cmu_word, 'Pronunciation'].iloc[0]
    match_list = []
    tuple_matches = []
    for i, cmu_ipa in enumerate(new_row['CMU IPAs']):
        for j, camb_ipa in enumerate(new_row['Cambridge IPAs']):
            if cmu_ipa == camb_ipa:
                match_list.append(cmu_ipa)
                tuple_matches.append((i + 1, j + 1))
    new_row['Matched Pronunciations (IPA)'] = match_list
    new_row['Pronunciation Matches'] = tuple_matches

    # Append the row
    df.loc[len(df)] = new_row

    # Finally set the new previous word for the purpose of checking alt prons
    prev_word = cmu_word

In [12]:
# Now, for all words in the result, we want to get the Toolkit transcription as well
tdf = pd.read_csv("transcriptions/transcriptions.csv")
t_dict = dict(zip(tdf['IPA'], tdf['Toolkit']))
t_dict.pop("ɝ or ɚ")
t_dict.pop("ɑ~ɒ")
t_dict["ɑ"] = "a"
t_dict["ɒ"] = "a"
t_dict["ɚ"] = "3r"
t_dict["ɝ"] = "3r"

df['Matched Pronunciations (Toolkit)'] = df['Matched Pronunciations (IPA)'].apply(lambda x: ''.join(t_dict.get(char, char) for char in x))

for index, row in df.iterrows():
    matched_ps = row['Matched Pronunciations (IPA)']
    # Ensures there are no duplicate toolkit transcriptions in the final result
    toolkit_ps = list(set([''.join(t_dict.get(char, char) for char in x) for x in matched_ps]))
    df.at[index, 'Matched Pronunciations (Toolkit)'] = toolkit_ps

# TODO: remove this after fixing the Root Returned indicator
df = df.drop('Root From Cambridge', axis=1)

# Output to a csv
df.to_csv('cmu_ipa_cambridge.csv', index = False)

In [13]:
import ast

In [15]:
# We can then grab some interesting statistics
# Total number of entries 
print(f"Total words: {len(df)}")
# Number of missing words (there might be a decent amount of junk here)
print(f"Word missing from Cambridge: {df['Missing From Cambridge'].value_counts()[1]}")
# Number of discrepant words
def discrepant(row):
    # Check if the columns are already lists
    cm = row['Cambridge IPAs']
    mp = row['Matched Pronunciations (IPA)']
    
    # Parse the string representation of lists if they are not already lists
    if isinstance(cm, str):
        cm = ast.literal_eval(cm)
    if isinstance(mp, str):
        mp = ast.literal_eval(mp)
    if len(cm) > 0 and len(mp) == 0:
        print(row['CMU Word'])
    return len(cm) > 0 and len(mp) == 0
print(f"Words present in Cambridge but with no matches to CMU pronunciations: {df.apply(discrepant, axis=1).sum()}")
# Number of alternate pronunciations present
print(f"Alternate valid pronunciations present: {df['Alternate Pronunciation'].value_counts()[1]}")

Total words: 53388
Word missing from Cambridge: 7037
a
a
ab
abalones
abandoning
abandons
abated
abates
abating
abbreviations
abdicated
abdicates
abdicating
abdication
abdomen
abdominal
abducted
abducted
abductee
abductees
abducting
abducting
abduction
abductions
abductions
abductor
abductors
abductors
abducts
aberrant
aberration
aberrations
abetted
abetting
abhors
abided
abilities
abnormalities
abo
abolished
abolishes
abolishing
abolitionist
abolitionists
abolitionists
abominations
aboriginal
aborigine
aborigines
aborted
aborting
abortionists
abortionists
abortions
aborts
abounded
abounding
abounds
abrasives
abridging
abrogated
abrogating
abs
absconded
absconding
absences
absentees
absinthe
absolve
absolved
absolved
absolves
absolves
absolving
absolving
absorb
absorbs
absorption
abstain
abstained
abstained
abstaining
abstaining
abstention
abstention
abstentions
abstentions
abstractions
abstracts
abstruse
absurdities
abused
abusers
abuses
abuses
abusing
abuts
abyssinian
academician
acad