## Input Prep

This notebook prepares the CMU Dictionary for input into the Sublexical Toolkit for analysis.

Author: Caleb Solomon

In [23]:
# Imports
import pandas as pd
import cambridge_parser as parser
import re

### Task 1: Initialize the CMU Dictionary and trim it.

The first several dozen lines of the dictionary file are a comment header rather than dictionary entries. There are also a significant number of entries containing digits, parentheses, or other punctuation that are unnecessary for input to the Sublexical Toolkit. Furthermore, we want to keep only words that also have an entry in the SUBTLEXus word list.

In [24]:
# Display the first 20 lines of the raw dictionary file for reference; they
# show the comment header mentioned above.
# The context manager guarantees the file is closed even if printing fails.
with open('cmudict-0.7b-2024-4-6.txt') as fcmu:
    # zip() against range(20) stops after 20 lines, so the rest of the (large)
    # file is never read into memory.
    for _, line in zip(range(20), fcmu):
        print(line.strip())

;;; # CMUdict  --  Major Version: 0.07
;;;
;;; # $HeadURL$
;;; # $Date::                                                   $:
;;; # $Id::                                                     $:
;;; # $Rev::                                                    $:
;;; # $Author::                                                 $:
;;;
;;; #
;;; # Copyright (C) 1993-2015 Carnegie Mellon University. All rights reserved.
;;; #
;;; # Redistribution and use in source and binary forms, with or without
;;; # modification, are permitted provided that the following conditions
;;; # are met:
;;; #
;;; # 1. Redistributions of source code must retain the above copyright
;;; #    notice, this list of conditions and the following disclaimer.
;;; #    The contents of this file are deemed to be source code.
;;; #


In [25]:
# Create a dictionary of words to pronunciations, keeping only the words that
# are usable by the toolkit and that also appear in the SUBTLEXUS word list.
# Dict {str : str}
cmu_dict = {}

# Import the SUBTLEXUS csv to a pandas dataframe.
subtlexus = pd.read_csv('SUBTLEXusExcel2007.csv')

# Convert all words to lowercase
subtlexus['Word'] = subtlexus['Word'].str.lower()

# Collect the SUBTLEXUS words in a set so that each membership test below is a
# hash lookup instead of a scan over the whole column for every dictionary
# line. Missing values are dropped; they can never equal a dictionary word.
subtlexus_words = set(subtlexus['Word'].dropna())

# Regex for finding unwanted punctuation in words (essentially any non-word
# character or digit)
rpunc = r".*(\W|\d).*"
# Regex for three-peated characters (any word with three or more of the same
# letter in a row should be omitted, as none are valid English words for the
# purposes of the toolkit)
rpeat = r".*(.)\1\1.*"

# Iterate through the lines of the dictionary. Add only such words containing
# no punctuation/digits and with a corresponding entry in the SUBTLEXUS to the
# dictionary of cmu words that will be kept for analysis.
with open('cmudict-0.7b-2024-4-6.txt') as file:
    for line_number, line in enumerate(file):
        # Skip the first 56 lines as these contain text we are not interested in
        if line_number < 56:
            continue

        # Each entry is "<WORD> <PRONUNCIATION>"; ignore blank or malformed
        # lines rather than failing on the tuple unpacking.
        parts = line.strip().split(maxsplit=1)
        if len(parts) != 2:
            continue
        word, pronunciation = parts
        word = word.lower()

        # Ensure the word doesn't contain punctuation or a tripled character
        # and is present in the SUBTLEXUS
        if (re.match(rpunc, word) is None
                and re.match(rpeat, word) is None
                and word in subtlexus_words):
            cmu_dict[word] = pronunciation

In [26]:
# Display the final number of words in the dataset
len(cmu_dict)

48353

### Task 2: Cross-reference CMU Dictionary Pronunciations with Cambridge Pronunciations

First, the CMU dictionary pronunciations will need to be converted to reflect the Cambridge dictionary pronunciation format. The transcriptions csv aids in these conversions.

In [27]:
# Generates a list of all possible transcriptions of a cmu word in IPA form.
def possible_transcriptions(cmu_word, replacements):
    """Return every IPA transcription a CMU pronunciation can map to.

    Parameters
    ----------
    cmu_word : str
        A CMU pronunciation: phonemes separated by whitespace, optionally
        followed by a stress digit (e.g. "AH0 B AW1 T").
    replacements : dict[str, list[str]]
        Maps a CMU phoneme to its candidate IPA symbols. A phoneme with
        several candidates multiplies the number of results.

    Returns
    -------
    list[str]
        One string per combination of candidates, in the order obtained by
        varying the last phoneme's candidates fastest. Spaces and digits are
        removed from every result. An empty ``cmu_word`` yields ``[""]``.
    """
    from itertools import product

    options_per_phoneme = []
    for phoneme in cmu_word.split():
        # Split a trailing stress number off the phoneme. Only letters are
        # captured so that the digits can never end up inside the base name.
        m = re.match(r"([A-Za-z]+)\d+", phoneme)

        # AH is sensitive to the number at the end (AH0 is differentiated
        # from AH1), so it keeps its number for the lookup; every other
        # phoneme is looked up without it.
        if m is not None and m.group(1) != "AH":
            phoneme = m.group(1)

        # Phonemes without a replacement are carried over unchanged.
        options_per_phoneme.append(replacements.get(phoneme, [phoneme]))

    # Build every combination, then remove all whitespace and leftover numbers
    # (e.g. from an unmapped "AH2"). The results are collected into a new list
    # because rebinding a loop variable would leave the list unchanged.
    transcriptions = []
    for combination in product(*options_per_phoneme):
        t = "".join(combination)
        t = t.replace(" ", "")
        t = re.sub(r"\d", "", t)
        transcriptions.append(t)

    return transcriptions

In [28]:
# Load the transcriptions csv
transcriptions = pd.read_csv('transcriptions/transcriptions.csv')

# Convert the cmu_dict dictionary to a pandas dataframe.
# NOTE: this expects cmu_dict to still map each word to its raw CMU string.
# A later cell rebinds cmu_dict to lists of IPA strings, so run the cells in
# order from a fresh kernel.
cmu_df = pd.DataFrame(list(cmu_dict.items()), columns=['Word', 'Pronunciation'])

# Build the dict of CMU phoneme -> list of IPA candidates.
# There are two special cases: ER and AA, where each have two different
# representation possibilities. Their rows in the csv are skipped and the
# candidates are supplied by hand instead.
# Furthermore, sometimes "AA" is followed by a number of the format "AAn". In
# such cases the number is ignored and the phoneme is replaced as AA; this is
# handled inside possible_transcriptions.
special_replacements_ER = ["ɝ", "ɚ"]
special_replacements_AA = ["ɑ", "ɒ"]

replacements = {  # Dict{str : [str]}
    cmu_p: [ipa_p]
    for cmu_p, ipa_p in zip(transcriptions['CMU'], transcriptions['IPA'])
    if cmu_p not in ("ER", "AA")
}
replacements["ER"] = special_replacements_ER
replacements["AA"] = special_replacements_AA

# Replace every CMU pronunciation with the list of all possible corresponding
# pronunciation transcriptions in IPA. apply() builds the new column in one
# step instead of writing list values cell by cell while iterating.
cmu_df['Pronunciation'] = cmu_df['Pronunciation'].apply(
    lambda pronunciation: possible_transcriptions(pronunciation, replacements)
)

# Observe some of the results
cmu_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'transcriptions.csv'

Now that all CMU dictionary pronunciations have been updated to IPA format, we go through the pronunciations obtained from the Cambridge dictionary and compare.

First, we write an output file with all of the words in the trimmed CMU dictionary that are not present in the Cambridge dictionary.

Then, we load the words obtained from the Cambridge dictionary and iterate through them, checking each against the pronunciations derived from the CMU dictionary. If one of the possible CMU pronunciations for a word appears in the list of Cambridge pronunciations for that word, we record that pronunciation together with its (1-based) position in the Cambridge list; both are written to an output file. If there is no matching pronunciation in the Cambridge dictionary (i.e. something is potentially amiss with the CMU dictionary pronunciation), we record "" as the pronunciation and "0" as the pronunciation number. In this way we can keep track of discrepancies.

In [None]:
# Read in the file, cambridge_ipas.csv, containing all Cambridge words and their potential (space-separated) pronunciations.
df = pd.read_csv("cambridge_ipas.csv")

# Map each word to the list of its pronunciations by splitting the
# space-separated string. An empty pronunciation cell is read by pandas as a
# float NaN, which has no .split(); such words get an empty list so they are
# reported as discrepancies later instead of raising here.
cambridge_pronunciations = {
    w: (p.split() if isinstance(p, str) else [])
    for w, p in zip(df['Word'], df['Pronunciation'])
}

In [None]:
# Compare the CMU entries against the Cambridge entries to find the CMU words
# that Cambridge does not have.
# From here on cmu_dict maps each word to its list of possible IPA
# pronunciations (taken from cmu_df).
cmu_dict = dict(zip(cmu_df['Word'], cmu_df['Pronunciation']))

# Words absent from Cambridge, with their candidate pronunciations joined into
# one space-separated string.
missing_dict = {
    word: ' '.join(options)
    for word, options in cmu_dict.items()
    if word not in cambridge_pronunciations
}
pop_list = list(missing_dict)
final_missing = list(missing_dict.items())

# Drop every word not present in Cambridge from the cmu dict
for word in pop_list:
    del cmu_dict[word]

# Write the trimmed-cmu entries that are missing from the Cambridge
# pronunciation set to a csv.
df = pd.DataFrame(final_missing, columns=['Word', 'Possible Pronunciations'])
df.to_csv("missing_cmu_words.csv", index=False)

Note that, by inspection, many of these words (e.g. "abandoning") are missing because a Cambridge search for an inflected form returns the entry for its root word (searching for "abandoning" yields "abandon"). Consequently, it may be necessary to go through the list manually and add pronunciations for such inflected forms.

In [None]:
# Next, we iterate over the words in the CMU and Cambridge dictionaries, checking to see if any
# possible pronunciations match. If one does, we take the matching pronunciation from the
# Cambridge dictionary, note its number, and add the word, pronunciation, number set to a data frame.
# If such a pronunciation does not exist, we add word, "", and 0 to the data frame.
# This can later be examined to see which CMU words have discrepancies.
def _ipa_to_toolkit(ipa, mapping, max_key_len):
    """Translate an IPA string to Toolkit notation.

    At each position the longest key of ``mapping`` that matches is used, so
    multi-character IPA symbols are translated as a unit. Characters without
    a mapping are copied through unchanged.
    """
    pieces = []
    pos = 0
    while pos < len(ipa):
        # Try multi-character keys first, longest first.
        for width in range(min(max_key_len, len(ipa) - pos), 1, -1):
            chunk = ipa[pos:pos + width]
            if chunk in mapping:
                pieces.append(mapping[chunk])
                pos += width
                break
        else:
            # No multi-character key matched: single-character lookup.
            char = ipa[pos]
            pieces.append(mapping.get(char, char))
            pos += 1
    return ''.join(pieces)


# Collect the rows in a list and build the frame once; appending to a
# DataFrame row by row copies data on every insertion.
records = []
for cmu_word, cmu_options in cmu_dict.items():
    # Keep track of the corresponding Cambridge pronunciation that is the same and
    # its (1-based) number, if there is one
    corresp_p = ""
    final_index = 0
    cambridge_options = cambridge_pronunciations[cmu_word]
    for p in cmu_options:
        if p in cambridge_options:
            # The first CMU candidate that appears in the Cambridge list wins;
            # once found we are done with this word.
            corresp_p = p
            final_index = cambridge_options.index(p) + 1
            break

    records.append({'Word': cmu_word,
                    'IPA Pronunciation': corresp_p,
                    'Cambridge Pronunciation Number': final_index})

df = pd.DataFrame(records, columns=['Word', 'IPA Pronunciation', 'Cambridge Pronunciation Number'])

# Now, for all words in the result, we want to get the Toolkit transcription as well
tdf = pd.read_csv("transcriptions/transcriptions.csv")
t_dict = dict(zip(tdf['IPA'], tdf['Toolkit']))
# The csv lists the two-variant phonemes under combined labels; replace those
# labels with one entry per actual IPA symbol. A default is passed to pop() so
# a csv without these labels does not raise.
t_dict.pop("ɝ or ɚ", None)
t_dict.pop("ɑ~ɒ", None)
t_dict["ɑ"] = "a"
t_dict["ɒ"] = "a"
t_dict["ɚ"] = "3r"
t_dict["ɝ"] = "3r"

# Longest IPA symbol in the table (non-string keys such as NaN are ignored).
# NOTE(review): a per-character lookup can never hit a multi-character key;
# this assumes such keys in the csv are meant to be translated as units.
max_ipa_len = max((len(k) for k in t_dict if isinstance(k, str)), default=1)

df['Toolkit'] = df['IPA Pronunciation'].apply(
    lambda ipa: _ipa_to_toolkit(ipa, t_dict, max_ipa_len)
)

# Output to a csv
df.to_csv('cmu_ipa_cambridge.csv', index = False)

In [None]:
# Report how many words had no matching Cambridge pronunciation (number 0)
no_match_mask = df['Cambridge Pronunciation Number'] == 0
print(f"Number of discrepancies: {no_match_mask.sum()}")

In [None]:
# Example: show the CMU-derived candidates and then the Cambridge
# pronunciations for one word
example_word = "accepting"
for source in (cmu_dict, cambridge_pronunciations):
    print(source[example_word])