# ESPADA Spanish Annotated Dictionary Parser

In [1]:
import pandas as pd
import os
import re
import json

In [2]:
# Rebuild the full dictionary frame from its three CSV parts.
# NOTE: Two entries from original dictionary manually removed containing unicode characters '\u200e', '\xad'
# These appeared to be repeats of entries already present.
part_paths = [
    'SpanishAnnotatedDictionary_part1.csv',
    'SpanishAnnotatedDictionary_part2.csv',
    'SpanishAnnotatedDictionary_part3.csv',
]

# Read the parts in order (part1, part2, part3) and keep each frame under its own name.
df1_loaded, df2_loaded, df3_loaded = [pd.read_csv(path) for path in part_paths]

# ignore_index=True renumbers the rows of the combined frame from 0.
espada_df = pd.concat([df1_loaded, df2_loaded, df3_loaded], ignore_index=True)

print(espada_df.shape)

(628298, 35)


Obtain the set of all unique phonemes present in the dictionary. Note that, for simplicity, every phoneme is written as a single character, and in this dictionary there is a 1-1 correspondence between these characters and their phonological counterparts.

In [3]:
# Pool every distinct phoneme symbol: each 'MainBase' value is split on
# whitespace and all resulting tokens are gathered into one set.
unique_phonemes = {
    phoneme
    for phoneme_sequence in espada_df['MainBase']
    for phoneme in phoneme_sequence.split()
}

print(f"Total unique phonemes: {len(unique_phonemes)}")
print(f"Unique phonemes: {unique_phonemes}")

Total unique phonemes: 36
Unique phonemes: {'w', 'T', 'g', 'x', 'f', 'r', 'J', 'N', 'G', 'l', 'p', 'U', 'C', 'o', 'O', 'Y', 'e', 'W', 't', 'i', 'd', 'E', 'n', 'A', 's', 'B', 'm', 'u', 'I', 'S', 'b', 'R', 'j', 'a', 'k', 'D'}


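Obtain a list of words that together span all phonemes. The words are picked greedily in dictionary order (a word is kept whenever it contains a phoneme not yet covered), so the list is not necessarily the smallest possible one.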

In [4]:
# Greedy pass in dictionary order: keep a word whenever it contributes at
# least one phoneme that is not covered yet, and stop as soon as every phoneme
# in unique_phonemes is covered. The result spans all phonemes but is not
# guaranteed to be the smallest such list.
seen_phonemes = set()
selected_words = []

# Iterate over just the two columns that are read; iterrows() would construct
# a full row Series for every one of the frame's rows.
for entry, phoneme_sequence in zip(espada_df['Entry'], espada_df['MainBase']):
    phonemes = set(phoneme_sequence.split())

    # A word made up only of already-covered phonemes adds nothing.
    if phonemes <= seen_phonemes:
        continue

    selected_words.append(entry)
    seen_phonemes |= phonemes

    # unique_phonemes is the full phoneme inventory collected from 'MainBase'.
    if seen_phonemes == unique_phonemes:
        break

print(f"Number of selected words: {len(selected_words)}")
print(f"Selected words: {selected_words}")

Number of selected words: 29
Selected words: ['a', 'aaron', 'aarón', 'aarónico', 'aaronita', 'ab', 'aba', 'ababilla', 'ababillabais', 'ababillábamos', 'ababillado', 'ababilláis', 'ababillándome', 'ababillaré', 'ababol', 'abacería', 'abacha', 'abadejo', 'abadengo', 'abajeño', 'abakuá', 'abalaustrado', 'abaleadura', 'abapó', 'abarcucé', 'abarraca', 'abarragamiento', 'abifetear', 'accha']


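Obtain a set of all 1-1 grapheme-phoneme mappings. Only entries whose number of letters equals their number of phonemes are used, and letters are paired with phonemes position by position. Note that equal length does not guarantee that the pairing is correctly aligned, so some of the collected pairs (e.g. `('m', 'o')` in the output below) may be spurious.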

In [5]:
# Set of (letter, phoneme) pairs. Despite the variable name, the grapheme
# comes first in each tuple.
phoneme_grapheme_mappings = set()

# Iterate over just the two columns that are read; iterrows() would construct
# a full row Series for every one of the frame's rows.
for entry, phoneme_sequence in zip(espada_df['Entry'], espada_df['MainBase']):
    # Split the phoneme sequence by spaces to get individual phonemes
    phonemes = phoneme_sequence.split()

    # Only entries with as many letters as phonemes are paired, position by position.
    # NOTE(review): equal length does not prove the letters and phonemes are
    # aligned, so some collected pairs may be spurious -- confirm before relying on them.
    if len(entry) == len(phonemes):
        phoneme_grapheme_mappings.update(zip(entry, phonemes))

print(f"Total unique phoneme-grapheme mappings: {len(phoneme_grapheme_mappings)}")
print(f"Phoneme-grapheme mappings: {phoneme_grapheme_mappings}")

Total unique phoneme-grapheme mappings: 229
Phoneme-grapheme mappings: {('m', 'o'), ('r', 't'), ('á', 'A'), ('j', 'o'), ('o', 'w'), ('a', 'm'), ('ú', 'u'), ('ü', 'w'), ('o', 'p'), ('a', 'n'), ('c', 'o'), ('q', 'I'), ('s', 'i'), ('ó', 'k'), ('d', 'r'), ('h', 'O'), ('e', 'm'), ('y', 'a'), ('e', 'n'), ('j', 'x'), ('u', 'W'), ('h', 'e'), ('i', 'd'), ('r', 'k'), ('m', 'm'), ('i', 'm'), ('ñ', 'N'), ('r', 'R'), ('v', 'B'), ('u', 'g'), ('ö', 'O'), ('l', 'u'), ('r', 'u'), ('r', 'a'), ('l', 'p'), ('g', 'l'), ('r', 'p'), ('à', 'a'), ('c', 'C'), ('t', 'O'), ('u', 'E'), ('i', 'J'), ('b', 'B'), ('é', 'E'), ('q', 'e'), ('o', 'm'), ('m', 'a'), ('t', 'E'), ('m', 'u'), ('c', 'k'), ('h', 'W'), ('o', 'n'), ('c', 'T'), ('a', 'W'), ('p', 'O'), ('s', 'e'), ('h', 'S'), ('c', 'a'), ('p', 'e'), ('l', 'A'), ('o', 'J'), ('o', 'O'), ('r', 'A'), ('e', 'W'), ('à', 'A'), ('h', 'E'), ('e', 'I'), ('t', 'e'), ('a', 'l'), ('q', 'i'), ('e', 'f'), ('n', 'w'), ('ú', 'Y'), ('i', 'i'), ('è', 'E'), ('x', 's'), ('n', 'a'), ('z'

Output a dataframe of words, their phonemes, and the mappings each word contains, chosen so that together the words span all of the mappings found in the previous step.

In [6]:
# Greedily pick words (in dictionary order) until every pair in
# phoneme_grapheme_mappings has appeared in at least one picked word.
seen_mappings = set()
spanning_words = []

# Iterate over just the two columns that are read; iterrows() would construct
# a full row Series for every one of the frame's rows.
for entry, phoneme_sequence in zip(espada_df['Entry'], espada_df['MainBase']):
    phonemes = phoneme_sequence.split()

    # Ensure the lengths of the entry and phoneme sequence match for 1-to-1 mapping
    if len(entry) != len(phonemes):
        continue

    word_mappings = list(zip(entry, phonemes))
    new_mappings = [mapping for mapping in word_mappings if mapping not in seen_mappings]

    # Words that contribute no unseen (letter, phoneme) pair are skipped.
    if not new_mappings:
        continue

    spanning_words.append({
        'entry': entry,
        'phonemes': phonemes,
        'mappings': word_mappings
    })

    # Update the set of seen mappings
    seen_mappings.update(new_mappings)

    # Exit early once every known mapping is covered. A superset test is used
    # rather than comparing sizes so that a stale phoneme_grapheme_mappings
    # (e.g. from an out-of-order cell run) cannot end the loop prematurely.
    if seen_mappings >= phoneme_grapheme_mappings:
        break

# The 'phonemes' and 'mappings' columns hold Python lists; to_csv writes them
# out as their string representation.
df_spanning = pd.DataFrame(spanning_words)
df_spanning.to_csv('spanning_words.csv', index=False)

print(f"Total spanning words: {len(spanning_words)}")
print("Spanning words saved to 'spanning_words.csv'")

Total spanning words: 144
Spanning words saved to 'spanning_words.csv'


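Output a dataframe of words, their corresponding phoneme sequences, and their IPA transcriptions as provided by the Spanish-IPA translator (from the Spanish/Spain dictionary, `es_ES.json`). The words are selected as in the previous step to span the grapheme-phoneme mappings, but only entries present in the translator's dictionary are kept, so every selected word is guaranteed to have a corresponding IPA transcription. Note that a mapping which occurs only in entries missing from the translator cannot be covered by this selection.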

Essentially, once this is verified, we will check the correspondence between the phonemes provided in the Spanish Annotated Dictionary and the IPA transcriptions from the translator. We will then map this correspondence across the entire annotated dictionary and use the resulting IPA transcriptions, because the transcriptions in the original Annotated Dictionary do not match the IPA conventions 1-1, even in the indicated columns.

In [11]:
# Load the IPA translations from the JSON file with UTF-8 encoding.
# The loaded object is used as a mapping from an entry string to its IPA value.
with open('es_ES.json', 'r', encoding='utf-8') as file:
    ipa_translations = json.load(file)

# Greedy selection in dictionary order, restricted to entries that have an
# IPA translation. Mappings that occur only in entries missing from
# ipa_translations can never be covered, so full coverage is not guaranteed.
seen_mappings = set()
spanning_words = []

# Iterate over just the two columns that are read; iterrows() would construct
# a full row Series for every one of the frame's rows.
for entry, phoneme_sequence in zip(espada_df['Entry'], espada_df['MainBase']):
    # Cheap membership test first: entries without an IPA translation are never selected.
    if entry not in ipa_translations:
        continue

    phonemes = phoneme_sequence.split()

    # Ensure the lengths of the entry and phoneme sequence match for 1-to-1 mapping
    if len(entry) != len(phonemes):
        continue

    new_mappings = [mapping for mapping in zip(entry, phonemes) if mapping not in seen_mappings]

    # Append only if there are new mappings not seen yet
    if not new_mappings:
        continue

    spanning_words.append({
        'entry': entry,
        'phonemes': phonemes,
        'ipa': ipa_translations[entry]  # Add the corresponding IPA pronunciation
    })

    # Update the set of seen mappings
    seen_mappings.update(new_mappings)

    # Exit early once every known mapping is covered. A superset test is used
    # rather than comparing sizes so that a stale phoneme_grapheme_mappings
    # cannot end the loop prematurely.
    if seen_mappings >= phoneme_grapheme_mappings:
        break

# Create DataFrame without the 'mappings' column
df_spanning = pd.DataFrame(spanning_words)

# Save to CSV (the 'phonemes' lists are written as their string representation)
df_spanning.to_csv('spanning_words_ipa_translations_ES.csv', index=False)

print(f"Total spanning words: {len(spanning_words)}")
print("Spanning words saved to 'spanning_words_ipa_translations_ES.csv'")


Total spanning words: 75
Spanning words saved to 'spanning_words_ipa_translations_ES.csv'


In [12]:
# Same selection procedure as the es_ES cell, run against the es_MX IPA dictionary.
# NOTE: this rebinds ipa_translations, seen_mappings, spanning_words and
# df_spanning, replacing the values produced by the es_ES cell.

# Load the IPA translations from the JSON file with UTF-8 encoding.
# The loaded object is used as a mapping from an entry string to its IPA value.
with open('es_MX.json', 'r', encoding='utf-8') as file:
    ipa_translations = json.load(file)

# Greedy selection in dictionary order, restricted to entries that have an
# IPA translation. Mappings that occur only in entries missing from
# ipa_translations can never be covered, so full coverage is not guaranteed.
seen_mappings = set()
spanning_words = []

# Iterate over just the two columns that are read; iterrows() would construct
# a full row Series for every one of the frame's rows.
for entry, phoneme_sequence in zip(espada_df['Entry'], espada_df['MainBase']):
    # Cheap membership test first: entries without an IPA translation are never selected.
    if entry not in ipa_translations:
        continue

    phonemes = phoneme_sequence.split()

    # Ensure the lengths of the entry and phoneme sequence match for 1-to-1 mapping
    if len(entry) != len(phonemes):
        continue

    new_mappings = [mapping for mapping in zip(entry, phonemes) if mapping not in seen_mappings]

    # Append only if there are new mappings not seen yet
    if not new_mappings:
        continue

    spanning_words.append({
        'entry': entry,
        'phonemes': phonemes,
        'ipa': ipa_translations[entry]  # Add the corresponding IPA pronunciation
    })

    # Update the set of seen mappings
    seen_mappings.update(new_mappings)

    # Exit early once every known mapping is covered. A superset test is used
    # rather than comparing sizes so that a stale phoneme_grapheme_mappings
    # cannot end the loop prematurely.
    if seen_mappings >= phoneme_grapheme_mappings:
        break

# Create DataFrame without the 'mappings' column
df_spanning = pd.DataFrame(spanning_words)

# Save to CSV (the 'phonemes' lists are written as their string representation)
df_spanning.to_csv('spanning_words_ipa_translations_MX.csv', index=False)

print(f"Total spanning words: {len(spanning_words)}")
print("Spanning words saved to 'spanning_words_ipa_translations_MX.csv'")


Total spanning words: 75
Spanning words saved to 'spanning_words_ipa_translations_MX.csv'


In [9]:
# Row count of the combined dictionary frame.
len(espada_df)

628298

In [10]:
# Number of entries in whichever IPA dictionary is currently bound to ipa_translations.
# NOTE(review): this cell's execution count (10) is lower than those of the
# es_ES / es_MX cells shown earlier (11, 12), so confirm which JSON file the
# displayed count refers to by re-running the notebook top to bottom.
len(ipa_translations)

595885