# ESPADA Spanish Annotated Dictionary Parser

In [1]:
import pandas as pd
import os
import re

In [2]:
# Reassemble the dictionary from its three CSV parts into one dataframe.
# NOTE: Two entries containing the unicode characters '\u200e' and '\xad' were
# removed by hand from the original dictionary; they appeared to be repeats of
# entries that are already present.
part_paths = [
    'SpanishAnnotatedDictionary_part1.csv',
    'SpanishAnnotatedDictionary_part2.csv',
    'SpanishAnnotatedDictionary_part3.csv',
]
df1_loaded, df2_loaded, df3_loaded = [pd.read_csv(path) for path in part_paths]

# Stack the parts in order; ignore_index=True gives the result a fresh 0..n-1 index.
espada_df = pd.concat(
    [df1_loaded, df2_loaded, df3_loaded],
    ignore_index=True,
)

print(espada_df.shape)

(628298, 35)


Obtain the set of all unique phonemes present in the dictionary. Note that, for simplicity, every phoneme is written as a single letter, so in this dictionary there is a one-to-one correspondence between these letters and the phonemes they represent.

In [3]:
# Each 'MainBase' value is split on whitespace into phoneme symbols; gather
# every distinct symbol across all rows.
unique_phonemes = {
    symbol
    for sequence in espada_df['MainBase']
    for symbol in sequence.split()
}

print(f"Total unique phonemes: {len(unique_phonemes)}")
print(f"Unique phonemes: {unique_phonemes}")

Total unique phonemes: 36
Unique phonemes: {'T', 'D', 'l', 'f', 'J', 'j', 'm', 'B', 'b', 'W', 'Y', 'p', 'G', 't', 'E', 'O', 'i', 'x', 'A', 'R', 'N', 'u', 'S', 'r', 'n', 'C', 'd', 'w', 'I', 'U', 'o', 's', 'g', 'e', 'k', 'a'}


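Obtain a list of words that together span all phonemes. Words are picked greedily in row order (a word is kept whenever it contains a phoneme not yet seen), so the list is not necessarily the smallest possible one.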

In [4]:
# Greedily pick words, in row order, until together they contain every phoneme
# in `unique_phonemes`. A word is kept only if it contributes at least one
# phoneme not seen so far, so the result covers all phonemes but is not
# necessarily the smallest such list.
seen_phonemes = set()
selected_words = []

# Walk the two needed columns in parallel rather than using `iterrows()`,
# which constructs a Series for every row and is considerably slower.
for entry, phoneme_sequence in zip(espada_df['Entry'], espada_df['MainBase']):
    phonemes = set(phoneme_sequence.split())

    if not phonemes.issubset(seen_phonemes):
        selected_words.append(entry)
        seen_phonemes.update(phonemes)

        # Stop as soon as every known phoneme has been covered.
        if seen_phonemes == unique_phonemes:
            break

print(f"Number of selected words: {len(selected_words)}")
print(f"Selected words: {selected_words}")

Number of selected words: 29
Selected words: ['a', 'aaron', 'aarón', 'aarónico', 'aaronita', 'ab', 'aba', 'ababilla', 'ababillabais', 'ababillábamos', 'ababillado', 'ababilláis', 'ababillándome', 'ababillaré', 'ababol', 'abacería', 'abacha', 'abadejo', 'abadengo', 'abajeño', 'abakuá', 'abalaustrado', 'abaleadura', 'abapó', 'abarcucé', 'abarraca', 'abarragamiento', 'abifetear', 'accha']


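Obtain the set of all one-to-one grapheme-phoneme mappings. Only entries whose number of letters equals their number of phonemes are used, and letters are paired with phonemes by position.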

In [5]:
# Collect every distinct (letter, phoneme) pair obtained by pairing an entry's
# letters position-by-position with its phonemes.
# NOTE(review): equal length does not by itself guarantee that letter i
# corresponds to phoneme i, so some collected pairs may be alignment
# artefacts -- confirm before treating this set as a clean mapping table.
phoneme_grapheme_mappings = set()

# Walk the two needed columns in parallel rather than using `iterrows()`,
# which constructs a Series for every row and is considerably slower on a
# full pass over the dictionary.
for entry, phoneme_sequence in zip(espada_df['Entry'], espada_df['MainBase']):
    # Split the phoneme sequence on whitespace to get individual phonemes
    phonemes = phoneme_sequence.split()

    # Only entries with exactly one phoneme per letter are paired by position
    if len(entry) == len(phonemes):
        phoneme_grapheme_mappings.update(zip(entry, phonemes))

print(f"Total unique phoneme-grapheme mappings: {len(phoneme_grapheme_mappings)}")
print(f"Phoneme-grapheme mappings: {phoneme_grapheme_mappings}")

Total unique phoneme-grapheme mappings: 229
Phoneme-grapheme mappings: {('b', 'b'), ('á', 'A'), ('u', 'E'), ('q', 'E'), ('ú', 'Y'), ('l', 'u'), ('l', 'A'), ('g', 'N'), ('u', 'U'), ('ú', 'U'), ('r', 'r'), ('e', 'w'), ('q', 'e'), ('u', 'e'), ('l', 'Y'), ('l', 'E'), ('u', 'm'), ('l', 't'), ('l', 'U'), ('i', 's'), ('m', 'u'), ('c', 'C'), ('ç', 's'), ('l', 'e'), ('x', 'x'), ('w', 'w'), ('y', 'j'), ('x', 's'), ('e', 'j'), ('w', 'b'), ('r', 't'), ('i', 'l'), ('a', 'w'), ('h', 'W'), ('p', 'p'), ('r', 'U'), ('s', 's'), ('h', 't'), ('è', 'E'), ('r', 'e'), ('u', 'J'), ('c', 's'), ('h', 'C'), ('j', 'o'), ('i', 'f'), ('m', 'a'), ('m', 'm'), ('s', 'i'), ('z', 's'), ('c', 'I'), ('o', 'k'), ('c', 'l'), ('e', 'k'), ('î', 'J'), ('b', 'B'), ('n', 'a'), ('o', 'E'), ('e', 'E'), ('y', 'Y'), ('w', 's'), ('o', 'W'), ('a', 's'), ('g', 'r'), ('z', 'T'), ('f', 'f'), ('r', 'R'), ('o', 't'), ('u', 'W'), ('ú', 'W'), ('e', 'f'), ('i', 'j'), ('u', 't'), ('r', 'i'), ('é', 'p'), ('e', 'e'), ('a', 'l'), ('h', 'i'), ('p'

Build a dataframe of words, together with their phonemes and the mappings they contain, such that the words jointly span all of the mappings found in the previous step, and save it to a CSV file.

In [6]:
# Greedily select words, in row order, whose (letter, phoneme) pairs together
# cover every pair in `phoneme_grapheme_mappings`, then save them to CSV.

# (letter, phoneme) pairs already covered by a selected word
seen_mappings = set()

# One record per selected word
spanning_words = []

# Walk the two needed columns in parallel rather than using `iterrows()`,
# which constructs a Series for every row and is considerably slower on a
# full pass over the dictionary.
for entry, phoneme_sequence in zip(espada_df['Entry'], espada_df['MainBase']):
    # Split the phoneme sequence on whitespace to get individual phonemes
    phonemes = phoneme_sequence.split()

    # Only entries with exactly one phoneme per letter are paired by position
    if len(entry) != len(phonemes):
        continue

    word_mappings = list(zip(entry, phonemes))

    # Keep the word only if it introduces at least one unseen pair
    new_mappings = [pair for pair in word_mappings if pair not in seen_mappings]
    if not new_mappings:
        continue

    spanning_words.append({
        'entry': entry,
        'phonemes': phonemes,
        'mappings': word_mappings
    })
    seen_mappings.update(new_mappings)

    # If all phoneme-grapheme mappings are found, we can exit early
    if len(seen_mappings) == len(phoneme_grapheme_mappings):
        break

# Naming the columns explicitly keeps the CSV header even if no word qualified
df_spanning = pd.DataFrame(spanning_words, columns=['entry', 'phonemes', 'mappings'])

# Output the DataFrame to a CSV file
df_spanning.to_csv('spanning_words.csv', index=False)

print(f"Total spanning words: {len(spanning_words)}")
print("Spanning words saved to 'spanning_words.csv'")

Total spanning words: 144
Spanning words saved to 'spanning_words.csv'
