# ESPADA Spanish Annotated Dictionary Parser

In [1]:
import pandas as pd
import os
import re

In [2]:
# Re-assemble the dictionary from its three CSV parts into one dataframe.
# NOTE: Two entries containing the unicode characters '\u200e' and '\xad' were
# manually removed from the original dictionary; they appeared to be repeats
# of entries already present.
df1_loaded, df2_loaded, df3_loaded = (
    pd.read_csv(part_path)
    for part_path in (
        'SpanishAnnotatedDictionary_part1.csv',
        'SpanishAnnotatedDictionary_part2.csv',
        'SpanishAnnotatedDictionary_part3.csv',
    )
)

# Stack the parts in order and renumber the rows 0..n-1.
espada_df = pd.concat(
    (df1_loaded, df2_loaded, df3_loaded),
    ignore_index=True,
)

print(espada_df.shape)

(628298, 35)


Obtain the set of all unique phonemes present in the dictionary. Note that, for simplicity, every phoneme is written as a single character, so in this dictionary there is a one-to-one correspondence between symbols and the phonemes they represent.

In [3]:
# Gather every distinct phoneme symbol that occurs in the 'MainBase' column;
# each value is split on whitespace to obtain its individual phonemes.
unique_phonemes = {
    symbol
    for transcription in espada_df['MainBase']
    for symbol in transcription.split()
}

print(f"Total unique phonemes: {len(unique_phonemes)}")
print(f"Unique phonemes: {unique_phonemes}")

Total unique phonemes: 36
Unique phonemes: {'r', 'p', 'o', 'n', 'w', 'j', 'J', 'O', 'd', 'G', 'C', 'x', 'N', 'k', 'g', 'a', 'E', 'f', 'A', 's', 'U', 'D', 'l', 'u', 'm', 'R', 'I', 'b', 'i', 't', 'S', 'Y', 'W', 'e', 'B', 'T'}


Obtain a list of words spanning all phonemes.

In [4]:
# Greedily walk the dictionary in row order, keeping every word that
# contributes at least one phoneme not covered so far, and stop once the
# covered set equals `unique_phonemes`.
seen_phonemes = set()
selected_words = []

# Zip just the two columns that are read. `iterrows()` would construct a
# whole-row Series on every iteration only to pull these two values out of it.
for entry, phoneme_sequence in zip(espada_df['Entry'], espada_df['MainBase']):
    phonemes = set(phoneme_sequence.split())

    # A word whose phonemes have all been seen adds nothing to the coverage.
    if phonemes.issubset(seen_phonemes):
        continue

    selected_words.append(entry)
    seen_phonemes.update(phonemes)

    # Every phoneme is now covered; no need to scan the remaining rows.
    if seen_phonemes == unique_phonemes:
        break

print(f"Number of selected words: {len(selected_words)}")
print(f"Selected words: {selected_words}")

Number of selected words: 29
Selected words: ['a', 'aaron', 'aarón', 'aarónico', 'aaronita', 'ab', 'aba', 'ababilla', 'ababillabais', 'ababillábamos', 'ababillado', 'ababilláis', 'ababillándome', 'ababillaré', 'ababol', 'abacería', 'abacha', 'abadejo', 'abadengo', 'abajeño', 'abakuá', 'abalaustrado', 'abaleadura', 'abapó', 'abarcucé', 'abarraca', 'abarragamiento', 'abifetear', 'accha']


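Obtain the set of all one-to-one grapheme-phoneme mappings: each letter of an entry is paired with the phoneme at the same position, using only entries whose number of letters equals their number of phonemes.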

In [6]:
# Collect positional (letter, phoneme) pairs from every entry whose spelling
# and transcription have the same length.
# Note: despite the variable name, each tuple is ordered (letter, phoneme).
phoneme_grapheme_mappings = set()

# Zip just the two columns that are read. `iterrows()` would construct a
# whole-row Series for every row of the dataframe, which is needlessly slow
# for a full pass like this one.
for entry, phoneme_sequence in zip(espada_df['Entry'], espada_df['MainBase']):
    # Split the phoneme sequence on whitespace to get the individual phonemes.
    phonemes = phoneme_sequence.split()

    # Positional pairing is only attempted when there is exactly one phoneme
    # per character of the entry.
    # NOTE(review): equal lengths alone do not guarantee that the i-th letter
    # really corresponds to the i-th phoneme, so some pairs may be spurious.
    if len(entry) != len(phonemes):
        continue

    # Record each (letter, phoneme) pair; the set removes duplicates.
    phoneme_grapheme_mappings.update(zip(entry, phonemes))

print(f"Total unique phoneme-grapheme mappings: {len(phoneme_grapheme_mappings)}")
print(f"Phoneme-grapheme mappings: {phoneme_grapheme_mappings}")

Total unique phoneme-grapheme mappings: 229
Phoneme-grapheme mappings: {('h', 'k'), ('y', 'J'), ('g', 'r'), ('r', 'p'), ('r', 'o'), ('c', 'n'), ('e', 'm'), ('l', 'E'), ('h', 'o'), ('n', 'a'), ('o', 'E'), ('a', 'w'), ('u', 'a'), ('e', 'I'), ('ö', 'o'), ('l', 'p'), ('s', 'a'), ('u', 'U'), ('o', 'o'), ('è', 'E'), ('p', 'p'), ('p', 'o'), ('r', 'k'), ('o', 'D'), ('x', 'k'), ('ã', 'a'), ('i', 't'), ('u', 'W'), ('f', 'f'), ('m', 'u'), ('i', 'j'), ('c', 'I'), ('ú', 'Y'), ('h', 'C'), ('t', 't'), ('i', 'J'), ('y', 'p'), ('î', 'J'), ('á', 'A'), ('r', 'r'), ('e', 't'), ('t', 'A'), ('i', 'd'), ('m', 'i'), ('e', 'W'), ('f', 'l'), ('i', 'O'), ('r', 'e'), ('o', 'J'), ('p', 's'), ('h', 'e'), ('h', 'O'), ('t', 'O'), ('e', 'n'), ('u', 'm'), ('x', 'x'), ('v', 'b'), ('l', 'e'), ('g', 'x'), ('x', 's'), ('n', 'I'), ('i', 'f'), ('p', 'r'), ('u', 'I'), ('ö', 'O'), ('a', 'A'), ('o', 'O'), ('q', 'E'), ('ñ', 'N'), ('a', 'r'), ('i', 'k'), ('w', 'W'), ('q', 'k'), ('i', 'E'), ('í', 'I'), ('p', 'e'), ('b', 'B'), ('t'