In [1]:
import pickle

file = "output/allosaurus_es_sr16000_ipa.pkl"

with open(file, "rb") as f:
    data = pickle.load(f)

In [2]:
import re

def parse_allosaurus_output(output: str):
    parsed = []
    for line in output.strip().splitlines():
        parts = line.split(maxsplit=2)
        time = float(parts[0])
        duration = float(parts[1])
        # Regex to extract phoneme and probability pairs
        phoneme_probs = re.findall(r'(\S+)\s*\(([\d.]+)\)', parts[2])
        phoneme_probs = [(p, float(prob)) for p, prob in phoneme_probs]
        parsed.append({
            "time": time,
            "duration": duration,
            "phones": phoneme_probs
        })
    return parsed


def get_most_probable_phones(parsed_output):
    most_probable = []
    for entry in parsed_output:
        if entry["phones"]:
            # # Sort phonemes by probability and take the most probable one
            # most_probable_phoneme = max(entry["phonemes"], key=lambda x: x[1])
            # They are already sorted by probability, so we can take the first one
            most_probable_phoneme = entry["phones"][0][0]
            most_probable.append(most_probable_phoneme)
    return most_probable    

In [3]:
parsed = {}
most_probable = {}
for k, v in data.items():
    parsed[k] = parse_allosaurus_output(v)
    most_probable[k] = get_most_probable_phones(parsed[k])

In [4]:
import pandas as pd

# Cols: file_name, phones
df_phones = pd.DataFrame(most_probable.items(), columns=["file_name", "phone_list"])

In [5]:
# Extract data from file_name e.g. desde_es061900_es.Argentina.wav -> desde (word), es061900 (id), es.Argentina (accent)
def parse_file_name(file_name):
    parts = file_name.split('_')
    word = parts[0]
    id_ = parts[1]
    accent = '_'.join(parts[2:]).replace('.wav', '')
    return word, id_, accent

df_phones[['word', 'id', 'accent']] = df_phones['file_name'].apply(parse_file_name).apply(pd.Series)

In [6]:
df_phones["phones"] = df_phones["phone_list"].apply(lambda x: " ".join(x))

In [7]:
df_phones.head()

Unnamed: 0,file_name,phone_list,word,id,accent,phones
0,glamour_es096711_es.Argentina.wav,"[d, iː, l̪, a, m, uə, ɾ]",glamour,es096711,es.Argentina,d iː l̪ a m uə ɾ
1,redactar_es164583_es.Mexico.wav,"[ɾ, e, b, a, t͡ɕ, iː, t, a, ɾ, a]",redactar,es164583,es.Mexico,ɾ e b a t͡ɕ iː t a ɾ a
2,descorrer_es061733_es.Argentina.wav,"[pʲ, ɪ, s, k, o, ɾ, ʀ, æ, ɾ]",descorrer,es061733,es.Argentina,pʲ ɪ s k o ɾ ʀ æ ɾ
3,enemigo_es075782_es.Castellano.wav,"[lʲ, e, n, e, m, ɪ, i, k, o]",enemigo,es075782,es.Castellano,lʲ e n e m ɪ i k o
4,indignar_es108278_es.Mexico.wav,"[iː, n, d, i, g, n, ɪ, ɾ]",indignar,es108278,es.Mexico,iː n d i g n ɪ ɾ


In [8]:
# Train logistic regression model to predict accent based on phones
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

# from gruut_ipa import Pronunciation

# def phone_tokenizer(text):
#     return [x.text for x in Pronunciation.from_string(text).phones]

In [9]:
import numpy as np
import pandas as pd

vectorizer_uni = CountVectorizer(
    lowercase=False,
    # tokenizer=phone_tokenizer,
    ngram_range=(1, 1),
    analyzer=lambda x: x,
    min_df=5,  # Minimum document frequency
    binary=True,  # Binary counts (presence/absence of n-grams)
)

X_vectorized = vectorizer_uni.fit_transform(df_phones["phone_list"].values)
y = df_phones["accent"].values
model = LogisticRegression(max_iter=1000, random_state=33)
model.fit(X_vectorized, y);

In [10]:
# Inspect largest coefficients per class:
def get_top_features_per_class(model, vectorizer, top_n=10):
    feature_names = vectorizer.get_feature_names_out()
    coefs = model.coef_
    classes = model.classes_

    top_features = {}
    for i, class_label in enumerate(classes):
        class_coefs = coefs[i]
        # # Get indices of top features in absolute value
        # top_indices = np.argsort(np.abs(class_coefs))[-top_n:][::-1]
        # Get indices of top features in absolute value
        top_indices = np.argsort(class_coefs)[-top_n:][::-1]
        # Get feature names and their corresponding coefficients
        top_features[class_label] = [(feature_names[idx], class_coefs[idx]) for idx in top_indices]
    return top_features

def contains_subsequence(data, pattern):
    m, n = len(data), len(pattern)
    if n == 0:
        return True   # empty pattern “always” matches
    if n > m:
        return False  # pattern longer than data can’t match
    # slide a window of length n over data
    for i in range(m - n + 1):
        if data[i:i+n] == pattern:
            return True
    return False

def get_examples_for_top_features(df, top_features, n_examples=3):
    examples = {}
    for class_label, features in top_features.items():
        examples[class_label] = {}
        for feature, coef in features:
            # Get rows where the feature appears
            rows = df[df['phone_list'].apply(lambda x: contains_subsequence(x, feature.split()))]
            # Take up to n_examples in format "word (phones), word (phones), ..."
            res = rows["word"] + " (" + rows["phones"] + ")"
            # Store the examples
            examples[class_label][feature] = ", ".join(
                res.head(n_examples).values
            )
            
    return examples


top_features = get_top_features_per_class(model, vectorizer_uni, top_n=10)
examples = get_examples_for_top_features(df_phones, top_features, n_examples=3)

# For each accent, show the top 10 phones, coefficients and examples:
for acc, features in top_features.items():
    print(f"Accent: {acc}")
    for feature, coef in features:
        print(f"    {feature}, ({coef:.4f})")
        print(f"        {examples[acc][feature]}")
        # print(f"        {examples[acc][feature].replace(', ', '\n')}")
    print()


Accent: es.Argentina
    ʐ, (1.9908)
        lacayo (l a k a ʐ o), zambullir (t a m uə ʐ i e ɾ), mudo (tʂ o ŋ ʐ o)
    sː, (1.3932)
        preceder (p a ɾ e sː e æ ð æ ɾ), presentador (b̞ ɾ i sː e n t a ð o ɾ), acertijo (ɒ sː e ɾ t i x o)
    ʋ, (1.3162)
        próspero (tʂ ɨ ɾ l o uə ʋ p e ɾ o), suspirar (s u ʋ p i lʲ a ɾ), perezosa (e ɾʲ e t ɒ o ʋ ʂ a ɪ)
    ɻ, (0.7557)
        lógico (ɪ ɻ o uə x i k o), bocacalle (b uə k a ɾ k a ɻ e), patológico (p ɒ t uə o ɻ o x i k ʌ)
    ʊ, (0.5874)
        estructurar (ɪ s t ɾ ʊ k t uə ɾ a ɾ), trailer (ʊ r ɒ ɟ iː lʲ e), hispanoamérica (ɪ s p a n ʊ a m e ɪ ɾʲ i k a)
    ә, (0.5655)
        hombro (ɳ ә b j ɒ o), competición (k o m ә p e ɪ t̪ i ɕ ɪ ə n), alambre (a lː æ m ә b ɾ e ɪ)
    æ, (0.4881)
        descorrer (pʲ ɪ s k o ɾ ʀ æ ɾ), preceder (p a ɾ e sː e æ ð æ ɾ), oriente (o ɾ j æ ɳ t e)
    v̞ʲ, (0.4476)
        pesticida (v̞ʲ e s t iː ɕ i ð ɪ), rebasar (t̪ ɾ v̞ʲ e ɪ b̞ a ʂ a ɾ), permisivo (v̞ʲ ɒ ɪ ɾ m iː s̪ iː p uə o)
    tː, (0.3895)
   

In [11]:
# same with bigrams:
vectorizer_bi = CountVectorizer(
    lowercase=False,
    analyzer='word',       # we’re working at word level
    tokenizer=lambda x: x,    # don’t split strings—already lists
    preprocessor=lambda x: x, # don’t lowercase or strip
    token_pattern=None,    # disable the default regex‑based tokenizer
    ngram_range=(2,2),    # only bigrams
    min_df=5,     # Minimum document frequency
    binary=True,  # Binary counts (presence/absence of n-grams)
)

X_vectorized = vectorizer_bi.fit_transform(df_phones["phone_list"].values)
y = df_phones["accent"].values
model = LogisticRegression(max_iter=1000, random_state=33)
model.fit(X_vectorized, y);

top_features = get_top_features_per_class(model, vectorizer_bi, top_n=10)
examples = get_examples_for_top_features(df_phones, top_features, n_examples=3)

# For each accent, show the top 10 phones, coefficients and examples:
for acc, features in top_features.items():
    print(f"Accent: {acc}")
    for feature, coef in features:
        print(f"    {feature}, ({coef:.4f})")
        print(f"        {examples[acc][feature]}")
        # print(f"        {examples[acc][feature].replace(', ', '\n')}")
    print()


Accent: es.Argentina
    ɕ t, (2.7409)
        destajo (d e ɕ t a x uə), cronista (uə ɾ o n iː ɪ ɕ t æ), cubista (uə b̞ i ɪ ɕ t a)
    ɕ m, (2.6909)
        organismo (ʔ uə ɾ ɹ̩ ɡ a n i ɕ m uə), favoritismo (s̪ a b̞ o l i t i ɕ m uə), cataclismo (t͡ɕ ɒ t̪ a ɪ k ʌ l i ɕ m o)
    ɕ k, (2.2045)
        risco (t̪ ɾ i ɕ k o), morisco (m uə ɾ i ɕ k ɔ), descafeinado (pʲ e ɕ k a f e i n a ð o)
    uə ʋ, (2.1019)
        próspero (tʂ ɨ ɾ l o uə ʋ p e ɾ o), apostillar (a p uə ʋ t̪ i ʐ a ɾ), insufrible (i n s uə ʋ f ɾ i b̞ lʲ e ɪ)
    ʐ a, (2.0773)
        astilla (ɒ t i ʐ a), grosella (g ɾ o ʂ e ʐ a), apoyar (ʔ a p uə ʐ a ɾ)
    ʐ e, (1.8774)
        banderillero (b a n d e ɾ i ʐ e ɾ o), proyectar (ɾ o ʐ e k t a ɾ), creyente (k ɾʲ e ʐ e æ n t e)
    z æ, (1.8648)
        paladar (b a j lː a z æ ɾ), ladilla (lː a ɪ ð i z æ), tabla (t a b iː z æ)
    iː ɾʲ, (1.8131)
        escrito (e s k iː ɾʲ i t o), creación (tʂ iː ɾʲ e æ sː j o n), creta (g iː ɾʲ e t a)
    ɪ ɾʲ, (1.7263)
        notaría (n o 

In [24]:
df_phones.head(2)

Unnamed: 0,file_name,phone_list,word,id,accent,phones
0,glamour_es096711_es.Argentina.wav,"[d, iː, l̪, a, m, uə, ɾ]",glamour,es096711,es.Argentina,d iː l̪ a m uə ɾ
1,redactar_es164583_es.Mexico.wav,"[ɾ, e, b, a, t͡ɕ, iː, t, a, ɾ, a]",redactar,es164583,es.Mexico,ɾ e b a t͡ɕ iː t a ɾ a


In [28]:
# Create df with cols: word, accents (N cols), phones:
df_words = df_phones.pivot_table(index=['word', 'id'], columns='accent', values='phones', aggfunc='first').reset_index()
df_words.query("word == 'usa'")

accent,word,id,es.Argentina,es.Castellano,es.Mexico
18042,usa,es195566,ʔ u t a,ɪ s t a ð o s u n i ð o uə s ð e ə a m e ɾ i k a,uə s ɒ n


In [36]:
import Levenshtein
from itertools import combinations

accent_cols = [col for col in df_words.columns if col not in ['word', 'id']]
results = []
for _, row in df_words.iterrows():
    word = row['word']
    wordid = row['id']
    for acc1, acc2 in combinations(accent_cols, 2):
        phones1 = row[acc1]
        phones2 = row[acc2]
        if pd.isna(phones1) or pd.isna(phones2) or len(phones1) == 0 or len(phones2) == 0:
            dist = None
        else:
            dist = Levenshtein.distance(row[acc1], row[acc2])
        results.append({
            'word': word,
            'wordid': wordid,
            'accent_pair': f'{acc1}-{acc2}',
            'phones_pair': f'[{phones1}]\n[{phones2}]',
            'l_distance': dist,
            # 'n_phones': TODO, # using phone_list
        })


In [37]:
from IPython.display import display, HTML

def pretty_print(df):
    return display( HTML( df.to_html().replace("\\n","<br>") ) )

In [38]:
df_dist = pd.DataFrame(results)
dd = df_dist.sort_values(by='l_distance', ascending=False) # TODO should use IDs!
pretty_print(dd.head(10))

Unnamed: 0,word,wordid,accent_pair,phones_pair,l_distance
54126,usa,es195566,es.Argentina-es.Castellano,[ʔ u t a] [ɪ s t a ð o s u n i ð o uə s ð e ə a m e ɾ i k a],43.0
54128,usa,es195566,es.Castellano-es.Mexico,[ɪ s t a ð o s u n i ð o uə s ð e ə a m e ɾ i k a] [uə s ɒ n],42.0
44846,rae,es161488,es.Castellano-es.Mexico,[b ɾ e a l a k a ð e m j ɪ e s p a ɲ o n ɪ] [d̪ ʌ r ɒ],38.0
44844,rae,es161488,es.Argentina-es.Castellano,[d̪ uə ɾ a] [b ɾ e a l a k a ð e m j ɪ e s p a ɲ o n ɪ],37.0
53900,ue,es194697,es.Castellano-es.Mexico,[u n j o n e u uə ɾ o p e ɪ a] [k͡p̚ e],26.0
53898,ue,es194697,es.Argentina-es.Castellano,[ʔ w e] [u n j o n e u uə ɾ o p e ɪ a],25.0
26226,galgo,es094728,es.Argentina-es.Castellano,[g a l uə ɡ o] [t̪ʰ iː ɴ ɡ̤ ɒ s̪ iː k͡p̚ uə o],23.0
546,abrevadero,es001401,es.Argentina-es.Castellano,[ʔ ɒ b̞ ʌ ɾ i ɪ b̞ a ð i ɾ o] [ɒ p uə ɾʲ e b̞ ɒ o t̪ uə e ɪ tʂ o uə],23.0
4898,arrebatado,es017301,es.Castellano-es.Mexico,[ɒ tʂ ɾ ʌ ɪ b̞ a n t͡ɕ i a ŋ d o] [ʏ b̞ ɒ t a],23.0
15492,cuidador,es055637,es.Argentina-es.Castellano,[w i ð æ ð o ɾ] [k͡p̚ uə e ɪ tɕ i a ts ɒ o tʂ ɹ̩],23.0


In [39]:
dd = df_dist.sort_values(by='l_distance', ascending=False).query("l_distance < 20").head(20)
pretty_print(dd)

Unnamed: 0,word,wordid,accent_pair,phones_pair,l_distance
42531,positivismo,es153631,es.Argentina-es.Castellano,[ʔ o s̪ iː t̪ i b̞ e m a] [p o ʂ uə ɪ t iː b̞ iː ʂ ɻ̩ m ɒ o],19.0
12117,cohesionar,es043302,es.Argentina-es.Castellano,[o ɪ s̪ j o n æ ɾ] [k͡p̚ uə o l̪ b̞ e ɕ i o n a],19.0
548,abrevadero,es001401,es.Castellano-es.Mexico,[ɒ p uə ɾʲ e b̞ ɒ o t̪ uə e ɪ tʂ o uə] [ʔ ɒ b ʌ ɾ e ɪ b̞ a d e ɪ ɾ uə],19.0
29046,ignominia,es104798,es.Argentina-es.Castellano,[i k͡p̚ ʌ n ɴ d ɒ o m iː ɲ i æ n] [i n o m ɪ n iː ə],19.0
26235,gallego,es094764,es.Argentina-es.Castellano,[d æ tʂ e ɡ o] [iː ɴ k a d͡ʒ i e k͡p̚ uə o],19.0
8442,bárbaro,es023980,es.Argentina-es.Castellano,[b̥ ʌ tʰ] [n p a ʂ ɻ̩ p a ɾ o uə],19.0
4984,arriesgado,es017608,es.Argentina-es.Mexico,[a ɾ i e ɕ a ð uə] [ʔ ɒ k͡p̚ ɾ j e ʂ g i æ s o ŋ],19.0
13486,conocedor,es048447,es.Argentina-es.Mexico,[k͡p̚ o n o f ɪ ð uə o ɾ] [k uə o n ɴ n o uə ɕ i e d a ɒ ð ɾ],19.0
10317,caverna,es036728,es.Argentina-es.Castellano,[ɾ] [t͡ɕ ɒ a b̞ e ɾ ә n ɪ],19.0
53012,transparencia,es191843,es.Castellano-es.Mexico,[t ɾ a ɪ e ɳ n ә s p a lʲ i n tʰ i a] [ɾ ɒ n p a ɾ ʌ e n ɕ i a],19.0


In [21]:
from IPython.display import Audio, display, Markdown
from pathlib import Path

def play_audio(word, accent, path_prefix="output/words_wav/es/sr16000"):
    # find wav file containing word and accent:
    audio_files = list(Path(path_prefix).glob(f"{word}_*{accent}.wav"))
    if not audio_files:
        print(f"No audio files found for {word} with accent {accent}")
        return
    audio_path = audio_files[0]  # Take the first matching file
    try:
        display(Markdown(f"**{word}** ({accent})"))
        display(Audio(audio_path, autoplay=True))
    except FileNotFoundError:
        print(f"Audio file not found: {audio_path}")

In [44]:
play_audio("taberna", "es.Mexico")
play_audio("caverna", "es.Mexico")
play_audio("taberna", "es.Argentina")
play_audio("caverna", "es.Argentina")

**taberna** (es.Mexico)

**caverna** (es.Mexico)

**taberna** (es.Argentina)

**caverna** (es.Argentina)

In [23]:
play_audio("w", "es.Argentina")
play_audio("w", "es.Castellano")

**w** (es.Argentina)

**w** (es.Castellano)

In [48]:
mask = df_phones['word'].str.contains('caverna|taberna')
df_phones[mask].sort_values(["accent", "id"])

Unnamed: 0,file_name,phone_list,word,id,accent,phones
44906,caverna_es036728_es.Argentina.wav,[ɾ],caverna,es036728,es.Argentina,ɾ
29094,taberna_es184465_es.Argentina.wav,"[t, ɒ, b, e, ɾ, ɴ, ɪ]",taberna,es184465,es.Argentina,t ɒ b e ɾ ɴ ɪ
35824,caverna_es036728_es.Castellano.wav,"[t͡ɕ, ɒ, a, b̞, e, ɾ, ә, n, ɪ]",caverna,es036728,es.Castellano,t͡ɕ ɒ a b̞ e ɾ ә n ɪ
34710,taberna_es184465_es.Castellano.wav,"[t͡ɕ, ɒ, p, i, ə, ɾ, ә, ŋ̟, d, ɒ]",taberna,es184465,es.Castellano,t͡ɕ ɒ p i ə ɾ ә ŋ̟ d ɒ
28014,caverna_es036728_es.Mexico.wav,"[t͡ɕ, a, b̞, e, ɾ, ʔ, ɪ]",caverna,es036728,es.Mexico,t͡ɕ a b̞ e ɾ ʔ ɪ
24292,taberna_es184465_es.Mexico.wav,"[t, ɒ, b̞, e, ɴ, ɾ, m, ɒ]",taberna,es184465,es.Mexico,t ɒ b̞ e ɴ ɾ m ɒ


In [14]:
# from gruut_ipa import Pronunciation

# # ipa_string = "ˈjɛs|ˈt͡ʃuːz aɪpiːeɪ‖"
# ipa_string = "l̪ l a t͡ɕ t̪ i k͡p̚ o"

# res = Pronunciation.from_string(ipa_string).phones
