In [1]:
import os
import glob
from pathlib import Path

from tqdm import tqdm

In [2]:
!pip install -qU allosaurus

## Lectura de datos

In [3]:
DATA_DIR = "output/words_wav/es/sr16000"
# DATA_DIR = "output/words_wav/es" # 48000 sr
# DATA_DIR = "output/words_wav/es" #/sr16000"

In [4]:
data = []

audio_paths = glob.glob(os.path.join(DATA_DIR, "*.wav"))
print(f"Found {len(audio_paths)} audio files.")

Found 56365 audio files.


## RUN

In [5]:
from allosaurus.app import read_recognizer

# load your model by the <model name>, will use 'latest' if left empty
model = read_recognizer()

In [6]:
from IPython.display import Audio, display

In [7]:
# You can tell the model to emit more phones or less phones by changing the --emit or -e argument.
# See: https://github.com/xinjli/allosaurus

In [9]:
import re

files_example = sorted(
    [f for f in audio_paths if re.search(r"desde_|sopa_|mosca_|estribillo_", f)]
    # [f for f in audio_paths if re.search(r"/predatory_|/pen_", f)]
) 

out = {}
for f in files_example:
    # print(Path(f).name)
    # display(Audio(f, rate=16000))
    # print(model.recognize(f, "eng", timestamp=True, topk=5, emit=1))
    # print("-"*100)
    out[Path(f).name] = model.recognize(f, "ipa", timestamp=True, topk=5, emit=1)

In [11]:
for k, v in out.items():
    print(out[k])
    break

0.090 0.045 ð (0.577) pʲ (0.090) b (0.066) d (0.062) b̥ (0.053)
0.150 0.045 e (0.505) ɪ (0.396) ɤ̆ (0.019) ɒ (0.013) æ (0.013)
0.300 0.045 s (0.886) s̪ (0.029) ɕ (0.023) <blk> (0.017) k (0.010)
0.420 0.045 t̪ (0.515) t (0.218) <blk> (0.064) ð (0.047) p (0.021)
0.480 0.045 e (0.395) ɒ (0.092) i (0.087) ɪ (0.080) ə (0.065)


In [17]:
def parse_allosaurus_output(output: str):
    parsed = []
    for line in output.strip().splitlines():
        parts = line.split(maxsplit=2)
        time = float(parts[0])
        duration = float(parts[1])
        # Regex to extract phoneme and probability pairs
        phoneme_probs = re.findall(r'(\S+)\s*\(([\d.]+)\)', parts[2])
        phoneme_probs = [(p, float(prob)) for p, prob in phoneme_probs]
        parsed.append({
            "time": time,
            "duration": duration,
            "phones": phoneme_probs
        })
    return parsed

In [21]:
def get_most_probable_phones(parsed_output):
    most_probable = []
    for entry in parsed_output:
        if entry["phones"]:
            # # Sort phonemes by probability and take the most probable one
            # most_probable_phoneme = max(entry["phonemes"], key=lambda x: x[1])
            # They are already sorted by probability, so we can take the first one
            most_probable_phoneme = entry["phones"][0][0]
            most_probable.append(most_probable_phoneme)
    return most_probable

In [None]:
# test:
x = parse_allosaurus_output(out[Path(f).name])
get_most_probable_phones(x)

['ʂ', 'o', 'p', 'a']

In [None]:
# Apply:
parsed = {}
most_probable = {}
for k, v in out.items():
    parsed[k] = parse_allosaurus_output(v)
    most_probable[k] = get_most_probable_phones(parsed[k])

In [27]:
import pandas as pd

# Cols: file_name, phones
df_phones = pd.DataFrame(most_probable.items(), columns=["file_name", "phones"])

In [29]:
# Extract data from file_name e.g. desde_es061900_es.Argentina.wav -> desde (word), es061900 (id), es.Argentina (accent)
def parse_file_name(file_name):
    parts = file_name.split('_')
    word = parts[0]
    id_ = parts[1]
    accent = '_'.join(parts[2:]).replace('.wav', '')
    return word, id_, accent

df_phones[['word', 'id', 'accent']] = df_phones['file_name'].apply(parse_file_name).apply(pd.Series)

In [30]:
df_phones

Unnamed: 0,file_name,phones,word,id,accent
0,desde_es061900_es.Argentina.wav,"[ð, e, s, t̪, e]",desde,es061900,es.Argentina
1,desde_es061900_es.Castellano.wav,"[u, t̪, e, s, t̪, e, ɪ]",desde,es061900,es.Castellano
2,desde_es061900_es.Mexico.wav,"[d, e, z, ð, e]",desde,es061900,es.Mexico
3,estribillo_es084128_es.Argentina.wav,"[e, s, t, ɾ, i, b̞, i, ʐ, o]",estribillo,es084128,es.Argentina
4,estribillo_es084128_es.Castellano.wav,"[e, s, t, ɾ, i, b̞, i, tɕ, i, o]",estribillo,es084128,es.Castellano
5,estribillo_es084128_es.Mexico.wav,"[ɛ, s, t, ɾ, i, b, i, tɕ, o, uə]",estribillo,es084128,es.Mexico
6,mosca_es132631_es.Argentina.wav,"[ɴ, ɔ, ʁ, ә, k͡p̚, a]",mosca,es132631,es.Argentina
7,mosca_es132631_es.Castellano.wav,"[m, o, s, k, a]",mosca,es132631,es.Castellano
8,mosca_es132631_es.Mexico.wav,"[m, o, s, k, ɪ]",mosca,es132631,es.Mexico
9,sopa_es181195_es.Argentina.wav,"[ʂ, uə, p, a]",sopa,es181195,es.Argentina


In [31]:
!pip install -qU python-Levenshtein


In [36]:
# Compute Levenshtein distance between two strings:
import Levenshtein

print(Levenshtein.distance(df_phones['phones'][0], df_phones['word'][1]))  # Example usage
print(Levenshtein.distance("".join(df_phones['phones'][0]), df_phones['word'][1].join('')))  # Example usage

2
6


In [33]:
df_phones['word'][0]

'desde'

In [11]:
Audio(f, rate=16000)