In [1]:
from phonet.phonet import Phonet


# NOTE we removed the sample_weight_mode="temporal" from L154 in phonet, phonet.py

file_audio = "output/words_wav/es/sr16000/acuciante_es003761_es.Argentina.wav"

phon = Phonet(["all"]) ### ESTA

2025-06-27 16:46:48.584645: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-27 16:46:48.585394: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-27 16:46:48.587703: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-27 16:46:48.593740: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751053608.604220 1404449 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751053608.60

In [2]:
import numpy as np
from scipy.io.wavfile import read
from scipy.signal import resample_poly


def compute_phones(
    phonet_obj,
    audio_file:str,
    PLLR:bool=False
    )->tuple:
    """
    Compute phones from the audio file.
    
    Parameters
    ----------
    PLLR : bool
        Whether to return the PLLR (Phoneme Loglikelihood ratio). By default, True
    
    Returns
    -------
    tuple
        A tuple containing the times and phones extracted from the audio file
    """
    sr = 128
    size_frame = 1/sr
    time_shift = 1/sr

    # Read the audio (.wav) file
    fs, signal = read(audio_file)
    # fs, signal = read(r'C:\Users\User\repos\Speech-encoding\data\wavs\S21\s21.objects.01.channel1.wav')
    if fs!=16000:
        signal, fs = resample_poly(signal, 16000, fs), 16e3
    
    # This method extracts log-Mel-filterbank energies used as inputs of the model
    feat = phonet_obj.get_feat(signal, fs)      
    # phones = Phonet('All')
    # feat = Phonet('All').get_feat(signal, fs)      
    
    nf = int(feat.shape[0]/phonet_obj.len_seq) # len_seq=40 always
    # nf = int(feat.shape[0]/phones.len_seq) # len_seq=40 always

    # Get features
    features = []
    start, end = 0, phonet_obj.len_seq
    # start, end = 0, phones.len_seq
    for j in range(nf):
        features.append(feat[start:end,:])
        start += phonet_obj.len_seq
        end += phonet_obj.len_seq
        # start += phones.len_seq
        # end += phones.len_seq
    features = np.stack(features, axis=0)
    # features = features-phones.MU
    # features = features/phones.STD
    features = features-phonet_obj.MU
    features = features/phonet_obj.STD
    
    # Get phones and times
    pred_mat_phon = np.asarray(phonet_obj.model_phon.predict(features))
    # pred_mat_phon = np.asarray(phones.model_phon.predict(features))
    pred_mat_phon_seq = np.concatenate(pred_mat_phon, axis=0)
    
    if PLLR:
        probabilities = pred_mat_phon_seq[:int(len(signal)/(phonet_obj.time_shift*fs))]
        # probabilities = pred_mat_phon_seq[:int(len(signal)/(phones.time_shift*fs))]
        return probabilities
    else:
        pred_vec_phon = np.argmax(pred_mat_phon_seq, axis=1)

        nf=int(len(signal)/(phonet_obj.time_shift*fs))
        # nf=int(len(signal)/(phones.time_shift*fs))
        if nf>len(pred_vec_phon):
            nf=len(pred_vec_phon)
        
        phones_list = [phonet_obj.phonemes[j] for j in pred_vec_phon[:nf]]
        # phones_list = [phones.phonemes[j] for j in pred_vec_phon[:nf]]
        
        times = np.arange(nf)*phonet_obj.time_shift
        # times = np.arange(nf)*phones.time_shift
        
        return times, phones_list
        


In [3]:
# res = compute_phones(phon, file_audio, PLLR=True)
res = compute_phones(phon, file_audio, PLLR=False)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 665ms/step


In [4]:
res

(array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
        0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
        0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
        0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
        0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
        0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
        0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
        0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
        0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
        0.99, 1.  , 1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.09,
        1.1 , 1.11, 1.12, 1.13, 1.14, 1.15, 1.16]),
 [np.str_('x'),
  np.str_('x'),
  np.str_('x'),
  np.str_('a'),
  np.str_('a'),
  np.str_('a'),
  np.str_('a'),
  np.str_('a'),
  np.str_('a'),
  np.str_('a'),
  np.str_('a'),
  np.str_('a'),
  np.str_('l'),


In [5]:
res[1]

[np.str_('x'),
 np.str_('x'),
 np.str_('x'),
 np.str_('a'),
 np.str_('a'),
 np.str_('a'),
 np.str_('a'),
 np.str_('a'),
 np.str_('a'),
 np.str_('a'),
 np.str_('a'),
 np.str_('a'),
 np.str_('l'),
 np.str_('k'),
 np.str_('k'),
 np.str_('k'),
 np.str_('k'),
 np.str_('k'),
 np.str_('k'),
 np.str_('k'),
 np.str_('k'),
 np.str_('k'),
 np.str_('k'),
 np.str_('u'),
 np.str_('u'),
 np.str_('u'),
 np.str_('u'),
 np.str_('u'),
 np.str_('u'),
 np.str_('u'),
 np.str_('u'),
 np.str_('u'),
 np.str_('u'),
 np.str_('u'),
 np.str_('s'),
 np.str_('s'),
 np.str_('T'),
 np.str_('T'),
 np.str_('T'),
 np.str_('T'),
 np.str_('T'),
 np.str_('T'),
 np.str_('T'),
 np.str_('T'),
 np.str_('T'),
 np.str_('T'),
 np.str_('T'),
 np.str_('T'),
 np.str_('T'),
 np.str_('T'),
 np.str_('T'),
 np.str_('T'),
 np.str_('j'),
 np.str_('j'),
 np.str_('j'),
 np.str_('j'),
 np.str_('j'),
 np.str_('j'),
 np.str_('j'),
 np.str_('j'),
 np.str_('j'),
 np.str_('j'),
 np.str_('j'),
 np.str_('a'),
 np.str_('a'),
 np.str_('a'),
 np.str_('

In [6]:
# drop dups
import pandas as pd
phones = res[1]
pd.Series(phones).drop_duplicates()

0     x
3     a
12    l
13    k
23    u
34    s
36    T
52    j
79    n
88    t
96    e
dtype: object

In [7]:
phones_to_phonemes = {
            'a' : '/a/', 'e' : '/e/', 'i' : '/i/', 'o' : '/o/', 'j' : '/i/', 'w' : '/u/', 'u' : '/u/',
            'l' : '/l/', 'r' : '/R/', 'rr': '/r/', 't' : '/t/', 'd' : '/d/', 'D' : '/d/', 'sil' : '/sil/',
            '<p:>' : '/sil/', 'm' : '/m/', 'n' : '/n/', 'N' : '/n/', 'k' : '/k/', 'g' : '/g/', 'G' : '/g/',
            'tS': '/tS/', 'T' : '/tS/', 'f' : '/f/', 'F' : '/f/', 's' : '/s/', 'S' : '/s/', 'z' : '/s/',
            'Z' : '/s/', 'p' : '/p/', 'b' : '/b/', 'B' : '/b/', 'L' : '/L/', 'x' : '/x/', 'jj': '/x/', 'J' : '/x/'
        }

# Convert phones to phonemes
x = pd.Series(phones).map(phones_to_phonemes)
print(x.drop_duplicates().values)
# print(x.values)

['/x/' '/a/' '/l/' '/k/' '/u/' '/s/' '/tS/' '/i/' '/n/' '/t/' '/e/']


In [12]:
print(res[1][0])

x


In [None]:
res.shape # cols: de acá https://github.com/jegonza66/Speech-encoding/blob/f0a1e061216851208c6cfbfc1144698d76fade24/config.py#L175
# Mapping a fonemas:
# https://github.com/jegonza66/Speech-encoding/blob/f0a1e061216851208c6cfbfc1144698d76fade24/config.py#L186

(86, 36)

In [15]:
# Display the audio file:
from IPython.display import Audio

Audio(file_audio, rate=16000) # 16000 is the sampling rate of the audio file

-------------------------------------

In [3]:
df.shape

(85, 19)

In [5]:
df.columns

Index(['time', 'vocalic', 'consonantal', 'back', 'anterior', 'open', 'close',
       'nasal', 'stop', 'continuant', 'lateral', 'flap', 'trill', 'voice',
       'strident', 'labial', 'dental', 'velar', 'pause'],
      dtype='object')

In [9]:
df

Unnamed: 0,time,vocalic,consonantal,back,anterior,open,close,nasal,stop,continuant,lateral,flap,trill,voice,strident,labial,dental,velar,pause
0,0.00,-0.154074,-0.137424,-0.104945,0.015208,-0.07784,0.101081,0.173823,0.054669,-0.06057,0.213577,0.219596,-0.007738,-0.173016,-0.170521,0.000283,0.227577,0.161712,-0.281399
1,0.01,-0.154074,-0.137424,-0.104945,0.015208,-0.07784,0.101081,0.173823,0.054669,-0.06057,0.213577,0.219596,-0.007738,-0.173016,-0.170521,0.000283,0.227577,0.161712,-0.281399
2,0.02,-0.154074,-0.137424,-0.104945,0.015208,-0.07784,0.101081,0.173823,0.054669,-0.06057,0.213577,0.219596,-0.007738,-0.173016,-0.170521,0.000283,0.227577,0.161712,-0.281399
3,0.03,-0.154074,-0.137424,-0.104945,0.015208,-0.07784,0.101081,0.173823,0.054669,-0.06057,0.213577,0.219596,-0.007738,-0.173016,-0.170521,0.000283,0.227577,0.161712,-0.281399
4,0.04,-0.154074,-0.137424,-0.104945,0.015208,-0.07784,0.101081,0.173823,0.054669,-0.06057,0.213577,0.219596,-0.007738,-0.173016,-0.170521,0.000283,0.227577,0.161712,-0.281399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,0.80,-0.154074,-0.137424,-0.104945,0.015208,-0.07784,0.101081,0.173823,0.054669,-0.06057,0.213577,0.219596,-0.007738,-0.173016,-0.170521,0.000283,0.227577,0.161712,-0.281399
81,0.81,-0.154074,-0.137424,-0.104945,0.015208,-0.07784,0.101081,0.173823,0.054669,-0.06057,0.213577,0.219596,-0.007738,-0.173016,-0.170521,0.000283,0.227577,0.161712,-0.281399
82,0.82,-0.154074,-0.137424,-0.104945,0.015208,-0.07784,0.101081,0.173823,0.054669,-0.06057,0.213577,0.219596,-0.007738,-0.173016,-0.170521,0.000283,0.227577,0.161712,-0.281399
83,0.83,-0.154074,-0.137424,-0.104945,0.015208,-0.07784,0.101081,0.173823,0.054669,-0.06057,0.213577,0.219596,-0.007738,-0.173016,-0.170521,0.000283,0.227577,0.161712,-0.281399


In [None]:
df2 = phon.get_phon_wav(file_audio, plot_flag=False)
df = phon.get_PLLR(file_audio)




  signal=signal/np.max(np.abs(signal))


In [10]:
df2

Unnamed: 0,time,phoneme,vocalic,consonantal,back,anterior,open,close,nasal,stop,continuant,lateral,flap,trill,voice,strident,labial,dental,velar,pause
0,0.00,t,0.575787,0.585122,0.603154,0.667142,0.617992,0.709512,0.742788,0.687004,0.627335,0.759885,0.762404,0.655307,0.5651,0.566511,0.659467,0.765717,0.737424,0.503081
1,0.01,t,0.575787,0.585122,0.603154,0.667142,0.617992,0.709512,0.742788,0.687004,0.627335,0.759885,0.762404,0.655307,0.5651,0.566511,0.659467,0.765717,0.737424,0.503081
2,0.02,t,0.575787,0.585122,0.603154,0.667142,0.617992,0.709512,0.742788,0.687004,0.627335,0.759885,0.762404,0.655307,0.5651,0.566511,0.659467,0.765717,0.737424,0.503081
3,0.03,t,0.575787,0.585122,0.603154,0.667142,0.617992,0.709512,0.742788,0.687004,0.627335,0.759885,0.762404,0.655307,0.5651,0.566511,0.659467,0.765717,0.737424,0.503081
4,0.04,t,0.575787,0.585122,0.603154,0.667142,0.617992,0.709512,0.742788,0.687004,0.627335,0.759885,0.762404,0.655307,0.5651,0.566511,0.659467,0.765717,0.737424,0.503081
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,0.80,t,0.575787,0.585122,0.603154,0.667142,0.617992,0.709512,0.742788,0.687004,0.627335,0.759885,0.762404,0.655307,0.5651,0.566511,0.659467,0.765717,0.737424,0.503081
81,0.81,t,0.575787,0.585122,0.603154,0.667142,0.617992,0.709512,0.742788,0.687004,0.627335,0.759885,0.762404,0.655307,0.5651,0.566511,0.659467,0.765717,0.737424,0.503081
82,0.82,t,0.575787,0.585122,0.603154,0.667142,0.617992,0.709512,0.742788,0.687004,0.627335,0.759885,0.762404,0.655307,0.5651,0.566511,0.659467,0.765717,0.737424,0.503081
83,0.83,t,0.575787,0.585122,0.603154,0.667142,0.617992,0.709512,0.742788,0.687004,0.627335,0.759885,0.762404,0.655307,0.5651,0.566511,0.659467,0.765717,0.737424,0.503081


In [11]:
df2["phoneme"].value_counts()

phoneme
t    85
Name: count, dtype: int64

In [12]:
df2 = phon.get_posteriorgram(file_audio)


ValueError: 48000 is not a valid sampling frequency