In [None]:
import numpy as np

import tensorflow as tf
import tensorflow.keras.optimizers as optimizers

from model import *
from input import NetworkInput
from config import *


from tensorflow.keras.models import Model

from data import ScoresComputer

from data import createFeaturesDescription

import soundfile as sf

import parselmouth

import numpy as np
from matplotlib import rcParams
import matplotlib.pyplot as pl
import seaborn as sns

import scipy.signal as sg

In [None]:

# Default (width, height) used for every figure created in this notebook.
rcParams['figure.figsize'] = 7, 5

### Test features

In [None]:
# Example recording for the feature tests below. `filename` is a notebook-level
# name that draw_scores also reads, so later cells depend on its current value.
#filename=  'ex_Files/English/TIMIT_SA2.WAV'
filename='./ex_Files/English/RAMUS_ENL1155.wav'
# Raw samples and sampling rate, as returned by soundfile.
data, samplerate = sf.read(filename)
# The same file loaded as a Parselmouth Sound for the Praat-based analyses.
snd = parselmouth.Sound(filename)

#### Test Praat/Parselmouth (F0)

In [None]:

# Waveform of the loaded sound, with the x-axis limited to [snd.xmin, snd.xmax].
pl.figure()
pl.plot(snd.xs(), snd.values.T)
pl.xlim([snd.xmin, snd.xmax])
pl.xlabel("time [s]")
pl.ylabel("amplitude")
pl.show() # or pl.savefig("sound.png"), or pl.savefig("sound.pdf")

In [None]:


# Alias: the drawing helpers below are written against `plt`; it is the same
# pyplot module that the rest of the notebook calls `pl`.
plt=pl

def draw_spectrogram(spectrogram, dynamic_range=70):
    """Draw a spectrogram in dB on the current pyplot axes.

    spectrogram: object exposing x_grid(), y_grid(), values, ymin and ymax
        (in this notebook, the result of Sound.to_spectrogram()).
    dynamic_range: the lower colour limit is set this many dB below the
        largest value in the spectrogram.
    """
    level_db = 10 * np.log10(spectrogram.values)
    lowest_shown_db = level_db.max() - dynamic_range
    time_grid = spectrogram.x_grid()
    freq_grid = spectrogram.y_grid()
    plt.pcolormesh(time_grid, freq_grid, level_db, vmin=lowest_shown_db, cmap='afmhot')
    freq_limits = [spectrogram.ymin, spectrogram.ymax]
    plt.ylim(freq_limits)
    plt.xlabel("time [s]")
    plt.ylabel("frequency [Hz]")

def draw_intensity(intensity, downsampling=1):
    """Plot an intensity contour, smoothed by averaging groups of frames.

    The dB values are converted to linear amplitude (10**(dB/20)), averaged
    over consecutive groups of `downsampling` frames and converted back to dB.
    Each averaged point is drawn at the time of the first frame of its group.
    Trailing frames that do not fill a complete group are left out.

    intensity: object exposing xs() and values (in this notebook, the result
        of Sound.to_intensity()); values.T is assumed to have one row per
        frame -- TODO confirm for other inputs.
    downsampling: number of consecutive frames averaged per plotted point.

    Raises ValueError if downsampling is smaller than 1.
    """
    if downsampling < 1:
        raise ValueError("downsampling must be a positive integer")

    levels_db = intensity.values.T
    # Restrict every strided slice to the complete groups. Without this the
    # slices have different lengths whenever the frame count is not a multiple
    # of `downsampling`, and the accumulation below either fails or silently
    # broadcasts a one-element slice.
    n_used = (len(levels_db) // downsampling) * downsampling

    t=intensity.xs()[0:n_used:downsampling]
    values= 10**(levels_db[0:n_used:downsampling]/20.)
    for k in range(1, downsampling):
        values+=  10**(levels_db[k:n_used:downsampling]/20.)
    values/=downsampling
    values=20*np.log10(values)
    # A wide white line underneath keeps the curve readable over a spectrogram.
    plt.plot(t, values, linewidth=3, color='w')
    plt.plot(t, values, linewidth=1)
    plt.grid(False)
    #plt.ylim(0)
    plt.ylabel("intensity [dB]")

# Intensity contour computed with the default arguments of to_intensity().
intensity = snd.to_intensity()
# Time step between two consecutive intensity frames, in milliseconds.
print(f'dt intensity before oversampling: {(intensity.xs()[1]-intensity.xs()[0])*1e3} ms')
spectrogram = snd.to_spectrogram()
plt.figure()
draw_spectrogram(spectrogram)
# Second y-axis on the same time axis for the intensity curve.
plt.twinx()
draw_intensity(intensity, downsampling=4)
plt.xlim([snd.xmin, snd.xmax])
plt.show() # or plt.savefig("spectrogram.pdf")

In [None]:
def draw_pitch(pitch):
    """Scatter-plot the selected pitch contour on the current pyplot axes.

    pitch: object exposing xs(), selected_array['frequency'] and ceiling
        (in this notebook, the result of Sound.to_pitch()).

    All frames are drawn as they are: the replacement of zero-frequency
    frames by NaN (which would hide them) is not applied here, so those
    frames appear as points at 0 Hz.
    """
    f0_values = pitch.selected_array['frequency']
    # Larger white marker first, then a smaller default-coloured one on top.
    for marker_size, extra_style in ((5, {'color': 'w'}), (2, {})):
        plt.plot(pitch.xs(), f0_values, 'o', markersize=marker_size, **extra_style)
    plt.grid(False)
    plt.ylim(0, pitch.ceiling)
    plt.ylabel("fundamental frequency [Hz]")

# Pitch track at 60 frames per second, searched between 75 Hz and 500 Hz.
pitch = snd.to_pitch(time_step=1/60., pitch_floor=75., pitch_ceiling=500.)
# If desired, pre-emphasize the sound fragment before calculating the spectrogram
# (done on a copy so that `snd` itself is left untouched).
pre_emphasized_snd = snd.copy()
pre_emphasized_snd.pre_emphasize()
spectrogram = pre_emphasized_snd.to_spectrogram(window_length=0.03, maximum_frequency=8000)
plt.figure()
draw_spectrogram(spectrogram)
# Second y-axis on the same time axis for the pitch markers.
plt.twinx()
draw_pitch(pitch)
plt.xlim([snd.xmin, snd.xmax])
plt.show() # or plt.savefig("spectrogram_0.03.pdf")

#### Test HP filter 

In [None]:
#cut-off frequencies (cf Fant et al. 2000)
fb=200 #HP: corner frequency (Hz) that enters the numerator coefficients
fa=5000 #LP: corner frequency (Hz) that enters the denominator coefficients
fs=samplerate

# First-order digital filter
#   H(z) = ((B+1) + (1-B) z^-1) / ((A+1) + (1-A) z^-1)
# which is what the bilinear substitution s -> 2*fs*(1-z^-1)/(1+z^-1) gives for
#   H(s) = (1 + s/(2*pi*fb)) / (1 + s/(2*pi*fa)).
coeff=2*fs
B=coeff*1/(2*np.pi*fb)
A=coeff*1/(2*np.pi*fa)

# b, a are module-level on purpose: preemphasis() below reads them.
b=np.array([B+1, 1-B])
a=np.array([A+1, 1-A])

# Frequency response; w is returned in radians/sample.
w, h = sg.freqz(b, a)

# Gain in dB against frequency in Hz, shown up to 5 kHz.
pl.figure()
pl.plot(w/(2*np.pi)*fs, 20*np.log10(np.abs(h)))
pl.xlim([0, 5000])
pl.xlabel('f (Hz)')
pl.ylabel('Emphasis (dB)')
pl.show()

def preemphasis(s):
    """Filter `s` with the notebook's pre-emphasis filter.

    The coefficients are the module-level arrays `b` (numerator) and `a`
    (denominator); they are looked up at call time.

    s: signal to filter, passed unchanged to scipy.signal.lfilter.
    Returns the filtered signal.
    """
    filtered = sg.lfilter(b, a, s)
    return filtered

In [None]:
# Filter the raw samples with the coefficients defined above, then wrap the
# result in a Parselmouth Sound at the original sampling rate.
data2 = preemphasis(data) # uses the module-level b, a
snd2 = parselmouth.Sound(data2, sampling_frequency=samplerate)

In [None]:
# Compare the intensity contour of the original sound (`intensity`) with the
# one of the pre-emphasized sound (`intensity2`) over the same spectrogram.
intensity = snd.to_intensity()
intensity2 = snd2.to_intensity()
spectrogram = snd.to_spectrogram()
plt.figure()
draw_spectrogram(spectrogram)
# Both contours share the second y-axis.
plt.twinx()
draw_intensity(intensity, downsampling=4)
draw_intensity(intensity2, downsampling=4)
plt.xlim([snd.xmin, snd.xmax])

plt.show() # or plt.savefig("spectrogram.pdf")

### Test computeScores

In [None]:
# Switch to the recording used for the score tests. This rebinds `filename`,
# `data`, `samplerate` and `snd` from the earlier cells.
filename='ex_Files/English/TIMIT_SA2.WAV'
#filename='./Files_Ramus/English/ENL1155_normalise.wav'
data, samplerate = sf.read(filename)
snd = parselmouth.Sound(filename)

In [None]:
# Score extractor configured for the sampling rate of the loaded file.
sComp=ScoresComputer(fs=samplerate)

In [None]:
# Scores for the raw samples; draw_scores reads the entries 'rmsValue',
# 'HRmsValue' and 'F0' from the result.
scores= sComp.compute_scores(data)

In [None]:

def draw_scores(snd, scores, pitch_ceiling=500, stride=1, figsize=None, background_only=False, background=True, savefig=False):
    """Plot the intensity and F0 scores over a spectrogram of `snd`.

    Three y-axes share one time axis: the host axis shows the spectrogram
    (frequency axis labelled in kHz), `par1` the two intensity curves in dB
    and `par2`, whose spine is shifted to the right, the F0 markers.

    Parameters
    ----------
    snd : sound object providing copy(), xmin and xmax (a parselmouth.Sound in
        this notebook). The spectrogram is computed on a pre-emphasized copy.
    scores : mapping with the entries 'rmsValue', 'HRmsValue' and 'F0'
        (assumed to be 1-D arrays of equal length -- TODO confirm against
        ScoresComputer.compute_scores).
    pitch_ceiling : upper limit of the F0 axis.
    stride : number of consecutive frames merged into one plotted point. The
        dB curves are averaged over each group; F0 keeps the first frame of
        each group.
    figsize : forwarded to plt.subplots.
    background_only : draw the spectrogram only, save it to
        'test_data_background.png' and return.
    background : draw the spectrogram behind the curves.
    savefig : save the finished figure to 'test_data.svg'.

    Notes
    -----
    The two HACK blocks read the notebook-level `filename` to apply axis
    limits that are specific to one recording.
    """

    def make_patch_spines_invisible(ax):
        ax.set_frame_on(True)
        ax.patch.set_visible(False)
        for sp in ax.spines.values():
            sp.set_visible(False)

    def block_average(levels):
        # Mean over each run of `stride` consecutive frames. The tail is
        # completed up to a multiple of `stride` by repeating the last value:
        # padding with `len % stride` zeros only gave a usable length for
        # stride=2, and a literal 0 is a full-scale sample on a dB curve.
        pad = (-len(levels)) % stride
        if pad:
            levels = np.pad(levels, (0, pad), mode='edge')
        total = levels[::stride].copy()
        for k in range(1, stride):
            total += levels[k::stride]
        return total / stride

    fig, host = plt.subplots(figsize=figsize)
    fig.subplots_adjust(right=0.75)
    par1 = host.twinx()
    par2 = host.twinx()

    # Offset the right spine of par2.  The ticks and label have already been
    # placed on the right by twinx above.
    par2.spines["right"].set_position(("axes", 1.2))
    # Having been created by twinx, par2 has its frame off, so the line of its
    # detached spine is invisible.  First, activate the frame but make the patch
    # and spines invisible.
    make_patch_spines_invisible(par2)
    # Second, show the right spine.
    par2.spines["right"].set_visible(True)

    # Pre-emphasize a copy of the sound before calculating the spectrogram
    pre_emphasized_snd = snd.copy()
    pre_emphasized_snd.pre_emphasize()
    spectrogram = pre_emphasized_snd.to_spectrogram(window_length=0.03, maximum_frequency=8000)
    dynamic_range=65

    X, Y = spectrogram.x_grid(), spectrogram.y_grid()
    sg_db = 10 * np.log10(spectrogram.values)

    if background_only or background:
        # vmax above the actual maximum keeps the background pale.
        host.pcolormesh(X, Y/1e3, sg_db, vmin=sg_db.max() - dynamic_range, vmax=sg_db.max()+dynamic_range/2,
                        cmap='BuPu') #afmhot

    host.set_xlim([snd.xmin, snd.xmax])
    host.set_ylim([spectrogram.ymin/1e3, spectrogram.ymax/1e3])
    if background_only:
        #HACK: recording-specific time window
        if filename=='./Files_Ramus/English/ENL1155_normalise.wav':
            host.set_xlim(0.1, 2.85)
        pl.savefig('test_data_background.png', dpi=200)
        return

    host.set_xlabel("time [s]")
    host.set_ylabel("frequency [kHz/ x100 Hz]")

    tmax=X[-1]

    # Amplitudes to dB, then one value per group of `stride` frames.
    rmsdB = block_average(20*np.log10(scores['rmsValue']))
    HRmsdB = block_average(20*np.log10(scores['HRmsValue']))
    # F0 is not averaged: the first frame of each group is kept, which gives
    # the same number of points as the averaged curves without any padding.
    F0 = scores['F0'][::stride]

    # The curves are spread evenly between 0 and the last spectrogram grid time.
    t=np.linspace(0,tmax,len(rmsdB), endpoint=True)

    # Each curve is drawn twice: a wider line underneath as an outline.
    par1.plot(t, HRmsdB, linewidth=4, color='chocolate')
    par1.plot(t, HRmsdB, linewidth=2, color='orange')

    par1.plot(t, rmsdB, linewidth=5, color='gray')
    p1,=par1.plot(t, rmsdB, linewidth=3, color='black')

    par1.grid(False)
    par1.set_ylabel("Intensity [dB] (ref:max)")

    par2.plot(t, F0, 'o', markersize=12, color='ghostwhite')
    p2,=par2.plot(t, F0, 'o', markersize=6, color='dodgerblue')
    par2.grid(False)
    par2.set_ylabel("F0 (Hz)")

    # Use the argument rather than the notebook-level `pitch` object, so the
    # function does not depend on an earlier cell having been run.
    par2.set_ylim(0, pitch_ceiling)

    par1.yaxis.label.set_color(p1.get_color())
    #par2.yaxis.label.set_color(p2.get_color())

    par1.tick_params(axis='y', colors=p1.get_color())
    #par2.tick_params(axis='y', colors=p2.get_color())

    #HACK: recording-specific axis limits
    if filename=='./Files_Ramus/English/ENL1155_normalise.wav':
        par2.set_xlim(0.1, 2.85)
        par1.set_ylim(-75, 5)
        par2.set_ylim([spectrogram.ymin/10, spectrogram.ymax/10])

    if savefig:
        pl.savefig('test_data.svg')

In [None]:
# Full figure (spectrogram background plus curves), two frames per plotted
# point, nothing written to disk.
draw_scores(snd, scores, pitch_ceiling=sComp.pitch_ceiling, stride=2, figsize=(10, 6), 
            savefig=False, background_only=False) #background=False


### Test networkInput

In [None]:
# Configuration handed to NetworkInput in the next cell.
batch_size=8
num_steps=32
config=Config(batch_size, num_steps)
config=completedConfig(config) #take default params for unspecified params

# HRmsValue and F0 are switched off here.
features_description=createFeaturesDescription(HRmsValue=False, F0=False) #Features RMS, RMS HP

In [None]:
# Evaluation-mode input built from the score files in ./ex_Scores, with deltas enabled.
networkInput = NetworkInput(config, folder='./ex_Scores', for_evaluation=True, 
                            features_description=features_description, use_deltas=True)

In [None]:
# Plot element [0][0][0] of the first item yielded by sliced_batch
# (presumably the features of the first example of the batch -- TODO confirm).
for ex in networkInput.sliced_batch.take(1):
    pl.plot(ex[0][0][0])

### Test for data augmentation

In [None]:
def _random_distorsion(t, n_sig=6, alpha_sigma=np.pi/10, a=12):
    '''
    Random smooth distortion curve evaluated at `t`.

    The curve is a weighted sum of `n_sig` sigmoids plus a small random
    offset, divided by the value of the sigmoid sum at t=1. The weights are
    tan(alpha)/n_sig, with alpha drawn around pi/4 and clipped to
    [0, pi/2 - pi/20], so every weight is non-negative.

    t: tensor of evaluation points (the notebook uses values in [0, 1])
    n_sig: number of sigmoids (must be at least 2)
    alpha_sigma: angle std deviation from pi/4
    a: contraction factor (the lower the smoother)

    Returns the curve values for the elements of `t`.
    Raises ValueError if n_sig < 2.
    '''
    n=n_sig
    if n < 2:
        # The knot positions below divide by (n-1).
        raise ValueError("n_sig must be at least 2")

    alpha_mean = np.pi/4
    alpha_min = 0
    alpha_max = np.pi/2 - np.pi/20
    alpha = alpha_mean + alpha_sigma*tf.random.normal((n, ))
    alpha = tf.math.maximum(alpha_min, alpha)
    alpha = tf.math.minimum(alpha_max, alpha)
    # Weight of each sigmoid.
    delta = 1./n*tf.math.tan(alpha)

    #variability on x knots: regular positions plus noise of std 1/(3n)
    knot_index=tf.range(n, dtype=tf.float32)
    xk=(2*knot_index+1)/(2*(n-1))+1/(3*n)*tf.random.normal((n, ))

    # Random offset, the same for every element of t.
    res=delta[0]*(-0.1+0.1*tf.random.normal((1, ))*tf.ones_like(t))
    # `ref` accumulates the sigmoid sum at t=1, used as the normalisation.
    ref=0.
    for j in range(n):
        res+=delta[j]*tf.math.sigmoid(a*(t-xk[j]))
        ref+=delta[j]*tf.math.sigmoid(a*(1-xk[j]))
    res/=ref
    return res


In [None]:
# Two independent random draws of the distortion curve on 100 points of [0, 1].
t=tf.linspace(0.,1., 100)
y=_random_distorsion(t)

y2=_random_distorsion(t)
pl.plot(t, y)
pl.plot(t, y2)
pl.ylim([-0.1, 1])

### Print filenames

In [None]:
# Languages requested from the dataset. Kept separate from `languages_model`
# so that entries can be removed below without changing the model's list.
languages = ["Danish", "Dutch", "English", "Finnish",
    "French", "German", "Hungarian", "Italian",
    "Japanese", "Korean", "Mandarin", "Polish",
    "Portuguese", "Russian", "Spanish",
    "Swedish", "Turkish", "Estonian", "Arabic", "Czech", "Romanian",
    "Basque", "Catalan"]  #NB: check that the order of elements is consistent with model

#Remove languages with not enough data
#languages.remove("Czech")
#languages.remove("Romanian") 

# Same values as in the networkInput test above; repeated so that this section
# does not rely on that cell.
batch_size=8
num_steps=32
config=Config(batch_size, num_steps)
config=completedConfig(config) #take default params for unspecified params

# Language list of the model; the cell below indexes it with the argmax of a
# label vector, so its order matters.
languages_model=["Danish", "Dutch", "English", "Finnish",
    "French", "German", "Hungarian", "Italian",
    "Japanese", "Korean", "Mandarin", "Polish",
    "Portuguese", "Russian", "Spanish",
    "Swedish", "Turkish", "Estonian", "Arabic", "Czech", "Romanian",
    "Basque", "Catalan"] 
scores_folder='./Scores'
balanced_dataset_folder='balanced_20_1'
TFRecords_batch_size= 16
subfolders=[balanced_dataset_folder]
initial_sample_length=10*2**14
use_deltas=True
features_description=createFeaturesDescription(HRmsValue=True, F0=True)
F0_binary_values=True

# Rebinds `networkInput` to an evaluation-mode input over scores_folder.
networkInput=NetworkInput(config, folder=scores_folder, for_evaluation=True,
        subfolder=subfolders,
            stride=2, verbose=True,                                    
             languages=languages, languages_model=languages_model, features_description=features_description,
               initial_sample_length=initial_sample_length, TFRecords_batch_size=TFRecords_batch_size,
                              use_deltas=use_deltas,
                             F0_binary_values=F0_binary_values) #autodetect languages


In [None]:
# Print, with a running index, the file name of every example whose label is
# `language_target`.
ind_file=0
language_target='French'
for i, trueBatch in enumerate(networkInput.sliced_batch):
    # Look at one item out of every num_slices_by_example (presumably the
    # first slice of each example -- TODO confirm in NetworkInput).
    if (i%networkInput.num_slices_by_example)==0:
        x, y, w= trueBatch
        filenames=x[2]
        # Label vector at the last index of axis 1 (assumed to be the last
        # time step -- TODO confirm).
        y=y.numpy()[:,-1]
        # Iterate over the rows actually present instead of config.batch_size:
        # a shorter final batch would otherwise raise an IndexError.
        for k in range(len(y)):
            ind0=np.argmax(y[k])
            if languages_model[ind0] == language_target:
                print(f"{ind_file} {filenames[k]}") #language: {languages_model[ind0]}
                ind_file+=1